nltkor 1.2.5.tar.gz → 1.2.9.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {nltkor-1.2.5 → nltkor-1.2.9}/PKG-INFO +10 -6
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/__init__.py +3 -2
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/metrics/classical.py +34 -1
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/misc/string2string_word_embeddings.py +1 -1
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/search/faiss_search.py +333 -13
- nltkor-1.2.9/nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor-1.2.9/nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor-1.2.9/nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
- nltkor-1.2.9/nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
- nltkor-1.2.9/nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/__init__.py +1 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/__init__.py +1 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/config.py +4 -3
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/network.c +43296 -30238
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/pos/pos_reader.py +10 -2
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/reader.py +38 -68
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/taggers.py +6 -6
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/utils.py +41 -1
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor.egg-info/PKG-INFO +10 -6
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor.egg-info/SOURCES.txt +4 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor.egg-info/requires.txt +9 -5
- {nltkor-1.2.5 → nltkor-1.2.9}/setup.py +50 -26
- nltkor-1.2.5/nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/LICENSE.txt +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/README.md +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/Kor_char.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/alignment/__init__.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/cider/__init__.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/cider/cider.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/cider/cider_scorer.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/distance/__init__.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/distance/wasserstein.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/etc.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/lazyimport.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/make_requirement.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/metrics/__init__.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/metrics/bartscore.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/metrics/bertscore.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/metrics/bleu_tensor.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/metrics/entment.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/metrics/eval.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/metrics/mauve.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/metrics/mauve_utils.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/misc/__init__.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/misc/string2string_basic_functions.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/misc/string2string_default_tokenizer.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/misc/string2string_hash_functions.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/search/__init__.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/search/classical.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/search/kobert_tokenizer.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/sejong/__init__.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/sejong/ch.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/sejong/dict_semClassNum.txt +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/sejong/layer.txt +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/sejong/sejong_download.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/sejong/ssem.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/similarity/__init__.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/similarity/bartscore____.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/similarity/bertscore____.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/similarity/classical.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/similarity/cosine_similarity.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/espresso_tag.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/arguments.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/attributes.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/metadata.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/ner/__init__.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/ner/macmorphoreader.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/ner/ner_reader.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/parse/__init__.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/parse/parse_reader.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/pos/__init__.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/pos/macmorphoreader.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/srl/__init__.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/srl/__srl_reader_.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/srl/srl_reader.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/srl/train_srl.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/word_dictionary.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/wsd/__init__.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/wsd/macmorphoreader.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/wsd/wsd_reader.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tokenize/__init__.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tokenize/ko_tokenize.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/trans.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor.egg-info/dependency_links.txt +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor.egg-info/top_level.txt +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/setup.cfg +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/test/test.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/test/testespresso.py +0 -0

{nltkor-1.2.5 → nltkor-1.2.9}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: nltkor
-Version: 1.2.5
+Version: 1.2.9
 Home-page: https://modi.changwon.ac.kr/air_cwnu/nlp_tool/nltk_ko.git
 Keywords: string matching,pattern matching,edit distance,string to string correction,string to string matching,Levenshtein edit distance,Hamming distance,Damerau-Levenshtein distance,Jaro-Winkler distance,longest common subsequence,longest common substring,dynamic programming,approximate string matching,semantic similarity,natural language processing,NLP,information retrieval,rouge,sacrebleu,bertscore,bartscore,fasttext,glove,cosine similarity,Smith-Waterman,Needleman-Wunsch,Hirschberg,Karp-Rabin,Knuth-Morris-Pratt,Boyer-Moore
 Classifier: Programming Language :: Python :: 3.7
@@ -12,21 +12,25 @@ Classifier: Operating System :: OS Independent
 Classifier: Typing :: Typed
 Requires-Python: >=3.7
 License-File: LICENSE.txt
+Requires-Dist: Cython
+Requires-Dist: numpy<=1.26.4,>=1.23.5
 Requires-Dist: regex
 Requires-Dist: tqdm>=4.40.0
 Requires-Dist: joblib
-Requires-Dist: numpy==1.23.0
 Requires-Dist: requests
 Requires-Dist: nltk>3.0
-Requires-Dist: pyarrow
+Requires-Dist: pyarrow
 Requires-Dist: beautifulSoup4
-Requires-Dist: faiss-cpu
+Requires-Dist: faiss-cpu==1.7.3
 Requires-Dist: datasets
 Requires-Dist: torch
+Requires-Dist: dill<0.3.9
 Requires-Dist: scikit-learn>=0.22.1
-Requires-Dist: transformers
+Requires-Dist: transformers==4.42.2
 Requires-Dist: protobuf
 Requires-Dist: sentencepiece
 Requires-Dist: pandas
 Requires-Dist: bert_score
-Requires-Dist:
+Requires-Dist: chardet
+Requires-Dist: GPUtil
+Requires-Dist: fasttext

{nltkor-1.2.5 → nltkor-1.2.9}/nltkor/__init__.py
@@ -1,6 +1,7 @@
 from nltkor import alignment
 from nltkor import cider
 from nltkor import distance
+
 from nltkor import sejong
 from nltkor import metrics
 from nltkor import misc
@@ -8,8 +9,8 @@ from nltkor import search
 from nltkor import similarity
 from nltkor import tag
 from nltkor import tokenize
-
-
 from nltkor import trans
 from nltkor import Kor_char
 from nltkor import etc
+
+__version__ = '1.2.9'
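
The package now records its version at import time. A minimal check (not part of the diff; assumes nltkor 1.2.9 is installed):

import nltkor
print(nltkor.__version__)  # expected to print '1.2.9'
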

{nltkor-1.2.5 → nltkor-1.2.9}/nltkor/metrics/classical.py
@@ -5,6 +5,8 @@ import numpy as np
 from typing import Callable, Iterable, List, Tuple, Union
 from copy import deepcopy
 import itertools
+import torch
+import time
 from nltk.translate.bleu_score import *
 from nltk.metrics import confusionmatrix
 from collections import defaultdict
@@ -54,6 +56,37 @@ class DefaultMetric:
 
         return float(tp/total)
 
+    def accuracy_norm(model, tokenizer, input_text: str, candidates: list, label: int):
+        reserved_memory = []
+        inference_time = []
+        tokenized_prompt = tokenizer(input_text, return_tensors='pt').input_ids
+        total_candidate = []
+
+        for ending in candidates:
+            len_ending = len(ending)
+            tokenized_ending = tokenizer(ending, return_tensors='pt').input_ids
+            tokenized_ending = tokenized_ending[:, 1:]
+            input_ids = torch.cat([tokenized_prompt, tokenized_ending], dim=-1).cuda()
+            labels = input_ids.clone()
+            labels[0, :tokenized_prompt.shape[1]] = -100
+            start = time.time()
+            with torch.no_grad():
+                outputs = model(input_ids, labels=labels)
+            inference_time.append(time.time() - start)
+            reserved_memory.append(torch.cuda.memory_reserved() / (1024**2))
+            total_logprobs = -outputs.loss.item() * tokenized_ending.shape[1]
+            total_candidate.append(total_logprobs/len_ending)
+        answer_idx = total_candidate.index(max(total_candidate))
+        if int(label) == answer_idx:
+            cor = 1
+        else:
+            cor = 0
+        metric_dict = {
+            "reserved_memory": reserved_memory,
+            "inference_time": inference_time
+        }
+        return cor, metric_dict
+
     def recall_score(self, true, pred, avg='micro'):
 
         mat=confusionmatrix.ConfusionMatrix(true,pred)
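
The new accuracy_norm scores each candidate by its length-normalized log-likelihood under a causal language model, picks the best one, and compares it against the gold label while recording GPU memory and inference time. Below is a hypothetical usage sketch, not part of the diff: the model name, prompt, and candidates are placeholders, a CUDA device is required because the method moves input_ids to .cuda(), and the call goes through the class because the method is declared without self.

from transformers import AutoModelForCausalLM, AutoTokenizer
from nltkor.metrics.classical import DefaultMetric  # import path inferred from the file layout above

# Placeholder Korean causal LM; any model that returns a loss for (input_ids, labels) should work.
model = AutoModelForCausalLM.from_pretrained("skt/kogpt2-base-v2").cuda().eval()
tokenizer = AutoTokenizer.from_pretrained("skt/kogpt2-base-v2")

prompt = "질문: 세종 계획의 목적은?"         # placeholder prompt
candidates = ["후보 답변 1", "후보 답변 2"]   # placeholder answer candidates
label = 0                                     # index of the gold candidate

# accuracy_norm is declared without `self`, so it is called through the class here.
cor, metric_dict = DefaultMetric.accuracy_norm(model, tokenizer, prompt, candidates, label)
print(cor, metric_dict["inference_time"], metric_dict["reserved_memory"])
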
@@ -197,7 +230,7 @@ class DefaultMetric:
 
         return (((precision*recall)/(precision+recall))*2)
 
-
+
 
 
     def pos_eval(self, fin):

{nltkor-1.2.5 → nltkor-1.2.9}/nltkor/misc/string2string_word_embeddings.py
@@ -49,7 +49,7 @@ except ImportError:
     raise Exception(f"""
     Need to install Libraries, please pip install below libraries
     \t pip install torch
-    \t pip install fasttext
+    \t pip install fasttext-wheel
    Or, use pip install requirement.txt
    \t pip install -r {file_path}
    """)

{nltkor-1.2.5 → nltkor-1.2.9}/nltkor/search/faiss_search.py
@@ -33,8 +33,11 @@ SOFTWARE.
 This module contains a wrapper for the Faiss library by Facebook AI Research.
 """
 
+from collections import Counter
 from typing import List, Union, Optional, Dict, Any
 import os
+import copy
+import logging
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 from nltkor.make_requirement import make_requirement
@@ -62,13 +65,27 @@ except ImportError:
 # from nltk.search.kobert_tokenizer import KoBERTTokenizer
 
 
-
-# FAISS library wrapper class
 class FaissSearch:
+    def __new__(cls,
+                mode = None,
+                model_name_or_path: str = 'klue/bert-base',
+                tokenizer_name_or_path: str = 'klue/bert-base',
+                device: str = 'cpu'
+                ) -> None:
+        if mode == 'sentence':
+            return FaissSearch_SenEmbed(model_name_or_path)
+        elif mode == 'word':
+            return FaissSearch_WordEmbed(model_name_or_path)
+        else:
+            raise ValueError("choice 'sentence' or 'word'")
+
+
+# FAISS original library wrapper class
+class FaissSearch_SenEmbed:
     def __init__(self,
                  model_name_or_path: str = 'klue/bert-base',
                  tokenizer_name_or_path: str = 'klue/bert-base',
-                 device: str = 'cpu'
+                 device: str = 'cpu',
                  ) -> None:
         r"""
         This function initializes the wrapper for the FAISS library, which is used to perform semantic search.
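
FaissSearch is now a thin factory: __new__ dispatches on mode and returns either the original sentence-embedding wrapper (renamed FaissSearch_SenEmbed) or the new word-embedding wrapper. A hypothetical instantiation sketch, not part of the diff; the import path is inferred from the module path above, and note that __new__ forwards only model_name_or_path to the selected class:

from nltkor.search.faiss_search import FaissSearch

sentence_searcher = FaissSearch(mode='sentence', model_name_or_path='klue/bert-base')
word_searcher = FaissSearch(mode='word', model_name_or_path='klue/bert-base')
# Any other mode raises ValueError("choice 'sentence' or 'word'").
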
@@ -171,8 +188,6 @@ class FaissSearch:
         return mean_pooling
 
 
-
-
     # Get the embeddings
     def get_embeddings(self,
                        text: Union[str, List[str]],
@@ -369,14 +384,6 @@ class FaissSearch:
         self.embedding_type = embedding_type
 
 
-        # Tokenize the dataset
-        # self.dataset = self.dataset.map(
-        #     lambda x: x[section],
-        #     batched=True,
-        #     batch_size=batch_size,
-        #     num_proc=num_workers,
-        # )
-
         # Map the section of the dataset to the embeddings
         self.dataset = self.dataset.map(
             lambda x: {
@@ -465,3 +472,316 @@ class FaissSearch:
 
         # Return the most similar elements
         return results_df
+
+
+
+
+# FAISS word embedding library wrapper class
+class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
+    def __init__(self,
+                 model_name_or_path: str = 'klue/bert-base',
+                 tokenizer_name_or_path: str = 'klue/bert-base',
+                 device: str = 'cpu',
+                 ) -> None:
+        r"""
+        This function initializes the wrapper for the FAISS library, which is used to perform semantic search.
+
+
+        .. attention::
+
+            * If you use this class, please make sure to cite the following paper:
+
+                .. code-block:: latex
+
+                    @article{johnson2019billion,
+                        title={Billion-scale similarity search with {GPUs}},
+                        author={Johnson, Jeff and Douze, Matthijs and J{\'e}gou, Herv{\'e}},
+                        journal={IEEE Transactions on Big Data},
+                        volume={7},
+                        number={3},
+                        pages={535--547},
+                        year={2019},
+                        publisher={IEEE}
+                    }
+
+            * The code is based on the following GitHub repository:
+                https://github.com/facebookresearch/faiss
+
+        Arguments:
+            model_name_or_path (str, optional): The name or path of the model to use. Defaults to 'facebook/bart-large'.
+            tokenizer_name_or_path (str, optional): The name or path of the tokenizer to use. Defaults to 'facebook/bart-large'.
+            device (str, optional): The device to use. Defaults to 'cpu'.
+
+        Returns:
+            None
+        """
+
+        # Set the device
+        self.device = device
+
+        # If the tokenizer is not specified, use the model name or path
+        if tokenizer_name_or_path is None:
+            tokenizer_name_or_path = model_name_or_path
+
+        # Load the tokenizer
+        if tokenizer_name_or_path == 'skt/kobert-base-v1':
+            # self.tokenizer = KoBERTTokenizer.from_pretrained(tokenizer_name_or_path)
+            self.tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name_or_path)
+        else:
+            self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
+
+        # Load the model
+        self.model = AutoModel.from_pretrained(model_name_or_path).to(self.device)
+
+        # Set the model to evaluation mode (since we do not need the gradients)
+        self.model.eval()
+
+        # Initialize the dataset
+        self.dataset = None
+
+
+
+    # Get the embeddings (new code)
+    def get_doc_embeddings(self,
+                           #text: Union[str, List[str]],
+                           text=None,
+                           embedding_type: str = 'last_hidden_state',
+                           batch_size: int = 8,
+                           num_workers: int = 4,
+                           ) -> torch.Tensor:
+        """
+        This function returns the embeddings of the input text.
+
+        Arguments:
+            text (Union[str, List[str]]): The input text.
+            embedding_type (str, optional): The type of embedding to use. Defaults to 'last_hidden_state'.
+            batch_size (int, optional): The batch size to use. Defaults to 8.
+            num_workers (int, optional): The number of workers to use. Defaults to 4.
+
+        Returns:
+            torch.Tensor: The embeddings.
+
+        Raises:
+            ValueError: If the embedding type is invalid.
+        """
+
+        # Check if the embedding type is valid
+        if embedding_type not in ['last_hidden_state', 'mean_pooling']:
+            raise ValueError(f'Invalid embedding type: {embedding_type}. Only "last_hidden_state" and "mean_pooling" are supported.')
+
+        ids_dict = {}
+        # Tokenize the input text
+        for sentence in text['text']:
+            encoded_text = self.tokenizer(
+                sentence,
+                padding=False,
+                truncation=True,
+                return_tensors='pt',
+                add_special_tokens=False,
+            )
+
+            # Move the input text to the device
+            encoded_text = encoded_text.to(self.device)
+
+            token_ids_list = encoded_text['input_ids'].tolist()
+            token_ids_list = token_ids_list[0]
+            for ids in token_ids_list:
+                if ids not in ids_dict.keys():
+                    ids_dict[ids] = [sentence]
+                else:
+                    if text not in ids_dict[ids]:
+                        ids_dict[ids].append(sentence)
+
+        # Get the embeddings
+        embedding_dict = {}
+        self.model.eval()
+        for key, value in ids_dict.items():
+            embed = self.model(torch.tensor([[key]]), output_hidden_states=True).hidden_states[-1][:,0,:].detach()
+            embedding_dict[embed] = value
+
+        # Return the embeddings
+        return embedding_dict
+
+
+
+    # Get the embeddings (new code)
+    def get_query_embeddings(self,
+                             text: Union[str, List[str]],
+                             embedding_type: str = 'last_hidden_state',
+                             batch_size: int = 8,
+                             num_workers: int = 4,
+                             ) -> torch.Tensor:
+        """
+        This function returns the embeddings of the input text.
+
+        Arguments:
+            text (Union[str, List[str]]): The input text.
+            embedding_type (str, optional): The type of embedding to use. Defaults to 'last_hidden_state'.
+            batch_size (int, optional): The batch size to use. Defaults to 8.
+            num_workers (int, optional): The number of workers to use. Defaults to 4.
+
+        Returns:
+            torch.Tensor: The embeddings.
+
+        Raises:
+            ValueError: If the embedding type is invalid.
+        """
+
+        # Check if the embedding type is valid
+        if embedding_type not in ['last_hidden_state', 'mean_pooling']:
+            raise ValueError(f'Invalid embedding type: {embedding_type}. Only "last_hidden_state" and "mean_pooling" are supported.')
+
+        # Tokenize the input text
+        encoded_text = self.tokenizer(
+            text,
+            padding=False,
+            truncation=True,
+            return_tensors='pt',
+            add_special_tokens=False,
+        )
+
+        # Move the input text to the device
+        encoded_text = encoded_text.to(self.device)
+
+        token_ids_list = encoded_text['input_ids'].tolist()
+        token_ids_list = token_ids_list[0]
+        tensor_list = [torch.tensor([[value]]) for value in token_ids_list]
+
+        # Get the embeddings
+        embeds = []
+        self.model.eval()
+        for index, tensor in enumerate(tensor_list):
+            embed = self.model(tensor, output_hidden_states=True).hidden_states[-1][:,0,:].detach().cpu().numpy()
+            embeds.append(embed)
+
+        # Return the embeddings
+        return embeds
+
+
+
+    # Initialize the corpus using a dictionary or pandas DataFrame or HuggingFace Datasets object
+    def initialize_corpus(self,
+                          corpus: Union[Dict[str, List[str]], pd.DataFrame, Dataset],
+                          section: str = 'text',
+                          index_column_name: str = 'embeddings',
+                          embedding_type: str = 'last_hidden_state',
+                          batch_size: Optional[int] = None,
+                          num_workers: Optional[int] = None,
+                          save_path: Optional[str] = None,
+                          ) -> Dataset:
+        """
+        This function initializes a dataset using a dictionary or pandas DataFrame or HuggingFace Datasets object.
+
+        Arguments:
+            dataset_dict (Dict[str, List[str]]): The dataset dictionary.
+            section (str): The section of the dataset to use whose embeddings will be used for semantic search (e.g., 'text', 'title', etc.) (default: 'text').
+            index_column_name (str): The name of the column containing the embeddings (default: 'embeddings')
+            embedding_type (str): The type of embedding to use (default: 'last_hidden_state').
+            batch_size (int, optional): The batch size to use (default: 8).
+            max_length (int, optional): The maximum length of the input sequences.
+            num_workers (int, optional): The number of workers to use.
+            save_path (Optional[str], optional): The path to save the dataset (default: None).
+
+        Returns:
+            Dataset: The dataset object (HuggingFace Datasets).
+
+        Raises:
+            ValueError: If the dataset is not a dictionary or pandas DataFrame or HuggingFace Datasets object.
+        """
+
+        # corpus = { 'text': [...] } -> form_dict
+
+        # Set the embedding_type
+        self.embedding_type = embedding_type
+
+        # get embedding dict
+        embedding_dict = self.get_doc_embeddings(text=corpus, embedding_type=self.embedding_type)
+
+        data = {
+            'text' : embedding_dict.values(),
+            'embeddings': []
+        }
+
+        for embed in embedding_dict.keys():
+            embed_list = embed.tolist()
+            data['embeddings'].append(embed_list[0])
+
+
+        if isinstance(data, dict):
+            self.dataset = Dataset.from_dict(data)
+        elif isinstance(data, pd.DataFrame):
+            self.dataset = Dataset.from_pandas(data)
+        elif isinstance(data, Dataset):
+            self.dataset = corpus
+        else:
+            raise ValueError('The dataset must be a dictionary or pandas DataFrame.')
+
+        # Save the dataset
+        if save_path is not None:
+            self.dataset.to_json(save_path)
+
+        # Add FAISS index
+        self.add_faiss_index(
+            column_name=index_column_name,
+        )
+
+        # Return the dataset
+        return self.dataset
+
+
+
+    # Search for the most similar elements in the dataset, given a query
+    def search(self,
+               query: str,
+               k: int = 1,
+               index_column_name: str = 'embeddings',
+               ) -> pd.DataFrame:
+        """
+        This function searches for the most similar elements in the dataset, given a query.
+
+        Arguments:
+            query (str): The query.
+            k (int, optional): The number of elements to return (default: 1).
+            index_column_name (str, optional): The name of the column containing the embeddings (default: 'embeddings')
+
+        Returns:
+            pd.DataFrame: The most similar elements in the dataset (text, score, etc.), sorted by score.
+
+        Remarks:
+            The returned elements are dictionaries containing the text and the score.
+        """
+
+
+        # Get the embeddings of the query
+        query_embeddings = self.get_query_embeddings([query], embedding_type=self.embedding_type)
+
+        # query_embedding이랑 self.dataset['embeddings'] 값 비교
+        scores = []
+        similar_elts = []
+        for query in query_embeddings:
+            # Search for the most similar elements in the dataset
+            score, similar_elt = self.dataset.get_nearest_examples(
+                index_name=index_column_name,
+                query=query,
+                k=k,
+            )
+            scores.append(score)
+            similar_elts.append(similar_elt)
+
+        text_list = []
+        for item in similar_elts:
+            for text in item['text']:
+                text_list.append(text)
+
+        flat_list = [sentence for sublist in text_list for sentence in sublist]
+        count = Counter(flat_list)
+        count = dict(count.most_common(5))
+
+        sorted_dict = dict(sorted(count.items(), key=lambda x: x[1], reverse=True))
+
+        # Convert the results to a pandas DataFrame
+        results_df = pd.DataFrame({'text': sorted_dict.keys() , 'freq': sorted_dict.values()})
+
+
+        # Return the most similar elements
+        return results_df
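
The new FaissSearch_WordEmbed indexes one embedding per unique token id and maps each token back to the sentences containing it; search() then retrieves neighbors for every query token and returns the most frequently hit sentences. A hypothetical end-to-end sketch, not part of the diff: the corpus and query strings are placeholders, and initialize_corpus expects a mapping with a 'text' key, as the code above iterates over text['text'].

from nltkor.search.faiss_search import FaissSearch

# mode='word' returns a FaissSearch_WordEmbed instance via the factory shown earlier.
searcher = FaissSearch(mode='word', model_name_or_path='klue/bert-base')

corpus = {'text': ["자연어 처리는 재미있다.", "FAISS는 빠른 유사도 검색을 지원한다."]}  # placeholder sentences
searcher.initialize_corpus(corpus=corpus, embedding_type='last_hidden_state')

results = searcher.search(query="유사도 검색", k=3)
print(results)  # pandas DataFrame with 'text' and 'freq' columns
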

Binary files (the six nltkor/sejong/__pycache__/*.pyc entries listed above) differ; contents not shown.

{nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/config.py
@@ -27,9 +27,10 @@ def get_config_paths(directory):
         ('network_text_pos' , 'pos-network.txt'),
         ('pos_tags' , 'pos-tags.txt'),
         ('pos_tag_dict' , 'pos-tags.txt'),
-        ('pos_co_lexicon' , 'pos-co-lexicon.
-        ('pos_morph_lexicon' , 'pos-morph-lexicon.
-        ('pos_prob_dict' , 'pos-prob-dict.
+        ('pos_co_lexicon' , 'pos-co-lexicon.pickle'),
+        ('pos_morph_lexicon' , 'pos-morph-lexicon.pickle'),
+        ('pos_prob_dict' , 'pos-prob-dict.pickle'),
+        ('pos_morph_lexicon_txt' , 'pos-morph-lexicon.txt'),
         ('suffix' , 'suffixes.txt'),
         ('suffixes' , 'suffixes.txt'),
         ('prefix' , 'prefixes.txt'),