nltkor-1.2.19-cp39-cp39-macosx_10_9_universal2.whl → nltkor-1.2.20-cp39-cp39-macosx_10_9_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nltkor/__init__.py +1 -1
- nltkor/search/faiss_search.py +7 -314
- {nltkor-1.2.19.dist-info → nltkor-1.2.20.dist-info}/METADATA +1 -1
- {nltkor-1.2.19.dist-info → nltkor-1.2.20.dist-info}/RECORD +7 -7
- {nltkor-1.2.19.dist-info → nltkor-1.2.20.dist-info}/WHEEL +0 -0
- {nltkor-1.2.19.dist-info → nltkor-1.2.20.dist-info}/licenses/LICENSE.txt +0 -0
- {nltkor-1.2.19.dist-info → nltkor-1.2.20.dist-info}/top_level.txt +0 -0
nltkor/__init__.py
CHANGED
nltkor/search/faiss_search.py
CHANGED
@@ -68,24 +68,22 @@ except ImportError:
 
 class FaissSearch:
     def __new__(cls,
-                mode =
+                mode = 'dense',
                 model_name_or_path: str = 'klue/bert-base',
                 tokenizer_name_or_path: str = 'klue/bert-base',
                 embedding_type: str = 'last_hidden_state',
                 device: str = 'cpu'
                 ) -> None:
-        if mode == '
-            return
-        elif mode == 'word':
-            return FaissSearch_WordEmbed(model_name_or_path=model_name_or_path, embedding_type=embedding_type)
+        if mode == 'dense':
+            return FaissSearch_Dense(model_name_or_path=model_name_or_path, embedding_type=embedding_type)
         elif mode == 'sparse':
             return FaissSearch_Sparse(model_name_or_path=model_name_or_path, embedding_type=embedding_type)
         else:
-            raise ValueError("choice '
+            raise ValueError("choice 'dense' or 'sparse'.")
 
 
 
-class FaissSearch_SenEmbed:
+class FaissSearch_Dense:
     def __init__(self,
                 model_name_or_path: str = 'klue/bert-base',
                 tokenizer_name_or_path: str = 'klue/bert-base',
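For orientation, a minimal usage sketch of the factory after this change. The constructor keywords are the ones shown in the hunk above; mode='dense' maps to FaissSearch_Dense (renamed from FaissSearch_SenEmbed in this hunk). The initialize_corpus() and search() calls mirror the method signatures that appear later in this diff and are assumed to be available unchanged on FaissSearch_Dense, so treat this as an illustration rather than the package's documented API.

    from nltkor.search.faiss_search import FaissSearch

    # As of 1.2.20, 'dense' and 'sparse' are the only accepted modes;
    # anything else raises ValueError("choice 'dense' or 'sparse'.").
    searcher = FaissSearch(mode='dense',
                           model_name_or_path='klue/bert-base',
                           tokenizer_name_or_path='klue/bert-base',
                           embedding_type='last_hidden_state',
                           device='cpu')

    # Hypothetical toy corpus; the dict-of-lists shape follows the corpus
    # format used elsewhere in this file.
    corpus = {'text': ['위키백과는 누구나 편집할 수 있다.',
                       'FAISS는 벡터 유사도 검색 라이브러리이다.']}
    searcher.initialize_corpus(corpus=corpus, section='text',
                               embedding_type='last_hidden_state')
    print(searcher.search(query='벡터 검색', k=1))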
@@ -474,7 +472,7 @@ class FaissSearch_SenEmbed:
 
 
 
-class FaissSearch_Sparse(FaissSearch_SenEmbed):
+class FaissSearch_Sparse(FaissSearch_Dense):
     def __init__(self,
                 model_name_or_path: str = 'klue/bert-base',
                 tokenizer_name_or_path: str = 'klue/bert-base',
@@ -586,312 +584,7 @@ class FaissSearch_Sparse(FaissSearch_SenEmbed):
         embeddings = embeddings['logits']
 
         embeddings = torch.sum(torch.log(1+torch.relu(embeddings)) * encoded_text['attention_mask'].unsqueeze(-1), dim=1)
-        e_norm = torch.nn.functional.normalize(embeddings, p=2, dim=1, eps=1e-8)
 
         # Return the embeddings
-        return
-
-
-
-# FAISS word embedding library wrapper class
-class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
-    def __init__(self,
-                model_name_or_path: str = 'klue/bert-base',
-                tokenizer_name_or_path: str = 'klue/bert-base',
-                embedding_type: str = 'last_hidden_state',
-                device: str = 'cpu',
-                ) -> None:
-        r"""
-        This function initializes the wrapper for the FAISS library, which is used to perform semantic search.
-
-
-        .. attention::
-
-            * If you use this class, please make sure to cite the following paper:
-
-                .. code-block:: latex
-
-                    @article{johnson2019billion,
-                        title={Billion-scale similarity search with {GPUs}},
-                        author={Johnson, Jeff and Douze, Matthijs and J{\'e}gou, Herv{\'e}},
-                        journal={IEEE Transactions on Big Data},
-                        volume={7},
-                        number={3},
-                        pages={535--547},
-                        year={2019},
-                        publisher={IEEE}
-                    }
-
-            * The code is based on the following GitHub repository:
-                https://github.com/facebookresearch/faiss
-
-        Arguments:
-            model_name_or_path (str, optional): The name or path of the model to use. Defaults to 'facebook/bart-large'.
-            tokenizer_name_or_path (str, optional): The name or path of the tokenizer to use. Defaults to 'facebook/bart-large'.
-            device (str, optional): The device to use. Defaults to 'cpu'.
-
-        Returns:
-            None
-        """
-
-        # Set the device
-        self.device = device
-
-        # If the tokenizer is not specified, use the model name or path
-        if tokenizer_name_or_path is None:
-            tokenizer_name_or_path = model_name_or_path
-
-        # Load the tokenizer
-        if tokenizer_name_or_path == 'skt/kobert-base-v1':
-            # self.tokenizer = KoBERTTokenizer.from_pretrained(tokenizer_name_or_path)
-            self.tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name_or_path)
-        else:
-            self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
-
-        # Load the model
-        self.model = AutoModel.from_pretrained(model_name_or_path).to(self.device)
-
-
-        # Set the model to evaluation mode (since we do not need the gradients)
-        self.model.eval()
-
-        # Initialize the dataset
-        self.dataset = None
-
-
-    # Get the embeddings (new code)
-    def get_doc_embeddings(self,
-                #text: Union[str, List[str]],
-                text=None,
-                embedding_type: str = 'last_hidden_state',
-                batch_size: int = 8,
-                num_workers: int = 4,
-                ) -> torch.Tensor:
-        """
-        This function returns the embeddings of the input text.
-
-        Arguments:
-            text (Union[str, List[str]]): The input text.
-            embedding_type (str, optional): The type of embedding to use. Defaults to 'last_hidden_state'.
-            batch_size (int, optional): The batch size to use. Defaults to 8.
-            num_workers (int, optional): The number of workers to use. Defaults to 4.
-
-        Returns:
-            torch.Tensor: The embeddings.
-
-        Raises:
-            ValueError: If the embedding type is invalid.
-        """
-
-        # Check if the embedding type is valid
-        if embedding_type not in ['last_hidden_state', 'mean_pooling']:
-            raise ValueError(f'Invalid embedding type: {embedding_type}. Only "last_hidden_state" and "mean_pooling" are supported.')
-
-        ids_dict = {}
-        # Tokenize the input text
-        for sentence in text['text']:
-            encoded_text = self.tokenizer(
-                sentence,
-                padding=False,
-                truncation=True,
-                return_tensors='pt',
-                add_special_tokens=False
-            )
-            # Move the input text to the device
-            encoded_text = encoded_text.to(self.device)
-            token_ids_list = encoded_text['input_ids'].tolist()
-            token_ids_list = token_ids_list[0]
-            for ids in token_ids_list:
-                if ids not in ids_dict.keys():
-                    ids_dict[ids] = [sentence]
-                else:
-                    if text not in ids_dict[ids]:
-                        ids_dict[ids].append(sentence)
-        # Get the embeddings
-        embedding_dict = {}
-        self.model.eval()
-        for key, value in ids_dict.items():
-            embed = self.model(torch.tensor([[key]]), output_hidden_states=True).hidden_states[-1][:,0,:].detach()
-            embedding_dict[embed] = value
-
-        # Return the embeddings
-        return embedding_dict
-
-
-    # Get the embeddings (new code)
-    def get_query_embeddings(self,
-                text: Union[str, List[str]],
-                embedding_type: str = 'last_hidden_state',
-                batch_size: int = 8,
-                num_workers: int = 4,
-                ) -> torch.Tensor:
-        """
-        This function returns the embeddings of the input text.
-
-        Arguments:
-            text (Union[str, List[str]]): The input text.
-            embedding_type (str, optional): The type of embedding to use. Defaults to 'last_hidden_state'.
-            batch_size (int, optional): The batch size to use. Defaults to 8.
-            num_workers (int, optional): The number of workers to use. Defaults to 4.
-
-        Returns:
-            torch.Tensor: The embeddings.
-
-        Raises:
-            ValueError: If the embedding type is invalid.
-        """
-
-        # Check if the embedding type is valid
-        if embedding_type not in ['last_hidden_state', 'mean_pooling']:
-            raise ValueError(f'Invalid embedding type: {embedding_type}. Only "last_hidden_state" and "mean_pooling" are supported.')
-
-        # Tokenize the input text
-        encoded_text = self.tokenizer(
-            text,
-            padding=False,
-            truncation=True,
-            return_tensors='pt',
-            add_special_tokens=False,
-        )
-
-        # Move the input text to the device
-        encoded_text = encoded_text.to(self.device)
-
-        token_ids_list = encoded_text['input_ids'].tolist()
-        token_ids_list = token_ids_list[0]
-        tensor_list = [torch.tensor([[value]]) for value in token_ids_list]
-
-        # Get the embeddings
-        embeds = []
-        self.model.eval()
-        for index, tensor in enumerate(tensor_list):
-            embed = self.model(tensor, output_hidden_states=True).hidden_states[-1][:,0,:].detach().cpu().numpy()
-            embeds.append(embed)
-
-        # Return the embeddings
-        return embeds
-
-
-    # Initialize the corpus using a dictionary or pandas DataFrame or HuggingFace Datasets object
-    def initialize_corpus(self,
-                corpus: Union[Dict[str, List[str]], pd.DataFrame, Dataset],
-                section: str = 'text',
-                index_column_name: str = 'embeddings',
-                embedding_type: str = 'last_hidden_state',
-                batch_size: Optional[int] = None,
-                num_workers: Optional[int] = None,
-                save_path: Optional[str] = None,
-                ) -> Dataset:
-        """
-        This function initializes a dataset using a dictionary or pandas DataFrame or HuggingFace Datasets object.
-
-        Arguments:
-            dataset_dict (Dict[str, List[str]]): The dataset dictionary.
-            section (str): The section of the dataset to use whose embeddings will be used for semantic search (e.g., 'text', 'title', etc.) (default: 'text').
-            index_column_name (str): The name of the column containing the embeddings (default: 'embeddings')
-            embedding_type (str): The type of embedding to use (default: 'last_hidden_state').
-            batch_size (int, optional): The batch size to use (default: 8).
-            max_length (int, optional): The maximum length of the input sequences.
-            num_workers (int, optional): The number of workers to use.
-            save_path (Optional[str], optional): The path to save the dataset (default: None).
-
-        Returns:
-            Dataset: The dataset object (HuggingFace Datasets).
-
-        Raises:
-            ValueError: If the dataset is not a dictionary or pandas DataFrame or HuggingFace Datasets object.
-        """
-
-        # corpus = { 'text': [...] } -> form_dict
-
-        # Set the embedding_type
-        self.embedding_type = embedding_type
-
-        # get embedding dict
-        embedding_dict = self.get_doc_embeddings(text=corpus, embedding_type=self.embedding_type)
-
-        data = {
-            'text' : embedding_dict.values(),
-            'embeddings': []
-        }
-
-        for embed in embedding_dict.keys():
-            embed_list = embed.tolist()
-            data['embeddings'].append(embed_list[0])
-
-
-        if isinstance(data, dict):
-            self.dataset = Dataset.from_dict(data)
-        elif isinstance(data, pd.DataFrame):
-            self.dataset = Dataset.from_pandas(data)
-        elif isinstance(data, Dataset):
-            self.dataset = corpus
-        else:
-            raise ValueError('The dataset must be a dictionary or pandas DataFrame.')
-
-        # Save the dataset
-        if save_path is not None:
-            self.dataset.to_json(save_path)
-
-        # Add FAISS index
-        self.add_faiss_index(
-            column_name=index_column_name,
-        )
-
-        # Return the dataset
-        return self.dataset
-
-
-    # Search for the most similar elements in the dataset, given a query
-    def search(self,
-                query: str,
-                k: int = 1,
-                index_column_name: str = 'embeddings',
-                ) -> pd.DataFrame:
-        """
-        This function searches for the most similar elements in the dataset, given a query.
-
-        Arguments:
-            query (str): The query.
-            k (int, optional): The number of elements to return (default: 1).
-            index_column_name (str, optional): The name of the column containing the embeddings (default: 'embeddings')
-
-        Returns:
-            pd.DataFrame: The most similar elements in the dataset (text, score, etc.), sorted by score.
-
-        Remarks:
-            The returned elements are dictionaries containing the text and the score.
-        """
-
-        # Get the embeddings of the query
-        query_embeddings = self.get_query_embeddings([query], embedding_type=self.embedding_type)
-
-        # Compare the query embedding with the values in self.dataset['embeddings']
-        scores = []
-        similar_elts = []
-        for query in query_embeddings:
-            # Search for the most similar elements in the dataset
-            score, similar_elt = self.dataset.get_nearest_examples(
-                index_name=index_column_name,
-                query=query,
-                k=k,
-            )
-            scores.append(score)
-            similar_elts.append(similar_elt)
-
+        return embeddings
 
-        text_list = []
-        for item in similar_elts:
-            for text in item['text']:
-                text_list.append(text)
-
-        flat_list = [sentence for sublist in text_list for sentence in sublist]
-        count = Counter(flat_list)
-        count = dict(count.most_common(5))
-
-        sorted_dict = dict(sorted(count.items(), key=lambda x: x[1], reverse=True))
-        # Convert the results to a pandas DataFrame
-        results_df = pd.DataFrame({'text': sorted_dict.keys() , 'freq': sorted_dict.values()})
-
-        # Return the most similar elements
-        return results_df
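The visible tail of FaissSearch_Sparse's embedding routine keeps the log(1 + ReLU(logits)) pooling, drops the removed L2-normalization line (e_norm), and now returns the raw pooled vector. Below is a self-contained sketch of that pooling step; the tensor shapes and random inputs are purely illustrative stand-ins for the model's logits and the tokenizer's attention mask, not code from the package.

    import torch

    # Dummy stand-ins for embeddings['logits'] and encoded_text['attention_mask']
    # referenced in the hunk above.
    batch, seq_len, vocab = 2, 6, 32000
    logits = torch.randn(batch, seq_len, vocab)
    attention_mask = torch.ones(batch, seq_len)

    # log(1 + relu(logits)), zeroed at padded positions, summed over the sequence axis.
    embeddings = torch.sum(
        torch.log(1 + torch.relu(logits)) * attention_mask.unsqueeze(-1),
        dim=1,
    )

    # Before this release an L2-normalized copy was also computed via
    # torch.nn.functional.normalize(embeddings, p=2, dim=1); as of 1.2.20 the
    # un-normalized pooled vector is what gets returned.
    print(embeddings.shape)  # torch.Size([2, 32000])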
{nltkor-1.2.19.dist-info → nltkor-1.2.20.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nltkor
-Version: 1.2.19
+Version: 1.2.20
 Home-page: https://modi.changwon.ac.kr/air_cwnu/nlp_tool/nltk_ko.git
 Keywords: string matching,pattern matching,edit distance,string to string correction,string to string matching,Levenshtein edit distance,Hamming distance,Damerau-Levenshtein distance,Jaro-Winkler distance,longest common subsequence,longest common substring,dynamic programming,approximate string matching,semantic similarity,natural language processing,NLP,information retrieval,rouge,sacrebleu,bertscore,bartscore,fasttext,glove,cosine similarity,Smith-Waterman,Needleman-Wunsch,Hirschberg,Karp-Rabin,Knuth-Morris-Pratt,Boyer-Moore
 Classifier: Programming Language :: Python :: 3.7
{nltkor-1.2.19.dist-info → nltkor-1.2.20.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
 nltkor/Kor_char.py,sha256=KtixIsoKCtKItbwnZ7ehk47jjXhdvW_luHJBkIUNYUo,4735
-nltkor/__init__.py,sha256=
+nltkor/__init__.py,sha256=_uvNJGh9igxlXMtbwdVt0NNGYQYBgpBeZtsjprllfvs,367
 nltkor/etc.py,sha256=gbu4BZKe_x8g_OcuhqkKu6Z6_J-wQ0xE6pumnl4z3JE,387
 nltkor/lazyimport.py,sha256=GFL1znsYUhV7mzW3U1IUQwvBcRH4d4YMFJNcGQ8eRLc,4561
 nltkor/make_requirement.py,sha256=vvftdbp81DcaKzalNeZ-6AOmzSMveftZuAIar1NSWtE,283
@@ -25,7 +25,7 @@ nltkor/misc/string2string_hash_functions.py,sha256=OrxrqQOGOJy4tjNCiUSwvD1G51AJ2
 nltkor/misc/string2string_word_embeddings.py,sha256=T_GtJMyJsYSY0FRrmg-LzSkfVCTuQzFEqSqV8_P1GNQ,19021
 nltkor/search/__init__.py,sha256=uSR8pxjUQ2gX4dYhr5hN43YiMgtCQdNSDJ-Tgu_aY1w,330
 nltkor/search/classical.py,sha256=su1yyfiWDI-0w5QOnV1n06GWqhZceeiKZqWK_2_ANAs,19547
-nltkor/search/faiss_search.py,sha256=
+nltkor/search/faiss_search.py,sha256=3kBC-QZoyNfx47iCMeE_2GTqJU8NKbwXqF2pbGZ_Vwk,20453
 nltkor/search/kobert_tokenizer.py,sha256=vUrOsrbwZKV7TBRqut6P2P8j4XGcZZfHtgus3xdAMGE,6929
 nltkor/search/test.py,sha256=12kcEeNhKmJrtFnao2OrNywMVZZmBt1SvAQkRz4W09s,696
 nltkor/search/trie_search.py,sha256=Q94_Iiig4CbAhQCGNmD0tN-575TpYMMJh1Jkakl4rO4,3031
@@ -121,8 +121,8 @@ nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc,sha256=0_BAk6rs2fEkzcb
 nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc,sha256=z3eOoU5yC3ON5Vkrld_SL07KEWxmcUkj1lejSkCy_ZI,2474
 nltkor/tokenize/__init__.py,sha256=ZdWHiwNiIPUFdxZV1Yk1nVSjnC2xUZ6lfCMAtswimFQ,2613
 nltkor/tokenize/ko_tokenize.py,sha256=T6IlXQXOEwa15TopybcRc9wLZfypLT2aAU_5CXhWuh4,3853
-nltkor-1.2.
-nltkor-1.2.
-nltkor-1.2.
-nltkor-1.2.
-nltkor-1.2.
+nltkor-1.2.20.dist-info/licenses/LICENSE.txt,sha256=c7URrdgMRPTfDHJt9SGSSOhqXOLzVQ_VdTh8Du2kMAY,58937
+nltkor-1.2.20.dist-info/METADATA,sha256=MvtK-iBwQbr-aAqp4de_-ySnMZ-S3jW3aTz0wzJFJ_M,1733
+nltkor-1.2.20.dist-info/WHEEL,sha256=FZpotpoE2pJWvPd4MXRrq-vJvPwnPHGw8XNlxl6WcTI,112
+nltkor-1.2.20.dist-info/top_level.txt,sha256=XbFtt4S9DLUdj3lThO7ro_RyJnAobZaMFpAQpD3yEmQ,7
+nltkor-1.2.20.dist-info/RECORD,,
{nltkor-1.2.19.dist-info → nltkor-1.2.20.dist-info}/WHEEL
File without changes
{nltkor-1.2.19.dist-info → nltkor-1.2.20.dist-info}/licenses/LICENSE.txt
File without changes
{nltkor-1.2.19.dist-info → nltkor-1.2.20.dist-info}/top_level.txt
File without changes