nltkor-1.2.18-cp39-cp39-macosx_10_9_universal2.whl → nltkor-1.2.20-cp39-cp39-macosx_10_9_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
nltkor/__init__.py CHANGED
@@ -13,4 +13,4 @@ from nltkor import trans
  from nltkor import Kor_char
  from nltkor import etc
 
- __version__ = '1.2.18'
+ __version__ = '1.2.20'
nltkor/search/__init__.py CHANGED
@@ -8,4 +8,4 @@ from .classical import (
  )
  from .faiss_search import FaissSearch
  from .kobert_tokenizer import KoBERTTokenizer
- from .search_dict import SearchDic
+ from .trie_search import TRIESearch
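Aside (illustrative, not part of the diff): after this rename, downstream code imports TRIESearch instead of SearchDic. A minimal sketch, assuming nltkor.search re-exports both names as the hunk above shows:

    # Illustrative imports only; SearchDic is no longer exported as of 1.2.20.
    from nltkor.search import FaissSearch, TRIESearch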
@@ -68,24 +68,22 @@ except ImportError:
 
  class FaissSearch:
  def __new__(cls,
- mode = None,
+ mode = 'dense',
  model_name_or_path: str = 'klue/bert-base',
  tokenizer_name_or_path: str = 'klue/bert-base',
  embedding_type: str = 'last_hidden_state',
  device: str = 'cpu'
  ) -> None:
- if mode == 'sentence':
- return FaissSearch_SenEmbed(model_name_or_path=model_name_or_path, embedding_type=embedding_type)
- elif mode == 'word':
- return FaissSearch_WordEmbed(model_name_or_path=model_name_or_path, embedding_type=embedding_type)
+ if mode == 'dense':
+ return FaissSearch_Dense(model_name_or_path=model_name_or_path, embedding_type=embedding_type)
  elif mode == 'sparse':
  return FaissSearch_Sparse(model_name_or_path=model_name_or_path, embedding_type=embedding_type)
  else:
- raise ValueError("choice 'sentence' or 'word' or 'sparse'")
+ raise ValueError("choice 'dense' or 'sparse'.")
 
 
 
- class FaissSearch_SenEmbed:
+ class FaissSearch_Dense:
  def __init__(self,
  model_name_or_path: str = 'klue/bert-base',
  tokenizer_name_or_path: str = 'klue/bert-base',
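Usage sketch (illustrative, not part of the diff): in 1.2.20 the FaissSearch factory dispatches on mode='dense' (now the default) or mode='sparse'; the former 'sentence' and 'word' modes are gone, and any other value raises ValueError. Per the hunk, only model_name_or_path and embedding_type are forwarded to the returned class. The arguments below are the defaults visible above:

    from nltkor.search import FaissSearch

    # Returns a FaissSearch_Dense instance; omitting mode has the same effect.
    dense_search = FaissSearch(mode='dense',
                               model_name_or_path='klue/bert-base',
                               embedding_type='last_hidden_state')

    # Returns a FaissSearch_Sparse instance.
    sparse_search = FaissSearch(mode='sparse')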
@@ -474,7 +472,7 @@ class FaissSearch_SenEmbed:
 
 
 
- class FaissSearch_Sparse(FaissSearch_SenEmbed):
+ class FaissSearch_Sparse(FaissSearch_Dense):
  def __init__(self,
  model_name_or_path: str = 'klue/bert-base',
  tokenizer_name_or_path: str = 'klue/bert-base',
@@ -586,312 +584,7 @@ class FaissSearch_Sparse(FaissSearch_SenEmbed):
  embeddings = embeddings['logits']
 
  embeddings = torch.sum(torch.log(1+torch.relu(embeddings)) * encoded_text['attention_mask'].unsqueeze(-1), dim=1)
- e_norm = torch.nn.functional.normalize(embeddings, p=2, dim=1, eps=1e-8)
 
  # Return the embeddings
- return e_norm
-
-
-
- # FAISS word embedding library wrapper class
- class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
- def __init__(self,
- model_name_or_path: str = 'klue/bert-base',
- tokenizer_name_or_path: str = 'klue/bert-base',
- embedding_type: str = 'last_hidden_state',
- device: str = 'cpu',
- ) -> None:
- r"""
- This function initializes the wrapper for the FAISS library, which is used to perform semantic search.
-
-
- .. attention::
-
- * If you use this class, please make sure to cite the following paper:
-
- .. code-block:: latex
-
- @article{johnson2019billion,
- title={Billion-scale similarity search with {GPUs}},
- author={Johnson, Jeff and Douze, Matthijs and J{\'e}gou, Herv{\'e}},
- journal={IEEE Transactions on Big Data},
- volume={7},
- number={3},
- pages={535--547},
- year={2019},
- publisher={IEEE}
- }
-
- * The code is based on the following GitHub repository:
- https://github.com/facebookresearch/faiss
-
- Arguments:
- model_name_or_path (str, optional): The name or path of the model to use. Defaults to 'facebook/bart-large'.
- tokenizer_name_or_path (str, optional): The name or path of the tokenizer to use. Defaults to 'facebook/bart-large'.
- device (str, optional): The device to use. Defaults to 'cpu'.
-
- Returns:
- None
- """
-
- # Set the device
- self.device = device
-
- # If the tokenizer is not specified, use the model name or path
- if tokenizer_name_or_path is None:
- tokenizer_name_or_path = model_name_or_path
-
- # Load the tokenizer
- if tokenizer_name_or_path == 'skt/kobert-base-v1':
- # self.tokenizer = KoBERTTokenizer.from_pretrained(tokenizer_name_or_path)
- self.tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name_or_path)
- else:
- self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
-
- # Load the model
- self.model = AutoModel.from_pretrained(model_name_or_path).to(self.device)
-
-
- # Set the model to evaluation mode (since we do not need the gradients)
- self.model.eval()
-
- # Initialize the dataset
- self.dataset = None
-
-
- # Get the embeddings (new code)
- def get_doc_embeddings(self,
- #text: Union[str, List[str]],
- text=None,
- embedding_type: str = 'last_hidden_state',
- batch_size: int = 8,
- num_workers: int = 4,
- ) -> torch.Tensor:
- """
- This function returns the embeddings of the input text.
-
- Arguments:
- text (Union[str, List[str]]): The input text.
- embedding_type (str, optional): The type of embedding to use. Defaults to 'last_hidden_state'.
- batch_size (int, optional): The batch size to use. Defaults to 8.
- num_workers (int, optional): The number of workers to use. Defaults to 4.
-
- Returns:
- torch.Tensor: The embeddings.
-
- Raises:
- ValueError: If the embedding type is invalid.
- """
-
- # Check if the embedding type is valid
- if embedding_type not in ['last_hidden_state', 'mean_pooling']:
- raise ValueError(f'Invalid embedding type: {embedding_type}. Only "last_hidden_state" and "mean_pooling" are supported.')
-
- ids_dict = {}
- # Tokenize the input text
- for sentence in text['text']:
- encoded_text = self.tokenizer(
- sentence,
- padding=False,
- truncation=True,
- return_tensors='pt',
- add_special_tokens=False
- )
- # Move the input text to the device
- encoded_text = encoded_text.to(self.device)
- token_ids_list = encoded_text['input_ids'].tolist()
- token_ids_list = token_ids_list[0]
- for ids in token_ids_list:
- if ids not in ids_dict.keys():
- ids_dict[ids] = [sentence]
- else:
- if text not in ids_dict[ids]:
- ids_dict[ids].append(sentence)
- # Get the embeddings
- embedding_dict = {}
- self.model.eval()
- for key, value in ids_dict.items():
- embed = self.model(torch.tensor([[key]]), output_hidden_states=True).hidden_states[-1][:,0,:].detach()
- embedding_dict[embed] = value
-
- # Return the embeddings
- return embedding_dict
-
-
- # Get the embeddings (new code)
- def get_query_embeddings(self,
- text: Union[str, List[str]],
- embedding_type: str = 'last_hidden_state',
- batch_size: int = 8,
- num_workers: int = 4,
- ) -> torch.Tensor:
- """
- This function returns the embeddings of the input text.
-
- Arguments:
- text (Union[str, List[str]]): The input text.
- embedding_type (str, optional): The type of embedding to use. Defaults to 'last_hidden_state'.
- batch_size (int, optional): The batch size to use. Defaults to 8.
- num_workers (int, optional): The number of workers to use. Defaults to 4.
-
- Returns:
- torch.Tensor: The embeddings.
-
- Raises:
- ValueError: If the embedding type is invalid.
- """
-
- # Check if the embedding type is valid
- if embedding_type not in ['last_hidden_state', 'mean_pooling']:
- raise ValueError(f'Invalid embedding type: {embedding_type}. Only "last_hidden_state" and "mean_pooling" are supported.')
-
- # Tokenize the input text
- encoded_text = self.tokenizer(
- text,
- padding=False,
- truncation=True,
- return_tensors='pt',
- add_special_tokens=False,
- )
-
- # Move the input text to the device
- encoded_text = encoded_text.to(self.device)
-
- token_ids_list = encoded_text['input_ids'].tolist()
- token_ids_list = token_ids_list[0]
- tensor_list = [torch.tensor([[value]]) for value in token_ids_list]
-
- # Get the embeddings
- embeds = []
- self.model.eval()
- for index, tensor in enumerate(tensor_list):
- embed = self.model(tensor, output_hidden_states=True).hidden_states[-1][:,0,:].detach().cpu().numpy()
- embeds.append(embed)
-
- # Return the embeddings
- return embeds
-
-
- # Initialize the corpus using a dictionary or pandas DataFrame or HuggingFace Datasets object
- def initialize_corpus(self,
- corpus: Union[Dict[str, List[str]], pd.DataFrame, Dataset],
- section: str = 'text',
- index_column_name: str = 'embeddings',
- embedding_type: str = 'last_hidden_state',
- batch_size: Optional[int] = None,
- num_workers: Optional[int] = None,
- save_path: Optional[str] = None,
- ) -> Dataset:
- """
- This function initializes a dataset using a dictionary or pandas DataFrame or HuggingFace Datasets object.
-
- Arguments:
- dataset_dict (Dict[str, List[str]]): The dataset dictionary.
- section (str): The section of the dataset to use whose embeddings will be used for semantic search (e.g., 'text', 'title', etc.) (default: 'text').
- index_column_name (str): The name of the column containing the embeddings (default: 'embeddings')
- embedding_type (str): The type of embedding to use (default: 'last_hidden_state').
- batch_size (int, optional): The batch size to use (default: 8).
- max_length (int, optional): The maximum length of the input sequences.
- num_workers (int, optional): The number of workers to use.
- save_path (Optional[str], optional): The path to save the dataset (default: None).
-
- Returns:
- Dataset: The dataset object (HuggingFace Datasets).
-
- Raises:
- ValueError: If the dataset is not a dictionary or pandas DataFrame or HuggingFace Datasets object.
- """
-
- # corpus = { 'text': [...] } -> form_dict
-
- # Set the embedding_type
- self.embedding_type = embedding_type
-
- # get embedding dict
- embedding_dict = self.get_doc_embeddings(text=corpus, embedding_type=self.embedding_type)
-
- data = {
- 'text' : embedding_dict.values(),
- 'embeddings': []
- }
-
- for embed in embedding_dict.keys():
- embed_list = embed.tolist()
- data['embeddings'].append(embed_list[0])
-
-
- if isinstance(data, dict):
- self.dataset = Dataset.from_dict(data)
- elif isinstance(data, pd.DataFrame):
- self.dataset = Dataset.from_pandas(data)
- elif isinstance(data, Dataset):
- self.dataset = corpus
- else:
- raise ValueError('The dataset must be a dictionary or pandas DataFrame.')
-
- # Save the dataset
- if save_path is not None:
- self.dataset.to_json(save_path)
-
- # Add FAISS index
- self.add_faiss_index(
- column_name=index_column_name,
- )
-
- # Return the dataset
- return self.dataset
-
-
- # Search for the most similar elements in the dataset, given a query
- def search(self,
- query: str,
- k: int = 1,
- index_column_name: str = 'embeddings',
- ) -> pd.DataFrame:
- """
- This function searches for the most similar elements in the dataset, given a query.
-
- Arguments:
- query (str): The query.
- k (int, optional): The number of elements to return (default: 1).
- index_column_name (str, optional): The name of the column containing the embeddings (default: 'embeddings')
-
- Returns:
- pd.DataFrame: The most similar elements in the dataset (text, score, etc.), sorted by score.
-
- Remarks:
- The returned elements are dictionaries containing the text and the score.
- """
-
- # Get the embeddings of the query
- query_embeddings = self.get_query_embeddings([query], embedding_type=self.embedding_type)
-
- # query_embedding이랑 self.dataset['embeddings'] 값 비교
- scores = []
- similar_elts = []
- for query in query_embeddings:
- # Search for the most similar elements in the dataset
- score, similar_elt = self.dataset.get_nearest_examples(
- index_name=index_column_name,
- query=query,
- k=k,
- )
- scores.append(score)
- similar_elts.append(similar_elt)
-
+ return embeddings
 
- text_list = []
- for item in similar_elts:
- for text in item['text']:
- text_list.append(text)
-
- flat_list = [sentence for sublist in text_list for sentence in sublist]
- count = Counter(flat_list)
- count = dict(count.most_common(5))
-
- sorted_dict = dict(sorted(count.items(), key=lambda x: x[1], reverse=True))
- # Convert the results to a pandas DataFrame
- results_df = pd.DataFrame({'text': sorted_dict.keys() , 'freq': sorted_dict.values()})
-
- # Return the most similar elements
- return results_df
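Aside (illustrative, not part of the diff): the sparse path kept above pools MLM logits with a log-saturated ReLU weighted by the attention mask (the torch.sum(...) context line in this hunk), and the L2 normalization that followed it has been dropped, so the method now returns the raw pooled vector (`+ return embeddings`). A minimal standalone sketch of that pooling, with tensor shapes noted only for illustration:

    import torch

    def sparse_pool(logits: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        # logits: (batch, seq_len, vocab_size) MLM logits; attention_mask: (batch, seq_len).
        weights = torch.log(1 + torch.relu(logits))          # log-saturated activations
        weights = weights * attention_mask.unsqueeze(-1)     # zero out padded positions
        return torch.sum(weights, dim=1)                     # (batch, vocab_size); no L2 norm as of 1.2.20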
nltkor/search/test.py ADDED
@@ -0,0 +1,25 @@
+ from trie_search import TRIESearch
+
+ root = {}
+ dict_file = '/Users/chanhyeok/Downloads/lexicon.txt'
+ sc = TRIESearch(root)
+ with open(dict_file, 'r') as f:
+ for line in f:
+ if ';;' in line[:2]: continue
+ k, v = line.strip().split('\t')
+ sc.build_trie_search(k, v)
+ # print(root)
+ word = '고용 노동부'
+ values, value_data = sc.trie_search(word, True)
+ print(values, value_data)
+
+ word = '2시뉴스외전'
+ values, value_data = sc.trie_search( word, True)
+ print(values, value_data)
+ word = '2시 뉴스외전'
+ values, value_data = sc.trie_search( word, True)
+ print(values, value_data)
+
+ word = 'gbc'
+ values, value_data = sc.trie_search( word, True)
+ print(values, value_data)
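Aside (illustrative, not part of the diff): the script above expects lexicon.txt to contain tab-separated key/value lines, with lines starting with ';;' skipped as comments; both conventions are visible in the parsing loop. A self-contained sketch with hypothetical entries, purely to show the expected file shape:

    # Hypothetical lexicon entries; the real lexicon.txt contents are not part of this diff.
    sample_lines = [
        ";; comment lines begin with ';;' and are skipped",
        "고용 노동부\tNNP",    # surface form <tab> value (the value 'NNP' is a made-up example)
        "2시뉴스외전\tNNP",
    ]
    for line in sample_lines:
        if ';;' in line[:2]:
            continue
        k, v = line.strip().split('\t')
        print(k, v)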
nltkor/search/search_dict.py → nltkor/search/trie_search.py RENAMED
@@ -4,11 +4,11 @@ import numpy as np
  import json
  import argparse
 
- class SearchDic :
+ class TRIESearch :
  def __init__ (self,root) :
  self.root = root
 
- def build_search_dict(self, word, data) -> dict:
+ def build_trie_search(self, word, data) -> dict:
  current_dict = self.root
  _end_word_ = '$$'
  for letter in word:
@@ -19,7 +19,7 @@ class SearchDic :
 
 
 
- def search_dict(self, word, space_flag=False):
+ def trie_search(self, word, space_flag=False):
  '''
  TRIE 탐색
  space_flag: if True then including space, otherwise do not including space
@@ -69,27 +69,27 @@ class SearchDic :
  return pickle.load(f)
  if __name__ == "__main__":
  root = {}
- dict_file = '텍스트파일경로'
- sc = SearchDic(root)
+ dict_file = '텍스트파일 경로'
+ sc = TRIESearch(root)
  with open(dict_file, 'r') as f:
  for line in f:
  if ';;' in line[:2]: continue
  k, v = line.strip().split('\t')
- sc.build_search_dict(k, v)
+ sc.build_trie_search(k, v)
  # print(root)
  word = '고용 노동부'
- values, value_data = sc.search_dict(word, True)
+ values, value_data = sc.trie_search(word, True)
  print(values, value_data)
 
  word = '2시뉴스외전'
- values, value_data = sc.search_dict( word, True)
+ values, value_data = sc.trie_search( word, True)
  print(values, value_data)
  word = '2시 뉴스외전'
- values, value_data = sc.search_dict( word, True)
+ values, value_data = sc.trie_search( word, True)
  print(values, value_data)
 
  word = 'gbc'
- values, value_data = sc.search_dict( word, True)
+ values, value_data = sc.trie_search( word, True)
  print(values, value_data)
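Aside (illustrative, not part of the diff): TRIESearch stores its dictionary as a plain nested dict keyed character by character, with the '$$' sentinel declared in the hunk above marking the end of a stored word. The loop body past `for letter in word:` falls outside the hunk, so the exact placement of the stored value is an assumption; a rough sketch of the intended shape:

    # Assumed trie shape after sc.build_trie_search('고용', 'NNG') on an empty root.
    # Both the nesting and the value under '$$' are inferred; the value 'NNG' is a made-up example.
    root = {'고': {'용': {'$$': 'NNG'}}}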