nltkor-1.2.18-cp39-cp39-macosx_10_9_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. nltkor/Kor_char.py +193 -0
  2. nltkor/__init__.py +16 -0
  3. nltkor/alignment/__init__.py +1315 -0
  4. nltkor/cider/__init__.py +2 -0
  5. nltkor/cider/cider.py +55 -0
  6. nltkor/cider/cider_scorer.py +207 -0
  7. nltkor/distance/__init__.py +441 -0
  8. nltkor/distance/wasserstein.py +126 -0
  9. nltkor/etc.py +22 -0
  10. nltkor/lazyimport.py +144 -0
  11. nltkor/make_requirement.py +11 -0
  12. nltkor/metrics/__init__.py +62 -0
  13. nltkor/metrics/bartscore.py +301 -0
  14. nltkor/metrics/bertscore.py +331 -0
  15. nltkor/metrics/classical.py +859 -0
  16. nltkor/metrics/entment.py +24 -0
  17. nltkor/metrics/eval.py +517 -0
  18. nltkor/metrics/mauve.py +273 -0
  19. nltkor/metrics/mauve_utils.py +131 -0
  20. nltkor/misc/__init__.py +11 -0
  21. nltkor/misc/string2string_basic_functions.py +59 -0
  22. nltkor/misc/string2string_default_tokenizer.py +83 -0
  23. nltkor/misc/string2string_hash_functions.py +159 -0
  24. nltkor/misc/string2string_word_embeddings.py +503 -0
  25. nltkor/search/__init__.py +11 -0
  26. nltkor/search/classical.py +569 -0
  27. nltkor/search/faiss_search.py +897 -0
  28. nltkor/search/kobert_tokenizer.py +181 -0
  29. nltkor/search/search_dict.py +95 -0
  30. nltkor/sejong/__init__.py +3 -0
  31. nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
  32. nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
  33. nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
  34. nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
  35. nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
  36. nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
  37. nltkor/sejong/ch.py +12 -0
  38. nltkor/sejong/dict_semClassNum.txt +491 -0
  39. nltkor/sejong/layer.txt +630 -0
  40. nltkor/sejong/sejong_download.py +87 -0
  41. nltkor/sejong/ssem.py +684 -0
  42. nltkor/similarity/__init__.py +3 -0
  43. nltkor/similarity/bartscore____.py +337 -0
  44. nltkor/similarity/bertscore____.py +339 -0
  45. nltkor/similarity/classical.py +245 -0
  46. nltkor/similarity/cosine_similarity.py +175 -0
  47. nltkor/tag/__init__.py +71 -0
  48. nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
  49. nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
  50. nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
  51. nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
  52. nltkor/tag/espresso_tag.py +220 -0
  53. nltkor/tag/libs/__init__.py +10 -0
  54. nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
  55. nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
  56. nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
  57. nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
  58. nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
  59. nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
  60. nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
  61. nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
  62. nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
  63. nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
  64. nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
  65. nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
  66. nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
  67. nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
  68. nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
  69. nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
  70. nltkor/tag/libs/arguments.py +280 -0
  71. nltkor/tag/libs/attributes.py +231 -0
  72. nltkor/tag/libs/config.py +159 -0
  73. nltkor/tag/libs/metadata.py +129 -0
  74. nltkor/tag/libs/ner/__init__.py +2 -0
  75. nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
  76. nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
  77. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
  78. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
  79. nltkor/tag/libs/ner/macmorphoreader.py +7 -0
  80. nltkor/tag/libs/ner/ner_reader.py +92 -0
  81. nltkor/tag/libs/network.c +68949 -0
  82. nltkor/tag/libs/network.cpython-39-darwin.so +0 -0
  83. nltkor/tag/libs/network.pyx +878 -0
  84. nltkor/tag/libs/networkconv.pyx +1028 -0
  85. nltkor/tag/libs/networkdependencyconv.pyx +451 -0
  86. nltkor/tag/libs/parse/__init__.py +1 -0
  87. nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
  88. nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
  89. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
  90. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
  91. nltkor/tag/libs/parse/parse_reader.py +283 -0
  92. nltkor/tag/libs/pos/__init__.py +2 -0
  93. nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
  94. nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
  95. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
  96. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
  97. nltkor/tag/libs/pos/macmorphoreader.py +7 -0
  98. nltkor/tag/libs/pos/pos_reader.py +97 -0
  99. nltkor/tag/libs/reader.py +485 -0
  100. nltkor/tag/libs/srl/__init__.py +3 -0
  101. nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
  102. nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
  103. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
  104. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
  105. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
  106. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
  107. nltkor/tag/libs/srl/__srl_reader_.py +535 -0
  108. nltkor/tag/libs/srl/srl_reader.py +436 -0
  109. nltkor/tag/libs/srl/train_srl.py +87 -0
  110. nltkor/tag/libs/taggers.py +926 -0
  111. nltkor/tag/libs/utils.py +384 -0
  112. nltkor/tag/libs/word_dictionary.py +239 -0
  113. nltkor/tag/libs/wsd/__init__.py +2 -0
  114. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
  115. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
  116. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
  117. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
  118. nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
  119. nltkor/tag/libs/wsd/wsd_reader.py +93 -0
  120. nltkor/tokenize/__init__.py +62 -0
  121. nltkor/tokenize/ko_tokenize.py +115 -0
  122. nltkor/trans.py +121 -0
  123. nltkor-1.2.18.dist-info/LICENSE.txt +1093 -0
  124. nltkor-1.2.18.dist-info/METADATA +42 -0
  125. nltkor-1.2.18.dist-info/RECORD +127 -0
  126. nltkor-1.2.18.dist-info/WHEEL +5 -0
  127. nltkor-1.2.18.dist-info/top_level.txt +1 -0
@@ -0,0 +1,897 @@ nltkor/search/faiss_search.py (new file; all 897 lines added)
"""
string2string search
src = https://github.com/stanfordnlp/string2string


MIT License

Copyright (c) 2023 Mirac Suzgun

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""


"""
This module contains a wrapper for the Faiss library by Facebook AI Research.
"""

from collections import Counter
from typing import List, Union, Optional, Dict, Any
import os
import copy
import logging
import transformers
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from nltkor.make_requirement import make_requirement
try:
    import torch
    from transformers import AutoTokenizer, AutoModel, XLNetTokenizer
    import pandas as pd
    from datasets import Dataset
    # import protobuf
except ImportError:
    requirements = ['torch', 'transformers>=4.8.2', 'pandas', 'datasets', 'protobuf', 'sentencepiece']
    file_path = make_requirement(requirements)
    raise Exception(f"""
    The following libraries need to be installed; please install them with pip:
    \t pip install transformers>=4.8.2
    \t pip install torch
    \t pip install pandas
    \t pip install datasets
    \t pip install protobuf
    \t pip install sentencepiece
    Or install them all from the generated requirements file:
    \t pip install -r {file_path}
    """)

# from nltk.search.kobert_tokenizer import KoBERTTokenizer


class FaissSearch:
    def __new__(cls,
                mode=None,
                model_name_or_path: str = 'klue/bert-base',
                tokenizer_name_or_path: str = 'klue/bert-base',
                embedding_type: str = 'last_hidden_state',
                device: str = 'cpu'
                ):
        if mode == 'sentence':
            return FaissSearch_SenEmbed(model_name_or_path=model_name_or_path, embedding_type=embedding_type)
        elif mode == 'word':
            return FaissSearch_WordEmbed(model_name_or_path=model_name_or_path, embedding_type=embedding_type)
        elif mode == 'sparse':
            return FaissSearch_Sparse(model_name_or_path=model_name_or_path, embedding_type=embedding_type)
        else:
            raise ValueError("mode must be one of 'sentence', 'word', or 'sparse'")


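# A minimal usage sketch of the factory above (not from the package source; the model
# name shown is simply the package default). `FaissSearch(mode=...)` dispatches to one
# of the three concrete classes defined below.
#
#     from nltkor.search.faiss_search import FaissSearch
#
#     searcher = FaissSearch(mode='sentence', model_name_or_path='klue/bert-base')
#     # mode='word'   -> FaissSearch_WordEmbed
#     # mode='sparse' -> FaissSearch_Sparse
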
class FaissSearch_SenEmbed:
    def __init__(self,
                 model_name_or_path: str = 'klue/bert-base',
                 tokenizer_name_or_path: str = 'klue/bert-base',
                 embedding_type: str = 'last_hidden_state',
                 device: str = 'cpu',
                 ) -> None:
        r"""
        This function initializes the wrapper for the FAISS library, which is used to perform semantic search.


        .. attention::

            * If you use this class, please make sure to cite the following paper:

            .. code-block:: latex

                @article{johnson2019billion,
                    title={Billion-scale similarity search with {GPUs}},
                    author={Johnson, Jeff and Douze, Matthijs and J{\'e}gou, Herv{\'e}},
                    journal={IEEE Transactions on Big Data},
                    volume={7},
                    number={3},
                    pages={535--547},
                    year={2019},
                    publisher={IEEE}
                }

            * The code is based on the following GitHub repository:
                https://github.com/facebookresearch/faiss

        Arguments:
            model_name_or_path (str, optional): The name or path of the model to use. Defaults to 'klue/bert-base'.
            tokenizer_name_or_path (str, optional): The name or path of the tokenizer to use. Defaults to 'klue/bert-base'.
            embedding_type (str, optional): The type of embedding to use ('last_hidden_state' or 'mean_pooling'). Defaults to 'last_hidden_state'.
            device (str, optional): The device to use. Defaults to 'cpu'.

        Returns:
            None
        """

        # Set the device
        self.device = device

        # If the tokenizer is not specified, use the model name or path
        if tokenizer_name_or_path is None:
            tokenizer_name_or_path = model_name_or_path

        # Load the tokenizer
        if tokenizer_name_or_path == 'skt/kobert-base-v1':
            # self.tokenizer = KoBERTTokenizer.from_pretrained(tokenizer_name_or_path)
            self.tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name_or_path)
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)

        # Load the model
        self.model = AutoModel.from_pretrained(model_name_or_path).to(self.device)

        # Set the model to evaluation mode (since we do not need the gradients)
        self.model.eval()

        # Initialize the dataset
        self.dataset = None


    # Auxiliary function to get the last hidden state
    def get_last_hidden_state(self,
                              embeddings: torch.Tensor,
                              ) -> torch.Tensor:
        """
        This function returns the last hidden state (e.g., [CLS] token's) of the input embeddings.

        Arguments:
            embeddings (torch.Tensor): The input embeddings.

        Returns:
            torch.Tensor: The last hidden state.
        """

        # Get the last hidden state
        last_hidden_state = embeddings.last_hidden_state

        # Return the last hidden state
        return last_hidden_state[:, 0, :]


    # Auxiliary function to get the mean pooling
    def get_mean_pooling(self,
                         embeddings: torch.Tensor,
                         ) -> torch.Tensor:
        """
        This function returns the mean pooling of the input embeddings.

        Arguments:
            embeddings (torch.Tensor): The input embeddings.

        Returns:
            torch.Tensor: The mean pooling.
        """

        # Get the mean pooling
        mean_pooling = embeddings.last_hidden_state.mean(dim=1)

        # Return the mean pooling
        return mean_pooling


    # Get the embeddings
    def get_embeddings(self,
                       text: Union[str, List[str]],
                       embedding_type: str = 'last_hidden_state',
                       batch_size: int = 8,
                       num_workers: int = 4,
                       ) -> torch.Tensor:
        """
        This function returns the embeddings of the input text.

        Arguments:
            text (Union[str, List[str]]): The input text.
            embedding_type (str, optional): The type of embedding to use. Defaults to 'last_hidden_state'.
            batch_size (int, optional): The batch size to use. Defaults to 8.
            num_workers (int, optional): The number of workers to use. Defaults to 4.

        Returns:
            torch.Tensor: The embeddings.

        Raises:
            ValueError: If the embedding type is invalid.
        """

        # Check if the embedding type is valid
        if embedding_type not in ['last_hidden_state', 'mean_pooling']:
            raise ValueError(f'Invalid embedding type: {embedding_type}. Only "last_hidden_state" and "mean_pooling" are supported.')

        # Tokenize the input text
        encoded_text = self.tokenizer(
            text,
            padding=True,
            truncation=True,
            return_tensors='pt',
        )

        # Move the input text to the device
        encoded_text = encoded_text.to(self.device)

        # encoded_inputs = {k: v.to(self.device) for k, v in encoded_inputs.items()}

        # Get the embeddings
        with torch.no_grad():
            embeddings = self.model(encoded_text['input_ids'])

        # Get the proper embedding type
        if embedding_type == 'last_hidden_state':
            # Get the last hidden state
            embeddings = self.get_last_hidden_state(embeddings)
        elif embedding_type == 'mean_pooling':
            # Get the mean pooling
            embeddings = self.get_mean_pooling(embeddings)

        # Return the embeddings
        return embeddings

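    # Shape sketch for get_embeddings above (not from the package source; the hidden
    # size assumes klue/bert-base, i.e. 768), for a batch of N input sentences:
    #
    #     model(...).last_hidden_state : (N, seq_len, 768)
    #     'last_hidden_state' mode     : (N, 768)   # [CLS] vector, last_hidden_state[:, 0, :]
    #     'mean_pooling' mode          : (N, 768)   # mean over the seq_len axis
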

    # Add FAISS index
    def add_faiss_index(self,
                        column_name: str = 'embeddings',
                        metric_type: Optional[int] = None,
                        batch_size: int = 8,
                        **kwargs,
                        ) -> None:
        """
        This function adds a FAISS index to the dataset.

        Arguments:
            column_name (str, optional): The name of the column containing the embeddings. Defaults to 'embeddings'.
            metric_type (int, optional): The FAISS metric type to use. Defaults to None.
            batch_size (int, optional): The batch size to use. Defaults to 8.

        Returns:
            None

        Raises:
            ValueError: If the dataset is not initialized.
        """

        # Check if the dataset is initialized
        if self.dataset is None:
            raise ValueError('The dataset is not initialized. Please initialize the dataset first.')

        print('Adding FAISS index...')
        self.dataset.add_faiss_index(
            column_name,
            # metric_type=metric_type,
            # device=self.device,
            # batch_size=batch_size,
            faiss_verbose=True,
            # **kwargs,
        )


    def save_faiss_index(self,
                         index_name: str,
                         file_path: str,
                         ) -> None:
        """
        This function saves the FAISS index to the specified file path.
        * This is a wrapper function for the `save_faiss_index` function in the `Dataset` class.

        Arguments:
            index_name (str): The name of the FAISS index (e.g., "embeddings").
            file_path (str): The file path to save the FAISS index to.

        Returns:
            None

        Raises:
            ValueError: If the dataset is not initialized.
        """

        # Check if the dataset is initialized
        if self.dataset is None:
            raise ValueError('The dataset is not initialized. Please initialize the dataset first.')

        print('Saving FAISS index...')
        self.dataset.save_faiss_index(index_name=index_name, file=file_path)


    def load_faiss_index(self,
                         index_name: str,
                         file_path: str,
                         device: str = 'cpu',
                         ) -> None:
        """
        This function loads the FAISS index from the specified file path.
        * This is a wrapper function for the `load_faiss_index` function in the `Dataset` class.

        Arguments:
            index_name (str): The name of the FAISS index (e.g., "embeddings").
            file_path (str): The file path to load the FAISS index from.
            device (str, optional): The device to use ("cpu" or "cuda") (default: "cpu").

        Returns:
            None

        Raises:
            ValueError: If the dataset is not initialized.
        """

        # Check if the dataset is initialized
        if self.dataset is None:
            raise ValueError('The dataset is not initialized. Please initialize the dataset first.')

        print('Loading FAISS index...')
        self.dataset.load_faiss_index(index_name=index_name, file=file_path, device=device)

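    # A minimal save/load sketch for the two wrappers above (not from the package
    # source; the file names are only examples). Both delegate to
    # `datasets.Dataset.save_faiss_index` / `load_faiss_index`.
    #
    #     searcher.initialize_corpus(corpus={'text': [...]}, save_path='corpus.json')
    #     searcher.save_faiss_index(index_name='embeddings', file_path='corpus.faiss')
    #     ...
    #     searcher.load_dataset_from_json('corpus.json')       # restore the corpus rows
    #     searcher.load_faiss_index(index_name='embeddings', file_path='corpus.faiss')
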
    # Initialize the corpus using a dictionary, pandas DataFrame, or HuggingFace Datasets object
    def initialize_corpus(self,
                          corpus: Union[Dict[str, List[str]], pd.DataFrame, Dataset],
                          section: str = 'text',
                          index_column_name: str = 'embeddings',
                          embedding_type: str = 'last_hidden_state',
                          batch_size: Optional[int] = None,
                          num_workers: Optional[int] = None,
                          save_path: Optional[str] = None,
                          ) -> Dataset:
        """
        This function initializes a dataset using a dictionary, pandas DataFrame, or HuggingFace Datasets object.

        Arguments:
            corpus (Union[Dict[str, List[str]], pd.DataFrame, Dataset]): The corpus to index.
            section (str): The section of the dataset whose embeddings will be used for semantic search (e.g., 'text', 'title', etc.) (default: 'text').
            index_column_name (str): The name of the column containing the embeddings (default: 'embeddings').
            embedding_type (str): The type of embedding to use (default: 'last_hidden_state').
            batch_size (int, optional): The batch size to use (default: None).
            num_workers (int, optional): The number of workers to use.
            save_path (Optional[str], optional): The path to save the dataset to as JSON (default: None).

        Returns:
            Dataset: The dataset object (HuggingFace Datasets).

        Raises:
            ValueError: If the corpus is not a dictionary, pandas DataFrame, or HuggingFace Datasets object.
        """

        # Create the dataset
        if isinstance(corpus, dict):
            self.dataset = Dataset.from_dict(corpus)
        elif isinstance(corpus, pd.DataFrame):
            self.dataset = Dataset.from_pandas(corpus)
        elif isinstance(corpus, Dataset):
            self.dataset = corpus
        else:
            raise ValueError('The corpus must be a dictionary, pandas DataFrame, or HuggingFace Dataset.')

        # Set the embedding_type
        self.embedding_type = embedding_type


        # Map the section of the dataset to the embeddings
        self.dataset = self.dataset.map(
            lambda x: {
                index_column_name: self.get_embeddings(x[section], embedding_type=self.embedding_type).detach().cpu().numpy()[0]
            },
            # batched=True,
            batch_size=batch_size,
            num_proc=num_workers,
        )

        # Save the dataset
        if save_path is not None:
            self.dataset.to_json(save_path)

        # Add FAISS index
        self.add_faiss_index(
            column_name=index_column_name,
        )

        # Return the dataset
        return self.dataset

    # Initialize the dataset using a JSON file
    def load_dataset_from_json(self,
                               json_path: str,
                               ) -> Dataset:
        """
        This function loads a dataset from a JSON file.

        Arguments:
            json_path (str): The path to the JSON file.

        Returns:
            Dataset: The dataset.
        """

        # Load the dataset
        self.dataset = Dataset.from_json(json_path)

        # Return the dataset
        return self.dataset

    # Search for the most similar elements in the dataset, given a query
    def search(self,
               query: str,
               k: int = 1,
               index_column_name: str = 'embeddings',
               ) -> pd.DataFrame:
        """
        This function searches for the most similar elements in the dataset, given a query.

        Arguments:
            query (str): The query.
            k (int, optional): The number of elements to return (default: 1).
            index_column_name (str, optional): The name of the column containing the embeddings (default: 'embeddings').

        Returns:
            pd.DataFrame: The most similar elements in the dataset (text, score, etc.), sorted by score.

        Remarks:
            The returned elements are dictionaries containing the text and the score.
        """

        # Get the embeddings of the query
        query_embeddings = self.get_embeddings([query], embedding_type=self.embedding_type).detach().cpu().numpy()

        # Search for the most similar elements in the dataset
        scores, similar_elts = self.dataset.get_nearest_examples(
            index_name=index_column_name,
            query=query_embeddings,
            k=k,
        )

        # Convert the results to a pandas DataFrame
        results_df = pd.DataFrame.from_dict(similar_elts)

        # Add the scores
        results_df['score'] = scores

        # Sort the results by score
        results_df.sort_values("score", ascending=True, inplace=True)

        # Return the most similar elements
        return results_df


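# A minimal end-to-end sketch for the sentence-embedding searcher above (not from the
# package source; the corpus and query strings are made-up examples).
#
#     searcher = FaissSearch(mode='sentence')      # -> FaissSearch_SenEmbed
#     searcher.initialize_corpus(
#         corpus={'text': ['첫 번째 문서입니다.', '두 번째 문서입니다.']},
#         section='text',
#         embedding_type='last_hidden_state',
#     )
#     results = searcher.search(query='첫 번째 문서', k=2)
#     # `results` is a pandas DataFrame with the corpus columns plus a 'score' column.
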
class FaissSearch_Sparse(FaissSearch_SenEmbed):
    def __init__(self,
                 model_name_or_path: str = 'klue/bert-base',
                 tokenizer_name_or_path: str = 'klue/bert-base',
                 embedding_type: str = 'last_hidden_state',
                 device: str = 'cpu',
                 ) -> None:
        r"""
        This function initializes the wrapper for the FAISS library, which is used to perform semantic search.


        .. attention::

            * If you use this class, please make sure to cite the following paper:

            .. code-block:: latex

                @article{johnson2019billion,
                    title={Billion-scale similarity search with {GPUs}},
                    author={Johnson, Jeff and Douze, Matthijs and J{\'e}gou, Herv{\'e}},
                    journal={IEEE Transactions on Big Data},
                    volume={7},
                    number={3},
                    pages={535--547},
                    year={2019},
                    publisher={IEEE}
                }

            * The code is based on the following GitHub repository:
                https://github.com/facebookresearch/faiss

        Arguments:
            model_name_or_path (str, optional): The name or path of the model to use. Defaults to 'klue/bert-base'.
            tokenizer_name_or_path (str, optional): The name or path of the tokenizer to use. Defaults to 'klue/bert-base'.
            embedding_type (str, optional): The type of embedding to use. Defaults to 'last_hidden_state'.
            device (str, optional): The device to use. Defaults to 'cpu'.

        Returns:
            None
        """

        # Set the device
        self.device = device

        # If the tokenizer is not specified, use the model name or path
        if tokenizer_name_or_path is None:
            tokenizer_name_or_path = model_name_or_path

        # Load the tokenizer
        if tokenizer_name_or_path == 'skt/kobert-base-v1':
            # self.tokenizer = KoBERTTokenizer.from_pretrained(tokenizer_name_or_path)
            self.tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name_or_path)
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)

        # Load the model (a masked-LM head is needed to obtain vocabulary-sized logits)
        self.model = transformers.BertForMaskedLM.from_pretrained(model_name_or_path).to(self.device)

        # Set the model to evaluation mode (since we do not need the gradients)
        self.model.eval()

        # Initialize the dataset
        self.dataset = None


    # Get the embeddings
    def get_embeddings(self,
                       text: Union[str, List[str]],
                       embedding_type: str = 'last_hidden_state',
                       batch_size: int = 8,
                       num_workers: int = 4,
                       ) -> torch.Tensor:
        """
        This function returns the embeddings of the input text.

        Arguments:
            text (Union[str, List[str]]): The input text.
            embedding_type (str, optional): The type of embedding to use. Defaults to 'last_hidden_state'.
            batch_size (int, optional): The batch size to use. Defaults to 8.
            num_workers (int, optional): The number of workers to use. Defaults to 4.

        Returns:
            torch.Tensor: The embeddings.

        Raises:
            ValueError: If the embedding type is invalid.
        """

        # Check if the embedding type is valid
        if embedding_type not in ['last_hidden_state', 'mean_pooling']:
            raise ValueError(f'Invalid embedding type: {embedding_type}. Only "last_hidden_state" and "mean_pooling" are supported.')

        # Tokenize the input text
        encoded_text = self.tokenizer(
            text,
            padding=True,
            truncation=True,
            return_tensors='pt',
        )

        # Move the input text to the device
        encoded_text = encoded_text.to(self.device)

        # encoded_inputs = {k: v.to(self.device) for k, v in encoded_inputs.items()}

        # Get the embeddings
        with torch.no_grad():
            embeddings = self.model(encoded_text['input_ids'])

        # Use the masked-LM logits (batch, seq_len, vocab_size)
        embeddings = embeddings['logits']

        # Pool over the sequence: log(1 + ReLU(logits)), masked by the attention mask,
        # summed over positions, then L2-normalized
        embeddings = torch.sum(torch.log(1 + torch.relu(embeddings)) * encoded_text['attention_mask'].unsqueeze(-1), dim=1)
        e_norm = torch.nn.functional.normalize(embeddings, p=2, dim=1, eps=1e-8)

        # Return the embeddings
        return e_norm


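# Shape sketch for the sparse embedding above (not from the package source; the
# vocabulary size assumes klue/bert-base, roughly 32,000 tokens). The pooling is
# reminiscent of SPLADE-style sparse lexical vectors: each output dimension is a
# vocabulary token, weighted by log(1 + ReLU(logit)) summed over sequence positions
# and then L2-normalized.
#
#     logits                       : (batch, seq_len, vocab_size)
#     log1p(relu) * attention mask : (batch, seq_len, vocab_size)
#     sum over seq_len + L2 norm   : (batch, vocab_size)
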
# FAISS word embedding library wrapper class
class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
    def __init__(self,
                 model_name_or_path: str = 'klue/bert-base',
                 tokenizer_name_or_path: str = 'klue/bert-base',
                 embedding_type: str = 'last_hidden_state',
                 device: str = 'cpu',
                 ) -> None:
        r"""
        This function initializes the wrapper for the FAISS library, which is used to perform semantic search.


        .. attention::

            * If you use this class, please make sure to cite the following paper:

            .. code-block:: latex

                @article{johnson2019billion,
                    title={Billion-scale similarity search with {GPUs}},
                    author={Johnson, Jeff and Douze, Matthijs and J{\'e}gou, Herv{\'e}},
                    journal={IEEE Transactions on Big Data},
                    volume={7},
                    number={3},
                    pages={535--547},
                    year={2019},
                    publisher={IEEE}
                }

            * The code is based on the following GitHub repository:
                https://github.com/facebookresearch/faiss

        Arguments:
            model_name_or_path (str, optional): The name or path of the model to use. Defaults to 'klue/bert-base'.
            tokenizer_name_or_path (str, optional): The name or path of the tokenizer to use. Defaults to 'klue/bert-base'.
            embedding_type (str, optional): The type of embedding to use. Defaults to 'last_hidden_state'.
            device (str, optional): The device to use. Defaults to 'cpu'.

        Returns:
            None
        """

        # Set the device
        self.device = device

        # If the tokenizer is not specified, use the model name or path
        if tokenizer_name_or_path is None:
            tokenizer_name_or_path = model_name_or_path

        # Load the tokenizer
        if tokenizer_name_or_path == 'skt/kobert-base-v1':
            # self.tokenizer = KoBERTTokenizer.from_pretrained(tokenizer_name_or_path)
            self.tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name_or_path)
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)

        # Load the model
        self.model = AutoModel.from_pretrained(model_name_or_path).to(self.device)


        # Set the model to evaluation mode (since we do not need the gradients)
        self.model.eval()

        # Initialize the dataset
        self.dataset = None


    # Get the embeddings (new code)
    def get_doc_embeddings(self,
                           # text: Union[str, List[str]],
                           text=None,
                           embedding_type: str = 'last_hidden_state',
                           batch_size: int = 8,
                           num_workers: int = 4,
                           ) -> dict:
        """
        This function returns the token embeddings of the input corpus.

        Arguments:
            text (Dict[str, List[str]]): The corpus, a dictionary whose 'text' key holds the sentences.
            embedding_type (str, optional): The type of embedding to use. Defaults to 'last_hidden_state'.
            batch_size (int, optional): The batch size to use. Defaults to 8.
            num_workers (int, optional): The number of workers to use. Defaults to 4.

        Returns:
            dict: A mapping from each token embedding to the list of sentences containing that token.

        Raises:
            ValueError: If the embedding type is invalid.
        """

        # Check if the embedding type is valid
        if embedding_type not in ['last_hidden_state', 'mean_pooling']:
            raise ValueError(f'Invalid embedding type: {embedding_type}. Only "last_hidden_state" and "mean_pooling" are supported.')

        ids_dict = {}
        # Tokenize the input text
        for sentence in text['text']:
            encoded_text = self.tokenizer(
                sentence,
                padding=False,
                truncation=True,
                return_tensors='pt',
                add_special_tokens=False
            )
            # Move the input text to the device
            encoded_text = encoded_text.to(self.device)
            token_ids_list = encoded_text['input_ids'].tolist()
            token_ids_list = token_ids_list[0]
            # Record, for every token id, the sentences in which it occurs
            for ids in token_ids_list:
                if ids not in ids_dict.keys():
                    ids_dict[ids] = [sentence]
                else:
                    if sentence not in ids_dict[ids]:
                        ids_dict[ids].append(sentence)
        # Get the embeddings: one embedding per distinct token id
        embedding_dict = {}
        self.model.eval()
        for key, value in ids_dict.items():
            embed = self.model(torch.tensor([[key]]), output_hidden_states=True).hidden_states[-1][:, 0, :].detach()
            embedding_dict[embed] = value

        # Return the embeddings
        return embedding_dict


    # Get the embeddings (new code)
    def get_query_embeddings(self,
                             text: Union[str, List[str]],
                             embedding_type: str = 'last_hidden_state',
                             batch_size: int = 8,
                             num_workers: int = 4,
                             ) -> List:
        """
        This function returns one embedding per token of the input query.

        Arguments:
            text (Union[str, List[str]]): The input text.
            embedding_type (str, optional): The type of embedding to use. Defaults to 'last_hidden_state'.
            batch_size (int, optional): The batch size to use. Defaults to 8.
            num_workers (int, optional): The number of workers to use. Defaults to 4.

        Returns:
            List: The per-token embeddings.

        Raises:
            ValueError: If the embedding type is invalid.
        """

        # Check if the embedding type is valid
        if embedding_type not in ['last_hidden_state', 'mean_pooling']:
            raise ValueError(f'Invalid embedding type: {embedding_type}. Only "last_hidden_state" and "mean_pooling" are supported.')

        # Tokenize the input text
        encoded_text = self.tokenizer(
            text,
            padding=False,
            truncation=True,
            return_tensors='pt',
            add_special_tokens=False,
        )

        # Move the input text to the device
        encoded_text = encoded_text.to(self.device)

        token_ids_list = encoded_text['input_ids'].tolist()
        token_ids_list = token_ids_list[0]
        tensor_list = [torch.tensor([[value]]) for value in token_ids_list]

        # Get the embeddings
        embeds = []
        self.model.eval()
        for index, tensor in enumerate(tensor_list):
            embed = self.model(tensor, output_hidden_states=True).hidden_states[-1][:, 0, :].detach().cpu().numpy()
            embeds.append(embed)

        # Return the embeddings
        return embeds


    # Initialize the corpus using a dictionary, pandas DataFrame, or HuggingFace Datasets object
    def initialize_corpus(self,
                          corpus: Union[Dict[str, List[str]], pd.DataFrame, Dataset],
                          section: str = 'text',
                          index_column_name: str = 'embeddings',
                          embedding_type: str = 'last_hidden_state',
                          batch_size: Optional[int] = None,
                          num_workers: Optional[int] = None,
                          save_path: Optional[str] = None,
                          ) -> Dataset:
        """
        This function initializes a dataset using a dictionary, pandas DataFrame, or HuggingFace Datasets object.

        Arguments:
            corpus (Union[Dict[str, List[str]], pd.DataFrame, Dataset]): The corpus to index.
            section (str): The section of the dataset whose embeddings will be used for semantic search (e.g., 'text', 'title', etc.) (default: 'text').
            index_column_name (str): The name of the column containing the embeddings (default: 'embeddings').
            embedding_type (str): The type of embedding to use (default: 'last_hidden_state').
            batch_size (int, optional): The batch size to use (default: None).
            num_workers (int, optional): The number of workers to use.
            save_path (Optional[str], optional): The path to save the dataset to as JSON (default: None).

        Returns:
            Dataset: The dataset object (HuggingFace Datasets).

        Raises:
            ValueError: If the dataset is not a dictionary, pandas DataFrame, or HuggingFace Datasets object.
        """

        # Expected corpus form: {'text': [...]}

        # Set the embedding_type
        self.embedding_type = embedding_type

        # Get the embedding dict (token embedding -> sentences containing that token)
        embedding_dict = self.get_doc_embeddings(text=corpus, embedding_type=self.embedding_type)

        data = {
            'text': list(embedding_dict.values()),
            'embeddings': []
        }

        for embed in embedding_dict.keys():
            embed_list = embed.tolist()
            data['embeddings'].append(embed_list[0])


        if isinstance(data, dict):
            self.dataset = Dataset.from_dict(data)
        elif isinstance(data, pd.DataFrame):
            self.dataset = Dataset.from_pandas(data)
        elif isinstance(data, Dataset):
            self.dataset = data
        else:
            raise ValueError('The dataset must be a dictionary, pandas DataFrame, or HuggingFace Dataset.')

        # Save the dataset
        if save_path is not None:
            self.dataset.to_json(save_path)

        # Add FAISS index
        self.add_faiss_index(
            column_name=index_column_name,
        )

        # Return the dataset
        return self.dataset


    # Search for the most similar elements in the dataset, given a query
    def search(self,
               query: str,
               k: int = 1,
               index_column_name: str = 'embeddings',
               ) -> pd.DataFrame:
        """
        This function searches for the most similar elements in the dataset, given a query.

        Arguments:
            query (str): The query.
            k (int, optional): The number of elements to return per query token (default: 1).
            index_column_name (str, optional): The name of the column containing the embeddings (default: 'embeddings').

        Returns:
            pd.DataFrame: The sentences retrieved for the query's tokens and their retrieval frequencies, sorted by frequency.

        Remarks:
            Nearest neighbours are retrieved per query token; the sentences they map to are then ranked by how often they are retrieved.
        """

        # Get the per-token embeddings of the query
        query_embeddings = self.get_query_embeddings([query], embedding_type=self.embedding_type)

        # Compare each query token embedding with the values in self.dataset['embeddings']
        scores = []
        similar_elts = []
        for query_embedding in query_embeddings:
            # Search for the most similar elements in the dataset
            score, similar_elt = self.dataset.get_nearest_examples(
                index_name=index_column_name,
                query=query_embedding,
                k=k,
            )
            scores.append(score)
            similar_elts.append(similar_elt)


        text_list = []
        for item in similar_elts:
            for text in item['text']:
                text_list.append(text)

        # Count how often each sentence is retrieved and keep the five most frequent
        flat_list = [sentence for sublist in text_list for sentence in sublist]
        count = Counter(flat_list)
        count = dict(count.most_common(5))

        sorted_dict = dict(sorted(count.items(), key=lambda x: x[1], reverse=True))
        # Convert the results to a pandas DataFrame
        results_df = pd.DataFrame({'text': sorted_dict.keys(), 'freq': sorted_dict.values()})

        # Return the most similar elements
        return results_df
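
# A minimal sketch for the word-embedding searcher above (not from the package source;
# the corpus and query are made-up examples). The corpus is a plain {'text': [...]}
# dictionary; `search` ranks sentences by how often they are retrieved for the query's
# token embeddings.
#
#     searcher = FaissSearch(mode='word')           # -> FaissSearch_WordEmbed
#     searcher.initialize_corpus(corpus={'text': ['이순신은 조선 중기의 무신이다.',
#                                                 '세종대왕은 한글을 창제하였다.']})
#     print(searcher.search('이순신', k=2))          # DataFrame with 'text' and 'freq' columns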