nltkor-1.2.14-cp311-cp311-macosx_13_0_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. nltkor/Kor_char.py +193 -0
  2. nltkor/__init__.py +16 -0
  3. nltkor/alignment/__init__.py +1315 -0
  4. nltkor/cider/__init__.py +2 -0
  5. nltkor/cider/cider.py +55 -0
  6. nltkor/cider/cider_scorer.py +207 -0
  7. nltkor/distance/__init__.py +441 -0
  8. nltkor/distance/wasserstein.py +126 -0
  9. nltkor/etc.py +22 -0
  10. nltkor/lazyimport.py +144 -0
  11. nltkor/make_requirement.py +11 -0
  12. nltkor/metrics/__init__.py +63 -0
  13. nltkor/metrics/bartscore.py +301 -0
  14. nltkor/metrics/bertscore.py +331 -0
  15. nltkor/metrics/bleu_tensor.py +20 -0
  16. nltkor/metrics/classical.py +847 -0
  17. nltkor/metrics/entment.py +24 -0
  18. nltkor/metrics/eval.py +517 -0
  19. nltkor/metrics/mauve.py +273 -0
  20. nltkor/metrics/mauve_utils.py +131 -0
  21. nltkor/misc/__init__.py +11 -0
  22. nltkor/misc/string2string_basic_functions.py +59 -0
  23. nltkor/misc/string2string_default_tokenizer.py +83 -0
  24. nltkor/misc/string2string_hash_functions.py +159 -0
  25. nltkor/misc/string2string_word_embeddings.py +503 -0
  26. nltkor/search/__init__.py +10 -0
  27. nltkor/search/classical.py +569 -0
  28. nltkor/search/faiss_search.py +787 -0
  29. nltkor/search/kobert_tokenizer.py +181 -0
  30. nltkor/sejong/__init__.py +3 -0
  31. nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
  32. nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
  33. nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
  34. nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
  35. nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
  36. nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
  37. nltkor/sejong/ch.py +12 -0
  38. nltkor/sejong/dict_semClassNum.txt +491 -0
  39. nltkor/sejong/layer.txt +630 -0
  40. nltkor/sejong/sejong_download.py +87 -0
  41. nltkor/sejong/ssem.py +684 -0
  42. nltkor/similarity/__init__.py +3 -0
  43. nltkor/similarity/bartscore____.py +337 -0
  44. nltkor/similarity/bertscore____.py +339 -0
  45. nltkor/similarity/classical.py +245 -0
  46. nltkor/similarity/cosine_similarity.py +175 -0
  47. nltkor/tag/__init__.py +71 -0
  48. nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
  49. nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
  50. nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
  51. nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
  52. nltkor/tag/espresso_tag.py +220 -0
  53. nltkor/tag/libs/__init__.py +10 -0
  54. nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
  55. nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
  56. nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
  57. nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
  58. nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
  59. nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
  60. nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
  61. nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
  62. nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
  63. nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
  64. nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
  65. nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
  66. nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
  67. nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
  68. nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
  69. nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
  70. nltkor/tag/libs/arguments.py +280 -0
  71. nltkor/tag/libs/attributes.py +231 -0
  72. nltkor/tag/libs/config.py +159 -0
  73. nltkor/tag/libs/metadata.py +129 -0
  74. nltkor/tag/libs/ner/__init__.py +2 -0
  75. nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
  76. nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
  77. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
  78. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
  79. nltkor/tag/libs/ner/macmorphoreader.py +7 -0
  80. nltkor/tag/libs/ner/ner_reader.py +92 -0
  81. nltkor/tag/libs/network.c +72325 -0
  82. nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
  83. nltkor/tag/libs/network.pyx +878 -0
  84. nltkor/tag/libs/networkconv.pyx +1028 -0
  85. nltkor/tag/libs/networkdependencyconv.pyx +451 -0
  86. nltkor/tag/libs/parse/__init__.py +1 -0
  87. nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
  88. nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
  89. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
  90. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
  91. nltkor/tag/libs/parse/parse_reader.py +283 -0
  92. nltkor/tag/libs/pos/__init__.py +2 -0
  93. nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
  94. nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
  95. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
  96. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
  97. nltkor/tag/libs/pos/macmorphoreader.py +7 -0
  98. nltkor/tag/libs/pos/pos_reader.py +97 -0
  99. nltkor/tag/libs/reader.py +485 -0
  100. nltkor/tag/libs/srl/__init__.py +3 -0
  101. nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
  102. nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
  103. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
  104. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
  105. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
  106. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
  107. nltkor/tag/libs/srl/__srl_reader_.py +535 -0
  108. nltkor/tag/libs/srl/srl_reader.py +436 -0
  109. nltkor/tag/libs/srl/train_srl.py +87 -0
  110. nltkor/tag/libs/taggers.py +926 -0
  111. nltkor/tag/libs/utils.py +384 -0
  112. nltkor/tag/libs/word_dictionary.py +239 -0
  113. nltkor/tag/libs/wsd/__init__.py +2 -0
  114. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
  115. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
  116. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
  117. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
  118. nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
  119. nltkor/tag/libs/wsd/wsd_reader.py +93 -0
  120. nltkor/tokenize/__init__.py +62 -0
  121. nltkor/tokenize/ko_tokenize.py +115 -0
  122. nltkor/trans.py +121 -0
  123. nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
  124. nltkor-1.2.14.dist-info/METADATA +41 -0
  125. nltkor-1.2.14.dist-info/RECORD +127 -0
  126. nltkor-1.2.14.dist-info/WHEEL +5 -0
  127. nltkor-1.2.14.dist-info/top_level.txt +1 -0
nltkor/metrics/bertscore.py
@@ -0,0 +1,331 @@
+ """
+ string2string similarity
+ src = https://github.com/stanfordnlp/string2string
+
+
+ MIT License
+
+ Copyright (c) 2023 Mirac Suzgun
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+
+
+ """
+
+
+ """
+ This class contains the original implementation of the BERTScore algorithm by Zhang et al. (2020).
+
+ BERTScore: Evaluating Text Generation with BERT
+
+ @inproceedings{bertscore2020,
+   title={BERTScore: Evaluating Text Generation with BERT},
+   author={Tianyi Zhang* and Varsha Kishore* and Felix Wu* and Kilian Q. Weinberger and Yoav Artzi},
+   booktitle={International Conference on Learning Representations},
+   year={2020},
+   url={https://openreview.net/forum?id=SkeHuCVFDr}
+ }
+
+ Disclaimer:
+     This code is adapted from https://github.com/Tiiiger/bert_score
+ """
+
+ from typing import List, Union, Optional, Tuple
+
+ import os
+ import sys
+ import time
+
+ from collections import defaultdict
+
+
+ # from nltkor.search.kobert_tokenizer import KoBERTTokenizer
+ from nltkor.make_requirement import make_requirement
+
+ import torch
+ import pandas as pd
+ import bert_score
+ from transformers import AutoTokenizer, AutoModel, XLNetTokenizer
+ # import protobuf
+
+
+ from bert_score.utils import (bert_cos_score_idf, get_hash,
+                               get_idf_dict, get_model, get_tokenizer,
+                               lang2model, model2layers)
+
+ class BERTScore:
+     """
+     This class implements the BERTScore algorithm.
+     """
+
+     def __init__(self,
+         model_name_or_path: str = None,
+         lang: str = None,
+         num_layers: int = None,
+         all_layers: bool = False,
+         use_fast_tokenizer: bool = False,
+         device: str = 'cpu',
+         baseline_path: str = None,
+     ) -> None:
+         r"""
+         This function initializes the BERTScore class, which computes the BERTScore between two texts.
+
+         Arguments:
+             model_name_or_path (str): BERT model type to use (e.g., bert-base-uncased).
+             lang (str): Language of the texts (e.g., en).
+             num_layers (int): Number of layers to use.
+             all_layers (bool): Whether to use all layers
+             use_fast_tokenizer (bool): Whether to use the fast tokenizer.
+             device (str): Device to use (e.g., cpu or cuda).
+             baseline_path (str): Path to the baseline file.
+
+         Returns:
+             None
+
+         Raises:
+             ValueError: If model_name_or_path and lang are both None.
+
+         .. attention::
+
+             If you use this class, please make sure to cite the following paper:
+
+             .. code-block:: latex
+
+                 @inproceedings{bertscore2020,
+                     title={BERTScore: Evaluating Text Generation with BERT},
+                     author={Tianyi Zhang* and Varsha Kishore* and Felix Wu* and Kilian Q. Weinberger and Yoav Artzi},
+                     booktitle={International Conference on Learning Representations},
+                     year={2020},
+                     url={https://openreview.net/forum?id=SkeHuCVFDr}
+                 }
+
+
+         .. note::
+             * If model_name_or_path is not specified, use the default model for the language.
+             * If num_layers is not specified, use the default number of layers.
+             * If device is not specified, use the GPU if available, otherwise use the CPU.
+             * If baseline_path is not specified, use the default baseline file.
+         """
+
+         # Check the arguments
+         if model_name_or_path is None and lang is None:
+             raise ValueError("You must specify either model_name_or_path or lang")
+
+         # Set the attributes
+         self.model_name_or_path = model_name_or_path
+         self.lang = lang
+         self.num_layers = num_layers
+         self.all_layers = all_layers
+         self.use_fast_tokenizer = use_fast_tokenizer
+         self.baseline_path = baseline_path
+
+         # If model_name_or_path is not specified, use the default model for the language
+         if self.model_name_or_path is None:
+             self.lang = lang.lower()
+             self.model_name_or_path = lang2model[self.lang]
+
+         # If num_layers is not specified, use the default number of layers
+         if num_layers is None:
+             self.num_layers = model2layers[self.model_name_or_path]
+
+         # Set the device
+         self.device = device
+         if self.device is None:
+             self.device = "cuda" if torch.cuda.is_available() else "cpu"
+
+         # Load model and tokenizer
+         if self.model_name_or_path == 'skt/kobert-base-v1':
+             # self.tokenizer = KoBERTTokenizer.from_pretrained(self.model_name_or_path)
+             self.tokenizer = XLNetTokenizer.from_pretrained(self.model_name_or_path)
+         else:
+             self.tokenizer = get_tokenizer(self.model_name_or_path, self.use_fast_tokenizer)
+         self.model = get_model(self.model_name_or_path, self.num_layers, self.all_layers)
+         self.model.eval()
+         self.model.to(device)
+
+
+
+     # Compute the BERTScore between source sentences and target sentences
+     def compute(self,
+         source_sentences: List[str],
+         target_sentences: Union[List[str], List[List[str]]],
+         batch_size: int = 4,
+         idf: bool = False,
+         nthreads: int = 4,
+         return_hash: bool = False,
+         rescale_with_baseline: bool = False,
+         verbose: bool = False,
+     ) -> Union[dict, Optional[str]]:
+         """
+         This function scores the source sentences based on their similarity to the target sentences using BERTScore.
+
+         Arguments:
+             source_sentences (list of str): candidate sentences
+             target_sentences (list of str or list of list of str): reference sentences
+             batch_size (int): bert score processing batch size
+             idf (bool or dict): use idf weighting, can also be a precomputed idf_dict
+             nthreads (int): number of threads
+             return_hash (bool): return hashcode of the setting
+             rescale_with_baseline (bool): rescale bertscore with pre-computed baseline
+             verbose (bool): turn on intermediate status update
+
+         Returns:
+             (Dict[str, Tensor], Optional[str]): A dictionary containing the precision, recall, and F1 score, and the hashcode (if return_hash is True).
+             where the precision, recall, and F1 score are tensors of shape (len(source_sentences),
+
+         Raises:
+             ValueError: If the number of source sentences and target sentences do not match.
+         """
+
+         # Check the arguments
+         if len(source_sentences) != len(target_sentences):
+             raise ValueError("The number of candidates and references do not match")
+
+         # If the target sentences are grouped, flatten them
+         ref_group_boundaries = None
+         if not isinstance(target_sentences[0], str):
+             ref_group_boundaries = []
+             ori_source_sentences, ori_target_sentences = source_sentences, target_sentences
+             source_sentences, target_sentences = [], []
+             count = 0
+             for cand, ref_group in zip(ori_source_sentences, ori_target_sentences):
+                 source_sentences += [cand] * len(ref_group)
+                 target_sentences += ref_group
+                 ref_group_boundaries.append((count, count + len(ref_group)))
+                 count += len(ref_group)
+
+         if rescale_with_baseline and self.baseline_path is None:
+             raise ValueError("Need to specify baseline_path when rescaling with baseline")
+
+         # Get the IDF dict
+         if not idf:
+             idf_dict = defaultdict(lambda: 1.0)
+             # set idf for [SEP] and [CLS] to 0
+             idf_dict[self.tokenizer.sep_token_id] = 0
+             idf_dict[self.tokenizer.cls_token_id] = 0
+         elif isinstance(idf, dict):
+             if verbose:
+                 print("using predefined IDF dict...")
+             idf_dict = idf
+         else:
+             if verbose:
+                 print("preparing IDF dict...")
+             start = time.perf_counter()
+             idf_dict = get_idf_dict(target_sentences, self.tokenizer, nthreads=nthreads)
+             if verbose:
+                 print("done in {:.2f} seconds".format(time.perf_counter() - start))
+
+         if verbose:
+             print("calculating scores...")
+
+         start = time.perf_counter()
+
+         # Get all the predictions
+         all_preds = bert_cos_score_idf(
+             model = self.model,
+             refs = target_sentences,
+             hyps = source_sentences,
+             tokenizer= self.tokenizer,
+             idf_dict = idf_dict,
+             verbose = verbose,
+             device = self.device,
+             batch_size=batch_size,
+             all_layers=self.all_layers,
+         ).cpu()
+
+         # If the target sentences are grouped, take the max score
+         if ref_group_boundaries is not None:
+             max_preds = []
+             for beg, end in ref_group_boundaries:
+                 max_preds.append(all_preds[beg:end].max(dim=0)[0])
+             all_preds = torch.stack(max_preds, dim=0)
+
+         # Rescale with baseline
+         use_custom_baseline = self.baseline_path is not None
+         if rescale_with_baseline:
+             if self.baseline_path is None:
+                 self.baseline_path = os.path.join(
+                     os.path.dirname(__file__), f"rescale_baseline/{self.lang}/{self.model_name_or_path}.tsv"
+                 )
+             if os.path.isfile(self.baseline_path):
+                 if not self.all_layers:
+                     baselines = torch.from_numpy(
+                         pd.read_csv(self.baseline_path).iloc[self.num_layers].to_numpy()
+                     )[1:].float()
+                 else:
+                     baselines = (
+                         torch.from_numpy(pd.read_csv(self.baseline_path).to_numpy())[:, 1:]
+                         .unsqueeze(1)
+                         .float()
+                     )
+
+                 all_preds = (all_preds - baselines) / (1 - baselines)
+             else:
+                 print(
+                     f"Warning: Baseline not Found for {self.model_name_or_path} on {self.lang} at {self.baseline_path}",
+                     file=sys.stderr,
+                 )
+
+         # Get the final output
+         out = all_preds[..., 0], all_preds[..., 1], all_preds[..., 2]  # P, R, F
+         scores = {
+             "precision": out[0].numpy(),
+             "recall": out[1].numpy(),
+             "f1": out[2].numpy(),
+         }
+
+         # Print the time
+         if verbose:
+             time_diff = time.perf_counter() - start
+             print(
+                 f"done in {time_diff:.2f} seconds, {len(target_sentences) / time_diff:.2f} sentences/sec"
+             )
+
+         # If return hash, return both the output and the hash
+         if return_hash:
+             return tuple(
+                 [
+                     scores,
+                     get_hash(
+                         self.model_name_or_path,
+                         self.num_layers,
+                         idf,
+                         rescale_with_baseline,
+                         use_custom_baseline=use_custom_baseline,
+                         use_fast_tokenizer=self.use_fast_tokenizer,
+                     ),
+                 ]
+             )
+         # Otherwise, just return the output
+         return scores
+
+
+
+
+ def demo():
+     demo_setences = [
+         ("I am a student", "He is a teacher"),
+         ("나는 학생이다", "그는 선생님이다"),
+         ("점심에 온기동에서 삼겹차슈덮밥을 먹었다.", "저녁에 피나치공에서 피자와 치킨을 먹었다.")
+     ]
+     for str1, str2 in demo_setences:
+         print("demo : ", BERTScore(model_name_or_path='bert-base-uncased', lang='en', num_layers=12).compute([str1], [str2]))
+
+ if __name__ == "__main__":
+     demo()
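
For context, a minimal usage sketch of the BERTScore class shown above. It assumes the nltkor package and its torch, transformers, pandas, and bert_score dependencies are installed, that the class is importable from nltkor.metrics.bertscore as the file list suggests, and that the model weights can be downloaded:

    # Hypothetical usage; the import path is inferred from the file layout above.
    from nltkor.metrics.bertscore import BERTScore

    scorer = BERTScore(model_name_or_path="bert-base-uncased", lang="en", num_layers=12)
    scores = scorer.compute(
        ["I am a student"],    # source_sentences: candidate strings
        ["He is a teacher"],   # target_sentences: references (or lists of references per candidate)
    )
    print(scores["precision"], scores["recall"], scores["f1"])  # numpy arrays, one value per candidate

compute returns the scores dict by default, or a (scores, hash) tuple when return_hash=True, matching the demo() call at the end of the file.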
nltkor/metrics/bleu_tensor.py
@@ -0,0 +1,20 @@
+ from nltk.translate.bleu_score import *
+ from nltkor.tokenize import Ko_tokenize
+ import numpy as np
+ import torch
+ import time
+ import math
+
+ def bleu_tensor(reference,candidate,n=0, smoothing_function=None):
+     if n: weights = tuple(1 if i == n-1 else 0 for i in range(4))
+     else: weights = (0.25, 0.25, 0.25, 0.25)
+
+
+
+     reference=reference.unsqueeze(1)
+     reference=reference.numpy()
+     candidate=candidate.numpy()
+     return torch.tensor(corpus_bleu(reference,candidate,weights,smoothing_function=smoothing_function))
+
+
+
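
bleu_tensor wraps NLTK's corpus_bleu over token-ID tensors: each row of reference becomes the single reference for the matching row of candidate, and n selects one n-gram order (n=0 keeps the uniform 4-gram weights). Below is a hedged sketch under those assumptions, with made-up token IDs and a smoothing function because short sequences easily yield zero n-gram overlaps:

    # Hypothetical usage; the import path is inferred from the file layout above.
    import torch
    from nltk.translate.bleu_score import SmoothingFunction
    from nltkor.metrics.bleu_tensor import bleu_tensor

    reference = torch.tensor([[2, 15, 37, 48, 9, 3]])   # shape (batch, seq_len), token IDs
    candidate = torch.tensor([[2, 15, 37, 50, 9, 3]])

    score = bleu_tensor(reference, candidate, n=2,
                        smoothing_function=SmoothingFunction().method1)
    print(float(score))  # corpus-level BLEU restricted to bigram precision

Both tensors must live on the CPU, since the function calls .numpy() on them directly.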