nltkor-1.2.14-cp311-cp311-macosx_13_0_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. nltkor/Kor_char.py +193 -0
  2. nltkor/__init__.py +16 -0
  3. nltkor/alignment/__init__.py +1315 -0
  4. nltkor/cider/__init__.py +2 -0
  5. nltkor/cider/cider.py +55 -0
  6. nltkor/cider/cider_scorer.py +207 -0
  7. nltkor/distance/__init__.py +441 -0
  8. nltkor/distance/wasserstein.py +126 -0
  9. nltkor/etc.py +22 -0
  10. nltkor/lazyimport.py +144 -0
  11. nltkor/make_requirement.py +11 -0
  12. nltkor/metrics/__init__.py +63 -0
  13. nltkor/metrics/bartscore.py +301 -0
  14. nltkor/metrics/bertscore.py +331 -0
  15. nltkor/metrics/bleu_tensor.py +20 -0
  16. nltkor/metrics/classical.py +847 -0
  17. nltkor/metrics/entment.py +24 -0
  18. nltkor/metrics/eval.py +517 -0
  19. nltkor/metrics/mauve.py +273 -0
  20. nltkor/metrics/mauve_utils.py +131 -0
  21. nltkor/misc/__init__.py +11 -0
  22. nltkor/misc/string2string_basic_functions.py +59 -0
  23. nltkor/misc/string2string_default_tokenizer.py +83 -0
  24. nltkor/misc/string2string_hash_functions.py +159 -0
  25. nltkor/misc/string2string_word_embeddings.py +503 -0
  26. nltkor/search/__init__.py +10 -0
  27. nltkor/search/classical.py +569 -0
  28. nltkor/search/faiss_search.py +787 -0
  29. nltkor/search/kobert_tokenizer.py +181 -0
  30. nltkor/sejong/__init__.py +3 -0
  31. nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
  32. nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
  33. nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
  34. nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
  35. nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
  36. nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
  37. nltkor/sejong/ch.py +12 -0
  38. nltkor/sejong/dict_semClassNum.txt +491 -0
  39. nltkor/sejong/layer.txt +630 -0
  40. nltkor/sejong/sejong_download.py +87 -0
  41. nltkor/sejong/ssem.py +684 -0
  42. nltkor/similarity/__init__.py +3 -0
  43. nltkor/similarity/bartscore____.py +337 -0
  44. nltkor/similarity/bertscore____.py +339 -0
  45. nltkor/similarity/classical.py +245 -0
  46. nltkor/similarity/cosine_similarity.py +175 -0
  47. nltkor/tag/__init__.py +71 -0
  48. nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
  49. nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
  50. nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
  51. nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
  52. nltkor/tag/espresso_tag.py +220 -0
  53. nltkor/tag/libs/__init__.py +10 -0
  54. nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
  55. nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
  56. nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
  57. nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
  58. nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
  59. nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
  60. nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
  61. nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
  62. nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
  63. nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
  64. nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
  65. nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
  66. nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
  67. nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
  68. nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
  69. nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
  70. nltkor/tag/libs/arguments.py +280 -0
  71. nltkor/tag/libs/attributes.py +231 -0
  72. nltkor/tag/libs/config.py +159 -0
  73. nltkor/tag/libs/metadata.py +129 -0
  74. nltkor/tag/libs/ner/__init__.py +2 -0
  75. nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
  76. nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
  77. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
  78. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
  79. nltkor/tag/libs/ner/macmorphoreader.py +7 -0
  80. nltkor/tag/libs/ner/ner_reader.py +92 -0
  81. nltkor/tag/libs/network.c +72325 -0
  82. nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
  83. nltkor/tag/libs/network.pyx +878 -0
  84. nltkor/tag/libs/networkconv.pyx +1028 -0
  85. nltkor/tag/libs/networkdependencyconv.pyx +451 -0
  86. nltkor/tag/libs/parse/__init__.py +1 -0
  87. nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
  88. nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
  89. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
  90. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
  91. nltkor/tag/libs/parse/parse_reader.py +283 -0
  92. nltkor/tag/libs/pos/__init__.py +2 -0
  93. nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
  94. nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
  95. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
  96. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
  97. nltkor/tag/libs/pos/macmorphoreader.py +7 -0
  98. nltkor/tag/libs/pos/pos_reader.py +97 -0
  99. nltkor/tag/libs/reader.py +485 -0
  100. nltkor/tag/libs/srl/__init__.py +3 -0
  101. nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
  102. nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
  103. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
  104. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
  105. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
  106. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
  107. nltkor/tag/libs/srl/__srl_reader_.py +535 -0
  108. nltkor/tag/libs/srl/srl_reader.py +436 -0
  109. nltkor/tag/libs/srl/train_srl.py +87 -0
  110. nltkor/tag/libs/taggers.py +926 -0
  111. nltkor/tag/libs/utils.py +384 -0
  112. nltkor/tag/libs/word_dictionary.py +239 -0
  113. nltkor/tag/libs/wsd/__init__.py +2 -0
  114. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
  115. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
  116. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
  117. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
  118. nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
  119. nltkor/tag/libs/wsd/wsd_reader.py +93 -0
  120. nltkor/tokenize/__init__.py +62 -0
  121. nltkor/tokenize/ko_tokenize.py +115 -0
  122. nltkor/trans.py +121 -0
  123. nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
  124. nltkor-1.2.14.dist-info/METADATA +41 -0
  125. nltkor-1.2.14.dist-info/RECORD +127 -0
  126. nltkor-1.2.14.dist-info/WHEEL +5 -0
  127. nltkor-1.2.14.dist-info/top_level.txt +1 -0
nltkor/similarity/bartscore____.py
@@ -0,0 +1,337 @@
+ """
+ string2string similarity
+ src = https://github.com/stanfordnlp/string2string
+
+
+ MIT License
+
+ Copyright (c) 2023 Mirac Suzgun
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+
+
+ """
+
+
+ """
+ This class contains the original implementation of the BARTScore algorithm by Yuan et al. (2021).
+
+ BARTScore: BART-based Evaluation Metric for Text Generation
+
+ @inproceedings{bartscore2021,
+     author = {Yuan, Weizhe and Neubig, Graham and Liu, Pengfei},
+     booktitle = {Advances in Neural Information Processing Systems},
+     editor = {M. Ranzato and A. Beygelzimer and Y. Dauphin and P.S. Liang and J. Wortman Vaughan},
+     pages = {27263--27277},
+     publisher = {Curran Associates, Inc.},
+     title = {BARTScore: Evaluating Generated Text as Text Generation},
+     url = {https://proceedings.neurips.cc/paper/2021/file/e4d2b6e6fdeca3e60e0f1a62fee3d9dd-Paper.pdf},
+     volume = {34},
+     year = {2021}
+ }
+
+ Disclaimer:
+ This code is adapted from https://github.com/neulab/BARTScore/blob/main/bart_score.py
+ """
+
+ import numpy as np
+ from typing import List, Union, Dict
+ import traceback
+
+ # import torch
+ # import torch.nn as nn
+ # from transformers import BartTokenizer, BartForConditionalGeneration
+
+ import subprocess
+ import sys
+ from nltk.make_requirement import make_requirement
+
+ # def install_and_import(package):
+ #     try:
+ #         __import__(package)
+ #     except ImportError:
+ #         print(f"{package} not found, installing with pip...")
+ #         subprocess.check_call([sys.executable, "-m", "pip", "install", package])
+ #     finally:
+ #         globals()[package] = __import__(package)
+
+ # # usage example
+ # install_and_import('numpy')
+
+
+ # # usage example
+ # install_and_import('torch')
+ # install_and_import('transformers', '>=4.8.2')
+
+ try:
+     import torch
+     import torch.nn as nn
+     from transformers import BartTokenizer, BartForConditionalGeneration
+ except ImportError:
+     requirement = ['torch', 'transformers>=4.8.2']
+     file_path = make_requirement(requirement)
+     raise Exception(f"""You need to install Library
+     please pip install below Libaries
+     \t pip install torch
+     \t pip install transformers>=4.8.2
+     Or, use pip install requirement.txt
+     \t pip install -r {file_path}
+     """)
+
+
+
+ # BARTScore class
+ class BARTScore:
+     """
+     This class implements the BARTScore algorithm.
+     """
+
+     def __init__(self,
+         model_name_or_path='facebook/bart-large-cnn',
+         tokenizer_name_or_path: str = None,
+         device: str = 'cpu',
+         max_length=1024,
+     ) -> None:
+         r"""
+         This function initializes the BARTScore class, which computes the BARTScore between two pieces of text.
+
+         Arguments:
+             model_name_or_path (str): The name or path of the model. Defaults to 'facebook/bart-large-cnn'.
+             tokenizer_name_or_path (str): The name or path of the tokenizer. Defaults to None.
+             device (str): The device to use. Defaults to 'cpu'.
+             max_length (int): The maximum length of the input. Defaults to 1024.
+
+         Returns:
+             None
+
+         Raises:
+             ValueError: If the device is not 'cpu' or 'cuda'.
+
+         .. attention::
+
+             If you use this class, please make sure to cite the following paper:
+
+             .. code-block:: latex
+
+                 @inproceedings{bartscore2021,
+                     author = {Yuan, Weizhe and Neubig, Graham and Liu, Pengfei},
+                     booktitle = {Advances in Neural Information Processing Systems},
+                     editor = {M. Ranzato and A. Beygelzimer and Y. Dauphin and P.S. Liang and J. Wortman Vaughan},
+                     pages = {27263--27277},
+                     publisher = {Curran Associates, Inc.},
+                     title = {BARTScore: Evaluating Generated Text as Text Generation},
+                     url = {https://proceedings.neurips.cc/paper/2021/file/e4d2b6e6fdeca3e60e0f1a62fee3d9dd-Paper.pdf},
+                     volume = {34},
+                     year = {2021}
+                 }
+
+         .. note::
+             * The default model is the BART-large-cnn model.
+             * If the tokenizer name or path is not specified, then the model name or path will be used.
+             * If the device is 'cuda', then the model will be loaded onto the GPU.
+             * If device is not specified, use the GPU if available, otherwise use the CPU.
+
+         """
+
+         if tokenizer_name_or_path is None:
+             tokenizer_name_or_path = model_name_or_path
+
+         # Set the attributes
+         self.device = device
+         self.max_length = max_length
+
+         # Load model and tokenizer
+         self.tokenizer = BartTokenizer.from_pretrained(tokenizer_name_or_path)
+         self.model = BartForConditionalGeneration.from_pretrained(model_name_or_path)
+         self.model.eval()
+         self.model.to(device)
+
+         # Set up loss
+         self.loss_fct = nn.NLLLoss(reduction='none', ignore_index=self.model.config.pad_token_id)
+         self.lsm = nn.LogSoftmax(dim=1)
+
+
+
+     # Loads the model weights from a specified path
+     def load(self,
+         weights_path=None,
+     ) -> None:
+         """
+         This function loads the model weights from a specified path.
+
+         Arguments:
+             weights_path (str): The path to the weights.
+
+         Returns:
+             None
+         """
+         if weights_path is None:
+             weights_path = 'models/bart.pth'
+
+         self.model.load_state_dict(torch.load(weights_path, map_location=self.device))
+
+
+
+     # Compute the BARTScore between source sentences and target sentences
+     def compute(self,
+         source_sentences: List[str],
+         target_sentences: Union[List[str], List[List[str]]],
+         batch_size: int = 4,
+         agg: str = 'mean',
+     ) -> Dict[str, List[float]]:
+         """
+         This function scores the target sentences against the source sentences using BARTScore.
+
+         Arguments:
+             source_sentences (List[str]): The source sentences.
+             target_sentences (Union[List[str], List[List[str]]]): The target sentences.
+             batch_size (int): The batch size to use (default: 4)
+             agg (str): The aggregation method. Defaults to 'mean'; used only when target_sentences is a list of lists.
+
+         Returns:
+             Dict[str, List[float]]: The BARTScore for each example.
+
+         Raises:
+             ValueError: If the number of source sentences and target sentences do not match.
+         """
+         # Check the number of source sentences and target sentences
+         if len(source_sentences) != len(target_sentences):
+             raise ValueError(f'Number of source sentences ({len(source_sentences)}) and number of target sentences ({len(target_sentences)}) do not match.')
+
+         # If the target sentences are a list of lists, then call the multi_ref_score function
+         if isinstance(target_sentences[0], list):
+             return self.compute_multi_ref_score(
+                 source_sentences=source_sentences,
+                 target_sentences=target_sentences,
+                 batch_size=batch_size,
+                 agg=agg
+             )
+
+         # Score for each example
+         score_list = []
+
+         for i in range(0, len(source_sentences), batch_size):
+             # Get the current batch
+             src_batch = source_sentences[i: i + batch_size]
+             tgt_batch = target_sentences[i: i + batch_size]
+             try:
+                 with torch.no_grad():
+                     # Encode the batch
+                     encoded_src = self.tokenizer(
+                         src_batch,
+                         max_length=self.max_length,
+                         truncation=True,
+                         padding=True,
+                         return_tensors='pt'
+                     )
+                     encoded_tgt = self.tokenizer(
+                         tgt_batch,
+                         max_length=self.max_length,
+                         truncation=True,
+                         padding=True,
+                         return_tensors='pt'
+                     )
+
+                     # Get the input ids and attention masks for the source and target sentences
+                     src_tokens = encoded_src['input_ids'].to(self.device)
+                     src_mask = encoded_src['attention_mask'].to(self.device)
+                     tgt_tokens = encoded_tgt['input_ids'].to(self.device)
+                     tgt_mask = encoded_tgt['attention_mask']
+                     tgt_len = tgt_mask.sum(dim=1).to(self.device)
+
+                     # Feed the batch to the model and get the loss
+                     output = self.model(
+                         input_ids=src_tokens,
+                         attention_mask=src_mask,
+                         labels=tgt_tokens
+                     )
+                     logits = output.logits.view(-1, self.model.config.vocab_size)
+                     # Compute the loss
+                     loss = self.loss_fct(self.lsm(logits), tgt_tokens.view(-1))
+                     loss = loss.view(tgt_tokens.shape[0], -1)
+                     loss = loss.sum(dim=1) / tgt_len
+                     # Get the score
+                     curr_score_list = [-x.item() for x in loss]
+                     # Append the score to the list
+                     score_list += curr_score_list
+
+             except:
+                 # If there is an error, print the traceback
+                 raise Exception(f'Error in scoring batch {i // batch_size}:\n{traceback.format_exc()}')
+         return {'score': np.array(score_list)}
+
+
+
+     # Score a batch of examples with multiple references
+     def compute_multi_ref_score(self,
+         source_sentences: List[str],
+         target_sentences: List[List[str]],
+         batch_size: int = 4,
+         agg: str = "mean",
+     ) -> Dict[str, List[float]]:
+         """
+         Score a batch of examples with multiple references.
+
+         Arguments:
+             source_sentences (List[str]): The source sentences.
+             target_sentences (List[List[str]]): The target sentences.
+             agg (str): The aggregation method. Can be "mean" or "max".
+             batch_size (int): The batch size.
+
+         Returns:
+             Dict[str, List[float]]: The BARTScore for each example.
+
+         Raises:
+             ValueError: If the number of source sentences and target sentences do not match.
+         """
+
+         # Assert we have the same number of references
+         ref_nums = [len(x) for x in target_sentences]
+         if len(set(ref_nums)) > 1:
+             raise Exception("You have different number of references per test sample.")
+
+         ref_num = len(target_sentences[0])
+         score_matrix = []
+         for i in range(ref_num):
+             curr_target_sentences = [x[i] for x in target_sentences]
+             scores = self.compute(source_sentences, curr_target_sentences, batch_size)
+             score_matrix.append(scores)
+         if agg == "mean":
+             score_list = np.mean(score_matrix, axis=0)
+         elif agg == "max":
+             score_list = np.max(score_matrix, axis=0)
+         else:
+             raise NotImplementedError(f"Aggregation method {agg} not implemented yet.")
+         return {"score": score_list}
+
+ def demo():
+     demo_setences = [
+         ("I am a student", "He is a teacher"),
+         ("나는 학생이다", "그는 선생님이다"),
+         ("점심에 온기동에서 삼겹차슈덮밥을 먹었다.", "저녁에 피나치공에서 피자와 치킨을 먹었다."),
+         ('제가 나와 있는 곳은 경남 거제시 옥포동 덕포 해수욕장에 나와 있습니다.', '강한 바람에 간판이나 지붕이 떨어지는 등 피해가 잇따르기도 했습니다.'),
+         ('Outraged mortuary workers in Kenya have criticised the country’s police chief after he accused them of leasing corpses to opposition politicians.',
+         'Head of police Japheth Koome earlier this week claimed that opposition politicians hired bodies from mortuaries and planted them at the scenes of protests so as to blame the police for brutality.')
+
+     ]
+     for str1, str2 in demo_setences:
+         print("demo : ", BARTScore().compute([str1], [str2]))
+
+ if __name__ == "__main__":
+     demo()
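For context, the bartscore____.py hunk above exposes one scoring entry point, BARTScore.compute(): it feeds the source sentences to BART, takes the per-token negative log-likelihood of the target tokens, averages it over the target length, and negates it, so each score is an average log-likelihood (typically negative) and higher means the target is more plausible given the source. The sketch below is illustrative only: the import path is inferred from the file list above, and it assumes torch, transformers, and a downloadable facebook/bart-large-cnn checkpoint are available.

    # Illustrative usage sketch; import path inferred from the file list
    # (nltkor/similarity/bartscore____.py), not a documented API.
    from nltkor.similarity.bartscore____ import BARTScore

    # Default checkpoint is facebook/bart-large-cnn, loaded on CPU here.
    scorer = BARTScore(device='cpu', max_length=1024)

    # One score per (source, target) pair; each value is the length-normalized
    # log-likelihood of the target given the source (negative; higher is better).
    result = scorer.compute(
        source_sentences=["I am a student", "나는 학생이다"],
        target_sentences=["He is a teacher", "그는 선생님이다"],
        batch_size=4,
    )
    print(result['score'])  # numpy array with one score per pair

Per its docstring, compute() also accepts a list of reference lists per source, in which case it dispatches to compute_multi_ref_score() and aggregates the per-reference scores with agg='mean' or 'max'.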
nltkor/similarity/bertscore____.py
@@ -0,0 +1,339 @@
+ """
+ string2string similarity
+ src = https://github.com/stanfordnlp/string2string
+
+
+ MIT License
+
+ Copyright (c) 2023 Mirac Suzgun
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+
+
+ """
+
+
+ """
+ This class contains the original implementation of the BERTScore algorithm by Zhang et al. (2020).
+
+ BERTScore: Evaluating Text Generation with BERT
+
+ @inproceedings{bertscore2020,
+     title={BERTScore: Evaluating Text Generation with BERT},
+     author={Tianyi Zhang* and Varsha Kishore* and Felix Wu* and Kilian Q. Weinberger and Yoav Artzi},
+     booktitle={International Conference on Learning Representations},
+     year={2020},
+     url={https://openreview.net/forum?id=SkeHuCVFDr}
+ }
+
+ Disclaimer:
+ This code is adapted from https://github.com/Tiiiger/bert_score
+ """
+
+ from typing import List, Union, Optional, Tuple
+
+ import os
+ import sys
+ import time
+ from collections import defaultdict
+
+ try:
+     import pandas as pd
+     import torch
+     from bert_score.utils import (bert_cos_score_idf, get_hash,
+                                   get_idf_dict, get_model, get_tokenizer,
+                                   lang2model, model2layers)
+     from nltk.search.kobert_tokenizer import KoBERTTokenizer
+ except ImportError:
+     raise Exception("""You need to install Library
+     please pip install below Libaries
+     \t pip install torch
+     \t pip install pandas
+     \t pip install bert_score
+     """)
+
+ # import torch
+ # from bert_score.utils import (bert_cos_score_idf, get_hash,
+ #                               get_idf_dict, get_model, get_tokenizer,
+ #                               lang2model, model2layers)
+ # from nltk.search.kobert_tokenizer import KoBERTTokenizer
+
+
+
+
+
+ class BERTScore:
+     """
+     This class implements the BERTScore algorithm.
+     """
+
+     def __init__(self,
+         model_name_or_path: str = None,
+         lang: str = None,
+         num_layers: int = None,
+         all_layers: bool = False,
+         use_fast_tokenizer: bool = False,
+         device: str = 'cpu',
+         baseline_path: str = None,
+     ) -> None:
+         r"""
+         This function initializes the BERTScore class, which computes the BERTScore between two texts.
+
+         Arguments:
+             model_name_or_path (str): BERT model type to use (e.g., bert-base-uncased).
+             lang (str): Language of the texts (e.g., en).
+             num_layers (int): Number of layers to use.
+             all_layers (bool): Whether to use all layers
+             use_fast_tokenizer (bool): Whether to use the fast tokenizer.
+             device (str): Device to use (e.g., cpu or cuda).
+             baseline_path (str): Path to the baseline file.
+
+         Returns:
+             None
+
+         Raises:
+             ValueError: If model_name_or_path and lang are both None.
+
+         .. attention::
+
+             If you use this class, please make sure to cite the following paper:
+
+             .. code-block:: latex
+
+                 @inproceedings{bertscore2020,
+                     title={BERTScore: Evaluating Text Generation with BERT},
+                     author={Tianyi Zhang* and Varsha Kishore* and Felix Wu* and Kilian Q. Weinberger and Yoav Artzi},
+                     booktitle={International Conference on Learning Representations},
+                     year={2020},
+                     url={https://openreview.net/forum?id=SkeHuCVFDr}
+                 }
+
+
+         .. note::
+             * If model_name_or_path is not specified, use the default model for the language.
+             * If num_layers is not specified, use the default number of layers.
+             * If device is not specified, use the GPU if available, otherwise use the CPU.
+             * If baseline_path is not specified, use the default baseline file.
+         """
+
+         # Check the arguments
+         if model_name_or_path is None and lang is None:
+             raise ValueError("You must specify either model_name_or_path or lang")
+
+         # Set the attributes
+         self.model_name_or_path = model_name_or_path
+         self.lang = lang
+         self.num_layers = num_layers
+         self.all_layers = all_layers
+         self.use_fast_tokenizer = use_fast_tokenizer
+         self.baseline_path = baseline_path
+
+         # If model_name_or_path is not specified, use the default model for the language
+         if self.model_name_or_path is None:
+             self.lang = lang.lower()
+             self.model_name_or_path = lang2model[self.lang]
+
+         # If num_layers is not specified, use the default number of layers
+         if num_layers is None:
+             self.num_layers = model2layers[self.model_name_or_path]
+
+         # Set the device
+         self.device = device
+         if self.device is None:
+             self.device = "cuda" if torch.cuda.is_available() else "cpu"
+
+         # Load model and tokenizer
+         if self.model_name_or_path == 'skt/kobert-base-v1':
+             self.tokenizer = KoBERTTokenizer.from_pretrained(self.model_name_or_path)
+         else:
+             self.tokenizer = get_tokenizer(self.model_name_or_path, self.use_fast_tokenizer)
+         self.model = get_model(self.model_name_or_path, self.num_layers, self.all_layers)
+         self.model.eval()
+         self.model.to(device)
+
+
+
+     # Compute the BERTScore between source sentences and target sentences
+     def compute(self,
+         source_sentences: List[str],
+         target_sentences: Union[List[str], List[List[str]]],
+         batch_size: int = 4,
+         idf: bool = False,
+         nthreads: int = 4,
+         return_hash: bool = False,
+         rescale_with_baseline: bool = False,
+         verbose: bool = False,
+     ) -> Union[dict, Optional[str]]:
+         """
+         This function scores the source sentences based on their similarity to the target sentences using BERTScore.
+
+         Arguments:
+             source_sentences (list of str): candidate sentences
+             target_sentences (list of str or list of list of str): reference sentences
+             batch_size (int): bert score processing batch size
+             idf (bool or dict): use idf weighting, can also be a precomputed idf_dict
+             nthreads (int): number of threads
+             return_hash (bool): return hashcode of the setting
+             rescale_with_baseline (bool): rescale bertscore with pre-computed baseline
+             verbose (bool): turn on intermediate status update
+
+         Returns:
+             (Dict[str, Tensor], Optional[str]): A dictionary containing the precision, recall, and F1 score, and the hashcode (if return_hash is True).
+             where the precision, recall, and F1 score are tensors of shape (len(source_sentences),
+
+         Raises:
+             ValueError: If the number of source sentences and target sentences do not match.
+         """
+
+         # Check the arguments
+         if len(source_sentences) != len(target_sentences):
+             raise ValueError("The number of candidates and references do not match")
+
+         # If the target sentences are grouped, flatten them
+         ref_group_boundaries = None
+         if not isinstance(target_sentences[0], str):
+             ref_group_boundaries = []
+             ori_source_sentences, ori_target_sentences = source_sentences, target_sentences
+             source_sentences, target_sentences = [], []
+             count = 0
+             for cand, ref_group in zip(ori_source_sentences, ori_target_sentences):
+                 source_sentences += [cand] * len(ref_group)
+                 target_sentences += ref_group
+                 ref_group_boundaries.append((count, count + len(ref_group)))
+                 count += len(ref_group)
+
+         if rescale_with_baseline and self.baseline_path is None:
+             raise ValueError("Need to specify baseline_path when rescaling with baseline")
+
+         # Get the IDF dict
+         if not idf:
+             idf_dict = defaultdict(lambda: 1.0)
+             # set idf for [SEP] and [CLS] to 0
+             idf_dict[self.tokenizer.sep_token_id] = 0
+             idf_dict[self.tokenizer.cls_token_id] = 0
+         elif isinstance(idf, dict):
+             if verbose:
+                 print("using predefined IDF dict...")
+             idf_dict = idf
+         else:
+             if verbose:
+                 print("preparing IDF dict...")
+             start = time.perf_counter()
+             idf_dict = get_idf_dict(target_sentences, self.tokenizer, nthreads=nthreads)
+             if verbose:
+                 print("done in {:.2f} seconds".format(time.perf_counter() - start))
+
+         if verbose:
+             print("calculating scores...")
+
+         start = time.perf_counter()
+
+         # Get all the predictions
+         all_preds = bert_cos_score_idf(
+             model = self.model,
+             refs = target_sentences,
+             hyps = source_sentences,
+             tokenizer= self.tokenizer,
+             idf_dict = idf_dict,
+             verbose = verbose,
+             device = self.device,
+             batch_size=batch_size,
+             all_layers=self.all_layers,
+         ).cpu()
+
+         # If the target sentences are grouped, take the max score
+         if ref_group_boundaries is not None:
+             max_preds = []
+             for beg, end in ref_group_boundaries:
+                 max_preds.append(all_preds[beg:end].max(dim=0)[0])
+             all_preds = torch.stack(max_preds, dim=0)
+
+         # Rescale with baseline
+         use_custom_baseline = self.baseline_path is not None
+         if rescale_with_baseline:
+             if self.baseline_path is None:
+                 self.baseline_path = os.path.join(
+                     os.path.dirname(__file__), f"rescale_baseline/{self.lang}/{self.model_name_or_path}.tsv"
+                 )
+             if os.path.isfile(self.baseline_path):
+                 if not self.all_layers:
+                     baselines = torch.from_numpy(
+                         pd.read_csv(self.baseline_path).iloc[self.num_layers].to_numpy()
+                     )[1:].float()
+                 else:
+                     baselines = (
+                         torch.from_numpy(pd.read_csv(self.baseline_path).to_numpy())[:, 1:]
+                         .unsqueeze(1)
+                         .float()
+                     )
+
+                 all_preds = (all_preds - baselines) / (1 - baselines)
+             else:
+                 print(
+                     f"Warning: Baseline not Found for {self.model_name_or_path} on {self.lang} at {self.baseline_path}",
+                     file=sys.stderr,
+                 )
+
+         # Get the final output
+         out = all_preds[..., 0], all_preds[..., 1], all_preds[..., 2]  # P, R, F
+         scores = {
+             "precision": out[0].numpy(),
+             "recall": out[1].numpy(),
+             "f1": out[2].numpy(),
+         }
+
+         # Print the time
+         if verbose:
+             time_diff = time.perf_counter() - start
+             print(
+                 f"done in {time_diff:.2f} seconds, {len(target_sentences) / time_diff:.2f} sentences/sec"
+             )
+
+         # If return hash, return both the output and the hash
+         if return_hash:
+             return tuple(
+                 [
+                     scores,
+                     get_hash(
+                         self.model_name_or_path,
+                         self.num_layers,
+                         idf,
+                         rescale_with_baseline,
+                         use_custom_baseline=use_custom_baseline,
+                         use_fast_tokenizer=self.use_fast_tokenizer,
+                     ),
+                 ]
+             )
+         # Otherwise, just return the output
+         return scores
+
+
+
+
+ def demo():
+     demo_setences = [
+         ("I am a student", "He is a teacher"),
+         ("나는 학생이다", "그는 선생님이다"),
+         ("점심에 온기동에서 삼겹차슈덮밥을 먹었다.", "저녁에 피나치공에서 피자와 치킨을 먹었다.")
+     ]
+     for str1, str2 in demo_setences:
+         print("demo : ", BERTScore(model_name_or_path='bert-base-uncased', lang='en', num_layers=12).compute([str1], [str2]))
+
+ if __name__ == "__main__":
+     demo()
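The bertscore____.py hunk above is a thin wrapper around the reference bert_score utilities: compute() embeds candidates and references with the chosen BERT model, greedily matches tokens by cosine similarity of their contextual embeddings (optionally IDF-weighted), and returns per-pair precision, recall, and F1, with optional baseline rescaling and, when return_hash=True, a hash of the scoring configuration. A minimal usage sketch under the same caveats as before (import path inferred from the file list; torch, pandas, and bert_score assumed to be installed):

    # Illustrative usage sketch; import path inferred from the file list
    # (nltkor/similarity/bertscore____.py), not a documented API.
    from nltkor.similarity.bertscore____ import BERTScore

    # Either model_name_or_path or lang must be given; num_layers selects the
    # hidden layer whose embeddings are used for matching.
    scorer = BERTScore(model_name_or_path='bert-base-uncased', lang='en',
                       num_layers=12, device='cpu')

    scores = scorer.compute(
        source_sentences=["I am a student"],    # candidates
        target_sentences=["He is a teacher"],   # references
        batch_size=4,
        idf=False,
    )
    # scores is a dict of numpy arrays keyed by 'precision', 'recall', and 'f1',
    # one value per candidate/reference pair.
    print(scores['f1'])

Passing a list of reference lists per candidate makes compute() flatten the pairs, score them all, and keep the best-matching reference for each candidate (the maximum over each group).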