nltkor-1.2.18-cp39-cp39-macosx_10_9_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. nltkor/Kor_char.py +193 -0
  2. nltkor/__init__.py +16 -0
  3. nltkor/alignment/__init__.py +1315 -0
  4. nltkor/cider/__init__.py +2 -0
  5. nltkor/cider/cider.py +55 -0
  6. nltkor/cider/cider_scorer.py +207 -0
  7. nltkor/distance/__init__.py +441 -0
  8. nltkor/distance/wasserstein.py +126 -0
  9. nltkor/etc.py +22 -0
  10. nltkor/lazyimport.py +144 -0
  11. nltkor/make_requirement.py +11 -0
  12. nltkor/metrics/__init__.py +62 -0
  13. nltkor/metrics/bartscore.py +301 -0
  14. nltkor/metrics/bertscore.py +331 -0
  15. nltkor/metrics/classical.py +859 -0
  16. nltkor/metrics/entment.py +24 -0
  17. nltkor/metrics/eval.py +517 -0
  18. nltkor/metrics/mauve.py +273 -0
  19. nltkor/metrics/mauve_utils.py +131 -0
  20. nltkor/misc/__init__.py +11 -0
  21. nltkor/misc/string2string_basic_functions.py +59 -0
  22. nltkor/misc/string2string_default_tokenizer.py +83 -0
  23. nltkor/misc/string2string_hash_functions.py +159 -0
  24. nltkor/misc/string2string_word_embeddings.py +503 -0
  25. nltkor/search/__init__.py +11 -0
  26. nltkor/search/classical.py +569 -0
  27. nltkor/search/faiss_search.py +897 -0
  28. nltkor/search/kobert_tokenizer.py +181 -0
  29. nltkor/search/search_dict.py +95 -0
  30. nltkor/sejong/__init__.py +3 -0
  31. nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
  32. nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
  33. nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
  34. nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
  35. nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
  36. nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
  37. nltkor/sejong/ch.py +12 -0
  38. nltkor/sejong/dict_semClassNum.txt +491 -0
  39. nltkor/sejong/layer.txt +630 -0
  40. nltkor/sejong/sejong_download.py +87 -0
  41. nltkor/sejong/ssem.py +684 -0
  42. nltkor/similarity/__init__.py +3 -0
  43. nltkor/similarity/bartscore____.py +337 -0
  44. nltkor/similarity/bertscore____.py +339 -0
  45. nltkor/similarity/classical.py +245 -0
  46. nltkor/similarity/cosine_similarity.py +175 -0
  47. nltkor/tag/__init__.py +71 -0
  48. nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
  49. nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
  50. nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
  51. nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
  52. nltkor/tag/espresso_tag.py +220 -0
  53. nltkor/tag/libs/__init__.py +10 -0
  54. nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
  55. nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
  56. nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
  57. nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
  58. nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
  59. nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
  60. nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
  61. nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
  62. nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
  63. nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
  64. nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
  65. nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
  66. nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
  67. nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
  68. nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
  69. nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
  70. nltkor/tag/libs/arguments.py +280 -0
  71. nltkor/tag/libs/attributes.py +231 -0
  72. nltkor/tag/libs/config.py +159 -0
  73. nltkor/tag/libs/metadata.py +129 -0
  74. nltkor/tag/libs/ner/__init__.py +2 -0
  75. nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
  76. nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
  77. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
  78. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
  79. nltkor/tag/libs/ner/macmorphoreader.py +7 -0
  80. nltkor/tag/libs/ner/ner_reader.py +92 -0
  81. nltkor/tag/libs/network.c +68949 -0
  82. nltkor/tag/libs/network.cpython-39-darwin.so +0 -0
  83. nltkor/tag/libs/network.pyx +878 -0
  84. nltkor/tag/libs/networkconv.pyx +1028 -0
  85. nltkor/tag/libs/networkdependencyconv.pyx +451 -0
  86. nltkor/tag/libs/parse/__init__.py +1 -0
  87. nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
  88. nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
  89. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
  90. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
  91. nltkor/tag/libs/parse/parse_reader.py +283 -0
  92. nltkor/tag/libs/pos/__init__.py +2 -0
  93. nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
  94. nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
  95. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
  96. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
  97. nltkor/tag/libs/pos/macmorphoreader.py +7 -0
  98. nltkor/tag/libs/pos/pos_reader.py +97 -0
  99. nltkor/tag/libs/reader.py +485 -0
  100. nltkor/tag/libs/srl/__init__.py +3 -0
  101. nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
  102. nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
  103. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
  104. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
  105. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
  106. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
  107. nltkor/tag/libs/srl/__srl_reader_.py +535 -0
  108. nltkor/tag/libs/srl/srl_reader.py +436 -0
  109. nltkor/tag/libs/srl/train_srl.py +87 -0
  110. nltkor/tag/libs/taggers.py +926 -0
  111. nltkor/tag/libs/utils.py +384 -0
  112. nltkor/tag/libs/word_dictionary.py +239 -0
  113. nltkor/tag/libs/wsd/__init__.py +2 -0
  114. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
  115. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
  116. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
  117. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
  118. nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
  119. nltkor/tag/libs/wsd/wsd_reader.py +93 -0
  120. nltkor/tokenize/__init__.py +62 -0
  121. nltkor/tokenize/ko_tokenize.py +115 -0
  122. nltkor/trans.py +121 -0
  123. nltkor-1.2.18.dist-info/LICENSE.txt +1093 -0
  124. nltkor-1.2.18.dist-info/METADATA +42 -0
  125. nltkor-1.2.18.dist-info/RECORD +127 -0
  126. nltkor-1.2.18.dist-info/WHEEL +5 -0
  127. nltkor-1.2.18.dist-info/top_level.txt +1 -0
nltkor/metrics/classical.py
@@ -0,0 +1,859 @@
+ import os
+ import sys
+ import argparse
+ import numpy as np
+ from typing import Callable, Iterable, List, Tuple, Union
+ from copy import deepcopy
+ import itertools
+ import torch
+ import time
+ import math
+ from nltk.translate.bleu_score import *
+ from nltk.metrics import confusionmatrix
+ from collections import defaultdict
+ from nltk.util import ngrams, skipgrams
+ #from nltkor.tokenize.ko_tokenize import word_tokenize,sent_tokenize,syllable_tokenize
+ from nltkor.make_requirement import make_requirement
+ from nltkor.tokenize import Ko_tokenize
+ from nltkor.cider.cider import Cider
+ from nltkor.tag import EspressoTagger
+ from nltkor.sejong import ssem
+
+ try:
+     import torch
+ except ImportError:
+     file_path = make_requirement(['torch'])
+     raise Exception(f"""
+     Need to install libraries, please pip install the libraries below
+     \t pip install torch
+     Or, use pip install with the requirements file
+     \t pip install -r {file_path}
+     """)
+
+
+ class DefaultMetric:
+
+     def __init__(self, lang="ko"):
+         self.lang = lang
+         #if lang not in ["korean", "english"]:
+         if lang not in ["ko", "en"]:
+             raise Exception("Only \"ko\" or \"en\" in lang parameter")
+         self.tokenize=lambda ref: Ko_tokenize.word(ref,self.lang)
+         self.skip_pos = ['EE']
+
+     def accuracy_score(self, true, pred):
+
+         mat=confusionmatrix.ConfusionMatrix(true,pred)
+
+         conf=mat._confusion
+         total=0
+         tp=0
+
+         for r, tmp in enumerate(conf):
+             for v, n in enumerate(tmp):
+                 if r==v:
+                     tp+=n
+                 total+=n
+
+         return float(tp/total)
+
+     def accuracy_norm(self, model, tokenizer, input_text: str, candidates: list, label: int):
+         reserved_memory = []
+         inference_time = []
+         tokenized_prompt = tokenizer(input_text, return_tensors='pt').input_ids
+         total_candidate = []
+
+         for ending in candidates:
+             len_ending = len(ending)
+             tokenized_ending = tokenizer(ending, return_tensors='pt').input_ids
+             tokenized_ending = tokenized_ending[:, 1:]
+             input_ids = torch.cat([tokenized_prompt, tokenized_ending], dim=-1).cuda()
+             labels = input_ids.clone()
+             labels[0, :tokenized_prompt.shape[1]] = -100
+             start = time.time()
+             with torch.no_grad():
+                 outputs = model(input_ids, labels=labels)
+             inference_time.append(time.time() - start)
+             reserved_memory.append(torch.cuda.memory_reserved() / (1024**2))
+             total_logprobs = -outputs.loss.item() * tokenized_ending.shape[1]
+             total_candidate.append(total_logprobs/len_ending)
+         answer_idx = total_candidate.index(max(total_candidate))
+         if int(label) == answer_idx:
+             cor = 1
+         else:
+             cor = 0
+         metric_dict = {
+             "reserved_memory": reserved_memory,
+             "inference_time": inference_time
+         }
+         return cor, metric_dict
+
+     def recall_score(self, true, pred, avg='micro'):
+
+         mat=confusionmatrix.ConfusionMatrix(true,pred)
+
+         conf=mat._confusion
+         indices=mat._indices
+         values=mat._values
+         total=0
+
+
+         if len(values)==2:
+             tp=0
+             fn=0
+             for r,i in enumerate(conf):
+                 for r2,v in enumerate(i):
+                     if r==0:
+                         continue
+                     elif r2==0:
+                         fn=v
+                     elif r==1:
+                         tp=v
+
+             return float(tp/(tp+fn))
+
+
+         c_tp=[]
+         c_fn=[]
+         recall_cls=[]
+
+         for r, tmp in enumerate(conf):
+             temp=0
+             for v, n in enumerate(tmp):
+                 if r==v:
+                     c_tp.append(n)
+                 else:
+                     temp+=n
+             c_fn.append(temp)
+
+         if avg=='macro':
+
+             for tmp in range(len(values)):
+                 try:
+                     recall_cls.append(float(c_tp[tmp]/(c_tp[tmp]+c_fn[tmp])))
+                 except:
+                     recall_cls.append(0)
+
+             temp=0
+
+             for tmp in recall_cls:
+                 temp+=tmp
+
+             return float(temp/len(recall_cls))
+
+         if avg=='micro':
+             ja=0
+             mo=0
+
+             for tmp in range(len(values)):
+                 ja+=c_tp[tmp]
+                 mo+=c_tp[tmp]+c_fn[tmp]
+
+             return float(ja/mo)
+
+         else:
+             return "avg expects 'micro' or 'macro'"
+
+
+
+     def precision_score(self, true, pred, avg='micro'):
+
+
+         mat=confusionmatrix.ConfusionMatrix(true,pred)
+
+         conf=mat._confusion
+         values=mat._values
+
+         total=0
+
+         if len(values)==2:
+             tp=0
+             fp=0
+             for r,i in enumerate(conf):
+                 for r2,v in enumerate(i):
+                     if r2==0:
+                         continue
+                     elif r==0:
+                         fp=v
+                     elif r==1:
+                         tp=v
+
+             return float(tp/(tp+fp))
+
+         c_tp=list()
+         c_fp=[0 for _ in range(len(values))]
+         recall_cls=[]
+
+         for r, tmp in enumerate(conf):
+             for v, n in enumerate(tmp):
+                 if r==v:#tp
+                     c_tp.append(n)
+                 else:
+                     c_fp[v]+=n
+
+         if avg=='macro':
+             for tmp in range(len(values)):
+                 try:
+                     recall_cls.append(float(c_tp[tmp]/(c_tp[tmp]+c_fp[tmp])))
+                 except:
+                     recall_cls.append(0)
+
+             temp=0
+
+             for tmp in recall_cls:
+                 temp+=tmp
+
+             return float(temp/len(recall_cls))
+
+
+         elif avg=='micro':
+             ja=0
+             mo=0
+
+             for tmp in range(len(values)):
+                 ja+=c_tp[tmp]
+                 mo+=c_tp[tmp]+c_fp[tmp]
+
+             return float(ja/mo)
+
+         else:
+             return "avg expects 'micro' or 'macro'"
+
+
+     def f1_score(self, true, pred, avg='micro'):
+
+         if avg =='micro' or avg =='macro':
+
+             precision=self.precision_score(true,pred,avg)
+             recall=self.recall_score(true,pred,avg)
+         else:
+             return "avg expects 'micro' or 'macro'"
+
+         return (((precision*recall)/(precision+recall))*2)
+
+
+
+
+     def pos_eval(self, fin):
+
+         #temp=os.getcwd()+'/'+fin
+         file=open(fin,'r').read()
+         sents=file.split("\n\n")
+
+         acc = defaultdict(float)
+         t_avg = defaultdict(float)
+
+         for sent in sents:
+             lines=sent.split('\n')
+             for line in lines:
+                 tot=line.split('\t')
+
+                 if line=='':continue
+
+                 wd=tot[0]
+                 gold=tot[1]
+                 pred=tot[2]
+
+                 acc['all']+=1
+                 gold_list=gold.split('+')
+                 pred_list=pred.split('+')
+
+                 t_avg["pr_all"]+=len(pred_list)
+                 t_avg["rc_all"]+=len(gold_list)
+
+                 if gold==pred:
+                     acc["true"]+=1
+                     t_avg['pr']+=len(pred_list)
+                     t_avg['rc']+=len(gold_list)
+                     continue
+                 else :
+                     intersect=0
+                     for g in gold_list:
+                         if not g in pred_list: continue
+                         intersect+=1
+                     t_avg['pr']+=intersect
+                     t_avg['rc']+=intersect
+
+
+         t_avg['pr_result'] = t_avg['pr'] / t_avg['pr_all']
+         t_avg['rc_result'] = t_avg['rc'] / t_avg['rc_all']
+
+         return float(acc['true']/acc['all']), t_avg['pr_result'], t_avg['rc_result'], self.f1(t_avg['pr_result'], t_avg['rc_result'])
+
+
+     def f1(self, p, r):
+         return 2 * p * r / (p + r) if p + r else 0
+
+
+     def precision_at_k(self, true: List[int], pred: List[int], k: int) -> float:
+         """
+         Precision@k: the fraction of the top-k predictions that appear in `true`.
+         """
+
+         relevant = 0
+
+         if k > len(pred):
+             raise ValueError("`k` is bigger than pred's length")
+
+         pred = pred[:k]
+
+         for t in true:
+             if t in pred:
+                 relevant += 1
+
+
+         return float(relevant/len(pred))
+
+     def recall_at_k(self, true: List[int], pred: List[int], k: int) -> float:
+
+         relevant = 0
+
+         if k > len(pred):
+             raise ValueError("`k` is bigger than pred's length")
+
+         pred = pred[:k]
+
+         for t in true:
+             if t in pred:
+                 relevant += 1
+
+
+         return float(relevant/len(true))
+
+     def hit_rate_at_k(self, user: List[List[int]], pred: List[List[int]], k: int) -> float:
+         hit = 0
+
+         for u_list, p_list in zip(user, pred):
+             try:
+                 p_list = p_list[:k]
+             except:
+                 raise ValueError("`k` is bigger than pred's length")
+             for u in u_list:
+                 if u in p_list:
+                     hit += 1
+                     break
+
+         return float(hit/len(user))
+
+     def mean_absolute_error(self, true: Union[torch.Tensor, np.ndarray], pred: Union[torch.Tensor, np.ndarray]) -> float:
+         pass
+
+     def root_mean_square_error(self, true: Union[torch.Tensor, np.ndarray], pred: Union[torch.Tensor, np.ndarray]) -> float:
+         pass
+
+     def _W_CER(self, r, h):
+
+         costs = [[0 for inner in range(len(h)+1)] for outer in range(len(r)+1)]
+
+         DEL_PENALTY=1 # Tact
+         INS_PENALTY=1 # Tact
+         SUB_PENALTY=1 # Tact
+
+         for i in range(1, len(r)+1):
+             costs[i][0] = DEL_PENALTY*i
+
+         for j in range(1, len(h) + 1):
+             costs[0][j] = INS_PENALTY*j
+
+         # computation
+         for i in range(1, len(r)+1):
+             for j in range(1, len(h)+1):
+                 if r[i-1] == h[j-1]:
+                     costs[i][j] = costs[i-1][j-1]
+                 else:
+                     substitutionCost = costs[i-1][j-1] + SUB_PENALTY # penalty is always 1
+                     insertionCost = costs[i][j-1] + INS_PENALTY # penalty is always 1
+                     deletionCost = costs[i-1][j] + DEL_PENALTY # penalty is always 1
+
+                     costs[i][j] = min(substitutionCost, insertionCost, deletionCost)
+
+         mo = len(r)
+         i = len(r)
+         j = len(h)
+
+         result=(costs[i][j])/mo
+
+         if result>1.0:
+             return 1.0
+         else:
+             return result
+
+
+     def wer(self, reference, candidate):
+         r = Ko_tokenize.word(reference)
+         h = Ko_tokenize.word(candidate)
+
+         return self._W_CER(r,h)
+
+
+     def cer(self, reference,candidate):
+         r = Ko_tokenize.syllable(reference)
+         h = Ko_tokenize.syllable(candidate)
+
+         return self._W_CER(r,h)
+
+
+     def bleu(self, reference, candidate, weights=(0.25,0.25,0.25,0.25), smoothing_function=None):
+
+         if type(candidate)!=list or type(reference)!=list:
+             print("parameters expect list type")
+             return
+
+         reference=list(map(self.tokenize,reference))
+         candidate=Ko_tokenize.word(candidate)
+
+         return sentence_bleu(reference,candidate,weights,smoothing_function=smoothing_function)
+
+
+     def bleu_n(self, reference, candidate, n=1, smoothing_function=None):
+
+         if n==1:
+             return self.bleu(reference,candidate,(1,0,0,0), smoothing_function=smoothing_function)
+         elif n==2:
+             return self.bleu(reference,candidate,(0,1,0,0), smoothing_function=smoothing_function)
+         elif n==3:
+             return self.bleu(reference,candidate,(0,0,1,0), smoothing_function=smoothing_function)
+         elif n==4:
+             return self.bleu(reference,candidate,(0,0,0,1), smoothing_function=smoothing_function)
+
+     def bleu_tensor(self,reference,candidate,n=0, smoothing_function=None):
+
+         if n: weights = tuple(1 if i == n-1 else 0 for i in range(4))
+         else: weights = (0.25, 0.25, 0.25, 0.25)
+
+         reference=reference.unsqueeze(1)
+         reference=reference.numpy()
+         candidate=candidate.numpy()
+         return torch.tensor(corpus_bleu(reference,candidate,weights,smoothing_function=smoothing_function))
+
+
+
+
+
+     def _hyp_sent_split_remove(self, can):
+
+         can_sent=[[tmp.rstrip('.?!,\n')] for tmp in Ko_tokenize.sentence(can)]
+         return can_sent
+
+     def _ref_sent_split_remove(self, ref):
+
+         ref_sent=[Ko_tokenize.sentence(tmp) for tmp in ref]
+         ref_sent_c=[]
+         for tmp in ref_sent:
+             ref_sent_in=[]
+             for tmp2 in tmp:
+                 ref_sent_in.append(Ko_tokenize.word(tmp2.rstrip('.?!,\n')))
+             ref_sent_c.append(ref_sent_in)
+
+         return ref_sent_c
+
+     def _token(self, ref_stoken, can, n):
+
+         numer=[]
+         ref_len=0
+
+         can=list(ngrams(can,n))
+
+         for tmp in ref_stoken:
+
+             if n==1:
+                 ref=list(ngrams(tmp,1))
+             elif n==2:
+                 ref=list(ngrams(tmp,2))
+             elif n==3:
+                 ref=list(ngrams(tmp,3))
+             else:return 0
+
+             intersect = [t for t in ref if t in can ]
+             numer.append(len(intersect))
+             ref_len+=len(ref)
+
+         try:
+             rec= sum(numer)/ref_len
+         except:
+             rec=0
+
+         return rec
+
+
+
+     def rouge_n(self, ref, can, n=1):
+
+         beta=1
+         rec,prec=0,0
+
+         can_sent=self._hyp_sent_split_remove(can)
+         can_word=list(itertools.chain(*[Ko_tokenize.word(tmp,self.lang) for tmp in can_sent]))
+         ref=self._ref_sent_split_remove(ref)
+
+         r_list=[]
+
+         for tmp in ref:
+             if n==1:
+                 r_list.append(self._token(tmp, can_word, 1))
+             elif n==2:
+                 r_list.append(self._token(tmp, can_word, 2))
+             elif n==3:
+                 r_list.append(self._token(tmp, can_word, 3))
+
+         return max(r_list)
+
+
+
+     def rouge_l(self, ref, can):
+
+         beta=1
+         #check=0
+
+         can= self._hyp_sent_split_remove(can)
+         can=[Ko_tokenize.word(tmp,self.lang) for tmp in can]
+         refs=self._ref_sent_split_remove(ref)
+
+         can_word=list(itertools.chain(*can))
+
+         result_list=[]
+
+         for ref in refs:
+             lcs_list=[]
+             for ri in ref:
+                 ri_C=[]
+                 for ci in can:
+                     temp=self._lcs(ci,ri)
+                     ri_C.append(temp)
+
+                 ri_C=list(itertools.chain(*ri_C))
+                 ri_C=set(ri_C)
+                 lcs_list.append(len(ri_C))
+
+             ref_word=list(itertools.chain(*ref))
+
+             R_lcs=sum(lcs_list)/len(ref_word)
+             P_lcs=sum(lcs_list)/len(can_word)
+
+             try:
+                 F_lcs= (2*R_lcs*P_lcs)/(R_lcs+P_lcs)
+             except:
+                 F_lcs=0
+             result_list.append(F_lcs)
+
+         return max(result_list)
+
+
+
+     def _lcs(self, can, ref):
+
+
+         s1=can
+         s2=ref
+         check=0
+
+         if len(s1)<=len(s2):
+             temp=s1
+             s1=s2
+             s2=temp
+             check=1
+
+         m = [[0] * (1 + len(s2)) for i in range(1 + len(s1))]
+
+         for x in range(1, 1 + len(s1)):
+             for y in range(1, 1 + len(s2)):
+                 if s1[x - 1] == s2[y - 1]:
+                     m[x][y] = m[x - 1][y - 1] +1
+                 else:
+                     m[x][y]=max(m[x][y-1],m[x-1][y])
+         f_x=len(s2)+1
+         lcs=m[len(s1)][len(s2)]
+         temp=[]
+
+
+         i=len(s1)
+         j=len(s2)
+
+         while m[i][j]!=0:
+             if(m[i][j]==m[i][j-1]):
+                 j-=1
+             elif (m[i][j]==m[i-1][j]):
+                 i-=1
+             else:
+                 if check==0:
+                     temp.append(s1[i-1])
+                 if check==1:
+                     temp.append(s2[j-1])
+                 i-=1
+                 j-=1
+
+         return temp
+         '''
+         for y in reversed(range(1,1+len(s1))):
+             for x in reversed(range(1,1+len(s2))):
+                 if (m[y][x]-m[y-1][x-1]==1) and (m[y][x]-m[y-1][x]==1) and (m[y][x]-m[y][x-1]==1):
+                     if (y==len(s1)+1) and (x==len(s2)):
+                         temp.append(x)
+                     else:
+                         temp.append(x-1)
+
+         print('only "the police" should come out', temp)
+         if check==0:
+             word=s1
+         elif check==1:
+             word=s2
+
+         ret_list=[]
+
+         for tmp in range(len(temp)):
+             ret_list.append(word[temp[tmp]])
+
+         return ret_list
+         '''
+
+
+     def _skip_bigrams(self, ref_stoken, can_sent, can, n=1):
+
+         beta=1
+         numer=[]
+         ref_len=0
+
+         candidate=list(skipgrams(can,2,n))
+         can_sent=[Ko_tokenize.word(tmp,self.lang) for tmp in can_sent]
+         can_sk_len=0
+
+         for tmp in ref_stoken:
+             ref=list(skipgrams(tmp,2,n))
+             intersect=[t for t in ref if t in candidate]
+             numer.append(len(intersect))
+             ref_len+=len(ref)
+
+         for tmp in can_sent:
+             can_sk_len+=len(list(skipgrams(tmp,2,n)))
+
+         prec=sum(numer)/can_sk_len
+         rec=sum(numer)/ref_len
+
+         if(prec!=0 and rec!=0):
+             score = ((1 + beta**2)*prec*rec)/float(rec + beta**2*prec)
+         else:
+             score = 0.0
+         return score
+
+
+     def rouge_s(self, ref, can, n):
+
+         can_sent= self._hyp_sent_split_remove(can)
+         can_word=list(itertools.chain(*[Ko_tokenize.word(tmp,self.lang) for tmp in can_sent]))
+         ref= self._ref_sent_split_remove(ref)
+
+
+         r_list=[]
+
+         for tmp in ref:
+             #tmp=list(itertools.chain(*tmp))
+             r_list.append(self._skip_bigrams(tmp,can_sent,can_word,n))
+
+         return max(r_list)
+
+
+     def cider(self, ref, hyp):
+
+         ref_dict=dict()
+         hyp_dict=dict()
+
+         ref_dict[0]=ref
+         hyp_dict[0]=hyp
+
+         cider_score=Cider()
+         score=cider_score.compute_score(ref_dict,hyp_dict)
+
+         return float(score)
+
+     def _process_espresso_output_format(self, result_list):
+         temp_list = []
+         for k in result_list:
+             #k = k.split('_')
+             k = list(k)
+             if k[1] == 'SP' or k[1] == 'SY':
+                 continue
+             temp_list.append(k)
+         return temp_list
+
+     def _generate_enum(self, ref, hyp):
+         result_hyp = []
+         result_ref = []
+         for h in hyp:
+             enum_hyp_list = list(enumerate(h))
+             result_hyp.append(enum_hyp_list)
+         for r in ref:
+             enum_ref_list = list(enumerate(r))
+             result_ref.append(enum_ref_list)
+         return result_hyp, result_ref
+
+     def _tag_pos_meteor(self, sent_list):
+         result_list = list()
+         for sent in sent_list:
+             tagged_sent = EspressoTagger(task='pos').tag(sent)
+             tagged_sent = self._process_espresso_output_format(tagged_sent)
+             result_list.append(tagged_sent)
+         return result_list
+
+     def _match_enums(self,
+                      enum_hypothesis_list: List[Tuple[int, str]],
+                      enum_reference_list: List[Tuple[int, str]],
+                      ) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]:
+         """
+         matches exact words in hypothesis and reference and returns
+         a word mapping between enum_hypothesis_list and enum_reference_list
+         based on the enumerated word id.
+
+         :param enum_hypothesis_list: enumerated hypothesis list
+         :param enum_reference_list: enumerated reference list
+         :return: enumerated matched tuples, enumerated unmatched hypothesis tuples,
+                  enumerated unmatched reference tuples
+         """
+         word_match = []
+         # print("test 213" , enum_hypothesis_list)
+         # print("test 124" , enum_reference_list)
+         for i in range(len(enum_hypothesis_list))[::-1]:
+             for j in range(len(enum_reference_list))[::-1]:
+                 # print(f"\n \t {enum_hypothesis_list[i][1]} \t {enum_reference_list[j][1]}")
+                 if enum_hypothesis_list[i][1] == enum_reference_list[j][1]:
+
+                     # print("Check!!")
+                     word_match.append(
+                         (enum_hypothesis_list[i][0], enum_reference_list[j][0])
+                     )
+                     enum_hypothesis_list.pop(i)
+                     enum_reference_list.pop(j)
+                     break
+         return word_match, enum_hypothesis_list, enum_reference_list
+
+
+     def _count_chunks(self, matches: List[Tuple[int, int]]) -> int:
+         """
+         Counts the fewest possible number of chunks such that matched unigrams
+         of each chunk are adjacent to each other. This is used to calculate the
+         fragmentation part of the metric.
+
+         :param matches: list containing a mapping of matched words (output of align_words)
+         :return: Number of chunks a sentence is divided into post alignment
+         """
+         i = 0
+         chunks = 1
+         while i < len(matches) - 1:
+             if (matches[i + 1][0] == matches[i][0] + 1) and (
+                 matches[i + 1][1] == matches[i][1] + 1
+             ):
+                 i += 1
+                 continue
+             i += 1
+             chunks += 1
+         return chunks
+
+     def _match_syn_with_sejong(self, hyp_list, ref_list):
+         # print("2")
+         syn_match = []
+         for i in range(len(hyp_list))[::-1]:
+             temp_syn_list = []
+             #print("test 344434: ", hyp_list[i])
+             if hyp_list[i][1][1] not in self.skip_pos:
+                 entrys = ssem.entrys(hyp_list[i][1][0])
+                 # print("1")
+                 # print(entrys)
+                 for entry in entrys:
+                     # print(entry)
+                     for sense in entry.senses():
+                         if sense.syn():
+                             temp_syn_list.append(sense.syn())
+                 if temp_syn_list:
+                     hyp_list[i][1].append(deepcopy(temp_syn_list))
+
+             for j in range(len(ref_list))[::-1]:
+                 is_break = False
+                 if len(hyp_list[i][1]) == 3:
+                     for syn in hyp_list[i][1][2]:
+
+                         if syn[0] == ref_list[j][1][0]:
+                             syn_match.append(
+                                 (hyp_list[i][0], ref_list[j][0])
+                             )
+                             is_break = True
+                             hyp_list.pop(i)
+                             ref_list.pop(j)
+                             break
+                 else:
+                     if hyp_list[i][1] == ref_list[j][1]:
+                         syn_match.append(
+                             (hyp_list[i][0], ref_list[j][0])
+                         )
+                         is_break = True
+                         hyp_list.pop(i)
+                         ref_list.pop(j)
+                 if is_break:
+                     break
+
+
+
+             # print("test 231232 ", hyp_list[i])
+
+
+         return syn_match, hyp_list, ref_list
+
+     def meteor(self, ref, hyp):
+         ref_tag = self._tag_pos_meteor(ref)
+         hyp_tag = self._tag_pos_meteor(hyp)
+         meteors = []
+         alpha = 0.9
+         beta = 3.0
+         gamma = 0.5
+         enum_hyp, enum_ref = self._generate_enum(ref_tag, hyp_tag)
+         # print("$")
+         # print("test 13333 ", enum_hyp)
+         for reference in enum_ref:
+             hyp_len = len(enum_hyp[0])
+             ref_len = len(reference)
+
+             # word/stem matching
+             word_match, enum_hyp_list, enum_ref_list = self._match_enums(deepcopy(enum_hyp[0]), reference)
+             syn_match, enum_hyp_list, enum_ref_list = self._match_syn_with_sejong(enum_hyp_list, enum_ref_list)
+             # print("test 123344 " ,enum_ref_list) ## [(0, ['오늘', 'NN']), (6, ['이', 'VB']), (7, ['었다', 'EE'])]
+
+             final_match = sorted(word_match + syn_match)
+
+             # compute the final score
+             final_match_count = len(final_match)
+
+
+             precision = float(final_match_count) / hyp_len
+             recall = float(final_match_count) / ref_len
+             fmean = (precision * recall) / (alpha * precision + (1 - alpha) * recall)
+             chunk_count = float(self._count_chunks(final_match))
+             frag = 0.0
+             if final_match_count != 0:
+                 frag = chunk_count / final_match_count
+             else:
+                 frag = 0.0
+             penalty = gamma * frag ** beta
+             meteors.append((1 - penalty) * fmean)
+
+         # print(word_match)
+
+         return max(meteors)
+
+
+ def demo():
+     y_pred = [5, 2, 4, 1, 3, 2, 5, 6, 7]
+     y_true = [1, 3, 6, 7, 1, 5]
+
+     user = [[5, 3, 2], [9, 1, 2], [3, 5, 6], [7, 2, 1]]
+     h_pred = [[15, 6, 21, 3], [15, 77, 23, 14], [51, 23, 21, 2], [53, 2, 1, 5]]
+
+     metric = DefaultMetric()
+     print(metric.precision_at_k(y_true, y_pred, 3))
+     print(metric.recall_at_k(y_true, y_pred, 3))
+     print(metric.hit_rate_at_k(user, h_pred, 2))
+
+     hyp='봉준호 감독이 아카데미에서 국제영화상을 수상했다.'
+     ref=['봉준호가 아카데미에서 각본상을 탔다.']
+     re = metric.meteor(ref, hyp)
+     print(re)
+
+ if __name__=="__main__":
+     demo()
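
A minimal usage sketch of the DefaultMetric class defined in nltkor/metrics/classical.py above, assuming this wheel is installed. The import path follows the package layout in the file list, the sample labels and sentences are illustrative only, and meteor() is left out because it additionally needs the Espresso tagger model and the Sejong ssem dictionary.

# Usage sketch for DefaultMetric (module path per the file list above;
# inputs and printed values are illustrative, not package documentation).
from nltkor.metrics.classical import DefaultMetric

metric = DefaultMetric(lang="ko")

# Classification-style metrics over parallel label lists
true = ['PER', 'LOC', 'O', 'O', 'PER']
pred = ['PER', 'O', 'O', 'O', 'PER']
print(metric.accuracy_score(true, pred))
print(metric.precision_score(true, pred, avg='macro'))
print(metric.recall_score(true, pred, avg='micro'))
print(metric.f1_score(true, pred, avg='micro'))

# Ranking metric: relevance of the top-k predictions
print(metric.precision_at_k([1, 3, 6], [3, 2, 4, 1, 5], 3))

# Korean string-comparison metrics: references are passed as a list,
# the candidate as a single string
ref = ['봉준호가 아카데미에서 각본상을 탔다.']
hyp = '봉준호 감독이 아카데미에서 국제영화상을 수상했다.'
print(metric.bleu_n(ref, hyp, n=2))   # BLEU restricted to bigrams
print(metric.rouge_n(ref, hyp, n=1))  # ROUGE-1 against the best reference
print(metric.rouge_l(ref, hyp))       # ROUGE-L (longest common subsequence)
print(metric.wer(ref[0], hyp))        # word error rate
print(metric.cer(ref[0], hyp))        # syllable-level error rate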