nltkor-1.2.14-cp311-cp311-macosx_13_0_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. nltkor/Kor_char.py +193 -0
  2. nltkor/__init__.py +16 -0
  3. nltkor/alignment/__init__.py +1315 -0
  4. nltkor/cider/__init__.py +2 -0
  5. nltkor/cider/cider.py +55 -0
  6. nltkor/cider/cider_scorer.py +207 -0
  7. nltkor/distance/__init__.py +441 -0
  8. nltkor/distance/wasserstein.py +126 -0
  9. nltkor/etc.py +22 -0
  10. nltkor/lazyimport.py +144 -0
  11. nltkor/make_requirement.py +11 -0
  12. nltkor/metrics/__init__.py +63 -0
  13. nltkor/metrics/bartscore.py +301 -0
  14. nltkor/metrics/bertscore.py +331 -0
  15. nltkor/metrics/bleu_tensor.py +20 -0
  16. nltkor/metrics/classical.py +847 -0
  17. nltkor/metrics/entment.py +24 -0
  18. nltkor/metrics/eval.py +517 -0
  19. nltkor/metrics/mauve.py +273 -0
  20. nltkor/metrics/mauve_utils.py +131 -0
  21. nltkor/misc/__init__.py +11 -0
  22. nltkor/misc/string2string_basic_functions.py +59 -0
  23. nltkor/misc/string2string_default_tokenizer.py +83 -0
  24. nltkor/misc/string2string_hash_functions.py +159 -0
  25. nltkor/misc/string2string_word_embeddings.py +503 -0
  26. nltkor/search/__init__.py +10 -0
  27. nltkor/search/classical.py +569 -0
  28. nltkor/search/faiss_search.py +787 -0
  29. nltkor/search/kobert_tokenizer.py +181 -0
  30. nltkor/sejong/__init__.py +3 -0
  31. nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
  32. nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
  33. nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
  34. nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
  35. nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
  36. nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
  37. nltkor/sejong/ch.py +12 -0
  38. nltkor/sejong/dict_semClassNum.txt +491 -0
  39. nltkor/sejong/layer.txt +630 -0
  40. nltkor/sejong/sejong_download.py +87 -0
  41. nltkor/sejong/ssem.py +684 -0
  42. nltkor/similarity/__init__.py +3 -0
  43. nltkor/similarity/bartscore____.py +337 -0
  44. nltkor/similarity/bertscore____.py +339 -0
  45. nltkor/similarity/classical.py +245 -0
  46. nltkor/similarity/cosine_similarity.py +175 -0
  47. nltkor/tag/__init__.py +71 -0
  48. nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
  49. nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
  50. nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
  51. nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
  52. nltkor/tag/espresso_tag.py +220 -0
  53. nltkor/tag/libs/__init__.py +10 -0
  54. nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
  55. nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
  56. nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
  57. nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
  58. nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
  59. nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
  60. nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
  61. nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
  62. nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
  63. nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
  64. nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
  65. nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
  66. nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
  67. nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
  68. nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
  69. nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
  70. nltkor/tag/libs/arguments.py +280 -0
  71. nltkor/tag/libs/attributes.py +231 -0
  72. nltkor/tag/libs/config.py +159 -0
  73. nltkor/tag/libs/metadata.py +129 -0
  74. nltkor/tag/libs/ner/__init__.py +2 -0
  75. nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
  76. nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
  77. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
  78. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
  79. nltkor/tag/libs/ner/macmorphoreader.py +7 -0
  80. nltkor/tag/libs/ner/ner_reader.py +92 -0
  81. nltkor/tag/libs/network.c +72325 -0
  82. nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
  83. nltkor/tag/libs/network.pyx +878 -0
  84. nltkor/tag/libs/networkconv.pyx +1028 -0
  85. nltkor/tag/libs/networkdependencyconv.pyx +451 -0
  86. nltkor/tag/libs/parse/__init__.py +1 -0
  87. nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
  88. nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
  89. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
  90. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
  91. nltkor/tag/libs/parse/parse_reader.py +283 -0
  92. nltkor/tag/libs/pos/__init__.py +2 -0
  93. nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
  94. nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
  95. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
  96. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
  97. nltkor/tag/libs/pos/macmorphoreader.py +7 -0
  98. nltkor/tag/libs/pos/pos_reader.py +97 -0
  99. nltkor/tag/libs/reader.py +485 -0
  100. nltkor/tag/libs/srl/__init__.py +3 -0
  101. nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
  102. nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
  103. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
  104. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
  105. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
  106. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
  107. nltkor/tag/libs/srl/__srl_reader_.py +535 -0
  108. nltkor/tag/libs/srl/srl_reader.py +436 -0
  109. nltkor/tag/libs/srl/train_srl.py +87 -0
  110. nltkor/tag/libs/taggers.py +926 -0
  111. nltkor/tag/libs/utils.py +384 -0
  112. nltkor/tag/libs/word_dictionary.py +239 -0
  113. nltkor/tag/libs/wsd/__init__.py +2 -0
  114. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
  115. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
  116. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
  117. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
  118. nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
  119. nltkor/tag/libs/wsd/wsd_reader.py +93 -0
  120. nltkor/tokenize/__init__.py +62 -0
  121. nltkor/tokenize/ko_tokenize.py +115 -0
  122. nltkor/trans.py +121 -0
  123. nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
  124. nltkor-1.2.14.dist-info/METADATA +41 -0
  125. nltkor-1.2.14.dist-info/RECORD +127 -0
  126. nltkor-1.2.14.dist-info/WHEEL +5 -0
  127. nltkor-1.2.14.dist-info/top_level.txt +1 -0
nltkor/metrics/classical.py
@@ -0,0 +1,847 @@
+ import os
+ import sys
+ import argparse
+ import numpy as np
+ from typing import Callable, Iterable, List, Tuple, Union
+ from copy import deepcopy
+ import itertools
+ import time
+ from nltk.translate.bleu_score import *
+ from nltk.metrics import confusionmatrix
+ from collections import defaultdict
+ from nltk.util import ngrams, skipgrams
+ #from nltkor.tokenize.ko_tokenize import word_tokenize,sent_tokenize,syllable_tokenize
+ from nltkor.make_requirement import make_requirement
+ from nltkor.tokenize import Ko_tokenize
+ from nltkor.cider.cider import Cider
+ from nltkor.tag import EspressoTagger
+ from nltkor.sejong import ssem
+ 
+ # torch is imported only inside this guard, so a missing install is reported with instructions
+ try:
+     import torch
+ except ImportError:
+     file_path = make_requirement(['torch'])
+     raise Exception(f"""
+     Required libraries are missing; please install them:
+     \t pip install torch
+     or install everything from the generated requirements file:
+     \t pip install -r {file_path}
+     """)
+ 
+ 
+ class DefaultMetric:
+ 
+     def __init__(self, lang="ko"):
+         self.lang = lang
+         #if lang not in ["korean", "english"]:
+         if lang not in ["ko", "en"]:
+             raise Exception("Only \"ko\" or \"en\" is allowed for the lang parameter")
+         self.tokenize = lambda ref: Ko_tokenize.word(ref, self.lang)
+         self.skip_pos = ['EE']
+ 
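For orientation, a minimal usage sketch of this class (hedged: the import path follows this wheel's file layout, and the label values are illustrative):

    from nltkor.metrics.classical import DefaultMetric

    metric = DefaultMetric(lang="ko")
    metric.accuracy_score(["NN", "VB", "NN"], ["NN", "VB", "JJ"])   # 2 of 3 labels agree -> 0.666...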
+     def accuracy_score(self, true, pred):
+ 
+         mat=confusionmatrix.ConfusionMatrix(true,pred)
+ 
+         conf=mat._confusion
+         total=0
+         tp=0
+ 
+         for r, tmp in enumerate(conf):
+             for v, n in enumerate(tmp):
+                 if r==v:
+                     tp+=n
+                 total+=n
+ 
+         return float(tp/total)
+ 
+     def accuracy_norm(self, model, tokenizer, input_text: str, candidates: list, label: int):
+         reserved_memory = []
+         inference_time = []
+         tokenized_prompt = tokenizer(input_text, return_tensors='pt').input_ids
+         total_candidate = []
+ 
+         for ending in candidates:
+             len_ending = len(ending)
+             tokenized_ending = tokenizer(ending, return_tensors='pt').input_ids
+             tokenized_ending = tokenized_ending[:, 1:]
+             input_ids = torch.cat([tokenized_prompt, tokenized_ending], dim=-1).cuda()
+             labels = input_ids.clone()
+             labels[0, :tokenized_prompt.shape[1]] = -100
+             start = time.time()
+             with torch.no_grad():
+                 outputs = model(input_ids, labels=labels)
+             inference_time.append(time.time() - start)
+             reserved_memory.append(torch.cuda.memory_reserved() / (1024**2))
+             total_logprobs = -outputs.loss.item() * tokenized_ending.shape[1]
+             total_candidate.append(total_logprobs/len_ending)
+         answer_idx = total_candidate.index(max(total_candidate))
+         if int(label) == answer_idx:
+             cor = 1
+         else:
+             cor = 0
+         metric_dict = {
+             "reserved_memory": reserved_memory,
+             "inference_time": inference_time
+         }
+         return cor, metric_dict
+ 
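accuracy_norm scores each candidate continuation by its length-normalised log-likelihood under a causal language model (prompt positions are masked with -100) and reports whether the best-scoring candidate matches the gold label, along with reserved GPU memory and inference time. A hedged sketch, assuming a CUDA device and a Hugging Face causal LM; the checkpoint name is illustrative only:

    from transformers import AutoModelForCausalLM, AutoTokenizer

    tok = AutoTokenizer.from_pretrained("skt/kogpt2-base-v2")              # illustrative checkpoint
    lm = AutoModelForCausalLM.from_pretrained("skt/kogpt2-base-v2").cuda()
    correct, stats = metric.accuracy_norm(lm, tok, "질문: ...", ["후보 1", "후보 2"], label=0)
    # correct is 1 or 0; stats holds per-candidate reserved memory (MB) and inference time (s)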
+     def recall_score(self, true, pred, avg='micro'):
+ 
+         mat=confusionmatrix.ConfusionMatrix(true,pred)
+ 
+         conf=mat._confusion
+         indices=mat._indices
+         values=mat._values
+         total=0
+ 
+ 
+         if len(values)==2:
+             tp=0
+             fn=0
+             for r,i in enumerate(conf):
+                 for r2,v in enumerate(i):
+                     if r==0:
+                         continue
+                     elif r2==0:
+                         fn=v
+                     elif r==1:
+                         tp=v
+ 
+             return float(tp/(tp+fn))
+ 
+ 
+         c_tp=[]
+         c_fn=[]
+         recall_cls=[]
+ 
+         for r, tmp in enumerate(conf):
+             temp=0
+             for v, n in enumerate(tmp):
+                 if r==v:
+                     c_tp.append(n)
+                 else:
+                     temp+=n
+             c_fn.append(temp)
+ 
+         if avg=='macro':
+ 
+             for tmp in range(len(values)):
+                 try:
+                     recall_cls.append(float(c_tp[tmp]/(c_tp[tmp]+c_fn[tmp])))
+                 except ZeroDivisionError:
+                     recall_cls.append(0)
+ 
+             temp=0
+ 
+             for tmp in recall_cls:
+                 temp+=tmp
+ 
+             return float(temp/len(recall_cls))
+ 
+         if avg=='micro':
+             ja=0
+             mo=0
+ 
+             for tmp in range(len(values)):
+                 ja+=c_tp[tmp]
+                 mo+=c_tp[tmp]+c_fn[tmp]
+ 
+             return float(ja/mo)
+ 
+         else:
+             return "avg expects 'micro' or 'macro'"
+ 
+ 
+ 
+     def precision_score(self, true, pred, avg='micro'):
+ 
+ 
+         mat=confusionmatrix.ConfusionMatrix(true,pred)
+ 
+         conf=mat._confusion
+         values=mat._values
+ 
+         total=0
+ 
+         if len(values)==2:
+             tp=0
+             fp=0
+             for r,i in enumerate(conf):
+                 for r2,v in enumerate(i):
+                     if r2==0:
+                         continue
+                     elif r==0:
+                         fp=v
+                     elif r==1:
+                         tp=v
+ 
+             return float(tp/(tp+fp))
+ 
+         c_tp=list()
+         c_fp=[0 for _ in range(len(values))]
+         recall_cls=[]
+ 
+         for r, tmp in enumerate(conf):
+             for v, n in enumerate(tmp):
+                 if r==v:  # true positive for class r
+                     c_tp.append(n)
+                 else:
+                     c_fp[v]+=n
+ 
+         if avg=='macro':
+             for tmp in range(len(values)):
+                 try:
+                     recall_cls.append(float(c_tp[tmp]/(c_tp[tmp]+c_fp[tmp])))
+                 except ZeroDivisionError:
+                     recall_cls.append(0)
+ 
+             temp=0
+ 
+             for tmp in recall_cls:
+                 temp+=tmp
+ 
+             return float(temp/len(recall_cls))
+ 
+ 
+         elif avg=='micro':
+             ja=0
+             mo=0
+ 
+             for tmp in range(len(values)):
+                 ja+=c_tp[tmp]
+                 mo+=c_tp[tmp]+c_fp[tmp]
+ 
+             return float(ja/mo)
+ 
+         else:
+             return "avg expects 'micro' or 'macro'"
+ 
+ 
+     def f1_score(self, true, pred, avg='micro'):
+ 
+         if avg=='micro' or avg=='macro':
+ 
+             precision=self.precision_score(true,pred,avg)
+             recall=self.recall_score(true,pred,avg)
+         else:
+             return "avg expects 'micro' or 'macro'"
+ 
+         return self.f1(precision, recall)
+ 
+ 
+ 
+ 
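A small sketch of the micro/macro averaging interface built on NLTK's ConfusionMatrix, assuming the DefaultMetric instance from above (labels are illustrative):

    y_true = ["NN", "VB", "NN", "JJ"]
    y_pred = ["NN", "VB", "JJ", "JJ"]
    metric.precision_score(y_true, y_pred, avg="macro")
    metric.recall_score(y_true, y_pred, avg="micro")
    metric.f1_score(y_true, y_pred, avg="micro")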
+     def pos_eval(self, fin):
+ 
+         #temp=os.getcwd()+'/'+fin
+         file=open(fin,'r').read()
+         sents=file.split("\n\n")
+ 
+         acc = defaultdict(float)
+         t_avg = defaultdict(float)
+ 
+         for sent in sents:
+             lines=sent.split('\n')
+             for line in lines:
+                 tot=line.split('\t')
+ 
+                 if line=='': continue
+ 
+                 wd=tot[0]
+                 gold=tot[1]
+                 pred=tot[2]
+ 
+                 acc['all']+=1
+                 gold_list=gold.split('+')
+                 pred_list=pred.split('+')
+ 
+                 t_avg["pr_all"]+=len(pred_list)
+                 t_avg["rc_all"]+=len(gold_list)
+ 
+                 if gold==pred:
+                     acc["true"]+=1
+                     t_avg['pr']+=len(pred_list)
+                     t_avg['rc']+=len(gold_list)
+                     continue
+                 else:
+                     intersect=0
+                     for g in gold_list:
+                         if not g in pred_list: continue
+                         intersect+=1
+                     t_avg['pr']+=intersect
+                     t_avg['rc']+=intersect
+ 
+ 
+         t_avg['pr_result'] = t_avg['pr'] / t_avg['pr_all']
+         t_avg['rc_result'] = t_avg['rc'] / t_avg['rc_all']
+ 
+         return float(acc['true']/acc['all']), t_avg['pr_result'], t_avg['rc_result'], self.f1(t_avg['pr_result'], t_avg['rc_result'])
+ 
+ 
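pos_eval expects a file of blank-line-separated sentences with one token per line as tab-separated word, gold tag(s), and predicted tag(s), where composite analyses are joined with '+'. It returns tag accuracy plus morpheme-level precision, recall, and F1. A hedged sketch (file name and contents are illustrative):

    # eval.txt, one token per line: word<TAB>gold<TAB>pred; sentences separated by blank lines
    #   먹었다	먹+었+다	먹+었+다
    acc, p, r, f = metric.pos_eval("eval.txt")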
+     def f1(self, p, r):
+         return 2 * p * r / (p + r) if p + r else 0
+ 
+ 
+     def precision_at_k(self, true: List[int], pred: List[int], k: int) -> float:
+         """
+         Precision@k: the fraction of the top-k predictions that appear in `true`.
+         """
+ 
+         relevant = 0
+ 
+         if k > len(pred):
+             raise ValueError("`k` is bigger than pred's length")
+ 
+         pred = pred[:k]
+ 
+         for t in true:
+             if t in pred:
+                 relevant += 1
+ 
+ 
+         return float(relevant/len(pred))
+ 
+     def recall_at_k(self, true: List[int], pred: List[int], k: int) -> float:
+ 
+         relevant = 0
+ 
+         if k > len(pred):
+             raise ValueError("`k` is bigger than pred's length")
+ 
+         pred = pred[:k]
+ 
+         for t in true:
+             if t in pred:
+                 relevant += 1
+ 
+ 
+         return float(relevant/len(true))
+ 
+     def hit_rate_at_k(self, user: List[List[int]], pred: List[List[int]], k: int) -> float:
+         hit = 0
+ 
+         for u_list, p_list in zip(user, pred):
+             if k > len(p_list):
+                 raise ValueError("`k` is bigger than pred's length")
+             p_list = p_list[:k]
+             for u in u_list:
+                 if u in p_list:
+                     hit += 1
+                     break
+ 
+         return float(hit/len(user))
+ 
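hit_rate_at_k takes one relevant-item list and one ranked prediction list per user and counts the users with at least one hit in their top k. A small illustrative sketch:

    users = [[5, 3], [9, 1]]
    ranked = [[5, 6, 1], [2, 3, 4]]
    metric.hit_rate_at_k(users, ranked, k=2)   # -> 0.5: only the first user has a hit in the top 2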
+     def mean_absolute_error(self, true: Union[torch.Tensor, np.ndarray], pred: Union[torch.Tensor, np.ndarray]) -> float:
+         true, pred = torch.as_tensor(true, dtype=torch.float), torch.as_tensor(pred, dtype=torch.float)
+         return float(torch.mean(torch.abs(true - pred)))
+ 
+     def root_mean_square_error(self, true: Union[torch.Tensor, np.ndarray], pred: Union[torch.Tensor, np.ndarray]) -> float:
+         true, pred = torch.as_tensor(true, dtype=torch.float), torch.as_tensor(pred, dtype=torch.float)
+         return float(torch.sqrt(torch.mean((true - pred) ** 2)))
+ 
+     def _W_CER(self, r, h):
+ 
+         costs = [[0 for inner in range(len(h)+1)] for outer in range(len(r)+1)]
+ 
+         DEL_PENALTY=1 # Tact
+         INS_PENALTY=1 # Tact
+         SUB_PENALTY=1 # Tact
+ 
+         for i in range(1, len(r)+1):
+             costs[i][0] = DEL_PENALTY*i
+ 
+         for j in range(1, len(h) + 1):
+             costs[0][j] = INS_PENALTY*j
+ 
+         # computation
+         for i in range(1, len(r)+1):
+             for j in range(1, len(h)+1):
+                 if r[i-1] == h[j-1]:
+                     costs[i][j] = costs[i-1][j-1]
+                 else:
+                     substitutionCost = costs[i-1][j-1] + SUB_PENALTY # penalty is always 1
+                     insertionCost = costs[i][j-1] + INS_PENALTY # penalty is always 1
+                     deletionCost = costs[i-1][j] + DEL_PENALTY # penalty is always 1
+ 
+                     costs[i][j] = min(substitutionCost, insertionCost, deletionCost)
+ 
+         mo = len(r)
+         i = len(r)
+         j = len(h)
+ 
+         result=(costs[i][j])/mo
+ 
+         if result>1.0:
+             return 1.0
+         else:
+             return result
+ 
+ 
+     def wer(self, reference, candidate):
+         r = Ko_tokenize.word(reference)
+         h = Ko_tokenize.word(candidate)
+ 
+         return self._W_CER(r,h)
+ 
+ 
+     def cer(self, reference, candidate):
+         r = Ko_tokenize.syllable(reference)
+         h = Ko_tokenize.syllable(candidate)
+ 
+         return self._W_CER(r,h)
+ 
+ 
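wer and cer both delegate to _W_CER, a Levenshtein distance with unit insertion/deletion/substitution costs, normalised by the reference length and capped at 1.0; wer runs over word tokens and cer over syllable tokens. A hedged usage sketch:

    metric.wer("봉준호 감독이 상을 받았다", "봉준호 감독이 상을 탔다")   # word error rate in [0, 1]
    metric.cer("봉준호 감독이 상을 받았다", "봉준호 감독이 상을 탔다")   # syllable-level error rate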
+     def bleu(self, reference, candidate, weights=(0.25,0.25,0.25,0.25), smoothing_function=None):
+ 
+         if type(candidate)!=list or type(reference)!=list:
+             print("parameters must be of list type")
+             return
+ 
+         reference=list(map(self.tokenize,reference))
+         candidate=Ko_tokenize.word(candidate)
+ 
+         return sentence_bleu(reference,candidate,weights,smoothing_function=smoothing_function)
+ 
+ 
+     def bleu_n(self, reference, candidate, n=1, smoothing_function=None):
+ 
+         if n==1:
+             return self.bleu(reference,candidate,(1,0,0,0), smoothing_function=smoothing_function)
+         elif n==2:
+             return self.bleu(reference,candidate,(0,1,0,0), smoothing_function=smoothing_function)
+         elif n==3:
+             return self.bleu(reference,candidate,(0,0,1,0), smoothing_function=smoothing_function)
+         elif n==4:
+             return self.bleu(reference,candidate,(0,0,0,1), smoothing_function=smoothing_function)
+ 
+ 
+ 
+ 
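bleu tokenizes references and candidate with Ko_tokenize.word and delegates to NLTK's sentence_bleu; bleu_n isolates a single n-gram order through the weight vector. Note that the type check above requires both arguments to be lists. A hedged sketch (SmoothingFunction comes from nltk.translate.bleu_score, which is star-imported at the top of this module):

    refs = ["봉준호가 아카데미에서 각본상을 탔다."]
    hyp = ["봉준호 감독이 아카데미에서 국제영화상을 수상했다."]    # passed as a list to satisfy the type check
    metric.bleu(refs, hyp)                                          # 4-gram BLEU with uniform weights
    metric.bleu_n(refs, hyp, n=2, smoothing_function=SmoothingFunction().method1)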
+     def _hyp_sent_split_remove(self, can):
+ 
+         can_sent=[[tmp.rstrip('.?!,\n')] for tmp in Ko_tokenize.sentence(can)]
+         return can_sent
+ 
+     def _ref_sent_split_remove(self, ref):
+ 
+         ref_sent=[Ko_tokenize.sentence(tmp) for tmp in ref]
+         ref_sent_c=[]
+         for tmp in ref_sent:
+             ref_sent_in=[]
+             for tmp2 in tmp:
+                 ref_sent_in.append(Ko_tokenize.word(tmp2.rstrip('.?!,\n')))
+             ref_sent_c.append(ref_sent_in)
+ 
+         return ref_sent_c
+ 
+     def _token(self, ref_stoken, can, n):
+ 
+         numer=[]
+         ref_len=0
+ 
+         can=list(ngrams(can,n))
+ 
+         for tmp in ref_stoken:
+ 
+             if n==1:
+                 ref=list(ngrams(tmp,1))
+             elif n==2:
+                 ref=list(ngrams(tmp,2))
+             elif n==3:
+                 ref=list(ngrams(tmp,3))
+             else: return 0
+ 
+             intersect = [t for t in ref if t in can]
+             numer.append(len(intersect))
+             ref_len+=len(ref)
+ 
+         try:
+             rec= sum(numer)/ref_len
+         except:
+             rec=0
+ 
+         return rec
+ 
+ 
+ 
+     def rouge_n(self, ref, can, n=1):
+ 
+         beta=1
+         rec,prec=0,0
+ 
+         can_sent=self._hyp_sent_split_remove(can)
+         can_word=list(itertools.chain(*[Ko_tokenize.word(tmp,self.lang) for tmp in can_sent]))
+         ref=self._ref_sent_split_remove(ref)
+ 
+         r_list=[]
+ 
+         for tmp in ref:
+             if n==1:
+                 r_list.append(self._token(tmp, can_word, 1))
+             elif n==2:
+                 r_list.append(self._token(tmp, can_word, 2))
+             elif n==3:
+                 r_list.append(self._token(tmp, can_word, 3))
+ 
+         return max(r_list)
+ 
+ 
+ 
+     def rouge_l(self, ref, can):
+ 
+         beta=1
+         #check=0
+ 
+         can= self._hyp_sent_split_remove(can)
+         can=[Ko_tokenize.word(tmp,self.lang) for tmp in can]
+         refs=self._ref_sent_split_remove(ref)
+ 
+         can_word=list(itertools.chain(*can))
+ 
+         result_list=[]
+ 
+         for ref in refs:
+             lcs_list=[]
+             for ri in ref:
+                 ri_C=[]
+                 for ci in can:
+                     temp=self._lcs(ci,ri)
+                     ri_C.append(temp)
+ 
+                 ri_C=list(itertools.chain(*ri_C))
+                 ri_C=set(ri_C)
+                 lcs_list.append(len(ri_C))
+ 
+             ref_word=list(itertools.chain(*ref))
+ 
+             R_lcs=sum(lcs_list)/len(ref_word)
+             P_lcs=sum(lcs_list)/len(can_word)
+ 
+             try:
+                 F_lcs= (2*R_lcs*P_lcs)/(R_lcs+P_lcs)
+             except:
+                 F_lcs=0
+ 
+             result_list.append(F_lcs)
+ 
+         return max(result_list)
+ 
+ 
+     def _lcs(self, can, ref):
+ 
+ 
+         s1=can
+         s2=ref
+         check=0
+ 
+         if len(s1)<=len(s2):
+             temp=s1
+             s1=s2
+             s2=temp
+             check=1
+ 
+         m = [[0] * (1 + len(s2)) for i in range(1 + len(s1))]
+ 
+         for x in range(1, 1 + len(s1)):
+             for y in range(1, 1 + len(s2)):
+                 if s1[x - 1] == s2[y - 1]:
+                     m[x][y] = m[x - 1][y - 1] +1
+                 else:
+                     m[x][y]=max(m[x][y-1],m[x-1][y])
+         f_x=len(s2)+1
+         lcs=m[len(s1)][len(s2)]
+         temp=[]
+ 
+ 
+         i=len(s1)
+         j=len(s2)
+ 
+         while m[i][j]!=0:
+             if(m[i][j]==m[i][j-1]):
+                 j-=1
+             elif (m[i][j]==m[i-1][j]):
+                 i-=1
+             else:
+                 if check==0:
+                     temp.append(s1[i-1])
+                 if check==1:
+                     temp.append(s2[j-1])
+                 i-=1
+                 j-=1
+ 
+         return temp
+         '''
+         for y in reversed(range(1,1+len(s1))):
+             for x in reversed(range(1,1+len(s2))):
+                 if (m[y][x]-m[y-1][x-1]==1) and (m[y][x]-m[y-1][x]==1) and (m[y][x]-m[y][x-1]==1):
+                     if (y==len(s1)+1) and (x==len(s2)):
+                         temp.append(x)
+                     else:
+                         temp.append(x-1)
+ 
+         print('expected only "the police" to remain', temp)
+         if check==0:
+             word=s1
+         elif check==1:
+             word=s2
+ 
+         ret_list=[]
+ 
+         for tmp in range(len(temp)):
+             ret_list.append(word[temp[tmp]])
+ 
+         return ret_list
+         '''
+ 
+ 
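_lcs returns the tokens of a longest common subsequence, recovered by backtracking through the DP table (so they come out in reverse order); rouge_l unions these tokens per reference sentence before computing its F-score. A small illustrative call:

    metric._lcs(["the", "police", "killed", "the", "gunman"], ["police", "kill", "the", "gunman"])
    # -> ['gunman', 'the', 'police'] (backtracked, hence reversed)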
+     def _skip_bigrams(self, ref_stoken, can_sent, can, n=1):
+ 
+         beta=1
+         numer=[]
+         ref_len=0
+ 
+         candidate=list(skipgrams(can,2,n))
+         can_sent=[Ko_tokenize.word(tmp,self.lang) for tmp in can_sent]
+         can_sk_len=0
+ 
+         for tmp in ref_stoken:
+             ref=list(skipgrams(tmp,2,n))
+             intersect=[t for t in ref if t in candidate]
+             numer.append(len(intersect))
+             ref_len+=len(ref)
+ 
+         for tmp in can_sent:
+             can_sk_len+=len(list(skipgrams(tmp,2,n)))
+ 
+         prec=sum(numer)/can_sk_len
+         rec=sum(numer)/ref_len
+ 
+         if(prec!=0 and rec!=0):
+             score = ((1 + beta**2)*prec*rec)/float(rec + beta**2*prec)
+         else:
+             score = 0.0
+         return score
+ 
+ 
+     def rouge_s(self, ref, can, n):
+ 
+         can_sent= self._hyp_sent_split_remove(can)
+         can_word=list(itertools.chain(*[Ko_tokenize.word(tmp,self.lang) for tmp in can_sent]))
+         ref= self._ref_sent_split_remove(ref)
+ 
+ 
+         r_list=[]
+ 
+         for tmp in ref:
+             #tmp=list(itertools.chain(*tmp))
+             r_list.append(self._skip_bigrams(tmp,can_sent,can_word,n))
+ 
+         return max(r_list)
+ 
+ 
+     def cider(self, ref, hyp):
+ 
+         ref_dict=dict()
+         hyp_dict=dict()
+ 
+         ref_dict[0]=ref
+         hyp_dict[0]=hyp
+ 
+         cider_score=Cider()
+         score=cider_score.compute_score(ref_dict,hyp_dict)
+ 
+         return float(score)
+ 
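The ROUGE variants take a list of reference strings plus a candidate string and return the best score across references: n-gram recall for rouge_n, an LCS-based F-score for rouge_l, and a skip-bigram F-score for rouge_s; cider wraps nltkor's Cider scorer with single-entry dicts. A hedged sketch, reusing refs from the BLEU example above:

    hyp_text = "봉준호 감독이 아카데미에서 국제영화상을 수상했다."
    metric.rouge_n(refs, hyp_text, n=2)
    metric.rouge_l(refs, hyp_text)
    metric.rouge_s(refs, hyp_text, 2)    # skip-bigrams with maximum skip distance 2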
+     def _process_espresso_output_format(self, result_list):
+         temp_list = []
+         for k in result_list:
+             #k = k.split('_')
+             k = list(k)
+             if k[1] == 'SP' or k[1] == 'SY':
+                 continue
+             temp_list.append(k)
+         return temp_list
+ 
+     def _generate_enum(self, ref, hyp):
+         result_hyp = []
+         result_ref = []
+         for h in hyp:
+             enum_hyp_list = list(enumerate(h))
+             result_hyp.append(enum_hyp_list)
+         for r in ref:
+             enum_ref_list = list(enumerate(r))
+             result_ref.append(enum_ref_list)
+         return result_hyp, result_ref
+ 
+     def _tag_pos_meteor(self, sent_list):
+         result_list = list()
+         for sent in sent_list:
+             tagged_sent = EspressoTagger(task='pos').tag(sent)
+             tagged_sent = self._process_espresso_output_format(tagged_sent)
+             result_list.append(tagged_sent)
+         return result_list
+ 
+     def _match_enums(self,
+                      enum_hypothesis_list: List[Tuple[int, str]],
+                      enum_reference_list: List[Tuple[int, str]],
+                      ) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]:
+         """
+         matches exact words in hypothesis and reference and returns
+         a word mapping between enum_hypothesis_list and enum_reference_list
+         based on the enumerated word id.
+ 
+         :param enum_hypothesis_list: enumerated hypothesis list
+         :param enum_reference_list: enumerated reference list
+         :return: enumerated matched tuples, enumerated unmatched hypothesis tuples,
+                  enumerated unmatched reference tuples
+         """
+         word_match = []
+         # print("test 213" , enum_hypothesis_list)
+         # print("test 124" , enum_reference_list)
+         for i in range(len(enum_hypothesis_list))[::-1]:
+             for j in range(len(enum_reference_list))[::-1]:
+                 # print(f"\n \t {enum_hypothesis_list[i][1]} \t {enum_reference_list[j][1]}")
+                 if enum_hypothesis_list[i][1] == enum_reference_list[j][1]:
+ 
+                     # print("Check!!")
+                     word_match.append(
+                         (enum_hypothesis_list[i][0], enum_reference_list[j][0])
+                     )
+                     enum_hypothesis_list.pop(i)
+                     enum_reference_list.pop(j)
+                     break
+         return word_match, enum_hypothesis_list, enum_reference_list
+ 
+ 
+     def _count_chunks(self, matches: List[Tuple[int, int]]) -> int:
+         """
+         Counts the fewest possible number of chunks such that matched unigrams
+         of each chunk are adjacent to each other. This is used to calculate the
+         fragmentation part of the metric.
+ 
+         :param matches: list containing a mapping of matched words (output of align_words)
+         :return: Number of chunks a sentence is divided into post alignment
+         """
+         i = 0
+         chunks = 1
+         while i < len(matches) - 1:
+             if (matches[i + 1][0] == matches[i][0] + 1) and (
+                 matches[i + 1][1] == matches[i][1] + 1
+             ):
+                 i += 1
+                 continue
+             i += 1
+             chunks += 1
+         return chunks
+ 
+     def _match_syn_with_sejong(self, hyp_list, ref_list):
+         syn_match = []
+         for i in range(len(hyp_list))[::-1]:
+             temp_syn_list = []
+             #print("test 344434: ", hyp_list[i])
+             if hyp_list[i][1][1] not in self.skip_pos:
+                 entrys = ssem.entrys(hyp_list[i][1][0])
+                 for entry in entrys:
+                     for sense in entry.senses():
+                         if sense.syn():
+                             temp_syn_list.append(sense.syn())
+                 if temp_syn_list:
+                     hyp_list[i][1].append(deepcopy(temp_syn_list))
+ 
+             for j in range(len(ref_list))[::-1]:
+                 is_break = False
+                 if len(hyp_list[i][1]) == 3:
+                     for syn in hyp_list[i][1][2]:
+ 
+                         if syn[0] == ref_list[j][1][0]:
+                             syn_match.append(
+                                 (hyp_list[i][0], ref_list[j][0])
+                             )
+                             is_break = True
+                             hyp_list.pop(i)
+                             ref_list.pop(j)
+                             break
+                 else:
+                     if hyp_list[i][1] == ref_list[j][1]:
+                         syn_match.append(
+                             (hyp_list[i][0], ref_list[j][0])
+                         )
+                         is_break = True
+                         hyp_list.pop(i)
+                         ref_list.pop(j)
+                 if is_break:
+                     break
+ 
+ 
+ 
+             # print("test 231232 ", hyp_list[i])
+ 
+ 
+         return syn_match, hyp_list, ref_list
+ 
+     def meteor(self, ref, hyp):
+         ref_tag = self._tag_pos_meteor(ref)
+         hyp_tag = self._tag_pos_meteor(hyp)
+         meteors = []
+         alpha = 0.9
+         beta = 3.0
+         gamma = 0.5
+         enum_hyp, enum_ref = self._generate_enum(ref_tag, hyp_tag)
+         # print("test 13333 ", enum_hyp)
+         for reference in enum_ref:
+             hyp_len = len(enum_hyp[0])
+             ref_len = len(reference)
+ 
+             # word/stem matching
+             word_match, enum_hyp_list, enum_ref_list = self._match_enums(deepcopy(enum_hyp[0]), reference)
+             syn_match, enum_hyp_list, enum_ref_list = self._match_syn_with_sejong(enum_hyp_list, enum_ref_list)
+             # print("test 123344 " ,enum_ref_list) ## [(0, ['오늘', 'NN']), (6, ['이', 'VB']), (7, ['었다', 'EE'])]
+ 
+             final_match = sorted(word_match + syn_match)
+ 
+             # compute the final score
+             final_match_count = len(final_match)
+ 
+ 
+             precision = float(final_match_count) / hyp_len
+             recall = float(final_match_count) / ref_len
+             fmean = (precision * recall) / (alpha * precision + (1 - alpha) * recall)
+             chunk_count = float(self._count_chunks(final_match))
+             frag = 0.0
+             if final_match_count != 0:
+                 frag = chunk_count / final_match_count
+             else:
+                 frag = 0.0
+             penalty = gamma * frag ** beta
+             meteors.append((1 - penalty) * fmean)
+ 
+         # print(word_match)
+ 
+         return max(meteors)
+ 
+ 
+ def demo():
+     y_pred = [5, 2, 4, 1, 3, 2, 5, 6, 7]
+     y_true = [1, 3, 6, 7, 1, 5]
+ 
+     user = [[5, 3, 2], [9, 1, 2], [3, 5, 6], [7, 2, 1]]
+     h_pred = [[15, 6, 21, 3], [15, 77, 23, 14], [51, 23, 21, 2], [53, 2, 1, 5]]
+ 
+     metric = DefaultMetric()
+     print(metric.precision_at_k(y_true, y_pred, 3))
+     print(metric.recall_at_k(y_true, y_pred, 3))
+     print(metric.hit_rate_at_k(user, h_pred, 2))
+ 
+     hyp='봉준호 감독이 아카데미에서 국제영화상을 수상했다.'
+     ref=['봉준호가 아카데미에서 각본상을 탔다.']
+     re = metric.meteor(ref, hyp)
+     print(re)
+ 
+ if __name__=="__main__":
+     demo()