nltkor 1.2.0__cp39-cp39-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. nltkor/Kor_char.py +193 -0
  2. nltkor/__init__.py +15 -0
  3. nltkor/alignment/__init__.py +1315 -0
  4. nltkor/cider/__init__.py +2 -0
  5. nltkor/cider/cider.py +55 -0
  6. nltkor/cider/cider_scorer.py +207 -0
  7. nltkor/distance/__init__.py +441 -0
  8. nltkor/distance/wasserstein.py +126 -0
  9. nltkor/etc.py +22 -0
  10. nltkor/lazyimport.py +144 -0
  11. nltkor/make_requirement.py +11 -0
  12. nltkor/metrics/__init__.py +63 -0
  13. nltkor/metrics/bartscore.py +301 -0
  14. nltkor/metrics/bertscore.py +331 -0
  15. nltkor/metrics/bleu_tensor.py +20 -0
  16. nltkor/metrics/classical.py +814 -0
  17. nltkor/metrics/entment.py +24 -0
  18. nltkor/metrics/eval.py +517 -0
  19. nltkor/metrics/mauve.py +273 -0
  20. nltkor/metrics/mauve_utils.py +131 -0
  21. nltkor/misc/__init__.py +11 -0
  22. nltkor/misc/string2string_basic_functions.py +59 -0
  23. nltkor/misc/string2string_default_tokenizer.py +83 -0
  24. nltkor/misc/string2string_hash_functions.py +159 -0
  25. nltkor/misc/string2string_word_embeddings.py +503 -0
  26. nltkor/search/__init__.py +10 -0
  27. nltkor/search/classical.py +569 -0
  28. nltkor/search/faiss_search.py +467 -0
  29. nltkor/search/kobert_tokenizer.py +181 -0
  30. nltkor/sejong/__init__.py +3 -0
  31. nltkor/sejong/ch.py +12 -0
  32. nltkor/sejong/dict_semClassNum.txt +491 -0
  33. nltkor/sejong/layer.txt +630 -0
  34. nltkor/sejong/sejong_download.py +87 -0
  35. nltkor/sejong/ssem.py +685 -0
  36. nltkor/similarity/__init__.py +3 -0
  37. nltkor/similarity/bartscore____.py +337 -0
  38. nltkor/similarity/bertscore____.py +339 -0
  39. nltkor/similarity/classical.py +245 -0
  40. nltkor/similarity/cosine_similarity.py +175 -0
  41. nltkor/tag/__init__.py +70 -0
  42. nltkor/tag/espresso_tag.py +220 -0
  43. nltkor/tag/libs/__init__.py +9 -0
  44. nltkor/tag/libs/arguments.py +280 -0
  45. nltkor/tag/libs/attributes.py +231 -0
  46. nltkor/tag/libs/config.py +158 -0
  47. nltkor/tag/libs/metadata.py +129 -0
  48. nltkor/tag/libs/ner/__init__.py +2 -0
  49. nltkor/tag/libs/ner/macmorphoreader.py +7 -0
  50. nltkor/tag/libs/ner/ner_reader.py +92 -0
  51. nltkor/tag/libs/network.c +59267 -0
  52. nltkor/tag/libs/network.cpython-39-darwin.so +0 -0
  53. nltkor/tag/libs/parse/__init__.py +1 -0
  54. nltkor/tag/libs/parse/parse_reader.py +283 -0
  55. nltkor/tag/libs/pos/__init__.py +2 -0
  56. nltkor/tag/libs/pos/macmorphoreader.py +7 -0
  57. nltkor/tag/libs/pos/pos_reader.py +89 -0
  58. nltkor/tag/libs/reader.py +510 -0
  59. nltkor/tag/libs/srl/__init__.py +3 -0
  60. nltkor/tag/libs/srl/__srl_reader_.py +535 -0
  61. nltkor/tag/libs/srl/srl_reader.py +436 -0
  62. nltkor/tag/libs/srl/train_srl.py +87 -0
  63. nltkor/tag/libs/taggers.py +926 -0
  64. nltkor/tag/libs/utils.py +344 -0
  65. nltkor/tag/libs/word_dictionary.py +239 -0
  66. nltkor/tag/libs/wsd/__init__.py +2 -0
  67. nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
  68. nltkor/tag/libs/wsd/wsd_reader.py +93 -0
  69. nltkor/tokenize/__init__.py +62 -0
  70. nltkor/tokenize/ko_tokenize.py +115 -0
  71. nltkor/trans.py +121 -0
  72. nltkor-1.2.0.dist-info/LICENSE.txt +1093 -0
  73. nltkor-1.2.0.dist-info/METADATA +33 -0
  74. nltkor-1.2.0.dist-info/RECORD +76 -0
  75. nltkor-1.2.0.dist-info/WHEEL +5 -0
  76. nltkor-1.2.0.dist-info/top_level.txt +1 -0
nltkor/metrics/classical.py
@@ -0,0 +1,814 @@
import os
import sys
import argparse
import numpy as np
from typing import Callable, Iterable, List, Tuple, Union
from copy import deepcopy
import itertools
from nltk.translate.bleu_score import *
from nltk.metrics import confusionmatrix
from collections import defaultdict
from nltk.util import ngrams, skipgrams
#from nltkor.tokenize.ko_tokenize import word_tokenize,sent_tokenize,syllable_tokenize
from nltkor.make_requirement import make_requirement
from nltkor.tokenize import Ko_tokenize
from nltkor.cider.cider import Cider
from nltkor.tag import EspressoTagger
from nltkor.sejong import ssem

try:
    import torch
except ImportError:
    file_path = make_requirement(['torch'])
    raise Exception(f"""
    The following libraries need to be installed:
    \t pip install torch
    Or install them from the generated requirements file:
    \t pip install -r {file_path}
    """)


class DefaultMetric:

    def __init__(self, lang="ko"):
        self.lang = lang
        #if lang not in ["korean", "english"]:
        if lang not in ["ko", "en"]:
            raise Exception("lang parameter must be \"ko\" or \"en\"")
        self.tokenize=lambda ref: Ko_tokenize.word(ref,self.lang)
        self.skip_pos = ['EE']

    def accuracy_score(self, true, pred):

        mat=confusionmatrix.ConfusionMatrix(true,pred)

        conf=mat._confusion
        total=0
        tp=0

        for r, tmp in enumerate(conf):
            for v, n in enumerate(tmp):
                if r==v:
                    tp+=n
                total+=n

        return float(tp/total)

    def recall_score(self, true, pred, avg='micro'):

        mat=confusionmatrix.ConfusionMatrix(true,pred)

        conf=mat._confusion
        indices=mat._indices
        values=mat._values
        total=0

        if len(values)==2:
            tp=0
            fn=0
            for r,i in enumerate(conf):
                for r2,v in enumerate(i):
                    if r==0:
                        continue
                    elif r2==0:
                        fn=v
                    elif r==1:
                        tp=v

            return float(tp/(tp+fn))

        c_tp=[]
        c_fn=[]
        recall_cls=[]

        for r, tmp in enumerate(conf):
            temp=0
            for v, n in enumerate(tmp):
                if r==v:
                    c_tp.append(n)
                else:
                    temp+=n
            c_fn.append(temp)

        if avg=='macro':

            for tmp in range(len(values)):
                try:
                    recall_cls.append(float(c_tp[tmp]/(c_tp[tmp]+c_fn[tmp])))
                except:
                    recall_cls.append(0)

            temp=0

            for tmp in recall_cls:
                temp+=tmp

            return float(temp/len(recall_cls))

        if avg=='micro':
            ja=0
            mo=0

            for tmp in range(len(values)):
                ja+=c_tp[tmp]
                mo+=c_tp[tmp]+c_fn[tmp]

            return float(ja/mo)

        else:
            raise ValueError("avg expects 'micro' or 'macro'")

    def precision_score(self, true, pred, avg='micro'):

        mat=confusionmatrix.ConfusionMatrix(true,pred)

        conf=mat._confusion
        values=mat._values

        total=0

        if len(values)==2:
            tp=0
            fp=0
            for r,i in enumerate(conf):
                for r2,v in enumerate(i):
                    if r2==0:
                        continue
                    elif r==0:
                        fp=v
                    elif r==1:
                        tp=v

            return float(tp/(tp+fp))

        c_tp=list()
        c_fp=[0 for _ in range(len(values))]
        recall_cls=[]

        for r, tmp in enumerate(conf):
            for v, n in enumerate(tmp):
                if r==v:  #tp
                    c_tp.append(n)
                else:
                    c_fp[v]+=n

        if avg=='macro':
            for tmp in range(len(values)):
                try:
                    recall_cls.append(float(c_tp[tmp]/(c_tp[tmp]+c_fp[tmp])))
                except:
                    recall_cls.append(0)

            temp=0

            for tmp in recall_cls:
                temp+=tmp

            return float(temp/len(recall_cls))

        elif avg=='micro':
            ja=0
            mo=0

            for tmp in range(len(values)):
                ja+=c_tp[tmp]
                mo+=c_tp[tmp]+c_fp[tmp]

            return float(ja/mo)

        else:
            raise ValueError("avg expects 'micro' or 'macro'")

    def f1_score(self, true, pred, avg='micro'):

        if avg=='micro' or avg=='macro':
            precision=self.precision_score(true,pred,avg)
            recall=self.recall_score(true,pred,avg)
        else:
            raise ValueError("avg expects 'micro' or 'macro'")

        if precision+recall==0:
            return 0
        return (((precision*recall)/(precision+recall))*2)

    def pos_eval(self, fin):
        # Evaluates POS tagging output from a tab-separated file:
        # one "word<TAB>gold<TAB>pred" line per token, analyses joined by '+',
        # sentences separated by a blank line.

        #temp=os.getcwd()+'/'+fin
        file=open(fin,'r').read()
        sents=file.split("\n\n")

        acc = defaultdict(float)
        t_avg = defaultdict(float)

        for sent in sents:
            lines=sent.split('\n')
            for line in lines:
                tot=line.split('\t')

                if line=='': continue

                wd=tot[0]
                gold=tot[1]
                pred=tot[2]

                acc['all']+=1
                gold_list=gold.split('+')
                pred_list=pred.split('+')

                t_avg["pr_all"]+=len(pred_list)
                t_avg["rc_all"]+=len(gold_list)

                if gold==pred:
                    acc["true"]+=1
                    t_avg['pr']+=len(pred_list)
                    t_avg['rc']+=len(gold_list)
                    continue
                else:
                    intersect=0
                    for g in gold_list:
                        if not g in pred_list: continue
                        intersect+=1
                    t_avg['pr']+=intersect
                    t_avg['rc']+=intersect

        t_avg['pr_result'] = t_avg['pr'] / t_avg['pr_all']
        t_avg['rc_result'] = t_avg['rc'] / t_avg['rc_all']

        return float(acc['true']/acc['all']), t_avg['pr_result'], t_avg['rc_result'], self.f1(t_avg['pr_result'], t_avg['rc_result'])

    def f1(self, p, r):
        return 2 * p * r / (p + r) if p + r else 0

    def precision_at_k(self, true: List[int], pred: List[int], k: int) -> float:
        """
        Precision@k: the fraction of the top-k predicted items that appear in `true`.
        """

        relevant = 0

        if k > len(pred):
            raise ValueError("`k` is bigger than pred's length")

        pred = pred[:k]

        for t in true:
            if t in pred:
                relevant += 1

        return float(relevant/len(pred))

    def recall_at_k(self, true: List[int], pred: List[int], k: int) -> float:

        relevant = 0

        if k > len(pred):
            raise ValueError("`k` is bigger than pred's length")

        pred = pred[:k]

        for t in true:
            if t in pred:
                relevant += 1

        return float(relevant/len(true))

    def hit_rate_at_k(self, user: List[List[int]], pred: List[List[int]], k: int) -> float:
        hit = 0

        for u_list, p_list in zip(user, pred):
            if k > len(p_list):
                raise ValueError("`k` is bigger than pred's length")
            p_list = p_list[:k]
            for u in u_list:
                if u in p_list:
                    hit += 1
                    break

        return float(hit/len(user))

    def mean_absolute_error(self, true: Union[torch.Tensor, np.ndarray], pred: Union[torch.Tensor, np.ndarray]) -> float:
        # mean of the element-wise absolute differences
        true, pred = np.asarray(true), np.asarray(pred)
        return float(np.mean(np.abs(true - pred)))

    def root_mean_square_error(self, true: Union[torch.Tensor, np.ndarray], pred: Union[torch.Tensor, np.ndarray]) -> float:
        # square root of the mean of the element-wise squared differences
        true, pred = np.asarray(true), np.asarray(pred)
        return float(np.sqrt(np.mean((true - pred) ** 2)))

    def _W_CER(self, r, h):
        # Levenshtein-distance based error rate between reference tokens r and
        # hypothesis tokens h, normalized by the reference length and capped at 1.0.

        costs = [[0 for inner in range(len(h)+1)] for outer in range(len(r)+1)]

        DEL_PENALTY=1
        INS_PENALTY=1
        SUB_PENALTY=1

        for i in range(1, len(r)+1):
            costs[i][0] = DEL_PENALTY*i

        for j in range(1, len(h) + 1):
            costs[0][j] = INS_PENALTY*j

        # computation
        for i in range(1, len(r)+1):
            for j in range(1, len(h)+1):
                if r[i-1] == h[j-1]:
                    costs[i][j] = costs[i-1][j-1]
                else:
                    substitutionCost = costs[i-1][j-1] + SUB_PENALTY # penalty is always 1
                    insertionCost = costs[i][j-1] + INS_PENALTY # penalty is always 1
                    deletionCost = costs[i-1][j] + DEL_PENALTY # penalty is always 1

                    costs[i][j] = min(substitutionCost, insertionCost, deletionCost)

        mo = len(r)
        i = len(r)
        j = len(h)

        result=(costs[i][j])/mo

        if result>1.0:
            return 1.0
        else:
            return result

    def wer(self, reference, candidate):
        r = Ko_tokenize.word(reference)
        h = Ko_tokenize.word(candidate)

        return self._W_CER(r,h)

    def cer(self, reference, candidate):
        r = Ko_tokenize.syllable(reference)
        h = Ko_tokenize.syllable(candidate)

        return self._W_CER(r,h)


    def bleu(self, reference, candidate, weights=(0.25,0.25,0.25,0.25), smoothing_function=None):

        if type(candidate)!=list or type(reference)!=list:
            print("reference and candidate parameters must be lists")
            return

        reference=list(map(self.tokenize,reference))
        candidate=Ko_tokenize.word(candidate)

        return sentence_bleu(reference,candidate,weights,smoothing_function=smoothing_function)

    def bleu_n(self, reference, candidate, n=1, smoothing_function=None):

        if n==1:
            return self.bleu(reference,candidate,(1,0,0,0), smoothing_function=smoothing_function)
        elif n==2:
            return self.bleu(reference,candidate,(0,1,0,0), smoothing_function=smoothing_function)
        elif n==3:
            return self.bleu(reference,candidate,(0,0,1,0), smoothing_function=smoothing_function)
        elif n==4:
            return self.bleu(reference,candidate,(0,0,0,1), smoothing_function=smoothing_function)
        else:
            raise ValueError("n must be 1, 2, 3, or 4")


    def _hyp_sent_split_remove(self, can):
        # split the candidate into sentences and strip trailing punctuation
        can_sent=[[tmp.rstrip('.?!,\n')] for tmp in Ko_tokenize.sentence(can)]
        return can_sent

    def _ref_sent_split_remove(self, ref):
        # split each reference into sentences, strip trailing punctuation, tokenize
        ref_sent=[Ko_tokenize.sentence(tmp) for tmp in ref]
        ref_sent_c=[]
        for tmp in ref_sent:
            ref_sent_in=[]
            for tmp2 in tmp:
                ref_sent_in.append(Ko_tokenize.word(tmp2.rstrip('.?!,\n')))
            ref_sent_c.append(ref_sent_in)

        return ref_sent_c

    def _token(self, ref_stoken, can, n):

        numer=[]
        ref_len=0

        can=list(ngrams(can,n))

        for tmp in ref_stoken:

            if n==1:
                ref=list(ngrams(tmp,1))
            elif n==2:
                ref=list(ngrams(tmp,2))
            elif n==3:
                ref=list(ngrams(tmp,3))
            else:
                return 0

            intersect = [t for t in ref if t in can]
            numer.append(len(intersect))
            ref_len+=len(ref)

        try:
            rec= sum(numer)/ref_len
        except:
            rec=0

        return rec

    def rouge_n(self, ref, can, n=1):

        beta=1
        rec,prec=0,0

        can_sent=self._hyp_sent_split_remove(can)
        can_word=list(itertools.chain(*[Ko_tokenize.word(tmp,self.lang) for tmp in can_sent]))
        ref=self._ref_sent_split_remove(ref)

        r_list=[]

        for tmp in ref:
            if n==1:
                r_list.append(self._token(tmp, can_word, 1))
            elif n==2:
                r_list.append(self._token(tmp, can_word, 2))
            elif n==3:
                r_list.append(self._token(tmp, can_word, 3))

        return max(r_list)

    def rouge_l(self, ref, can):

        beta=1
        #check=0

        can= self._hyp_sent_split_remove(can)
        can=[Ko_tokenize.word(tmp,self.lang) for tmp in can]
        refs=self._ref_sent_split_remove(ref)

        can_word=list(itertools.chain(*can))

        result_list=[]

        for ref in refs:
            lcs_list=[]
            for ri in ref:
                ri_C=[]
                for ci in can:
                    temp=self._lcs(ci,ri)
                    ri_C.append(temp)

                ri_C=list(itertools.chain(*ri_C))
                ri_C=set(ri_C)
                lcs_list.append(len(ri_C))

            ref_word=list(itertools.chain(*ref))

            R_lcs=sum(lcs_list)/len(ref_word)
            P_lcs=sum(lcs_list)/len(can_word)

            try:
                F_lcs= (2*R_lcs*P_lcs)/(R_lcs+P_lcs)
            except:
                F_lcs=0
            result_list.append(F_lcs)

        return max(result_list)

    def _lcs(self, can, ref):

        s1=can
        s2=ref
        check=0

        if len(s1)<=len(s2):
            temp=s1
            s1=s2
            s2=temp
            check=1

        m = [[0] * (1 + len(s2)) for i in range(1 + len(s1))]

        for x in range(1, 1 + len(s1)):
            for y in range(1, 1 + len(s2)):
                if s1[x - 1] == s2[y - 1]:
                    m[x][y] = m[x - 1][y - 1] +1
                else:
                    m[x][y]=max(m[x][y-1],m[x-1][y])
        f_x=len(s2)+1
        lcs=m[len(s1)][len(s2)]
        temp=[]

        i=len(s1)
        j=len(s2)

        while m[i][j]!=0:
            if(m[i][j]==m[i][j-1]):
                j-=1
            elif (m[i][j]==m[i-1][j]):
                i-=1
            else:
                if check==0:
                    temp.append(s1[i-1])
                if check==1:
                    temp.append(s2[j-1])
                i-=1
                j-=1

        return temp
        '''
        for y in reversed(range(1,1+len(s1))):
            for x in reversed(range(1,1+len(s2))):
                if (m[y][x]-m[y-1][x-1]==1) and (m[y][x]-m[y-1][x]==1) and (m[y][x]-m[y][x-1]==1):
                    if (y==len(s1)+1) and (x==len(s2)):
                        temp.append(x)
                    else:
                        temp.append(x-1)

        print('only "the police" should be returned', temp)
        if check==0:
            word=s1
        elif check==1:
            word=s2

        ret_list=[]

        for tmp in range(len(temp)):
            ret_list.append(word[temp[tmp]])

        return ret_list
        '''

    def _skip_bigrams(self, ref_stoken, can_sent, can, n=1):

        beta=1
        numer=[]
        ref_len=0

        candidate=list(skipgrams(can,2,n))
        can_sent=[Ko_tokenize.word(tmp,self.lang) for tmp in can_sent]
        can_sk_len=0

        for tmp in ref_stoken:
            ref=list(skipgrams(tmp,2,n))
            intersect=[t for t in ref if t in candidate]
            numer.append(len(intersect))
            ref_len+=len(ref)

        for tmp in can_sent:
            can_sk_len+=len(list(skipgrams(tmp,2,n)))

        prec=sum(numer)/can_sk_len
        rec=sum(numer)/ref_len

        if(prec!=0 and rec!=0):
            score = ((1 + beta**2)*prec*rec)/float(rec + beta**2*prec)
        else:
            score = 0.0
        return score

    def rouge_s(self, ref, can, n):

        can_sent= self._hyp_sent_split_remove(can)
        can_word=list(itertools.chain(*[Ko_tokenize.word(tmp,self.lang) for tmp in can_sent]))
        ref= self._ref_sent_split_remove(ref)

        r_list=[]

        for tmp in ref:
            #tmp=list(itertools.chain(*tmp))
            r_list.append(self._skip_bigrams(tmp,can_sent,can_word,n))

        return max(r_list)

    def cider(self, ref, hyp):

        ref_dict=dict()
        hyp_dict=dict()

        ref_dict[0]=ref
        hyp_dict[0]=hyp

        cider_score=Cider()
        score=cider_score.compute_score(ref_dict,hyp_dict)

        return float(score)

    def _process_espresso_output_format(self, result_list):
        temp_list = []
        for k in result_list:
            #k = k.split('_')
            k = list(k)
            if k[1] == 'SP' or k[1] == 'SY':
                continue
            temp_list.append(k)
        return temp_list

    def _generate_enum(self, ref, hyp):
        result_hyp = []
        result_ref = []
        for h in hyp:
            enum_hyp_list = list(enumerate(h))
            result_hyp.append(enum_hyp_list)
        for r in ref:
            enum_ref_list = list(enumerate(r))
            result_ref.append(enum_ref_list)
        return result_hyp, result_ref

    def _tag_pos_meteor(self, sent_list):
        result_list = list()
        for sent in sent_list:
            tagged_sent = EspressoTagger(task='pos').tag(sent)
            tagged_sent = self._process_espresso_output_format(tagged_sent)
            result_list.append(tagged_sent)
        return result_list

    def _match_enums(self,
                     enum_hypothesis_list: List[Tuple[int, str]],
                     enum_reference_list: List[Tuple[int, str]],
                     ) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]:
        """
        Matches exact words in hypothesis and reference and returns
        a word mapping between enum_hypothesis_list and enum_reference_list
        based on the enumerated word id.

        :param enum_hypothesis_list: enumerated hypothesis list
        :param enum_reference_list: enumerated reference list
        :return: enumerated matched tuples, enumerated unmatched hypothesis tuples,
                 enumerated unmatched reference tuples
        """
        word_match = []
        # print("test 213" , enum_hypothesis_list)
        # print("test 124" , enum_reference_list)
        for i in range(len(enum_hypothesis_list))[::-1]:
            for j in range(len(enum_reference_list))[::-1]:
                # print(f"\n \t {enum_hypothesis_list[i][1]} \t {enum_reference_list[j][1]}")
                if enum_hypothesis_list[i][1] == enum_reference_list[j][1]:
                    # print("Check!!")
                    word_match.append(
                        (enum_hypothesis_list[i][0], enum_reference_list[j][0])
                    )
                    enum_hypothesis_list.pop(i)
                    enum_reference_list.pop(j)
                    break
        return word_match, enum_hypothesis_list, enum_reference_list
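    # Illustrative example (added comment, not from the package): with
    #   enum_hypothesis_list = [(0, 'the'), (1, 'cat')]
    #   enum_reference_list  = [(0, 'the'), (1, 'dog')]
    # _match_enums returns ([(0, 0)], [(1, 'cat')], [(1, 'dog')]):
    # the shared token is aligned by its enumerated id and removed from both lists.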

    def _count_chunks(self, matches: List[Tuple[int, int]]) -> int:
        """
        Counts the fewest possible number of chunks such that matched unigrams
        of each chunk are adjacent to each other. This is used to calculate the
        fragmentation part of the metric.

        :param matches: list containing a mapping of matched words (output of align_words)
        :return: Number of chunks a sentence is divided into post alignment
        """
        i = 0
        chunks = 1
        while i < len(matches) - 1:
            if (matches[i + 1][0] == matches[i][0] + 1) and (
                matches[i + 1][1] == matches[i][1] + 1
            ):
                i += 1
                continue
            i += 1
            chunks += 1
        return chunks
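    # Worked example (added comment, not from the package): matches = [(0, 0), (1, 1), (3, 5)]
    # yields 2 chunks, because (0, 0) and (1, 1) are adjacent in both hypothesis and
    # reference, while (3, 5) starts a new chunk.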

    def _match_syn_with_sejong(self, hyp_list, ref_list):
        # match remaining words through Sejong dictionary synonyms
        syn_match = []
        for i in range(len(hyp_list))[::-1]:
            temp_syn_list = []
            #print("test 344434: ", hyp_list[i])
            if hyp_list[i][1][1] not in self.skip_pos:
                entrys = ssem.entrys(hyp_list[i][1][0])
                for entry in entrys:
                    for sense in entry.senses():
                        if sense.syn():
                            temp_syn_list.append(sense.syn())
            if temp_syn_list:
                hyp_list[i][1].append(deepcopy(temp_syn_list))

            for j in range(len(ref_list))[::-1]:
                is_break = False
                if len(hyp_list[i][1]) == 3:
                    for syn in hyp_list[i][1][2]:
                        if syn[0] == ref_list[j][1][0]:
                            syn_match.append(
                                (hyp_list[i][0], ref_list[j][0])
                            )
                            is_break = True
                            hyp_list.pop(i)
                            ref_list.pop(j)
                            break
                else:
                    if hyp_list[i][1] == ref_list[j][1]:
                        syn_match.append(
                            (hyp_list[i][0], ref_list[j][0])
                        )
                        is_break = True
                        hyp_list.pop(i)
                        ref_list.pop(j)
                if is_break:
                    break

            # print("test 231232 ", hyp_list[i])

        return syn_match, hyp_list, ref_list

    def meteor(self, ref, hyp):
        ref_tag = self._tag_pos_meteor(ref)
        hyp_tag = self._tag_pos_meteor(hyp)
        meteors = []
        alpha = 0.9
        beta = 3.0
        gamma = 0.5
        enum_hyp, enum_ref = self._generate_enum(ref_tag, hyp_tag)
        # print("test 13333 ", enum_hyp)
        for reference in enum_ref:
            hyp_len = len(enum_hyp[0])
            ref_len = len(reference)

            # word/stem matching
            word_match, enum_hyp_list, enum_ref_list = self._match_enums(deepcopy(enum_hyp[0]), reference)
            syn_match, enum_hyp_list, enum_ref_list = self._match_syn_with_sejong(enum_hyp_list, enum_ref_list)
            # print("test 123344 " ,enum_ref_list) ## [(0, ['오늘', 'NN']), (6, ['이', 'VB']), (7, ['었다', 'EE'])]

            final_match = sorted(word_match + syn_match)

            # compute the final score
            final_match_count = len(final_match)
            if final_match_count == 0:
                meteors.append(0.0)
                continue

            precision = float(final_match_count) / hyp_len
            recall = float(final_match_count) / ref_len
            fmean = (precision * recall) / (alpha * precision + (1 - alpha) * recall)
            chunk_count = float(self._count_chunks(final_match))
            frag = chunk_count / final_match_count
            penalty = gamma * frag ** beta
            meteors.append((1 - penalty) * fmean)

        # print(word_match)

        return max(meteors)


def demo():
    y_pred = [5, 2, 4, 1, 3, 2, 5, 6, 7]
    y_true = [1, 3, 6, 7, 1, 5]

    user = [[5, 3, 2], [9, 1, 2], [3, 5, 6], [7, 2, 1]]
    h_pred = [[15, 6, 21, 3], [15, 77, 23, 14], [51, 23, 21, 2], [53, 2, 1, 5]]

    metric = DefaultMetric()
    print(metric.precision_at_k(y_true, y_pred, 3))
    print(metric.recall_at_k(y_true, y_pred, 3))
    print(metric.hit_rate_at_k(user, h_pred, 2))

    hyp = '봉준호 감독이 아카데미에서 국제영화상을 수상했다.'
    ref = ['봉준호가 아카데미에서 각본상을 탔다.']
    re = metric.meteor(ref, hyp)
    print(re)

if __name__=="__main__":
    demo()
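
For reference, below is a minimal usage sketch of the DefaultMetric class defined in nltkor/metrics/classical.py above. It is not shipped in the package; it assumes the wheel and its torch dependency are installed and that Ko_tokenize handles the Korean example strings.

    from nltkor.metrics.classical import DefaultMetric

    metric = DefaultMetric(lang="ko")

    # classification metrics over aligned gold/predicted label sequences
    gold = ['NN', 'VB', 'NN', 'JJ']
    pred = ['NN', 'VB', 'JJ', 'JJ']
    print(metric.accuracy_score(gold, pred))         # 3 of 4 labels match -> 0.75
    print(metric.f1_score(gold, pred, avg='micro'))

    # edit-distance metrics between a reference and a candidate sentence
    reference = '봉준호가 아카데미에서 각본상을 탔다.'
    candidate = '봉준호 감독이 아카데미에서 국제영화상을 수상했다.'
    print(metric.wer(reference, candidate))   # word-level error rate, capped at 1.0
    print(metric.cer(reference, candidate))   # syllable-level error rate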