nltkor-1.2.14-cp311-cp311-macosx_13_0_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. nltkor/Kor_char.py +193 -0
  2. nltkor/__init__.py +16 -0
  3. nltkor/alignment/__init__.py +1315 -0
  4. nltkor/cider/__init__.py +2 -0
  5. nltkor/cider/cider.py +55 -0
  6. nltkor/cider/cider_scorer.py +207 -0
  7. nltkor/distance/__init__.py +441 -0
  8. nltkor/distance/wasserstein.py +126 -0
  9. nltkor/etc.py +22 -0
  10. nltkor/lazyimport.py +144 -0
  11. nltkor/make_requirement.py +11 -0
  12. nltkor/metrics/__init__.py +63 -0
  13. nltkor/metrics/bartscore.py +301 -0
  14. nltkor/metrics/bertscore.py +331 -0
  15. nltkor/metrics/bleu_tensor.py +20 -0
  16. nltkor/metrics/classical.py +847 -0
  17. nltkor/metrics/entment.py +24 -0
  18. nltkor/metrics/eval.py +517 -0
  19. nltkor/metrics/mauve.py +273 -0
  20. nltkor/metrics/mauve_utils.py +131 -0
  21. nltkor/misc/__init__.py +11 -0
  22. nltkor/misc/string2string_basic_functions.py +59 -0
  23. nltkor/misc/string2string_default_tokenizer.py +83 -0
  24. nltkor/misc/string2string_hash_functions.py +159 -0
  25. nltkor/misc/string2string_word_embeddings.py +503 -0
  26. nltkor/search/__init__.py +10 -0
  27. nltkor/search/classical.py +569 -0
  28. nltkor/search/faiss_search.py +787 -0
  29. nltkor/search/kobert_tokenizer.py +181 -0
  30. nltkor/sejong/__init__.py +3 -0
  31. nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
  32. nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
  33. nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
  34. nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
  35. nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
  36. nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
  37. nltkor/sejong/ch.py +12 -0
  38. nltkor/sejong/dict_semClassNum.txt +491 -0
  39. nltkor/sejong/layer.txt +630 -0
  40. nltkor/sejong/sejong_download.py +87 -0
  41. nltkor/sejong/ssem.py +684 -0
  42. nltkor/similarity/__init__.py +3 -0
  43. nltkor/similarity/bartscore____.py +337 -0
  44. nltkor/similarity/bertscore____.py +339 -0
  45. nltkor/similarity/classical.py +245 -0
  46. nltkor/similarity/cosine_similarity.py +175 -0
  47. nltkor/tag/__init__.py +71 -0
  48. nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
  49. nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
  50. nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
  51. nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
  52. nltkor/tag/espresso_tag.py +220 -0
  53. nltkor/tag/libs/__init__.py +10 -0
  54. nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
  55. nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
  56. nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
  57. nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
  58. nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
  59. nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
  60. nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
  61. nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
  62. nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
  63. nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
  64. nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
  65. nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
  66. nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
  67. nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
  68. nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
  69. nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
  70. nltkor/tag/libs/arguments.py +280 -0
  71. nltkor/tag/libs/attributes.py +231 -0
  72. nltkor/tag/libs/config.py +159 -0
  73. nltkor/tag/libs/metadata.py +129 -0
  74. nltkor/tag/libs/ner/__init__.py +2 -0
  75. nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
  76. nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
  77. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
  78. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
  79. nltkor/tag/libs/ner/macmorphoreader.py +7 -0
  80. nltkor/tag/libs/ner/ner_reader.py +92 -0
  81. nltkor/tag/libs/network.c +72325 -0
  82. nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
  83. nltkor/tag/libs/network.pyx +878 -0
  84. nltkor/tag/libs/networkconv.pyx +1028 -0
  85. nltkor/tag/libs/networkdependencyconv.pyx +451 -0
  86. nltkor/tag/libs/parse/__init__.py +1 -0
  87. nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
  88. nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
  89. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
  90. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
  91. nltkor/tag/libs/parse/parse_reader.py +283 -0
  92. nltkor/tag/libs/pos/__init__.py +2 -0
  93. nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
  94. nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
  95. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
  96. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
  97. nltkor/tag/libs/pos/macmorphoreader.py +7 -0
  98. nltkor/tag/libs/pos/pos_reader.py +97 -0
  99. nltkor/tag/libs/reader.py +485 -0
  100. nltkor/tag/libs/srl/__init__.py +3 -0
  101. nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
  102. nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
  103. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
  104. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
  105. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
  106. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
  107. nltkor/tag/libs/srl/__srl_reader_.py +535 -0
  108. nltkor/tag/libs/srl/srl_reader.py +436 -0
  109. nltkor/tag/libs/srl/train_srl.py +87 -0
  110. nltkor/tag/libs/taggers.py +926 -0
  111. nltkor/tag/libs/utils.py +384 -0
  112. nltkor/tag/libs/word_dictionary.py +239 -0
  113. nltkor/tag/libs/wsd/__init__.py +2 -0
  114. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
  115. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
  116. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
  117. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
  118. nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
  119. nltkor/tag/libs/wsd/wsd_reader.py +93 -0
  120. nltkor/tokenize/__init__.py +62 -0
  121. nltkor/tokenize/ko_tokenize.py +115 -0
  122. nltkor/trans.py +121 -0
  123. nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
  124. nltkor-1.2.14.dist-info/METADATA +41 -0
  125. nltkor-1.2.14.dist-info/RECORD +127 -0
  126. nltkor-1.2.14.dist-info/WHEEL +5 -0
  127. nltkor-1.2.14.dist-info/top_level.txt +1 -0
nltkor/metrics/entment.py ADDED
@@ -0,0 +1,24 @@
+ from nltkor.tag import EspressoTagger
+
+ class EntMent :
+
+     def __init__(self):
+         self.entity_list = []
+         self.tagger = EspressoTagger(task = 'ner')
+         self.skip = ["*"]
+
+     def entity (self,orginal_text,summarized_text) :
+
+         self.entity_list = self.tagger.tag(orginal_text)
+         self.entity_list = [item for item in self.entity_list if item[1] not in self.skip]
+         self.entity_recall(summarized_text)
+
+     def entity_recall (self,summarized_text) :
+
+         if len(self.entity_list) == 0 :
+             return 0.0
+
+         summarized_entity_list = self.tagger.tag(summarized_text)
+         summarized_entity_list = [item for item in summarized_entity_list if item[1] not in self.skip]
+         recall = len(set(summarized_entity_list)&set(self.entity_list))/len(summarized_entity_list)
+         print(recall)
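The EntMent class above runs Espresso NER over the source text, drops entities tagged "*", and prints the fraction of those entities that also appear in the summary. A minimal usage sketch, not part of the package: it assumes nltkor and its Espresso NER model are installed locally, that the module is importable by its file path as nltkor.metrics.entment, and it reuses the demo sentences from eval.py below.

# hypothetical usage, not shipped with the wheel
from nltkor.metrics.entment import EntMent

checker = EntMent()
original = '봉준호 감독이 아카데미에서 국제영화상을 수상했다.'
summary = '봉준호가 아카데미에서 각본상을 탔다.'
# entity() tags both texts and prints the entity recall; it does not return the value
checker.entity(original, summary)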
nltkor/metrics/eval.py ADDED
@@ -0,0 +1,517 @@
+ from collections import defaultdict
+ from nltk.translate.bleu_score import *
+ from nltk.metrics import scores
+ from nltk.metrics import confusionmatrix
+ from nltkor.tokenize import word_tokenize,sent_tokenize,syllable_tokenize
+ from nltk.util import ngrams, skipgrams
+ from nltkor.cider.cider import Cider
+ import sys
+ import itertools
+ from nltkor.tag import pos_tag, pos_tag_with_verb_form, EspressoTagger
+ from nltkor.sejong import ssem
+ from typing import Callable, Iterable, List, Tuple
+ from copy import deepcopy
+
+ # Natural Language Toolkit: Machine Translation
+ #
+ # Copyright (C) 2001-2023 NLTK Project
+ # Author: Uday Krishna <udaykrishna5@gmail.com>
+ # Contributor: Tom Aarsen
+ # URL: <https://www.nltk.org/>
+ # For license information, see LICENSE.TXT
+
+ class StringMetric:
+     def __init__(self, lang="korean"):
+         self.lang = lang
+         if lang not in ["korean", "english"]:
+             raise Exception("Only \"korean\" or \"english\" in lang parameter")
+         self.tokenize=lambda ref: word_tokenize(ref,self.lang)
+         self.skip_pos = ['EE']
+
+
+     def _W_CER(self, r, h):
+
+         costs = [[0 for inner in range(len(h)+1)] for outer in range(len(r)+1)]
+
+         DEL_PENALTY=1 # Tact
+         INS_PENALTY=1 # Tact
+         SUB_PENALTY=1 # Tact
+
+         for i in range(1, len(r)+1):
+             costs[i][0] = DEL_PENALTY*i
+
+         for j in range(1, len(h) + 1):
+             costs[0][j] = INS_PENALTY*j
+
+         # computation
+         for i in range(1, len(r)+1):
+             for j in range(1, len(h)+1):
+                 if r[i-1] == h[j-1]:
+                     costs[i][j] = costs[i-1][j-1]
+                 else:
+                     substitutionCost = costs[i-1][j-1] + SUB_PENALTY # penalty is always 1
+                     insertionCost = costs[i][j-1] + INS_PENALTY # penalty is always 1
+                     deletionCost = costs[i-1][j] + DEL_PENALTY # penalty is always 1
+
+                     costs[i][j] = min(substitutionCost, insertionCost, deletionCost)
+
+         mo = len(r)
+         i = len(r)
+         j = len(h)
+
+         result=(costs[i][j])/mo
+
+         if result>1.0:
+             return 1.0
+         else:
+             return result
+
+
+     def wer(self, reference, candidate):
+         r = word_tokenize(reference, self.lang)
+         h = word_tokenize(candidate, self.lang)
+
+         return self._W_CER(r,h)
+
+
+     def cer(self, reference,candidate):
+         r = syllable_tokenize(reference, self.lang)
+         h = syllable_tokenize(candidate, self.lang)
+
+         return self._W_CER(r,h)
+
+
+     def bleu(self, reference, candidate,weights=(0.25,0.25,0.25,0.25), smoothing_function=None):
+
+         if type(candidate)!=list or type(reference)!=list:
+             print("parameter expect list type")
+             return
+
+         reference=list(map(self.tokenize,reference))
+         candidate=word_tokenize(candidate,self.lang)
+
+         return sentence_bleu(reference,candidate,weights,smoothing_function=smoothing_function)
+
+
+     def bleu_n(self, reference,candiate,n=1, smoothing_function=None):
+
+         if n==1:
+             return self.bleu(reference,candiate,(1,0,0,0), smoothing_function=smoothing_function)
+         elif n==2:
+             return self.bleu(reference,candiate,(0,1,0,0), smoothing_function=smoothing_function)
+         elif n==3:
+             return self.bleu(reference,candiate,(0,0,1,0), smoothing_function=smoothing_function)
+         elif n==4:
+             return self.bleu(reference,candiate,(0,0,0,1), smoothing_function=smoothing_function)
+
+
+
+
+     def _hyp_sent_split_remove(self, can):
+
+         can_sent=[[tmp.rstrip('.'or'?'or'!'or','or'\n')] for tmp in sent_tokenize(can, self.lang)]
+         return can_sent
+
+     def _ref_sent_split_remove(self, ref):
+
+         ref_sent=[sent_tokenize(tmp,self.lang) for tmp in ref]
+         ref_sent_c=[]
+         for tmp in ref_sent:
+             ref_sent_in=[]
+             for tmp2 in tmp:
+                 ref_sent_in.append(word_tokenize(tmp2.rstrip('.'or'?'or'!'or','or'\n'),self.lang))
+             ref_sent_c.append(ref_sent_in)
+
+         return ref_sent_c
+
+     def _token(self, ref_stoken, can, n):
+
+         numer=[]
+         ref_len=0
+
+         can=list(ngrams(can,n))
+
+         for tmp in ref_stoken:
+
+             if n==1:
+                 ref=list(ngrams(tmp,1))
+             elif n==2:
+                 ref=list(ngrams(tmp,2))
+             elif n==3:
+                 ref=list(ngrams(tmp,3))
+             else:return 0
+
+             intersect = [t for t in ref if t in can ]
+             numer.append(len(intersect))
+             ref_len+=len(ref)
+
+         try:
+             rec= sum(numer)/ref_len
+         except:
+             rec=0
+
+         return rec
+
+
+
+     def rouge_n(self, ref, can, n=1):
+
+         beta=1
+         rec,prec=0,0
+
+         can_sent=self._hyp_sent_split_remove(can)
+         can_word=list(itertools.chain(*[word_tokenize(tmp,self.lang) for tmp in can_sent]))
+         ref=self._ref_sent_split_remove(ref)
+
+         r_list=[]
+
+         for tmp in ref:
+             if n==1:
+                 r_list.append(self._token(tmp, can_word, 1))
+             elif n==2:
+                 r_list.append(self._token(tmp, can_word, 2))
+             elif n==3:
+                 r_list.append(self._token(tmp, can_word, 3))
+
+         return max(r_list)
+
+
+
+     def rouge_l(self, ref, can):
+
+         beta=1
+         #check=0
+
+         can= self._hyp_sent_split_remove(can)
+         can=[word_tokenize(tmp,self.lang) for tmp in can]
+         refs=self._ref_sent_split_remove(ref)
+
+         can_word=list(itertools.chain(*can))
+
+         result_list=[]
+
+         for ref in refs:
+             lcs_list=[]
+             for ri in ref:
+                 ri_C=[]
+                 for ci in can:
+                     temp=self._lcs(ci,ri)
+                     ri_C.append(temp)
+
+                 ri_C=list(itertools.chain(*ri_C))
+                 ri_C=set(ri_C)
+                 lcs_list.append(len(ri_C))
+
+             ref_word=list(itertools.chain(*ref))
+
+             R_lcs=sum(lcs_list)/len(ref_word)
+             P_lcs=sum(lcs_list)/len(can_word)
+
+             try:
+                 F_lcs= (2*R_lcs*P_lcs)/(R_lcs+P_lcs)
+             except:
+                 F_lcs=0
+             result_list.append(F_lcs)
+
+         return max(result_list)
+
+
+
+     def _lcs(self, can, ref):
+
+
+         s1=can
+         s2=ref
+         check=0
+
+         if len(s1)<=len(s2):
+             temp=s1
+             s1=s2
+             s2=temp
+             check=1
+
+         m = [[0] * (1 + len(s2)) for i in range(1 + len(s1))]
+
+         for x in range(1, 1 + len(s1)):
+             for y in range(1, 1 + len(s2)):
+                 if s1[x - 1] == s2[y - 1]:
+                     m[x][y] = m[x - 1][y - 1] +1
+                 else:
+                     m[x][y]=max(m[x][y-1],m[x-1][y])
+         f_x=len(s2)+1
+         lcs=m[len(s1)][len(s2)]
+         temp=[]
+
+
+         i=len(s1)
+         j=len(s2)
+
+         while m[i][j]!=0:
+             if(m[i][j]==m[i][j-1]):
+                 j-=1
+             elif (m[i][j]==m[i-1][j]):
+                 i-=1
+             else:
+                 if check==0:
+                     temp.append(s1[i-1])
+                 if check==1:
+                     temp.append(s2[j-1])
+                 i-=1
+                 j-=1
+
+         return temp
+         '''
+         for y in reversed(range(1,1+len(s1))):
+             for x in reversed(range(1,1+len(s2))):
+                 if (m[y][x]-m[y-1][x-1]==1) and (m[y][x]-m[y-1][x]==1) and (m[y][x]-m[y][x-1]==1):
+                     if (y==len(s1)+1) and (x==len(s2)):
+                         temp.append(x)
+                     else:
+                         temp.append(x-1)
+
+         print('the police 만 나와줘야',temp)
+         if check==0:
+             word=s1
+         elif check==1:
+             word=s2
+
+         ret_list=[]
+
+         for tmp in range(len(temp)):
+             ret_list.append(word[temp[tmp]])
+
+         return ret_list
+         '''
+
+
+     def _skip_bigrams(self, ref_stoken, can_sent, can, n=1):
+
+         beta=1
+         numer=[]
+         ref_len=0
+
+         candidate=list(skipgrams(can,2,n))
+         can_sent=[word_tokenize(tmp,self.lang) for tmp in can_sent]
+         can_sk_len=0
+
+         for tmp in ref_stoken:
+             ref=list(skipgrams(tmp,2,n))
+             intersect=[t for t in ref if t in candidate]
+             numer.append(len(intersect))
+             ref_len+=len(ref)
+
+         for tmp in can_sent:
+             can_sk_len+=len(list(skipgrams(tmp,2,n)))
+
+         prec=sum(numer)/can_sk_len
+         rec=sum(numer)/ref_len
+
+         if(prec!=0 and rec!=0):
+             score = ((1 + beta**2)*prec*rec)/float(rec + beta**2*prec)
+         else:
+             score = 0.0
+         return score
+
+
+     def rouge_s(self, ref, can, n):
+
+         can_sent= self._hyp_sent_split_remove(can)
+         can_word=list(itertools.chain(*[word_tokenize(tmp,self.lang) for tmp in can_sent]))
+         ref= self._ref_sent_split_remove(ref)
+
+
+         r_list=[]
+
+         for tmp in ref:
+             #tmp=list(itertools.chain(*tmp))
+             r_list.append(self._skip_bigrams(tmp,can_sent,can_word,n))
+
+         return max(r_list)
+
+
+     def cider(self, ref, hyp):
+
+         ref_dict=dict()
+         hyp_dict=dict()
+
+         ref_dict[0]=ref
+         hyp_dict[0]=hyp
+
+         cider_score=Cider()
+         score=cider_score.compute_score(ref_dict,hyp_dict)
+
+         return float(score)
+
+     def _process_espresso_output_format(self, result_list):
+         temp_list = []
+         for k in result_list:
+             k = k.split('_')
+             if k[1] == 'SP' or k[1] == 'SY':
+                 continue
+             temp_list.append(k)
+         return temp_list
+
+     def _generate_enum(self, ref, hyp):
+         result_hyp = []
+         result_ref = []
+         for h in hyp:
+             enum_hyp_list = list(enumerate(h))
+             result_hyp.append(enum_hyp_list)
+         for r in ref:
+             enum_ref_list = list(enumerate(r))
+             result_ref.append(enum_ref_list)
+         return result_hyp, result_ref
+
+     def _tag_pos_meteor(self, sent_list):
+         result_list = list()
+         for sent in sent_list:
+             tagged_sent = EspressoTagger(task='pos').tag(sent)
+             tagged_sent = self._process_espresso_output_format(tagged_sent)
+             result_list.append(tagged_sent)
+         return result_list
+
+     def _match_enums(self,
+         enum_hypothesis_list: List[Tuple[int, str]],
+         enum_reference_list: List[Tuple[int, str]],
+     ) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]:
+         """
+         matches exact words in hypothesis and reference and returns
+         a word mapping between enum_hypothesis_list and enum_reference_list
+         based on the enumerated word id.
+
+         :param enum_hypothesis_list: enumerated hypothesis list
+         :param enum_reference_list: enumerated reference list
+         :return: enumerated matched tuples, enumerated unmatched hypothesis tuples,
+                  enumerated unmatched reference tuples
+         """
+         word_match = []
+         # print("test 213" , enum_hypothesis_list)
+         # print("test 124" , enum_reference_list)
+         for i in range(len(enum_hypothesis_list))[::-1]:
+             for j in range(len(enum_reference_list))[::-1]:
+                 # print(f"\n \t {enum_hypothesis_list[i][1]} \t {enum_reference_list[j][1]}")
+                 if enum_hypothesis_list[i][1] == enum_reference_list[j][1]:
+
+                     # print("Check!!")
+                     word_match.append(
+                         (enum_hypothesis_list[i][0], enum_reference_list[j][0])
+                     )
+                     enum_hypothesis_list.pop(i)
+                     enum_reference_list.pop(j)
+                     break
+         return word_match, enum_hypothesis_list, enum_reference_list
+
+
+     def _count_chunks(self, matches: List[Tuple[int, int]]) -> int:
+         """
+         Counts the fewest possible number of chunks such that matched unigrams
+         of each chunk are adjacent to each other. This is used to calculate the
+         fragmentation part of the metric.
+
+         :param matches: list containing a mapping of matched words (output of align_words)
+         :return: Number of chunks a sentence is divided into post alignment
+         """
+         i = 0
+         chunks = 1
+         while i < len(matches) - 1:
+             if (matches[i + 1][0] == matches[i][0] + 1) and (
+                 matches[i + 1][1] == matches[i][1] + 1
+             ):
+                 i += 1
+                 continue
+             i += 1
+             chunks += 1
+         return chunks
+
+     def _match_syn_with_sejong(self, hyp_list, ref_list):
+         syn_match = []
+         for i in range(len(hyp_list))[::-1]:
+             temp_syn_list = []
+             #print("test 344434: ", hyp_list[i])
+             if hyp_list[i][1][1] not in self.skip_pos:
+                 entrys = ssem.entrys(hyp_list[i][1][0])
+                 for entry in entrys:
+                     for sense in entry.senses():
+                         if sense.syn():
+                             temp_syn_list.append(sense.syn())
+             if temp_syn_list:
+                 hyp_list[i][1].append(deepcopy(temp_syn_list))
+
+             for j in range(len(ref_list))[::-1]:
+                 is_break = False
+                 if len(hyp_list[i][1]) == 3:
+                     for syn in hyp_list[i][1][2]:
+
+                         if syn[0] == ref_list[j][1][0]:
+                             syn_match.append(
+                                 (hyp_list[i][0], ref_list[j][0])
+                             )
+                             is_break = True
+                             hyp_list.pop(i)
+                             ref_list.pop(j)
+                             break
+                 else:
+                     if hyp_list[i][1] == ref_list[1][1]:
+                         syn_match.append(
+                             (hyp_list[i][0], ref_list[j][0])
+                         )
+                         is_break = True
+                         hyp_list.pop(i)
+                         ref_list.pop(j)
+                 if is_break:
+                     break
+
+
+
+             # print("test 231232 ", hyp_list[i])
+
+
+         return syn_match, hyp_list, ref_list
+
+     def meteor(self, ref, hyp):
+         ref_tag = self._tag_pos_meteor(ref)
+         hyp_tag = self._tag_pos_meteor(hyp)
+         meteors = []
+         alpha = 0.9
+         beta = 3.0
+         gamma = 0.5
+         enum_hyp, enum_ref = self._generate_enum(ref_tag, hyp_tag)
+         # print("test 13333 ", enum_hyp)
+         for reference in enum_ref:
+             hyp_len = len(enum_hyp[0])
+             ref_len = len(reference)
+
+             # word/stem matching
+             word_match, enum_hyp_list, enum_ref_list = self._match_enums(deepcopy(enum_hyp[0]), reference)
+             syn_match, enum_hyp_list, enum_ref_list = self._match_syn_with_sejong(enum_hyp_list, enum_ref_list)
+             # print("test 123344 " ,enum_ref_list) ## [(0, ['오늘', 'NN']), (6, ['이', 'VB']), (7, ['었다', 'EE'])]
+
+             final_match = sorted(word_match + syn_match)
+
+             # compute the final score
+             final_match_count = len(final_match)
+
+
+             precision = float(final_match_count) / hyp_len
+             recall = float(final_match_count) / ref_len
+             fmean = (precision * recall) / (alpha * precision + (1 - alpha) * recall)
+             chunk_count = float(self._count_chunks(final_match))
+             frag = 0.0
+             if final_match_count != 0:
+                 frag = chunk_count / final_match_count
+             else:
+                 frag = 0.0
+             penalty = gamma * frag ** beta
+             meteors.append((1 - penalty) * fmean)
+
+         # print(word_match)
+
+         return max(meteors)
+
+
+ if __name__=="__main__":
+     hyp='봉준호 감독이 아카데미에서 국제영화상을 수상했다.'
+     ref=['봉준호가 아카데미에서 각본상을 탔다.']
+     metric = StringMetric()
+     re = metric.meteor(ref, hyp)
+     print(re)
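The StringMetric class in eval.py bundles several string-comparison metrics for Korean (or English) text: word and character error rate (wer, cer), BLEU (bleu, bleu_n), ROUGE-N/L/S (rouge_n, rouge_l, rouge_s), CIDEr (cider), and a Sejong-synonym-aware METEOR (meteor, demonstrated by the __main__ block above). A minimal usage sketch for the error-rate metrics, which take plain strings; it is not part of the package and assumes nltkor and its tokenizers are installed, with the class imported via its module path.

# hypothetical usage, not shipped with the wheel
from nltkor.metrics.eval import StringMetric

metric = StringMetric(lang="korean")   # "korean" is the default; "english" is the only other accepted value

reference = '봉준호 감독이 아카데미에서 국제영화상을 수상했다.'
candidate = '봉준호가 아카데미에서 각본상을 탔다.'

print(metric.wer(reference, candidate))  # word error rate over word_tokenize output, capped at 1.0
print(metric.cer(reference, candidate))  # character error rate over syllable_tokenize output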