nltkor-1.2.14-cp311-cp311-macosx_13_0_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nltkor/Kor_char.py +193 -0
- nltkor/__init__.py +16 -0
- nltkor/alignment/__init__.py +1315 -0
- nltkor/cider/__init__.py +2 -0
- nltkor/cider/cider.py +55 -0
- nltkor/cider/cider_scorer.py +207 -0
- nltkor/distance/__init__.py +441 -0
- nltkor/distance/wasserstein.py +126 -0
- nltkor/etc.py +22 -0
- nltkor/lazyimport.py +144 -0
- nltkor/make_requirement.py +11 -0
- nltkor/metrics/__init__.py +63 -0
- nltkor/metrics/bartscore.py +301 -0
- nltkor/metrics/bertscore.py +331 -0
- nltkor/metrics/bleu_tensor.py +20 -0
- nltkor/metrics/classical.py +847 -0
- nltkor/metrics/entment.py +24 -0
- nltkor/metrics/eval.py +517 -0
- nltkor/metrics/mauve.py +273 -0
- nltkor/metrics/mauve_utils.py +131 -0
- nltkor/misc/__init__.py +11 -0
- nltkor/misc/string2string_basic_functions.py +59 -0
- nltkor/misc/string2string_default_tokenizer.py +83 -0
- nltkor/misc/string2string_hash_functions.py +159 -0
- nltkor/misc/string2string_word_embeddings.py +503 -0
- nltkor/search/__init__.py +10 -0
- nltkor/search/classical.py +569 -0
- nltkor/search/faiss_search.py +787 -0
- nltkor/search/kobert_tokenizer.py +181 -0
- nltkor/sejong/__init__.py +3 -0
- nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
- nltkor/sejong/ch.py +12 -0
- nltkor/sejong/dict_semClassNum.txt +491 -0
- nltkor/sejong/layer.txt +630 -0
- nltkor/sejong/sejong_download.py +87 -0
- nltkor/sejong/ssem.py +684 -0
- nltkor/similarity/__init__.py +3 -0
- nltkor/similarity/bartscore____.py +337 -0
- nltkor/similarity/bertscore____.py +339 -0
- nltkor/similarity/classical.py +245 -0
- nltkor/similarity/cosine_similarity.py +175 -0
- nltkor/tag/__init__.py +71 -0
- nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
- nltkor/tag/espresso_tag.py +220 -0
- nltkor/tag/libs/__init__.py +10 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
- nltkor/tag/libs/arguments.py +280 -0
- nltkor/tag/libs/attributes.py +231 -0
- nltkor/tag/libs/config.py +159 -0
- nltkor/tag/libs/metadata.py +129 -0
- nltkor/tag/libs/ner/__init__.py +2 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/macmorphoreader.py +7 -0
- nltkor/tag/libs/ner/ner_reader.py +92 -0
- nltkor/tag/libs/network.c +72325 -0
- nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
- nltkor/tag/libs/network.pyx +878 -0
- nltkor/tag/libs/networkconv.pyx +1028 -0
- nltkor/tag/libs/networkdependencyconv.pyx +451 -0
- nltkor/tag/libs/parse/__init__.py +1 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/parse_reader.py +283 -0
- nltkor/tag/libs/pos/__init__.py +2 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/macmorphoreader.py +7 -0
- nltkor/tag/libs/pos/pos_reader.py +97 -0
- nltkor/tag/libs/reader.py +485 -0
- nltkor/tag/libs/srl/__init__.py +3 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__srl_reader_.py +535 -0
- nltkor/tag/libs/srl/srl_reader.py +436 -0
- nltkor/tag/libs/srl/train_srl.py +87 -0
- nltkor/tag/libs/taggers.py +926 -0
- nltkor/tag/libs/utils.py +384 -0
- nltkor/tag/libs/word_dictionary.py +239 -0
- nltkor/tag/libs/wsd/__init__.py +2 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
- nltkor/tag/libs/wsd/wsd_reader.py +93 -0
- nltkor/tokenize/__init__.py +62 -0
- nltkor/tokenize/ko_tokenize.py +115 -0
- nltkor/trans.py +121 -0
- nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
- nltkor-1.2.14.dist-info/METADATA +41 -0
- nltkor-1.2.14.dist-info/RECORD +127 -0
- nltkor-1.2.14.dist-info/WHEEL +5 -0
- nltkor-1.2.14.dist-info/top_level.txt +1 -0
@@ -0,0 +1,24 @@
from nltkor.tag import EspressoTagger


class EntMent :

    def __init__(self):
        self.entity_list = []
        self.tagger = EspressoTagger(task = 'ner')
        self.skip = ["*"]

    def entity (self,orginal_text,summarized_text) :

        self.entity_list = self.tagger.tag(orginal_text)
        self.entity_list = [item for item in self.entity_list if item[1] not in self.skip]
        self.entity_recall(summarized_text)

    def entity_recall (self,summarized_text) :

        if len(self.entity_list) == 0 :
            return 0.0

        summarized_entity_list = self.tagger.tag(summarized_text)
        summarized_entity_list = [item for item in summarized_entity_list if item[1] not in self.skip]
        recall = len(set(summarized_entity_list)&set(self.entity_list))/len(summarized_entity_list)
        print(recall)
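For orientation, a minimal usage sketch of the EntMent class above. The import path is only inferred from the file listing (nltkor/metrics/entment.py), an installed Espresso NER model is assumed, and note that entity() prints the score it computes rather than returning it:

    from nltkor.metrics.entment import EntMent  # import path assumed from the file listing

    original_text = "..."   # source document (Korean text), placeholder
    summary_text = "..."    # system-generated summary, placeholder
    em = EntMent()
    em.entity(original_text, summary_text)  # tags entities in the original, then prints the entity-overlap score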
nltkor/metrics/eval.py
ADDED
@@ -0,0 +1,517 @@
from collections import defaultdict
from nltk.translate.bleu_score import *
from nltk.metrics import scores
from nltk.metrics import confusionmatrix
from nltkor.tokenize import word_tokenize,sent_tokenize,syllable_tokenize
from nltk.util import ngrams, skipgrams
from nltkor.cider.cider import Cider
import sys
import itertools
from nltkor.tag import pos_tag, pos_tag_with_verb_form, EspressoTagger
from nltkor.sejong import ssem
from typing import Callable, Iterable, List, Tuple
from copy import deepcopy

# Natural Language Toolkit: Machine Translation
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Uday Krishna <udaykrishna5@gmail.com>
# Contributor: Tom Aarsen
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

class StringMetric:
    def __init__(self, lang="korean"):
        self.lang = lang
        if lang not in ["korean", "english"]:
            raise Exception("Only \"korean\" or \"english\" in lang parameter")
        self.tokenize=lambda ref: word_tokenize(ref,self.lang)
        self.skip_pos = ['EE']


    def _W_CER(self, r, h):

        costs = [[0 for inner in range(len(h)+1)] for outer in range(len(r)+1)]

        DEL_PENALTY=1 # Tact
        INS_PENALTY=1 # Tact
        SUB_PENALTY=1 # Tact

        for i in range(1, len(r)+1):
            costs[i][0] = DEL_PENALTY*i

        for j in range(1, len(h) + 1):
            costs[0][j] = INS_PENALTY*j

        # computation
        for i in range(1, len(r)+1):
            for j in range(1, len(h)+1):
                if r[i-1] == h[j-1]:
                    costs[i][j] = costs[i-1][j-1]
                else:
                    substitutionCost = costs[i-1][j-1] + SUB_PENALTY # penalty is always 1
                    insertionCost = costs[i][j-1] + INS_PENALTY # penalty is always 1
                    deletionCost = costs[i-1][j] + DEL_PENALTY # penalty is always 1

                    costs[i][j] = min(substitutionCost, insertionCost, deletionCost)

        mo = len(r)
        i = len(r)
        j = len(h)

        result=(costs[i][j])/mo

        if result>1.0:
            return 1.0
        else:
            return result


    def wer(self, reference, candidate):
        r = word_tokenize(reference, self.lang)
        h = word_tokenize(candidate, self.lang)

        return self._W_CER(r,h)


    def cer(self, reference,candidate):
        r = syllable_tokenize(reference, self.lang)
        h = syllable_tokenize(candidate, self.lang)

        return self._W_CER(r,h)


    def bleu(self, reference, candidate,weights=(0.25,0.25,0.25,0.25), smoothing_function=None):

        if type(candidate)!=list or type(reference)!=list:
            print("parameter expect list type")
            return

        reference=list(map(self.tokenize,reference))
        candidate=word_tokenize(candidate,self.lang)

        return sentence_bleu(reference,candidate,weights,smoothing_function=smoothing_function)


    def bleu_n(self, reference,candiate,n=1, smoothing_function=None):

        if n==1:
            return self.bleu(reference,candiate,(1,0,0,0), smoothing_function=smoothing_function)
        elif n==2:
            return self.bleu(reference,candiate,(0,1,0,0), smoothing_function=smoothing_function)
        elif n==3:
            return self.bleu(reference,candiate,(0,0,1,0), smoothing_function=smoothing_function)
        elif n==4:
            return self.bleu(reference,candiate,(0,0,0,1), smoothing_function=smoothing_function)


    def _hyp_sent_split_remove(self, can):

        can_sent=[[tmp.rstrip('.'or'?'or'!'or','or'\n')] for tmp in sent_tokenize(can, self.lang)]
        return can_sent

    def _ref_sent_split_remove(self, ref):

        ref_sent=[sent_tokenize(tmp,self.lang) for tmp in ref]
        ref_sent_c=[]
        for tmp in ref_sent:
            ref_sent_in=[]
            for tmp2 in tmp:
                ref_sent_in.append(word_tokenize(tmp2.rstrip('.'or'?'or'!'or','or'\n'),self.lang))
            ref_sent_c.append(ref_sent_in)

        return ref_sent_c

    def _token(self, ref_stoken, can, n):

        numer=[]
        ref_len=0

        can=list(ngrams(can,n))

        for tmp in ref_stoken:

            if n==1:
                ref=list(ngrams(tmp,1))
            elif n==2:
                ref=list(ngrams(tmp,2))
            elif n==3:
                ref=list(ngrams(tmp,3))
            else:return 0

            intersect = [t for t in ref if t in can ]
            numer.append(len(intersect))
            ref_len+=len(ref)

        try:
            rec= sum(numer)/ref_len
        except:
            rec=0

        return rec


    def rouge_n(self, ref, can, n=1):

        beta=1
        rec,prec=0,0

        can_sent=self._hyp_sent_split_remove(can)
        can_word=list(itertools.chain(*[word_tokenize(tmp,self.lang) for tmp in can_sent]))
        ref=self._ref_sent_split_remove(ref)

        r_list=[]

        for tmp in ref:
            if n==1:
                r_list.append(self._token(tmp, can_word, 1))
            elif n==2:
                r_list.append(self._token(tmp, can_word, 2))
            elif n==3:
                r_list.append(self._token(tmp, can_word, 3))

        return max(r_list)


    def rouge_l(self, ref, can):

        beta=1
        #check=0

        can= self._hyp_sent_split_remove(can)
        can=[word_tokenize(tmp,self.lang) for tmp in can]
        refs=self._ref_sent_split_remove(ref)

        can_word=list(itertools.chain(*can))

        result_list=[]

        for ref in refs:
            lcs_list=[]
            for ri in ref:
                ri_C=[]
                for ci in can:
                    temp=self._lcs(ci,ri)
                    ri_C.append(temp)

                ri_C=list(itertools.chain(*ri_C))
                ri_C=set(ri_C)
                lcs_list.append(len(ri_C))

            ref_word=list(itertools.chain(*ref))

            R_lcs=sum(lcs_list)/len(ref_word)
            P_lcs=sum(lcs_list)/len(can_word)

            try:
                F_lcs= (2*R_lcs*P_lcs)/(R_lcs+P_lcs)
            except:
                F_lcs=0
            result_list.append(F_lcs)

        return max(result_list)


    def _lcs(self, can, ref):

        s1=can
        s2=ref
        check=0

        if len(s1)<=len(s2):
            temp=s1
            s1=s2
            s2=temp
            check=1

        m = [[0] * (1 + len(s2)) for i in range(1 + len(s1))]

        for x in range(1, 1 + len(s1)):
            for y in range(1, 1 + len(s2)):
                if s1[x - 1] == s2[y - 1]:
                    m[x][y] = m[x - 1][y - 1] +1
                else:
                    m[x][y]=max(m[x][y-1],m[x-1][y])
        f_x=len(s2)+1
        lcs=m[len(s1)][len(s2)]
        temp=[]

        i=len(s1)
        j=len(s2)

        while m[i][j]!=0:
            if(m[i][j]==m[i][j-1]):
                j-=1
            elif (m[i][j]==m[i-1][j]):
                i-=1
            else:
                if check==0:
                    temp.append(s1[i-1])
                if check==1:
                    temp.append(s2[j-1])
                i-=1
                j-=1

        return temp
        '''
        for y in reversed(range(1,1+len(s1))):
            for x in reversed(range(1,1+len(s2))):
                if (m[y][x]-m[y-1][x-1]==1) and (m[y][x]-m[y-1][x]==1) and (m[y][x]-m[y][x-1]==1):
                    if (y==len(s1)+1) and (x==len(s2)):
                        temp.append(x)
                    else:
                        temp.append(x-1)

        print('the police 만 나와줘야',temp)
        if check==0:
            word=s1
        elif check==1:
            word=s2

        ret_list=[]

        for tmp in range(len(temp)):
            ret_list.append(word[temp[tmp]])

        return ret_list
        '''


    def _skip_bigrams(self, ref_stoken, can_sent, can, n=1):

        beta=1
        numer=[]
        ref_len=0

        candidate=list(skipgrams(can,2,n))
        can_sent=[word_tokenize(tmp,self.lang) for tmp in can_sent]
        can_sk_len=0

        for tmp in ref_stoken:
            ref=list(skipgrams(tmp,2,n))
            intersect=[t for t in ref if t in candidate]
            numer.append(len(intersect))
            ref_len+=len(ref)

        for tmp in can_sent:
            can_sk_len+=len(list(skipgrams(tmp,2,n)))

        prec=sum(numer)/can_sk_len
        rec=sum(numer)/ref_len

        if(prec!=0 and rec!=0):
            score = ((1 + beta**2)*prec*rec)/float(rec + beta**2*prec)
        else:
            score = 0.0
        return score


    def rouge_s(self, ref, can, n):

        can_sent= self._hyp_sent_split_remove(can)
        can_word=list(itertools.chain(*[word_tokenize(tmp,self.lang) for tmp in can_sent]))
        ref= self._ref_sent_split_remove(ref)

        r_list=[]

        for tmp in ref:
            #tmp=list(itertools.chain(*tmp))
            r_list.append(self._skip_bigrams(tmp,can_sent,can_word,n))

        return max(r_list)


    def cider(self, ref, hyp):

        ref_dict=dict()
        hyp_dict=dict()

        ref_dict[0]=ref
        hyp_dict[0]=hyp

        cider_score=Cider()
        score=cider_score.compute_score(ref_dict,hyp_dict)

        return float(score)

    def _process_espresso_output_format(self, result_list):
        temp_list = []
        for k in result_list:
            k = k.split('_')
            if k[1] == 'SP' or k[1] == 'SY':
                continue
            temp_list.append(k)
        return temp_list

    def _generate_enum(self, ref, hyp):
        result_hyp = []
        result_ref = []
        for h in hyp:
            enum_hyp_list = list(enumerate(h))
            result_hyp.append(enum_hyp_list)
        for r in ref:
            enum_ref_list = list(enumerate(r))
            result_ref.append(enum_ref_list)
        return result_hyp, result_ref

    def _tag_pos_meteor(self, sent_list):
        result_list = list()
        for sent in sent_list:
            tagged_sent = EspressoTagger(task='pos').tag(sent)
            tagged_sent = self._process_espresso_output_format(tagged_sent)
            result_list.append(tagged_sent)
        return result_list

    def _match_enums(self,
        enum_hypothesis_list: List[Tuple[int, str]],
        enum_reference_list: List[Tuple[int, str]],
    ) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]:
        """
        matches exact words in hypothesis and reference and returns
        a word mapping between enum_hypothesis_list and enum_reference_list
        based on the enumerated word id.

        :param enum_hypothesis_list: enumerated hypothesis list
        :param enum_reference_list: enumerated reference list
        :return: enumerated matched tuples, enumerated unmatched hypothesis tuples,
            enumerated unmatched reference tuples
        """
        word_match = []
        # print("test 213" , enum_hypothesis_list)
        # print("test 124" , enum_reference_list)
        for i in range(len(enum_hypothesis_list))[::-1]:
            for j in range(len(enum_reference_list))[::-1]:
                # print(f"\n \t {enum_hypothesis_list[i][1]} \t {enum_reference_list[j][1]}")
                if enum_hypothesis_list[i][1] == enum_reference_list[j][1]:

                    # print("Check!!")
                    word_match.append(
                        (enum_hypothesis_list[i][0], enum_reference_list[j][0])
                    )
                    enum_hypothesis_list.pop(i)
                    enum_reference_list.pop(j)
                    break
        return word_match, enum_hypothesis_list, enum_reference_list


    def _count_chunks(self, matches: List[Tuple[int, int]]) -> int:
        """
        Counts the fewest possible number of chunks such that matched unigrams
        of each chunk are adjacent to each other. This is used to calculate the
        fragmentation part of the metric.

        :param matches: list containing a mapping of matched words (output of align_words)
        :return: Number of chunks a sentence is divided into post alignment
        """
        i = 0
        chunks = 1
        while i < len(matches) - 1:
            if (matches[i + 1][0] == matches[i][0] + 1) and (
                matches[i + 1][1] == matches[i][1] + 1
            ):
                i += 1
                continue
            i += 1
            chunks += 1
        return chunks

    def _match_syn_with_sejong(self, hyp_list, ref_list):
        syn_match = []
        for i in range(len(hyp_list))[::-1]:
            temp_syn_list = []
            #print("test 344434: ", hyp_list[i])
            if hyp_list[i][1][1] not in self.skip_pos:
                entrys = ssem.entrys(hyp_list[i][1][0])
                for entry in entrys:
                    for sense in entry.senses():
                        if sense.syn():
                            temp_syn_list.append(sense.syn())
            if temp_syn_list:
                hyp_list[i][1].append(deepcopy(temp_syn_list))

            for j in range(len(ref_list))[::-1]:
                is_break = False
                if len(hyp_list[i][1]) == 3:
                    for syn in hyp_list[i][1][2]:

                        if syn[0] == ref_list[j][1][0]:
                            syn_match.append(
                                (hyp_list[i][0], ref_list[j][0])
                            )
                            is_break = True
                            hyp_list.pop(i)
                            ref_list.pop(j)
                            break
                else:
                    if hyp_list[i][1] == ref_list[1][1]:
                        syn_match.append(
                            (hyp_list[i][0], ref_list[j][0])
                        )
                        is_break = True
                        hyp_list.pop(i)
                        ref_list.pop(j)
                if is_break:
                    break

            # print("test 231232 ", hyp_list[i])

        return syn_match, hyp_list, ref_list

    def meteor(self, ref, hyp):
        ref_tag = self._tag_pos_meteor(ref)
        hyp_tag = self._tag_pos_meteor(hyp)
        meteors = []
        alpha = 0.9
        beta = 3.0
        gamma = 0.5
        enum_hyp, enum_ref = self._generate_enum(ref_tag, hyp_tag)
        # print("test 13333 ", enum_hyp)
        for reference in enum_ref:
            hyp_len = len(enum_hyp[0])
            ref_len = len(reference)

            # 단어/어간 매칭
            word_match, enum_hyp_list, enum_ref_list = self._match_enums(deepcopy(enum_hyp[0]), reference)
            syn_match, enum_hyp_list, enum_ref_list = self._match_syn_with_sejong(enum_hyp_list, enum_ref_list)
            # print("test 123344 " ,enum_ref_list) ## [(0, ['오늘', 'NN']), (6, ['이', 'VB']), (7, ['었다', 'EE'])]

            final_match = sorted(word_match + syn_match)

            #최종 결과 계산
            final_match_count = len(final_match)

            precision = float(final_match_count) / hyp_len
            recall = float(final_match_count) / ref_len
            fmean = (precision * recall) / (alpha * precision + (1 - alpha) * recall)
            chunk_count = float(self._count_chunks(final_match))
            frag = 0.0
            if final_match_count != 0:
                frag = chunk_count / final_match_count
            else:
                frag = 0.0
            penalty = gamma * frag ** beta
            meteors.append((1 - penalty) * fmean)

            # print(word_match)

        return max(meteors)


if __name__=="__main__":
    hyp='봉준호 감독이 아카데미에서 국제영화상을 수상했다.'
    ref=['봉준호가 아카데미에서 각본상을 탔다.']
    metric = StringMetric()
    re = metric.meteor(ref, hyp)
    print(re)
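For reference, the value computed by meteor() above follows the standard METEOR formulation, with the constants hard-coded in the method (alpha = 0.9, beta = 3.0, gamma = 0.5):

    F_mean  = (P * R) / (alpha * P + (1 - alpha) * R)
    frag    = chunks / matches
    penalty = gamma * frag ** beta
    score   = (1 - penalty) * F_mean

where P and R are the unigram precision and recall of the matched tokens (exact matches plus Sejong-dictionary synonym matches), chunks is the value returned by _count_chunks(), and the final score is the maximum over the supplied references — as exercised by the __main__ block above, which calls metric.meteor(ref, hyp) on one Korean reference/hypothesis pair.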