nltkor-1.2.14-cp311-cp311-macosx_13_0_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nltkor/Kor_char.py +193 -0
- nltkor/__init__.py +16 -0
- nltkor/alignment/__init__.py +1315 -0
- nltkor/cider/__init__.py +2 -0
- nltkor/cider/cider.py +55 -0
- nltkor/cider/cider_scorer.py +207 -0
- nltkor/distance/__init__.py +441 -0
- nltkor/distance/wasserstein.py +126 -0
- nltkor/etc.py +22 -0
- nltkor/lazyimport.py +144 -0
- nltkor/make_requirement.py +11 -0
- nltkor/metrics/__init__.py +63 -0
- nltkor/metrics/bartscore.py +301 -0
- nltkor/metrics/bertscore.py +331 -0
- nltkor/metrics/bleu_tensor.py +20 -0
- nltkor/metrics/classical.py +847 -0
- nltkor/metrics/entment.py +24 -0
- nltkor/metrics/eval.py +517 -0
- nltkor/metrics/mauve.py +273 -0
- nltkor/metrics/mauve_utils.py +131 -0
- nltkor/misc/__init__.py +11 -0
- nltkor/misc/string2string_basic_functions.py +59 -0
- nltkor/misc/string2string_default_tokenizer.py +83 -0
- nltkor/misc/string2string_hash_functions.py +159 -0
- nltkor/misc/string2string_word_embeddings.py +503 -0
- nltkor/search/__init__.py +10 -0
- nltkor/search/classical.py +569 -0
- nltkor/search/faiss_search.py +787 -0
- nltkor/search/kobert_tokenizer.py +181 -0
- nltkor/sejong/__init__.py +3 -0
- nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
- nltkor/sejong/ch.py +12 -0
- nltkor/sejong/dict_semClassNum.txt +491 -0
- nltkor/sejong/layer.txt +630 -0
- nltkor/sejong/sejong_download.py +87 -0
- nltkor/sejong/ssem.py +684 -0
- nltkor/similarity/__init__.py +3 -0
- nltkor/similarity/bartscore____.py +337 -0
- nltkor/similarity/bertscore____.py +339 -0
- nltkor/similarity/classical.py +245 -0
- nltkor/similarity/cosine_similarity.py +175 -0
- nltkor/tag/__init__.py +71 -0
- nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
- nltkor/tag/espresso_tag.py +220 -0
- nltkor/tag/libs/__init__.py +10 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
- nltkor/tag/libs/arguments.py +280 -0
- nltkor/tag/libs/attributes.py +231 -0
- nltkor/tag/libs/config.py +159 -0
- nltkor/tag/libs/metadata.py +129 -0
- nltkor/tag/libs/ner/__init__.py +2 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/macmorphoreader.py +7 -0
- nltkor/tag/libs/ner/ner_reader.py +92 -0
- nltkor/tag/libs/network.c +72325 -0
- nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
- nltkor/tag/libs/network.pyx +878 -0
- nltkor/tag/libs/networkconv.pyx +1028 -0
- nltkor/tag/libs/networkdependencyconv.pyx +451 -0
- nltkor/tag/libs/parse/__init__.py +1 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/parse_reader.py +283 -0
- nltkor/tag/libs/pos/__init__.py +2 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/macmorphoreader.py +7 -0
- nltkor/tag/libs/pos/pos_reader.py +97 -0
- nltkor/tag/libs/reader.py +485 -0
- nltkor/tag/libs/srl/__init__.py +3 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__srl_reader_.py +535 -0
- nltkor/tag/libs/srl/srl_reader.py +436 -0
- nltkor/tag/libs/srl/train_srl.py +87 -0
- nltkor/tag/libs/taggers.py +926 -0
- nltkor/tag/libs/utils.py +384 -0
- nltkor/tag/libs/word_dictionary.py +239 -0
- nltkor/tag/libs/wsd/__init__.py +2 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
- nltkor/tag/libs/wsd/wsd_reader.py +93 -0
- nltkor/tokenize/__init__.py +62 -0
- nltkor/tokenize/ko_tokenize.py +115 -0
- nltkor/trans.py +121 -0
- nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
- nltkor-1.2.14.dist-info/METADATA +41 -0
- nltkor-1.2.14.dist-info/RECORD +127 -0
- nltkor-1.2.14.dist-info/WHEEL +5 -0
- nltkor-1.2.14.dist-info/top_level.txt +1 -0
@@ -0,0 +1,847 @@
import os
import sys
import argparse
import numpy as np
from typing import Callable, Iterable, List, Tuple, Union
from copy import deepcopy
import itertools
import time
from nltk.translate.bleu_score import *
from nltk.metrics import confusionmatrix
from collections import defaultdict
from nltk.util import ngrams, skipgrams
#from nltkor.tokenize.ko_tokenize import word_tokenize,sent_tokenize,syllable_tokenize
from nltkor.make_requirement import make_requirement
from nltkor.tokenize import Ko_tokenize
from nltkor.cider.cider import Cider
from nltkor.tag import EspressoTagger
from nltkor.sejong import ssem

# torch is imported inside try/except so that a missing installation produces an
# actionable message instead of a bare ImportError.
try:
    import torch
except ImportError:
    file_path = make_requirement(['torch'])
    raise Exception(f"""
    Need to install libraries, please pip install the libraries below
    \t pip install torch
    Or, use pip install requirement.txt
    \t pip install -r {file_path}
    """)


class DefaultMetric:

    def __init__(self, lang="ko"):
        self.lang = lang
        #if lang not in ["korean", "english"]:
        if lang not in ["ko", "en"]:
            raise Exception('lang parameter must be "ko" or "en"')
        self.tokenize = lambda ref: Ko_tokenize.word(ref, self.lang)
        self.skip_pos = ['EE']

    def accuracy_score(self, true, pred):

        mat = confusionmatrix.ConfusionMatrix(true, pred)

        conf = mat._confusion
        total = 0
        tp = 0

        for r, tmp in enumerate(conf):
            for v, n in enumerate(tmp):
                if r == v:
                    tp += n
                total += n

        return float(tp / total)

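    # Example: accuracy_score sums the diagonal of nltk's ConfusionMatrix, so
    # accuracy_score(['a', 'b', 'b'], ['a', 'b', 'a']) == 2/3.
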
    def accuracy_norm(self, model, tokenizer, input_text: str, candidates: list, label: int):
        reserved_memory = []
        inference_time = []
        tokenized_prompt = tokenizer(input_text, return_tensors='pt').input_ids
        total_candidate = []

        for ending in candidates:
            len_ending = len(ending)
            tokenized_ending = tokenizer(ending, return_tensors='pt').input_ids
            tokenized_ending = tokenized_ending[:, 1:]
            input_ids = torch.cat([tokenized_prompt, tokenized_ending], dim=-1).cuda()
            labels = input_ids.clone()
            labels[0, :tokenized_prompt.shape[1]] = -100
            start = time.time()
            with torch.no_grad():
                outputs = model(input_ids, labels=labels)
            inference_time.append(time.time() - start)
            reserved_memory.append(torch.cuda.memory_reserved() / (1024**2))
            total_logprobs = -outputs.loss.item() * tokenized_ending.shape[1]
            total_candidate.append(total_logprobs / len_ending)
        answer_idx = total_candidate.index(max(total_candidate))
        if int(label) == answer_idx:
            cor = 1
        else:
            cor = 0
        metric_dict = {
            "reserved_memory": reserved_memory,
            "inference_time": inference_time
        }
        return cor, metric_dict

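    # accuracy_norm scores each candidate ending by the model's total log-likelihood
    # of the ending tokens (outputs.loss is the mean NLL, so -loss * n_ending_tokens
    # recovers the total), normalised by the raw length of the candidate string, and
    # counts the example as correct when the argmax matches `label`. It requires a
    # CUDA device. Hedged usage sketch, assuming a Hugging Face causal LM (the model
    # names below are illustrative, not part of this package):
    #   model = AutoModelForCausalLM.from_pretrained("skt/kogpt2-base-v2").cuda()
    #   tokenizer = AutoTokenizer.from_pretrained("skt/kogpt2-base-v2")
    #   cor, stats = DefaultMetric().accuracy_norm(
    #       model, tokenizer, "질문 ...", ["보기 1", "보기 2", "보기 3"], label=0)
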
    def recall_score(self, true, pred, avg='micro'):

        mat = confusionmatrix.ConfusionMatrix(true, pred)

        conf = mat._confusion
        indices = mat._indices
        values = mat._values
        total = 0

        if len(values) == 2:
            tp = 0
            fn = 0
            for r, i in enumerate(conf):
                for r2, v in enumerate(i):
                    if r == 0:
                        continue
                    elif r2 == 0:
                        fn = v
                    elif r == 1:
                        tp = v

            return float(tp / (tp + fn))

        c_tp = []
        c_fn = []
        recall_cls = []

        for r, tmp in enumerate(conf):
            temp = 0
            for v, n in enumerate(tmp):
                if r == v:
                    c_tp.append(n)
                else:
                    temp += n
            c_fn.append(temp)

        if avg == 'macro':

            for tmp in range(len(values)):
                try:
                    recall_cls.append(float(c_tp[tmp] / (c_tp[tmp] + c_fn[tmp])))
                except:
                    recall_cls.append(0)

            temp = 0

            for tmp in recall_cls:
                temp += tmp

            return float(temp / len(recall_cls))

        if avg == 'micro':
            ja = 0
            mo = 0

            for tmp in range(len(values)):
                ja += c_tp[tmp]
                mo += c_tp[tmp] + c_fn[tmp]

            return float(ja / mo)

        else:
            return "avg expects 'micro' or 'macro'"


    def precision_score(self, true, pred, avg='micro'):

        mat = confusionmatrix.ConfusionMatrix(true, pred)

        conf = mat._confusion
        values = mat._values

        total = 0

        if len(values) == 2:
            tp = 0
            fp = 0
            for r, i in enumerate(conf):
                for r2, v in enumerate(i):
                    if r2 == 0:
                        continue
                    elif r == 0:
                        fp = v
                    elif r == 1:
                        tp = v

            return float(tp / (tp + fp))

        c_tp = list()
        c_fp = [0 for _ in range(len(values))]
        recall_cls = []

        for r, tmp in enumerate(conf):
            for v, n in enumerate(tmp):
                if r == v:  # tp
                    c_tp.append(n)
                else:
                    c_fp[v] += n

        if avg == 'macro':
            for tmp in range(len(values)):
                try:
                    recall_cls.append(float(c_tp[tmp] / (c_tp[tmp] + c_fp[tmp])))
                except:
                    recall_cls.append(0)

            temp = 0

            for tmp in recall_cls:
                temp += tmp

            return float(temp / len(recall_cls))

        elif avg == 'micro':
            ja = 0
            mo = 0

            for tmp in range(len(values)):
                ja += c_tp[tmp]
                mo += c_tp[tmp] + c_fp[tmp]

            return float(ja / mo)

        else:
            return "avg expects 'micro' or 'macro'"

    def f1_score(self, true, pred, avg='micro'):

        if avg == 'micro' or avg == 'macro':

            precision = self.precision_score(true, pred, avg)
            recall = self.recall_score(true, pred, avg)
        else:
            return "avg expects 'micro' or 'macro'"

        return ((precision * recall) / (precision + recall)) * 2

    def pos_eval(self, fin):

        #temp=os.getcwd()+'/'+fin
        file = open(fin, 'r').read()
        sents = file.split("\n\n")

        acc = defaultdict(float)
        t_avg = defaultdict(float)

        for sent in sents:
            lines = sent.split('\n')
            for line in lines:
                tot = line.split('\t')

                if line == '': continue

                wd = tot[0]
                gold = tot[1]
                pred = tot[2]

                acc['all'] += 1
                gold_list = gold.split('+')
                pred_list = pred.split('+')

                t_avg["pr_all"] += len(pred_list)
                t_avg["rc_all"] += len(gold_list)

                if gold == pred:
                    acc["true"] += 1
                    t_avg['pr'] += len(pred_list)
                    t_avg['rc'] += len(gold_list)
                    continue
                else:
                    intersect = 0
                    for g in gold_list:
                        if not g in pred_list: continue
                        intersect += 1
                    t_avg['pr'] += intersect
                    t_avg['rc'] += intersect

        t_avg['pr_result'] = t_avg['pr'] / t_avg['pr_all']
        t_avg['rc_result'] = t_avg['rc'] / t_avg['rc_all']

        return float(acc['true'] / acc['all']), t_avg['pr_result'], t_avg['rc_result'], self.f1(t_avg['pr_result'], t_avg['rc_result'])

    def f1(self, p, r):
        return 2 * p * r / (p + r) if p + r else 0

    def precision_at_k(self, true: List[int], pred: List[int], k: int) -> float:
        """
        Precision@k: the fraction of the top-k predictions that appear in `true`.
        """

        relevant = 0

        if k > len(pred):
            raise ValueError("`k` is bigger than pred's length")

        pred = pred[:k]

        for t in true:
            if t in pred:
                relevant += 1

        return float(relevant / len(pred))

    def recall_at_k(self, true: List[int], pred: List[int], k: int) -> float:

        relevant = 0

        if k > len(pred):
            raise ValueError("`k` is bigger than pred's length")

        pred = pred[:k]

        for t in true:
            if t in pred:
                relevant += 1

        return float(relevant / len(true))

    def hit_rate_at_k(self, user: List[List[int]], pred: List[List[int]], k: int) -> float:
        hit = 0

        for u_list, p_list in zip(user, pred):
            if k > len(p_list):
                raise ValueError("`k` is bigger than pred's length")
            p_list = p_list[:k]
            for u in u_list:
                if u in p_list:
                    hit += 1
                    break

        return float(hit / len(user))

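    # Worked example (numbers from demo() below): with y_true=[1, 3, 6, 7, 1, 5]
    # and y_pred=[5, 2, 4, 1, 3, 2, 5, 6, 7], only the item 5 falls in the top-3
    # predictions [5, 2, 4], so precision_at_k(..., k=3) = 1/3 and
    # recall_at_k(..., k=3) = 1/6.
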
    def mean_absolute_error(self, true: Union[torch.Tensor, np.ndarray], pred: Union[torch.Tensor, np.ndarray]) -> float:
        # Element-wise mean absolute error; accepts torch tensors or numpy arrays.
        true, pred = np.asarray(true, dtype=float), np.asarray(pred, dtype=float)
        return float(np.mean(np.abs(true - pred)))

    def root_mean_square_error(self, true: Union[torch.Tensor, np.ndarray], pred: Union[torch.Tensor, np.ndarray]) -> float:
        # Square root of the element-wise mean squared error.
        true, pred = np.asarray(true, dtype=float), np.asarray(pred, dtype=float)
        return float(np.sqrt(np.mean((true - pred) ** 2)))

    def _W_CER(self, r, h):

        costs = [[0 for inner in range(len(h) + 1)] for outer in range(len(r) + 1)]

        DEL_PENALTY = 1  # Tact
        INS_PENALTY = 1  # Tact
        SUB_PENALTY = 1  # Tact

        for i in range(1, len(r) + 1):
            costs[i][0] = DEL_PENALTY * i

        for j in range(1, len(h) + 1):
            costs[0][j] = INS_PENALTY * j

        # computation
        for i in range(1, len(r) + 1):
            for j in range(1, len(h) + 1):
                if r[i - 1] == h[j - 1]:
                    costs[i][j] = costs[i - 1][j - 1]
                else:
                    substitutionCost = costs[i - 1][j - 1] + SUB_PENALTY  # penalty is always 1
                    insertionCost = costs[i][j - 1] + INS_PENALTY  # penalty is always 1
                    deletionCost = costs[i - 1][j] + DEL_PENALTY  # penalty is always 1

                    costs[i][j] = min(substitutionCost, insertionCost, deletionCost)

        mo = len(r)
        i = len(r)
        j = len(h)

        result = (costs[i][j]) / mo

        if result > 1.0:
            return 1.0
        else:
            return result


    def wer(self, reference, candidate):
        r = Ko_tokenize.word(reference)
        h = Ko_tokenize.word(candidate)

        return self._W_CER(r, h)


    def cer(self, reference, candidate):
        r = Ko_tokenize.syllable(reference)
        h = Ko_tokenize.syllable(candidate)

        return self._W_CER(r, h)

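    # _W_CER is a plain Levenshtein distance (unit insert/delete/substitute costs)
    # divided by the reference length and clipped at 1.0. For example, with
    # r=['a', 'b', 'c'] and h=['a', 'c'] the distance is 1 (one deletion), so the
    # error rate is 1/3. wer() applies this over word tokens, cer() over syllables.
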
    def bleu(self, reference, candidate, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=None):

        if type(candidate) != list or type(reference) != list:
            print("reference and candidate parameters expect list type")
            return

        reference = list(map(self.tokenize, reference))
        candidate = Ko_tokenize.word(candidate)

        return sentence_bleu(reference, candidate, weights, smoothing_function=smoothing_function)

    def bleu_n(self, reference, candidate, n=1, smoothing_function=None):

        if n == 1:
            return self.bleu(reference, candidate, (1, 0, 0, 0), smoothing_function=smoothing_function)
        elif n == 2:
            return self.bleu(reference, candidate, (0, 1, 0, 0), smoothing_function=smoothing_function)
        elif n == 3:
            return self.bleu(reference, candidate, (0, 0, 1, 0), smoothing_function=smoothing_function)
        elif n == 4:
            return self.bleu(reference, candidate, (0, 0, 0, 1), smoothing_function=smoothing_function)

    def _hyp_sent_split_remove(self, can):

        can_sent = [[tmp.rstrip('.?!,\n')] for tmp in Ko_tokenize.sentence(can)]
        return can_sent

    def _ref_sent_split_remove(self, ref):

        ref_sent = [Ko_tokenize.sentence(tmp) for tmp in ref]
        ref_sent_c = []
        for tmp in ref_sent:
            ref_sent_in = []
            for tmp2 in tmp:
                ref_sent_in.append(Ko_tokenize.word(tmp2.rstrip('.?!,\n')))
            ref_sent_c.append(ref_sent_in)

        return ref_sent_c

    def _token(self, ref_stoken, can, n):

        numer = []
        ref_len = 0

        can = list(ngrams(can, n))

        for tmp in ref_stoken:

            if n == 1:
                ref = list(ngrams(tmp, 1))
            elif n == 2:
                ref = list(ngrams(tmp, 2))
            elif n == 3:
                ref = list(ngrams(tmp, 3))
            else:
                return 0

            intersect = [t for t in ref if t in can]
            numer.append(len(intersect))
            ref_len += len(ref)

        try:
            rec = sum(numer) / ref_len
        except:
            rec = 0

        return rec


    def rouge_n(self, ref, can, n=1):

        beta = 1
        rec, prec = 0, 0

        can_sent = self._hyp_sent_split_remove(can)
        can_word = list(itertools.chain(*[Ko_tokenize.word(tmp, self.lang) for tmp in can_sent]))
        ref = self._ref_sent_split_remove(ref)

        r_list = []

        for tmp in ref:
            if n == 1:
                r_list.append(self._token(tmp, can_word, 1))
            elif n == 2:
                r_list.append(self._token(tmp, can_word, 2))
            elif n == 3:
                r_list.append(self._token(tmp, can_word, 3))

        return max(r_list)


    def rouge_l(self, ref, can):

        beta = 1
        #check=0

        can = self._hyp_sent_split_remove(can)
        can = [Ko_tokenize.word(tmp, self.lang) for tmp in can]
        refs = self._ref_sent_split_remove(ref)

        can_word = list(itertools.chain(*can))

        result_list = []

        for ref in refs:
            lcs_list = []
            for ri in ref:
                ri_C = []
                for ci in can:
                    temp = self._lcs(ci, ri)
                    ri_C.append(temp)

                ri_C = list(itertools.chain(*ri_C))
                ri_C = set(ri_C)
                lcs_list.append(len(ri_C))

            ref_word = list(itertools.chain(*ref))

            R_lcs = sum(lcs_list) / len(ref_word)
            P_lcs = sum(lcs_list) / len(can_word)

            try:
                F_lcs = (2 * R_lcs * P_lcs) / (R_lcs + P_lcs)
            except:
                F_lcs = 0
            result_list.append(F_lcs)

        return max(result_list)

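    # rouge_l above is the standard LCS-based F-measure: with R_lcs = |LCS| / len(ref)
    # and P_lcs = |LCS| / len(candidate), it reports F_lcs = 2 * R_lcs * P_lcs / (R_lcs + P_lcs),
    # taking the best score over the provided references.
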
    def _lcs(self, can, ref):

        s1 = can
        s2 = ref
        check = 0

        if len(s1) <= len(s2):
            temp = s1
            s1 = s2
            s2 = temp
            check = 1

        m = [[0] * (1 + len(s2)) for i in range(1 + len(s1))]

        for x in range(1, 1 + len(s1)):
            for y in range(1, 1 + len(s2)):
                if s1[x - 1] == s2[y - 1]:
                    m[x][y] = m[x - 1][y - 1] + 1
                else:
                    m[x][y] = max(m[x][y - 1], m[x - 1][y])
        f_x = len(s2) + 1
        lcs = m[len(s1)][len(s2)]
        temp = []

        i = len(s1)
        j = len(s2)

        while m[i][j] != 0:
            if m[i][j] == m[i][j - 1]:
                j -= 1
            elif m[i][j] == m[i - 1][j]:
                i -= 1
            else:
                if check == 0:
                    temp.append(s1[i - 1])
                if check == 1:
                    temp.append(s2[j - 1])
                i -= 1
                j -= 1

        return temp
        '''
        for y in reversed(range(1,1+len(s1))):
            for x in reversed(range(1,1+len(s2))):
                if (m[y][x]-m[y-1][x-1]==1) and (m[y][x]-m[y-1][x]==1) and (m[y][x]-m[y][x-1]==1):
                    if (y==len(s1)+1) and (x==len(s2)):
                        temp.append(x)
                    else:
                        temp.append(x-1)

        print('only "the police" should be extracted here', temp)
        if check==0:
            word=s1
        elif check==1:
            word=s2

        ret_list=[]

        for tmp in range(len(temp)):
            ret_list.append(word[temp[tmp]])

        return ret_list
        '''

    def _skip_bigrams(self, ref_stoken, can_sent, can, n=1):

        beta = 1
        numer = []
        ref_len = 0

        candidate = list(skipgrams(can, 2, n))
        can_sent = [Ko_tokenize.word(tmp, self.lang) for tmp in can_sent]
        can_sk_len = 0

        for tmp in ref_stoken:
            ref = list(skipgrams(tmp, 2, n))
            intersect = [t for t in ref if t in candidate]
            numer.append(len(intersect))
            ref_len += len(ref)

        for tmp in can_sent:
            can_sk_len += len(list(skipgrams(tmp, 2, n)))

        prec = sum(numer) / can_sk_len
        rec = sum(numer) / ref_len

        if prec != 0 and rec != 0:
            score = ((1 + beta**2) * prec * rec) / float(rec + beta**2 * prec)
        else:
            score = 0.0
        return score


    def rouge_s(self, ref, can, n):

        can_sent = self._hyp_sent_split_remove(can)
        can_word = list(itertools.chain(*[Ko_tokenize.word(tmp, self.lang) for tmp in can_sent]))
        ref = self._ref_sent_split_remove(ref)

        r_list = []

        for tmp in ref:
            #tmp=list(itertools.chain(*tmp))
            r_list.append(self._skip_bigrams(tmp, can_sent, can_word, n))

        return max(r_list)


    def cider(self, ref, hyp):

        ref_dict = dict()
        hyp_dict = dict()

        ref_dict[0] = ref
        hyp_dict[0] = hyp

        cider_score = Cider()
        score = cider_score.compute_score(ref_dict, hyp_dict)

        return float(score)

    def _process_espresso_output_format(self, result_list):
        temp_list = []
        for k in result_list:
            #k = k.split('_')
            k = list(k)
            if k[1] == 'SP' or k[1] == 'SY':
                continue
            temp_list.append(k)
        return temp_list

    def _generate_enum(self, ref, hyp):
        result_hyp = []
        result_ref = []
        for h in hyp:
            enum_hyp_list = list(enumerate(h))
            result_hyp.append(enum_hyp_list)
        for r in ref:
            enum_ref_list = list(enumerate(r))
            result_ref.append(enum_ref_list)
        return result_hyp, result_ref

    def _tag_pos_meteor(self, sent_list):
        result_list = list()
        for sent in sent_list:
            tagged_sent = EspressoTagger(task='pos').tag(sent)
            tagged_sent = self._process_espresso_output_format(tagged_sent)
            result_list.append(tagged_sent)
        return result_list

    def _match_enums(self,
                     enum_hypothesis_list: List[Tuple[int, str]],
                     enum_reference_list: List[Tuple[int, str]],
                     ) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]:
        """
        matches exact words in hypothesis and reference and returns
        a word mapping between enum_hypothesis_list and enum_reference_list
        based on the enumerated word id.

        :param enum_hypothesis_list: enumerated hypothesis list
        :param enum_reference_list: enumerated reference list
        :return: enumerated matched tuples, enumerated unmatched hypothesis tuples,
                 enumerated unmatched reference tuples
        """
        word_match = []
        # print("test 213" , enum_hypothesis_list)
        # print("test 124" , enum_reference_list)
        for i in range(len(enum_hypothesis_list))[::-1]:
            for j in range(len(enum_reference_list))[::-1]:
                # print(f"\n \t {enum_hypothesis_list[i][1]} \t {enum_reference_list[j][1]}")
                if enum_hypothesis_list[i][1] == enum_reference_list[j][1]:

                    # print("Check!!")
                    word_match.append(
                        (enum_hypothesis_list[i][0], enum_reference_list[j][0])
                    )
                    enum_hypothesis_list.pop(i)
                    enum_reference_list.pop(j)
                    break
        return word_match, enum_hypothesis_list, enum_reference_list


    def _count_chunks(self, matches: List[Tuple[int, int]]) -> int:
        """
        Counts the fewest possible number of chunks such that matched unigrams
        of each chunk are adjacent to each other. This is used to calculate the
        fragmentation part of the metric.

        :param matches: list containing a mapping of matched words (output of align_words)
        :return: Number of chunks a sentence is divided into post alignment
        """
        i = 0
        chunks = 1
        while i < len(matches) - 1:
            if (matches[i + 1][0] == matches[i][0] + 1) and (
                matches[i + 1][1] == matches[i][1] + 1
            ):
                i += 1
                continue
            i += 1
            chunks += 1
        return chunks

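    # Illustrative example: for matches [(0, 0), (1, 1), (3, 5)] the first two
    # pairs are adjacent in both hypothesis and reference, so they form one chunk
    # and (3, 5) forms another; _count_chunks returns 2.
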
    def _match_syn_with_sejong(self, hyp_list, ref_list):
        syn_match = []
        for i in range(len(hyp_list))[::-1]:
            temp_syn_list = []
            #print("test 344434: ", hyp_list[i])
            if hyp_list[i][1][1] not in self.skip_pos:
                entrys = ssem.entrys(hyp_list[i][1][0])
                for entry in entrys:
                    for sense in entry.senses():
                        if sense.syn():
                            temp_syn_list.append(sense.syn())
            if temp_syn_list:
                hyp_list[i][1].append(deepcopy(temp_syn_list))

            for j in range(len(ref_list))[::-1]:
                is_break = False
                if len(hyp_list[i][1]) == 3:
                    for syn in hyp_list[i][1][2]:

                        if syn[0] == ref_list[j][1][0]:
                            syn_match.append(
                                (hyp_list[i][0], ref_list[j][0])
                            )
                            is_break = True
                            hyp_list.pop(i)
                            ref_list.pop(j)
                            break
                else:
                    if hyp_list[i][1] == ref_list[j][1]:
                        syn_match.append(
                            (hyp_list[i][0], ref_list[j][0])
                        )
                        is_break = True
                        hyp_list.pop(i)
                        ref_list.pop(j)
                if is_break:
                    break

            # print("test 231232 ", hyp_list[i])

        return syn_match, hyp_list, ref_list

    def meteor(self, ref, hyp):
        ref_tag = self._tag_pos_meteor(ref)
        hyp_tag = self._tag_pos_meteor(hyp)
        meteors = []
        alpha = 0.9
        beta = 3.0
        gamma = 0.5
        enum_hyp, enum_ref = self._generate_enum(ref_tag, hyp_tag)
        # print("test 13333 ", enum_hyp)
        for reference in enum_ref:
            hyp_len = len(enum_hyp[0])
            ref_len = len(reference)

            # word / stem matching
            word_match, enum_hyp_list, enum_ref_list = self._match_enums(deepcopy(enum_hyp[0]), reference)
            syn_match, enum_hyp_list, enum_ref_list = self._match_syn_with_sejong(enum_hyp_list, enum_ref_list)
            # print("test 123344 " ,enum_ref_list) ## [(0, ['오늘', 'NN']), (6, ['이', 'VB']), (7, ['었다', 'EE'])]

            final_match = sorted(word_match + syn_match)

            # compute the final score
            final_match_count = len(final_match)

            precision = float(final_match_count) / hyp_len
            recall = float(final_match_count) / ref_len
            fmean = (precision * recall) / (alpha * precision + (1 - alpha) * recall)
            chunk_count = float(self._count_chunks(final_match))
            frag = 0.0
            if final_match_count != 0:
                frag = chunk_count / final_match_count
            else:
                frag = 0.0
            penalty = gamma * frag ** beta
            meteors.append((1 - penalty) * fmean)

        # print(word_match)

        return max(meteors)

def demo():
    y_pred = [5, 2, 4, 1, 3, 2, 5, 6, 7]
    y_true = [1, 3, 6, 7, 1, 5]

    user = [[5, 3, 2], [9, 1, 2], [3, 5, 6], [7, 2, 1]]
    h_pred = [[15, 6, 21, 3], [15, 77, 23, 14], [51, 23, 21, 2], [53, 2, 1, 5]]

    metric = DefaultMetric()
    print(metric.precision_at_k(y_true, y_pred, 3))
    print(metric.recall_at_k(y_true, y_pred, 3))
    print(metric.hit_rate_at_k(user, h_pred, 2))

    hyp = '봉준호 감독이 아카데미에서 국제영화상을 수상했다.'
    ref = ['봉준호가 아카데미에서 각본상을 탔다.']
    re = metric.meteor(ref, hyp)
    print(re)

if __name__ == "__main__":
    demo()
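# Hedged usage sketch (illustrative only, assuming the nltkor Ko_tokenize
# resources are available; not part of demo() above):
#   metric = DefaultMetric(lang="ko")
#   metric.rouge_n(['reference sentence one.'], 'candidate sentence.', n=1)
#   metric.rouge_l(['reference sentence one.'], 'candidate sentence.')
#   metric.wer('reference sentence', 'candidate sentence')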