nltkor-1.2.14-cp311-cp311-macosx_13_0_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nltkor/Kor_char.py +193 -0
- nltkor/__init__.py +16 -0
- nltkor/alignment/__init__.py +1315 -0
- nltkor/cider/__init__.py +2 -0
- nltkor/cider/cider.py +55 -0
- nltkor/cider/cider_scorer.py +207 -0
- nltkor/distance/__init__.py +441 -0
- nltkor/distance/wasserstein.py +126 -0
- nltkor/etc.py +22 -0
- nltkor/lazyimport.py +144 -0
- nltkor/make_requirement.py +11 -0
- nltkor/metrics/__init__.py +63 -0
- nltkor/metrics/bartscore.py +301 -0
- nltkor/metrics/bertscore.py +331 -0
- nltkor/metrics/bleu_tensor.py +20 -0
- nltkor/metrics/classical.py +847 -0
- nltkor/metrics/entment.py +24 -0
- nltkor/metrics/eval.py +517 -0
- nltkor/metrics/mauve.py +273 -0
- nltkor/metrics/mauve_utils.py +131 -0
- nltkor/misc/__init__.py +11 -0
- nltkor/misc/string2string_basic_functions.py +59 -0
- nltkor/misc/string2string_default_tokenizer.py +83 -0
- nltkor/misc/string2string_hash_functions.py +159 -0
- nltkor/misc/string2string_word_embeddings.py +503 -0
- nltkor/search/__init__.py +10 -0
- nltkor/search/classical.py +569 -0
- nltkor/search/faiss_search.py +787 -0
- nltkor/search/kobert_tokenizer.py +181 -0
- nltkor/sejong/__init__.py +3 -0
- nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
- nltkor/sejong/ch.py +12 -0
- nltkor/sejong/dict_semClassNum.txt +491 -0
- nltkor/sejong/layer.txt +630 -0
- nltkor/sejong/sejong_download.py +87 -0
- nltkor/sejong/ssem.py +684 -0
- nltkor/similarity/__init__.py +3 -0
- nltkor/similarity/bartscore____.py +337 -0
- nltkor/similarity/bertscore____.py +339 -0
- nltkor/similarity/classical.py +245 -0
- nltkor/similarity/cosine_similarity.py +175 -0
- nltkor/tag/__init__.py +71 -0
- nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
- nltkor/tag/espresso_tag.py +220 -0
- nltkor/tag/libs/__init__.py +10 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
- nltkor/tag/libs/arguments.py +280 -0
- nltkor/tag/libs/attributes.py +231 -0
- nltkor/tag/libs/config.py +159 -0
- nltkor/tag/libs/metadata.py +129 -0
- nltkor/tag/libs/ner/__init__.py +2 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/macmorphoreader.py +7 -0
- nltkor/tag/libs/ner/ner_reader.py +92 -0
- nltkor/tag/libs/network.c +72325 -0
- nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
- nltkor/tag/libs/network.pyx +878 -0
- nltkor/tag/libs/networkconv.pyx +1028 -0
- nltkor/tag/libs/networkdependencyconv.pyx +451 -0
- nltkor/tag/libs/parse/__init__.py +1 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/parse_reader.py +283 -0
- nltkor/tag/libs/pos/__init__.py +2 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/macmorphoreader.py +7 -0
- nltkor/tag/libs/pos/pos_reader.py +97 -0
- nltkor/tag/libs/reader.py +485 -0
- nltkor/tag/libs/srl/__init__.py +3 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__srl_reader_.py +535 -0
- nltkor/tag/libs/srl/srl_reader.py +436 -0
- nltkor/tag/libs/srl/train_srl.py +87 -0
- nltkor/tag/libs/taggers.py +926 -0
- nltkor/tag/libs/utils.py +384 -0
- nltkor/tag/libs/word_dictionary.py +239 -0
- nltkor/tag/libs/wsd/__init__.py +2 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
- nltkor/tag/libs/wsd/wsd_reader.py +93 -0
- nltkor/tokenize/__init__.py +62 -0
- nltkor/tokenize/ko_tokenize.py +115 -0
- nltkor/trans.py +121 -0
- nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
- nltkor-1.2.14.dist-info/METADATA +41 -0
- nltkor-1.2.14.dist-info/RECORD +127 -0
- nltkor-1.2.14.dist-info/WHEEL +5 -0
- nltkor-1.2.14.dist-info/top_level.txt +1 -0
nltkor/cider/__init__.py
ADDED
nltkor/cider/cider.py
ADDED
@@ -0,0 +1,55 @@
#-*-coding:utf8-*-
# Filename: cider.py
#
# Description: Describes the class to compute the CIDEr (Consensus-Based Image Description Evaluation) Metric
#              by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726)
#
# Creation Date: Sun Feb 8 14:16:54 2015
#
# Authors: Ramakrishna Vedantam <vrama91@vt.edu> and Tsung-Yi Lin <tl483@cornell.edu>

from nltkor.cider.cider_scorer import CiderScorer

class Cider:
    """
    Main Class to compute the CIDEr metric

    """
    def __init__(self, test=None, refs=None, n=4, sigma=6.0):
        # set cider to sum over 1 to 4-grams
        self._n = n
        # set the standard deviation parameter for gaussian penalty
        self._sigma = sigma

    def compute_score(self, gts, res):
        """
        Main function to compute CIDEr score
        :param gts (dict) : dictionary with key <image id> and value <list of tokenized reference sentences>
               res (dict) : dictionary with key <image id> and value <single-element list holding the tokenized hypothesis / candidate sentence>
        :return: cider (float) : computed CIDEr score for the corpus
        """
        '''
        Example inputs:
        gts = {0: ['안 녕 하 세 요 홍성태입니다.', '안 녕 해 요 홍성태입니다']}
        res = {0: ['a b ']}
        '''
        assert(gts.keys() == res.keys())
        imgIds = gts.keys()

        cider_scorer = CiderScorer(n=self._n, sigma=self._sigma)

        for id in imgIds:

            hypo = res[id]
            ref = gts[id]

            # Sanity check.
            assert(type(hypo) is list)
            assert(len(hypo) == 1)
            assert(type(ref) is list)
            assert(len(ref) > 0)

            cider_scorer += (hypo[0], ref)

        score = cider_scorer.compute_score()

        return format(score, ".7f")
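For orientation, here is a minimal usage sketch of the Cider class above. It is illustrative only and not part of the packaged files: the segment ids and whitespace-tokenized sentences are made up, and it assumes the import path shown in this diff.

    from nltkor.cider.cider import Cider

    # References: each segment id maps to a list of one or more tokenized sentences.
    gts = {0: ['the cat sat on the mat', 'a cat was sitting on the mat'],
           1: ['a dog ran away']}
    # Candidates: each id maps to a single-element list (the sanity checks above enforce this).
    res = {0: ['the cat sat on a mat'],
           1: ['the dog ran']}

    scorer = Cider(n=4, sigma=6.0)
    print(scorer.compute_score(gts, res))  # corpus-level CIDEr, formatted to 7 decimal places

Note that compute_score returns a string (because of the final format(score, ".7f") call), not the float suggested by its docstring.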
nltkor/cider/cider_scorer.py
ADDED
@@ -0,0 +1,207 @@
#!/usr/bin/env python
#-*-coding:utf8-*-
# Tsung-Yi Lin <tl483@cornell.edu>
# Ramakrishna Vedantam <vrama91@vt.edu>

import copy
from collections import defaultdict
import math
import statistics

def precook(s, n=4, out=False):
    """
    Takes a string as input and returns an object that can be given to
    either cook_refs or cook_test. This is optional: cook_refs and cook_test
    can take string arguments as well.
    :param s: string : sentence to be converted into ngrams
    :param n: int : number of ngrams for which representation is calculated
    :return: term frequency vector for occurring ngrams
    """
    words = s.split()
    counts = defaultdict(int)
    for k in range(1, n+1):
        for i in range(len(words)-k+1):
            ngram = tuple(words[i:i+k])
            counts[ngram] += 1
    return counts

def cook_refs(refs, n=4): ## lhuang: oracle will call with "average"
    '''Takes a list of reference sentences for a single segment
    and returns an object that encapsulates everything that BLEU
    needs to know about them.
    :param refs: list of string : reference sentences for some image
    :param n: int : number of ngrams for which (ngram) representation is calculated
    :return: result (list of dict)
    '''
    return [precook(ref, n) for ref in refs]

def cook_test(test, n=4):
    '''Takes a test sentence and returns an object that
    encapsulates everything that BLEU needs to know about it.
    :param test: list of string : hypothesis sentence for some image
    :param n: int : number of ngrams for which (ngram) representation is calculated
    :return: result (dict)
    '''
    return precook(test, n, True)

class CiderScorer(object):
    """CIDEr scorer.
    """

    def copy(self):
        ''' copy the refs.'''
        new = CiderScorer(n=self.n)
        new.ctest = copy.copy(self.ctest)
        new.crefs = copy.copy(self.crefs)
        return new

    def __init__(self, test=None, refs=None, n=4, sigma=6.0):
        ''' singular instance '''

        self.n = n
        self.sigma = sigma

        # from here ~
        self.crefs = []
        self.ctest = []
        self.document_frequency = defaultdict(float)
        self.cook_append(test, refs)
        self.ref_len = None

    def cook_append(self, test, refs):
        '''called by constructor and __iadd__ to avoid creating new instances.'''

        if refs is not None:
            self.crefs.append(cook_refs(refs))
            if test is not None:
                self.ctest.append(cook_test(test)) ## N.B.: -1
            else:
                self.ctest.append(None) # lens of crefs and ctest have to match

    def size(self):
        assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest))
        return len(self.crefs)

    def __iadd__(self, other):
        '''add an instance (e.g., from another sentence).'''

        if type(other) is tuple:
            ## avoid creating new CiderScorer instances
            self.cook_append(other[0], other[1])
        else:
            self.ctest.extend(other.ctest)
            self.crefs.extend(other.crefs)

        return self

    def compute_doc_freq(self):
        '''
        Compute document frequency for reference data.
        This will be used to compute idf (inverse document frequency) later.
        The document frequency is stored in the object.
        :return: None
        '''
        for refs in self.crefs:
            # refs, k ref captions of one image
            for ngram in set([ngram for ref in refs for (ngram, count) in ref.items()]):
                self.document_frequency[ngram] += 1
            # maxcounts[ngram] = max(maxcounts.get(ngram,0), count)

    def compute_cider(self):
        def counts2vec(cnts):
            """
            Function maps counts of ngram to vector of tfidf weights.
            The function returns vec, an array of dictionary that store mapping of n-gram and tf-idf weights.
            The n-th entry of array denotes length of n-grams.
            :param cnts:
            :return: vec (array of dict), norm (array of float), length (int)
            """
            vec = [defaultdict(float) for _ in range(self.n)]
            length = 0
            norm = [0.0 for _ in range(self.n)]
            for (ngram, term_freq) in cnts.items():
                # give word count 1 if it doesn't appear in reference corpus
                df = math.log(max(1.0, self.document_frequency[ngram]))
                # ngram index
                n = len(ngram)-1
                # tf (term_freq) * idf (precomputed idf) for n-grams
                vec[n][ngram] = float(term_freq)*(self.ref_len - df)

                if vec[n][ngram] == 0:
                    vec[n][ngram] = 1

                # compute norm for the vector. the norm will be used for computing similarity
                norm[n] += pow(vec[n][ngram], 2)

                if n == 1:
                    length += term_freq

            norm = [math.sqrt(n) for n in norm]
            return vec, norm, length

        def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref):
            '''
            Compute the cosine similarity of two vectors.
            :param vec_hyp: array of dictionary for vector corresponding to hypothesis
            :param vec_ref: array of dictionary for vector corresponding to reference
            :param norm_hyp: array of float for vector corresponding to hypothesis
            :param norm_ref: array of float for vector corresponding to reference
            :param length_hyp: int containing length of hypothesis
            :param length_ref: int containing length of reference
            :return: array of score for each n-grams cosine similarity
            '''
            delta = float(length_hyp - length_ref)
            # measure cosine similarity
            val = [0.0 for _ in range(self.n)]
            for n in range(self.n):
                # ngram
                for (ngram, count) in vec_hyp[n].items():
                    # vrama91 : added clipping
                    val[n] += min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram]

                if (norm_hyp[n] != 0) and (norm_ref[n] != 0):
                    val[n] /= (norm_hyp[n]*norm_ref[n])

                assert(not math.isnan(val[n]))
                # vrama91: added a length based gaussian penalty
                val[n] *= math.exp((-(delta**2))/(2*self.sigma**2))
            return val

        # compute log reference length
        self.ref_len = math.log(float(len(self.crefs)))

        scores = []
        for test, refs in zip(self.ctest, self.crefs):
            # compute vector for test captions

            vec, norm, length = counts2vec(test)
            # compute vector for ref captions
            score = [0.0 for _ in range(self.n)]
            for ref in refs:
                vec_ref, norm_ref, length_ref = counts2vec(ref)
                ret_list = sim(vec, vec_ref, norm, norm_ref, length, length_ref)
                for num, val in zip(range(self.n), ret_list):
                    score[num] += val
                #score += sim(vec, vec_ref, norm, norm_ref, length, length_ref)
            # change by vrama91 - mean of ngram scores, instead of sum
            score_avg = statistics.mean(score)
            # divide by number of references
            score_avg /= len(refs)
            # multiply score by 10
            #score_avg *= 10.0
            # append score of an image to the score list
            scores.append(score_avg)
        return scores

    def compute_score(self, option=None, verbose=0):
        # compute idf
        self.compute_doc_freq()
        # assert to check document frequency
        assert(len(self.ctest) >= max(self.document_frequency.values()))
        # compute cider score
        score = self.compute_cider()
        # debug
        # print score
        return statistics.mean(score)
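To make the scorer's accumulation protocol concrete, a small sketch (again illustrative, with made-up sentences, assuming the import path from this diff): precook builds per-sentence n-gram counts, and __iadd__ accepts (hypothesis, references) tuples so segments can be appended one at a time before a single corpus-level compute_score call.

    from nltkor.cider.cider_scorer import CiderScorer, precook

    # precook counts every 1..n-gram of a whitespace-tokenized sentence.
    print(precook('a b a', n=2))
    # -> {('a',): 2, ('b',): 1, ('a', 'b'): 1, ('b', 'a'): 1} (as a defaultdict)

    scorer = CiderScorer(n=4, sigma=6.0)
    scorer += ('the cat sat', ['the cat sat', 'a cat was sitting'])  # segment 0
    scorer += ('the dog ran', ['the dog ran away'])                  # segment 1
    print(scorer.size())           # 2 (hypothesis and reference counts must match)
    print(scorer.compute_score())  # mean CIDEr over the two segments, as a float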