nltkor-1.2.14-cp311-cp311-macosx_13_0_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. nltkor/Kor_char.py +193 -0
  2. nltkor/__init__.py +16 -0
  3. nltkor/alignment/__init__.py +1315 -0
  4. nltkor/cider/__init__.py +2 -0
  5. nltkor/cider/cider.py +55 -0
  6. nltkor/cider/cider_scorer.py +207 -0
  7. nltkor/distance/__init__.py +441 -0
  8. nltkor/distance/wasserstein.py +126 -0
  9. nltkor/etc.py +22 -0
  10. nltkor/lazyimport.py +144 -0
  11. nltkor/make_requirement.py +11 -0
  12. nltkor/metrics/__init__.py +63 -0
  13. nltkor/metrics/bartscore.py +301 -0
  14. nltkor/metrics/bertscore.py +331 -0
  15. nltkor/metrics/bleu_tensor.py +20 -0
  16. nltkor/metrics/classical.py +847 -0
  17. nltkor/metrics/entment.py +24 -0
  18. nltkor/metrics/eval.py +517 -0
  19. nltkor/metrics/mauve.py +273 -0
  20. nltkor/metrics/mauve_utils.py +131 -0
  21. nltkor/misc/__init__.py +11 -0
  22. nltkor/misc/string2string_basic_functions.py +59 -0
  23. nltkor/misc/string2string_default_tokenizer.py +83 -0
  24. nltkor/misc/string2string_hash_functions.py +159 -0
  25. nltkor/misc/string2string_word_embeddings.py +503 -0
  26. nltkor/search/__init__.py +10 -0
  27. nltkor/search/classical.py +569 -0
  28. nltkor/search/faiss_search.py +787 -0
  29. nltkor/search/kobert_tokenizer.py +181 -0
  30. nltkor/sejong/__init__.py +3 -0
  31. nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
  32. nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
  33. nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
  34. nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
  35. nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
  36. nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
  37. nltkor/sejong/ch.py +12 -0
  38. nltkor/sejong/dict_semClassNum.txt +491 -0
  39. nltkor/sejong/layer.txt +630 -0
  40. nltkor/sejong/sejong_download.py +87 -0
  41. nltkor/sejong/ssem.py +684 -0
  42. nltkor/similarity/__init__.py +3 -0
  43. nltkor/similarity/bartscore____.py +337 -0
  44. nltkor/similarity/bertscore____.py +339 -0
  45. nltkor/similarity/classical.py +245 -0
  46. nltkor/similarity/cosine_similarity.py +175 -0
  47. nltkor/tag/__init__.py +71 -0
  48. nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
  49. nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
  50. nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
  51. nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
  52. nltkor/tag/espresso_tag.py +220 -0
  53. nltkor/tag/libs/__init__.py +10 -0
  54. nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
  55. nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
  56. nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
  57. nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
  58. nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
  59. nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
  60. nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
  61. nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
  62. nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
  63. nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
  64. nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
  65. nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
  66. nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
  67. nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
  68. nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
  69. nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
  70. nltkor/tag/libs/arguments.py +280 -0
  71. nltkor/tag/libs/attributes.py +231 -0
  72. nltkor/tag/libs/config.py +159 -0
  73. nltkor/tag/libs/metadata.py +129 -0
  74. nltkor/tag/libs/ner/__init__.py +2 -0
  75. nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
  76. nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
  77. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
  78. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
  79. nltkor/tag/libs/ner/macmorphoreader.py +7 -0
  80. nltkor/tag/libs/ner/ner_reader.py +92 -0
  81. nltkor/tag/libs/network.c +72325 -0
  82. nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
  83. nltkor/tag/libs/network.pyx +878 -0
  84. nltkor/tag/libs/networkconv.pyx +1028 -0
  85. nltkor/tag/libs/networkdependencyconv.pyx +451 -0
  86. nltkor/tag/libs/parse/__init__.py +1 -0
  87. nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
  88. nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
  89. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
  90. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
  91. nltkor/tag/libs/parse/parse_reader.py +283 -0
  92. nltkor/tag/libs/pos/__init__.py +2 -0
  93. nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
  94. nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
  95. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
  96. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
  97. nltkor/tag/libs/pos/macmorphoreader.py +7 -0
  98. nltkor/tag/libs/pos/pos_reader.py +97 -0
  99. nltkor/tag/libs/reader.py +485 -0
  100. nltkor/tag/libs/srl/__init__.py +3 -0
  101. nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
  102. nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
  103. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
  104. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
  105. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
  106. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
  107. nltkor/tag/libs/srl/__srl_reader_.py +535 -0
  108. nltkor/tag/libs/srl/srl_reader.py +436 -0
  109. nltkor/tag/libs/srl/train_srl.py +87 -0
  110. nltkor/tag/libs/taggers.py +926 -0
  111. nltkor/tag/libs/utils.py +384 -0
  112. nltkor/tag/libs/word_dictionary.py +239 -0
  113. nltkor/tag/libs/wsd/__init__.py +2 -0
  114. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
  115. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
  116. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
  117. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
  118. nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
  119. nltkor/tag/libs/wsd/wsd_reader.py +93 -0
  120. nltkor/tokenize/__init__.py +62 -0
  121. nltkor/tokenize/ko_tokenize.py +115 -0
  122. nltkor/trans.py +121 -0
  123. nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
  124. nltkor-1.2.14.dist-info/METADATA +41 -0
  125. nltkor-1.2.14.dist-info/RECORD +127 -0
  126. nltkor-1.2.14.dist-info/WHEEL +5 -0
  127. nltkor-1.2.14.dist-info/top_level.txt +1 -0
nltkor/cider/__init__.py ADDED
@@ -0,0 +1,2 @@
+
+ __all__ = ['Cider']
nltkor/cider/cider.py ADDED
@@ -0,0 +1,55 @@
+ # -*- coding: utf-8 -*-
+ # Filename: cider.py
+ #
+ # Description: Defines the class that computes the CIDEr
+ # (Consensus-Based Image Description Evaluation) metric
+ # by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726)
+ #
+ # Creation Date: Sun Feb 8 14:16:54 2015
+ #
+ # Authors: Ramakrishna Vedantam <vrama91@vt.edu> and Tsung-Yi Lin <tl483@cornell.edu>
+
+ from nltkor.cider.cider_scorer import CiderScorer
+
+ class Cider:
+     """
+     Main class to compute the CIDEr metric.
+     """
+     def __init__(self, test=None, refs=None, n=4, sigma=6.0):
+         # sum over 1- to 4-grams by default
+         self._n = n
+         # standard deviation parameter for the Gaussian length penalty
+         self._sigma = sigma
+
+     def compute_score(self, gts, res):
+         """
+         Main function to compute the CIDEr score.
+         :param gts (dict): maps each id to a list of tokenized reference sentences
+         :param res (dict): maps each id to a one-element list holding the tokenized
+                            candidate (hypothesis) sentence
+         :return: cider (str): corpus-level CIDEr score, formatted to 7 decimal places
+         """
+         '''
+         Example inputs:
+         gts = {0: ['안 녕 하 세 요 홍성태입니다.', '안 녕 해 요 홍성태입니다']}
+         res = {0: ['a b ']}
+         '''
+         assert gts.keys() == res.keys()
+         imgIds = gts.keys()
+
+         cider_scorer = CiderScorer(n=self._n, sigma=self._sigma)
+
+         for id in imgIds:
+             hypo = res[id]
+             ref = gts[id]
+
+             # Sanity check: exactly one hypothesis and at least one reference per id.
+             assert type(hypo) is list
+             assert len(hypo) == 1
+             assert type(ref) is list
+             assert len(ref) > 0
+
+             cider_scorer += (hypo[0], ref)
+
+         score = cider_scorer.compute_score()
+
+         return format(score, ".7f")
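For orientation, a minimal usage sketch of the Cider class above, assuming whitespace-tokenized input strings. The Korean reference strings come from the inline example in compute_score; the id 0 and the candidate value are illustrative, not from the package:

from nltkor.cider.cider import Cider

# One id, two tokenized references, and a single tokenized candidate.
gts = {0: ['안 녕 하 세 요 홍성태입니다.', '안 녕 해 요 홍성태입니다']}
res = {0: ['안 녕 하 세 요 홍성태입니다.']}  # illustrative candidate

cider = Cider(n=4, sigma=6.0)
print(cider.compute_score(gts, res))  # corpus-level CIDEr as a 7-decimal string

Note that compute_score returns a formatted string rather than a float, so callers who need a number must convert it back with float().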
nltkor/cider/cider_scorer.py ADDED
@@ -0,0 +1,207 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+ # Tsung-Yi Lin <tl483@cornell.edu>
+ # Ramakrishna Vedantam <vrama91@vt.edu>
+
+ import copy
+ from collections import defaultdict
+ import math
+ import statistics
+
+ def precook(s, n=4, out=False):
+     """
+     Takes a string as input and returns an object that can be given to
+     either cook_refs or cook_test. This is optional: cook_refs and cook_test
+     can take string arguments as well.
+     :param s: string : sentence to be converted into ngrams
+     :param n: int : number of ngrams for which representation is calculated
+     :return: term frequency vector for occurring ngrams
+     """
+     words = s.split()
+     counts = defaultdict(int)
+     for k in range(1, n + 1):
+         for i in range(len(words) - k + 1):
+             ngram = tuple(words[i:i + k])
+             counts[ngram] += 1
+     return counts
+
+ def cook_refs(refs, n=4):  ## lhuang: oracle will call with "average"
+     '''Takes a list of reference sentences for a single segment
+     and returns an object that encapsulates everything that BLEU
+     needs to know about them.
+     :param refs: list of string : reference sentences for some image
+     :param n: int : number of ngrams for which (ngram) representation is calculated
+     :return: result (list of dict)
+     '''
+     return [precook(ref, n) for ref in refs]
+
+ def cook_test(test, n=4):
+     '''Takes a test sentence and returns an object that
+     encapsulates everything that BLEU needs to know about it.
+     :param test: string : hypothesis sentence for some image
+     :param n: int : number of ngrams for which (ngram) representation is calculated
+     :return: result (dict)
+     '''
+     return precook(test, n, True)
+
+ class CiderScorer(object):
+     """CIDEr scorer."""
+
+     def copy(self):
+         '''Copy the refs.'''
+         new = CiderScorer(n=self.n)
+         new.ctest = copy.copy(self.ctest)
+         new.crefs = copy.copy(self.crefs)
+         return new
+
+     def __init__(self, test=None, refs=None, n=4, sigma=6.0):
+         '''singular instance'''
+         self.n = n
+         self.sigma = sigma
+         self.crefs = []
+         self.ctest = []
+         self.document_frequency = defaultdict(float)
+         self.cook_append(test, refs)
+         self.ref_len = None
+
+     def cook_append(self, test, refs):
+         '''Called by the constructor and __iadd__ to avoid creating new instances.'''
+         if refs is not None:
+             self.crefs.append(cook_refs(refs))
+         if test is not None:
+             self.ctest.append(cook_test(test))  ## N.B.: -1
+         else:
+             self.ctest.append(None)  # lens of crefs and ctest have to match
+
+     def size(self):
+         assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest))
+         return len(self.crefs)
+
+     def __iadd__(self, other):
+         '''Add an instance (e.g., from another sentence).'''
+         if type(other) is tuple:
+             ## avoid creating new CiderScorer instances
+             self.cook_append(other[0], other[1])
+         else:
+             self.ctest.extend(other.ctest)
+             self.crefs.extend(other.crefs)
+         return self
+
+     def compute_doc_freq(self):
+         '''
+         Compute the document frequency of each n-gram over the reference data.
+         This will be used to compute idf (inverse document frequency) later.
+         The document frequency is stored in the object.
+         :return: None
+         '''
+         for refs in self.crefs:
+             # refs: k reference captions of one image
+             for ngram in set([ngram for ref in refs for (ngram, count) in ref.items()]):
+                 self.document_frequency[ngram] += 1
+             # maxcounts[ngram] = max(maxcounts.get(ngram,0), count)
+
+     def compute_cider(self):
+         def counts2vec(cnts):
+             """
+             Maps n-gram counts to a vector of tf-idf weights.
+             Returns vec, an array of dictionaries mapping n-grams to tf-idf weights;
+             the n-th entry of the array corresponds to (n+1)-grams.
+             :param cnts: dict mapping ngram tuples to term frequencies
+             :return: vec (array of dict), norm (array of float), length (int)
+             """
+             vec = [defaultdict(float) for _ in range(self.n)]
+             length = 0
+             norm = [0.0 for _ in range(self.n)]
+             for (ngram, term_freq) in cnts.items():
+                 # treat an ngram absent from the reference corpus as having document frequency 1
+                 df = math.log(max(1.0, self.document_frequency[ngram]))
+                 # ngram index
+                 n = len(ngram) - 1
+                 # tf (term_freq) * idf (precomputed idf) for n-grams
+                 vec[n][ngram] = float(term_freq) * (self.ref_len - df)
+
+                 if vec[n][ngram] == 0:
+                     vec[n][ngram] = 1
+
+                 # compute the norm for the vector; it is used when computing similarity
+                 norm[n] += pow(vec[n][ngram], 2)
+
+                 if n == 1:
+                     length += term_freq
+
+             norm = [math.sqrt(n) for n in norm]
+             return vec, norm, length
+
+         def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref):
+             '''
+             Compute the cosine similarity of two vectors.
+             :param vec_hyp: array of dictionaries for the hypothesis vector
+             :param vec_ref: array of dictionaries for the reference vector
+             :param norm_hyp: array of float norms for the hypothesis vector
+             :param norm_ref: array of float norms for the reference vector
+             :param length_hyp: int length of the hypothesis
+             :param length_ref: int length of the reference
+             :return: array of cosine similarity scores, one per n-gram order
+             '''
+             delta = float(length_hyp - length_ref)
+             # measure cosine similarity
+             val = [0.0 for _ in range(self.n)]
+             for n in range(self.n):
+                 # ngram
+                 for (ngram, count) in vec_hyp[n].items():
+                     # vrama91: added clipping
+                     val[n] += min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram]
+
+                 if (norm_hyp[n] != 0) and (norm_ref[n] != 0):
+                     val[n] /= (norm_hyp[n] * norm_ref[n])
+
+                 assert not math.isnan(val[n])
+                 # vrama91: added a length-based gaussian penalty
+                 val[n] *= math.exp((-(delta ** 2)) / (2 * self.sigma ** 2))
+             return val
+
+         # compute log reference length
+         self.ref_len = math.log(float(len(self.crefs)))
+
+         scores = []
+         for test, refs in zip(self.ctest, self.crefs):
+             # compute vector for test captions
+             vec, norm, length = counts2vec(test)
+             # compute vector for ref captions
+             score = [0.0 for _ in range(self.n)]
+             for ref in refs:
+                 vec_ref, norm_ref, length_ref = counts2vec(ref)
+                 ret_list = sim(vec, vec_ref, norm, norm_ref, length, length_ref)
+                 for num, val in zip(range(self.n), ret_list):
+                     score[num] += val
+                 # score += sim(vec, vec_ref, norm, norm_ref, length, length_ref)
+             # change by vrama91 - mean of ngram scores, instead of sum
+             score_avg = statistics.mean(score)
+             # divide by number of references
+             score_avg /= len(refs)
+             # multiply score by 10
+             # score_avg *= 10.0
+             # append score of an image to the score list
+             scores.append(score_avg)
+         return scores
+
+     def compute_score(self, option=None, verbose=0):
+         # compute idf
+         self.compute_doc_freq()
+         # assert to check document frequency
+         assert len(self.ctest) >= max(self.document_frequency.values())
+         # compute cider score
+         score = self.compute_cider()
+         # debug
+         # print(score)
+         return statistics.mean(score)
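And one level down, a short sketch driving CiderScorer directly under the same assumptions (whitespace-tokenized strings; the sentences and ids are illustrative). The `+=` accumulation mirrors what Cider.compute_score does per id:

from nltkor.cider.cider_scorer import CiderScorer, precook

# precook counts every 1..n-gram occurrence of a whitespace-tokenized sentence.
counts = precook('a b a', n=2)
# counts[('a',)] == 2, counts[('b',)] == 1,
# counts[('a', 'b')] == 1, counts[('b', 'a')] == 1

scorer = CiderScorer(n=4, sigma=6.0)
scorer += ('a b c', ['a b c', 'a b d'])  # (hypothesis, list of references)
scorer += ('e f', ['e f g'])             # one entry per segment/image
print(scorer.compute_score())            # float: mean of per-segment CIDEr scores

Unlike Cider.compute_score, CiderScorer.compute_score returns a plain float, and the idf statistics are computed over however many segments have been accumulated, so single-segment scores are degenerate (ref_len = log(1) = 0).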