nltkor-1.2.14-cp311-cp311-macosx_13_0_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. nltkor/Kor_char.py +193 -0
  2. nltkor/__init__.py +16 -0
  3. nltkor/alignment/__init__.py +1315 -0
  4. nltkor/cider/__init__.py +2 -0
  5. nltkor/cider/cider.py +55 -0
  6. nltkor/cider/cider_scorer.py +207 -0
  7. nltkor/distance/__init__.py +441 -0
  8. nltkor/distance/wasserstein.py +126 -0
  9. nltkor/etc.py +22 -0
  10. nltkor/lazyimport.py +144 -0
  11. nltkor/make_requirement.py +11 -0
  12. nltkor/metrics/__init__.py +63 -0
  13. nltkor/metrics/bartscore.py +301 -0
  14. nltkor/metrics/bertscore.py +331 -0
  15. nltkor/metrics/bleu_tensor.py +20 -0
  16. nltkor/metrics/classical.py +847 -0
  17. nltkor/metrics/entment.py +24 -0
  18. nltkor/metrics/eval.py +517 -0
  19. nltkor/metrics/mauve.py +273 -0
  20. nltkor/metrics/mauve_utils.py +131 -0
  21. nltkor/misc/__init__.py +11 -0
  22. nltkor/misc/string2string_basic_functions.py +59 -0
  23. nltkor/misc/string2string_default_tokenizer.py +83 -0
  24. nltkor/misc/string2string_hash_functions.py +159 -0
  25. nltkor/misc/string2string_word_embeddings.py +503 -0
  26. nltkor/search/__init__.py +10 -0
  27. nltkor/search/classical.py +569 -0
  28. nltkor/search/faiss_search.py +787 -0
  29. nltkor/search/kobert_tokenizer.py +181 -0
  30. nltkor/sejong/__init__.py +3 -0
  31. nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
  32. nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
  33. nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
  34. nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
  35. nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
  36. nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
  37. nltkor/sejong/ch.py +12 -0
  38. nltkor/sejong/dict_semClassNum.txt +491 -0
  39. nltkor/sejong/layer.txt +630 -0
  40. nltkor/sejong/sejong_download.py +87 -0
  41. nltkor/sejong/ssem.py +684 -0
  42. nltkor/similarity/__init__.py +3 -0
  43. nltkor/similarity/bartscore____.py +337 -0
  44. nltkor/similarity/bertscore____.py +339 -0
  45. nltkor/similarity/classical.py +245 -0
  46. nltkor/similarity/cosine_similarity.py +175 -0
  47. nltkor/tag/__init__.py +71 -0
  48. nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
  49. nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
  50. nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
  51. nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
  52. nltkor/tag/espresso_tag.py +220 -0
  53. nltkor/tag/libs/__init__.py +10 -0
  54. nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
  55. nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
  56. nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
  57. nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
  58. nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
  59. nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
  60. nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
  61. nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
  62. nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
  63. nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
  64. nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
  65. nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
  66. nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
  67. nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
  68. nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
  69. nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
  70. nltkor/tag/libs/arguments.py +280 -0
  71. nltkor/tag/libs/attributes.py +231 -0
  72. nltkor/tag/libs/config.py +159 -0
  73. nltkor/tag/libs/metadata.py +129 -0
  74. nltkor/tag/libs/ner/__init__.py +2 -0
  75. nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
  76. nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
  77. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
  78. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
  79. nltkor/tag/libs/ner/macmorphoreader.py +7 -0
  80. nltkor/tag/libs/ner/ner_reader.py +92 -0
  81. nltkor/tag/libs/network.c +72325 -0
  82. nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
  83. nltkor/tag/libs/network.pyx +878 -0
  84. nltkor/tag/libs/networkconv.pyx +1028 -0
  85. nltkor/tag/libs/networkdependencyconv.pyx +451 -0
  86. nltkor/tag/libs/parse/__init__.py +1 -0
  87. nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
  88. nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
  89. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
  90. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
  91. nltkor/tag/libs/parse/parse_reader.py +283 -0
  92. nltkor/tag/libs/pos/__init__.py +2 -0
  93. nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
  94. nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
  95. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
  96. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
  97. nltkor/tag/libs/pos/macmorphoreader.py +7 -0
  98. nltkor/tag/libs/pos/pos_reader.py +97 -0
  99. nltkor/tag/libs/reader.py +485 -0
  100. nltkor/tag/libs/srl/__init__.py +3 -0
  101. nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
  102. nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
  103. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
  104. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
  105. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
  106. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
  107. nltkor/tag/libs/srl/__srl_reader_.py +535 -0
  108. nltkor/tag/libs/srl/srl_reader.py +436 -0
  109. nltkor/tag/libs/srl/train_srl.py +87 -0
  110. nltkor/tag/libs/taggers.py +926 -0
  111. nltkor/tag/libs/utils.py +384 -0
  112. nltkor/tag/libs/word_dictionary.py +239 -0
  113. nltkor/tag/libs/wsd/__init__.py +2 -0
  114. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
  115. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
  116. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
  117. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
  118. nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
  119. nltkor/tag/libs/wsd/wsd_reader.py +93 -0
  120. nltkor/tokenize/__init__.py +62 -0
  121. nltkor/tokenize/ko_tokenize.py +115 -0
  122. nltkor/trans.py +121 -0
  123. nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
  124. nltkor-1.2.14.dist-info/METADATA +41 -0
  125. nltkor-1.2.14.dist-info/RECORD +127 -0
  126. nltkor-1.2.14.dist-info/WHEEL +5 -0
  127. nltkor-1.2.14.dist-info/top_level.txt +1 -0
nltkor/tag/libs/parse/parse_reader.py
@@ -0,0 +1,283 @@
+ # -*- coding: utf-8 -*-
+
+ '''
+ Class for dealing with dependency parsing data.
+ '''
+
+ import os
+ import logging
+ import numpy as np
+
+ from .. import attributes
+ from .. import reader
+ from ..word_dictionary import WordDictionary
+
+ class ConllPos(object):
+     '''
+     Dummy class to store field positions in a CoNLL-like file
+     for dependency parsing. NB: the positions are different from
+     those used in SRL!
+     '''
+     id = 0
+     word = 1      # eojeol (space-delimited word unit)
+     morph_h = 2   # first morpheme, excluding any prefix
+     pos_h = 3     # POS tag of the first morpheme, excluding any prefix
+     morph_t = 4   # last morpheme
+     pos_t = 5     # POS tag of the last morpheme
+     dep_head = 6  # dependency head
+     dep_rel = 7   # dependency relation (syntactic tag)
+     SEP = '\t'
+
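For orientation, one input line for this reader is eight tab-separated fields in the ConllPos order above. A hypothetical example (the Korean analysis is invented for illustration):

    1	먹었다	먹	VV	다	EF	0	VP

A dep_head of 0 marks the root; as _read_conll below shows, root attachments are re-encoded as self-loops before the 1-based token ids are shifted to 0-based.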
+ class DependencyReader(reader.TaggerReader):
+     '''
+     Class to read dependency files in CoNLL-X format.
+     '''
+
+     def __init__(self, md=None, filename=None, labeled=False):
+         '''
+         Constructor.
+         :param md: Metadata object containing the description for this reader
+         :param filename: file containing data to be read and used in training
+             or tagging
+         :param labeled: (ignored if md is supplied) whether it is intended
+             to be used in labeled dependency parsing. Note that if it is
+             True, another reader object will be needed for unlabeled dependency.
+         '''
+         if md is not None:
+             self.labeled = md.task.startswith('labeled')
+         else:
+             self.labeled = labeled
+
+         if filename is not None:
+             self._read_conll(filename)
+
+         if self.labeled:
+             self.taskname = 'labeled_dependency'
+         else:
+             self.taskname = 'unlabeled_dependency'
+
+         self.rare_tag = None
+         self.pos_dict = None
+         super(DependencyReader, self).__init__(md)
+
+     @property
+     def task(self):
+         """
+         Abstract Base Class (ABC) attribute.
+         """
+         return self.taskname
+
+     def _read_conll(self, filename):
+         '''
+         Read data from a CoNLL-formatted file.
+         '''
+         lines = []
+         self.sentences = []
+         self.heads = []
+
+         # this keeps track of the tokens
+         sentence = []
+
+         # this has the number of each token's head, in the same order as
+         # the tokens appear
+         sentence_heads = []
+         if self.labeled:
+             self.labels = []
+             sentence_labels = []
+
+         with open(filename, 'r') as f:
+             for line in f:
+                 line = line.strip()
+                 lines.append(line)
+
+         for line in lines:
+             if line == '':
+                 # empty line, last sentence is finished
+                 if len(sentence) > 0:
+                     self.sentences.append(sentence)
+                     self.heads.append(np.array(sentence_heads))
+
+                     if self.labeled:
+                         self.labels.append(sentence_labels)
+                         sentence_labels = []
+
+                     sentence = []
+                     sentence_heads = []
+
+                 continue
+
+             fields = line.split(ConllPos.SEP)
+             word = fields[ConllPos.word]
+             morph_h = fields[ConllPos.morph_h]
+             pos_h = fields[ConllPos.pos_h]
+             morph_t = fields[ConllPos.morph_t]
+             pos_t = fields[ConllPos.pos_t]
+             head = int(fields[ConllPos.dep_head])
+             label = fields[ConllPos.dep_rel]
+
+             if head == 0:
+                 # we represent a dependency on the root as an edge to the token itself
+                 head = int(fields[ConllPos.id])
+
+             # -1 because tokens are numbered from 1
+             head -= 1
+
+             token = attributes.Token(word, pos_h=pos_h, morph_t=morph_t, pos_t=pos_t)
+             #token = attributes.Token(word, morph_h=morph_h, pos_h=pos_h, morph_t=morph_t, pos_t=pos_t)
+             sentence.append(token)
+             sentence_heads.append(head)
+             if self.labeled:
+                 sentence_labels.append(label)
+
+         # in case there was not an empty line after the last sentence
+         if len(sentence) > 0:
+             self.sentences.append(sentence)
+             self.heads.append(np.array(sentence_heads))
+             if self.labeled:
+                 self.labels.append(sentence_labels)
+
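To make that head re-encoding concrete, here is a minimal standalone sketch (not part of the package) of the same transformation: a root attachment (head 0) becomes a self-loop, and all indices shift from 1-based to 0-based.

    def encode_heads(ids, heads):
        # head == 0 (root) becomes a self-loop; then shift from 1-based to 0-based
        return [(i if h == 0 else h) - 1 for i, h in zip(ids, heads)]

    # tokens 1..3 with CoNLL heads [2, 0, 2] are stored as [1, 1, 1]
    assert encode_heads([1, 2, 3], [2, 0, 2]) == [1, 1, 1]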
+     def _create_pos_dict(self):
+         """
+         Examine all POS tags in the sentences and create a dictionary based on them.
+         """
+         logger = logging.getLogger("Logger")
+         logger.info('Creating new POS tag dictionary (for dependency parsing)')
+         tags = {token.pos_h for sent in self.sentences for token in sent} | \
+                {token.pos_t for sent in self.sentences for token in sent}
+         pos_dict = {tag: code for code, tag in enumerate(tags)}
+
+         code = max(pos_dict.values()) + 1
+         pos_dict[attributes.PADDING_POS] = code
+
+         return pos_dict
+
+     def load_pos_dict(self):
+         """
+         Load the POS tag dictionary (specific to dependency parsing)
+         from its default location.
+         """
+         logger = logging.getLogger("Logger")
+         logger.debug('Loading POS tag dictionary (for dependency parsing)')
+         pos_dict = reader.load_tag_dict(self.md.paths['dependency_pos_tags'])
+         return pos_dict
+
+     def load_tag_dict(self, filename=None):
+         """
+         Verify whether this reader is for the unlabeled dependency task. If so,
+         it doesn't use a tag dictionary and the call is ignored.
+         """
+         if not self.labeled:
+             return
+
+         super(DependencyReader, self).load_tag_dict(filename)
+
+     def load_or_create_tag_dict(self):
+         """
+         Try to load the tag dictionary from the default location. If the dictionary
+         file is not available, scan the available sentences and create a new one.
+
+         It is only needed in labeled dependency parsing.
+         """
+         if not self.labeled:
+             return
+
+         logger = logging.getLogger('Logger')
+         filename = self.md.paths['dependency_tag_dict']
+         if os.path.isfile(filename):
+             self.load_tag_dict(filename)
+             logger.debug('Loaded dependency tag dictionary')
+             return
+
+         tags = {tag for sent_labels in self.labels for tag in sent_labels}
+         self.tag_dict = {tag: code for code, tag in enumerate(tags)}
+
+         reader.save_tag_dict(filename, self.tag_dict)
+         logger.debug('Saved dependency tag dictionary')
+
+     def generate_dictionary(self, dict_size=None, minimum_occurrences=2):
+         """
+         Generates a token dictionary based on the given sentences.
+
+         :param dict_size: Max number of tokens to be included in the dictionary.
+         :param minimum_occurrences: Minimum number of times that a token must
+             appear in the text in order to be included in the dictionary.
+         """
+         logger = logging.getLogger("Logger")
+         #all_tokens = [token.morph_t for sent in self.sentences for token in sent]
+         all_tokens = [token.morph_h for sent in self.sentences for token in sent] \
+                      + [token.morph_t for sent in self.sentences for token in sent]
+         self.word_dict = WordDictionary(all_tokens, dict_size, minimum_occurrences)
+         logger.info("Created dictionary with %d tokens" % self.word_dict.num_tokens)
+
+     def codify_sentences(self):
+         """
+         Converts each token in each sequence into indices to their feature vectors
+         in feature matrices. The previous sentences as text are not accessible anymore.
+         Tags are left as the index of each token's head.
+         """
+         if self.converter is None:
+             self.create_converter()
+
+         self.sentences = [np.array([self.converter.convert(token) for token in sent])
+                           for sent in self.sentences]
+
+         if self.labeled:
+             self.labels = [np.array([self.tag_dict[label] for label in sent_labels])
+                            for sent_labels in self.labels]
+
+         self.codified = True
+
+     def _load_or_create_pos_dict(self):
+         """
+         Try to load the POS tag dictionary to be used with this reader (when
+         using POS tags as additional features). If there isn't a file in the
+         data directory with the right name, a new dictionary is created
+         after examining the data.
+         """
+         if self.pos_dict is not None:
+             return
+
+         if os.path.isfile(self.md.paths['dependency_pos_tags']):
+             self.pos_dict = self.load_pos_dict()
+         else:
+             self.pos_dict = self._create_pos_dict()
+             self.save_tag_dict(self.md.paths['dependency_pos_tags'], self.pos_dict)
+
+     def get_num_pos_tags(self):
+         """
+         Return the number of POS tags that can be used as an additional feature
+         by this reader.
+         """
+         self._load_or_create_pos_dict()
+         return len(self.pos_dict)
+
+     def create_converter(self):
+         """
+         This function overrides the TextReader's one in order to deal with Token
+         objects instead of raw strings. It also allows POS as an attribute.
+         The extractor order here must match the order in utils.create_feature_table.
+         """
+         #f = lambda token: self.word_dict[token.morph_h]
+         self.converter = attributes.TokenConverter()
+         #self.converter.add_extractor(f)
+
+         f = lambda token: self.word_dict[token.morph_t]
+         self.converter.add_extractor(f)
+
+         #if self.md.use_caps:
+         #    caps_lookup = lambda t: attributes.get_capitalization(t.word)
+         #    self.converter.add_extractor(caps_lookup)
+
+         if self.md.use_pos:
+             self._load_or_create_pos_dict()
+             g = lambda token: self.pos_dict[token.pos_h]
+             self.converter.add_extractor(g)
+             g = lambda token: self.pos_dict[token.pos_t]
+             self.converter.add_extractor(g)
+
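A hedged usage sketch for the reader above (file name and data are invented, and it assumes the base TaggerReader tolerates md=None):

    from nltkor.tag.libs.parse.parse_reader import DependencyReader

    reader = DependencyReader(filename='corpus.conll', labeled=True)
    print(len(reader.sentences))  # number of sentences read
    print(reader.heads[0])        # 0-based head index per token; root = self-loop
    print(reader.labels[0])       # dependency relation per token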
nltkor/tag/libs/pos/__init__.py
@@ -0,0 +1,2 @@
+
+ from .pos_reader import POSReader
nltkor/tag/libs/pos/macmorphoreader.py
@@ -0,0 +1,7 @@
+
+ import warnings
+
+ # backwards compatibility
+ from .pos_reader import *
+
+ warnings.warn('Module macmorphoreader is deprecated. Use module pos_reader instead.')
nltkor/tag/libs/pos/pos_reader.py
@@ -0,0 +1,97 @@
+ # -*- coding: utf-8 -*-
+
+ """
+ Class for dealing with POS data.
+ """
+
+ import chardet
+ from ..reader import TaggerReader
+
+ class ConllPos(object):
+     """
+     Dummy class for storing column positions in a CoNLL file.
+     """
+     id = 0
+     word = 1
+     pos = 2
+     SEP = '\t'
+
+ class POSReader(TaggerReader):
+     """
+     This class reads data from a POS corpus and turns it into a format
+     readable by the neural network for the POS tagging task.
+     """
+
+     def __init__(self, md=None, filename=None, load_dictionaries=True):
+         """
+         Constructor
+         """
+         self.rare_tag = None
+         self.sentences = []
+         if filename is not None:
+             try:
+                 self._read_plain(filename)
+             except Exception:
+                 # fall back to CoNLL format when the plain format fails to parse
+                 self._read_conll(filename)
+
+         super(POSReader, self).__init__(md, load_dictionaries=load_dictionaries)
+
+     @property
+     def task(self):
+         """
+         Abstract Base Class (ABC) attribute.
+         """
+         return 'pos'
+
+     def _read_plain(self, filename):
+         """
+         Read data from a "plain" file, with one sentence per line, each token
+         as token_tag.
+         """
+         self.sentences = []
+         with open(filename, 'rb') as f:
+             raw_data = f.read(1024)
+             # chardet may report None; fall back to UTF-8 in that case
+             detected = chardet.detect(raw_data).get('encoding') or 'utf-8'
+         with open(filename, 'rt', encoding=detected) as f:
+             for line in f:
+                 # strip the newline so the last token's tag stays clean
+                 line = line.strip()
+                 if line == '':
+                     continue
+                 items = line.split(ConllPos.SEP)
+                 sentence = []
+                 for item in items:
+                     token, tag = item.rsplit('_', 1)
+                     sentence.append((token, tag))
+
+                 self.sentences.append(sentence)
+
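For orientation, a hypothetical line in this "plain" format, with token_tag items separated by tabs (ConllPos.SEP):

    나는_NP	학교에_NNG	간다_VV

Because the code uses rsplit('_', 1), underscores inside a token survive; only the text after the last underscore becomes the tag.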
+     def _read_conll(self, filename):
+         """
+         Read data from a CoNLL-formatted file. Given the column positions in
+         ConllPos above, it expects at least three columns: id, surface word,
+         and the POS tag.
+         """
+         self.sentences = []
+         sentence = []
+         with open(filename, 'rb') as f:
+             raw_data = f.read(1024)
+             # chardet may report None; fall back to UTF-8 in that case
+             detected = chardet.detect(raw_data).get('encoding') or 'utf-8'
+         with open(filename, 'rt', encoding=detected) as f:
+             for line in f:
+                 line = line.strip()
+                 if line == '':
+                     if len(sentence) > 0:
+                         self.sentences.append(sentence)
+                         sentence = []
+                     continue
+
+                 fields = line.split(ConllPos.SEP)
+                 word = fields[ConllPos.word]
+                 pos = fields[ConllPos.pos]
+                 sentence.append((word, pos))
+
+         if len(sentence) > 0:
+             self.sentences.append(sentence)
+
+
+ # backwards compatibility
+ MacMorphoReader = POSReader
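A hedged usage sketch for POSReader (file name and contents invented; assumes the base TaggerReader accepts md=None when load_dictionaries=False):

    from nltkor.tag.libs.pos.pos_reader import POSReader

    reader = POSReader(filename='tagged.txt', load_dictionaries=False)
    for sentence in reader.sentences:
        print(sentence)  # list of (token, tag) pairs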