nltkor-1.2.14-cp311-cp311-macosx_13_0_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. nltkor/Kor_char.py +193 -0
  2. nltkor/__init__.py +16 -0
  3. nltkor/alignment/__init__.py +1315 -0
  4. nltkor/cider/__init__.py +2 -0
  5. nltkor/cider/cider.py +55 -0
  6. nltkor/cider/cider_scorer.py +207 -0
  7. nltkor/distance/__init__.py +441 -0
  8. nltkor/distance/wasserstein.py +126 -0
  9. nltkor/etc.py +22 -0
  10. nltkor/lazyimport.py +144 -0
  11. nltkor/make_requirement.py +11 -0
  12. nltkor/metrics/__init__.py +63 -0
  13. nltkor/metrics/bartscore.py +301 -0
  14. nltkor/metrics/bertscore.py +331 -0
  15. nltkor/metrics/bleu_tensor.py +20 -0
  16. nltkor/metrics/classical.py +847 -0
  17. nltkor/metrics/entment.py +24 -0
  18. nltkor/metrics/eval.py +517 -0
  19. nltkor/metrics/mauve.py +273 -0
  20. nltkor/metrics/mauve_utils.py +131 -0
  21. nltkor/misc/__init__.py +11 -0
  22. nltkor/misc/string2string_basic_functions.py +59 -0
  23. nltkor/misc/string2string_default_tokenizer.py +83 -0
  24. nltkor/misc/string2string_hash_functions.py +159 -0
  25. nltkor/misc/string2string_word_embeddings.py +503 -0
  26. nltkor/search/__init__.py +10 -0
  27. nltkor/search/classical.py +569 -0
  28. nltkor/search/faiss_search.py +787 -0
  29. nltkor/search/kobert_tokenizer.py +181 -0
  30. nltkor/sejong/__init__.py +3 -0
  31. nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
  32. nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
  33. nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
  34. nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
  35. nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
  36. nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
  37. nltkor/sejong/ch.py +12 -0
  38. nltkor/sejong/dict_semClassNum.txt +491 -0
  39. nltkor/sejong/layer.txt +630 -0
  40. nltkor/sejong/sejong_download.py +87 -0
  41. nltkor/sejong/ssem.py +684 -0
  42. nltkor/similarity/__init__.py +3 -0
  43. nltkor/similarity/bartscore____.py +337 -0
  44. nltkor/similarity/bertscore____.py +339 -0
  45. nltkor/similarity/classical.py +245 -0
  46. nltkor/similarity/cosine_similarity.py +175 -0
  47. nltkor/tag/__init__.py +71 -0
  48. nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
  49. nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
  50. nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
  51. nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
  52. nltkor/tag/espresso_tag.py +220 -0
  53. nltkor/tag/libs/__init__.py +10 -0
  54. nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
  55. nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
  56. nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
  57. nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
  58. nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
  59. nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
  60. nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
  61. nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
  62. nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
  63. nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
  64. nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
  65. nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
  66. nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
  67. nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
  68. nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
  69. nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
  70. nltkor/tag/libs/arguments.py +280 -0
  71. nltkor/tag/libs/attributes.py +231 -0
  72. nltkor/tag/libs/config.py +159 -0
  73. nltkor/tag/libs/metadata.py +129 -0
  74. nltkor/tag/libs/ner/__init__.py +2 -0
  75. nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
  76. nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
  77. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
  78. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
  79. nltkor/tag/libs/ner/macmorphoreader.py +7 -0
  80. nltkor/tag/libs/ner/ner_reader.py +92 -0
  81. nltkor/tag/libs/network.c +72325 -0
  82. nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
  83. nltkor/tag/libs/network.pyx +878 -0
  84. nltkor/tag/libs/networkconv.pyx +1028 -0
  85. nltkor/tag/libs/networkdependencyconv.pyx +451 -0
  86. nltkor/tag/libs/parse/__init__.py +1 -0
  87. nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
  88. nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
  89. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
  90. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
  91. nltkor/tag/libs/parse/parse_reader.py +283 -0
  92. nltkor/tag/libs/pos/__init__.py +2 -0
  93. nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
  94. nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
  95. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
  96. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
  97. nltkor/tag/libs/pos/macmorphoreader.py +7 -0
  98. nltkor/tag/libs/pos/pos_reader.py +97 -0
  99. nltkor/tag/libs/reader.py +485 -0
  100. nltkor/tag/libs/srl/__init__.py +3 -0
  101. nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
  102. nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
  103. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
  104. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
  105. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
  106. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
  107. nltkor/tag/libs/srl/__srl_reader_.py +535 -0
  108. nltkor/tag/libs/srl/srl_reader.py +436 -0
  109. nltkor/tag/libs/srl/train_srl.py +87 -0
  110. nltkor/tag/libs/taggers.py +926 -0
  111. nltkor/tag/libs/utils.py +384 -0
  112. nltkor/tag/libs/word_dictionary.py +239 -0
  113. nltkor/tag/libs/wsd/__init__.py +2 -0
  114. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
  115. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
  116. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
  117. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
  118. nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
  119. nltkor/tag/libs/wsd/wsd_reader.py +93 -0
  120. nltkor/tokenize/__init__.py +62 -0
  121. nltkor/tokenize/ko_tokenize.py +115 -0
  122. nltkor/trans.py +121 -0
  123. nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
  124. nltkor-1.2.14.dist-info/METADATA +41 -0
  125. nltkor-1.2.14.dist-info/RECORD +127 -0
  126. nltkor-1.2.14.dist-info/WHEEL +5 -0
  127. nltkor-1.2.14.dist-info/top_level.txt +1 -0
nltkor/tag/libs/utils.py
@@ -0,0 +1,384 @@
+ # -*- coding: utf-8 -*-
+
+ """
+ Utility functions
+ """
+
+ import re
+ import os, sys
+ import logging
+ import nltk
+ import nltkor
+ import _pickle
+ import chardet
+ #from nltkor import Kor_char
+ from nltkor.tokenize import Ko_tokenize
+ from nltkor.tag.libs import config
+ import numpy as np
+
+ #from nltk.tokenize.regexp import RegexpTokenizer
+ #from nltk.tokenize import TreebankWordTokenizer
+ from . import attributes
+
+
+ def get_word_from_morph_lexicon(root, word, tags, space_flag):
+     '''
+     space_flag: if True, spaces are included in matching; otherwise they are not
+     '''
+
+     values = list()
+     value_data = list()
+     if not word: return root.keys()
+
+     current_dict = root
+     _end = '$$'
+     s = 0
+     for i, letter in enumerate(word):
+         #print(i, '>', letter, current_dict)
+         if letter in current_dict:
+             #print(letter, current_dict[letter])
+             current_dict = current_dict[letter]
+             if _end in current_dict :
+                 for idx in range(i-s): values.pop()
+                 values.append(word[s:i+1])
+                 for idx in range(i-s): value_data.pop()
+                 value_data.append(current_dict[_end])
+             else: values.append(letter); value_data.append(tags[i])
+         else:
+             #print('==', letter, values)
+             if space_flag or letter != ' ':
+                 values.append(letter) # longest match: -1
+                 value_data.append(tags[i])
+             s = i+1
+             current_dict = root
+     else:
+         if values: return values, value_data
+         else: return list(word), tags
+
+ def intersperse(lst, item):
+     result = [item] * (len(lst) * 2 - 1)
+     result[0::2] = lst
+     return result
+
+ def get_word(root, word, tags, space_flag=False) :
+     '''
+     space_flag : True : match even if the entry contains a space
+                  False: do not match if the entry contains a space
+     '''
+     word_list = get_word_from_morph_lexicon(root, word, tags, space_flag)
+     return word_list
+
+
+
+ def tokenize(text, use_sent_tokenizer=True):
+     """
+     Call the tokenizer function for the given language.
+     The returned tokens are in a list of lists, one for each sentence.
+
+     :param use_sent_tokenizer: True : use sentence tokenizer
+                                False : sentence per line
+     """
+     return tokenize_ko(text, use_sent_tokenizer)
+
+ def tokenize_ko(text, use_sent_tokenizer=True, clean=True):
+     """
+     text: string
+     Return a list of lists of the tokens in text, separated by sentences.
+     """
+     if clean:
+         text = clean_kotext(text)
+
+     if use_sent_tokenizer:
+         ## False: ignore word spacing, True: respect word spacing
+         sentences = [Ko_tokenize.syllable(sentence, True) for sentence in Ko_tokenize.sentence(text)]
+     else:
+         sentences = [Ko_tokenize.syllable(text, True)]
+
+     return sentences
+
+ def clean_kotext(text, correct=False):
+     """
+     1. Convert special whitespace characters to plain spaces.
+     Apply some transformations to the text, such as
+     replacing digits with 9 and simplifying quotation marks.
+
+     :param correct: If True, tries to correct punctuation misspellings.
+     """
+     # replaces different kinds of quotation marks with "
+     # take care not to remove apostrophes
+     '''
+     text = re.sub(r"(?u)(^|\W)[‘’′`']", r'\1"', text)
+     text = re.sub(r"(?u)[‘’`′'](\W|$)", r'"\1', text)
+     text = re.sub(r'(?u)[«»“”]', '"', text)
+
+     if correct:
+         # tries to fix mistyped tokens (common in Wikipedia-pt) as ,, '' ..
+         text = re.sub(r'(?<!\.)\.\.(?!\.)', '.', text) # take care with ellipses
+         text = re.sub(r'([,";:])\1,', r'\1', text)
+
+         # inserts space after leading hyphen. It happens sometimes in cases like:
+         # blablabla -that is, bloblobloblo
+         text = re.sub(' -(?=[^\W\d_])', ' - ', text)
+     '''
+
+     # normalizes special whitespace characters to plain spaces
+     text = re.sub(r'\xa0', ' ', text)
+     text = re.sub(u' ', ' ', text)
+     text = re.sub(u' ', ' ', text)
+     text = re.sub(u' +', ' ', text) # collapse consecutive spaces
+     # replaces numbers with 9's; keep this consistent with the lexicon side
+     #text = re.sub(r'\d', '9', text)
+     # replaces English characters with a's
+     #text = re.sub(r'[a-zA-Z]', 'a', text)
+     # replaces Chinese characters with 家's
+     #for x in re.findall(r'[\u4e00-\u9fff]', text):
+     #    text = re.sub(x, '家', text)
+
+     # replaces special ellipsis character
+     #text = text.replace(u'…', '...')
+
+     return text
+
+
+ def generate_feature_vectors(num_vectors, num_features, min_value=-0.1, max_value=0.1):
+     """
+     Generates vectors of real numbers, to be used as word features.
+     Vectors are initialized randomly. Returns a 2-dim numpy array.
+     """
+     logger = logging.getLogger("Logger")
+     #table = (max_value * 2) * np.random.random((num_vectors, num_vectors, num_features, num_features)) + min_value
+     table = (max_value * 2) * np.random.random((num_vectors, num_features)) + min_value
+     logger.debug("Generated %d feature vectors with %d features each." % (num_vectors, num_features))
+     print("Generated %d feature vectors with %d features each." % (num_vectors, num_features))
+
+     return table
+
+
+ def count_lines(filename):
+     """Counts and returns the number of non-empty lines in a file."""
+     with open(filename, 'r') as f:
+         lines = [x for x in list(f) if x.strip()]
+     return len(lines)
+
+ def _create_affix_tables(affix, table_list, num_features):
+     """
+     Internal helper function for loading suffix or prefix feature tables
+     into the given list.
+     affix should be either 'suffix' or 'prefix'.
+     """
+     logger = logging.getLogger('Logger')
+     logger.info('Generating %s features...' % affix)
+     tensor = []
+     codes = getattr(attributes.Affix, '%s_codes' % affix)
+     num_affixes_per_size = getattr(attributes.Affix, 'num_%ses_per_size' % affix)
+     for size in codes:
+
+         # use num_*_per_size because it accounts for special suffix codes
+         num_affixes = num_affixes_per_size[size]
+         table = generate_feature_vectors(num_affixes, num_features)
+         tensor.append(table)
+
+     # affix attribute actually has a 3-dim tensor
+     # (concatenation of 2d tables, one for each suffix size)
+     for table in tensor:
+         table_list.append(table)
+
+ def create_feature_tables(args, md, text_reader):
+     """
+     Create the feature tables to be used by the network. If the args object
+     has the load_types option set to true, the feature table for word types
+     is loaded instead of being created. The actual number of
+     feature tables will depend on the argument options.
+
+     :param args: Parameters supplied to the program
+     :param md: metadata about the network
+     :param text_reader: The TextReader being used.
+     :returns: all the feature tables to be used
+     """
+
+     logger = logging.getLogger("Logger")
+     feature_tables = []
+
+     if not args.load_types:
+         logger.info("Generating word type features...")
+         table_size = len(text_reader.word_dict)
+         types_table = generate_feature_vectors(table_size, args.num_features)
+     else:
+         logger.info("Loading word type features...")
+         # check if there is a word feature file specific for the task
+         # if not, load a generic one
+         filename = md.paths[md.type_features]
+         if os.path.exists(filename):
+             types_table = load_features_from_file(filename)
+         else:
+             filename = md.paths['type_features']
+             types_table = load_features_from_file(filename)
+
+         if len(types_table) < len(text_reader.word_dict):
+             # the type dictionary provided has more types than
+             # the number of feature vectors. So, let's generate
+             # feature vectors for the new types by replicating the vector
+             # associated with the RARE word
+             diff = len(text_reader.word_dict) - len(types_table)
+             logger.warning("Number of types in feature table and dictionary differ.")
+             logger.warning("Generating features for %d new types." % diff)
+             num_features = len(types_table[0])
+             new_vecs = generate_feature_vectors(diff, num_features)
+             types_table = np.append(types_table, new_vecs, axis=0)
+
+         elif len(types_table) > len(text_reader.word_dict):
+             logger.warning("Number of features provided is greater than the number of tokens\
+                             in the dictionary. The extra features will be ignored.")
+
+     feature_tables.append(types_table) # head
+     #print(md.task)
+     #if md.task in ['labeled_dependency', 'unlabeled_dependency']:
+     #    feature_tables.append(types_table) # tail
+
+     # Capitalization
+     if md.use_caps:
+         logger.info("Generating capitalization features...")
+         caps_table = generate_feature_vectors(attributes.Caps.num_values, args.caps)
+         feature_tables.append(caps_table)
+
+     # Prefixes
+     if md.use_prefix:
+         _create_affix_tables('prefix', feature_tables, args.prefix)
+
+     # Suffixes
+     if md.use_suffix:
+         _create_affix_tables('suffix', feature_tables, args.suffix)
+
+     # POS tags
+     if md.use_pos:
+         logger.info("Generating POS features...")
+         num_pos_tags = text_reader.get_num_pos_tags()
+         pos_table = generate_feature_vectors(num_pos_tags, args.pos)
+         #feature_tables.append(pos_table) # head # must stay consistent with the converter in *_reader
+         feature_tables.append(pos_table) # tail
+
+     # chunk tags
+     if md.use_chunk:
+         logger.info("Generating chunk features...")
+         num_chunk_tags = count_lines(md.paths['chunk_tags'])
+         chunk_table = generate_feature_vectors(num_chunk_tags, args.chunk)
+         feature_tables.append(chunk_table)
+
+     #print(len(feature_tables))
+     return feature_tables
+
+
+
+ def set_distance_features(max_dist=None,
+                           num_target_features=None, num_pred_features=None):
+     """
+     Returns the distance feature tables to be used by a convolutional network.
+     One table is for the relative distance to the target word, the other
+     for the distance to the predicate.
+
+     :param max_dist: maximum distance to be used in new vectors.
+     """
+     logger = logging.getLogger("Logger")
+
+     # max_dist before/after, 0 distance, and distances above the max
+     max_dist = 2 * (max_dist + 1) + 1
+     logger.info("Generating target word distance features...")
+     target_dist = generate_feature_vectors(max_dist, num_target_features)
+     logger.info("Generating predicate distance features...")
+     pred_dist = generate_feature_vectors(max_dist, num_pred_features)
+
+     return [target_dist, pred_dist]
+
+
+ def set_logger(level):
+     """Sets the logger to be used throughout the system."""
+     log_format = '%(message)s'
+     logging.basicConfig(format=log_format)
+     logger = logging.getLogger("Logger")
+     logger.setLevel(level)
+
+ def load_features_from_file(features_file):
+     """Reads a file with features written as binary data."""
+     return np.load(features_file)
+
+ def save_features_to_file(table, features_file):
+     """Saves a feature table to a given file, writing binary data."""
+     np.save(features_file, table)
+
+ def convert_iobes_to_bracket(tag):
+     """
+     Convert tags from the IOBES scheme to the CoNLL bracketing.
+
+     Example:
+     B-A0 -> (A0*
+     I-A0 -> *
+     E-A0 -> *)
+     S-A1 -> (A1*)
+     O    -> *
+     """
+     if tag.startswith('I') or tag.startswith('O'):
+         return '*'
+     if tag.startswith('B'):
+         return '(%s*' % tag[2:]
+     if tag.startswith('E'):
+         return '*)'
+     if tag.startswith('S'):
+         return '(%s*)' % tag[2:]
+     else:
+         raise ValueError("Unknown tag: %s" % tag)
+
+ def boundaries_to_arg_limits(boundaries):
+     """
+     Converts a sequence of IOBES tags delimiting arguments to an array
+     of argument boundaries, used by the network.
+     """
+     limits = []
+     start = None
+
+     for i, tag in enumerate(boundaries):
+         if tag == 'S':
+             limits.append([i, i])
+         elif tag == 'B':
+             start = i
+         elif tag == 'E':
+             limits.append([start, i])
+
+     return np.array(limits, int)
+
+ class PickleConverter:
+     """Converts a lexicon in .txt format into a .pickle file."""
+     def __init__(self, morph_pickle=None, co_pickle=None, prob_pickle=None):
+         self.morph_pickle = morph_pickle or config.FILES.get("pos_morph_lexicon")
+
+     def _convert_morph_lexicon(self, root, word, data):
+         '''
+         root = dict()
+         ...
+         _convert_morph_dict(root, u_key, u_data)
+         '''
+         current_dict = root
+         _end = '$$'
+         for letter in word:
+             current_dict = current_dict.setdefault(letter, {})
+         current_dict = current_dict.setdefault(_end, data)
+         return root
+
+     def convert_morph_lexicon(self, filename=None):
+         filename = filename if filename else self.morph_pickle
+         txt_filename = filename.replace('.pickle', '.txt')
+         morph_dict = {}
+
+         with open(txt_filename, 'rb') as f:
+             raw_data = f.read(1024)
+             detected = chardet.detect(raw_data).get('encoding', 'utf-8')
+         with open(txt_filename, 'rt', encoding=detected) as f:
+             for line in f:
+                 if ';;' in line[:2]: continue
+                 try:
+                     k, v = line.strip().split('\t')
+                 except:
+                     print('morph lexicon error : ', line)
+                 self._convert_morph_lexicon(morph_dict, k, v)
+
+         with open(filename, 'wb') as f:
+             _pickle.dump(morph_dict, f, 2)
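
The morphological lexicon handled above is a plain nested dict: each key is a single character, and the reserved key '$$' holds the tag data for a complete entry. PickleConverter._convert_morph_lexicon builds that structure and get_word_from_morph_lexicon walks it, preferring the longest entry that matches. The following standalone sketch mirrors that convention with two hypothetical entries; it does not import the packaged module, and the entries and tags are invented for illustration.

    # Standalone sketch of the '$$'-terminated nested-dict lexicon (hypothetical data).
    def build_trie(entries):
        root = {}
        for word, data in entries.items():
            node = root
            for ch in word:
                node = node.setdefault(ch, {})  # one character per level
            node['$$'] = data                   # mark a complete entry
        return root

    root = build_trie({'서울': 'NNP', '서울역': 'NNP'})

    # Walking '서울역' character by character passes the shorter entry '서울'
    # and ends at the node of the longest match, which is the entry
    # get_word_from_morph_lexicon keeps when it pops shorter partial matches.
    node = root
    for ch in '서울역':
        node = node[ch]
    print('$$' in node)  # True: '서울역' is a complete entry
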
nltkor/tag/libs/word_dictionary.py
@@ -0,0 +1,239 @@
+ # -*- coding: utf-8 -*-
+
+ import itertools
+ from collections import Counter, OrderedDict as OD
+ from nltkor import Kor_char
+
+ class WordDictionary(dict):
+     """
+     Class to store words and their corresponding indices in
+     the network lookup table. Also deals with padding and
+     maps rare words to a special index.
+     """
+
+     padding_left = '*LEFT*'
+     padding_right = '*RIGHT*'
+     rare = '*RARE*'
+
+     number_transformation = {ord(c): '9' for c in '012345678'}
+     #english_transformation = {ord(c): 'a' for c in 'abcdefghijklmnopqrstuvwxyz'}
+     # Hanja handling is still needed: use korChar.hanja_syllable(), see utils.py
+
+     def __init__(self, tokens, size=None, minimum_occurrences=None, wordlist=None):
+         """
+         Fills a dictionary (to be used for indexing) with the most
+         common words in the given text.
+
+         :param tokens: Either a list of tokens or a list of lists of tokens
+             (each token represented as a string).
+         :param size: Maximum number of token indices
+             (not including paddings, rare, etc.).
+         :param minimum_occurrences: The minimum number of occurrences a token must
+             have in order to be included.
+         :param wordlist: Use this list of words to build the dictionary. Overrides tokens
+             if not None and ignores maximum size.
+         """
+         if wordlist is None:
+             # work with the supplied tokens. extract frequencies.
+
+             # gets frequency count
+             c = self._get_frequency_count(tokens)
+
+             if minimum_occurrences is None:
+                 minimum_occurrences = 1
+
+             words = [key for key, number in c.most_common()
+                      if number >= minimum_occurrences and key]
+
+             if size is not None and size < len(words):
+                 words = words[:size]
+         else:
+             # using ordered dict as an ordered set
+             # (we need to keep the order and eliminate duplicates)
+             words = [word.lower().translate(WordDictionary.number_transformation) for word in wordlist]
+             values = [None] * len(words)
+             words = OD(zip(words, values)).keys()
+             #for c in words:
+             #    print c.encode('utf-8')
+
+         # verifies the maximum size
+         if size is None:
+             size = len(words)
+
+         # set all words in the dictionary
+         for word, num in zip(words, range(size)):
+             self[word] = num
+
+         # if the given words include one of the rare or padding symbols, don't replace it
+         # padding and rare symbols are appended at the very end when the dictionary is built
+         special_symbols = [WordDictionary.rare.lower(),
+                            WordDictionary.padding_left.lower(),
+                            WordDictionary.padding_right.lower()]
+
+         for symbol in special_symbols:
+             if symbol not in words:
+                 self[symbol] = size
+                 size += 1
+
+         self.check()
+
+     @classmethod
+     def init_from_wordlist(cls, wordlist):
+         """
+         Initializes the WordDictionary instance with a list of words, independently from their
+         frequencies. Every word in the list gets an entry.
+         """
+         return cls(None, wordlist=wordlist)
+
+     @classmethod
+     def init_empty(cls):
+         """
+         Initializes an empty Word Dictionary.
+         """
+         return cls([[]])
+
+     def save(self, filename):
+         """
+         Saves the word dictionary to the given file as a list of word types.
+
+         Special words (paddings and rare) are also included.
+         """
+         sorted_words = sorted(self, key=self.get)
+         text = '\n'.join(sorted_words)
+         with open(filename, 'w') as f:
+             f.write(text)
+
+     @classmethod
+     def load(cls, filename):
+         """
+         Loads a WordDictionary object from a vocabulary file.
+         """
+         words = []
+         with open(filename, 'r') as f:
+             for word in f:
+                 word = word.strip()
+                 if word:
+                     words.append(word)
+
+         return cls.init_from_wordlist(words)
+
+     def _get_frequency_count(self, token_list):
+         """
+         Returns a token counter for tokens in token_list.
+
+         :param token_list: Either a list of tokens (as strings) or a list
+             of lists of tokens.
+         """
+         if type(token_list[0]) == list:
+             c = Counter(t.lower().translate(WordDictionary.number_transformation)
+                         for sent in token_list for t in sent)
+         else:
+             c = Counter(t.lower().translate(WordDictionary.number_transformation)
+                         for t in token_list)
+         return c
+
+
+     def update_tokens(self, tokens, size=None, minimum_occurrences=1, freqs=None):
+         """
+         Updates the dictionary, adding more types until size is reached.
+
+         :param freqs: a dictionary providing a token count.
+         """
+         if freqs is None:
+             freqs = self._get_frequency_count(tokens)
+
+         if size is None or size == 0:
+             # size None or 0 means no size limit
+             size = len(freqs)
+
+         if self.num_tokens >= size:
+             return
+         else:
+             size_diff = size - self.num_tokens
+
+         # a new version of freqs with only tokens not present in the dictionary
+         # and above minimum frequency
+         candidate_tokens = dict((token, freqs[token])
+                                 for token in freqs
+                                 if token not in self and freqs[token] >= minimum_occurrences)
+
+         # order the types from the most frequent to the least
+         new_tokens = sorted(candidate_tokens, key=lambda x: candidate_tokens[x], reverse=True)
+
+         next_value = len(self)
+         for token in new_tokens:
+             self[token] = next_value
+             next_value += 1
+             size_diff -= 1
+             if size_diff == 0:
+                 break
+
+         self.check()
+
+     def __contains__(self, key):
+         """
+         Overrides the "in" operator. Case insensitive.
+         """
+         transformed = key.lower().translate(WordDictionary.number_transformation)
+         return super(WordDictionary, self).__contains__(transformed)
+
+     def __setitem__(self, key, value):
+         """
+         Overrides the [] write operator. It converts every key to lower case
+         before assignment.
+         """
+         transformed = key.lower().translate(WordDictionary.number_transformation)
+         super(WordDictionary, self).__setitem__(transformed, value)
+
+     def __getitem__(self, key):
+         """
+         Overrides the [] read operator.
+
+         Three differences from the original:
+         1) when given a word without an entry, it returns the value for the *RARE* key.
+         2) all entries are converted to lower case before verification.
+         3) digits are mapped to 9
+         """
+         # faster than regexp
+         transformed = key.lower().translate(WordDictionary.number_transformation)
+         return super(WordDictionary, self).get(transformed, self.index_rare)
+
+     def get(self, key):
+         """
+         Overrides the dictionary get method, so when given a word without an entry, it returns
+         the value for the *rare* key. Note that it is not possible to supply a default value as
+         in the dict class.
+         """
+         # faster than regexp
+         #print (key, type(key))
+         transformed = key.lower().translate(WordDictionary.number_transformation)
+         #print key, transformed
+         return super(WordDictionary, self).get(transformed, self.index_rare)
+
+     def check(self):
+         """
+         Checks the internal structure of the dictionary and makes necessary adjustments,
+         such as updating num_tokens.
+         """
+         # since WordDictionary overrides __getitem__, we use the super call
+         # (the WordDictionary __getitem__ fails when self.index_rare is not set)
+         key = WordDictionary.rare.lower()
+         self.index_rare = super(WordDictionary, self).get(key)
+
+         self.index_padding_left = self[WordDictionary.padding_left]
+         self.index_padding_right = self[WordDictionary.padding_right]
+         self.num_tokens = len(self)
+
+     def get_words(self, indices):
+         """
+         Returns the words represented by a sequence of indices.
+         """
+         words = [w for w in self if self[w] in indices]
+         return words
+
+     def get_indices(self, words):
+         """
+         Returns the indices corresponding to a sequence of tokens.
+         """
+         indices = [self[w] for w in words]
+         return indices
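
WordDictionary therefore behaves like a dict keyed by normalized word types: keys are lowercased, the digits 0-8 are mapped to '9', and looking up an unseen word falls back to the index of the *RARE* entry, with the padding and rare markers appended after the regular vocabulary. A short usage sketch with made-up tokens, assuming the wheel is installed so that nltkor.tag.libs.word_dictionary is importable:

    # Hypothetical example; the tokens are invented for illustration.
    from nltkor.tag.libs.word_dictionary import WordDictionary

    sents = [['나는', '2024년에', '서울에', '갔다'],
             ['나는', '책을', '읽었다']]
    wd = WordDictionary(sents)

    print(wd['나는'])       # index of a known token
    print(wd['2024년에'])   # digits are normalized to '9' before lookup
    print(wd['없는말'])     # unseen token falls back to the *RARE* index
    print(wd.num_tokens, wd.index_rare)
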
nltkor/tag/libs/wsd/__init__.py
@@ -0,0 +1,2 @@
+
+ from .wsd_reader import WSDReader
nltkor/tag/libs/wsd/macmorphoreader.py
@@ -0,0 +1,7 @@
+
+ import warnings
+
+ # backwards compatibility
+ from .wsd_reader import *
+
+ warnings.warn('Module macmorphoreader is deprecated. Use module pos_reader instead.')
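
The module above is a backwards-compatibility shim: importing it re-exports everything from wsd_reader and emits the warning shown. A small sketch of how that surfaces at import time, assuming the wheel is installed (the warning is raised only on the first import of the module in a process):

    import warnings

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        from nltkor.tag.libs.wsd import macmorphoreader  # noqa: F401
    for w in caught:
        print(w.category.__name__, str(w.message))
    # Expected: UserWarning  Module macmorphoreader is deprecated. Use module pos_reader instead.
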