nltkor-1.2.0-cp39-cp39-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. nltkor/Kor_char.py +193 -0
  2. nltkor/__init__.py +15 -0
  3. nltkor/alignment/__init__.py +1315 -0
  4. nltkor/cider/__init__.py +2 -0
  5. nltkor/cider/cider.py +55 -0
  6. nltkor/cider/cider_scorer.py +207 -0
  7. nltkor/distance/__init__.py +441 -0
  8. nltkor/distance/wasserstein.py +126 -0
  9. nltkor/etc.py +22 -0
  10. nltkor/lazyimport.py +144 -0
  11. nltkor/make_requirement.py +11 -0
  12. nltkor/metrics/__init__.py +63 -0
  13. nltkor/metrics/bartscore.py +301 -0
  14. nltkor/metrics/bertscore.py +331 -0
  15. nltkor/metrics/bleu_tensor.py +20 -0
  16. nltkor/metrics/classical.py +814 -0
  17. nltkor/metrics/entment.py +24 -0
  18. nltkor/metrics/eval.py +517 -0
  19. nltkor/metrics/mauve.py +273 -0
  20. nltkor/metrics/mauve_utils.py +131 -0
  21. nltkor/misc/__init__.py +11 -0
  22. nltkor/misc/string2string_basic_functions.py +59 -0
  23. nltkor/misc/string2string_default_tokenizer.py +83 -0
  24. nltkor/misc/string2string_hash_functions.py +159 -0
  25. nltkor/misc/string2string_word_embeddings.py +503 -0
  26. nltkor/search/__init__.py +10 -0
  27. nltkor/search/classical.py +569 -0
  28. nltkor/search/faiss_search.py +467 -0
  29. nltkor/search/kobert_tokenizer.py +181 -0
  30. nltkor/sejong/__init__.py +3 -0
  31. nltkor/sejong/ch.py +12 -0
  32. nltkor/sejong/dict_semClassNum.txt +491 -0
  33. nltkor/sejong/layer.txt +630 -0
  34. nltkor/sejong/sejong_download.py +87 -0
  35. nltkor/sejong/ssem.py +685 -0
  36. nltkor/similarity/__init__.py +3 -0
  37. nltkor/similarity/bartscore____.py +337 -0
  38. nltkor/similarity/bertscore____.py +339 -0
  39. nltkor/similarity/classical.py +245 -0
  40. nltkor/similarity/cosine_similarity.py +175 -0
  41. nltkor/tag/__init__.py +70 -0
  42. nltkor/tag/espresso_tag.py +220 -0
  43. nltkor/tag/libs/__init__.py +9 -0
  44. nltkor/tag/libs/arguments.py +280 -0
  45. nltkor/tag/libs/attributes.py +231 -0
  46. nltkor/tag/libs/config.py +158 -0
  47. nltkor/tag/libs/metadata.py +129 -0
  48. nltkor/tag/libs/ner/__init__.py +2 -0
  49. nltkor/tag/libs/ner/macmorphoreader.py +7 -0
  50. nltkor/tag/libs/ner/ner_reader.py +92 -0
  51. nltkor/tag/libs/network.c +59267 -0
  52. nltkor/tag/libs/network.cpython-39-darwin.so +0 -0
  53. nltkor/tag/libs/parse/__init__.py +1 -0
  54. nltkor/tag/libs/parse/parse_reader.py +283 -0
  55. nltkor/tag/libs/pos/__init__.py +2 -0
  56. nltkor/tag/libs/pos/macmorphoreader.py +7 -0
  57. nltkor/tag/libs/pos/pos_reader.py +89 -0
  58. nltkor/tag/libs/reader.py +510 -0
  59. nltkor/tag/libs/srl/__init__.py +3 -0
  60. nltkor/tag/libs/srl/__srl_reader_.py +535 -0
  61. nltkor/tag/libs/srl/srl_reader.py +436 -0
  62. nltkor/tag/libs/srl/train_srl.py +87 -0
  63. nltkor/tag/libs/taggers.py +926 -0
  64. nltkor/tag/libs/utils.py +344 -0
  65. nltkor/tag/libs/word_dictionary.py +239 -0
  66. nltkor/tag/libs/wsd/__init__.py +2 -0
  67. nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
  68. nltkor/tag/libs/wsd/wsd_reader.py +93 -0
  69. nltkor/tokenize/__init__.py +62 -0
  70. nltkor/tokenize/ko_tokenize.py +115 -0
  71. nltkor/trans.py +121 -0
  72. nltkor-1.2.0.dist-info/LICENSE.txt +1093 -0
  73. nltkor-1.2.0.dist-info/METADATA +33 -0
  74. nltkor-1.2.0.dist-info/RECORD +76 -0
  75. nltkor-1.2.0.dist-info/WHEEL +5 -0
  76. nltkor-1.2.0.dist-info/top_level.txt +1 -0
nltkor/tag/libs/reader.py
@@ -0,0 +1,510 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ """
+ Base class for reading NLP tagging data.
+ """
+
+ import os
+ import re
+ import abc
+ import logging
+ import numpy as np
+ from collections import Counter
+
+ from . import attributes
+ from . import metadata
+ from . import config
+ from .word_dictionary import WordDictionary
+ from .attributes import get_capitalization
+
+ class FileNotFoundException(IOError):
+     """
+     Dummy class for indicating file not found instead of
+     the broad IOError.
+     """
+     pass
+
+ def load_tag_dict(filename):
+     """
+     Load a tag dictionary from a file containing one tag
+     per line.
+     """
+     tag_dict = {}
+     with open(filename, 'rt') as f:
+         code = 0
+         for tag in f:
+             tag = tag.strip()
+             if tag:
+                 tag_dict[tag] = code
+                 code += 1
+
+     return tag_dict
+
+ def _load_morph_lexicon(root, word, data):
+     '''
+     root = dict()
+     ...
+     _load_morph_lexicon(root, u_key, u_data)
+     '''
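+     # The lexicon is built as a character trie: each letter of the key maps to a
+     # nested dict, and the special key '$$' marks the end of a complete entry and
+     # stores its data. For example, inserting the (illustrative) pair ('가다', 'VV')
+     # into an empty root yields {'가': {'다': {'$$': 'VV'}}}.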
+     current_dict = root
+     _end = '$$'
+     for letter in word:
+         current_dict = current_dict.setdefault(letter, {})
+     current_dict = current_dict.setdefault(_end, data)
+     return root
+
+ def load_morph_lexicon(filename):
+     """
+     Load a user-defined morph lexicon from a file containing one key<tab>pattern
+     entry per line.
+     """
+     morph_dict = {}
+     with open(filename, 'rt') as f:
+         for line in f:
+             if ';;' in line[:2]: continue
+             try:
+                 k, v = line.strip().split('\t')
+             except ValueError:
+                 print('morph lexicon error : ', line)
+                 continue
+             _load_morph_lexicon(morph_dict, k, v)
+     return morph_dict
+
+ def load_co_lexicon(filename):
+     """
+     Load the complicated-morph (co) pattern lexicon from a file containing one
+     key<tab>pattern entry per line.
+     """
+     co_morph_dict = {}
+     with open(filename, 'rt') as f:
+         for line in f:
+             if ';;' in line[:2]: continue
+             try:
+                 k, v = line.strip().split('\t')
+             except ValueError:
+                 print(line)
+                 continue
+             if k in co_morph_dict:
+                 print("load co morph lexicon : key {} conflict!".format(k))
+             co_morph_dict[k] = v
+     return co_morph_dict
+
+ def load_prob_dict(filename):
+     """
+     Load a probability dictionary from a file containing one key<tab>value
+     entry per line.
+     """
+     prob_dict = {}
+     with open(filename, 'rt') as f:
+         for line in f:
+             if ';;' in line[:2]: continue
+             try:
+                 k, v = line.rstrip().split('\t')
+             except ValueError:
+                 print(line)
+                 continue
+             if k in prob_dict:
+                 print("load prob dict : key {} conflict!".format(k))
+             prob_dict[k] = float(v)
+     return prob_dict
+
+ def save_tag_dict(filename, tag_dict):
+     """
+     Save the given tag dictionary to the given file. The dictionary
+     is saved with one tag per line, in sorted order.
+     """
+     ordered_keys = sorted(tag_dict, key=tag_dict.get)
+     ordered_keys = sorted(ordered_keys)
+     text = '\n'.join(ordered_keys)
+     with open(filename, 'wt') as f:
+         f.write(text)
+
+
+ class TaggerReader(object):
+     """
+     Abstract base class with useful functions for readers
+     of tagging tasks.
+     """
+     __metaclass__ = abc.ABCMeta
+
+     def __init__(self, md=None, load_dictionaries=True):
+         '''
+         This class shouldn't be used directly. The constructor only
+         provides method calls for subclasses. Subclasses should call
+         this constructor after initializing the `task` attribute.
+         '''
+         self._set_metadata(md)
+         self.codified = False
+         self._converter = None
+
+         if load_dictionaries:
+             self.load_or_create_dictionary()  # vocabulary
+             self.load_or_create_tag_dict()    # tagset
+             if self.task == 'pos':
+                 self.load_co_lexicon()     # complicated morph lexicon
+                 self.load_morph_lexicon()  # user morph lexicon
+                 self.load_prob_dict()      # probability data
+
+     @abc.abstractmethod
+     def task(self):
+         """
+         The task the tagger reads data for.
+         Must be defined in subclasses.
+         """
+         return None
+
+     def load_or_create_dictionary(self):
+         """
+         Try to load the vocabulary from the default location. If the vocabulary
+         file is not available, create a new one from the sentences available
+         and save it.
+         """
+         try:
+             self.load_dictionary()
+         except FileNotFoundException:
+             self.generate_dictionary(minimum_occurrences=1)
+             #self.generate_dictionary(minimum_occurrences=2)
+             self.save_dictionary()
+
+     def load_or_create_tag_dict(self):
+         """
+         Try to load the tag dictionary from the default location. If the dictionary
+         file is not available, scan the available sentences and create a new one.
+         """
+         key = '%s_tag_dict' % self.task
+         filename = self.md.paths[key]
+         if os.path.isfile(filename):
+             self.load_tag_dict(filename)
+             return
+
+         tags = {tag for sent in self.sentences for _, tag in sent}
+         self.tag_dict = {tag: code for code, tag in enumerate(tags)}
+         self.save_tag_dict(filename)
+
+     def load_morph_lexicon(self):
+         """
+         Try to load the user morph lexicon from the default location.
+         """
+         logger = logging.getLogger("Logger")
+
+         key = '%s_morph_lexicon' % self.task
+         filename = self.md.paths[key]
+         if os.path.isfile(filename):
+             print(filename)
+             self.load_morph_lexicon(filename)
+             return
+         else:
+             logger.info("Cannot find %s" % filename)
+
+     def load_co_lexicon(self):
+         """
+         Try to load the morph pattern lexicon from the default location.
+         """
+         logger = logging.getLogger("Logger")
+
+         key = '%s_co_lexicon' % self.task
+         filename = self.md.paths[key]
+         if os.path.isfile(filename):
+             self.load_co_lexicon(filename)
+             return
+         else:
+             logger.info("Cannot find %s" % filename)
+
+     def load_prob_dict(self):
+         """
+         Try to load the probability dictionary from the default location.
+         """
+         logger = logging.getLogger("Logger")
+
+         key = '%s_prob_dict' % self.task
+         filename = self.md.paths[key]
+         if os.path.isfile(filename):
+             self.load_prob_dict(filename)
+             return
+         else:
+             logger.info("Cannot find %s" % filename)
+
+     def generate_dictionary(self, dict_size=None, minimum_occurrences=1):
+         """
+         Generates a token dictionary based on the given sentences.
+
+         :param dict_size: Max number of tokens to be included in the dictionary.
+         :param minimum_occurrences: Minimum number of times that a token must
+             appear in the text in order to be included in the dictionary.
+         """
+         logger = logging.getLogger("Logger")
+
+         tokens = [token for sent in self.sentences for token, _ in sent]
+         self.word_dict = WordDictionary(tokens, dict_size, minimum_occurrences)
+         logger.info("Created dictionary with %d types" % self.word_dict.num_tokens)
+
+     def get_inverse_tag_dictionary(self):
+         """
+         Returns a version of the tag dictionary that maps numbers to tags.
+         Used for consulting the meaning of the network's output.
+         """
+         tuples = [(x[1], x[0]) for x in self.tag_dict.items()]
+         ret = dict(tuples)
+
+         return ret
+
+     def codify_sentence(self, sentence):
+         """
+         Converts a given sentence into the indices used by the neural network.
+
+         :param sentence: a sequence of tokens, already tokenized
+         """
+         if self._converter is None:
+             self.create_converter()
+         return np.array([self.converter.convert(t) for t in sentence])
+
+     def codify_sentences(self):
+         """
+         Converts each token in each sequence into indices to their feature vectors
+         in feature matrices. The previous sentences as text are not accessible anymore.
+         """
+         logger = logging.getLogger("Logger")
+         logger.info("data structuring.")
+         if self._converter is None:
+             self.create_converter()
+
+         new_sentences = []
+         self.tags = []
+         rare_tag_value = self.tag_dict.get(self.rare_tag)
+
+         for sent in self.sentences:
+             new_sent = []
+             sentence_tags = []
+
+             for token, tag in sent:
+                 new_token = self.converter.convert(token)
+                 new_sent.append(new_token)
+                 sentence_tags.append(self.tag_dict.get(tag, rare_tag_value))
+
+             new_sentences.append(np.array(new_sent))
+             self.tags.append(np.array(sentence_tags))
+
+         self.sentences = new_sentences
+         self.codified = True
+
+     def get_word_counter(self):
+         """
+         Returns a Counter object with word type occurrences.
+         """
+         c = Counter(token.lower() for sent in self.sentences for token, _ in sent)
+         return c
+
+     def get_tag_counter(self):
+         """
+         Returns a Counter object with tag occurrences.
+         """
+         c = Counter(tag for sent in self.sentences for _, tag in sent)
+         return c
+
+     def save_tag_dict(self, filename=None, tag_dict=None):
+         """
+         Saves a tag dictionary to a file as a list of tags.
+
+         :param tag_dict: the dictionary to save. If None, the default
+             tag_dict for the class will be saved.
+         :param filename: the file where the dictionary should be saved.
+             If None, the class default tag_dict filename will be used.
+         """
+         if tag_dict is None:
+             tag_dict = self.tag_dict
+         if filename is None:
+             key = '%s_tag_dict' % self.task
+             filename = self.md.paths[key]
+
+         save_tag_dict(filename, tag_dict)
+
+     def load_tag_dict(self, filename=None):
+         """
+         Load the tag dictionary from the default file and assign
+         it to the tag_dict attribute.
+         """
+         if filename is None:
+             key = '%s_tag_dict' % self.task
+             filename = self.md.paths[key]
+
+         self.tag_dict = load_tag_dict(filename)
+
+     def load_morph_lexicon(self, filename=None):
+         """
+         Load the morph lexicon from the default file and assign
+         it to the morph_lexicon attribute.
+         """
+         if filename is None:
+             key = '%s_morph_lexicon' % self.task
+             filename = self.md.paths[key]
+
+         self.morph_lexicon = load_morph_lexicon(filename)
+
+     def load_co_lexicon(self, filename=None):
+         """
+         Load the co pattern lexicon from the default file and assign
+         it to the co_lexicon attribute.
+         """
+         if filename is None:
+             key = '%s_co_lexicon' % self.task
+             filename = self.md.paths[key]
+
+         self.co_lexicon = load_co_lexicon(filename)
+
+     def load_prob_dict(self, filename=None):
+         """
+         Load the probability dictionary from the default file and assign
+         it to the prob_dict attribute.
+         """
+         if filename is None:
+             key = '%s_prob_dict' % self.task
+             filename = self.md.paths[key]
+
+         self.prob_dict = load_prob_dict(filename)
+
+
+     def _set_metadata(self, md):
+         if md is None:
+             # metadata not provided: use the global data_dir for files
+             self.md = metadata.Metadata(self.task, config.FILES)
+         else:
+             self.md = md
+
+     def add_text(self, text):
+         """
+         Adds more text to the reader. The text must be a sequence of sequences of
+         tokens.
+         """
+         self.sentences.extend(text)
+
+     def load_dictionary(self):
+         """Read a file with a word list and create a dictionary."""
+         logger = logging.getLogger("Logger")
+         logger.info("Loading vocabulary")
+
+         # try to load vocabulary specific for the task
+         key = 'vocabulary_%s' % self.task
+         filename = self.md.paths[key]
+         if not os.path.isfile(filename):
+             # fallback to generic vocabulary
+             filename = self.md.paths['vocabulary']
+             if not os.path.isfile(filename):
+                 raise FileNotFoundException()
+
+         words = []
+         with open(filename, 'rt') as f:
+             for word in f:
+                 #word = unicode(word, 'utf-8').strip()
+                 word = word.strip()
+                 if word:
+                     words.append(word)
+
+         wd = WordDictionary.init_from_wordlist(words)
+         self.word_dict = wd
+         logger.info("Done. Dictionary size is %d types" % wd.num_tokens)
+
+     def save_dictionary(self, filename=None):
+         """
+         Saves the reader's word dictionary as a list of words.
+
+         :param filename: path to the file to save the dictionary.
+             if not given, it will be saved in the default nlpnet
+             data directory.
+         """
+         logger = logging.getLogger("Logger")
+         if filename is None:
+             key = 'vocabulary_%s' % self.task
+             filename = self.md.paths[key]
+
+         self.word_dict.save(filename)
+         logger.info("Dictionary saved in %s" % filename)
+
+     def create_affix_list(self, prefix_or_suffix, max_size, min_occurrences):
+         """
+         Handle the creation of suffix and prefix lists.
+
+         Check if there exists an affix list in the data directory. If there isn't,
+         create a new one based on the training sentences.
+
+         :param prefix_or_suffix: string 'prefix' or 'suffix'
+         """
+         affix_type = prefix_or_suffix.lower()
+         assert affix_type == 'suffix' or affix_type == 'prefix'
+
+         filename = self.md.paths['%ses' % affix_type]
+         if os.path.isfile(filename):
+             return
+
+         logger = logging.getLogger("Logger")
+         affixes_all_lengths = []
+
+         # only get the affix size n from words with length at least (n+1)
+         types = {re.sub(r'\d', '9', token.lower())
+                  for sent in self.sentences for token, _ in sent}
+
+         for length in range(1, max_size + 1):
+             if affix_type == 'suffix':
+                 c = Counter(type_[-length:]
+                             for type_ in types
+                             if len(type_) > length)
+             else:
+                 c = Counter(type_[:length]
+                             for type_ in types
+                             if len(type_) > length)
+             affixes_this_length = [affix for affix in c
+                                    if c[affix] >= min_occurrences]
+             affixes_all_lengths.extend(affixes_this_length)
+
+         logger.info('Created a list of %d %ses.' % (len(affixes_all_lengths), affix_type))
+         text = '\n'.join(affixes_all_lengths)
+         with open(filename, 'wt') as f:
+             f.write(text)
+
+     @property
+     def converter(self):
+         """
+         Return the token converter, which transforms tokens into their feature
+         vector indices. If it doesn't exist, one is created.
+         """
+         if self._converter is None:
+             self.create_converter()
+
+         return self._converter
+
+     @converter.setter
+     def converter(self, value):
+         self._converter = value
+
+     def create_converter(self):
+         """
+         Sets up the token converter, which is responsible for transforming tokens
+         into their feature vector indices.
+         """
+         def add_affix_extractors(affix):
+             """
+             Helper function that works for both suffixes and prefixes.
+             The parameter affix should be 'suffix' or 'prefix'.
+             """
+             loader_function = getattr(attributes.Affix, 'load_%ses' % affix)
+             loader_function(self.md)
+
+             # deal with gaps between sizes (i.e., if there are sizes 2, 3, and 5)
+             codes = getattr(attributes.Affix, '%s_codes' % affix)
+             sizes = sorted(codes)
+
+             getter = getattr(attributes.Affix, 'get_%s' % affix)
+             for size in sizes:
+
+                 # size=size because if we don't use it, lambda sticks to the last value of
+                 # the loop iterator size
+                 def f(word, size=size):
+                     return getter(re.sub(r'\d', '9', word), size)
+
+                 self.converter.add_extractor(f)
+
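+         # Register the extractors: the word-index lookup comes first, then
+         # (depending on the metadata flags) a capitalization extractor and one
+         # prefix/suffix extractor per affix length.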
+         self._converter = attributes.TokenConverter()
+         self.converter.add_extractor(self.word_dict.get)
+         if self.md.use_caps:
+             self.converter.add_extractor(get_capitalization)
+         if self.md.use_prefix:
+             add_affix_extractors('prefix')
+         if self.md.use_suffix:
+             add_affix_extractors('suffix')
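reader.py ships only the abstract base; the task-specific readers elsewhere in the package (under pos/, ner/, wsd/, parse/ and srl/) are expected to build on it. The sketch below illustrates that contract; the MinimalPosReader class, the example sentence and its tags are invented for illustration and are not part of the package, and running it assumes the data files referenced by the metadata paths are already installed, since load_dictionaries=True reads them.

    from nltkor.tag.libs.reader import TaggerReader

    class MinimalPosReader(TaggerReader):
        def __init__(self, sentences, md=None):
            # per the base-class docstring, set `task` before calling the parent
            # constructor so the right metadata paths are selected
            self.task = 'pos'
            # each sentence is a sequence of (token, tag) pairs
            self.sentences = sentences
            # codify_sentences() looks up self.rare_tag as the fallback for unknown tags
            self.rare_tag = None
            super().__init__(md=md, load_dictionaries=True)

    tagged_sentences = [[('나는', 'NP'), ('간다', 'VV')]]   # illustrative data
    reader = MinimalPosReader(tagged_sentences)
    reader.codify_sentences()   # tokens -> feature indices, tags -> tag codes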
nltkor/tag/libs/srl/__init__.py
@@ -0,0 +1,3 @@
+
+ from .train_srl import *
+ from .srl_reader import SRLReader