nltkor-1.2.14-cp311-cp311-macosx_13_0_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. nltkor/Kor_char.py +193 -0
  2. nltkor/__init__.py +16 -0
  3. nltkor/alignment/__init__.py +1315 -0
  4. nltkor/cider/__init__.py +2 -0
  5. nltkor/cider/cider.py +55 -0
  6. nltkor/cider/cider_scorer.py +207 -0
  7. nltkor/distance/__init__.py +441 -0
  8. nltkor/distance/wasserstein.py +126 -0
  9. nltkor/etc.py +22 -0
  10. nltkor/lazyimport.py +144 -0
  11. nltkor/make_requirement.py +11 -0
  12. nltkor/metrics/__init__.py +63 -0
  13. nltkor/metrics/bartscore.py +301 -0
  14. nltkor/metrics/bertscore.py +331 -0
  15. nltkor/metrics/bleu_tensor.py +20 -0
  16. nltkor/metrics/classical.py +847 -0
  17. nltkor/metrics/entment.py +24 -0
  18. nltkor/metrics/eval.py +517 -0
  19. nltkor/metrics/mauve.py +273 -0
  20. nltkor/metrics/mauve_utils.py +131 -0
  21. nltkor/misc/__init__.py +11 -0
  22. nltkor/misc/string2string_basic_functions.py +59 -0
  23. nltkor/misc/string2string_default_tokenizer.py +83 -0
  24. nltkor/misc/string2string_hash_functions.py +159 -0
  25. nltkor/misc/string2string_word_embeddings.py +503 -0
  26. nltkor/search/__init__.py +10 -0
  27. nltkor/search/classical.py +569 -0
  28. nltkor/search/faiss_search.py +787 -0
  29. nltkor/search/kobert_tokenizer.py +181 -0
  30. nltkor/sejong/__init__.py +3 -0
  31. nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
  32. nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
  33. nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
  34. nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
  35. nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
  36. nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
  37. nltkor/sejong/ch.py +12 -0
  38. nltkor/sejong/dict_semClassNum.txt +491 -0
  39. nltkor/sejong/layer.txt +630 -0
  40. nltkor/sejong/sejong_download.py +87 -0
  41. nltkor/sejong/ssem.py +684 -0
  42. nltkor/similarity/__init__.py +3 -0
  43. nltkor/similarity/bartscore____.py +337 -0
  44. nltkor/similarity/bertscore____.py +339 -0
  45. nltkor/similarity/classical.py +245 -0
  46. nltkor/similarity/cosine_similarity.py +175 -0
  47. nltkor/tag/__init__.py +71 -0
  48. nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
  49. nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
  50. nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
  51. nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
  52. nltkor/tag/espresso_tag.py +220 -0
  53. nltkor/tag/libs/__init__.py +10 -0
  54. nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
  55. nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
  56. nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
  57. nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
  58. nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
  59. nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
  60. nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
  61. nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
  62. nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
  63. nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
  64. nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
  65. nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
  66. nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
  67. nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
  68. nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
  69. nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
  70. nltkor/tag/libs/arguments.py +280 -0
  71. nltkor/tag/libs/attributes.py +231 -0
  72. nltkor/tag/libs/config.py +159 -0
  73. nltkor/tag/libs/metadata.py +129 -0
  74. nltkor/tag/libs/ner/__init__.py +2 -0
  75. nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
  76. nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
  77. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
  78. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
  79. nltkor/tag/libs/ner/macmorphoreader.py +7 -0
  80. nltkor/tag/libs/ner/ner_reader.py +92 -0
  81. nltkor/tag/libs/network.c +72325 -0
  82. nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
  83. nltkor/tag/libs/network.pyx +878 -0
  84. nltkor/tag/libs/networkconv.pyx +1028 -0
  85. nltkor/tag/libs/networkdependencyconv.pyx +451 -0
  86. nltkor/tag/libs/parse/__init__.py +1 -0
  87. nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
  88. nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
  89. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
  90. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
  91. nltkor/tag/libs/parse/parse_reader.py +283 -0
  92. nltkor/tag/libs/pos/__init__.py +2 -0
  93. nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
  94. nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
  95. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
  96. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
  97. nltkor/tag/libs/pos/macmorphoreader.py +7 -0
  98. nltkor/tag/libs/pos/pos_reader.py +97 -0
  99. nltkor/tag/libs/reader.py +485 -0
  100. nltkor/tag/libs/srl/__init__.py +3 -0
  101. nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
  102. nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
  103. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
  104. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
  105. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
  106. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
  107. nltkor/tag/libs/srl/__srl_reader_.py +535 -0
  108. nltkor/tag/libs/srl/srl_reader.py +436 -0
  109. nltkor/tag/libs/srl/train_srl.py +87 -0
  110. nltkor/tag/libs/taggers.py +926 -0
  111. nltkor/tag/libs/utils.py +384 -0
  112. nltkor/tag/libs/word_dictionary.py +239 -0
  113. nltkor/tag/libs/wsd/__init__.py +2 -0
  114. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
  115. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
  116. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
  117. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
  118. nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
  119. nltkor/tag/libs/wsd/wsd_reader.py +93 -0
  120. nltkor/tokenize/__init__.py +62 -0
  121. nltkor/tokenize/ko_tokenize.py +115 -0
  122. nltkor/trans.py +121 -0
  123. nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
  124. nltkor-1.2.14.dist-info/METADATA +41 -0
  125. nltkor-1.2.14.dist-info/RECORD +127 -0
  126. nltkor-1.2.14.dist-info/WHEEL +5 -0
  127. nltkor-1.2.14.dist-info/top_level.txt +1 -0
nltkor/tag/libs/reader.py
@@ -0,0 +1,485 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ """
+ Base class for reading NLP tagging data.
+ """
+
+ import os
+ import sys
+ import re
+ import abc
+ import logging
+ import numpy as np
+ import chardet
+ import _pickle
+ from collections import Counter
+
+ from . import attributes
+ from . import metadata
+ from . import config
+ from .word_dictionary import WordDictionary
+ from .attributes import get_capitalization
+ from .utils import PickleConverter
+
+ class FileNotFoundException(IOError):
+     """
+     Dummy class indicating that a file was not found, used instead of
+     the broad IOError.
+     """
+     pass
+
+ pickle_converter = PickleConverter()
+
+ def load_tag_dict(filename):
+     """
+     Load a tag dictionary from a file containing one tag
+     per line.
+     """
+     tag_dict = {}
+     with open(filename, 'rb') as f:
+         raw_data = f.read(1024)
+         detected = chardet.detect(raw_data).get('encoding') or 'utf-8'
+     with open(filename, 'rt', encoding=detected) as f:
+         code = 0
+         for tag in f:
+             tag = tag.strip()
+             if tag:
+                 tag_dict[tag] = code
+                 code += 1
+
+     return tag_dict
+
+ def load_morph_lexicon(filename):
+     """Load the pickled morph lexicon, converting it first if the file does not exist yet."""
+     if not os.path.exists(filename):
+         pickle_converter.convert_morph_lexicon(filename)
+
+     with open(filename, 'rb') as f:
+         return _pickle.load(f)
+
+
+ def save_tag_dict(filename, tag_dict):
+     """
+     Save the given tag dictionary to the given file. The dictionary
+     is saved with one tag per line, in the order of their codes.
+     """
+     ordered_keys = sorted(tag_dict, key=tag_dict.get)
+     text = '\n'.join(ordered_keys)
+     with open(filename, 'wt') as f:
+         f.write(text)
+
+
+ class TaggerReader(object):
+     """
+     Abstract class extending TextReader with useful functions
+     for tagging tasks.
+     """
+     __metaclass__ = abc.ABCMeta
+
+     def __init__(self, md=None, load_dictionaries=True):
+         '''
+         This class shouldn't be used directly. The constructor only
+         provides method calls for subclasses. Subclasses should call
+         this constructor after initializing the `task` attribute.
+         '''
+         self._set_metadata(md)
+         self.codified = False
+         self._converter = None
+
+         if load_dictionaries:
+             self.load_or_create_dictionary()  # vocabulary
+             self.load_or_create_tag_dict()    # tagset
+             if self.task == 'pos':
+                 self.load_co_lexicon()        # complicated morph lexicon
+                 self.load_morph_lexicon()     # user morph lexicon
+                 self.load_prob_dict()         # probability data
+
+     @abc.abstractmethod
+     def task(self):
+         """
+         The task the tagger reads data for.
+         Must be defined in subclasses.
+         """
+         return None
+
+     def load_or_create_dictionary(self):
+         """
+         Try to load the vocabulary from the default location. If the vocabulary
+         file is not available, create a new one from the sentences available
+         and save it.
+         """
+         try:
+             self.load_dictionary()
+         except FileNotFoundException:
+             self.generate_dictionary(minimum_occurrences=1)
+             #self.generate_dictionary(minimum_occurrences=2)
+             self.save_dictionary()
+
+     def load_or_create_tag_dict(self):
+         """
+         Try to load the tag dictionary from the default location. If the dictionary
+         file is not available, scan the available sentences and create a new one.
+         """
+         key = '%s_tag_dict' % self.task
+         filename = self.md.paths[key]
+         if os.path.isfile(filename):
+             self.load_tag_dict(filename)
+             return
+
+         tags = {tag for sent in self.sentences for _, tag in sent}
+         self.tag_dict = {tag: code for code, tag in enumerate(tags)}
+         self.save_tag_dict(filename)
+
+     def load_morph_lexicon(self):
+         """
+         Try to load the morph lexicon from the default location.
+         """
+         logger = logging.getLogger("Logger")
+
+         key = '%s_morph_lexicon' % self.task
+         filename = self.md.paths[key]
+         if os.path.isfile(filename):
+             print(filename)
+             self.load_morph_lexicon(filename)
+             return
+         else:
+             logger.info("Cannot find %s" % filename)
+
+     def load_co_lexicon(self):
+         """
+         Try to load the morph pattern lexicon from the default location.
+         """
+         logger = logging.getLogger("Logger")
+
+         key = '%s_co_lexicon' % self.task
+         filename = self.md.paths[key]
+         if os.path.isfile(filename):
+             self.load_co_lexicon(filename)
+             return
+         else:
+             logger.info("Cannot find %s" % filename)
+
+     def load_prob_dict(self):
+         """
+         Try to load the probability dictionary from the default location.
+         """
+         logger = logging.getLogger("Logger")
+
+         key = '%s_prob_dict' % self.task
+         filename = self.md.paths[key]
+         if os.path.isfile(filename):
+             self.load_prob_dict(filename)
+             return
+         else:
+             logger.info("Cannot find %s" % filename)
+
+
+
+     def generate_dictionary(self, dict_size=None, minimum_occurrences=1):
+         """
+         Generates a token dictionary based on the given sentences.
+
+         :param dict_size: Max number of tokens to be included in the dictionary.
+         :param minimum_occurrences: Minimum number of times that a token must
+             appear in the text in order to be included in the dictionary.
+         """
+         logger = logging.getLogger("Logger")
+
+         tokens = [token for sent in self.sentences for token, _ in sent]
+         self.word_dict = WordDictionary(tokens, dict_size, minimum_occurrences)
+         logger.info("Created dictionary with %d types" % self.word_dict.num_tokens)
+
+     def get_inverse_tag_dictionary(self):
+         """
+         Returns a version of the tag dictionary that maps numbers to tags.
+         Used for consulting the meaning of the network's output.
+         """
+         tuples = [(x[1], x[0]) for x in self.tag_dict.items()]
+         ret = dict(tuples)
+
+         return ret
+
+     def codify_sentence(self, sentence):
+         """
+         Converts a given sentence into the indices used by the neural network.
+
+         :param sentence: a sequence of tokens, already tokenized
+         """
+         if self._converter is None:
+             self.create_converter()
+         return np.array([self.converter.convert(t) for t in sentence])
+
+     def codify_sentences(self):
+         """
+         Converts each token in each sequence into indices to their feature vectors
+         in feature matrices. The previous sentences as text are not accessible anymore.
+         """
+         logger = logging.getLogger("Logger")
+         logger.info("data structuring.")
+         if self._converter is None:
+             self.create_converter()
+
+         new_sentences = []
+         self.tags = []
+         rare_tag_value = self.tag_dict.get(self.rare_tag)
+
+         for sent in self.sentences:
+             new_sent = []
+             sentence_tags = []
+
+             for token, tag in sent:
+                 new_token = self.converter.convert(token)
+                 new_sent.append(new_token)
+                 sentence_tags.append(self.tag_dict.get(tag, rare_tag_value))
+
+             new_sentences.append(np.array(new_sent))
+             self.tags.append(np.array(sentence_tags))
+
+         self.sentences = new_sentences
+         self.codified = True
+
+     def get_word_counter(self):
+         """
+         Returns a Counter object with word type occurrences.
+         """
+         c = Counter(token.lower() for sent in self.sentences for token, _ in sent)
+         return c
+
+     def get_tag_counter(self):
+         """
+         Returns a Counter object with tag occurrences.
+         """
+         c = Counter(tag for sent in self.sentences for _, tag in sent)
+         return c
+
+     def save_tag_dict(self, filename=None, tag_dict=None):
+         """
+         Saves a tag dictionary to a file as a list of tags.
+
+         :param tag_dict: the dictionary to save. If None, the default
+             tag_dict for the class will be saved.
+         :param filename: the file where the dictionary should be saved.
+             If None, the class default tag_dict filename will be used.
+         """
+         if tag_dict is None:
+             tag_dict = self.tag_dict
+         if filename is None:
+             key = '%s_tag_dict' % self.task
+             filename = self.md.paths[key]
+
+         save_tag_dict(filename, tag_dict)
+
+     def load_tag_dict(self, filename=None):
+         """
+         Load the tag dictionary from the default file and assign
+         it to the tag_dict attribute.
+         """
+         if filename is None:
+             key = '%s_tag_dict' % self.task
+             filename = self.md.paths[key]
+
+         self.tag_dict = load_tag_dict(filename)
+
+     def get_os_filename(self, filename):
+         # Return a platform-specific variant of the path: name-win/-mac/-linux + extension.
+         name, ext = os.path.splitext(filename)
+         if os.name == "nt":
+             return f"{name}-win{ext}"
+         elif sys.platform == "darwin":
+             return f"{name}-mac{ext}"
+         elif sys.platform.startswith("linux"):
+             return f"{name}-linux{ext}"
+         return filename
+
+     def load_morph_lexicon(self, filename=None):
+         """
+         Load the morph lexicon from the default file and assign
+         it to the morph_lexicon attribute.
+         """
+         if filename is None:
+             key = '%s_morph_lexicon' % self.task
+             filename = self.md.paths[key]
+
+         self.morph_lexicon = load_morph_lexicon(filename)
+
+     def load_co_lexicon(self, filename=None):
+         """
+         Load the complicated-morph (co) lexicon from the default file and assign
+         it to the co_lexicon attribute.
+         """
+         if filename is None:
+             key = '%s_co_lexicon' % self.task
+             filename = self.get_os_filename(self.md.paths[key])
+
+         if not os.path.exists(filename):
+             raise FileNotFoundError(f"{filename}")
+
+         with open(filename, 'rb') as f:
+             self.co_lexicon = _pickle.load(f)
+
+     def load_prob_dict(self, filename=None):
+         """
+         Load the probability dictionary from the default file and assign
+         it to the prob_dict attribute.
+         """
+         if filename is None:
+             key = '%s_prob_dict' % self.task
+             filename = self.get_os_filename(self.md.paths[key])
+
+         if not os.path.exists(filename):
+             raise FileNotFoundError(f"{filename}")
+
+         with open(filename, 'rb') as f:
+             self.prob_dict = _pickle.load(f)
+
+
+     def _set_metadata(self, md):
+         if md is None:
+             # metadata not provided: use the global data directory files
+             self.md = metadata.Metadata(self.task, config.FILES)
+         else:
+             self.md = md
+
+     def add_text(self, text):
+         """
+         Adds more text to the reader. The text must be a sequence of sequences of
+         tokens.
+         """
+         self.sentences.extend(text)
+
+     def load_dictionary(self):
+         """Read a file with a word list and create a dictionary."""
+         logger = logging.getLogger("Logger")
+         logger.info("Loading vocabulary")
+
+         # try to load the vocabulary specific to the task
+         key = 'vocabulary_%s' % self.task
+         filename = self.md.paths[key]
+         if not os.path.isfile(filename):
+             # fall back to the generic vocabulary
+             filename = self.md.paths['vocabulary']
+             if not os.path.isfile(filename):
+                 raise FileNotFoundException()
+
+         words = []
+         with open(filename, 'rb') as f:
+             raw_data = f.read(1024)
+             detected = chardet.detect(raw_data).get('encoding') or 'utf-8'
+         with open(filename, 'rt', encoding=detected) as f:
+             for word in f:
+                 #word = unicode(word, 'utf-8').strip()
+                 word = word.strip()
+                 if word:
+                     words.append(word)
+
+         wd = WordDictionary.init_from_wordlist(words)
+         self.word_dict = wd
+         logger.info("Done. Dictionary size is %d types" % wd.num_tokens)
+
+     def save_dictionary(self, filename=None):
+         """
+         Saves the reader's word dictionary as a list of words.
+
+         :param filename: path to the file to save the dictionary.
+             If not given, it will be saved in the default nlpnet
+             data directory.
+         """
+         logger = logging.getLogger("Logger")
+         if filename is None:
+             key = 'vocabulary_%s' % self.task
+             filename = self.md.paths[key]
+
+         self.word_dict.save(filename)
+         logger.info("Dictionary saved in %s" % filename)
+
+     def create_affix_list(self, prefix_or_suffix, max_size, min_occurrences):
+         """
+         Handle the creation of suffix and prefix lists.
+
+         Check if there exists an affix list in the data directory. If there isn't,
+         create a new one based on the training sentences.
+
+         :param prefix_or_suffix: string 'prefix' or 'suffix'
+         """
+         affix_type = prefix_or_suffix.lower()
+         assert affix_type == 'suffix' or affix_type == 'prefix'
+
+         filename = self.md.paths['%ses' % affix_type]
+         if os.path.isfile(filename):
+             return
+
+         logger = logging.getLogger("Logger")
+         affixes_all_lengths = []
+
+         # only get the affix size n from words with length at least (n+1)
+         types = {re.sub(r'\d', '9', token.lower())
+                  for sent in self.sentences for token, _ in sent}
+
+         for length in range(1, max_size + 1):
+             if affix_type == 'suffix':
+                 c = Counter(type_[-length:]
+                             for type_ in types
+                             if len(type_) > length)
+             else:
+                 c = Counter(type_[:length]
+                             for type_ in types
+                             if len(type_) > length)
+             affixes_this_length = [affix for affix in c
+                                    if c[affix] >= min_occurrences]
+             affixes_all_lengths.extend(affixes_this_length)
+
+         logger.info('Created a list of %d %ses.' % (len(affixes_all_lengths), affix_type))
+         text = '\n'.join(affixes_all_lengths)
+         with open(filename, 'wt') as f:
+             f.write(text)
+
+     @property
+     def converter(self):
+         """
+         Return the token converter, which transforms tokens into their feature
+         vector indices. If it doesn't exist, one is created.
+         """
+         if self._converter is None:
+             self.create_converter()
+
+         return self._converter
+
+     @converter.setter
+     def converter(self, value):
+         self._converter = value
+
+     def create_converter(self):
+         """
+         Sets up the token converter, which is responsible for transforming tokens into
+         their feature vector indices.
+         """
+         def add_affix_extractors(affix):
+             """
+             Helper function that works for both suffixes and prefixes.
+             The parameter affix should be 'suffix' or 'prefix'.
+             """
+             loader_function = getattr(attributes.Affix, 'load_%ses' % affix)
+             loader_function(self.md)
+
+             # deal with gaps between sizes (i.e., if there are sizes 2, 3, and 5)
+             codes = getattr(attributes.Affix, '%s_codes' % affix)
+             sizes = sorted(codes)
+
+             getter = getattr(attributes.Affix, 'get_%s' % affix)
+             for size in sizes:
+
+                 # size=size because if we don't use it, the lambda sticks to the last
+                 # value of the loop iterator size
+                 def f(word, size=size):
+                     return getter(re.sub(r'\d', '9', word), size)
+
+                 self.converter.add_extractor(f)
+
+         self._converter = attributes.TokenConverter()
+         self.converter.add_extractor(self.word_dict.get)
+         if self.md.use_caps:
+             self.converter.add_extractor(get_capitalization)
+         if self.md.use_prefix:
+             add_affix_extractors('prefix')
+         if self.md.use_suffix:
+             add_affix_extractors('suffix')
nltkor/tag/libs/srl/__init__.py
@@ -0,0 +1,3 @@
+
+ from .train_srl import *
+ from .srl_reader import SRLReader
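
For orientation, the short sketch below mirrors the one-tag-per-line format that the load_tag_dict and save_tag_dict helpers in nltkor/tag/libs/reader.py read and write. It is a minimal, self-contained illustration under stated assumptions: the sample tags and the temporary path are invented, and it re-implements the round trip inline rather than importing the installed package.

import os
import tempfile

# Invented tag -> code mapping (Sejong-style POS tags), for illustration only.
tags = {'JKS': 0, 'NNG': 1, 'VV': 2}

with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, 'pos_tag_dict.txt')

    # save_tag_dict writes one tag per line, ordered by the tags' integer codes.
    with open(path, 'wt', encoding='utf-8') as f:
        f.write('\n'.join(sorted(tags, key=tags.get)))

    # load_tag_dict assigns codes by line order (skipping blank lines),
    # so the round trip reproduces the original mapping.
    reloaded, code = {}, 0
    with open(path, 'rt', encoding='utf-8') as f:
        for line in f:
            tag = line.strip()
            if tag:
                reloaded[tag] = code
                code += 1

    assert reloaded == tags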