nltkor-1.2.14-cp311-cp311-macosx_13_0_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nltkor/Kor_char.py +193 -0
- nltkor/__init__.py +16 -0
- nltkor/alignment/__init__.py +1315 -0
- nltkor/cider/__init__.py +2 -0
- nltkor/cider/cider.py +55 -0
- nltkor/cider/cider_scorer.py +207 -0
- nltkor/distance/__init__.py +441 -0
- nltkor/distance/wasserstein.py +126 -0
- nltkor/etc.py +22 -0
- nltkor/lazyimport.py +144 -0
- nltkor/make_requirement.py +11 -0
- nltkor/metrics/__init__.py +63 -0
- nltkor/metrics/bartscore.py +301 -0
- nltkor/metrics/bertscore.py +331 -0
- nltkor/metrics/bleu_tensor.py +20 -0
- nltkor/metrics/classical.py +847 -0
- nltkor/metrics/entment.py +24 -0
- nltkor/metrics/eval.py +517 -0
- nltkor/metrics/mauve.py +273 -0
- nltkor/metrics/mauve_utils.py +131 -0
- nltkor/misc/__init__.py +11 -0
- nltkor/misc/string2string_basic_functions.py +59 -0
- nltkor/misc/string2string_default_tokenizer.py +83 -0
- nltkor/misc/string2string_hash_functions.py +159 -0
- nltkor/misc/string2string_word_embeddings.py +503 -0
- nltkor/search/__init__.py +10 -0
- nltkor/search/classical.py +569 -0
- nltkor/search/faiss_search.py +787 -0
- nltkor/search/kobert_tokenizer.py +181 -0
- nltkor/sejong/__init__.py +3 -0
- nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
- nltkor/sejong/ch.py +12 -0
- nltkor/sejong/dict_semClassNum.txt +491 -0
- nltkor/sejong/layer.txt +630 -0
- nltkor/sejong/sejong_download.py +87 -0
- nltkor/sejong/ssem.py +684 -0
- nltkor/similarity/__init__.py +3 -0
- nltkor/similarity/bartscore____.py +337 -0
- nltkor/similarity/bertscore____.py +339 -0
- nltkor/similarity/classical.py +245 -0
- nltkor/similarity/cosine_similarity.py +175 -0
- nltkor/tag/__init__.py +71 -0
- nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
- nltkor/tag/espresso_tag.py +220 -0
- nltkor/tag/libs/__init__.py +10 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
- nltkor/tag/libs/arguments.py +280 -0
- nltkor/tag/libs/attributes.py +231 -0
- nltkor/tag/libs/config.py +159 -0
- nltkor/tag/libs/metadata.py +129 -0
- nltkor/tag/libs/ner/__init__.py +2 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/macmorphoreader.py +7 -0
- nltkor/tag/libs/ner/ner_reader.py +92 -0
- nltkor/tag/libs/network.c +72325 -0
- nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
- nltkor/tag/libs/network.pyx +878 -0
- nltkor/tag/libs/networkconv.pyx +1028 -0
- nltkor/tag/libs/networkdependencyconv.pyx +451 -0
- nltkor/tag/libs/parse/__init__.py +1 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/parse_reader.py +283 -0
- nltkor/tag/libs/pos/__init__.py +2 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/macmorphoreader.py +7 -0
- nltkor/tag/libs/pos/pos_reader.py +97 -0
- nltkor/tag/libs/reader.py +485 -0
- nltkor/tag/libs/srl/__init__.py +3 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__srl_reader_.py +535 -0
- nltkor/tag/libs/srl/srl_reader.py +436 -0
- nltkor/tag/libs/srl/train_srl.py +87 -0
- nltkor/tag/libs/taggers.py +926 -0
- nltkor/tag/libs/utils.py +384 -0
- nltkor/tag/libs/word_dictionary.py +239 -0
- nltkor/tag/libs/wsd/__init__.py +2 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
- nltkor/tag/libs/wsd/wsd_reader.py +93 -0
- nltkor/tokenize/__init__.py +62 -0
- nltkor/tokenize/ko_tokenize.py +115 -0
- nltkor/trans.py +121 -0
- nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
- nltkor-1.2.14.dist-info/METADATA +41 -0
- nltkor-1.2.14.dist-info/RECORD +127 -0
- nltkor-1.2.14.dist-info/WHEEL +5 -0
- nltkor-1.2.14.dist-info/top_level.txt +1 -0
nltkor/tag/libs/reader.py
@@ -0,0 +1,485 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Base class for reading NLP tagging data.
"""

import os
import sys
import re
import abc
import logging
import numpy as np
import chardet
import _pickle
from collections import Counter

from . import attributes
from . import metadata
from . import config
from .word_dictionary import WordDictionary
from .attributes import get_capitalization
from .utils import PickleConverter


class FileNotFoundException(IOError):
    """
    Dummy class for indicating a missing file instead of
    the broad IOError.
    """
    pass


pickle_converter = PickleConverter()


def load_tag_dict(filename):
    """
    Load a tag dictionary from a file containing one tag
    per line. Codes are assigned in line order.
    """
    tag_dict = {}
    # Sniff the encoding from the first 1 KB; detect() can report None,
    # so fall back to utf-8 explicitly.
    with open(filename, 'rb') as f:
        raw_data = f.read(1024)
        detected = chardet.detect(raw_data).get('encoding') or 'utf-8'
    with open(filename, 'rt', encoding=detected) as f:
        code = 0
        for tag in f:
            tag = tag.strip()
            if tag:
                tag_dict[tag] = code
                code += 1

    return tag_dict


def load_morph_lexicon(filename):
    """
    Load the pickled morph lexicon, building the pickle first
    (via PickleConverter) if it does not exist yet.
    """
    if not os.path.exists(filename):
        pickle_converter.convert_morph_lexicon(filename)

    with open(filename, 'rb') as f:
        return _pickle.load(f)


def save_tag_dict(filename, tag_dict):
    """
    Save the given tag dictionary to the given file. Dictionary
    is saved with one tag per line, in the order of their codes.
    """
    # Sort by code only; re-sorting alphabetically here would break the
    # code order promised above and desynchronize reloaded dictionaries.
    ordered_keys = sorted(tag_dict, key=tag_dict.get)
    text = '\n'.join(ordered_keys)
    with open(filename, 'wt') as f:
        f.write(text)
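
For illustration, here is a minimal round-trip sketch of the one-tag-per-line format handled by load_tag_dict and save_tag_dict above (the file name and tags are hypothetical, not shipped with the package):

# Hypothetical POS tags written in the on-disk format: one tag per line.
tags = ['NNG', 'NNP', 'VV', 'JKS']
with open('pos_tags.txt', 'wt', encoding='utf-8') as f:
    f.write('\n'.join(tags))

tag_dict = load_tag_dict('pos_tags.txt')
# {'NNG': 0, 'NNP': 1, 'VV': 2, 'JKS': 3} -- codes follow line order
assert tag_dict['VV'] == 2

save_tag_dict('pos_tags_copy.txt', tag_dict)  # written back in code order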


class TaggerReader(object):
    """
    Abstract class extending TextReader with useful functions
    for tagging tasks.
    """
    # NOTE: this assignment is inert in Python 3; enforcing abstractness
    # would require `class TaggerReader(metaclass=abc.ABCMeta)`.
    __metaclass__ = abc.ABCMeta

    def __init__(self, md=None, load_dictionaries=True):
        '''
        This class shouldn't be used directly. The constructor only
        provides method calls for subclasses. Subclasses should call
        this constructor after initializing the `task` attribute.
        '''
        self._set_metadata(md)
        self.codified = False
        self._converter = None

        if load_dictionaries:
            self.load_or_create_dictionary()   # vocabulary
            self.load_or_create_tag_dict()     # tagset
            if self.task == 'pos':
                self.load_co_lexicon()         # complicated morph lexicon
                self.load_morph_lexicon()      # user morph lexicon
                self.load_prob_dict()          # probability data

    @abc.abstractmethod
    def task(self):
        """
        The task the tagger reads data for.
        Must be defined in subclasses.
        """
        return None

    def load_or_create_dictionary(self):
        """
        Try to load the vocabulary from the default location. If the vocabulary
        file is not available, create a new one from the available sentences
        and save it.
        """
        try:
            self.load_dictionary()
        except FileNotFoundException:
            self.generate_dictionary(minimum_occurrences=1)
            # self.generate_dictionary(minimum_occurrences=2)
            self.save_dictionary()

    def load_or_create_tag_dict(self):
        """
        Try to load the tag dictionary from the default location. If the dictionary
        file is not available, scan the available sentences and create a new one.
        """
        key = '%s_tag_dict' % self.task
        filename = self.md.paths[key]
        if os.path.isfile(filename):
            self.load_tag_dict(filename)
            return

        tags = {tag for sent in self.sentences for _, tag in sent}
        self.tag_dict = {tag: code for code, tag in enumerate(tags)}
        self.save_tag_dict(filename)

    # NOTE: the three argument-less loaders below are shadowed by the
    # `filename=None` definitions later in this class and are never called.
    def load_morph_lexicon(self):
        """
        Try to load the morph lexicon from the default location.
        """
        logger = logging.getLogger("Logger")

        key = '%s_morph_lexicon' % self.task
        filename = self.md.paths[key]
        if os.path.isfile(filename):
            self.load_morph_lexicon(filename)
            return
        else:
            logger.info("Cannot find %s" % filename)

    def load_co_lexicon(self):
        """
        Try to load the morph pattern lexicon from the default location.
        """
        logger = logging.getLogger("Logger")

        key = '%s_co_lexicon' % self.task
        filename = self.md.paths[key]
        if os.path.isfile(filename):
            self.load_co_lexicon(filename)
            return
        else:
            logger.info("Cannot find %s" % filename)

    def load_prob_dict(self):
        """
        Try to load the probability data from the default location.
        """
        logger = logging.getLogger("Logger")

        key = '%s_prob_dict' % self.task
        filename = self.md.paths[key]
        if os.path.isfile(filename):
            self.load_prob_dict(filename)
            return
        else:
            logger.info("Cannot find %s" % filename)

    def generate_dictionary(self, dict_size=None, minimum_occurrences=1):
        """
        Generates a token dictionary based on the given sentences.

        :param dict_size: Max number of tokens to be included in the dictionary.
        :param minimum_occurrences: Minimum number of times that a token must
            appear in the text in order to be included in the dictionary.
        """
        logger = logging.getLogger("Logger")

        tokens = [token for sent in self.sentences for token, _ in sent]
        self.word_dict = WordDictionary(tokens, dict_size, minimum_occurrences)
        logger.info("Created dictionary with %d types" % self.word_dict.num_tokens)

    def get_inverse_tag_dictionary(self):
        """
        Returns a version of the tag dictionary that maps numbers to tags.
        Used for consulting the meaning of the network's output.
        """
        tuples = [(x[1], x[0]) for x in self.tag_dict.items()]
        ret = dict(tuples)

        return ret

    def codify_sentence(self, sentence):
        """
        Converts a given sentence into the indices used by the neural network.

        :param sentence: a sequence of tokens, already tokenized
        """
        if self._converter is None:
            self.create_converter()
        return np.array([self.converter.convert(t) for t in sentence])

    def codify_sentences(self):
        """
        Converts each token in each sequence into indices to their feature vectors
        in feature matrices. The previous sentences as text are not accessible
        anymore afterwards.
        """
        logger = logging.getLogger("Logger")
        logger.info("data structuring.")
        if self._converter is None:
            self.create_converter()

        new_sentences = []
        self.tags = []
        rare_tag_value = self.tag_dict.get(self.rare_tag)

        for sent in self.sentences:
            new_sent = []
            sentence_tags = []

            for token, tag in sent:
                new_token = self.converter.convert(token)
                new_sent.append(new_token)
                sentence_tags.append(self.tag_dict.get(tag, rare_tag_value))

            new_sentences.append(np.array(new_sent))
            self.tags.append(np.array(sentence_tags))

        self.sentences = new_sentences
        self.codified = True
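
The codify step is easiest to see without the package's converter machinery. Below is a self-contained analogue of the data flow (plain dicts stand in for WordDictionary and TokenConverter, and the convert step is collapsed to a single word-index lookup; this is a sketch, not the actual API):

import numpy as np

# Hypothetical stand-ins; the real mappings are loaded from data files.
word_dict = {'나는': 0, '학교에': 1, '간다': 2}
tag_dict = {'NP': 0, 'NNG': 1, 'VV': 2, 'RARE': 3}
rare_tag_value = tag_dict['RARE']

sentences = [[('나는', 'NP'), ('학교에', 'NNG'), ('간다', 'VV')]]

codified, tags = [], []
for sent in sentences:
    codified.append(np.array([word_dict[token] for token, _ in sent]))
    tags.append(np.array([tag_dict.get(tag, rare_tag_value) for _, tag in sent]))

print(codified)  # [array([0, 1, 2])]
print(tags)      # [array([0, 1, 2])]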

    def get_word_counter(self):
        """
        Returns a Counter object with word type occurrences.
        """
        c = Counter(token.lower() for sent in self.sentences for token, _ in sent)
        return c

    def get_tag_counter(self):
        """
        Returns a Counter object with tag occurrences.
        """
        c = Counter(tag for sent in self.sentences for _, tag in sent)
        return c

    def save_tag_dict(self, filename=None, tag_dict=None):
        """
        Saves a tag dictionary to a file as a list of tags.

        :param tag_dict: the dictionary to save. If None, the default
            tag_dict for the class will be saved.
        :param filename: the file where the dictionary should be saved.
            If None, the class default tag_dict filename will be used.
        """
        if tag_dict is None:
            tag_dict = self.tag_dict
        if filename is None:
            key = '%s_tag_dict' % self.task
            filename = self.md.paths[key]

        save_tag_dict(filename, tag_dict)

    def load_tag_dict(self, filename=None):
        """
        Load the tag dictionary from the default file and assign
        it to the tag_dict attribute.
        """
        if filename is None:
            key = '%s_tag_dict' % self.task
            filename = self.md.paths[key]

        self.tag_dict = load_tag_dict(filename)

    def get_os_filename(self, filename):
        name, ext = os.path.splitext(filename)
        if os.name == "nt":
            return f"{name}-win{ext}"
        elif sys.platform == "darwin":
            return f"{name}-mac{ext}"
        elif sys.platform.startswith("linux"):
            return f"{name}-linux{ext}"
        return filename
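
get_os_filename resolves the platform-specific variants of the pickled co-lexicon and probability files. A standalone replica of the naming scheme (the path below is hypothetical):

import os
import sys

def os_specific(filename):
    # Replica of TaggerReader.get_os_filename, for illustration only.
    name, ext = os.path.splitext(filename)
    if os.name == "nt":
        return f"{name}-win{ext}"
    if sys.platform == "darwin":
        return f"{name}-mac{ext}"
    if sys.platform.startswith("linux"):
        return f"{name}-linux{ext}"
    return filename

# On macOS this prints 'data/pos_co_lexicon-mac.pickle';
# on Windows the suffix is '-win', on Linux '-linux', otherwise unchanged.
print(os_specific('data/pos_co_lexicon.pickle'))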

    def load_morph_lexicon(self, filename=None):
        """
        Load the morph lexicon from the default file and assign
        it to the morph_lexicon attribute.
        """
        if filename is None:
            key = '%s_morph_lexicon' % self.task
            filename = self.md.paths[key]

        self.morph_lexicon = load_morph_lexicon(filename)

    def load_co_lexicon(self, filename=None):
        """
        Load the morph pattern lexicon from its platform-specific default
        file and assign it to the co_lexicon attribute.
        """
        if filename is None:
            key = '%s_co_lexicon' % self.task
            filename = self.get_os_filename(self.md.paths[key])

        if not os.path.exists(filename):
            raise FileNotFoundError(f"{filename}")

        with open(filename, 'rb') as f:
            self.co_lexicon = _pickle.load(f)

    def load_prob_dict(self, filename=None):
        """
        Load the probability data from its platform-specific default
        file and assign it to the prob_dict attribute.
        """
        if filename is None:
            key = '%s_prob_dict' % self.task
            filename = self.get_os_filename(self.md.paths[key])

        if not os.path.exists(filename):
            raise FileNotFoundError(f"{filename}")

        with open(filename, 'rb') as f:
            self.prob_dict = _pickle.load(f)

    def _set_metadata(self, md):
        if md is None:
            # metadata not provided: use the global data_dir for files
            self.md = metadata.Metadata(self.task, config.FILES)
        else:
            self.md = md

    def add_text(self, text):
        """
        Adds more text to the reader. The text must be a sequence of sequences
        of tokens.
        """
        self.sentences.extend(text)

    def load_dictionary(self):
        """Read a file with a word list and create a dictionary."""
        logger = logging.getLogger("Logger")
        logger.info("Loading vocabulary")

        # try to load a vocabulary specific to the task
        key = 'vocabulary_%s' % self.task
        filename = self.md.paths[key]
        if not os.path.isfile(filename):
            # fall back to the generic vocabulary
            filename = self.md.paths['vocabulary']
            if not os.path.isfile(filename):
                raise FileNotFoundException()

        words = []
        # Same encoding sniffing as load_tag_dict above.
        with open(filename, 'rb') as f:
            raw_data = f.read(1024)
            detected = chardet.detect(raw_data).get('encoding') or 'utf-8'
        with open(filename, 'rt', encoding=detected) as f:
            for word in f:
                word = word.strip()
                if word:
                    words.append(word)

        wd = WordDictionary.init_from_wordlist(words)
        self.word_dict = wd
        logger.info("Done. Dictionary size is %d types" % wd.num_tokens)

    def save_dictionary(self, filename=None):
        """
        Saves the reader's word dictionary as a list of words.

        :param filename: path to the file to save the dictionary.
            If not given, it will be saved in the default nlpnet
            data directory.
        """
        logger = logging.getLogger("Logger")
        if filename is None:
            key = 'vocabulary_%s' % self.task
            filename = self.md.paths[key]

        self.word_dict.save(filename)
        logger.info("Dictionary saved in %s" % filename)

    def create_affix_list(self, prefix_or_suffix, max_size, min_occurrences):
        """
        Handle the creation of suffix and prefix lists.

        Check if there is already an affix list in the data directory. If
        there isn't, create a new one based on the training sentences.

        :param prefix_or_suffix: string 'prefix' or 'suffix'
        """
        affix_type = prefix_or_suffix.lower()
        assert affix_type in ('suffix', 'prefix')

        filename = self.md.paths['%ses' % affix_type]
        if os.path.isfile(filename):
            return

        logger = logging.getLogger("Logger")
        affixes_all_lengths = []

        # only take affixes of size n from words with length at least (n+1)
        types = {re.sub(r'\d', '9', token.lower())
                 for sent in self.sentences for token, _ in sent}

        for length in range(1, max_size + 1):
            if affix_type == 'suffix':
                c = Counter(type_[-length:]
                            for type_ in types
                            if len(type_) > length)
            else:
                c = Counter(type_[:length]
                            for type_ in types
                            if len(type_) > length)
            affixes_this_length = [affix for affix in c
                                   if c[affix] >= min_occurrences]
            affixes_all_lengths.extend(affixes_this_length)

        logger.info('Created a list of %d %ses.' % (len(affixes_all_lengths), affix_type))
        text = '\n'.join(affixes_all_lengths)
        with open(filename, 'wt') as f:
            f.write(text)
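
The affix lists are built from word types with digits normalized to '9', counting each affix of length n only over types longer than n. A toy sketch of the suffix-counting step (word list and threshold are made up):

import re
from collections import Counter

tokens = ['간다', '온다', '먹는다', '학교', '학생', '2024년']
types = {re.sub(r'\d', '9', t.lower()) for t in tokens}  # digits -> '9'

length, min_occurrences = 1, 2
c = Counter(t[-length:] for t in types if len(t) > length)
print([s for s in c if c[s] >= min_occurrences])  # ['다'] (appears 3 times)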

    @property
    def converter(self):
        """
        Return the token converter, which transforms tokens into their feature
        vector indices. If it doesn't exist, one is created.
        """
        if self._converter is None:
            self.create_converter()

        return self._converter

    @converter.setter
    def converter(self, value):
        self._converter = value

    def create_converter(self):
        """
        Sets up the token converter, which is responsible for transforming
        tokens into their feature vector indices.
        """
        def add_affix_extractors(affix):
            """
            Helper function that works for both suffixes and prefixes.
            The parameter affix should be 'suffix' or 'prefix'.
            """
            loader_function = getattr(attributes.Affix, 'load_%ses' % affix)
            loader_function(self.md)

            # deal with gaps between sizes (i.e., if there are sizes 2, 3, and 5)
            codes = getattr(attributes.Affix, '%s_codes' % affix)
            sizes = sorted(codes)

            getter = getattr(attributes.Affix, 'get_%s' % affix)
            for size in sizes:
                # size=size: without the default argument, every closure
                # would see only the final value of the loop variable
                def f(word, size=size):
                    return getter(re.sub(r'\d', '9', word), size)

                self.converter.add_extractor(f)

        self._converter = attributes.TokenConverter()
        self.converter.add_extractor(self.word_dict.get)
        if self.md.use_caps:
            self.converter.add_extractor(get_capitalization)
        if self.md.use_prefix:
            add_affix_extractors('prefix')
        if self.md.use_suffix:
            add_affix_extractors('suffix')