nltkor-1.2.0-cp39-cp39-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. nltkor/Kor_char.py +193 -0
  2. nltkor/__init__.py +15 -0
  3. nltkor/alignment/__init__.py +1315 -0
  4. nltkor/cider/__init__.py +2 -0
  5. nltkor/cider/cider.py +55 -0
  6. nltkor/cider/cider_scorer.py +207 -0
  7. nltkor/distance/__init__.py +441 -0
  8. nltkor/distance/wasserstein.py +126 -0
  9. nltkor/etc.py +22 -0
  10. nltkor/lazyimport.py +144 -0
  11. nltkor/make_requirement.py +11 -0
  12. nltkor/metrics/__init__.py +63 -0
  13. nltkor/metrics/bartscore.py +301 -0
  14. nltkor/metrics/bertscore.py +331 -0
  15. nltkor/metrics/bleu_tensor.py +20 -0
  16. nltkor/metrics/classical.py +814 -0
  17. nltkor/metrics/entment.py +24 -0
  18. nltkor/metrics/eval.py +517 -0
  19. nltkor/metrics/mauve.py +273 -0
  20. nltkor/metrics/mauve_utils.py +131 -0
  21. nltkor/misc/__init__.py +11 -0
  22. nltkor/misc/string2string_basic_functions.py +59 -0
  23. nltkor/misc/string2string_default_tokenizer.py +83 -0
  24. nltkor/misc/string2string_hash_functions.py +159 -0
  25. nltkor/misc/string2string_word_embeddings.py +503 -0
  26. nltkor/search/__init__.py +10 -0
  27. nltkor/search/classical.py +569 -0
  28. nltkor/search/faiss_search.py +467 -0
  29. nltkor/search/kobert_tokenizer.py +181 -0
  30. nltkor/sejong/__init__.py +3 -0
  31. nltkor/sejong/ch.py +12 -0
  32. nltkor/sejong/dict_semClassNum.txt +491 -0
  33. nltkor/sejong/layer.txt +630 -0
  34. nltkor/sejong/sejong_download.py +87 -0
  35. nltkor/sejong/ssem.py +685 -0
  36. nltkor/similarity/__init__.py +3 -0
  37. nltkor/similarity/bartscore____.py +337 -0
  38. nltkor/similarity/bertscore____.py +339 -0
  39. nltkor/similarity/classical.py +245 -0
  40. nltkor/similarity/cosine_similarity.py +175 -0
  41. nltkor/tag/__init__.py +70 -0
  42. nltkor/tag/espresso_tag.py +220 -0
  43. nltkor/tag/libs/__init__.py +9 -0
  44. nltkor/tag/libs/arguments.py +280 -0
  45. nltkor/tag/libs/attributes.py +231 -0
  46. nltkor/tag/libs/config.py +158 -0
  47. nltkor/tag/libs/metadata.py +129 -0
  48. nltkor/tag/libs/ner/__init__.py +2 -0
  49. nltkor/tag/libs/ner/macmorphoreader.py +7 -0
  50. nltkor/tag/libs/ner/ner_reader.py +92 -0
  51. nltkor/tag/libs/network.c +59267 -0
  52. nltkor/tag/libs/network.cpython-39-darwin.so +0 -0
  53. nltkor/tag/libs/parse/__init__.py +1 -0
  54. nltkor/tag/libs/parse/parse_reader.py +283 -0
  55. nltkor/tag/libs/pos/__init__.py +2 -0
  56. nltkor/tag/libs/pos/macmorphoreader.py +7 -0
  57. nltkor/tag/libs/pos/pos_reader.py +89 -0
  58. nltkor/tag/libs/reader.py +510 -0
  59. nltkor/tag/libs/srl/__init__.py +3 -0
  60. nltkor/tag/libs/srl/__srl_reader_.py +535 -0
  61. nltkor/tag/libs/srl/srl_reader.py +436 -0
  62. nltkor/tag/libs/srl/train_srl.py +87 -0
  63. nltkor/tag/libs/taggers.py +926 -0
  64. nltkor/tag/libs/utils.py +344 -0
  65. nltkor/tag/libs/word_dictionary.py +239 -0
  66. nltkor/tag/libs/wsd/__init__.py +2 -0
  67. nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
  68. nltkor/tag/libs/wsd/wsd_reader.py +93 -0
  69. nltkor/tokenize/__init__.py +62 -0
  70. nltkor/tokenize/ko_tokenize.py +115 -0
  71. nltkor/trans.py +121 -0
  72. nltkor-1.2.0.dist-info/LICENSE.txt +1093 -0
  73. nltkor-1.2.0.dist-info/METADATA +33 -0
  74. nltkor-1.2.0.dist-info/RECORD +76 -0
  75. nltkor-1.2.0.dist-info/WHEEL +5 -0
  76. nltkor-1.2.0.dist-info/top_level.txt +1 -0
nltkor/tag/libs/utils.py
@@ -0,0 +1,344 @@
+ # -*- coding: utf-8 -*-
+
+ """
+ Utility functions
+ """
+
+ import re
+ import os, sys
+ import logging
+ import nltk
+ import nltkor
+ #from nltkor import Kor_char
+ from nltkor.tokenize import Ko_tokenize
+ import numpy as np
+
+ #from nltk.tokenize.regexp import RegexpTokenizer
+ #from nltk.tokenize import TreebankWordTokenizer
+ from . import attributes
+
+
+ def get_word_from_morph_lexicon(root, word, tags, space_flag):
+     '''
+     space_flag: if True, spaces are included in the match; otherwise they are not.
+     '''
+
+     values = list()
+     value_data = list()
+     if not word: return root.keys()
+
+     current_dict = root
+     _end = '$$'
+     s = 0
+     for i, letter in enumerate(word):
+         #print(i, '>', letter, current_dict)
+         if letter in current_dict:
+             #print(letter, current_dict[letter])
+             current_dict = current_dict[letter]
+             if _end in current_dict:
+                 for idx in range(i-s): values.pop()
+                 values.append(word[s:i+1])
+                 for idx in range(i-s): value_data.pop()
+                 value_data.append(current_dict[_end])
+             else: values.append(letter); value_data.append(tags[i])
+         else:
+             #print('==', letter, values)
+             if space_flag or letter != ' ':
+                 values.append(letter)  # longest match: -1
+                 value_data.append(tags[i])
+             s = i+1
+             current_dict = root
+     else:
+         if values: return values, value_data
+         else: return list(word), tags
+
+ def intersperse(lst, item):
+     result = [item] * (len(lst) * 2 - 1)
+     result[0::2] = lst
+     return result
+
+ def get_word(root, word, tags, space_flag=False):
+     '''
+     space_flag : True  : match even when a space is present
+                  False : do not match when a space is present
+     '''
+     word_list = get_word_from_morph_lexicon(root, word, tags, space_flag)
+     return word_list
+
+
+
+ def tokenize(text, use_sent_tokenizer=True):
+     """
+     Call the tokenizer function for the given language.
+     The returned tokens are in a list of lists, one for each sentence.
+
+     :param use_sent_tokenizer: True  : use the sentence tokenizer
+                                False : treat each line as a sentence
+     """
+     return tokenize_ko(text, use_sent_tokenizer)
+
+ def tokenize_ko(text, use_sent_tokenizer=True, clean=True):
+     """
+     text: string
+     Return a list of lists of the tokens in text, separated by sentences.
+     """
+     if clean:
+         text = clean_kotext(text)
+
+     if use_sent_tokenizer:
+         ## second argument: False = ignore word spacing, True = keep word spacing
+         sentences = [Ko_tokenize.syllable(sentence, True) for sentence in Ko_tokenize.sentence(text)]
+     else:
+         sentences = [Ko_tokenize.syllable(text, True)]
+
+     return sentences
+
+ def clean_kotext(text, correct=False):
+     """
+     1. Replace special whitespace characters with a plain space.
+     Apply some transformations to the text, such as
+     replacing digits with 9 and simplifying quotation marks
+     (those extra normalizations are currently commented out below).
+
+     :param correct: If True, tries to correct punctuation misspellings.
+     """
+     # replaces different kinds of quotation marks with "
+     # take care not to remove apostrophes
+     '''
+     text = re.sub(r"(?u)(^|\W)[‘’′`']", r'\1"', text)
+     text = re.sub(r"(?u)[‘’`′'](\W|$)", r'"\1', text)
+     text = re.sub(r'(?u)[«»“”]', '"', text)
+
+     if correct:
+         # tries to fix mistyped tokens (common in Wikipedia-pt) as ,, '' ..
+         text = re.sub(r'(?<!\.)\.\.(?!\.)', '.', text)  # take care with ellipses
+         text = re.sub(r'([,";:])\1,', r'\1', text)
+
+         # inserts space after leading hyphen. It happens sometimes in cases like:
+         # blablabla -that is, bloblobloblo
+         text = re.sub(' -(?=[^\W\d_])', ' - ', text)
+     '''
+
+     # replaces special whitespace characters with a plain space
+     text = re.sub(r'\xa0', ' ', text)
+     text = re.sub(u' ', ' ', text)
+     text = re.sub(u' ', ' ', text)
+     text = re.sub(u' +', ' ', text)  # collapse consecutive spaces
+     # replaces numbers with 9's; must be handled consistently with the dictionary side
+     #text = re.sub(r'\d', '9', text)
+     # replaces English characters with a's
+     #text = re.sub(r'[a-zA-Z]', 'a', text)
+     # replaces Chinese characters with 家's
+     #for x in re.findall(r'[\u4e00-\u9fff]', text):
+     #    text = re.sub(x, '家', text)
+
+     # replaces the special ellipsis character
+     #text = text.replace(u'…', '...')
+
+     return text
+
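A minimal sketch of what the whitespace normalization in clean_kotext does, assuming nltkor is installed and nltkor.tag.libs.utils imports cleanly (the input string is invented for illustration):

    from nltkor.tag.libs.utils import clean_kotext

    # non-breaking and repeated spaces collapse into single plain spaces
    print(clean_kotext('하나\xa0둘   셋'))  # -> '하나 둘 셋'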
+
+
+ def generate_feature_vectors(num_vectors, num_features, min_value=-0.1, max_value=0.1):
+     """
+     Generates vectors of real numbers, to be used as word features.
+     Vectors are initialized randomly. Returns a 2-dim numpy array.
+     """
+     logger = logging.getLogger("Logger")
+     #table = (max_value * 2) * np.random.random((num_vectors, num_vectors, num_features, num_features)) + min_value
+     table = (max_value * 2) * np.random.random((num_vectors, num_features)) + min_value
+     logger.debug("Generated %d feature vectors with %d features each." % (num_vectors, num_features))
+     print("Generated %d feature vectors with %d features each." % (num_vectors, num_features))
+
+     return table
+
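This helper is the basic building block for every lookup table created below: each call returns a uniformly random (num_vectors, num_features) array with values in [min_value, max_value). A quick sketch, assuming the same import path as above:

    from nltkor.tag.libs.utils import generate_feature_vectors

    table = generate_feature_vectors(100, 50)
    print(table.shape)                              # (100, 50)
    print(table.min() >= -0.1, table.max() < 0.1)   # True True with the default range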
+
+ def count_lines(filename):
+     """Counts and returns how many non-empty lines there are in a file."""
+     with open(filename, 'r') as f:
+         lines = [x for x in list(f) if x.strip()]
+     return len(lines)
+
+ def _create_affix_tables(affix, table_list, num_features):
+     """
+     Internal helper function for loading suffix or prefix feature tables
+     into the given list.
+     affix should be either 'suffix' or 'prefix'.
+     """
+     logger = logging.getLogger('Logger')
+     logger.info('Generating %s features...' % affix)
+     tensor = []
+     codes = getattr(attributes.Affix, '%s_codes' % affix)
+     num_affixes_per_size = getattr(attributes.Affix, 'num_%ses_per_size' % affix)
+     for size in codes:
+
+         # use num_*_per_size because it accounts for special suffix codes
+         num_affixes = num_affixes_per_size[size]
+         table = generate_feature_vectors(num_affixes, num_features)
+         tensor.append(table)
+
+     # the affix attribute actually has a 3-dim tensor
+     # (a concatenation of 2d tables, one for each suffix size)
+     for table in tensor:
+         table_list.append(table)
+
+ def create_feature_tables(args, md, text_reader):
+     """
+     Create the feature tables to be used by the network. If the args object
+     contains the load_features option as true, the feature table for word types
+     is loaded instead of being created. The actual number of
+     feature tables will depend on the argument options.
+
+     :param args: Parameters supplied to the program
+     :param md: metadata about the network
+     :param text_reader: The TextReader being used.
+     :returns: all the feature tables to be used
+     """
+
+     logger = logging.getLogger("Logger")
+     feature_tables = []
+
+     if not args.load_types:
+         logger.info("Generating word type features...")
+         table_size = len(text_reader.word_dict)
+         types_table = generate_feature_vectors(table_size, args.num_features)
+     else:
+         logger.info("Loading word type features...")
+         # check if there is a word feature file specific for the task
+         # if not, load a generic one
+         filename = md.paths[md.type_features]
+         if os.path.exists(filename):
+             types_table = load_features_from_file(filename)
+         else:
+             filename = md.paths['type_features']
+             types_table = load_features_from_file(filename)
+
+         if len(types_table) < len(text_reader.word_dict):
+             # the type dictionary provided has more types than
+             # the number of feature vectors. So, let's generate
+             # feature vectors for the new types by replicating the vector
+             # associated with the RARE word
+             diff = len(text_reader.word_dict) - len(types_table)
+             logger.warning("Number of types in feature table and dictionary differ.")
+             logger.warning("Generating features for %d new types." % diff)
+             num_features = len(types_table[0])
+             new_vecs = generate_feature_vectors(diff, num_features)
+             types_table = np.append(types_table, new_vecs, axis=0)
+
+         elif len(types_table) > len(text_reader.word_dict):
+             logger.warning("Number of features provided is greater than the number of tokens\
+                             in the dictionary. The extra features will be ignored.")
+
+     feature_tables.append(types_table)  # head
+     #print(md.task)
+     #if md.task in ['labeled_dependency', 'unlabeled_dependency']:
+     #    feature_tables.append(types_table)  # tail
+
+     # Capitalization
+     if md.use_caps:
+         logger.info("Generating capitalization features...")
+         caps_table = generate_feature_vectors(attributes.Caps.num_values, args.caps)
+         feature_tables.append(caps_table)
+
+     # Prefixes
+     if md.use_prefix:
+         _create_affix_tables('prefix', feature_tables, args.prefix)
+
+     # Suffixes
+     if md.use_suffix:
+         _create_affix_tables('suffix', feature_tables, args.suffix)
+
+     # POS tags
+     if md.use_pos:
+         logger.info("Generating POS features...")
+         num_pos_tags = text_reader.get_num_pos_tags()
+         pos_table = generate_feature_vectors(num_pos_tags, args.pos)
+         #feature_tables.append(pos_table)  # head; must stay consistent with the converter in *_reader
+         feature_tables.append(pos_table)  # tail
+
+     # chunk tags
+     if md.use_chunk:
+         logger.info("Generating chunk features...")
+         num_chunk_tags = count_lines(md.paths['chunk_tags'])
+         chunk_table = generate_feature_vectors(num_chunk_tags, args.chunk)
+         feature_tables.append(chunk_table)
+
+     #print(len(feature_tables))
+     return feature_tables
+
+
+
+ def set_distance_features(max_dist=None,
+                           num_target_features=None, num_pred_features=None):
+     """
+     Returns the distance feature tables to be used by a convolutional network.
+     One table is for the relative distance to the target word, the other
+     to the predicate.
+
+     :param max_dist: maximum distance to be used in new vectors.
+     """
+     logger = logging.getLogger("Logger")
+
+     # max_dist before/after, 0 distance, and distances above the max
+     max_dist = 2 * (max_dist + 1) + 1
+     logger.info("Generating target word distance features...")
+     target_dist = generate_feature_vectors(max_dist, num_target_features)
+     logger.info("Generating predicate distance features...")
+     pred_dist = generate_feature_vectors(max_dist, num_pred_features)
+
+     return [target_dist, pred_dist]
+
+
+ def set_logger(level):
+     """Sets the logger to be used throughout the system."""
+     log_format = '%(message)s'
+     logging.basicConfig(format=log_format)
+     logger = logging.getLogger("Logger")
+     logger.setLevel(level)
+
+ def load_features_from_file(features_file):
+     """Reads a file with features written as binary data."""
+     return np.load(features_file)
+
+ def save_features_to_file(table, features_file):
+     """Saves a feature table to a given file, writing binary data."""
+     np.save(features_file, table)
+
+ def convert_iobes_to_bracket(tag):
+     """
+     Convert tags from the IOBES scheme to the CoNLL bracketing.
+
+     Example:
+     B-A0 -> (A0*
+     I-A0 -> *
+     E-A0 -> *)
+     S-A1 -> (A1*)
+     O    -> *
+     """
+     if tag.startswith('I') or tag.startswith('O'):
+         return '*'
+     if tag.startswith('B'):
+         return '(%s*' % tag[2:]
+     if tag.startswith('E'):
+         return '*)'
+     if tag.startswith('S'):
+         return '(%s*)' % tag[2:]
+     else:
+         raise ValueError("Unknown tag: %s" % tag)
+
+ def boundaries_to_arg_limits(boundaries):
+     """
+     Converts a sequence of IOBES tags delimiting arguments to an array
+     of argument boundaries, used by the network.
+     """
+     limits = []
+     start = None
+
+     for i, tag in enumerate(boundaries):
+         if tag == 'S':
+             limits.append([i, i])
+         elif tag == 'B':
+             start = i
+         elif tag == 'E':
+             limits.append([start, i])
+
+     return np.array(limits, int)  # np.int was removed in recent NumPy; the builtin int is equivalent
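The two IOBES helpers at the end of the module are pure functions, so their behavior is easy to check directly. A small usage sketch (the tag sequences are invented for illustration):

    from nltkor.tag.libs.utils import convert_iobes_to_bracket, boundaries_to_arg_limits

    tags = ['B-A0', 'I-A0', 'E-A0', 'O', 'S-A1']
    print([convert_iobes_to_bracket(t) for t in tags])    # ['(A0*', '*', '*)', '*', '(A1*)']
    print(boundaries_to_arg_limits(['B', 'E', 'O', 'S'])) # [[0 1] [3 3]] as a numpy array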
nltkor/tag/libs/word_dictionary.py
@@ -0,0 +1,239 @@
+ # -*- coding: utf-8 -*-
+
+ import itertools
+ from collections import Counter, OrderedDict as OD
+ from nltkor import Kor_char
+
+ class WordDictionary(dict):
+     """
+     Class to store words and their corresponding indices in
+     the network lookup table. Also deals with padding and
+     maps rare words to a special index.
+     """
+
+     padding_left = '*LEFT*'
+     padding_right = '*RIGHT*'
+     rare = '*RARE*'
+
+     number_transformation = {ord(c): '9' for c in '012345678'}
+     #english_transformation = {ord(c): 'a' for c in 'abcdefghijklmnopqrstuvwxyz'}
+     # Hanja handling is still needed; use korChar.hanja_syllable(), see utils.py
+
+     def __init__(self, tokens, size=None, minimum_occurrences=None, wordlist=None):
+         """
+         Fills a dictionary (to be used for indexing) with the most
+         common words in the given text.
+
+         :param tokens: Either a list of tokens or a list of lists of tokens
+             (each token represented as a string).
+         :param size: Maximum number of token indices
+             (not including paddings, rare, etc.).
+         :param minimum_occurrences: The minimum number of occurrences a token must
+             have in order to be included.
+         :param wordlist: Use this list of words to build the dictionary. Overrides tokens
+             if not None and ignores maximum size.
+         """
+         if wordlist is None:
+             # work with the supplied tokens. extract frequencies.
+
+             # gets frequency count
+             c = self._get_frequency_count(tokens)
+
+             if minimum_occurrences is None:
+                 minimum_occurrences = 1
+
+             words = [key for key, number in c.most_common()
+                      if number >= minimum_occurrences and key]
+
+             if size is not None and size < len(words):
+                 words = words[:size]
+         else:
+             # using an ordered dict as an ordered set
+             # (we need to keep the order and eliminate duplicates)
+             words = [word.lower().translate(WordDictionary.number_transformation) for word in wordlist]
+             values = [None] * len(words)
+             words = OD(zip(words, values)).keys()
+             #for c in words:
+             #    print c.encode('utf-8')
+
+         # verifies the maximum size
+         if size is None:
+             size = len(words)
+
+         # set all words in the dictionary
+         for word, num in zip(words, range(size)):
+             self[word] = num
+
+         # if the given words include one of the rare or padding symbols, don't replace it
+         # padding and rare symbols are appended at the end when the dictionary is built
+         special_symbols = [WordDictionary.rare.lower(),
+                            WordDictionary.padding_left.lower(),
+                            WordDictionary.padding_right.lower()]
+
+         for symbol in special_symbols:
+             if symbol not in words:
+                 self[symbol] = size
+                 size += 1
+
+         self.check()
+
+     @classmethod
+     def init_from_wordlist(cls, wordlist):
+         """
+         Initializes the WordDictionary instance with a list of words, independently from their
+         frequencies. Every word in the list gets an entry.
+         """
+         return cls(None, wordlist=wordlist)
+
+     @classmethod
+     def init_empty(cls):
+         """
+         Initializes an empty WordDictionary.
+         """
+         return cls([[]])
+
+     def save(self, filename):
+         """
+         Saves the word dictionary to the given file as a list of word types.
+
+         Special words (paddings and rare) are also included.
+         """
+         sorted_words = sorted(self, key=self.get)
+         text = '\n'.join(sorted_words)
+         with open(filename, 'w') as f:
+             f.write(text)
+
+     @classmethod
+     def load(cls, filename):
+         """
+         Loads a WordDictionary object from a vocabulary file.
+         """
+         words = []
+         with open(filename, 'r') as f:
+             for word in f:
+                 word = word.strip()
+                 if word:
+                     words.append(word)
+
+         return cls.init_from_wordlist(words)
+
+     def _get_frequency_count(self, token_list):
+         """
+         Returns a token counter for tokens in token_list.
+
+         :param token_list: Either a list of tokens (as strings) or a list
+             of lists of tokens.
+         """
+         if type(token_list[0]) == list:
+             c = Counter(t.lower().translate(WordDictionary.number_transformation)
+                         for sent in token_list for t in sent)
+         else:
+             c = Counter(t.lower().translate(WordDictionary.number_transformation)
+                         for t in token_list)
+         return c
+
+
+     def update_tokens(self, tokens, size=None, minimum_occurrences=1, freqs=None):
+         """
+         Updates the dictionary, adding more types until size is reached.
+
+         :param freqs: a dictionary providing a token count.
+         """
+         if freqs is None:
+             freqs = self._get_frequency_count(tokens)
+
+         if size is None or size == 0:
+             # size None or 0 means no size limit
+             size = len(freqs)
+
+         if self.num_tokens >= size:
+             return
+         else:
+             size_diff = size - self.num_tokens
+
+         # a new version of freqs with only tokens not present in the dictionary
+         # and above the minimum frequency
+         candidate_tokens = dict((token, freqs[token])
+                                 for token in freqs
+                                 if token not in self and freqs[token] >= minimum_occurrences)
+
+         # order the types from the most frequent to the least
+         new_tokens = sorted(candidate_tokens, key=lambda x: candidate_tokens[x], reverse=True)
+
+         next_value = len(self)
+         for token in new_tokens:
+             self[token] = next_value
+             next_value += 1
+             size_diff -= 1
+             if size_diff == 0:
+                 break
+
+         self.check()
+
+     def __contains__(self, key):
+         """
+         Overrides the "in" operator. Case insensitive.
+         """
+         transformed = key.lower().translate(WordDictionary.number_transformation)
+         return super(WordDictionary, self).__contains__(transformed)
+
+     def __setitem__(self, key, value):
+         """
+         Overrides the [] write operator. It converts every key to lower case
+         before assignment.
+         """
+         transformed = key.lower().translate(WordDictionary.number_transformation)
+         super(WordDictionary, self).__setitem__(transformed, value)
+
+     def __getitem__(self, key):
+         """
+         Overrides the [] read operator.
+
+         Three differences from the original:
+         1) when given a word without an entry, it returns the value for the *RARE* key.
+         2) all entries are converted to lower case before verification.
+         3) digits are mapped to 9
+         """
+         # faster than a regexp
+         transformed = key.lower().translate(WordDictionary.number_transformation)
+         return super(WordDictionary, self).get(transformed, self.index_rare)
+
+     def get(self, key):
+         """
+         Overrides the dictionary get method, so when given a word without an entry, it returns
+         the value for the *rare* key. Note that it is not possible to supply a default value as
+         in the dict class.
+         """
+         # faster than a regexp
+         #print (key, type(key))
+         transformed = key.lower().translate(WordDictionary.number_transformation)
+         #print key, transformed
+         return super(WordDictionary, self).get(transformed, self.index_rare)
+
+     def check(self):
+         """
+         Checks the internal structure of the dictionary and makes necessary adjustments,
+         such as updating num_tokens.
+         """
+         # since WordDictionary overrides get(), we use the super call
+         # (the overridden get() fails when self.index_rare is not set yet)
+         key = WordDictionary.rare.lower()
+         self.index_rare = super(WordDictionary, self).get(key)
+
+         self.index_padding_left = self[WordDictionary.padding_left]
+         self.index_padding_right = self[WordDictionary.padding_right]
+         self.num_tokens = len(self)
+
+     def get_words(self, indices):
+         """
+         Returns the words represented by a sequence of indices.
+         """
+         words = [w for w in self if self[w] in indices]
+         return words
+
+     def get_indices(self, words):
+         """
+         Returns the indices corresponding to a sequence of tokens.
+         """
+         indices = [self[w] for w in words]
+         return indices
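The key detail in WordDictionary is that every lookup goes through the same key transformation (lowercasing plus digit folding) and unseen words fall back to the *RARE* index. A small sketch, assuming nltkor is installed (the token lists are invented for illustration):

    from nltkor.tag.libs.word_dictionary import WordDictionary

    wd = WordDictionary([['Deep', 'learning', 'in', '2024'], ['deep', 'models']])
    print(wd['deep'] == wd['Deep'])              # True: keys are case-insensitive
    print(wd['2024'] == wd['9999'])              # True: digits are folded to 9
    print(wd['unseen-token'] == wd.index_rare)   # True: unknown words map to *RARE*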
nltkor/tag/libs/wsd/__init__.py
@@ -0,0 +1,2 @@
+
+ from .wsd_reader import WSDReader
nltkor/tag/libs/wsd/macmorphoreader.py
@@ -0,0 +1,7 @@
+
+ import warnings
+
+ # backwards compatibility
+ from .wsd_reader import *
+
+ warnings.warn('Module macmorphoreader is deprecated. Use module wsd_reader instead.')
nltkor/tag/libs/wsd/wsd_reader.py
@@ -0,0 +1,93 @@
+ # -*- coding: utf-8 -*-
+
+ """
+ Class for dealing with WSD data.
+ """
+
+ from ..reader import TaggerReader
+
+ class ConllWSD(object):
+     """
+     Dummy class for storing column positions in a CoNLL file.
+     """
+     id = 0
+     word = 1
+     pos = 2
+     wsd = 3
+     SEP = '\t'
+
+ class WSDReader(TaggerReader):
+     """
+     This class reads data from a WSD corpus and turns it into a format
+     readable by the neural network for the WSD tagging task.
+     """
+
+     def __init__(self, md=None, filename=None, load_dictionaries=True):
+         """
+         Constructor
+         """
+         self.rare_tag = None
+         self.sentences = []
+         if filename is not None:
+             try:
+                 self._read_plain(filename)
+             except:
+                 self._read_conll(filename)
+
+         super(WSDReader, self).__init__(md, load_dictionaries=load_dictionaries)
+
+     @property
+     def task(self):
+         """
+         Abstract Base Class (ABC) attribute.
+         """
+         return 'wsd'
+
+     def _read_plain(self, filename):
+         """
+         Read data from a "plain" file, with one sentence per line and each
+         token written as token_tag.
+         """
+         self.sentences = []
+         with open(filename, 'rt') as f:
+             for line in f:
+                 #line = unicode(line, 'utf-8')
+                 items = line.strip().split()
+                 sentence = []
+                 for item in items:
+                     token, tag = item.rsplit('_', 1)
+                     sentence.append((token, tag))
+
+                 self.sentences.append(sentence)
+
+     def _read_conll(self, filename):
+         """
+         Read data from a CoNLL-formatted file. It expects at least 4 columns:
+         id, surface word, POS tag and the WSD tag.
+         """
+         self.sentences = []
+         sentence = []
+         with open(filename, 'rt') as f:
+             for line in f:
+                 line = line.strip()
+                 if line == '':
+                     if len(sentence) > 0:
+                         self.sentences.append(sentence)
+                         sentence = []
+                     continue
+
+                 fields = line.split(ConllWSD.SEP)
+                 try:
+                     word = fields[ConllWSD.word]
+                     pos = fields[ConllWSD.pos]
+                     wsd = fields[ConllWSD.wsd]
+                 except IndexError:
+                     continue
+                 sentence.append((word, wsd))
+                 #sentence.append((word, pos, ner))
+
+         if len(sentence) > 0:
+             self.sentences.append(sentence)
+
+ # backwards compatibility
+ MacMorphoReader = WSDReader
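For reference, WSDReader accepts either of the two layouts handled above: a plain file with one sentence per line and each token written as token_tag, or a tab-separated CoNLL-style file with id, word, POS and WSD columns and blank lines between sentences. A minimal sketch of the plain-format parsing step (the tokens and sense tags are invented for illustration):

    # mirrors the rsplit('_', 1) step in WSDReader._read_plain
    line = "prices_01 rose_02 sharply_03"
    print([item.rsplit('_', 1) for item in line.split()])
    # [['prices', '01'], ['rose', '02'], ['sharply', '03']]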