nltkor-1.2.14-cp311-cp311-macosx_13_0_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nltkor/Kor_char.py +193 -0
- nltkor/__init__.py +16 -0
- nltkor/alignment/__init__.py +1315 -0
- nltkor/cider/__init__.py +2 -0
- nltkor/cider/cider.py +55 -0
- nltkor/cider/cider_scorer.py +207 -0
- nltkor/distance/__init__.py +441 -0
- nltkor/distance/wasserstein.py +126 -0
- nltkor/etc.py +22 -0
- nltkor/lazyimport.py +144 -0
- nltkor/make_requirement.py +11 -0
- nltkor/metrics/__init__.py +63 -0
- nltkor/metrics/bartscore.py +301 -0
- nltkor/metrics/bertscore.py +331 -0
- nltkor/metrics/bleu_tensor.py +20 -0
- nltkor/metrics/classical.py +847 -0
- nltkor/metrics/entment.py +24 -0
- nltkor/metrics/eval.py +517 -0
- nltkor/metrics/mauve.py +273 -0
- nltkor/metrics/mauve_utils.py +131 -0
- nltkor/misc/__init__.py +11 -0
- nltkor/misc/string2string_basic_functions.py +59 -0
- nltkor/misc/string2string_default_tokenizer.py +83 -0
- nltkor/misc/string2string_hash_functions.py +159 -0
- nltkor/misc/string2string_word_embeddings.py +503 -0
- nltkor/search/__init__.py +10 -0
- nltkor/search/classical.py +569 -0
- nltkor/search/faiss_search.py +787 -0
- nltkor/search/kobert_tokenizer.py +181 -0
- nltkor/sejong/__init__.py +3 -0
- nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
- nltkor/sejong/ch.py +12 -0
- nltkor/sejong/dict_semClassNum.txt +491 -0
- nltkor/sejong/layer.txt +630 -0
- nltkor/sejong/sejong_download.py +87 -0
- nltkor/sejong/ssem.py +684 -0
- nltkor/similarity/__init__.py +3 -0
- nltkor/similarity/bartscore____.py +337 -0
- nltkor/similarity/bertscore____.py +339 -0
- nltkor/similarity/classical.py +245 -0
- nltkor/similarity/cosine_similarity.py +175 -0
- nltkor/tag/__init__.py +71 -0
- nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
- nltkor/tag/espresso_tag.py +220 -0
- nltkor/tag/libs/__init__.py +10 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
- nltkor/tag/libs/arguments.py +280 -0
- nltkor/tag/libs/attributes.py +231 -0
- nltkor/tag/libs/config.py +159 -0
- nltkor/tag/libs/metadata.py +129 -0
- nltkor/tag/libs/ner/__init__.py +2 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/macmorphoreader.py +7 -0
- nltkor/tag/libs/ner/ner_reader.py +92 -0
- nltkor/tag/libs/network.c +72325 -0
- nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
- nltkor/tag/libs/network.pyx +878 -0
- nltkor/tag/libs/networkconv.pyx +1028 -0
- nltkor/tag/libs/networkdependencyconv.pyx +451 -0
- nltkor/tag/libs/parse/__init__.py +1 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/parse_reader.py +283 -0
- nltkor/tag/libs/pos/__init__.py +2 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/macmorphoreader.py +7 -0
- nltkor/tag/libs/pos/pos_reader.py +97 -0
- nltkor/tag/libs/reader.py +485 -0
- nltkor/tag/libs/srl/__init__.py +3 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__srl_reader_.py +535 -0
- nltkor/tag/libs/srl/srl_reader.py +436 -0
- nltkor/tag/libs/srl/train_srl.py +87 -0
- nltkor/tag/libs/taggers.py +926 -0
- nltkor/tag/libs/utils.py +384 -0
- nltkor/tag/libs/word_dictionary.py +239 -0
- nltkor/tag/libs/wsd/__init__.py +2 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
- nltkor/tag/libs/wsd/wsd_reader.py +93 -0
- nltkor/tokenize/__init__.py +62 -0
- nltkor/tokenize/ko_tokenize.py +115 -0
- nltkor/trans.py +121 -0
- nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
- nltkor-1.2.14.dist-info/METADATA +41 -0
- nltkor-1.2.14.dist-info/RECORD +127 -0
- nltkor-1.2.14.dist-info/WHEEL +5 -0
- nltkor-1.2.14.dist-info/top_level.txt +1 -0
nltkor/tag/libs/utils.py
ADDED
@@ -0,0 +1,384 @@
# -*- coding: utf-8 -*-

"""
Utility functions
"""

import re
import os, sys
import logging
import nltk
import nltkor
import _pickle
import chardet
#from nltkor import Kor_char
from nltkor.tokenize import Ko_tokenize
from nltkor.tag.libs import config
import numpy as np

#from nltk.tokenize.regexp import RegexpTokenizer
#from nltk.tokenize import TreebankWordTokenizer
from . import attributes


def get_word_from_morph_lexicon(root, word, tags, space_flag):
    '''
    space_flag: if True, spaces are included in the match; otherwise they are not
    '''

    values = list()
    value_data = list()
    if not word: return root.keys()

    current_dict = root
    _end = '$$'
    s = 0
    for i, letter in enumerate(word):
        #print(i, '>', letter, current_dict)
        if letter in current_dict:
            #print(letter, current_dict[letter])
            current_dict = current_dict[letter]
            if _end in current_dict:
                for idx in range(i-s): values.pop()
                values.append(word[s:i+1])
                for idx in range(i-s): value_data.pop()
                value_data.append(current_dict[_end])
            else: values.append(letter); value_data.append(tags[i])
        else:
            #print('==', letter, values)
            if space_flag or letter != ' ':
                values.append(letter)  # longest match: -1
                value_data.append(tags[i])
            s = i+1
            current_dict = root
    else:
        if values: return values, value_data
        else: return list(word), tags

def intersperse(lst, item):
    result = [item] * (len(lst) * 2 - 1)
    result[0::2] = lst
    return result

def get_word(root, word, tags, space_flag=False):
    '''
    space_flag : True  : matches even if whitespace is present
                 False : does not match if whitespace is present
    '''
    word_list = get_word_from_morph_lexicon(root, word, tags, space_flag)
    return word_list



def tokenize(text, use_sent_tokenizer=True):
    """
    Call the tokenizer function for the given language.
    The returned tokens are in a list of lists, one for each sentence.

    :param use_sent_tokenizer: True  : use the sentence tokenizer
                               False : one sentence per line
    """
    return tokenize_ko(text, use_sent_tokenizer)

def tokenize_ko(text, use_sent_tokenizer=True, clean=True):
    """
    text: string
    Return a list of lists of the tokens in text, separated by sentences.
    """
    if clean:
        text = clean_kotext(text)

    if use_sent_tokenizer:
        ## False: ignore word spacing, True: take word spacing into account
        sentences = [Ko_tokenize.syllable(sentence, True) for sentence in Ko_tokenize.sentence(text)]
    else:
        sentences = [Ko_tokenize.syllable(text, True)]

    return sentences

def clean_kotext(text, correct=False):
    """
    1. Convert special whitespace characters to plain spaces.
    Apply some transformations to the text, such as
    replacing digits with 9 and simplifying quotation marks.

    :param correct: If True, tries to correct punctuation misspellings.
    """
    # replaces different kinds of quotation marks with "
    # take care not to remove apostrophes
    '''
    text = re.sub(r"(?u)(^|\W)[‘’′`']", r'\1"', text)
    text = re.sub(r"(?u)[‘’`′'](\W|$)", r'"\1', text)
    text = re.sub(r'(?u)[«»“”]', '"', text)

    if correct:
        # tries to fix mistyped tokens (common in Wikipedia-pt) as ,, '' ..
        text = re.sub(r'(?<!\.)\.\.(?!\.)', '.', text)   # take care with ellipses
        text = re.sub(r'([,";:])\1,', r'\1', text)

        # inserts space after leading hyphen. It happens sometimes in cases like:
        # blablabla -that is, bloblobloblo
        text = re.sub(' -(?=[^\W\d_])', ' - ', text)
    '''

    # normalizes non-breaking and other special space characters
    text = re.sub(r'\xa0', ' ', text)
    text = re.sub(u' ', ' ', text)
    text = re.sub(u' ', ' ', text)
    text = re.sub(u' +', ' ', text)  # collapse consecutive spaces
    # replaces numbers with 9's; keep this consistent with the lexicon side
    #text = re.sub(r'\d', '9', text)
    # replaces English characters with a's
    #text = re.sub(r'[a-zA-Z]', 'a', text)
    # replaces Chinese characters with 家's
    #for x in re.findall(r'[\u4e00-\u9fff]', text):
    #    text = re.sub(x, '家', text)

    # replaces special ellipsis character
    #text = text.replace(u'…', '...')

    return text


def generate_feature_vectors(num_vectors, num_features, min_value=-0.1, max_value=0.1):
    """
    Generates vectors of real numbers, to be used as word features.
    Vectors are initialized randomly. Returns a 2-dim numpy array.
    """
    logger = logging.getLogger("Logger")
    #table = (max_value * 2) * np.random.random((num_vectors, num_vectors, num_features, num_features)) + min_value
    table = (max_value * 2) * np.random.random((num_vectors, num_features)) + min_value
    logger.debug("Generated %d feature vectors with %d features each." % (num_vectors, num_features))
    print("Generated %d feature vectors with %d features each." % (num_vectors, num_features))

    return table


def count_lines(filename):
    """Counts and returns how many non-empty lines there are in a file."""
    with open(filename, 'r') as f:
        lines = [x for x in list(f) if x.strip()]
    return len(lines)

def _create_affix_tables(affix, table_list, num_features):
    """
    Internal helper function for loading suffix or prefix feature tables
    into the given list.
    affix should be either 'suffix' or 'prefix'.
    """
    logger = logging.getLogger('Logger')
    logger.info('Generating %s features...' % affix)
    tensor = []
    codes = getattr(attributes.Affix, '%s_codes' % affix)
    num_affixes_per_size = getattr(attributes.Affix, 'num_%ses_per_size' % affix)
    for size in codes:

        # use num_*_per_size because it accounts for special suffix codes
        num_affixes = num_affixes_per_size[size]
        table = generate_feature_vectors(num_affixes, num_features)
        tensor.append(table)

    # affix attribute actually has a 3-dim tensor
    # (concatenation of 2d tables, one for each suffix size)
    for table in tensor:
        table_list.append(table)

def create_feature_tables(args, md, text_reader):
    """
    Create the feature tables to be used by the network. If the args object
    contains the load_features option as true, the feature table for word types
    is loaded instead of being created. The actual number of
    feature tables will depend on the argument options.

    :param args: parameters supplied to the program
    :param md: metadata about the network
    :param text_reader: the TextReader being used
    :returns: all the feature tables to be used
    """

    logger = logging.getLogger("Logger")
    feature_tables = []

    if not args.load_types:
        logger.info("Generating word type features...")
        table_size = len(text_reader.word_dict)
        types_table = generate_feature_vectors(table_size, args.num_features)
    else:
        logger.info("Loading word type features...")
        # check if there is a word feature file specific for the task;
        # if not, load a generic one
        filename = md.paths[md.type_features]
        if os.path.exists(filename):
            types_table = load_features_from_file(filename)
        else:
            filename = md.paths['type_features']
            types_table = load_features_from_file(filename)

        if len(types_table) < len(text_reader.word_dict):
            # the type dictionary provided has more types than
            # the number of feature vectors. So, let's generate
            # feature vectors for the new types by replicating the vector
            # associated with the RARE word
            diff = len(text_reader.word_dict) - len(types_table)
            logger.warning("Number of types in feature table and dictionary differ.")
            logger.warning("Generating features for %d new types." % diff)
            num_features = len(types_table[0])
            new_vecs = generate_feature_vectors(diff, num_features)
            types_table = np.append(types_table, new_vecs, axis=0)

        elif len(types_table) > len(text_reader.word_dict):
            logger.warning("Number of features provided is greater than the number of "
                           "tokens in the dictionary. The extra features will be ignored.")

    feature_tables.append(types_table)  # head
    #print(md.task)
    #if md.task in ['labeled_dependency', 'unlabeled_dependency']:
    #    feature_tables.append(types_table) # tail

    # Capitalization
    if md.use_caps:
        logger.info("Generating capitalization features...")
        caps_table = generate_feature_vectors(attributes.Caps.num_values, args.caps)
        feature_tables.append(caps_table)

    # Prefixes
    if md.use_prefix:
        _create_affix_tables('prefix', feature_tables, args.prefix)

    # Suffixes
    if md.use_suffix:
        _create_affix_tables('suffix', feature_tables, args.suffix)

    # POS tags
    if md.use_pos:
        logger.info("Generating POS features...")
        num_pos_tags = text_reader.get_num_pos_tags()
        pos_table = generate_feature_vectors(num_pos_tags, args.pos)
        #feature_tables.append(pos_table) # head # must stay consistent with the converter in *_reader
        feature_tables.append(pos_table)  # tail

    # chunk tags
    if md.use_chunk:
        logger.info("Generating chunk features...")
        num_chunk_tags = count_lines(md.paths['chunk_tags'])
        chunk_table = generate_feature_vectors(num_chunk_tags, args.chunk)
        feature_tables.append(chunk_table)

    #print(len(feature_tables))
    return feature_tables



def set_distance_features(max_dist=None,
                          num_target_features=None, num_pred_features=None):
    """
    Returns the distance feature tables to be used by a convolutional network.
    One table is for the relative distance to the target word, the other
    to the predicate.

    :param max_dist: maximum distance to be used in new vectors.
    """
    logger = logging.getLogger("Logger")

    # max_dist before/after, 0 distance, and distances above the max
    max_dist = 2 * (max_dist + 1) + 1
    logger.info("Generating target word distance features...")
    target_dist = generate_feature_vectors(max_dist, num_target_features)
    logger.info("Generating predicate distance features...")
    pred_dist = generate_feature_vectors(max_dist, num_pred_features)

    return [target_dist, pred_dist]


def set_logger(level):
    """Sets the logger to be used throughout the system."""
    log_format = '%(message)s'
    logging.basicConfig(format=log_format)
    logger = logging.getLogger("Logger")
    logger.setLevel(level)

def load_features_from_file(features_file):
    """Reads a file with features written as binary data."""
    return np.load(features_file)

def save_features_to_file(table, features_file):
    """Saves a feature table to a given file, writing binary data."""
    np.save(features_file, table)

def convert_iobes_to_bracket(tag):
    """
    Convert tags from the IOBES scheme to the CoNLL bracketing.

    Example:
    B-A0 -> (A0*
    I-A0 -> *
    E-A0 -> *)
    S-A1 -> (A1*)
    O    -> *
    """
    if tag.startswith('I') or tag.startswith('O'):
        return '*'
    if tag.startswith('B'):
        return '(%s*' % tag[2:]
    if tag.startswith('E'):
        return '*)'
    if tag.startswith('S'):
        return '(%s*)' % tag[2:]
    else:
        raise ValueError("Unknown tag: %s" % tag)

def boundaries_to_arg_limits(boundaries):
    """
    Converts a sequence of IOBES tags delimiting arguments to an array
    of argument boundaries, used by the network.
    """
    limits = []
    start = None

    for i, tag in enumerate(boundaries):
        if tag == 'S':
            limits.append([i, i])
        elif tag == 'B':
            start = i
        elif tag == 'E':
            limits.append([start, i])

    return np.array(limits, int)

class PickleConverter:
    """Converts a lexicon with a .txt extension into a .pickle file."""
    def __init__(self, morph_pickle=None, co_pickle=None, prob_pickle=None):
        self.morph_pickle = morph_pickle or config.FILES.get("pos_morph_lexicon")

    def _convert_morph_lexicon(self, root, word, data):
        '''
        root = dict()
        ...
        _convert_morph_dict(root, u_key, u_data)
        '''
        current_dict = root
        _end = '$$'
        for letter in word:
            current_dict = current_dict.setdefault(letter, {})
        current_dict = current_dict.setdefault(_end, data)
        return root

    def convert_morph_lexicon(self, filename=None):
        filename = filename if filename else self.morph_pickle
        txt_filename = filename.replace('.pickle', '.txt')
        morph_dict = {}

        with open(txt_filename, 'rb') as f:
            raw_data = f.read(1024)
            detected = chardet.detect(raw_data).get('encoding', 'utf-8')
        with open(txt_filename, 'rt', encoding=detected) as f:
            for line in f:
                if ';;' in line[:2]: continue
                try:
                    k, v = line.strip().split('\t')
                except:
                    print('morph lexicon error : ', line)
                    continue
                self._convert_morph_lexicon(morph_dict, k, v)

        with open(filename, 'wb') as f:
            _pickle.dump(morph_dict, f, 2)
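For orientation, the sketch below shows how the morph-lexicon trie consumed by get_word / get_word_from_morph_lexicon above is laid out: entries are nested dicts keyed by syllable, with the '$$' key marking the end of an entry (the same layout PickleConverter._convert_morph_lexicon builds). The lexicon entry, input string, and tags here are made-up illustration data, and the snippet assumes the wheel and its dependencies (nltk, numpy, chardet) are installed so the module imports cleanly.

    # Illustration only: a hand-built lexicon entry, not a real nltkor lexicon.
    from nltkor.tag.libs.utils import get_word

    root = {}
    for entry, data in [('한국', 'NNP')]:           # hypothetical entry and tag
        node = root
        for syllable in entry:
            node = node.setdefault(syllable, {})
        node['$$'] = data                           # '$$' marks a complete entry

    word = '나한국어'                               # made-up input, one tag per syllable
    tags = ['S', 'S', 'S', 'S']

    # Longest-match segmentation: syllables covered by a lexicon entry are merged
    # and take the entry's data; all other syllables keep their per-syllable tag.
    values, value_data = get_word(root, word, tags, space_flag=False)
    print(values)       # ['나', '한국', '어']
    print(value_data)   # ['S', 'NNP', 'S']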
nltkor/tag/libs/word_dictionary.py
ADDED
@@ -0,0 +1,239 @@
# -*- coding: utf-8 -*-

import itertools
from collections import Counter, OrderedDict as OD
from nltkor import Kor_char

class WordDictionary(dict):
    """
    Class to store words and their corresponding indices in
    the network lookup table. Also deals with padding and
    maps rare words to a special index.
    """

    padding_left = '*LEFT*'
    padding_right = '*RIGHT*'
    rare = '*RARE*'

    number_transformation = {ord(c): '9' for c in '012345678'}
    #english_transformation = {ord(c): 'a' for c in 'abcdefghijklmnopqrstuvwxyz'}
    # Hanja handling is needed: use korChar.hanja_syllable(), see utils.py

    def __init__(self, tokens, size=None, minimum_occurrences=None, wordlist=None):
        """
        Fills a dictionary (to be used for indexing) with the most
        common words in the given text.

        :param tokens: Either a list of tokens or a list of lists of tokens
            (each token represented as a string).
        :param size: Maximum number of token indices
            (not including paddings, rare, etc.).
        :param minimum_occurrences: The minimum number of occurrences a token must
            have in order to be included.
        :param wordlist: Use this list of words to build the dictionary. Overrides tokens
            if not None and ignores maximum size.
        """
        if wordlist is None:
            # work with the supplied tokens. extract frequencies.

            # gets frequency count
            c = self._get_frequency_count(tokens)

            if minimum_occurrences is None:
                minimum_occurrences = 1

            words = [key for key, number in c.most_common()
                     if number >= minimum_occurrences and key]

            if size is not None and size < len(words):
                words = words[:size]
        else:
            # using ordered dict as an ordered set
            # (we need to keep the order and eliminate duplicates)
            words = [word.lower().translate(WordDictionary.number_transformation) for word in wordlist]
            values = [None] * len(words)
            words = OD(zip(words, values)).keys()
            #for c in words:
            #    print c.encode('utf-8')

        # verifies the maximum size
        if size is None:
            size = len(words)

        # set all words in the dictionary
        for word, num in zip(words, range(size)):
            self[word] = num

        # if the given words include one of the rare or padding symbols, don't replace it
        # padding and rare entries are appended at the very end when the dictionary is built
        special_symbols = [WordDictionary.rare.lower(),
                           WordDictionary.padding_left.lower(),
                           WordDictionary.padding_right.lower()]

        for symbol in special_symbols:
            if symbol not in words:
                self[symbol] = size
                size += 1

        self.check()

    @classmethod
    def init_from_wordlist(cls, wordlist):
        """
        Initializes the WordDictionary instance with a list of words, independently from their
        frequencies. Every word in the list gets an entry.
        """
        return cls(None, wordlist=wordlist)

    @classmethod
    def init_empty(cls):
        """
        Initializes an empty Word Dictionary.
        """
        return cls([[]])

    def save(self, filename):
        """
        Saves the word dictionary to the given file as a list of word types.

        Special words (paddings and rare) are also included.
        """
        sorted_words = sorted(self, key=self.get)
        text = '\n'.join(sorted_words)
        with open(filename, 'w') as f:
            f.write(text)

    @classmethod
    def load(cls, filename):
        """
        Loads a WordDictionary object from a vocabulary file.
        """
        words = []
        with open(filename, 'r') as f:
            for word in f:
                word = word.strip()
                if word:
                    words.append(word)

        return cls.init_from_wordlist(words)

    def _get_frequency_count(self, token_list):
        """
        Returns a token counter for tokens in token_list.

        :param token_list: Either a list of tokens (as strings) or a list
            of lists of tokens.
        """
        if type(token_list[0]) == list:
            c = Counter(t.lower().translate(WordDictionary.number_transformation)
                        for sent in token_list for t in sent)
        else:
            c = Counter(t.lower().translate(WordDictionary.number_transformation)
                        for t in token_list)
        return c


    def update_tokens(self, tokens, size=None, minimum_occurrences=1, freqs=None):
        """
        Updates the dictionary, adding more types until size is reached.

        :param freqs: a dictionary providing a token count.
        """
        if freqs is None:
            freqs = self._get_frequency_count(tokens)

        if size is None or size == 0:
            # size None or 0 means no size limit
            size = len(freqs)

        if self.num_tokens >= size:
            return
        else:
            size_diff = size - self.num_tokens

        # a new version of freqs with only tokens not present in the dictionary
        # and above minimum frequency
        candidate_tokens = dict((token, freqs[token])
                                for token in freqs
                                if token not in self and freqs[token] >= minimum_occurrences)

        # order the types from the most frequent to the least
        new_tokens = sorted(candidate_tokens, key=lambda x: candidate_tokens[x], reverse=True)

        next_value = len(self)
        for token in new_tokens:
            self[token] = next_value
            next_value += 1
            size_diff -= 1
            if size_diff == 0:
                break

        self.check()

    def __contains__(self, key):
        """
        Overrides the "in" operator. Case insensitive.
        """
        transformed = key.lower().translate(WordDictionary.number_transformation)
        return super(WordDictionary, self).__contains__(transformed)

    def __setitem__(self, key, value):
        """
        Overrides the [] write operator. It converts every key to lower case
        before assignment.
        """
        transformed = key.lower().translate(WordDictionary.number_transformation)
        super(WordDictionary, self).__setitem__(transformed, value)

    def __getitem__(self, key):
        """
        Overrides the [] read operator.

        Three differences from the original:
        1) when given a word without an entry, it returns the value for the *RARE* key.
        2) all entries are converted to lower case before verification.
        3) digits are mapped to 9
        """
        # faster than regexp
        transformed = key.lower().translate(WordDictionary.number_transformation)
        return super(WordDictionary, self).get(transformed, self.index_rare)

    def get(self, key):
        """
        Overrides the dictionary get method, so when given a word without an entry, it returns
        the value for the *rare* key. Note that it is not possible to supply a default value as
        in the dict class.
        """
        # faster than regexp
        #print (key, type(key))
        transformed = key.lower().translate(WordDictionary.number_transformation)
        #print key, transformed
        return super(WordDictionary, self).get(transformed, self.index_rare)

    def check(self):
        """
        Checks the internal structure of the dictionary and makes necessary adjustments,
        such as updating num_tokens.
        """
        # since WordDictionary overrides __get__, we use the super call
        # (the WordDictionary __get__ fails when self.index_rare is not set)
        key = WordDictionary.rare.lower()
        self.index_rare = super(WordDictionary, self).get(key)

        self.index_padding_left = self[WordDictionary.padding_left]
        self.index_padding_right = self[WordDictionary.padding_right]
        self.num_tokens = len(self)

    def get_words(self, indices):
        """
        Returns the words represented by a sequence of indices.
        """
        words = [w for w in self if self[w] in indices]
        return words

    def get_indices(self, words):
        """
        Returns the indices corresponding to a sequence of tokens.
        """
        indices = [self[w] for w in words]
        return indices
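Similarly, a minimal usage sketch of the WordDictionary class above, with made-up tokenized sentences; it assumes the wheel is installed so that nltkor.tag.libs.word_dictionary (which imports nltkor.Kor_char) can be imported.

    from nltkor.tag.libs.word_dictionary import WordDictionary

    sentences = [['오늘', '은', '2024', '년'],      # hypothetical tokenized sentences
                 ['오늘', '도', '맑음']]

    wd = WordDictionary(sentences)

    # Keys are normalized on both read and write: lower-cased, digits 0-8 mapped to '9',
    # so '2024' and '9999' share a single entry.
    print(wd['2024'] == wd['9999'])                 # True

    # Unknown tokens fall back to the *RARE* index instead of raising KeyError.
    print(wd.get('처음보는말') == wd.index_rare)     # True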