nltkor-1.2.14-cp311-cp311-macosx_13_0_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nltkor/Kor_char.py +193 -0
- nltkor/__init__.py +16 -0
- nltkor/alignment/__init__.py +1315 -0
- nltkor/cider/__init__.py +2 -0
- nltkor/cider/cider.py +55 -0
- nltkor/cider/cider_scorer.py +207 -0
- nltkor/distance/__init__.py +441 -0
- nltkor/distance/wasserstein.py +126 -0
- nltkor/etc.py +22 -0
- nltkor/lazyimport.py +144 -0
- nltkor/make_requirement.py +11 -0
- nltkor/metrics/__init__.py +63 -0
- nltkor/metrics/bartscore.py +301 -0
- nltkor/metrics/bertscore.py +331 -0
- nltkor/metrics/bleu_tensor.py +20 -0
- nltkor/metrics/classical.py +847 -0
- nltkor/metrics/entment.py +24 -0
- nltkor/metrics/eval.py +517 -0
- nltkor/metrics/mauve.py +273 -0
- nltkor/metrics/mauve_utils.py +131 -0
- nltkor/misc/__init__.py +11 -0
- nltkor/misc/string2string_basic_functions.py +59 -0
- nltkor/misc/string2string_default_tokenizer.py +83 -0
- nltkor/misc/string2string_hash_functions.py +159 -0
- nltkor/misc/string2string_word_embeddings.py +503 -0
- nltkor/search/__init__.py +10 -0
- nltkor/search/classical.py +569 -0
- nltkor/search/faiss_search.py +787 -0
- nltkor/search/kobert_tokenizer.py +181 -0
- nltkor/sejong/__init__.py +3 -0
- nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
- nltkor/sejong/ch.py +12 -0
- nltkor/sejong/dict_semClassNum.txt +491 -0
- nltkor/sejong/layer.txt +630 -0
- nltkor/sejong/sejong_download.py +87 -0
- nltkor/sejong/ssem.py +684 -0
- nltkor/similarity/__init__.py +3 -0
- nltkor/similarity/bartscore____.py +337 -0
- nltkor/similarity/bertscore____.py +339 -0
- nltkor/similarity/classical.py +245 -0
- nltkor/similarity/cosine_similarity.py +175 -0
- nltkor/tag/__init__.py +71 -0
- nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
- nltkor/tag/espresso_tag.py +220 -0
- nltkor/tag/libs/__init__.py +10 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
- nltkor/tag/libs/arguments.py +280 -0
- nltkor/tag/libs/attributes.py +231 -0
- nltkor/tag/libs/config.py +159 -0
- nltkor/tag/libs/metadata.py +129 -0
- nltkor/tag/libs/ner/__init__.py +2 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/macmorphoreader.py +7 -0
- nltkor/tag/libs/ner/ner_reader.py +92 -0
- nltkor/tag/libs/network.c +72325 -0
- nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
- nltkor/tag/libs/network.pyx +878 -0
- nltkor/tag/libs/networkconv.pyx +1028 -0
- nltkor/tag/libs/networkdependencyconv.pyx +451 -0
- nltkor/tag/libs/parse/__init__.py +1 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/parse_reader.py +283 -0
- nltkor/tag/libs/pos/__init__.py +2 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/macmorphoreader.py +7 -0
- nltkor/tag/libs/pos/pos_reader.py +97 -0
- nltkor/tag/libs/reader.py +485 -0
- nltkor/tag/libs/srl/__init__.py +3 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__srl_reader_.py +535 -0
- nltkor/tag/libs/srl/srl_reader.py +436 -0
- nltkor/tag/libs/srl/train_srl.py +87 -0
- nltkor/tag/libs/taggers.py +926 -0
- nltkor/tag/libs/utils.py +384 -0
- nltkor/tag/libs/word_dictionary.py +239 -0
- nltkor/tag/libs/wsd/__init__.py +2 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
- nltkor/tag/libs/wsd/wsd_reader.py +93 -0
- nltkor/tokenize/__init__.py +62 -0
- nltkor/tokenize/ko_tokenize.py +115 -0
- nltkor/trans.py +121 -0
- nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
- nltkor-1.2.14.dist-info/METADATA +41 -0
- nltkor-1.2.14.dist-info/RECORD +127 -0
- nltkor-1.2.14.dist-info/WHEEL +5 -0
- nltkor-1.2.14.dist-info/top_level.txt +1 -0
nltkor/tag/libs/attributes.py
@@ -0,0 +1,231 @@
# -*- coding: utf-8 -*-

import logging
import numpy as np

from .word_dictionary import WordDictionary as WD
from collections import defaultdict

# dummy value to be used when POS is an additional attribute
PADDING_POS = 'PADDING'

class Caps(object):
    """Dummy class for storing numeric values for capitalization."""
    num_values = 5
    lower = 0
    title = 1
    non_alpha = 2
    other = 3
    padding = 4


class Token(object):
    def __init__(self, word, morph_h='NA', pos_h='NA', morph_t='NA', pos_t='NA', chunk='NA'):
        """
        A token representation that stores discrete attributes to be given as
        input to the neural network.
        """
        self.word = word
        self.morph_h = morph_h
        self.pos_h = pos_h
        self.pos_t = pos_t
        self.morph_t = morph_t
        self.chunk = chunk

    def __str__(self):
        return str(self.word)

    def __repr__(self):
        return self.word.__repr__()


class Affix(object):
    """Dummy class for manipulating suffixes and their related codes."""
    # codes maps integers (affix sizes) to dicts. each dict maps a suffix of the given
    # size to its code
    suffix_codes = {}
    prefix_codes = {}
    other = 0
    padding = 1
    num_suffixes_per_size = {}
    num_prefixes_per_size = {}

    @classmethod
    def load_suffixes(cls, md):
        """
        Loads suffixes from the suffix file.
        """
        cls.load_affixes(cls.suffix_codes, md.paths['suffixes'])

        # +2 because of the unknown suffix code and padding
        cls.num_suffixes_per_size = {size: len(cls.suffix_codes[size]) + 2
                                     for size in cls.suffix_codes}

    @classmethod
    def load_prefixes(cls, md):
        """
        Loads prefixes from the prefix file.
        """
        cls.load_affixes(cls.prefix_codes, md.paths['prefixes'])

        # +2 because of the unknown prefix code and padding
        cls.num_prefixes_per_size = {size: len(cls.prefix_codes[size]) + 2
                                     for size in cls.prefix_codes}

    @classmethod
    def load_affixes(cls, codes, filename):
        """
        Parent function for loading prefixes and suffixes.
        """
        logger = logging.getLogger("Logger")

        # intermediate storage
        affixes_by_size = defaultdict(list)

        try:
            with open(filename, 'rb') as f:
                for line in f:
                    affix = line.strip().decode('utf-8')
                    size = len(affix)
                    affixes_by_size[size].append(affix)
        except IOError:
            logger.error("File %s doesn't exist." % filename)
            raise

        for size in affixes_by_size:
            # for each size, each affix has a code starting from 2
            # 0 is reserved for unknown affixes
            # 1 is reserved for padding pseudo-affixes
            codes[size] = {affix: code
                           for code, affix in enumerate(affixes_by_size[size], 2)}

    @classmethod
    def get_suffix(cls, word, size):
        """
        Return the suffix code for the given word. Consider a suffix
        of the given size.
        """
        if word == WD.padding_left or word == WD.padding_right:
            return cls.padding

        if len(word) <= size:
            return cls.other

        suffix = word[-size:].lower()
        code = cls.suffix_codes[size].get(suffix, cls.other)
        return code

    @classmethod
    def get_prefix(cls, word, size):
        """
        Return the prefix code for the given word. Consider a prefix
        of the given size.
        """
        if word == WD.padding_left or word == WD.padding_right:
            return cls.padding

        if len(word) <= size:
            return cls.other

        prefix = word[:size].lower()
        code = cls.prefix_codes[size].get(prefix, cls.other)
        return code


class TokenConverter(object):

    def __init__(self):
        """
        Class to convert tokens into indices to their feature vectors in
        feature matrices.
        """
        self.extractors = []

    def add_extractor(self, extractor):
        """
        Adds an extractor function to the TokenConverter. In order to get a token's
        feature indices, the Converter will call each of its extraction functions passing
        the token as a parameter. The result will be a list containing each result.
        """
        self.extractors.append(extractor)

    def get_padding_left(self, tokens_as_string=True):
        """
        Returns an object to be used as the left padding in the sentence.

        :param tokens_as_string: if True, treat tokens as strings.
            If False, treat them as Token objects.
        """
        if tokens_as_string:
            pad = WD.padding_left
        else:
            pad = Token(WD.padding_left, morph_h=WD.padding_left,
                        morph_t=WD.padding_left, pos_h=PADDING_POS, pos_t=PADDING_POS)
        return self.convert(pad)

    def get_padding_right(self, tokens_as_string=True):
        """
        Returns an object to be used as the right padding in the sentence.

        :param tokens_as_string: if True, treat tokens as strings.
            If False, treat them as Token objects.
        """
        if tokens_as_string:
            pad = WD.padding_right
        else:
            pad = Token(WD.padding_right, morph_h=WD.padding_right,
                        morph_t=WD.padding_right, pos_h=PADDING_POS, pos_t=PADDING_POS)
        #pad = Token(WD.padding_right, pos_t=PADDING_POS)
        return self.convert(pad)

    def convert(self, token):
        """
        Converts a token into its feature indices.
        """
        indices = np.array([function(token) for function in self.extractors])
        return indices


def get_capitalization(word):
    """
    Returns a code describing the capitalization of the word:
    lower, title, other or non-alpha (numbers and other tokens that can't be
    capitalized).
    """
    if word == WD.padding_left or word == WD.padding_right:
        return Caps.padding

    if not any(c.isalpha() for c in word):
        # check if there is at least one letter
        # (this is faster than using a regex)
        return Caps.non_alpha

    if word.islower():
        return Caps.lower

    # word.istitle() returns False for compounds like Low-cost
    if len(word) == 1:
        # if we reached here, there's a single upper case letter
        return Caps.title
    elif word[0].isupper() and word[1:].islower():
        return Caps.title

    return Caps.other

def capitalize(word, capitalization):
    """
    Capitalizes the word in the desired format. If the capitalization is
    Caps.other, it is set all uppercase.
    """
    if capitalization == Caps.non_alpha or capitalization == Caps.padding:
        return word
    elif capitalization == Caps.lower:
        return word.lower()
    elif capitalization == Caps.title:
        return word[0].upper() + word[1:].lower()
    elif capitalization == Caps.other:
        return word.upper()
    else:
        raise ValueError("Unknown capitalization type.")
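The capitalization codes above are small integers stored alongside each word. A minimal round-trip sketch (illustrative only, not part of the package diff; it assumes the wheel is installed so nltkor.tag.libs.attributes is importable):

from nltkor.tag.libs.attributes import Caps, get_capitalization, capitalize

for word in ['seoul', 'Seoul', 'SEOUL', '2024']:
    code = get_capitalization(word)            # Caps.lower / Caps.title / Caps.other / Caps.non_alpha
    restored = capitalize(word.lower(), code)  # re-apply the recorded capitalization to a lowercased form
    print(word, code, restored)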
nltkor/tag/libs/config.py
@@ -0,0 +1,159 @@
# -*- coding: utf-8 -*-

"""
Configuration data for the system.
"""

import os

data_dir = None
FILES = {}

def get_config_paths(directory):
    """Returns a dict mapping resource keys to file paths inside the given data directory."""
    assert os.path.isdir(directory), 'Invalid data directory'

    return {key: os.path.join(directory, value) for key, value in [
        # cross-task data
        ('.', '.'),  # for data_dir access

        # vocabulary file used as a fallback if a reader doesn't have a specific one
        ('vocabulary' , 'vocabulary.txt'),
        ('type_features' , 'types-features.npy'),
        ('termvectors' , 'termvectors.txt'),

        # POS
        ('network_pos' , 'pos-network.npz'),
        ('network_text_pos' , 'pos-network.txt'),
        ('pos_tags' , 'pos-tags.txt'),
        ('pos_tag_dict' , 'pos-tags.txt'),
        ('pos_co_lexicon' , 'pos-co-lexicon.pickle'),
        ('pos_morph_lexicon' , 'pos-morph-lexicon.pickle'),
        ('pos_prob_dict' , 'pos-prob-dict.pickle'),
        ('pos_morph_lexicon_txt' , 'pos-morph-lexicon.txt'),
        ('suffix' , 'suffixes.txt'),
        ('suffixes' , 'suffixes.txt'),
        ('prefix' , 'prefixes.txt'),
        ('prefixes' , 'prefixes.txt'),
        ('metadata_pos' , 'metadata-pos.pickle'),
        ('metadata_text_pos' , 'metadata-pos.txt'),
        ('type_features_pos' , 'types-features-pos.npy'),
        ('caps_features_pos' , 'caps-features-pos.npy'),
        ('suffix_features_pos' , 'suffix-features-pos.npy'),
        ('prefix_features_pos' , 'prefix-features-pos.npy'),
        ('vocabulary_pos' , 'vocabulary-pos.txt'),

        # NER
        ('network_ner' , 'ner-network.npz'),
        ('network_text_ner' , 'ner-network.txt'),
        ('ner_tags' , 'ner-tags.txt'),
        ('ner_tag_dict' , 'ner-tags.txt'),
        ('ner_morph_lexicon' , 'ner-morph-lexicon.txt'),
        ('suffix' , 'suffixes.txt'),
        ('suffixes' , 'suffixes.txt'),
        ('prefix' , 'prefixes.txt'),
        ('prefixes' , 'prefixes.txt'),
        ('metadata_ner' , 'metadata-ner.pickle'),
        ('metadata_text_ner' , 'metadata-ner.txt'),
        ('type_features_ner' , 'types-features-ner.npy'),
        ('caps_features_ner' , 'caps-features-ner.npy'),
        ('suffix_features_ner' , 'suffix-features-ner.npy'),
        ('prefix_features_ner' , 'prefix-features-ner.npy'),
        ('vocabulary_ner' , 'vocabulary-ner.txt'),

        # WSD
        ('network_wsd' , 'wsd-network.npz'),
        ('network_text_wsd' , 'wsd-network.txt'),
        ('wsd_tags' , 'wsd-tags.txt'),
        ('wsd_tag_dict' , 'wsd-tags.txt'),
        ('wsd_morph_lexicon' , 'wsd-morph-lexicon.txt'),
        # ('suffix' , 'suffixes.txt'),
        # ('suffixes' , 'suffixes.txt'),
        # ('prefix' , 'prefixes.txt'),
        # ('prefixes' , 'prefixes.txt'),
        ('metadata_wsd' , 'metadata-wsd.pickle'),
        ('metadata_text_wsd' , 'metadata-wsd.txt'),
        ('type_features_wsd' , 'types-features-wsd.npy'),
        ('caps_features_wsd' , 'caps-features-wsd.npy'),
        ('suffix_features_wsd' , 'suffix-features-wsd.npy'),
        ('prefix_features_wsd' , 'prefix-features-wsd.npy'),
        ('vocabulary_wsd' , 'vocabulary-wsd.txt'),

        # dependency
        ('network_labeled_dependency', 'ldep-network.npz'),
        ('network_text_labeled_dependency', 'ldep-network.txt'),
        ('type_features_labeled_dependency', 'types-features-ldep.npy'),
        ('caps_features_labeled_dependency', 'caps-features-ldep.npy'),
        ('pos_features_labeled_dependency', 'pos-features-ldep.npy'),
        ('metadata_labeled_dependency', 'metadata-ldep.pickle'),
        ('metadata_text_labeled_dependency', 'metadata-ldep.txt'),
        ('dependency_tag_dict', 'dependency-tags.txt'),
        ('labeled_dependency_tag_dict', 'dependency-tags.txt'),
        ('vocabulary_labeled_dependency', 'vocabulary-ldep.txt'),

        ('dependency_pos_tags', 'dep-pos-tags.txt'),

        ('network_unlabeled_dependency', 'udep-network.npz'),
        ('network_text_unlabeled_dependency', 'udep-network.txt'),
        ('type_features_unlabeled_dependency', 'types-features-udep.npy'),
        ('caps_features_unlabeled_dependency', 'caps-features-udep.npy'),
        ('pos_features_unlabeled_dependency', 'pos-features-udep.npy'),
        ('metadata_unlabeled_dependency', 'metadata-udep.pickle'),
        ('metadata_text_unlabeled_dependency', 'metadata-udep.txt'),
        ('vocabulary_unlabeled_dependency', 'vocabulary-udep.txt'),

        # chunk
        #('chunk_tag_dict' , 'chunk-tag-dict.pickle'),
        #('chunk_tags' , 'chunk-tags.txt'),

        # SRL
        ('network_srl' , 'srl-network.npz'),
        ('network_text_srl' , 'srl-network.txt'),
        #('network_srl_boundary' , 'srl-id-network.npz'),
        #('network_srl_classify' , 'srl-class-network.npz'),
        #('network_srl_predicates' , 'srl-class-predicates.npz'),
        #('srl_iob_tag_dict' , 'srl-tags.txt'),
        #('srl_iob_tags' , 'srl-tags.txt'),
        ('srl_tags' , 'srl-tags.txt'),
        #('srl_classify_tag_dict' , 'srl-tags.txt'),
        #('srl_classify_tags' , 'srl-tags.txt'),
        #('srl_predicates_tag_dict' , 'srl-predicates-tags.txt'),
        #('srl_predicates_tags' , 'srl-predicates-tags.txt'),
        ('type_features_srl' , 'types-features-srl.npy'),
        ('caps_features_srl' , 'caps-features-srl.npy'),
        ('pos_features_srl' , 'pos-features-srl.npy'),
        #('chunk_features_classify' , 'chunk-features-class.npy'),
        #('type_features_boundary' , 'types-features-id.npy'),
        #('caps_features_boundary' , 'caps-features-id.npy'),
        #('pos_features_boundary' , 'pos-features-id.npy'),
        #('chunk_features_boundary' , 'chunk-features-id.npy'),
        #('type_features_classify' , 'types-features-class.npy'),
        #('caps_features_classify' , 'caps-features-class.npy'),
        #('pos_features_classify' , 'pos-features-class.npy'),
        #('chunk_features_classify' , 'chunk-features-class.npy'),
        #('type_features_1step' , 'types-features-1step.npy'),
        #('caps_features_1step' , 'caps-features-1step.npy'),
        #('pos_features_1step' , 'pos-features-1step.npy'),
        #('chunk_features_1step' , 'chunk-features-1step.npy'),
        #('type_features_srl_predicates', 'types-features-preds.npy'),
        #('caps_features_srl_predicates', 'caps-features-preds.npy'),
        #('pos_features_srl_predicates' , 'pos-features-preds.npy'),
        ('metadata_srl' , 'metadata-srl.pickle'),
        ('metadata_text_srl' , 'metadata-srl.txt'),
        #('metadata_srl_boundary' , 'metadata-srl-boundary.pickle'),
        #('metadata_srl_classify' , 'metadata-srl-classify.pickle'),
        #('metadata_srl_predicates' , 'metadata-srl-predicates.pickle'),
        ('vocabulary_srl', 'vocabulary-srl.txt'),
        #('vocabulary_srl_boundary', 'vocabulary-srl-boundary.txt'),
        #('vocabulary_srl_classify', 'vocabulary-srl-classify.txt'),
        #('vocabulary_srl_predicates', 'vocabulary-srl-predicates.txt')
        ]
    }


def set_data_dir(directory):
    """Sets the global data directory containing the data for the models."""
    global data_dir, FILES
    data_dir = directory
    FILES = get_config_paths(directory)
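set_data_dir is what fills the module-level FILES dict that other modules in tag/libs (for example the Metadata class in the next hunk) fall back to. A minimal sketch (illustrative, not part of the diff; 'espresso_data' is a hypothetical directory that must already exist, since get_config_paths asserts os.path.isdir):

from nltkor.tag.libs import config

config.set_data_dir('espresso_data')   # populates the module-level FILES dict
print(config.FILES['pos_tags'])        # espresso_data/pos-tags.txt
print(config.FILES['metadata_ner'])    # espresso_data/metadata-ner.pickle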
nltkor/tag/libs/metadata.py
@@ -0,0 +1,129 @@
# -*- coding: utf-8 -*-

"""
This script contains the definition of the Metadata class.
It can also be invoked in order to create a Metadata object
and save it to a file in the data directory.
"""

import _pickle

from . import config

class Metadata(object):
    """
    Class for storing metadata about a neural network and its
    parameter files.
    """

    def __init__(self, task, paths=None, use_caps=True, use_suffix=False, use_prefix=False,
                 use_pos=False, use_chunk=False, use_lemma=False):
        self.task = task
        self.paths = paths if paths else config.FILES
        self.use_caps = use_caps
        self.use_suffix = use_suffix
        self.use_prefix = use_prefix
        self.use_pos = use_pos
        self.use_chunk = use_chunk
        self.use_lemma = use_lemma
        self.metadata = 'metadata_%s' % task
        self.network = 'network_%s' % task
        self.network_text = 'network_text_%s' % task
        self.tag_dict = '%s_tag_dict' % task

        # dependency edge filter doesn't use an actual neural network, so
        # we call it "model" to be more consistent
        self.model = self.network
        '''
        if task == 'srl_boundary':
            self.pred_dist_table = 'pred_dist_table_boundary'
            self.target_dist_table = 'target_dist_table_boundary'
            self.transitions = 'srl_transitions_boundary'
            self.type_features = 'type_features_boundary'
            self.caps_features = 'caps_features_boundary'
            self.pos_features = 'pos_features_boundary'
            self.chunk_features = 'chunk_features_boundary'
            self.suffix_features = None

        elif task == 'srl_classify':
            self.pred_dist_table = 'pred_dist_table_classify'
            self.target_dist_table = 'target_dist_table_classify'
            self.transitions = None
            self.type_features = 'type_features_classify'
            self.caps_features = 'caps_features_classify'
            self.pos_features = 'pos_features_classify'
            self.chunk_features = 'chunk_features_classify'
            self.suffix_features = None

        elif task == 'srl':
            # one step srl
            self.pred_dist_table = 'pred_dist_table_1step'
            self.target_dist_table = 'target_dist_table_1step'
            self.transitions = 'srl_transitions_1step'
            self.type_features = 'type_features_1step'
            self.caps_features = 'caps_features_1step'
            self.pos_features = 'pos_features_1step'
            self.chunk_features = 'chunk_features_1step'
            self.suffix_features = None

        else:
            self.type_features = 'type_features_%s' % task
            self.caps_features = 'caps_features_%s' % task
            self.pos_features = 'pos_features_%s' % task
            self.chunk_features = 'chunk_features_%s' % task
            self.suffix_features = 'suffix_features_%s' % task
            self.prefix_features = 'prefix_features_%s' % task
        '''
        self.type_features = 'type_features_%s' % task
        self.caps_features = 'caps_features_%s' % task
        self.pos_features = 'pos_features_%s' % task
        #self.chunk_features = 'chunk_features_%s' % task
        self.suffix_features = 'suffix_features_%s' % task
        self.prefix_features = 'prefix_features_%s' % task

    def __str__(self):
        """Shows the task at hand and which attributes are used."""
        lines = []
        lines.append("Metadata for task %s" % self.task)
        for k in self.__dict__:
            if isinstance(k, str) and k.startswith('use_'):
                lines.append('%s: %s' % (k, self.__dict__[k]))

        return '\n'.join(lines)

    def save_to_file(self):
        """
        Save the contents of the metadata to a file. The filename is determined according
        to the task.
        """
        save_data = self.__dict__.copy()
        filename = self.paths['metadata_%s' % self.task]
        del(save_data['paths'])

        with open(filename, 'wb') as f:
            _pickle.dump(save_data, f, 2)

        filename = self.paths['metadata_text_%s' % self.task]

        with open(filename, 'wt') as f:
            for k, v in save_data.items():
                f.write("%s: %s\n" % (k, v))

    @classmethod
    def load_from_file(cls, task, paths=None):
        """
        Reads the file containing the metadata for the given task and returns a
        Metadata object.
        """
        if paths is None:
            paths = config.FILES
        md = Metadata(None, paths)

        # the actual content of the file is the __dict__ member variable, which contains all
        # the instance's data
        with open(paths['metadata_%s' % task], 'rb') as f:
            data = _pickle.load(f)
        md.__dict__.update(data)

        return md
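A minimal sketch of constructing a Metadata object directly (illustrative, not part of the diff; the paths dict and the /tmp locations are hypothetical, and save_to_file()/load_from_file() would additionally require those files to be writable/readable):

from nltkor.tag.libs.metadata import Metadata

paths = {'metadata_pos': '/tmp/metadata-pos.pickle',
         'metadata_text_pos': '/tmp/metadata-pos.txt'}

md = Metadata('pos', paths=paths, use_caps=True, use_suffix=True)
print(md)          # "Metadata for task pos" plus the use_* flags
print(md.network)  # 'network_pos', a key into config.FILES for the model file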
Binary file
Binary file
Binary file
Binary file
nltkor/tag/libs/ner/ner_reader.py
@@ -0,0 +1,92 @@
# -*- coding: utf-8 -*-

"""
Classes for dealing with NER data.
"""

from ..reader import TaggerReader

class ConllNER(object):
    """
    Dummy class for storing column positions in a conll file.
    """
    id = 0
    word = 1
    pos = 2
    ner = 3
    link = 4
    SEP = '\t'

class NERReader(TaggerReader):
    """
    This class reads data from a NER corpus and turns it into a format
    readable by the neural network for the NER tagging task.
    """

    def __init__(self, md=None, filename=None, load_dictionaries=True):
        """
        Constructor
        """
        self.rare_tag = None
        self.sentences = []
        if filename is not None:
            try:
                self._read_plain(filename)
            except:
                # fall back to the CoNLL format if plain parsing fails
                self._read_conll(filename)

        super(NERReader, self).__init__(md, load_dictionaries=load_dictionaries)

    @property
    def task(self):
        """
        Abstract Base Class (ABC) attribute.
        """
        return 'ner'

    def _read_plain(self, filename):
        """
        Read data from a "plain" file, with one sentence per line, each token
        as token_tag.
        """
        self.sentences = []
        with open(filename, 'rt') as f:
            for line in f:
                #line = unicode(line, 'utf-8')
                items = line.strip().split()
                sentence = []
                for item in items:
                    token, tag = item.rsplit('_', 1)
                    sentence.append((token, tag))

                self.sentences.append(sentence)

    def _read_conll(self, filename):
        """
        Read data from a CoNLL-formatted file. It expects at least the columns
        defined in ConllNER: id, surface word, POS tag, NER tag and link.
        """
        self.sentences = []
        sentence = []
        with open(filename, 'rt') as f:
            for line in f:
                line = line.strip()
                if line == '':
                    if len(sentence) > 0:
                        self.sentences.append(sentence)
                    sentence = []  # guard against multiple blank lines after a sentence
                    continue

                fields = line.split(ConllNER.SEP)
                word = fields[ConllNER.word]
                pos = fields[ConllNER.pos]
                ner = fields[ConllNER.ner]
                link = fields[ConllNER.link]
                sentence.append((word, ner))

        if len(sentence) > 0:
            self.sentences.append(sentence)

# backwards compatibility
MacMorphoReader = NERReader
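For reference, a minimal sketch of the "plain" input format that _read_plain expects: one sentence per line, each token written as token_tag and split on the last underscore (illustrative only; the sentence and the ORG/LOC/O tags are made-up examples, not taken from the package):

line = "삼성전자_ORG 는_O 서울_LOC 에_O 있다_O"
sentence = [tuple(item.rsplit('_', 1)) for item in line.split()]
print(sentence)  # [('삼성전자', 'ORG'), ('는', 'O'), ('서울', 'LOC'), ('에', 'O'), ('있다', 'O')]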