nltkor-1.2.14-cp311-cp311-macosx_13_0_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. nltkor/Kor_char.py +193 -0
  2. nltkor/__init__.py +16 -0
  3. nltkor/alignment/__init__.py +1315 -0
  4. nltkor/cider/__init__.py +2 -0
  5. nltkor/cider/cider.py +55 -0
  6. nltkor/cider/cider_scorer.py +207 -0
  7. nltkor/distance/__init__.py +441 -0
  8. nltkor/distance/wasserstein.py +126 -0
  9. nltkor/etc.py +22 -0
  10. nltkor/lazyimport.py +144 -0
  11. nltkor/make_requirement.py +11 -0
  12. nltkor/metrics/__init__.py +63 -0
  13. nltkor/metrics/bartscore.py +301 -0
  14. nltkor/metrics/bertscore.py +331 -0
  15. nltkor/metrics/bleu_tensor.py +20 -0
  16. nltkor/metrics/classical.py +847 -0
  17. nltkor/metrics/entment.py +24 -0
  18. nltkor/metrics/eval.py +517 -0
  19. nltkor/metrics/mauve.py +273 -0
  20. nltkor/metrics/mauve_utils.py +131 -0
  21. nltkor/misc/__init__.py +11 -0
  22. nltkor/misc/string2string_basic_functions.py +59 -0
  23. nltkor/misc/string2string_default_tokenizer.py +83 -0
  24. nltkor/misc/string2string_hash_functions.py +159 -0
  25. nltkor/misc/string2string_word_embeddings.py +503 -0
  26. nltkor/search/__init__.py +10 -0
  27. nltkor/search/classical.py +569 -0
  28. nltkor/search/faiss_search.py +787 -0
  29. nltkor/search/kobert_tokenizer.py +181 -0
  30. nltkor/sejong/__init__.py +3 -0
  31. nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
  32. nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
  33. nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
  34. nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
  35. nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
  36. nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
  37. nltkor/sejong/ch.py +12 -0
  38. nltkor/sejong/dict_semClassNum.txt +491 -0
  39. nltkor/sejong/layer.txt +630 -0
  40. nltkor/sejong/sejong_download.py +87 -0
  41. nltkor/sejong/ssem.py +684 -0
  42. nltkor/similarity/__init__.py +3 -0
  43. nltkor/similarity/bartscore____.py +337 -0
  44. nltkor/similarity/bertscore____.py +339 -0
  45. nltkor/similarity/classical.py +245 -0
  46. nltkor/similarity/cosine_similarity.py +175 -0
  47. nltkor/tag/__init__.py +71 -0
  48. nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
  49. nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
  50. nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
  51. nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
  52. nltkor/tag/espresso_tag.py +220 -0
  53. nltkor/tag/libs/__init__.py +10 -0
  54. nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
  55. nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
  56. nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
  57. nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
  58. nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
  59. nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
  60. nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
  61. nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
  62. nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
  63. nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
  64. nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
  65. nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
  66. nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
  67. nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
  68. nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
  69. nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
  70. nltkor/tag/libs/arguments.py +280 -0
  71. nltkor/tag/libs/attributes.py +231 -0
  72. nltkor/tag/libs/config.py +159 -0
  73. nltkor/tag/libs/metadata.py +129 -0
  74. nltkor/tag/libs/ner/__init__.py +2 -0
  75. nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
  76. nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
  77. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
  78. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
  79. nltkor/tag/libs/ner/macmorphoreader.py +7 -0
  80. nltkor/tag/libs/ner/ner_reader.py +92 -0
  81. nltkor/tag/libs/network.c +72325 -0
  82. nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
  83. nltkor/tag/libs/network.pyx +878 -0
  84. nltkor/tag/libs/networkconv.pyx +1028 -0
  85. nltkor/tag/libs/networkdependencyconv.pyx +451 -0
  86. nltkor/tag/libs/parse/__init__.py +1 -0
  87. nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
  88. nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
  89. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
  90. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
  91. nltkor/tag/libs/parse/parse_reader.py +283 -0
  92. nltkor/tag/libs/pos/__init__.py +2 -0
  93. nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
  94. nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
  95. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
  96. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
  97. nltkor/tag/libs/pos/macmorphoreader.py +7 -0
  98. nltkor/tag/libs/pos/pos_reader.py +97 -0
  99. nltkor/tag/libs/reader.py +485 -0
  100. nltkor/tag/libs/srl/__init__.py +3 -0
  101. nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
  102. nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
  103. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
  104. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
  105. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
  106. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
  107. nltkor/tag/libs/srl/__srl_reader_.py +535 -0
  108. nltkor/tag/libs/srl/srl_reader.py +436 -0
  109. nltkor/tag/libs/srl/train_srl.py +87 -0
  110. nltkor/tag/libs/taggers.py +926 -0
  111. nltkor/tag/libs/utils.py +384 -0
  112. nltkor/tag/libs/word_dictionary.py +239 -0
  113. nltkor/tag/libs/wsd/__init__.py +2 -0
  114. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
  115. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
  116. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
  117. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
  118. nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
  119. nltkor/tag/libs/wsd/wsd_reader.py +93 -0
  120. nltkor/tokenize/__init__.py +62 -0
  121. nltkor/tokenize/ko_tokenize.py +115 -0
  122. nltkor/trans.py +121 -0
  123. nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
  124. nltkor-1.2.14.dist-info/METADATA +41 -0
  125. nltkor-1.2.14.dist-info/RECORD +127 -0
  126. nltkor-1.2.14.dist-info/WHEEL +5 -0
  127. nltkor-1.2.14.dist-info/top_level.txt +1 -0
@@ -0,0 +1,231 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ import logging
4
+ import numpy as np
5
+
6
+ from .word_dictionary import WordDictionary as WD
7
+ from collections import defaultdict
8
+
9
+ # dummy value to be used when POS is an additional attribute
10
+ PADDING_POS = 'PADDING'
11
+
12
+ class Caps(object):
13
+ """Dummy class for storing numeric values for capitalization."""
14
+ num_values = 5
15
+ lower = 0
16
+ title = 1
17
+ non_alpha = 2
18
+ other = 3
19
+ padding = 4
20
+
21
+
22
+ class Token(object):
23
+ def __init__(self, word, morph_h='NA', pos_h='NA', morph_t='NA', pos_t='NA', chunk='NA'):
24
+ """
25
+ A token representation that stores discrete attributes to be given as
26
+ input to the neural network.
27
+ """
28
+ self.word = word
29
+ self.morph_h = morph_h
30
+ self.pos_h = pos_h
31
+ self.pos_t = pos_t
32
+ self.morph_t = morph_t
33
+ self.chunk = chunk
34
+
35
+ def __str__(self):
36
+ return str(self.word)
37
+
38
+ def __repr__(self):
39
+ return self.word.__repr__()
40
+
41
+
42
+ class Affix(object):
43
+ """Dummy class for manipulating suffixes and their related codes."""
44
+ # codes map integers (affix sizes) to dicts. each dict maps an affix of the given
45
+ # size to its code
46
+ suffix_codes = {}
47
+ prefix_codes = {}
48
+ other = 0
49
+ padding = 1
50
+ num_suffixes_per_size = {}
51
+ num_prefixes_per_size = {}
52
+
53
+ @classmethod
54
+ def load_suffixes(cls, md):
55
+ """
56
+ Loads suffixes from the suffix file.
57
+ """
58
+ cls.load_affixes(cls.suffix_codes, md.paths['suffixes'])
59
+
60
+ # +2 because of the unknown suffix code and padding
61
+ cls.num_suffixes_per_size = {size: len(cls.suffix_codes[size]) + 2
62
+ for size in cls.suffix_codes}
63
+
64
+ @classmethod
65
+ def load_prefixes(cls, md):
66
+ """
67
+ Loads prefixes from the prefix file.
68
+ """
69
+ cls.load_affixes(cls.prefix_codes, md.paths['prefixes'])
70
+
71
+ # +2 because of the unknown prefix code and padding
72
+ cls.num_prefixes_per_size = {size: len(cls.prefix_codes[size]) + 2
73
+ for size in cls.prefix_codes}
74
+
75
+
76
+ @classmethod
77
+ def load_affixes(cls, codes, filename):
78
+ """
79
+ Parent function for loading prefixes and suffixes.
80
+ """
81
+ logger = logging.getLogger("Logger")
82
+
83
+ # intermediate storage
84
+ affixes_by_size = defaultdict(list)
85
+
86
+ try:
87
+ with open(filename, 'rb') as f:
88
+ for line in f:
89
+ affix = line.strip().decode('utf-8')
90
+ size = len(affix)
91
+ affixes_by_size[size].append(affix)
92
+ except IOError:
93
+ logger.error("File %s doesn't exist." % filename)
94
+ raise
95
+
96
+ for size in affixes_by_size:
97
+ # for each size, each affix has a code starting from 2
98
+ # 0 is reserved for unknown affixes
99
+ # 1 is reserved for padding pseudo-affixes
100
+ codes[size] = {affix: code
101
+ for code, affix in enumerate(affixes_by_size[size], 2)}
102
+
103
+ @classmethod
104
+ def get_suffix(cls, word, size):
105
+ """
106
+ Return the suffix code for the given word. Consider a suffix
107
+ of the given size.
108
+ """
109
+ if word == WD.padding_left or word == WD.padding_right:
110
+ return cls.padding
111
+
112
+ if len(word) <= size:
113
+ return cls.other
114
+
115
+ suffix = word[-size:].lower()
116
+ code = cls.suffix_codes[size].get(suffix, cls.other)
117
+ return code
118
+
119
+ @classmethod
120
+ def get_prefix(cls, word, size):
121
+ """
122
+ Return the prefix code for the given word. Consider a prefix
123
+ of the given size.
124
+ """
125
+ if word == WD.padding_left or word == WD.padding_right:
126
+ return cls.padding
127
+
128
+ if len(word) <= size:
129
+ return cls.other
130
+
131
+ prefix = word[:size].lower()
132
+ code = cls.prefix_codes[size].get(prefix, cls.other)
133
+ return code
134
+
135
+
136
+ class TokenConverter(object):
137
+
138
+ def __init__(self):
139
+ """
140
+ Class to convert tokens into indices to their feature vectors in
141
+ feature matrices.
142
+ """
143
+ self.extractors = []
144
+
145
+ def add_extractor(self, extractor):
146
+ """
147
+ Adds an extractor function to the TokenConverter. In order to get a token's
148
+ feature indices, the Converter will call each of its extraction functions passing
149
+ the token as a parameter. The result is an array with one value per extractor.
150
+ """
151
+ self.extractors.append(extractor)
152
+
153
+ def get_padding_left(self, tokens_as_string=True):
154
+ """
155
+ Returns an object to be used as the left padding in the sentence.
156
+
157
+ :param tokens_as_string: if True, treat tokens as strings.
158
+ If False, treat them as Token objects.
159
+ """
160
+ if tokens_as_string:
161
+ pad = WD.padding_left
162
+ else:
163
+ pad = Token(WD.padding_left, morph_h=WD.padding_left, \
164
+ morph_t=WD.padding_left, pos_h=PADDING_POS, pos_t=PADDING_POS)
165
+ return self.convert(pad)
166
+
167
+ def get_padding_right(self, tokens_as_string=True):
168
+ """
169
+ Returns an object to be used as the right padding in the sentence.
170
+
171
+ :param tokens_as_string: if True, treat tokens as strings.
172
+ If False, treat them as Token objects.
173
+ """
174
+ if tokens_as_string:
175
+ pad = WD.padding_right
176
+ else:
177
+ pad = Token(WD.padding_right, morph_h=WD.padding_right, \
178
+ morph_t=WD.padding_right, pos_h=PADDING_POS, pos_t=PADDING_POS)
179
+ #pad = Token(WD.padding_right, pos_t=PADDING_POS)
180
+ return self.convert(pad)
181
+
182
+ def convert(self, token):
183
+ """
184
+ Converts a token into its feature indices.
185
+ """
186
+ indices = np.array([function(token) for function in self.extractors])
187
+ return indices
188
+
189
+
190
+ def get_capitalization(word):
191
+ """
192
+ Returns a code describing the capitalization of the word:
193
+ lower, title, other or non-alpha (numbers and other tokens that can't be
194
+ capitalized).
195
+ """
196
+ if word == WD.padding_left or word == WD.padding_right:
197
+ return Caps.padding
198
+
199
+ if not any(c.isalpha() for c in word):
200
+ # check if there is at least one letter
201
+ # (this is faster than using a regex)
202
+ return Caps.non_alpha
203
+
204
+ if word.islower():
205
+ return Caps.lower
206
+
207
+ # word.istitle() returns False for compounds like Low-cost
208
+ if len(word) == 1:
209
+ # if we reached here, there's a single upper case letter
210
+ return Caps.title
211
+ elif word[0].isupper() and word[1:].islower():
212
+ return Caps.title
213
+
214
+ return Caps.other
215
+
216
+ def capitalize(word, capitalization):
217
+ """
218
+ Capitalizes the word in the desired format. If the capitalization is
219
+ Caps.other, it is set all uppercase.
220
+ """
221
+ if capitalization == Caps.non_alpha or capitalization == Caps.padding:
222
+ return word
223
+ elif capitalization == Caps.lower:
224
+ return word.lower()
225
+ elif capitalization == Caps.title:
226
+ return word[0].upper() + word[1:].lower()
227
+ elif capitalization == Caps.other:
228
+ return word.upper()
229
+ else:
230
+ raise ValueError("Unknown capitalization type.")
231
+
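A minimal usage sketch of the capitalization helpers defined in the hunk above (the import path nltkor.tag.libs.attributes is an assumption based on the file listing; the hunk itself is unlabeled):

    from nltkor.tag.libs.attributes import Caps, get_capitalization, capitalize

    code = get_capitalization("Seoul")        # single leading capital -> Caps.title
    assert code == Caps.title
    assert get_capitalization("and") == Caps.lower
    assert get_capitalization("2024") == Caps.non_alpha

    # restore the recorded capitalization after lower-casing the word
    assert capitalize("seoul", code) == "Seoul"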
@@ -0,0 +1,159 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ """
4
+ Configuration data for the system.
5
+ """
6
+
7
+ import os
8
+
9
+ data_dir = None
10
+ FILES = {}
11
+
12
+ def get_config_paths(directory):
13
+ """Sets the data directory containing the data for the models."""
14
+ assert os.path.isdir(directory), 'Invalid data directory'
15
+
16
+ return { key: os.path.join(directory, value) for key, value in [
17
+ # cross-task data
18
+ ('.', '.'), #for data_dir access
19
+
20
+ # vocabulary file used as a fallback if a reader doesn't have a specific one
21
+ ('vocabulary' , 'vocabulary.txt'),
22
+ ('type_features' , 'types-features.npy'),
23
+ ('termvectors' , 'termvectors.txt'),
24
+
25
+ # POS
26
+ ('network_pos' , 'pos-network.npz'),
27
+ ('network_text_pos' , 'pos-network.txt'),
28
+ ('pos_tags' , 'pos-tags.txt'),
29
+ ('pos_tag_dict' , 'pos-tags.txt'),
30
+ ('pos_co_lexicon' , 'pos-co-lexicon.pickle'),
31
+ ('pos_morph_lexicon' , 'pos-morph-lexicon.pickle'),
32
+ ('pos_prob_dict' , 'pos-prob-dict.pickle'),
33
+ ('pos_morph_lexicon_txt' , 'pos-morph-lexicon.txt'),
34
+ ('suffix' , 'suffixes.txt'),
35
+ ('suffixes' , 'suffixes.txt'),
36
+ ('prefix' , 'prefixes.txt'),
37
+ ('prefixes' , 'prefixes.txt'),
38
+ ('metadata_pos' , 'metadata-pos.pickle'),
39
+ ('metadata_text_pos' , 'metadata-pos.txt'),
40
+ ('type_features_pos' , 'types-features-pos.npy'),
41
+ ('caps_features_pos' , 'caps-features-pos.npy'),
42
+ ('suffix_features_pos' , 'suffix-features-pos.npy'),
43
+ ('prefix_features_pos' , 'prefix-features-pos.npy'),
44
+ ('vocabulary_pos' , 'vocabulary-pos.txt'),
45
+
46
+ # NER
47
+ ('network_ner' , 'ner-network.npz'),
48
+ ('network_text_ner' , 'ner-network.txt'),
49
+ ('ner_tags' , 'ner-tags.txt'),
50
+ ('ner_tag_dict' , 'ner-tags.txt'),
51
+ ('ner_morph_lexicon' , 'ner-morph-lexicon.txt'),
52
+ ('suffix' , 'suffixes.txt'),
53
+ ('suffixes' , 'suffixes.txt'),
54
+ ('prefix' , 'prefixes.txt'),
55
+ ('prefixes' , 'prefixes.txt'),
56
+ ('metadata_ner' , 'metadata-ner.pickle'),
57
+ ('metadata_text_ner' , 'metadata-ner.txt'),
58
+ ('type_features_ner' , 'types-features-ner.npy'),
59
+ ('caps_features_ner' , 'caps-features-ner.npy'),
60
+ ('suffix_features_ner' , 'suffix-features-ner.npy'),
61
+ ('prefix_features_ner' , 'prefix-features-ner.npy'),
62
+ ('vocabulary_ner' , 'vocabulary-ner.txt'),
63
+
64
+ # WSD
65
+ ('network_wsd' , 'wsd-network.npz'),
66
+ ('network_text_wsd' , 'wsd-network.txt'),
67
+ ('wsd_tags' , 'wsd-tags.txt'),
68
+ ('wsd_tag_dict' , 'wsd-tags.txt'),
69
+ ('wsd_morph_lexicon' , 'wsd-morph-lexicon.txt'),
70
+ # ('suffix' , 'suffixes.txt'),
71
+ # ('suffixes' , 'suffixes.txt'),
72
+ # ('prefix' , 'prefixes.txt'),
73
+ # ('prefixes' , 'prefixes.txt'),
74
+ ('metadata_wsd' , 'metadata-wsd.pickle'),
75
+ ('metadata_text_wsd' , 'metadata-wsd.txt'),
76
+ ('type_features_wsd' , 'types-features-wsd.npy'),
77
+ ('caps_features_wsd' , 'caps-features-wsd.npy'),
78
+ ('suffix_features_wsd' , 'suffix-features-wsd.npy'),
79
+ ('prefix_features_wsd' , 'prefix-features-wsd.npy'),
80
+ ('vocabulary_wsd' , 'vocabulary-wsd.txt'),
81
+
82
+ # dependency
83
+ ('network_labeled_dependency', 'ldep-network.npz'),
84
+ ('network_text_labeled_dependency', 'ldep-network.txt'),
85
+ ('type_features_labeled_dependency', 'types-features-ldep.npy'),
86
+ ('caps_features_labeled_dependency', 'caps-features-ldep.npy'),
87
+ ('pos_features_labeled_dependency', 'pos-features-ldep.npy'),
88
+ ('metadata_labeled_dependency', 'metadata-ldep.pickle'),
89
+ ('metadata_text_labeled_dependency', 'metadata-ldep.txt'),
90
+ ('dependency_tag_dict', 'dependency-tags.txt'),
91
+ ('labeled_dependency_tag_dict', 'dependency-tags.txt'),
92
+ ('vocabulary_labeled_dependency', 'vocabulary-ldep.txt'),
93
+
94
+ ('dependency_pos_tags', 'dep-pos-tags.txt'),
95
+
96
+ ('network_unlabeled_dependency', 'udep-network.npz'),
97
+ ('network_text_unlabeled_dependency', 'udep-network.txt'),
98
+ ('type_features_unlabeled_dependency', 'types-features-udep.npy'),
99
+ ('caps_features_unlabeled_dependency', 'caps-features-udep.npy'),
100
+ ('pos_features_unlabeled_dependency', 'pos-features-udep.npy'),
101
+ ('metadata_unlabeled_dependency', 'metadata-udep.pickle'),
102
+ ('metadata_text_unlabeled_dependency', 'metadata-udep.txt'),
103
+ ('vocabulary_unlabeled_dependency', 'vocabulary-udep.txt'),
104
+
105
+ # chunk
106
+ #('chunk_tag_dict' , 'chunk-tag-dict.pickle'),
107
+ #('chunk_tags' , 'chunk-tags.txt'),
108
+
109
+ # SRL
110
+ ('network_srl' , 'srl-network.npz'),
111
+ ('network_text_srl' , 'srl-network.txt'),
112
+ #('network_srl_boundary' , 'srl-id-network.npz'),
113
+ #('network_srl_classify' , 'srl-class-network.npz'),
114
+ #('network_srl_predicates' , 'srl-class-predicates.npz'),
115
+ #('srl_iob_tag_dict' , 'srl-tags.txt'),
116
+ #('srl_iob_tags' , 'srl-tags.txt'),
117
+ ('srl_tags' , 'srl-tags.txt'),
118
+ #('srl_classify_tag_dict' , 'srl-tags.txt'),
119
+ #('srl_classify_tags' , 'srl-tags.txt'),
120
+ #('srl_predicates_tag_dict' , 'srl-predicates-tags.txt'),
121
+ #('srl_predicates_tags' , 'srl-predicates-tags.txt'),
122
+ ('type_features_srl' , 'types-features-srl.npy'),
123
+ ('caps_features_srl' , 'caps-features-srl.npy'),
124
+ ('pos_features_srl' , 'pos-features-srl.npy'),
125
+ #('chunk_features_classify' , 'chunk-features-class.npy'),
126
+ #('type_features_boundary' , 'types-features-id.npy'),
127
+ #('caps_features_boundary' , 'caps-features-id.npy'),
128
+ #('pos_features_boundary' , 'pos-features-id.npy'),
129
+ #('chunk_features_boundary' , 'chunk-features-id.npy'),
130
+ #('type_features_classify' , 'types-features-class.npy'),
131
+ #('caps_features_classify' , 'caps-features-class.npy'),
132
+ #('pos_features_classify' , 'pos-features-class.npy'),
133
+ #('chunk_features_classify' , 'chunk-features-class.npy'),
134
+ #('type_features_1step' , 'types-features-1step.npy'),
135
+ #('caps_features_1step' , 'caps-features-1step.npy'),
136
+ #('pos_features_1step' , 'pos-features-1step.npy'),
137
+ #('chunk_features_1step' , 'chunk-features-1step.npy'),
138
+ #('type_features_srl_predicates', 'types-features-preds.npy'),
139
+ #('caps_features_srl_predicates', 'caps-features-preds.npy'),
140
+ #('pos_features_srl_predicates' , 'pos-features-preds.npy'),
141
+ ('metadata_srl' , 'metadata-srl.pickle'),
142
+ ('metadata_text_srl' , 'metadata-srl.txt'),
143
+ #('metadata_srl_boundary' , 'metadata-srl-boundary.pickle'),
144
+ #('metadata_srl_classify' , 'metadata-srl-classify.pickle'),
145
+ #('metadata_srl_predicates' , 'metadata-srl-predicates.pickle'),
146
+ ('vocabulary_srl', 'vocabulary-srl.txt'),
147
+ #('vocabulary_srl_boundary', 'vocabulary-srl-boundary.txt'),
148
+ #('vocabulary_srl_classify', 'vocabulary-srl-classify.txt'),
149
+ #('vocabulary_srl_predicates', 'vocabulary-srl-predicates.txt')
150
+ ]
151
+ }
152
+
153
+
154
+ def set_data_dir(directory):
155
+ """Sets the global data directory containing the data for the models."""
156
+ global data_dir, FILES
157
+ data_dir = directory
158
+ FILES = get_config_paths(directory)
159
+
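A minimal usage sketch of the configuration helpers in the hunk above (assumed importable as nltkor.tag.libs.config; 'espresso-data' is a hypothetical directory that must already exist and contain the listed model files):

    from nltkor.tag.libs import config

    config.set_data_dir('espresso-data')    # populates the global FILES mapping
    print(config.FILES['network_pos'])      # -> espresso-data/pos-network.npz
    print(config.FILES['vocabulary'])       # -> espresso-data/vocabulary.txt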
@@ -0,0 +1,129 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ """
4
+ This script contains the definition of the Metadata class.
5
+ It can also be invoked in order to create a Metadata object
6
+ and save it to a file in the data directory.
7
+ """
8
+
9
+ import _pickle
10
+
11
+ from . import config
12
+
13
+ class Metadata(object):
14
+ """
15
+ Class for storing metadata about a neural network and its
16
+ parameter files.
17
+ """
18
+
19
+ def __init__(self, task, paths=None, use_caps=True, use_suffix=False, use_prefix=False,
20
+ use_pos=False, use_chunk=False, use_lemma=False):
21
+ self.task = task
22
+ self.paths = paths if paths else config.FILES
23
+ self.use_caps = use_caps
24
+ self.use_suffix = use_suffix
25
+ self.use_prefix = use_prefix
26
+ self.use_pos = use_pos
27
+ self.use_chunk = use_chunk
28
+ self.use_lemma = use_lemma
29
+ self.metadata = 'metadata_%s' % task
30
+ self.network = 'network_%s' % task
31
+ self.network_text = 'network_text_%s' % task
32
+ self.tag_dict = '%s_tag_dict' % task
33
+
34
+ # dependency edge filter doesn't use an actual neural network, so
35
+ # we call it "model" to be more consistent
36
+ self.model = self.network
37
+ '''
38
+ if task == 'srl_boundary':
39
+ self.pred_dist_table = 'pred_dist_table_boundary'
40
+ self.target_dist_table = 'target_dist_table_boundary'
41
+ self.transitions = 'srl_transitions_boundary'
42
+ self.type_features = 'type_features_boundary'
43
+ self.caps_features = 'caps_features_boundary'
44
+ self.pos_features = 'pos_features_boundary'
45
+ self.chunk_features = 'chunk_features_boundary'
46
+ self.suffix_features = None
47
+
48
+ elif task == 'srl_classify':
49
+ self.pred_dist_table = 'pred_dist_table_classify'
50
+ self.target_dist_table = 'target_dist_table_classify'
51
+ self.transitions = None
52
+ self.type_features = 'type_features_classify'
53
+ self.caps_features = 'caps_features_classify'
54
+ self.pos_features = 'pos_features_classify'
55
+ self.chunk_features = 'chunk_features_classify'
56
+ self.suffix_features = None
57
+
58
+ elif task == 'srl':
59
+ # one step srl
60
+ self.pred_dist_table = 'pred_dist_table_1step'
61
+ self.target_dist_table = 'target_dist_table_1step'
62
+ self.transitions = 'srl_transitions_1step'
63
+ self.type_features = 'type_features_1step'
64
+ self.caps_features = 'caps_features_1step'
65
+ self.pos_features = 'pos_features_1step'
66
+ self.chunk_features = 'chunk_features_1step'
67
+ self.suffix_features = None
68
+
69
+ else:
70
+ self.type_features = 'type_features_%s' % task
71
+ self.caps_features = 'caps_features_%s' % task
72
+ self.pos_features = 'pos_features_%s' % task
73
+ self.chunk_features = 'chunk_features_%s' % task
74
+ self.suffix_features = 'suffix_features_%s' % task
75
+ self.prefix_features = 'prefix_features_%s' % task
76
+ '''
77
+ self.type_features = 'type_features_%s' % task
78
+ self.caps_features = 'caps_features_%s' % task
79
+ self.pos_features = 'pos_features_%s' % task
80
+ #self.chunk_features = 'chunk_features_%s' % task
81
+ self.suffix_features = 'suffix_features_%s' % task
82
+ self.prefix_features = 'prefix_features_%s' % task
83
+
84
+ def __str__(self):
85
+ """Shows the task at hand and which attributes are used."""
86
+ lines = []
87
+ lines.append("Metadata for task %s" % self.task)
88
+ for k in self.__dict__:
89
+ if isinstance(k, str) and k.startswith('use_'):
90
+ lines.append('%s: %s' % (k, self.__dict__[k]))
91
+
92
+ return '\n'.join(lines)
93
+
94
+ def save_to_file(self):
95
+ """
96
+ Save the contents of the metadata to a file. The filename is determined according
97
+ to the task.
98
+ """
99
+ save_data = self.__dict__.copy()
100
+ filename = self.paths['metadata_%s' % self.task]
101
+ del save_data['paths']
102
+
103
+ with open(filename, 'wb') as f:
104
+ _pickle.dump(save_data, f, 2)
105
+
106
+ filename = self.paths['metadata_text_%s' % self.task]
107
+
108
+ with open(filename, 'wt') as f:
109
+ for k,v in save_data.items():
110
+ f.write("%s: %s\n" %(k,v))
111
+
112
+ @classmethod
113
+ def load_from_file(cls, task, paths=None):
114
+ """
115
+ Reads the file containing the metadata for the given task and returns a
116
+ Metadata object.
117
+ """
118
+ if paths is None:
119
+ paths = config.FILES
120
+ md = Metadata(None, paths)
121
+
122
+ # the actual content of the file is the __dict__ member variable, which contains all
123
+ # the instance's data
124
+ with open(paths['metadata_%s' % task], 'rb') as f:
125
+ data = _pickle.load(f)
126
+ md.__dict__.update(data)
127
+
128
+ return md
129
+
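A minimal save/load round trip for the Metadata class above (the import paths and the 'espresso-data' directory are assumptions; set_data_dir() must point at a writable data directory):

    from nltkor.tag.libs import config
    from nltkor.tag.libs.metadata import Metadata

    config.set_data_dir('espresso-data')        # hypothetical data directory
    md = Metadata('pos', use_suffix=True, use_prefix=True)
    md.save_to_file()                           # writes metadata-pos.pickle and metadata-pos.txt

    md2 = Metadata.load_from_file('pos')        # restores the saved attributes
    print(md2.use_suffix, md2.use_prefix)       # -> True True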
@@ -0,0 +1,2 @@
1
+
2
+ from .ner_reader import NERReader
@@ -0,0 +1,7 @@
1
+
2
+ import warnings
3
+
4
+ # backwards compatibility
5
+ from .ner_reader import *
6
+
7
+ warnings.warn('Module macmorphoreader is deprecated. Use module ner_reader instead.')
@@ -0,0 +1,92 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ """
4
+ Class for dealing with POS data.
5
+ """
6
+
7
+ from ..reader import TaggerReader
8
+
9
+ class ConllNER(object):
10
+ """
11
+ Dummy class for storing column positions in a conll file.
12
+ """
13
+ id = 0
14
+ word = 1
15
+ pos = 2
16
+ ner = 3
17
+ link = 4
18
+ SEP = '\t'
19
+
20
+ class NERReader(TaggerReader):
21
+ """
22
+ This class reads data from a POS corpus and turns it into a format
23
+ readable by the neural network for the POS tagging task.
24
+ """
25
+
26
+ def __init__(self, md=None, filename=None, load_dictionaries=True):
27
+ """
28
+ Constructor
29
+ """
30
+ self.rare_tag = None
31
+ self.sentences = []
32
+ if filename is not None:
33
+ try:
34
+ self._read_plain(filename)
35
+ except:
36
+ self._read_conll(filename)
37
+
38
+ super(NERReader, self).__init__(md, load_dictionaries=load_dictionaries)
39
+
40
+ @property
41
+ def task(self):
42
+ """
43
+ Abstract Base Class (ABC) attribute.
44
+ """
45
+ return 'ner'
46
+
47
+ def _read_plain(self, filename):
48
+ """
49
+ Read data from a "plain" file, with one sentence per line, each token
50
+ as token_tag.
51
+ """
52
+ self.sentences = []
53
+ with open(filename, 'rt') as f:
54
+ for line in f:
55
+ #line = unicode(line, 'utf-8')
56
+ items = line.strip().split()
57
+ sentence = []
58
+ for item in items:
59
+ token, tag = item.rsplit('_', 1)
60
+ sentence.append((token, tag))
61
+
62
+ self.sentences.append(sentence)
63
+
64
+ def _read_conll(self, filename):
65
+ """
66
+ Read data from a CoNLL formatted file. It expects at least 4 columns:
67
+ id, surface word, lemma (ignored, may be anything)
68
+ and the POS tag.
69
+ """
70
+ self.sentences = []
71
+ sentence = []
72
+ with open(filename, 'rt') as f:
73
+ for line in f:
74
+ line = line.strip()
75
+ if line == '':
76
+ if len(sentence) > 0:
77
+ self.sentences.append(sentence)
78
+ sentence = []  # guard against multiple blank lines after the end of a sentence
79
+ continue
80
+
81
+ fields = line.split(ConllNER.SEP)
82
+ word = fields[ConllNER.word]
83
+ pos = fields[ConllNER.pos]
84
+ ner = fields[ConllNER.ner]
85
+ link = fields[ConllNER.link]
86
+ sentence.append((word, ner))
87
+
88
+ if len(sentence) > 0:
89
+ self.sentences.append(sentence)
90
+
91
+ # backwards compatibility
92
+ MacMorphoReader = NERReader
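A minimal sketch of the "plain" input format consumed by NERReader._read_plain above: one sentence per line, each token written as token_tag. The Korean tokens and tag names below are purely illustrative:

    line = "삼성전자_ORG 는_O 서울_LOC 에서_O 발표했다_O"
    sentence = [tuple(item.rsplit('_', 1)) for item in line.strip().split()]
    # -> [('삼성전자', 'ORG'), ('는', 'O'), ('서울', 'LOC'), ('에서', 'O'), ('발표했다', 'O')]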