nltkor-1.2.14-cp311-cp311-macosx_13_0_x86_64.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nltkor/Kor_char.py +193 -0
- nltkor/__init__.py +16 -0
- nltkor/alignment/__init__.py +1315 -0
- nltkor/cider/__init__.py +2 -0
- nltkor/cider/cider.py +55 -0
- nltkor/cider/cider_scorer.py +207 -0
- nltkor/distance/__init__.py +441 -0
- nltkor/distance/wasserstein.py +126 -0
- nltkor/etc.py +22 -0
- nltkor/lazyimport.py +144 -0
- nltkor/make_requirement.py +11 -0
- nltkor/metrics/__init__.py +63 -0
- nltkor/metrics/bartscore.py +301 -0
- nltkor/metrics/bertscore.py +331 -0
- nltkor/metrics/bleu_tensor.py +20 -0
- nltkor/metrics/classical.py +847 -0
- nltkor/metrics/entment.py +24 -0
- nltkor/metrics/eval.py +517 -0
- nltkor/metrics/mauve.py +273 -0
- nltkor/metrics/mauve_utils.py +131 -0
- nltkor/misc/__init__.py +11 -0
- nltkor/misc/string2string_basic_functions.py +59 -0
- nltkor/misc/string2string_default_tokenizer.py +83 -0
- nltkor/misc/string2string_hash_functions.py +159 -0
- nltkor/misc/string2string_word_embeddings.py +503 -0
- nltkor/search/__init__.py +10 -0
- nltkor/search/classical.py +569 -0
- nltkor/search/faiss_search.py +787 -0
- nltkor/search/kobert_tokenizer.py +181 -0
- nltkor/sejong/__init__.py +3 -0
- nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
- nltkor/sejong/ch.py +12 -0
- nltkor/sejong/dict_semClassNum.txt +491 -0
- nltkor/sejong/layer.txt +630 -0
- nltkor/sejong/sejong_download.py +87 -0
- nltkor/sejong/ssem.py +684 -0
- nltkor/similarity/__init__.py +3 -0
- nltkor/similarity/bartscore____.py +337 -0
- nltkor/similarity/bertscore____.py +339 -0
- nltkor/similarity/classical.py +245 -0
- nltkor/similarity/cosine_similarity.py +175 -0
- nltkor/tag/__init__.py +71 -0
- nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
- nltkor/tag/espresso_tag.py +220 -0
- nltkor/tag/libs/__init__.py +10 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
- nltkor/tag/libs/arguments.py +280 -0
- nltkor/tag/libs/attributes.py +231 -0
- nltkor/tag/libs/config.py +159 -0
- nltkor/tag/libs/metadata.py +129 -0
- nltkor/tag/libs/ner/__init__.py +2 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/macmorphoreader.py +7 -0
- nltkor/tag/libs/ner/ner_reader.py +92 -0
- nltkor/tag/libs/network.c +72325 -0
- nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
- nltkor/tag/libs/network.pyx +878 -0
- nltkor/tag/libs/networkconv.pyx +1028 -0
- nltkor/tag/libs/networkdependencyconv.pyx +451 -0
- nltkor/tag/libs/parse/__init__.py +1 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/parse_reader.py +283 -0
- nltkor/tag/libs/pos/__init__.py +2 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/macmorphoreader.py +7 -0
- nltkor/tag/libs/pos/pos_reader.py +97 -0
- nltkor/tag/libs/reader.py +485 -0
- nltkor/tag/libs/srl/__init__.py +3 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__srl_reader_.py +535 -0
- nltkor/tag/libs/srl/srl_reader.py +436 -0
- nltkor/tag/libs/srl/train_srl.py +87 -0
- nltkor/tag/libs/taggers.py +926 -0
- nltkor/tag/libs/utils.py +384 -0
- nltkor/tag/libs/word_dictionary.py +239 -0
- nltkor/tag/libs/wsd/__init__.py +2 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
- nltkor/tag/libs/wsd/wsd_reader.py +93 -0
- nltkor/tokenize/__init__.py +62 -0
- nltkor/tokenize/ko_tokenize.py +115 -0
- nltkor/trans.py +121 -0
- nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
- nltkor-1.2.14.dist-info/METADATA +41 -0
- nltkor-1.2.14.dist-info/RECORD +127 -0
- nltkor-1.2.14.dist-info/WHEEL +5 -0
- nltkor-1.2.14.dist-info/top_level.txt +1 -0
nltkor/tag/libs/parse/parse_reader.py
@@ -0,0 +1,283 @@
+# -*- coding: utf-8 -*-
+
+'''
+Class for dealing with dependency parsing data.
+'''
+
+import os
+import logging
+import numpy as np
+
+from .. import attributes
+from .. import reader
+from ..word_dictionary import WordDictionary
+
+class ConllPos(object):
+    '''
+    Dummy class to store field positions in a CoNLL-like file
+    for dependency parsing. NB: the positions are different from
+    those used in SRL!
+    '''
+    id = 0
+    word = 1      # eojeol (word phrase)
+    morph_h = 2   # first morpheme, excluding any prefix
+    pos_h = 3     # POS tag of the first morpheme, excluding any prefix
+    morph_t = 4   # last morpheme
+    pos_t = 5     # POS tag of the last morpheme
+    dep_head = 6  # dependency head
+    dep_rel = 7   # dependency relation (syntactic tag)
+    SEP = '\t'
+
+
+class DependencyReader(reader.TaggerReader):
+    '''
+    Class to read dependency files in CoNLL-X format.
+    '''
+
+    def __init__(self, md=None, filename=None, labeled=False):
+        '''
+        Constructor.
+
+        :param md: Metadata object containing the description for this reader
+        :param filename: file containing data to be read and used in training
+            or tagging
+        :param labeled: (ignored if md is supplied) whether it is intended
+            to be used in labeled dependency parsing. Note that if it is
+            True, another reader object will be needed for unlabeled dependency.
+        '''
+        if md is not None:
+            self.labeled = md.task.startswith('labeled')
+        else:
+            self.labeled = labeled
+
+        if filename is not None:
+            self._read_conll(filename)
+
+        if self.labeled:
+            self.taskname = 'labeled_dependency'
+        else:
+            self.taskname = 'unlabeled_dependency'
+
+        self.rare_tag = None
+        self.pos_dict = None
+        super(DependencyReader, self).__init__(md)
+
+    @property
+    def task(self):
+        """
+        Abstract Base Class (ABC) attribute.
+        """
+        return self.taskname
+
+    def _read_conll(self, filename):
+        '''
+        Read data from a CoNLL-formatted file.
+        '''
+        lines = []
+        self.sentences = []
+        self.heads = []
+
+        # this keeps track of the tokens
+        sentence = []
+
+        # this has the number of each token's head, in the same order as
+        # the tokens appear
+        sentence_heads = []
+        if self.labeled:
+            self.labels = []
+            sentence_labels = []
+
+        with open(filename, 'r') as f:
+            for line in f:
+                lines.append(line.strip())
+
+        for line in lines:
+            if line == '':
+                # empty line: the last sentence is finished
+                if len(sentence) > 0:
+                    self.sentences.append(sentence)
+                    self.heads.append(np.array(sentence_heads))
+
+                    if self.labeled:
+                        self.labels.append(sentence_labels)
+                        sentence_labels = []
+
+                    sentence = []
+                    sentence_heads = []
+
+                continue
+
+            fields = line.split(ConllPos.SEP)
+            word = fields[ConllPos.word]
+            morph_h = fields[ConllPos.morph_h]
+            pos_h = fields[ConllPos.pos_h]
+            morph_t = fields[ConllPos.morph_t]
+            pos_t = fields[ConllPos.pos_t]
+            head = int(fields[ConllPos.dep_head])
+            label = fields[ConllPos.dep_rel]
+
+            if head == 0:
+                # we represent a dependency on the root as an edge to the token itself
+                head = int(fields[ConllPos.id])
+
+            # -1 because tokens are numbered from 1
+            head -= 1
+
+            token = attributes.Token(word, pos_h=pos_h, morph_t=morph_t, pos_t=pos_t)
+            #token = attributes.Token(word, morph_h=morph_h, pos_h=pos_h, morph_t=morph_t, pos_t=pos_t)
+            sentence.append(token)
+            sentence_heads.append(head)
+            if self.labeled:
+                sentence_labels.append(label)
+
+        # in case there was no empty line after the last sentence
+        if len(sentence) > 0:
+            self.sentences.append(sentence)
+            self.heads.append(np.array(sentence_heads))
+            if self.labeled:
+                self.labels.append(sentence_labels)
+
+    def _create_pos_dict(self):
+        """
+        Examine all POS tags in the sentences and create a dictionary based on them.
+        """
+        logger = logging.getLogger("Logger")
+        logger.info('Creating new POS tag dictionary (for dependency parsing)')
+        tags = {token.pos_h for sent in self.sentences for token in sent} | \
+               {token.pos_t for sent in self.sentences for token in sent}
+        pos_dict = {tag: code for code, tag in enumerate(tags)}
+
+        code = max(pos_dict.values()) + 1
+        pos_dict[attributes.PADDING_POS] = code
+
+        return pos_dict
+
+    def load_pos_dict(self):
+        """
+        Load the POS tag dictionary (specific to dependency parsing)
+        from its default location.
+        """
+        logger = logging.getLogger("Logger")
+        logger.debug('Loading POS tag dictionary (for dependency parsing)')
+        pos_dict = reader.load_tag_dict(self.md.paths['dependency_pos_tags'])
+        return pos_dict
+
+    def load_tag_dict(self, filename=None):
+        """
+        Check whether this reader is for the unlabeled dependency task. If so,
+        it doesn't use a tag dictionary and the call is ignored.
+        """
+        if not self.labeled:
+            return
+
+        super(DependencyReader, self).load_tag_dict(filename)
+
+    def load_or_create_tag_dict(self):
+        """
+        Try to load the tag dictionary from the default location. If the
+        dictionary file is not available, scan the available sentences and
+        create a new one.
+
+        It is only needed in labeled dependency parsing.
+        """
+        if not self.labeled:
+            return
+
+        logger = logging.getLogger('Logger')
+        filename = self.md.paths['dependency_tag_dict']
+        if os.path.isfile(filename):
+            self.load_tag_dict(filename)
+            logger.debug('Loaded dependency tag dictionary')
+            return
+
+        tags = {tag for sent_labels in self.labels for tag in sent_labels}
+        self.tag_dict = {tag: code for code, tag in enumerate(tags)}
+
+        reader.save_tag_dict(filename, self.tag_dict)
+        logger.debug('Saved dependency tag dictionary')
+
+    def generate_dictionary(self, dict_size=None, minimum_occurrences=2):
+        """
+        Generate a token dictionary based on the given sentences.
+
+        :param dict_size: max number of tokens to be included in the dictionary.
+        :param minimum_occurrences: minimum number of times that a token must
+            appear in the text in order to be included in the dictionary.
+        """
+        logger = logging.getLogger("Logger")
+        all_tokens = [token.morph_h for sent in self.sentences for token in sent] \
+                   + [token.morph_t for sent in self.sentences for token in sent]
+        self.word_dict = WordDictionary(all_tokens, dict_size, minimum_occurrences)
+        logger.info("Created dictionary with %d tokens" % self.word_dict.num_tokens)
+
+    def codify_sentences(self):
+        """
+        Convert each token in each sentence into indices into its feature
+        vectors in the feature matrices. The sentences are no longer
+        accessible as text afterwards. Tags are left as the index of each
+        token's head.
+        """
+        if self.converter is None:
+            self.create_converter()
+
+        self.sentences = [np.array([self.converter.convert(token) for token in sent])
+                          for sent in self.sentences]
+
+        if self.labeled:
+            self.labels = [np.array([self.tag_dict[label] for label in sent_labels])
+                           for sent_labels in self.labels]
+
+        self.codified = True
+
+    def _load_or_create_pos_dict(self):
+        """
+        Try to load the POS tag dictionary to be used with this reader (when
+        using POS tags as additional features). If there isn't a file in the
+        data directory with the right name, a new dictionary is created
+        after examining the data.
+        """
+        if self.pos_dict is not None:
+            return
+
+        if os.path.isfile(self.md.paths['dependency_pos_tags']):
+            self.pos_dict = self.load_pos_dict()
+        else:
+            self.pos_dict = self._create_pos_dict()
+            self.save_tag_dict(self.md.paths['dependency_pos_tags'], self.pos_dict)
+
+    def get_num_pos_tags(self):
+        """
+        Return the number of POS tags that can be used as an additional feature
+        by this reader.
+        """
+        self._load_or_create_pos_dict()
+        return len(self.pos_dict)
+
+    def create_converter(self):
+        """
+        This method overrides TextReader's in order to deal with Token
+        objects instead of raw strings. It also allows POS as an attribute.
+        The order here must match the order in utils.create_feature_table.
+        """
+        #f = lambda token: self.word_dict[token.morph_h]
+        self.converter = attributes.TokenConverter()
+        #self.converter.add_extractor(f)
+
+        f = lambda token: self.word_dict[token.morph_t]
+        self.converter.add_extractor(f)
+
+        #if self.md.use_caps:
+        #    caps_lookup = lambda t: attributes.get_capitalization(t.word)
+        #    self.converter.add_extractor(caps_lookup)
+
+        if self.md.use_pos:
+            self._load_or_create_pos_dict()
+            g = lambda token: self.pos_dict[token.pos_h]
+            self.converter.add_extractor(g)
+            g = lambda token: self.pos_dict[token.pos_t]
+            self.converter.add_extractor(g)
Binary file
Binary file
Binary file
Binary file
nltkor/tag/libs/pos/pos_reader.py
@@ -0,0 +1,97 @@
+# -*- coding: utf-8 -*-
+
+"""
+Class for dealing with POS data.
+"""
+
+import chardet
+from ..reader import TaggerReader
+
+class ConllPos(object):
+    """
+    Dummy class for storing column positions in a CoNLL file.
+    """
+    id = 0
+    word = 1
+    pos = 2
+    SEP = '\t'
+
+class POSReader(TaggerReader):
+    """
+    This class reads data from a POS corpus and turns it into a format
+    readable by the neural network for the POS tagging task.
+    """
+
+    def __init__(self, md=None, filename=None, load_dictionaries=True):
+        """
+        Constructor.
+        """
+        self.rare_tag = None
+        self.sentences = []
+        if filename is not None:
+            try:
+                self._read_plain(filename)
+            except Exception:
+                # not in the plain token_tag format; fall back to CoNLL
+                self._read_conll(filename)
+
+        super(POSReader, self).__init__(md, load_dictionaries=load_dictionaries)
+
+    @property
+    def task(self):
+        """
+        Abstract Base Class (ABC) attribute.
+        """
+        return 'pos'
+
+    def _read_plain(self, filename):
+        """
+        Read data from a "plain" file, with one sentence per line and each
+        token given as token_tag.
+        """
+        self.sentences = []
+        with open(filename, 'rb') as f:
+            raw_data = f.read(1024)
+        # chardet may report None for the encoding; fall back to UTF-8
+        detected = chardet.detect(raw_data).get('encoding') or 'utf-8'
+        with open(filename, 'rt', encoding=detected) as f:
+            for line in f:
+                items = line.strip().split(ConllPos.SEP)
+                sentence = []
+                for item in items:
+                    token, tag = item.rsplit('_', 1)
+                    sentence.append((token, tag))
+
+                self.sentences.append(sentence)
+
+    def _read_conll(self, filename):
+        """
+        Read data from a CoNLL-formatted file. It expects at least three
+        columns: id, surface word and the POS tag.
+        """
+        self.sentences = []
+        sentence = []
+        with open(filename, 'rb') as f:
+            raw_data = f.read(1024)
+        # chardet may report None for the encoding; fall back to UTF-8
+        detected = chardet.detect(raw_data).get('encoding') or 'utf-8'
+        with open(filename, 'rt', encoding=detected) as f:
+            for line in f:
+                line = line.strip()
+                if line == '':
+                    if len(sentence) > 0:
+                        self.sentences.append(sentence)
+                        sentence = []
+                    continue
+
+                fields = line.split(ConllPos.SEP)
+                word = fields[ConllPos.word]
+                pos = fields[ConllPos.pos]
+                sentence.append((word, pos))
+
+        if len(sentence) > 0:
+            self.sentences.append(sentence)
+
+
+# backwards compatibility
+MacMorphoReader = POSReader