nltkor 1.2.14__cp311-cp311-macosx_13_0_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nltkor/Kor_char.py +193 -0
- nltkor/__init__.py +16 -0
- nltkor/alignment/__init__.py +1315 -0
- nltkor/cider/__init__.py +2 -0
- nltkor/cider/cider.py +55 -0
- nltkor/cider/cider_scorer.py +207 -0
- nltkor/distance/__init__.py +441 -0
- nltkor/distance/wasserstein.py +126 -0
- nltkor/etc.py +22 -0
- nltkor/lazyimport.py +144 -0
- nltkor/make_requirement.py +11 -0
- nltkor/metrics/__init__.py +63 -0
- nltkor/metrics/bartscore.py +301 -0
- nltkor/metrics/bertscore.py +331 -0
- nltkor/metrics/bleu_tensor.py +20 -0
- nltkor/metrics/classical.py +847 -0
- nltkor/metrics/entment.py +24 -0
- nltkor/metrics/eval.py +517 -0
- nltkor/metrics/mauve.py +273 -0
- nltkor/metrics/mauve_utils.py +131 -0
- nltkor/misc/__init__.py +11 -0
- nltkor/misc/string2string_basic_functions.py +59 -0
- nltkor/misc/string2string_default_tokenizer.py +83 -0
- nltkor/misc/string2string_hash_functions.py +159 -0
- nltkor/misc/string2string_word_embeddings.py +503 -0
- nltkor/search/__init__.py +10 -0
- nltkor/search/classical.py +569 -0
- nltkor/search/faiss_search.py +787 -0
- nltkor/search/kobert_tokenizer.py +181 -0
- nltkor/sejong/__init__.py +3 -0
- nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
- nltkor/sejong/ch.py +12 -0
- nltkor/sejong/dict_semClassNum.txt +491 -0
- nltkor/sejong/layer.txt +630 -0
- nltkor/sejong/sejong_download.py +87 -0
- nltkor/sejong/ssem.py +684 -0
- nltkor/similarity/__init__.py +3 -0
- nltkor/similarity/bartscore____.py +337 -0
- nltkor/similarity/bertscore____.py +339 -0
- nltkor/similarity/classical.py +245 -0
- nltkor/similarity/cosine_similarity.py +175 -0
- nltkor/tag/__init__.py +71 -0
- nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
- nltkor/tag/espresso_tag.py +220 -0
- nltkor/tag/libs/__init__.py +10 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
- nltkor/tag/libs/arguments.py +280 -0
- nltkor/tag/libs/attributes.py +231 -0
- nltkor/tag/libs/config.py +159 -0
- nltkor/tag/libs/metadata.py +129 -0
- nltkor/tag/libs/ner/__init__.py +2 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/macmorphoreader.py +7 -0
- nltkor/tag/libs/ner/ner_reader.py +92 -0
- nltkor/tag/libs/network.c +72325 -0
- nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
- nltkor/tag/libs/network.pyx +878 -0
- nltkor/tag/libs/networkconv.pyx +1028 -0
- nltkor/tag/libs/networkdependencyconv.pyx +451 -0
- nltkor/tag/libs/parse/__init__.py +1 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/parse_reader.py +283 -0
- nltkor/tag/libs/pos/__init__.py +2 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/macmorphoreader.py +7 -0
- nltkor/tag/libs/pos/pos_reader.py +97 -0
- nltkor/tag/libs/reader.py +485 -0
- nltkor/tag/libs/srl/__init__.py +3 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__srl_reader_.py +535 -0
- nltkor/tag/libs/srl/srl_reader.py +436 -0
- nltkor/tag/libs/srl/train_srl.py +87 -0
- nltkor/tag/libs/taggers.py +926 -0
- nltkor/tag/libs/utils.py +384 -0
- nltkor/tag/libs/word_dictionary.py +239 -0
- nltkor/tag/libs/wsd/__init__.py +2 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
- nltkor/tag/libs/wsd/wsd_reader.py +93 -0
- nltkor/tokenize/__init__.py +62 -0
- nltkor/tokenize/ko_tokenize.py +115 -0
- nltkor/trans.py +121 -0
- nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
- nltkor-1.2.14.dist-info/METADATA +41 -0
- nltkor-1.2.14.dist-info/RECORD +127 -0
- nltkor-1.2.14.dist-info/WHEEL +5 -0
- nltkor-1.2.14.dist-info/top_level.txt +1 -0
@@ -0,0 +1,926 @@
# -*- coding: utf-8 -*-

"""
Taggers wrapping the neural networks.
"""

import logging
#from os import major
import numpy as np
import re
from nltkor import etc

from . import utils
from . import config
from . import attributes
from .metadata import Metadata
from .pos import POSReader
from .ner import NERReader
from .wsd import WSDReader
from .srl import SRLReader
from .parse import DependencyReader
import sys
sys.path.append("libs/")
from .network import Network, ConvolutionalNetwork, ConvolutionalDependencyNetwork

def load_network(md):
    """
    Loads the network from the default file and returns it.
    """
    logger = logging.getLogger("Logger")
    is_srl = md.task == 'srl'

    logger.info('Loading network')
    if is_srl:
        net_class = ConvolutionalNetwork
    elif md.task.endswith('dependency'):
        net_class = ConvolutionalDependencyNetwork
    else:
        net_class = Network

    nn = net_class.load_from_file(md.paths[md.network])

    logger.info('Done')
    return nn

def create_reader(md, gold_file=None):
    """
    Creates a TextReader object for the given task and loads its dictionary.

    :param md: a metadata object describing the task
    :param gold_file: path to a file with gold standard data, if
        the reader will be used for testing.
    """
    logger = logging.getLogger('Logger')
    logger.info('Loading text reader...')

    if md.task == 'pos':
        tr = POSReader(md, filename=gold_file)

    elif md.task == 'ner':
        tr = NERReader(md, filename=gold_file)

    elif md.task == 'wsd':
        tr = WSDReader(md, filename=gold_file)

    elif 'dependency' in md.task:
        labeled = md.task.startswith('labeled')
        tr = DependencyReader(md, filename=gold_file, labeled=labeled)

    elif md.task.startswith('srl'):
        tr = SRLReader(md, filename=gold_file)

    else:
        raise ValueError("Unknown task: %s" % md.task)

    logger.info('Done')
    return tr
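
# Wiring sketch (illustrative; assumes the trained Espresso models sit under
# the configured data directory): load_network() and create_reader() are
# driven by the same Metadata object.
#
#   md = Metadata.load_from_file('pos', config.FILES)
#   nn = load_network(md)        # Network ('srl' -> ConvolutionalNetwork,
#                                #  '*dependency' -> ConvolutionalDependencyNetwork)
#   reader = create_reader(md)   # POSReader for the 'pos' task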

def _group_arguments(tokens, predicate_positions, arg_tokens, labels):
    """
    Groups the words pertaining to each argument and returns a list of
    (predicate, argument structure) tuples, one per predicate.
    """
    print(tokens, predicate_positions, arg_tokens, labels)
    arg_structs = []

    for predicate_position, pred_arg_tokens, pred_labels in zip(predicate_positions,
                                                                arg_tokens,
                                                                labels):
        structure = {}

        for tag, arg_token in zip(pred_labels, pred_arg_tokens):
            #argument_tokens = [token]
            #tag = pred_labels.pop(0)
            structure[tag] = [arg_token]

        predicate = tokens[predicate_position-1]
        arg_structs.append((predicate, structure))

    return arg_structs

class SRLAnnotatedSentence(object):
    """
    Class storing a sentence with annotated semantic roles.

    It stores a list with the sentence tokens, called `tokens`, and a list of tuples
    in the format `(predicate, arg_structures)`. Each `arg_structure` is a dict mapping
    semantic roles to the words that constitute it. This is used instead of a two-level
    dictionary because one sentence may have more than one occurrence of the same
    predicate.

    This class is used only for storing data.
    """

    def __init__(self, tokens, arg_structures):
        """
        Creates an instance of a sentence with SRL data.

        :param tokens: a list of strings
        :param arg_structures: a list of tuples in the format (predicate, mapping).
            Each predicate is a string and each mapping is a dictionary mapping role labels
            to the words that constitute it.
        """
        self.tokens = tokens
        self.arg_structures = arg_structures

class ParsedSentence(object):
    """
    Class for storing a sentence with dependency parsing annotation.

    It stores a list of tokens, the dependency heads, dependency labels and POS tags
    if the parser used them. Dependency heads are the index of the head of each
    token, and -1 means a dependency to the root.
    """
    def __init__(self, tokens, heads, labels, pos=None):
        """
        Constructor.

        :param tokens: list of strings
        :param heads: list of integers (-1 means dependency to root, others are token indices)
        :param labels: list of strings
        :param pos: None or list of strings
        """
        self.tokens = tokens
        self.heads = heads
        self.labels = labels
        self.pos = pos

    def __len__(self):
        return len(self.tokens)

    def to_conll_list(self):
        """
        Return the sentence as four parallel lists (tokens, POS tags,
        dependency labels, 1-based head indices), in the spirit of the
        CoNLL X columns.

        Head indices start from 1; the root is referred to as 0.
        POS is only available if the original parser used it.
        """
        tokenL = []
        headL = []
        labelL = []
        posL = []
        for i in range(len(self.tokens)):
            tokenL.append(self.tokens[i])
            headL.append(self.heads[i] + 1)
            labelL.append(self.labels[i])
            posL.append(self.pos[i] if self.pos else '_')  # guard against pos=None, as in to_conll()

        return tokenL, posL, labelL, headL

    def to_conll(self):
        """
        Return a string representation of the sentence in a simplified
        CoNLL X format.

        Each line has:
        [number starting from 1] token head label

        Token numbers start from 1; the root is referred to as 0.
        """
        result = []
        for i in range(len(self.tokens)):
            token = self.tokens[i]
            head = self.heads[i] + 1
            label = self.labels[i]
            pos = self.pos[i] if self.pos else '_'

            #line = u'{id}\t{token}\t_\t{pos}\t{pos}\t_\t{head}\t{label}'
            #result.append(line.format(id=i+1, pos=pos, head=head, label=label, token=token))
            line = u'{id}\t{token}\t{head}\t{label}'
            result.append(line.format(id=i+1, head=head, label=label, token=token))

        return '\n'.join(result)
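
# Example of ParsedSentence.to_conll() output for a three-token sentence
# (tokens and labels are illustrative; heads are 1-based, 0 marks the root):
#
#   1   나는      2   NP_SBJ
#   2   학교에    3   NP_AJT
#   3   간다      0   VP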

class Tagger(object):
    """
    Base class for taggers. It should not be instantiated.
    """
    def __init__(self, data_dir=None):
        """Creates a tagger and loads data preemptively"""
        asrt_msg = "espresso data directory is not set. \
If you don't have the trained models, download them from http://air.cwnu.ac.kr/espresso/models.html"
        if data_dir is None:
            assert config.data_dir is not None, asrt_msg
            self.paths = config.FILES
        else:
            self.paths = config.get_config_paths(data_dir)

        self.data_dir = data_dir
        self._load_data()

    def _load_data(self):
        """Implemented by subclasses"""
        pass
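
# All concrete taggers below share this constructor; each overrides
# _load_data() to load its own metadata, network and reader. A minimal
# construction sketch (assumes the trained models are available; the path
# below is hypothetical):
#
#   tagger = POSTagger()                     # uses config.FILES
#   tagger = POSTagger('/path/to/espresso')  # explicit data directory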

class SRLTagger(Tagger):
    """
    An SRLTagger loads the models and performs SRL on text.

    It works in three stages: predicate identification, argument detection and
    argument classification.
    """

    def _load_data(self):
        """Loads data for SRL"""
        md_srl = Metadata.load_from_file('srl', self.paths)
        self.nn = load_network(md_srl)
        self.reader = create_reader(md_srl)
        self.reader.create_converter()
        self.itd = self.reader.get_inverse_tag_dictionary()

        self.parser = DependencyParser(self.data_dir)

    def find_predicates(self, tokens):
        """
        Finds out which tokens are predicates.

        :param tokens: a list of dependency relation labels (strings),
            one per token
        :returns: the (1-based) positions of predicate tokens
        """
        answer = []
        for i, token in enumerate(tokens):
            if token[0] == 'V' and tokens[i-1][0] != 'V': answer.append(i+1)
        return np.array(answer)

    def find_arguments(self, token_obj, predL, headL, relL):
        """
        Finds the argument candidates of each predicate: the tokens whose
        dependency head is the predicate and whose relation label starts with 'N'.

        :param token_obj: a list of tokens, one per word
        :param predL: 1-based predicate positions
        :param headL: 1-based dependency head indices
        :param relL: dependency relation labels
        :returns: a list of argument tokens and a list of [start, end]
            index pairs, one list per predicate
        """
        answer_token = []; answer = []
        for p in predL:
            pred_arg_token = []; pred_arg = []
            for j, h in enumerate(headL):
                if p == h and relL[j][0] == 'N':
                    pred_arg_token.append(token_obj[j])
                    pred_arg.append(np.array([j, j]))

            #TODO
            # head of the predicate
            #if headL[p-1] != 0: # exclude the last one
            #    pred_arg_token.append(token_obj[headL[p-1]-1])
            #    pred_arg.append(np.array([headL[p-1]-1, headL[p-1]]))

            answer_token.append(pred_arg_token)
            answer.append(pred_arg)
        #print(answer_token)
        #print(answer)
        return answer_token, answer

    def tag(self, text, use_sent_tokenizer=True, mode='standard'):
        """
        Runs the SRL process on the given text.

        :param text: unicode or str encoded in utf-8.
        :returns: a list of SRLAnnotatedSentence objects
        """
        tokens = utils.tokenize(text)
        result = []
        for sent in tokens:
            tagged = self.tag_sentence(sent)
            result.append(tagged)

        return result

    def tag_sentence(self, tokens, no_repeats=False):
        """
        Runs the SRL process on the given tokens.

        :param tokens: a list of tokens (as strings)
        :param no_repeats: whether to prevent repeated argument labels
        :returns: an SRLAnnotatedSentence whose arg_structures are
            (predicate, arg_structure) tuples, where each arg_structure is a
            dictionary mapping argument labels to the words it includes.
        """
        # dependency parse
        parsed = self.parser.parse_sentence(tokens)
        wordL, posL, relL, headL = parsed.to_conll_list()
        tokens_obj = []
        for w, p, r in zip(wordL, posL, relL):
            hm, hp, tm, tp = p
            token = attributes.Token(w, hm, hp, tm, tp, r)
            tokens_obj.append(token)

        converted_class = np.array([self.reader.converter.convert(t)
                                    for t in tokens_obj])
        pred_positions = self.find_predicates(relL)

        arg_tokens, arg_limits = self.find_arguments(wordL, pred_positions, headL, relL)
        print(arg_tokens)
        print(pred_positions)
        print(arg_limits)

        # now, argument classification
        answers = self.nn.tag_sentence(converted_class,
                                       pred_positions, arg_limits,
                                       allow_repeats=not no_repeats)
        labels = [[self.itd[x] for x in pred_answer]
                  for pred_answer in answers]

        structures = _group_arguments(wordL, pred_positions, arg_tokens, labels)
        return SRLAnnotatedSentence(wordL, structures)
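
# Pipeline sketch for SRL (illustrative; requires the SRL, dependency and POS
# models): tag() tokenizes, and tag_sentence() parses dependencies, finds
# predicates and arguments, then classifies the argument labels.
#
#   srl = SRLTagger()
#   for sent in srl.tag('분석할 문장'):
#       for predicate, args in sent.arg_structures:
#           print(predicate, args)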

class DependencyParser(Tagger):
    """A Dependency Parser based on a neural network tagger."""

    def __init__(self, *args, **kwargs):
        """
        Set the data directory for the POS tagger, if one is used,
        and call the parent constructor.
        """
        super(DependencyParser, self).__init__(*args, **kwargs)

    def _load_data(self):
        """Loads data for Dependency Parsing"""
        md_udep = Metadata.load_from_file('unlabeled_dependency', paths=self.paths)
        self.unlabeled_nn = load_network(md_udep)
        self.unlabeled_reader = create_reader(md_udep)

        md_ldep = Metadata.load_from_file('labeled_dependency', paths=self.paths)
        self.labeled_nn = load_network(md_ldep)
        self.labeled_reader = create_reader(md_ldep)
        self.itd = self.labeled_reader.get_inverse_tag_dictionary()

        self.use_pos = md_udep.use_pos or md_ldep.use_pos
        if self.use_pos:
            self.pos_tagger = POSTagger(self.data_dir)

    def parse(self, text):
        """
        Splits the given text into sentences and determines their
        dependency trees. If you want to provide your own tokenized
        text, use `parse_sentence` instead.

        :param text: a string
        :returns: a list of ParsedSentence's
        """
        sentences = utils.tokenize(text)
        result = []
        for sent in sentences:
            parsed = self.parse_sentence(sent)
            result.append(parsed)

        return result

    def tag_tokens(self, tokens):
        """
        Parse the given sentence. This function is just an alias for
        `parse_sentence`.
        """
        return self.parse_sentence(tokens)

    def parse_sentence(self, tokens):
        """
        Parse the given sentence. It must be already tokenized; if you
        want nlpnet to tokenize the text, use the method `parse` instead.

        :param tokens: a list of strings (the tokens of one sentence)
        :return: a ParsedSentence instance
        """
        original_tokens = tokens
        udep_tokens_obj = []
        ldep_tokens_obj = []

        # if the parser uses POS as a feature, have a tagger tag it first
        if self.use_pos:
            eojeols, eojeol_features = self.pos_tagger.tag_tokens(tokens, mode='eojeol')
            #print("**", eojeols)
            #print(eojeol_features)
            #print(tokens, eojeols)

            for word, feature in zip(eojeols, eojeol_features):
                m_h, t_h, m_t, t_t = feature
                #udep_tokens_obj.append(attributes.Token(word, morph_h=m_h, pos_h=t_h, morph_t=m_t, pos_t=t_t))
                udep_tokens_obj.append(attributes.Token(word, pos_h=t_h, morph_t=m_t, pos_t=t_t))
                ldep_tokens_obj.append(attributes.Token(word, pos_h=t_h, morph_t=m_t, pos_t=t_t))

        converted_tokens = self.unlabeled_reader.codify_sentence(udep_tokens_obj)
        #print(converted_tokens)
        heads = self.unlabeled_nn.tag_sentence(converted_tokens)
        #print(heads)

        # the root is returned having a value == len(sentence)
        root = heads.argmax()
        heads[root] = root

        converted_tokens = self.labeled_reader.codify_sentence(ldep_tokens_obj)
        label_codes = self.labeled_nn.tag_sentence(converted_tokens, heads)
        labels = [self.itd[code] for code in label_codes]
        #print(label_codes)
        #print(labels)

        # in the final answer, signal the root with -1
        heads[root] = -1
        pos_tags = eojeol_features if self.use_pos else None
        #pos_tags = zip(*tokens)[1] if self.use_pos else None

        parsed = ParsedSentence(eojeols, heads, labels, pos_tags)
        #parsed = ParsedSentence(original_tokens, heads, labels, pos_tags)
        return parsed
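
    # Usage sketch (illustrative; requires the dependency and POS models):
    #
    #   parser = DependencyParser()
    #   parsed = parser.parse('의존 구문을 분석할 문장')[0]
    #   print(parsed.to_conll())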

    def tag(self, text, use_sent_tokenizer=True, mode='eojeol'):
        """
        Parse the given text. This is just an alias for the
        `parse` method.
        """
        return self.parse(text)

class WSDTagger(Tagger):
    """A WSDTagger loads the models and performs WSD tagging on text."""

    def _load_data(self):
        """Loads data for WSD"""
        md_wsd = Metadata.load_from_file('wsd', self.paths)
        self.nn = load_network(md_wsd)
        self.reader = create_reader(md_wsd)
        self.reader.create_converter()
        self.itd = self.reader.get_inverse_tag_dictionary()
        #self.morph_lexicon = self.reader.morph_lexicon # user lexicon
        #self.co_lexicon = self.reader.co_lexicon
        #self.prob_dict = self.reader.prob_dict
        self.pos_tagger = POSTagger(self.data_dir)

    def tag(self, text, use_sent_tokenizer=True, mode='standard'):
        """
        Tags the given text.

        :param text: a string or unicode object. Strings assumed to be utf-8
        :returns: a list of lists (sentences with tokens).
            Each sentence has (token, tag) tuples.
        """
        tokens = utils.tokenize(text)
        result = []
        for sent in tokens:
            tagged = self.tag_sentence(sent)
            result.append(tagged)

        return result

    def tag_sentence(self, tokens):
        """
        Tags a given list of tokens.

        Tokens should be produced with the espresso tokenizer in order to
        match the entries in the vocabulary. If you have non-tokenized text,
        use WSDTagger.tag(text).

        :param tokens: a list of strings
        :returns: a list of (morph, tag) pairs
        """
        pos_tagged = self.pos_tagger.tag_tokens(tokens)

        pos_tagged = filter(lambda x: x != (' ', 'SP'), pos_tagged)  # drop spaces
        unzipped_pos_tagged = zip(*pos_tagged)
        morphs, morph_pos_tags = list(unzipped_pos_tagged)
        #print(morphs, morph_pos_tags)

        converter = self.reader.converter
        converted_tokens = np.array([converter.convert(token) for token in morphs])
        #print("0", converted_tokens)

        answer = self.nn.tag_sentence(converted_tokens)
        tags = [self.itd[tag] for tag in answer]  # convert tag indices to tag strings

        #print("1", morphs, tags)

        return zip(morphs, tags)

class NERTagger(Tagger):
    """A NERTagger loads the models and performs NER tagging on text."""

    def _load_data(self):
        """Loads data for NER"""
        md_ner = Metadata.load_from_file('ner', self.paths)
        self.nn = load_network(md_ner)
        self.reader = create_reader(md_ner)
        self.reader.create_converter()
        self.itd = self.reader.get_inverse_tag_dictionary()
        #self.morph_lexicon = self.reader.morph_lexicon # user lexicon
        #self.co_lexicon = self.reader.co_lexicon
        #self.prob_dict = self.reader.prob_dict
        self.pos_tagger = POSTagger(self.data_dir)

    def tag(self, text, use_sent_tokenizer=True, mode='standard'):
        """
        Tags the given text.

        :param text: a string or unicode object. Strings assumed to be utf-8
        :returns: a list of lists (sentences with tokens).
            Each sentence has (token, tag) tuples.
        """
        tokens = utils.tokenize(text)
        result = []
        for sent in tokens:
            tagged = self.tag_sentence(sent)
            result.append(tagged)

        return result

    def tag_sentence(self, tokens):
        """
        Tags a given list of tokens.

        Tokens should be produced with the espresso tokenizer in order to
        match the entries in the vocabulary. If you have non-tokenized text,
        use NERTagger.tag(text).

        :param tokens: a list of strings
        :returns: a list of (morph, tag) pairs
        """
        pos_tagged = self.pos_tagger.tag_tokens(tokens)

        pos_tagged = filter(lambda x: x != (' ', 'SP'), pos_tagged)  # drop spaces
        unzipped_pos_tagged = zip(*pos_tagged)
        morphs, morph_pos_tags = list(unzipped_pos_tagged)
        #print(morphs, morph_pos_tags)

        converter = self.reader.converter
        converted_tokens = np.array([converter.convert(token) for token in morphs])
        #print("0", converted_tokens)

        answer = self.nn.tag_sentence(converted_tokens)
        tags = [self.itd[tag] for tag in answer]  # convert tag indices to tag strings

        #print("1", morphs, tags)

        return zip(morphs, tags)
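
# WSDTagger and NERTagger share the same two-step flow: POS-tag the syllables
# into morphemes, then classify each morpheme with the task network. A usage
# sketch (illustrative; requires the NER and POS models):
#
#   ner = NERTagger()
#   for sent in ner.tag('개체명을 찾을 문장'):
#       print(list(sent))   # [(morph, ne_tag), ...]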

class POSTagger(Tagger):
    """A POSTagger loads the models and performs POS tagging on text."""

    def _load_data(self):
        """Loads data for POS"""
        md = Metadata.load_from_file('pos', self.paths)
        self.nn = load_network(md)
        self.reader = create_reader(md)
        self.reader.create_converter()
        self.itd = self.reader.get_inverse_tag_dictionary()
        self.morph_lexicon = self.reader.morph_lexicon # user lexicon
        self.co_lexicon = self.reader.co_lexicon
        self.prob_dict = self.reader.prob_dict

    def tag(self, text, use_sent_tokenizer=True, mode="standard"):
        """
        Tags the given text.

        :param text: a string or unicode object. Strings assumed to be utf-8
        :param mode: one of [standard, eumjeol, eojeol]. "eumjeol" does not
            lemmatize, "eojeol" includes NN+XV
        :returns: a list of lists (sentences with tokens).
            Each sentence has (token, tag) tuples.
        """
        tokens = utils.tokenize(text, use_sent_tokenizer)  # split into sentences, then syllables
        result = []
        for sent in tokens:
            tagged = self.tag_tokens(sent, mode)
            result.append(tagged)

        return result

    def tag_tokens(self, tokens, mode="standard"):
        """
        Tags a given list of tokens.

        Tokens should be produced with the espresso tokenizer in order to
        match the entries in the vocabulary. If you have non-tokenized text,
        use POSTagger.tag(text).

        :param tokens: a list of strings
        :param mode: one of [standard, eumjeol, eojeol]. "eumjeol" does not
            lemmatize, "eojeol" includes NN+XV
        :returns: a list of (token, tag) pairs, or (eojeols, eojeol_features)
            in "eojeol" mode
        """
        converter = self.reader.converter  # the feature converter for this task
        converted_tokens = np.array([converter.convert('*space*') if token == ' ' else converter.convert(token)
                                     for token in tokens])
        #print("0", converted_tokens)

        answer = self.nn.tag_sentence(converted_tokens)
        tags = [self.itd[tag] for tag in answer]  # convert tag indices to tag strings

        if mode == 'eojeol':
            eojeols, eojeol_features = self.get_eojeol_tokens(tokens, tags, mode)
            return eojeols, eojeol_features
        else:
            morphs, morph_tags = self.get_morph_tokens(tokens, tags, mode)
            return zip(morphs, morph_tags)
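
    # Mode summary (an illustrative sketch; output depends on the trained
    # model):
    #
    #   tagger = POSTagger()
    #   syls = list('감동을 받다')              # syllable tokens, as utils.tokenize produces
    #   list(tagger.tag_tokens(syls))           # 'standard': [(morph, tag), ...]
    #   tagger.tag_tokens(syls, mode='eojeol')  # (eojeols, head/tail features)
    #
    # 'standard' merges syllables into lemmatized morphemes, 'eumjeol' keeps
    # syllable-level tokens, and 'eojeol' additionally returns the head/tail
    # morpheme features used by the dependency parser.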

    def _get_morph_tokens(self, tokens, tags):
        """
        Splits the tagged syllable stream into morphemes (adjacent syllables
        with the same tag are merged).

        :param tokens: a list of strings
        :param tags: a list of tags of each string
        :return: a list of (morph, tag)
        """
        #print(utils.get_word(self.morph_lexicon, tokens, tags, True))
        # look up the pre-analyzed lexicon
        tokens, tags = utils.get_word(self.morph_lexicon, tokens, tags, True)
        #print(tokens)
        #print(tags)
        morphs = [''.join(tokens[0]) if isinstance(tokens[0], list) else tokens[0]]
        morph_tags = [(lambda x: 'MA' if x == 'MS' else x)\
                      ((lambda x: 'NN' if x == 'NS' else x)(tags[0]))]
        for idx in range(1, len(tokens)):
            if (tags[idx-1] == 'NS' and tags[idx] == 'NN') \
               or (tags[idx-1] == 'MS' and tags[idx] == 'MA'):
                morphs.append(morphs.pop()+(''.join(tokens[idx]) if isinstance(tokens[idx], list) else tokens[idx]))
            elif tags[idx-1] != tags[idx] or tags[idx] == 'SY':
                morphs.append(''.join(tokens[idx]) if isinstance(tokens[idx], list) else tokens[idx])
                morph_tags.append((lambda x: 'MA' if x == 'MS' else x)\
                                  ((lambda x: 'NN' if x == 'NS' else x)(tags[idx])))
            else:
                morphs.append(morphs.pop()+(''.join(tokens[idx]) if isinstance(tokens[idx], list) else tokens[idx]))

        return morphs, morph_tags

    def get_eumjeol_tokens(self, tokens, tags):
        """
        Produces syllable (eumjeol) level tokens.
        A 'CO' token is attached to the preceding morpheme and takes its tag:
        새로운 -> 새/VB + 로운/CO -> 새로운/VB

        :param tokens: a list of strings
        :param tags: a list of tags of each string
        :return: a list of (eumjeol, tag)
        """
        eumjeol = []
        eumjeol_tags = []
        #print(tokens)
        #print(tags)
        for idx in range(0, len(tokens)):
            if idx > 0 and (tags[idx] == 'CO' and \
                    tags[idx-1] != 'SP' and tags[idx-1][1] != 'N'):
                eumjeol.append(eumjeol.pop()+(''.join(tokens[idx]) if isinstance(tokens[idx], list) else tokens[idx]))
            elif idx > 0 and (tags[idx] == 'CO' and \
                    tags[idx-1] != 'SP' and tags[idx-1][1] == 'N'):
                eumjeol.append(tokens[idx])
                eumjeol_tags.append('XV')
            elif tags[idx] == 'CO':
                eumjeol.append(tokens[idx])
                eumjeol_tags.append('VB')
            else:
                eumjeol.append(tokens[idx])
                eumjeol_tags.append(tags[idx])
        #print(eumjeol)
        #print(eumjeol_tags)

        return eumjeol, eumjeol_tags

    def get_eojeol(self, tokens, tags):
        """
        Joins syllable tokens into eojeols (space-delimited words).

        :param tokens: a list of strings
        :param tags: a list of tags of each string
        :return: a list of eojeol strings
        """
        eojeols = []
        eumjeol = []
        #print(tokens)
        for t in tokens:
            if t == ' ':
                eojeols.append(''.join(eumjeol))
                eumjeol = []
            else:
                eumjeol.append(t)
        eojeols.append(''.join(eumjeol))
        #print(eojeols)

        return eojeols

    def get_morph_tokens(self, tokens, tags, mode="standard"):
        """
        Combines eumjeols (syllables) into morphemes.

        :param tokens: eumjeol token list
        :param tags: pos tag list of each token
        """
        _morphs, _morph_tags = self._get_morph_tokens(tokens, tags)
        #print('2---', _morphs, _morph_tags) # before lemma restoration

        if mode == 'eumjeol':
            eumjeols, eumjeol_tags = self.get_eumjeol_tokens(_morphs, _morph_tags)
            return eumjeols, eumjeol_tags

        # handle 'CO' tags and contracted forms
        morphs, morph_tags = self.handling_abbrs(_morphs, _morph_tags)
        #print("3", morphs, morph_tags) # after lemma restoration

        if mode == 'eojeol':
            eojeols = self.get_eojeol(_morphs, _morph_tags)
            return eojeols, morphs, morph_tags
        return morphs, morph_tags

    def handling_abbrs(self, _morphs, _tags):
        '''
        Handles 'CO' tags and contracted forms.
        '''
        morphs = []
        morph_tags = []
        #print(_morphs, _tags, flush=True)

        for i, t in enumerate(_tags):
            if t == 'CO':
                prev_morph = _morphs[i-1] if i > 0 else 'BOS'
                prev_tag = _tags[i-1] if i > 0 else 'BOS'
                next_morph = _morphs[i+1] if i < len(_tags)-1 else 'EOS'
                next_tag = _tags[i+1] if i < len(_tags)-1 else 'EOS'
                if _tags[i-1] in ['MM']:
                    morph_tags[-1] = 'NN'; _tags[i-1] = 'NN'; prev_tag = 'NN' # to be removed later
                morphs, morph_tags = self.handling_co_tags(morphs, morph_tags, \
                        prev_morph, prev_tag, _morphs[i], _tags[i], next_morph, next_tag)
            elif i > 0:
                morphs, morph_tags = self.handling_others(morphs, morph_tags, _morphs[i], _tags[i])
            else:
                if _morphs[i] == ' ': t = 'SP'
                morphs.append(_morphs[i])
                morph_tags.append(t)
        return morphs, morph_tags

    def handling_others(self, morphs, morph_tags, morph, tag):
        '''
        Handles the remaining morphemes. This covers
        1. contractions of 'ㄴ, ㄹ, ㅁ, ㅂ, ㅆ'
        2. eojeols such as '가수다' (noun + copula).
        '''
        #print(morphs, morph_tags, morph, tag)
        try:
            if morph_tags[-1] == tag: # merge with the already restored morpheme
                morphs.append(morphs.pop()+morph) # 미룬다 -> (미루 + ㄴ) + 다 -> 미루 + ㄴ다
            elif morph_tags[-1] == 'NN' and tag == 'EE': # '가수다': insert the copula
                morphs.append('이')
                morph_tags.append('VB')
                morphs.append(morph)
                morph_tags.append(tag)
            elif morph_tags[-1] == 'MM' and tag == 'XV': # to be removed; handles 'MM'+'XV'
                morph_tags[-1] = 'NN'
                morphs.append(morph)
                morph_tags.append(tag)
            else:
                morphs.append(morph)
                morph_tags.append(tag)
            #print('9>', morphs, morph_tags)
        except IndexError: # morphs is still empty
            print('>>>', morphs, morph, tag)
        return morphs, morph_tags

    def handling_co_tags(self, morphs, morph_tags, m_1, t_1, m, t, m__1, t__1):
        """
        Handles a 'CO' tag: expands the contracted morpheme using the
        candidates in the CO lexicon.
        """
        #print(morphs, morph_tags, m, t)
        #------------------------------------------------------------
        def get_best_path(l):
            max_p = -1000; max_list = []; max_same_morph = 10
            for idx, x in enumerate(l):
                same_morph = 0
                _m_t_ = etc.parse_morph(x) # '가/VB+ㄴ/EE' -> [(가, VB), (ㄴ, EE)]
                #print(morphs, morph_tags, m_1, t_1, _m_t_)
                # previous morpheme
                if (t_1 == _m_t_[0][1]) or (t_1 in ['JJ']):
                    same_morph = -1 if len(morph_tags) > 1 else 0
                    first_word = _m_t_[0][0]
                    #print(same_morph, morph_tags[same_morph])
                    while morph_tags[same_morph] == _m_t_[0][1] or morph_tags[same_morph] in ['JJ']:
                        first_word = morphs[same_morph] + first_word
                        same_morph -= 1
                        if (len(morph_tags)+same_morph) < 0 or len(morph_tags) == 1: break
                    prev_word = (morphs[same_morph]+'/'+morph_tags[same_morph]) if (len(morph_tags)+same_morph) >= 0 else 'BOS'
                    prev_tag = morph_tags[same_morph] if (len(morph_tags)+same_morph) >= 0 else 'BOS'
                    first_word = first_word+'/'+_m_t_[0][1]
                else:
                    prev_word = m_1+'/'+t_1 # for the Viterbi-style scoring
                    prev_tag = t_1
                    first_word = (_m_t_[0][0]+'/'+_m_t_[0][1])
                first_tag = _m_t_[0][1]
                last_word = _m_t_[-1][0]+'/'+_m_t_[-1][1]
                last_tag = _m_t_[-1][1]

                p = (self.prob_dict[prev_word] if prev_word in self.prob_dict else -100)
                #print(p)
                p += (self.prob_dict[prev_tag + '' + first_tag] if prev_tag + '' + first_tag in self.prob_dict else -100) \
                     + (self.prob_dict[first_word] if first_word in self.prob_dict else -100) \
                     + (self.prob_dict[last_word] if last_word in self.prob_dict else -100) \
                     + (self.prob_dict[last_tag + '' + t__1] if last_tag + '' + t__1 in self.prob_dict else -100)
                #print(p)
                if '/' in first_word:
                    first_word = first_word.split('/', 1)[0]
                if p > max_p:
                    max_p = p
                    max_same_morph = same_morph
                    max_list = []
                    for i, (m, t) in enumerate(_m_t_):
                        m = first_word if i == 0 else m
                        t = first_tag if i == 0 else t
                        max_list.append((m, t))
                    #max_list = _m_t_
            #print(max_same_morph, max_list)
            return max_list, max_same_morph+1

        # ---------------------------------------------------------
        try:
            l = self.co_lexicon[m].split('|')
        except KeyError: # not in the CO lexicon
            morphs.append(m)
            morph_tags.append('NN')
            return morphs, morph_tags

        if len(l) == 1: # only one candidate
            _m_t_ = etc.parse_morph(l[0])
            for _m_, _t_ in _m_t_:
                if len(morph_tags) > 1 and morph_tags[-1] == _t_:
                    morphs.append(morphs.pop()+_m_)
                else:
                    morphs.append(_m_)
                    morph_tags.append(_t_)
            return morphs, morph_tags

        # ------- two or more candidates in the lexicon ----------
        max_list, overlap_idx = get_best_path(l)
        co_morphs = [m for (m, t) in max_list]
        co_morph_tags = [t for (m, t) in max_list]
        #print(':::', overlap_idx, morphs, co_morphs)
        if overlap_idx <= 0: # handling of overlapping morphemes
            morphs = morphs[:overlap_idx] + co_morphs
            morph_tags = morph_tags[:overlap_idx] + co_morph_tags
        else:
            morphs = morphs + co_morphs
            morph_tags = morph_tags + co_morph_tags
        return morphs, morph_tags
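
    # How get_best_path() scores a candidate expansion (a description of the
    # logic above, not a formula from the package docs): each candidate from
    # co_lexicon is a '+'-joined morpheme string such as '가/VB+ㄴ/EE'. Its
    # score is a sum of log-probability-style values from prob_dict for
    # prev_word, prev_tag+first_tag, first_word, last_word and
    # last_tag+next_tag, with -100 substituted for unseen events. The
    # best-scoring candidate replaces the 'CO' morpheme, possibly merging
    # backwards over preceding morphemes that share the first tag.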

    def get_eojeol_tokens(self, tokens, tags, mode="eojeol"):
        """
        Reconnects morphemes separated by restoration and joins XV morphemes
        for the dependency parser, e.g. 사랑+하 -> 사랑하.

        :param tokens: syllables
        :param tags: POS tags
        :return: (eojeols, eojeol_features)
        """
        eojeols, morphs, morph_tags = self.get_morph_tokens(tokens, tags, mode)
        eojeol_features = []
        #print(morphs, morph_tags)
        head_m = ''; head_t = ''; tail_m = ''; tail_t = ''
        for i in range(len(morphs)):
            t = morph_tags[i]
            #print(i, morphs[i], len(morphs), t)
            #if ((i==0 or morphs[i-1] == ' ') and (i == len(morphs)-1 or morphs[i+1] == ' ')):
            #    # an eojeol made up of symbols only
            #    head_m = morphs[i]
            #    head_t = morph_tags[i]
            #    tail_m = morphs[i]
            #    tail_t = morph_tags[i]
            #    eojeol_features.append((head_m, head_t, tail_m, tail_t))
            #    print("1::::", head_m, head_t, tail_m, tail_t)
            #    continue

            # end of an eojeol
            if t == 'SP':
                ## tail feature of the previous eojeol
                tail_m = morphs[i-2] if (morph_tags[i-1] == 'SY' and morphs[i-1] != ',' and morphs[i-2] != ' ') else morphs[i-1]
                tail_t = morph_tags[i-2] if (morph_tags[i-1] == 'SY' and morphs[i-1] != ',' and morphs[i-2] != ' ') else morph_tags[i-1]
                tail_t = 'EE' if tail_t in ['XV', 'VB'] else tail_t # irregular sentence ending
                eojeol_features.append((head_m, head_t, tail_m, tail_t))
                #print("2::::", head_m, head_t, tail_m, tail_t)
                continue

            if i == len(morphs)-1:
                ## tail feature of the last eojeol
                tail_m = morphs[i-1] if (morph_tags[i] == 'SY' and morphs[i] != ',' and morphs[i-1] != ' ') else morphs[i]
                tail_t = morph_tags[i-1] if (morph_tags[i] == 'SY' and morphs[i] != ',' and morphs[i-1] != ' ') else morph_tags[i]
                tail_t = 'EE' if tail_t in ['XV', 'VB'] else tail_t # irregular sentence ending
                eojeol_features.append((head_m, head_t, tail_m, tail_t))
                #print("3::::", head_m, head_t, tail_m, tail_t)
                continue

            # start of an eojeol
            if i == 0 or morphs[i-1] == ' ':
                head_m = morphs[i+1] if (morph_tags[i] == 'SY' and morph_tags[i+1] != 'SP') else morphs[i]
                head_t = morph_tags[i+1] if (morph_tags[i] == 'SY' and morph_tags[i+1] != 'SP') else morph_tags[i]
                idx = 2 if ((morph_tags[i] == 'SY' and morph_tags[i+1] != 'SP') and i < len(morphs)-2) else 1
                head_t += morph_tags[i+idx] if morph_tags[i+idx] in ['XV', 'VB'] else ''
                #print("4:::", i, idx, morph_tags, head_m, head_t, tail_m, tail_t)

        #print(eojeols, eojeol_features)
        return eojeols, eojeol_features