nltkor-1.2.14-cp311-cp311-macosx_13_0_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. nltkor/Kor_char.py +193 -0
  2. nltkor/__init__.py +16 -0
  3. nltkor/alignment/__init__.py +1315 -0
  4. nltkor/cider/__init__.py +2 -0
  5. nltkor/cider/cider.py +55 -0
  6. nltkor/cider/cider_scorer.py +207 -0
  7. nltkor/distance/__init__.py +441 -0
  8. nltkor/distance/wasserstein.py +126 -0
  9. nltkor/etc.py +22 -0
  10. nltkor/lazyimport.py +144 -0
  11. nltkor/make_requirement.py +11 -0
  12. nltkor/metrics/__init__.py +63 -0
  13. nltkor/metrics/bartscore.py +301 -0
  14. nltkor/metrics/bertscore.py +331 -0
  15. nltkor/metrics/bleu_tensor.py +20 -0
  16. nltkor/metrics/classical.py +847 -0
  17. nltkor/metrics/entment.py +24 -0
  18. nltkor/metrics/eval.py +517 -0
  19. nltkor/metrics/mauve.py +273 -0
  20. nltkor/metrics/mauve_utils.py +131 -0
  21. nltkor/misc/__init__.py +11 -0
  22. nltkor/misc/string2string_basic_functions.py +59 -0
  23. nltkor/misc/string2string_default_tokenizer.py +83 -0
  24. nltkor/misc/string2string_hash_functions.py +159 -0
  25. nltkor/misc/string2string_word_embeddings.py +503 -0
  26. nltkor/search/__init__.py +10 -0
  27. nltkor/search/classical.py +569 -0
  28. nltkor/search/faiss_search.py +787 -0
  29. nltkor/search/kobert_tokenizer.py +181 -0
  30. nltkor/sejong/__init__.py +3 -0
  31. nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
  32. nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
  33. nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
  34. nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
  35. nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
  36. nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
  37. nltkor/sejong/ch.py +12 -0
  38. nltkor/sejong/dict_semClassNum.txt +491 -0
  39. nltkor/sejong/layer.txt +630 -0
  40. nltkor/sejong/sejong_download.py +87 -0
  41. nltkor/sejong/ssem.py +684 -0
  42. nltkor/similarity/__init__.py +3 -0
  43. nltkor/similarity/bartscore____.py +337 -0
  44. nltkor/similarity/bertscore____.py +339 -0
  45. nltkor/similarity/classical.py +245 -0
  46. nltkor/similarity/cosine_similarity.py +175 -0
  47. nltkor/tag/__init__.py +71 -0
  48. nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
  49. nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
  50. nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
  51. nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
  52. nltkor/tag/espresso_tag.py +220 -0
  53. nltkor/tag/libs/__init__.py +10 -0
  54. nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
  55. nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
  56. nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
  57. nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
  58. nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
  59. nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
  60. nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
  61. nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
  62. nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
  63. nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
  64. nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
  65. nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
  66. nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
  67. nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
  68. nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
  69. nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
  70. nltkor/tag/libs/arguments.py +280 -0
  71. nltkor/tag/libs/attributes.py +231 -0
  72. nltkor/tag/libs/config.py +159 -0
  73. nltkor/tag/libs/metadata.py +129 -0
  74. nltkor/tag/libs/ner/__init__.py +2 -0
  75. nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
  76. nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
  77. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
  78. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
  79. nltkor/tag/libs/ner/macmorphoreader.py +7 -0
  80. nltkor/tag/libs/ner/ner_reader.py +92 -0
  81. nltkor/tag/libs/network.c +72325 -0
  82. nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
  83. nltkor/tag/libs/network.pyx +878 -0
  84. nltkor/tag/libs/networkconv.pyx +1028 -0
  85. nltkor/tag/libs/networkdependencyconv.pyx +451 -0
  86. nltkor/tag/libs/parse/__init__.py +1 -0
  87. nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
  88. nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
  89. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
  90. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
  91. nltkor/tag/libs/parse/parse_reader.py +283 -0
  92. nltkor/tag/libs/pos/__init__.py +2 -0
  93. nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
  94. nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
  95. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
  96. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
  97. nltkor/tag/libs/pos/macmorphoreader.py +7 -0
  98. nltkor/tag/libs/pos/pos_reader.py +97 -0
  99. nltkor/tag/libs/reader.py +485 -0
  100. nltkor/tag/libs/srl/__init__.py +3 -0
  101. nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
  102. nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
  103. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
  104. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
  105. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
  106. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
  107. nltkor/tag/libs/srl/__srl_reader_.py +535 -0
  108. nltkor/tag/libs/srl/srl_reader.py +436 -0
  109. nltkor/tag/libs/srl/train_srl.py +87 -0
  110. nltkor/tag/libs/taggers.py +926 -0
  111. nltkor/tag/libs/utils.py +384 -0
  112. nltkor/tag/libs/word_dictionary.py +239 -0
  113. nltkor/tag/libs/wsd/__init__.py +2 -0
  114. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
  115. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
  116. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
  117. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
  118. nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
  119. nltkor/tag/libs/wsd/wsd_reader.py +93 -0
  120. nltkor/tokenize/__init__.py +62 -0
  121. nltkor/tokenize/ko_tokenize.py +115 -0
  122. nltkor/trans.py +121 -0
  123. nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
  124. nltkor-1.2.14.dist-info/METADATA +41 -0
  125. nltkor-1.2.14.dist-info/RECORD +127 -0
  126. nltkor-1.2.14.dist-info/WHEEL +5 -0
  127. nltkor-1.2.14.dist-info/top_level.txt +1 -0
@@ -0,0 +1,926 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ """
4
+ Taggers wrapping the neural networks.
5
+ """
6
+
7
+ import logging
8
+ #from os import major
9
+ import numpy as np
10
+ import re
11
+ from nltkor import etc
12
+
13
+ from . import utils
14
+ from . import config
15
+ from . import attributes
16
+ from .metadata import Metadata
17
+ from .pos import POSReader
18
+ from .ner import NERReader
19
+ from .wsd import WSDReader
20
+ from .srl import SRLReader
21
+ from .parse import DependencyReader
22
+ import sys
23
+ sys.path.append("libs/")
24
+ from .network import Network, ConvolutionalNetwork, ConvolutionalDependencyNetwork
25
+
26
+
27
+ def load_network(md):
28
+ """
29
+ Loads the network from the default file and returns it.
30
+ """
31
+ logger = logging.getLogger("Logger")
32
+ is_srl = md.task == 'srl'
33
+
34
+ logger.info('Loading network')
35
+ if is_srl :
36
+ net_class = ConvolutionalNetwork
37
+ elif md.task.endswith('dependency'):
38
+ net_class = ConvolutionalDependencyNetwork
39
+ else:
40
+ net_class = Network
41
+
42
+ nn = net_class.load_from_file(md.paths[md.network])
43
+
44
+ logger.info('Done')
45
+ return nn
46
+
47
+
48
+ def create_reader(md, gold_file=None):
49
+ """
50
+ Creates a TextReader object for the given task and loads its dictionary.
51
+ :param md: a metadata object describing the task
52
+ :param gold_file: path to a file with gold standard data, if
53
+ the reader will be used for testing.
54
+ """
55
+ logger = logging.getLogger('Logger')
56
+ logger.info('Loading text reader...')
57
+
58
+ if md.task == 'pos':
59
+ tr = POSReader(md, filename=gold_file)
60
+
61
+ elif md.task == 'ner':
62
+ tr = NERReader(md, filename=gold_file)
63
+
64
+ elif md.task == 'wsd':
65
+ tr = WSDReader(md, filename=gold_file)
66
+
67
+ elif 'dependency' in md.task:
68
+ labeled = md.task.startswith('labeled')
69
+ tr = DependencyReader(md, filename=gold_file, labeled=labeled)
70
+
71
+ elif md.task.startswith('srl'):
72
+ tr = SRLReader(md, filename=gold_file)
73
+
74
+ else:
75
+ raise ValueError("Unknown task: %s" % md.task)
76
+
77
+ logger.info('Done')
78
+ return tr
79
+
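The two helpers above are what each tagger's _load_data combines. A minimal sketch of that pairing, not part of the module itself (it mirrors what POSTagger._load_data does further below; the 'pos' task name and config.FILES paths are just one possible choice):

    md = Metadata.load_from_file('pos', config.FILES)  # metadata for a supported task
    nn = load_network(md)          # network class is chosen from md.task
    reader = create_reader(md)     # matching TextReader for the task
    reader.create_converter()      # build the token-to-feature converter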
80
+ def _group_arguments(tokens, predicate_positions, arg_tokens, labels):
81
+ """
82
+ Groups the words of each argument and returns a (predicate, {label: tokens}) tuple for each predicate.
83
+ """
84
+ print(tokens, predicate_positions, arg_tokens, labels)
85
+ arg_structs = []
86
+
87
+ for predicate_position, pred_arg_tokens, pred_labels in zip(predicate_positions,
88
+ arg_tokens,
89
+ labels):
90
+ structure = {}
91
+
92
+ for tag, arg_token in zip(pred_labels, pred_arg_tokens):
93
+ #argument_tokens = [token]
94
+ #tag = pred_labels.pop(0)
95
+ structure[tag] = [arg_token]
96
+
97
+ predicate = tokens[predicate_position-1]
98
+ arg_structs.append((predicate, structure))
99
+
100
+ return arg_structs
101
+
102
+
103
+ class SRLAnnotatedSentence(object):
104
+ """
105
+ Class storing a sentence with annotated semantic roles.
106
+
107
+ It stores a list with the sentence tokens, called `tokens`, and a list of tuples
108
+ in the format `(predicate, arg_structures)`. Each `arg_structure` is a dict mapping
109
+ semantic roles to the words that constitute it. This is used instead of a two-level
110
+ dictionary because one sentence may have more than one occurrence of the same
111
+ predicate.
112
+
113
+ This class is used only for storing data.
114
+ """
115
+
116
+ def __init__(self, tokens, arg_structures):
117
+ """
118
+ Creates an instance of a sentence with SRL data.
119
+
120
+ :param tokens: a list of strings
121
+ :param arg_structures: a list of tuples in the format (predicate, mapping).
122
+ Each predicate is a string and each mapping is a dictionary mapping role labels
123
+ to the words that constitute it.
124
+ """
125
+ self.tokens = tokens
126
+ self.arg_structures = arg_structures
127
+
128
+ class ParsedSentence(object):
129
+ """
130
+ Class for storing a sentence with dependency parsing annotation.
131
+
132
+ It stores a list of tokens, the dependency heads, dependency labels and POS tags
133
+ if the parser used them. Each dependency head is the index of that
134
+ token's head, and -1 means a dependency to the root.
135
+ """
136
+ def __init__(self, tokens, heads, labels, pos=None):
137
+ """
138
+ Constructor.
139
+
140
+ :param tokens: list of strings
141
+ :param heads: list of integers (-1 means dependency to root, others are token indices)
142
+ :param labels: list of strings
143
+ :param pos: None or list of strings
144
+ """
145
+ self.tokens = tokens
146
+ self.heads = heads
147
+ self.labels = labels
148
+ self.pos = pos
149
+
150
+ def __len__(self):
151
+ return len(self.tokens)
152
+
153
+ def to_conll_list(self):
154
+ """
155
+ Return a list representation of the sentence in CoNLL X format.
156
+
157
+ Returns four parallel lists:
158
+ tokens, POS tags, dependency labels, and head indices.
159
+
160
+ Head indices start from 1; the root is referred to as 0.
161
+ POS is only available if the original parser used it.
162
+ """
163
+ tokenL = []
164
+ headL = []
165
+ labelL = []
166
+ posL = []
167
+ for i in range(len(self.tokens)):
168
+ tokenL.append(self.tokens[i])
169
+ headL.append(self.heads[i] + 1)
170
+ labelL.append(self.labels[i])
171
+ posL.append(self.pos[i])
172
+
173
+ return tokenL, posL, labelL, headL
174
+
175
+ def to_conll(self):
176
+ """
177
+ Return a string representation of the sentence in CoNLL X format.
178
+
179
+ Each line has:
180
+ [number starting from 1] token head label
181
+
182
+ Token numbers start from 1; the root is referred to as 0.
183
+ POS is only available if the original parser used it.
184
+ """
185
+ result = []
186
+ for i in range(len(self.tokens)):
187
+ token = self.tokens[i]
188
+ head = self.heads[i] + 1
189
+ label = self.labels[i]
190
+ pos = self.pos[i] if self.pos else '_'
191
+
192
+ #line = u'{id}\t{token}\t_\t{pos}\t{pos}\t_\t{head}\t{label}'
193
+ #result.append(line.format(id=i+1, pos=pos, head=head, label=label, token=token))
194
+ line = u'{id}\t{token}\t{head}\t{label}'
195
+ result.append(line.format(id=i+1, head=head, label=label, token=token))
196
+
197
+ return '\n'.join(result)
198
+
199
+
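A small illustration of the container above, not from the package source; the tokens, heads, and labels are made up, and the label names in practice depend on the trained model:

    sent = ParsedSentence(tokens=['나는', '학교에', '간다'],
                          heads=[2, 2, -1],      # -1 marks the root
                          labels=['NP_SBJ', 'NP_AJT', 'VP'])
    # one 'id<TAB>token<TAB>head<TAB>label' line per token; heads become 1-based, root prints as 0
    print(sent.to_conll())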
200
+ class Tagger(object):
201
+ """
202
+ Base class for taggers. It should not be instantiated.
203
+ """
204
+ def __init__(self, data_dir=None):
205
+ """Creates a tagger and loads data preemptively"""
206
+ asrt_msg = "espresso data directory is not set. \
207
+ If you don't have the trained models, download them from http://air.cwnu.ac.kr/espresso/models.html"
208
+ if data_dir is None:
209
+ assert config.data_dir is not None, asrt_msg
210
+ self.paths = config.FILES
211
+ else:
212
+ self.paths = config.get_config_paths(data_dir)
213
+
214
+ self.data_dir = data_dir
215
+ self._load_data()
216
+
217
+ def _load_data(self):
218
+ """Implemented by subclasses"""
219
+ pass
220
+
221
+
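The concrete taggers below are all constructed through this base class. A brief sketch of the two ways the model paths get resolved ('/path/to/espresso-data' is a placeholder; the models are downloaded separately, see the URL in the assertion message above):

    tagger = POSTagger(data_dir='/path/to/espresso-data')  # uses config.get_config_paths(data_dir)
    tagger = POSTagger()   # alternatively: relies on config.data_dir / config.FILES being set already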
222
+ class SRLTagger(Tagger):
223
+ """
224
+ An SRLTagger loads the models and performs SRL on text.
225
+
226
+ It works in three stages: predicate identification, argument detection and
227
+ argument classification.
228
+ """
229
+
230
+ def _load_data(self):
231
+ """Loads data for SRL"""
232
+ md_srl = Metadata.load_from_file('srl', self.paths)
233
+ self.nn = load_network(md_srl)
234
+ self.reader = create_reader(md_srl)
235
+ self.reader.create_converter()
236
+ self.itd = self.reader.get_inverse_tag_dictionary()
237
+
238
+ self.parser = DependencyParser(self.data_dir)
239
+
240
+
241
+ def find_predicates(self, tokens):
242
+ """
243
+ Finds out which tokens are predicates.
244
+
245
+ :param tokens: a list of per-token dependency relation labels
246
+ :returns: the 1-based positions of predicate tokens
247
+ """
248
+ answer = []
249
+ for i, token in enumerate(tokens):
250
+ if token[0] == 'V' and tokens[i-1][0] != 'V': answer.append(i+1)
251
+ return np.array(answer)
252
+
253
+ def find_arguments(self, token_obj, predL, headL, relL):
254
+ """
255
+ Finds the arguments of each predicate from the dependency heads and relations.
256
+
257
+ :param token_obj: the sentence tokens; predL, headL, relL are the predicate positions, heads and relation labels
258
+ :returns: for each predicate, its argument tokens and their index boundaries
259
+ """
260
+ answer_token = []; answer = []
261
+ for p in predL:
262
+ pred_arg_token = []; pred_arg = []
263
+ for j, h in enumerate(headL):
264
+ if p == h and relL[j][0] == 'N':
265
+ pred_arg_token.append(token_obj[j])
266
+ pred_arg.append(np.array([j, j]))
267
+
268
+ #TODO
269
+ # the head of the predicate
270
+ #if headL[p-1] != 0: # exclude the last one
271
+ # pred_arg_token.append(token_obj[headL[p-1]-1])
272
+ # pred_arg.append(np.array([headL[p-1]-1, headL[p-1]]))
273
+
274
+ answer_token.append(pred_arg_token)
275
+ answer.append(pred_arg)
276
+ #print(answer_token)
277
+ #print(answer)
278
+ return answer_token, answer
279
+
280
+ def tag(self, text, use_sent_tokenizer=True, mode='standard'):
281
+ """
282
+ Runs the SRL process on the given text.
283
+
284
+ :param text: unicode or str encoded in utf-8.
285
+ :param use_sent_tokenizer: accepted for interface consistency; not used here
286
+ :returns: a list of SRLAnnotatedSentence objects
287
+ """
288
+ tokens = utils.tokenize(text)
289
+ result = []
290
+ for sent in tokens:
291
+ tagged = self.tag_sentence(sent)
292
+ result.append(tagged)
293
+
294
+ return result
295
+
296
+ def tag_sentence(self, tokens, no_repeats=False):
297
+ """
298
+ Runs the SRL process on the given tokens.
299
+
300
+ :param tokens: a list of tokens (as strings)
301
+ :param no_repeats: whether to prevent repeated argument labels
302
+ :returns: an SRLAnnotatedSentence whose arg_structures are tuples
303
+ (predicate, arg_structure), where arg_structure is a dictionary
304
+ mapping argument labels to the words they include.
305
+ """
306
+ # dependency parsing
307
+ parsed = self.parser.parse_sentence(tokens)
308
+ wordL, posL, relL, headL = parsed.to_conll_list()
309
+ tokens_obj = []
310
+ for w, p, r in zip(wordL, posL, relL):
311
+ hm, hp, tm, tp = p
312
+ token = attributes.Token(w, hm, hp, tm, tp, r)
313
+ tokens_obj.append(token)
314
+
315
+ converted_class = np.array([self.reader.converter.convert(t)
316
+ for t in tokens_obj])
317
+ pred_positions = self.find_predicates(relL)
318
+
319
+ arg_tokens, arg_limits = self.find_arguments(wordL, pred_positions, headL, relL)
320
+ print(arg_tokens)
321
+ print(pred_positions)
322
+ print(arg_limits)
323
+
324
+ # now, argument classification
325
+ answers = self.nn.tag_sentence(converted_class,
326
+ pred_positions, arg_limits,
327
+ allow_repeats=not no_repeats)
328
+ labels = [[self.itd[x] for x in pred_answer]
329
+ for pred_answer in answers]
330
+
331
+ structures = _group_arguments(wordL, pred_positions, arg_tokens, labels)
332
+ return SRLAnnotatedSentence(wordL, structures)
333
+
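A hedged usage sketch for the SRL tagger above; the data path and the input sentence are placeholders, and the role labels in the result depend on the trained model:

    srl = SRLTagger(data_dir='/path/to/espresso-data')
    for annotated in srl.tag('나는 학교에 간다.'):
        for predicate, arg_structure in annotated.arg_structures:
            print(predicate, arg_structure)   # {role_label: [argument tokens], ...}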
334
+ class DependencyParser(Tagger):
335
+ """A Dependency Parser based on a neural network tagger."""
336
+
337
+ def __init__(self, *args, **kwargs):
338
+ """
339
+ Set the data directory for the POS tagger, if one is used,
340
+ and call the parent constructor.
341
+ """
342
+ super(DependencyParser, self).__init__(*args, **kwargs)
343
+
344
+ def _load_data(self):
345
+ """Loads data for Dependency Parsing"""
346
+ md_udep = Metadata.load_from_file('unlabeled_dependency', paths=self.paths)
347
+ self.unlabeled_nn = load_network(md_udep)
348
+ self.unlabeled_reader = create_reader(md_udep)
349
+
350
+ md_ldep = Metadata.load_from_file('labeled_dependency', paths=self.paths)
351
+ self.labeled_nn = load_network(md_ldep)
352
+ self.labeled_reader = create_reader(md_ldep)
353
+ self.itd = self.labeled_reader.get_inverse_tag_dictionary()
354
+
355
+ self.use_pos = md_udep.use_pos or md_ldep.use_pos
356
+ if self.use_pos:
357
+ self.pos_tagger = POSTagger(self.data_dir)
358
+
359
+ def parse(self, text):
360
+ """
361
+ Splits the given text into sentences and determines their
362
+ dependency trees. If you want to provide your own tokenized
363
+ text, use `parse_sentence` instead.
364
+
365
+ :param text: a string
366
+ :returns: a list of ParsedSentence objects
367
+ """
368
+ sentences = utils.tokenize(text)
369
+ result = []
370
+ for sent in sentences:
371
+ parsed = self.parse_sentence(sent)
372
+ result.append(parsed)
373
+
374
+ return result
375
+
376
+ def tag_tokens(self, tokens):
377
+ """
378
+ Parse the given sentence. This function is just an alias for
379
+ `parse_sentence`.
380
+ """
381
+ return self.parse_sentence(tokens)
382
+
383
+ def parse_sentence(self, tokens):
384
+ """
385
+ Parse the given sentence. It must already be tokenized; if you
386
+ want the parser to tokenize the text, use the method `parse` instead.
387
+
388
+ :param tokens: a list of strings (the tokens of one sentence)
389
+ :return: a ParsedSentence instance
390
+ """
391
+ original_tokens = tokens
392
+ udep_tokens_obj = []
393
+ ldep_tokens_obj = []
394
+
395
+ # if the parser uses POS as a feature, have a tagger tag it first
396
+ if self.use_pos:
397
+ eojeols, eojeol_features = self.pos_tagger.tag_tokens(tokens, mode='eojeol')
398
+ #print("**", eojeols)
399
+ #print(eojeol_features)
400
+ #print(tokens, eojeols)
401
+
402
+ for word, feature in zip(eojeols, eojeol_features):
403
+ m_h, t_h, m_t, t_t = feature
404
+ #udep_tokens_obj.append(attributes.Token(word, morph_h=m_h, pos_h=t_h, morph_t=m_t, pos_t=t_t))
405
+ udep_tokens_obj.append(attributes.Token(word, pos_h=t_h, morph_t=m_t, pos_t=t_t))
406
+ ldep_tokens_obj.append(attributes.Token(word, pos_h=t_h, morph_t=m_t, pos_t=t_t))
407
+
408
+ converted_tokens = self.unlabeled_reader.codify_sentence(udep_tokens_obj)
409
+ #print(converted_tokens)
410
+ heads = self.unlabeled_nn.tag_sentence(converted_tokens)
411
+ #print(heads)
412
+
413
+ # the root is returned having a value == len(sentence)
414
+ root = heads.argmax()
415
+ heads[root] = root
416
+
417
+ converted_tokens = self.labeled_reader.codify_sentence(ldep_tokens_obj)
418
+ label_codes = self.labeled_nn.tag_sentence(converted_tokens, heads)
419
+ labels = [self.itd[code] for code in label_codes]
420
+ #print(label_codes)
421
+ #print(labels)
422
+
423
+ # in the final answer, signal the root with -1
424
+ heads[root] = -1
425
+ pos_tags = eojeol_features if self.use_pos else None
426
+ #pos_tags = zip(*tokens)[1] if self.use_pos else None
427
+
428
+ parsed = ParsedSentence(eojeols, heads, labels, pos_tags)
429
+ #parsed = ParsedSentence(original_tokens, heads, labels, pos_tags)
430
+ return parsed
431
+
432
+ def tag(self, text, use_sent_tokenizer=True, mode='eojeol'):
433
+ """
434
+ Parse the given text. This is just an alias for the
435
+ `parse` method.
436
+ """
437
+ return self.parse(text)
438
+
439
+
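A hedged usage sketch for the parser above; the data path and sentence are placeholders:

    parser = DependencyParser(data_dir='/path/to/espresso-data')
    for parsed in parser.parse('나는 학교에 간다.'):
        print(parsed.to_conll())   # id, token, 1-based head (root -> 0), label per line
        tokens, pos, labels, heads = parsed.to_conll_list()   # assumes POS features were produced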
440
+ class WSDTagger(Tagger):
441
+ """A WSDTagger loads the models and performs WSD tagging on text."""
442
+
443
+ def _load_data(self):
444
+ """Loads data for WSD"""
445
+ md_wsd = Metadata.load_from_file('wsd', self.paths)
446
+ self.nn = load_network(md_wsd)
447
+ self.reader = create_reader(md_wsd)
448
+ self.reader.create_converter()
449
+ self.itd = self.reader.get_inverse_tag_dictionary()
450
+ #self.morph_lexicon = self.reader.morph_lexicon # user lexicon
451
+ #self.co_lexicon = self.reader.co_lexicon
452
+ #self.prob_dict = self.reader.prob_dict
453
+ self.pos_tagger = POSTagger(self.data_dir)
454
+
455
+ def tag(self, text, use_sent_tokenizer=True, mode='standard'):
456
+ """
457
+ Tags the given text.
458
+
459
+ :param text: a string or unicode object. Strings assumed to be utf-8
460
+ :returns: a list of lists (sentences with tokens).
461
+ Each sentence has (token, tag) tuples.
462
+ """
463
+ tokens = utils.tokenize(text)
464
+ result = []
465
+ for sent in tokens:
466
+ tagged = self.tag_sentence(sent)
467
+ result.append(tagged)
468
+
469
+ return result
470
+
471
+ def tag_sentence(self, tokens):
472
+ """
473
+ Tags a given list of tokens.
474
+
475
+ Tokens should be produced with the espresso tokenizer in order to
476
+ match the entries in the vocabulary. If you have non-tokenized text,
477
+ use WSDTagger.tag(text).
478
+
479
+ :param tokens: a list of strings
480
+ :returns: a sequence of (morph, tag) pairs
481
+ """
482
+ pos_tagged = self.pos_tagger.tag_tokens(tokens)
483
+
484
+ pos_tagged = filter(lambda x : x != (' ', 'SP'), pos_tagged)
485
+ unzipped_pos_tagged = zip(*pos_tagged)
486
+ morphs, morph_pos_tags = list(unzipped_pos_tagged)
487
+ #print(morphs, morph_pos_tags)
488
+
489
+ converter = self.reader.converter
490
+ converted_tokens = np.array([converter.convert(token) for token in morphs])
491
+ #print("0", converted_tokens)
492
+
493
+ answer = self.nn.tag_sentence(converted_tokens)
494
+ tags = [self.itd[tag] for tag in answer] # convert tag indices to tag strings
495
+
496
+ #print("1", morphs, tags)
497
+
498
+ return zip(morphs, tags)
499
+
500
+
501
+ class NERTagger(Tagger):
502
+ """A NERTagger loads the models and performs NER tagging on text."""
503
+
504
+ def _load_data(self):
505
+ """Loads data for NER"""
506
+ md_ner = Metadata.load_from_file('ner', self.paths)
507
+ self.nn = load_network(md_ner)
508
+ self.reader = create_reader(md_ner)
509
+ self.reader.create_converter()
510
+ self.itd = self.reader.get_inverse_tag_dictionary()
511
+ #self.morph_lexicon = self.reader.morph_lexicon # user lexicon
512
+ #self.co_lexicon = self.reader.co_lexicon
513
+ #self.prob_dict = self.reader.prob_dict
514
+ self.pos_tagger = POSTagger(self.data_dir)
515
+
516
+ def tag(self, text, use_sent_tokenizer=True, mode='standard'):
517
+ """
518
+ Tags the given text.
519
+
520
+ :param text: a string or unicode object. Strings assumed to be utf-8
521
+ :returns: a list of lists (sentences with tokens).
522
+ Each sentence has (token, tag) tuples.
523
+ """
524
+ tokens = utils.tokenize(text)
525
+ result = []
526
+ for sent in tokens:
527
+ tagged = self.tag_sentence(sent)
528
+ result.append(tagged)
529
+
530
+ return result
531
+
532
+ def tag_sentence(self, tokens):
533
+ """
534
+ Tags a given list of tokens.
535
+
536
+ Tokens should be produced with the espresso tokenizer in order to
537
+ match the entries in the vocabulary. If you have non-tokenized text,
538
+ use NERTagger.tag(text).
539
+
540
+ :param tokens: a list of strings
541
+ :returns: a sequence of (morph, tag) pairs
542
+ """
543
+ pos_tagged = self.pos_tagger.tag_tokens(tokens)
544
+
545
+ pos_tagged = filter(lambda x : x != (' ', 'SP'), pos_tagged) # remove spaces
546
+ unzipped_pos_tagged = zip(*pos_tagged)
547
+ morphs, morph_pos_tags = list(unzipped_pos_tagged)
548
+ #print(morphs, morph_pos_tags)
549
+
550
+ converter = self.reader.converter
551
+ converted_tokens = np.array([converter.convert(token) for token in morphs])
552
+ #print("0", converted_tokens)
553
+
554
+ answer = self.nn.tag_sentence(converted_tokens)
555
+ tags = [self.itd[tag] for tag in answer] # convert tag indices to tag strings
556
+
557
+ #print("1", morphs, tags)
558
+
559
+ return zip(morphs, tags)
560
+
561
+
562
+ class POSTagger(Tagger):
563
+ """A POSTagger loads the models and performs POS tagging on text."""
564
+
565
+ def _load_data(self):
566
+ """Loads data for POS"""
567
+ md = Metadata.load_from_file('pos', self.paths)
568
+ self.nn = load_network(md)
569
+ self.reader = create_reader(md)
570
+ self.reader.create_converter()
571
+ self.itd = self.reader.get_inverse_tag_dictionary()
572
+ self.morph_lexicon = self.reader.morph_lexicon # user lexicon
573
+ self.co_lexicon = self.reader.co_lexicon
574
+ self.prob_dict = self.reader.prob_dict
575
+
576
+ def tag(self, text, use_sent_tokenizer=True, mode="standard"):
577
+ """
578
+ Tags the given text.
579
+
580
+ :param text: a string or unicode object. Strings assumed to be utf-8
581
+ :param mode: [standard, eumjeol, eojeol]. "eumjeol" does not lemmatize,
582
+ "eojeol" includes NN+XV
583
+ :returns: a list of lists (sentences with tokens).
584
+ Each sentence has (token, tag) tuples.
585
+ """
586
+ tokens = utils.tokenize(text, use_sent_tokenizer) # split into sentences, then eumjeol (syllable) units
587
+ result = []
588
+ for sent in tokens:
589
+ tagged = self.tag_tokens(sent, mode)
590
+ result.append(tagged)
591
+
592
+ return result
593
+
594
+ def tag_tokens(self, tokens, mode="standard"):
595
+ """
596
+ Tags a given list of tokens.
597
+
598
+ Tokens should be produced with the espresso tokenizer in order to
599
+ match the entries in the vocabulary. If you have non-tokenized text,
600
+ use POSTagger.tag(text).
601
+
602
+ :param tokens: a list of strings
603
+ :param mode: [standard, eumjeol, eojeol]. "eumjeol" does not lemmatize,
604
+ "eojeol" includes NN+XV
605
+ :returns: (eojeols, eojeol_features) in "eojeol" mode, otherwise a sequence of (morph, tag) pairs
606
+ """
607
+ converter = self.reader.converter # select the converter class
608
+ converted_tokens = np.array([converter.convert('*space*') if token==' ' else converter.convert(token)
609
+ for token in tokens])
610
+ #print("0", converted_tokens)
611
+
612
+ answer = self.nn.tag_sentence(converted_tokens)
613
+ tags = [self.itd[tag] for tag in answer] # convert tag indices to tag strings
614
+
615
+ if mode == 'eojeol':
616
+ eojeols, eojeol_features = self.get_eojeol_tokens(tokens, tags, mode)
617
+ return eojeols, eojeol_features
618
+ else:
619
+ morphs, morph_tags = self.get_morph_tokens(tokens, tags, mode)
620
+ return zip(morphs, morph_tags)
621
+
622
+
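A hedged sketch of the mode switch above; the data path and sentence are placeholders:

    pos = POSTagger(data_dir='/path/to/espresso-data')
    for sent in pos.tag('나는 학교에 간다.', mode='standard'):
        print(list(sent))            # [(morph, tag), ...]
    for eojeols, features in pos.tag('나는 학교에 간다.', mode='eojeol'):
        print(eojeols, features)     # eojeol strings plus (head_morph, head_tag, tail_morph, tail_tag) per eojeol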
623
+ def _get_morph_tokens(self, tokens, tags):
624
+ """
625
+ Separates morphemes by whitespace.
625
+
626
+ :param tokens: a list of strings
627
+ :param tags: a list of tags of each string
628
+ :return: two lists, the morphs and their tags
630
+ """
631
+ #print(utils.get_word(self.morph_lexicon, tokens, tags, True))
632
+ # look up the pre-analyzed (user) dictionary
633
+ tokens, tags = utils.get_word(self.morph_lexicon, tokens, tags, True)
634
+ #print(tokens)
635
+ #print(tags)
636
+ morphs = [''.join(tokens[0]) if isinstance(tokens[0], list) else tokens[0]]
637
+ morph_tags = [(lambda x: 'MA' if x == 'MS' else x)\
638
+ ((lambda x: 'NN' if x == 'NS' else x)(tags[0]))]
639
+ for idx in range(1,len(tokens)):
640
+ if (tags[idx-1]=='NS' and tags[idx]=='NN') \
641
+ or (tags[idx-1]=='MS' and tags[idx]=='MA'):
642
+ morphs.append(morphs.pop()+(''.join(tokens[idx]) if isinstance(tokens[idx], list) else tokens[idx]))
643
+ elif tags[idx-1] != tags[idx] or tags[idx] == 'SY':
644
+ morphs.append(''.join(tokens[idx]) if isinstance(tokens[idx], list) else tokens[idx])
645
+ morph_tags.append((lambda x: 'MA' if x == 'MS' else x)\
646
+ ((lambda x: 'NN' if x == 'NS' else x)(tags[idx])))
647
+ else:
648
+ morphs.append(morphs.pop()+(''.join(tokens[idx]) if isinstance(tokens[idx], list) else tokens[idx]))
649
+
650
+ return morphs, morph_tags
651
+
652
+
653
+ def get_eumjeol_tokens(self, tokens, tags):
654
+ """
655
+ Processes the input as eumjeol (syllable) tokens.
656
+ A 'CO' token is attached to the preceding morpheme and takes its POS tag:
657
+ 새로운 -> 새/VB+로운/CO -> 새로운/VB
658
+
659
+ :param tokens: a list of strings
660
+ :param tags: a list of tags of each string
661
+ :return: two lists, the eumjeols and their tags
662
+ """
663
+ eumjeol = []
664
+ eumjeol_tags = []
665
+ #print(tokens)
666
+ #print(tags)
667
+ for idx in range(0, len(tokens)):
668
+ if idx>0 and (tags[idx]=='CO' and \
669
+ tags[idx-1]!='SP' and tags[idx-1][1]!='N'):
670
+ eumjeol.append(eumjeol.pop()+(''.join(tokens[idx]) if isinstance(tokens[idx], list) else tokens[idx]))
671
+ elif idx>0 and (tags[idx]=='CO' and \
672
+ tags[idx-1]!='SP' and tags[idx-1][1]=='N'):
673
+ eumjeol.append(tokens[idx])
674
+ eumjeol_tags.append('XV')
675
+ elif tags[idx] =='CO':
676
+ eumjeol.append(tokens[idx])
677
+ eumjeol_tags.append('VB')
678
+ else:
679
+ eumjeol.append(tokens[idx])
680
+ eumjeol_tags.append(tags[idx])
681
+ #print(eumjeol)
682
+ #print(eumjeol_tags)
683
+
684
+ return eumjeol, eumjeol_tags
685
+
686
+ def get_eojeol(self, tokens, tags):
687
+ """
688
+ Joins eumjeol (syllable) tokens into eojeol (word) units,
689
+ using the space tokens in the input as eojeol boundaries.
690
+
691
+ :param tokens: a list of strings
692
+ :param tags: a list of tags of each string (unused)
693
+ :return: a list of eojeol strings
694
+
695
+ """
696
+ eojeols = []
697
+ eumjeol = []
698
+ #print(tokens)
699
+ for t in tokens:
700
+ if t == ' ':
701
+ eojeols.append(''.join(eumjeol))
702
+ eumjeol = []
703
+ else:
704
+ eumjeol.append(t)
705
+ eojeols.append(''.join(eumjeol))
706
+ #print(eojeols)
707
+
708
+ return eojeols
709
+
710
+
711
+
712
+ def get_morph_tokens(self, tokens, tags, mode="standard"):
713
+ """
714
+ Combines eumjeol (syllable) tokens into morphemes.
715
+
716
+ :param tokens: eumjeol token list
717
+ :param tags: POS tag list of each token
718
+ """
719
+ _morphs, _morph_tags = self._get_morph_tokens(tokens, tags)
720
+ #print('2---', morphs, morph_tags) # before base-form restoration
721
+
722
+ if mode=='eumjeol':
723
+ eumjeols, eumjeol_tags = self.get_eumjeol_tokens(_morphs, _morph_tags)
724
+ return eumjeols, eumjeol_tags
725
+
726
+ # handle 'CO' tags and contracted forms
727
+ morphs, morph_tags = self.handling_abbrs(_morphs, _morph_tags)
728
+ #print("3", morphs, morph_tags) # 원형복원
729
+
730
+ if mode=='eojeol':
731
+ eojeols = self.get_eojeol(_morphs, _morph_tags)
732
+ return eojeols, morphs, morph_tags
733
+ return morphs, morph_tags
734
+
735
+ def handling_abbrs(self, _morphs, _tags):
736
+ '''
737
+ Handles CO tags and contractions.
738
+ '''
739
+ morphs = []
740
+ morph_tags = []
741
+ #print(_morphs, _tags, flush=True)
742
+
743
+ for i, t in enumerate(_tags):
744
+ if t == 'CO':
745
+ prev_morph = _morphs[i-1] if i > 0 else 'BOS'
746
+ prev_tag = _tags[i-1] if i > 0 else 'BOS'
747
+ next_morph = _morphs[i+1] if i < len(_tags)-1 else 'EOS'
748
+ next_tag = _tags[i+1] if i < len(_tags)-1 else 'EOS'
749
+ if _tags[i-1] in ['MM']:
750
+ morph_tags[-1] = 'NN'; _tags[i-1] = 'NN'; prev_tag='NN' # to be removed later
751
+ morphs, morph_tags = self.handling_co_tags(morphs, morph_tags, \
752
+ prev_morph, prev_tag, _morphs[i], _tags[i], next_morph, next_tag)
753
+ elif i > 0:
754
+ morphs, morph_tags = self.handling_others(morphs, morph_tags, _morphs[i], _tags[i])
755
+ else:
756
+ if _morphs[i] == ' ': t = 'SP'
757
+ morphs.append(_morphs[i])
758
+ morph_tags.append(t)
759
+ return morphs, morph_tags
760
+
761
+
762
+ def handling_others(self, morphs, morph_tags, morph, tag):
763
+ '''
764
+ Handles the remaining morphemes, including
765
+ 1. contractions of 'ㄴ, ㄹ, ㅁ, ㅂ, ㅆ'
766
+ 2. eojeols such as '가수다' (noun + contracted copula).
767
+ '''
768
+ #print(morphs, morph_tags, morph, tag)
769
+ try:
770
+ if morph_tags[-1] == tag: # attach to the already restored morpheme
771
+ morphs.append(morphs.pop()+morph) # 미룬다 -> (미루 + ㄴ) + 다 -> 미루 + ㄴ다
772
+ elif morph_tags[-1] == 'NN' and tag == 'EE': # '가수다'
773
+ morphs.append('이')
774
+ morph_tags.append('VB')
775
+ morphs.append(morph)
776
+ morph_tags.append(tag)
777
+ elif morph_tags[-1] == 'MM' and tag == 'XV': # to be removed later; handles the 'MM'+'XV' case
778
+ morph_tags[-1] = 'NN'
779
+ morphs.append(morph)
780
+ morph_tags.append(tag)
781
+ else:
782
+ morphs.append(morph)
783
+ morph_tags.append(tag)
784
+ #print('9>', i, morphs, morph_tags)
785
+ except:
786
+ print('>>>', morphs, morph, tag)
787
+ return morphs, morph_tags
788
+
789
+ def handling_co_tags(self, morphs, morph_tags, m_1, t_1, m, t, m__1, t__1):
790
+ """
791
+ Handles CO tags; expands the morpheme carrying a CO tag.
792
+ """
793
+ #print(morphs, morph_tags, m, t)
794
+ #------------------------------------------------------------
795
+ def get_best_path(l):
796
+ max_p = -1000; max_list = []; max_same_morph = 10
797
+ for idx, x in enumerate(l):
798
+ same_morph = 0
799
+ _m_t_ = etc.parse_morph(x) # '가/VB+ㄴ/EE' -> [(가, VB), (ㄴ, EE)]
800
+ #print(morphs, morph_tags, m_1, t_1, _m_t_)
801
+ # previous morpheme
802
+ if (t_1 == _m_t_[0][1]) or (t_1 in ['JJ']):
803
+ same_morph = -1 if len(morph_tags)>1 else 0
804
+ first_word = _m_t_[0][0]
805
+ #print(same_morph, morph_tags[same_morph])
806
+ while morph_tags[same_morph] == _m_t_[0][1] or morph_tags[same_morph] in ['JJ']:
807
+ first_word = morphs[same_morph] + first_word
808
+ same_morph -= 1
809
+ if (len(morph_tags)+same_morph)<0 or len(morph_tags) == 1 : break
810
+ prev_word = (morphs[same_morph]+'/'+morph_tags[same_morph]) if (len(morph_tags)+same_morph)>=0 else 'BOS'
811
+ prev_tag = morph_tags[same_morph] if (len(morph_tags)+same_morph)>=0 else 'BOS'
812
+ first_word = first_word+'/'+_m_t_[0][1]
813
+ else:
814
+ prev_word = m_1+'/'+t_1 # for the Viterbi-style scoring
815
+ prev_tag = t_1
816
+ first_word = (_m_t_[0][0]+'/'+_m_t_[0][1])
817
+ first_tag = _m_t_[0][1]
818
+ last_word = _m_t_[-1][0]+'/'+_m_t_[-1][1]
819
+ last_tag = _m_t_[-1][1]
820
+
821
+ p = (self.prob_dict[prev_word] if prev_word in self.prob_dict else -100)
822
+ #print(p)
823
+ p += (self.prob_dict[prev_tag + ' ' + first_tag] if prev_tag + ' ' + first_tag in self.prob_dict else -100) \
824
+ + (self.prob_dict[first_word] if first_word in self.prob_dict else -100) \
825
+ + (self.prob_dict[last_word] if last_word in self.prob_dict else -100) \
826
+ + (self.prob_dict[last_tag + ' ' + t__1] if last_tag + ' ' + t__1 in self.prob_dict else -100)
827
+ #print(p)
828
+ if '/' in first_word:
829
+ first_word = first_word.split('/', 1)[0]
830
+ if p > max_p:
831
+ max_p = p
832
+ max_same_morph = same_morph
833
+ max_list = []
834
+ for i, (m, t) in enumerate(_m_t_):
835
+ m = first_word if i==0 else m
836
+ t = first_tag if i==0 else t
837
+ max_list.append((m,t))
838
+ #max_list = _m_t_
839
+ #print(max_same_morph, max_list)
840
+ return max_list, max_same_morph+1
841
+
842
+ # ---------------------------------------------------------
843
+ try:
844
+ l = self.co_lexicon[m].split('|')
845
+ except:
846
+ morphs.append(m)
847
+ morph_tags.append('NN')
848
+ return morphs, morph_tags
849
+
850
+ if len(l) == 1: # when there is only one candidate
851
+ _m_t_ = etc.parse_morph(l[0])
852
+ for _m_, _t_ in _m_t_:
853
+ if len(morph_tags)>1 and morph_tags[-1] == _t_:
854
+ morphs.append(morphs.pop()+_m_)
855
+ else:
856
+ morphs.append(_m_)
857
+ morph_tags.append(_t_)
858
+ return morphs, morph_tags
859
+
860
+ # ------- when the lexicon has two or more candidates ----------
861
+ max_list, overlap_idx = get_best_path(l)
862
+ co_morphs = [m for (m,t) in max_list]
863
+ co_morph_tags = [t for (m,t) in max_list]
864
+ #print(':::', overlap_idx, morphs, co_morphs)
865
+ if overlap_idx <= 0: #handling of overlapping morphemes
866
+ morphs = morphs[:overlap_idx] + co_morphs
867
+ morph_tags = morph_tags[:overlap_idx] + co_morph_tags
868
+ else:
869
+ morphs = morphs + co_morphs
870
+ morph_tags = morph_tags + co_morph_tags
871
+ return morphs, morph_tags
872
+
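get_best_path above scores each candidate analysis by summing log-probability lookups in prob_dict, with a -100 fallback for unseen entries. The toy snippet below shows only that scoring idea; the entries and scores are invented, and the real code also adds terms for the previous and following morphemes:

    candidates = ['가/VB+ㄴ/EE', '갈/VB+ㄴ/EE']                # e.g. one co_lexicon value split on '|'
    prob_dict = {'가/VB': -2.0, '갈/VB': -3.5, 'ㄴ/EE': -1.0}   # made-up log probabilities

    def score(candidate):
        # sum the per-morpheme scores, defaulting to -100 for unseen entries
        parts = [p.rsplit('/', 1) for p in candidate.split('+')]
        return sum(prob_dict.get('/'.join(p), -100) for p in parts)

    best = max(candidates, key=score)   # -> '가/VB+ㄴ/EE'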
873
+ def get_eojeol_tokens(self, tokens, tags, mode="eojeol"):
874
+ """
875
+ Reconnects morphemes separated during restoration and attaches XV
876
+ morphemes for parsing: 사랑+하 -> 사랑하 (for dependency parsing).
877
+
878
+ :param tokens: eumjeol (syllable) tokens
879
+ :param tags: POS tags
880
+ """
881
+ eojeols, morphs, morph_tags = self.get_morph_tokens(tokens, tags, mode)
882
+ eojeol_features = []
883
+ #print(morphs, morph_tags)
884
+ head_m = ''; head_t = ''; tail_m=''; tail_t=''
885
+ for i in range(len(morphs)):
886
+ t = morph_tags[i]
887
+ #print(i, morphs[i], len(morphs), t)
888
+ #if ((i==0 or morphs[i-1] == ' ') and (i == len(morphs)-1 or morphs[i+1] == ' ')):
889
+ # # when an eojeol consists only of symbols
890
+ # head_m = morphs[i]
891
+ # head_t = morph_tags[i]
892
+ # tail_m = morphs[i]
893
+ # tail_t = morph_tags[i]
894
+ # eojeol_features.append((head_m, head_t, tail_m, tail_t))
895
+ # print("1::::", head_m, head_t, tail_m, tail_t)
896
+ # continue
897
+
898
+ # end of an eojeol
899
+ if t == 'SP' :
900
+ ## tail feature of last eojeol
901
+ tail_m = morphs[i-2] if (morph_tags[i-1] == 'SY' and morphs[i-1]!=',' and morphs[i-2]!= ' ') else morphs[i-1]
902
+ tail_t = morph_tags[i-2] if (morph_tags[i-1] == 'SY' and morphs[i-1]!=',' and morphs[i-2] != ' ') else morph_tags[i-1]
903
+ tail_t = 'EE' if tail_t in ['XV', 'VB'] else tail_t # irregular sentence-ending case
904
+ eojeol_features.append((head_m, head_t, tail_m, tail_t))
905
+ #print("2::::", head_m, head_t, tail_m, tail_t)
906
+ continue
907
+
908
+ if i == len(morphs)-1:
909
+ ## tail feature of last eojeol
910
+ tail_m = morphs[i-1] if (morph_tags[i] == 'SY' and morphs[i]!=',' and morphs[i-1]!= ' ') else morphs[i]
911
+ tail_t = morph_tags[i-1] if (morph_tags[i] == 'SY' and morphs[i]!=',' and morphs[i-1]!= ' ') else morph_tags[i]
912
+ tail_t = 'EE' if tail_t in ['XV', 'VB'] else tail_t # irregular sentence-ending case
913
+ eojeol_features.append((head_m, head_t, tail_m, tail_t))
914
+ #print("3::::", head_m, head_t, tail_m, tail_t)
915
+ continue
916
+
917
+ # start of an eojeol
918
+ if i == 0 or morphs[i-1] == ' ':
919
+ head_m = morphs[i+1] if (morph_tags[i] == 'SY' and morph_tags[i+1] != 'SP') else morphs[i]
920
+ head_t = morph_tags[i+1] if (morph_tags[i] == 'SY' and morph_tags[i+1] != 'SP') else morph_tags[i]
921
+ idx = 2 if ((morph_tags[i] == 'SY' and morph_tags[i+1] != 'SP') and i < len(morphs)-2) else 1
922
+ head_t += morph_tags[i+idx] if morph_tags[i+idx] in ['XV', 'VB'] else ''
923
+ #print("4:::", i, idx, morph_tags, head_m, head_t, tail_m, tail_t)
924
+
925
+ #print(eojeols, eojeol_features)
926
+ return eojeols, eojeol_features