nltkor-1.2.14-cp311-cp311-macosx_13_0_x86_64.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- nltkor/Kor_char.py +193 -0
- nltkor/__init__.py +16 -0
- nltkor/alignment/__init__.py +1315 -0
- nltkor/cider/__init__.py +2 -0
- nltkor/cider/cider.py +55 -0
- nltkor/cider/cider_scorer.py +207 -0
- nltkor/distance/__init__.py +441 -0
- nltkor/distance/wasserstein.py +126 -0
- nltkor/etc.py +22 -0
- nltkor/lazyimport.py +144 -0
- nltkor/make_requirement.py +11 -0
- nltkor/metrics/__init__.py +63 -0
- nltkor/metrics/bartscore.py +301 -0
- nltkor/metrics/bertscore.py +331 -0
- nltkor/metrics/bleu_tensor.py +20 -0
- nltkor/metrics/classical.py +847 -0
- nltkor/metrics/entment.py +24 -0
- nltkor/metrics/eval.py +517 -0
- nltkor/metrics/mauve.py +273 -0
- nltkor/metrics/mauve_utils.py +131 -0
- nltkor/misc/__init__.py +11 -0
- nltkor/misc/string2string_basic_functions.py +59 -0
- nltkor/misc/string2string_default_tokenizer.py +83 -0
- nltkor/misc/string2string_hash_functions.py +159 -0
- nltkor/misc/string2string_word_embeddings.py +503 -0
- nltkor/search/__init__.py +10 -0
- nltkor/search/classical.py +569 -0
- nltkor/search/faiss_search.py +787 -0
- nltkor/search/kobert_tokenizer.py +181 -0
- nltkor/sejong/__init__.py +3 -0
- nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
- nltkor/sejong/ch.py +12 -0
- nltkor/sejong/dict_semClassNum.txt +491 -0
- nltkor/sejong/layer.txt +630 -0
- nltkor/sejong/sejong_download.py +87 -0
- nltkor/sejong/ssem.py +684 -0
- nltkor/similarity/__init__.py +3 -0
- nltkor/similarity/bartscore____.py +337 -0
- nltkor/similarity/bertscore____.py +339 -0
- nltkor/similarity/classical.py +245 -0
- nltkor/similarity/cosine_similarity.py +175 -0
- nltkor/tag/__init__.py +71 -0
- nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
- nltkor/tag/espresso_tag.py +220 -0
- nltkor/tag/libs/__init__.py +10 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
- nltkor/tag/libs/arguments.py +280 -0
- nltkor/tag/libs/attributes.py +231 -0
- nltkor/tag/libs/config.py +159 -0
- nltkor/tag/libs/metadata.py +129 -0
- nltkor/tag/libs/ner/__init__.py +2 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/macmorphoreader.py +7 -0
- nltkor/tag/libs/ner/ner_reader.py +92 -0
- nltkor/tag/libs/network.c +72325 -0
- nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
- nltkor/tag/libs/network.pyx +878 -0
- nltkor/tag/libs/networkconv.pyx +1028 -0
- nltkor/tag/libs/networkdependencyconv.pyx +451 -0
- nltkor/tag/libs/parse/__init__.py +1 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/parse_reader.py +283 -0
- nltkor/tag/libs/pos/__init__.py +2 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/macmorphoreader.py +7 -0
- nltkor/tag/libs/pos/pos_reader.py +97 -0
- nltkor/tag/libs/reader.py +485 -0
- nltkor/tag/libs/srl/__init__.py +3 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__srl_reader_.py +535 -0
- nltkor/tag/libs/srl/srl_reader.py +436 -0
- nltkor/tag/libs/srl/train_srl.py +87 -0
- nltkor/tag/libs/taggers.py +926 -0
- nltkor/tag/libs/utils.py +384 -0
- nltkor/tag/libs/word_dictionary.py +239 -0
- nltkor/tag/libs/wsd/__init__.py +2 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
- nltkor/tag/libs/wsd/wsd_reader.py +93 -0
- nltkor/tokenize/__init__.py +62 -0
- nltkor/tokenize/ko_tokenize.py +115 -0
- nltkor/trans.py +121 -0
- nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
- nltkor-1.2.14.dist-info/METADATA +41 -0
- nltkor-1.2.14.dist-info/RECORD +127 -0
- nltkor-1.2.14.dist-info/WHEEL +5 -0
- nltkor-1.2.14.dist-info/top_level.txt +1 -0
nltkor/sejong/ssem.py
ADDED
@@ -0,0 +1,684 @@
from xml.etree.ElementTree import parse
import os, re
from operator import eq
import time
import nltkor
from nltkor.sejong.sejong_download import SejongDir


common_path=os.path.dirname(nltkor.sejong.__file__)
class Entry():

    def __init__(self, name, en, pos):
        self.name = name
        self.entry = en
        self.pos = pos
        self.SejongDir = SejongDir()


    def __repr__(self):
        return "%s('%s')" % (type(self).__name__, self.name)



    # return Sense objects
    def senses(self):
        list = []

        allsense = self.entry.findall("sense")
        for se in allsense:
            try:
                ss = str(self.name + "." + se.attrib['n'])
            except KeyError:
                ss = str(self.name)
            temp = Sense(ss, se, self.pos)
            list.append(temp)

        return list

    # idioms
    def idm(self):
        list = []
        try:
            id = self.entry.find("idm_grp")
            idm = id.findall("idm")
        except AttributeError:
            return list

        for tmp in idm:
            if tmp.text is None:
                return list

            if '~' in tmp.text:
                name = self.name.split('.')
                tmp.text = tmp.text.replace('~', name[0])
            list.append(tmp.text)

        return list

    # compound words
    def comp(self):

        list = []
        try:
            mor = self.entry.find("morph_grp")
            comp = mor.findall("comp")
        except AttributeError:
            return list

        for tmp in comp:
            if tmp.text is None:
                return list

            if '~' in tmp.text:
                name = self.name.split('.')
                tmp.text = tmp.text.replace('~', name[0])
            list.append(tmp.text)

        return list

    # derived words
    def der(self):
        list = []
        try:
            mor = self.entry.find("morph_grp")
            comp = mor.findall("der")
        except AttributeError:
            return list
        for tmp in comp:
            if tmp.text is None:
                return list

            if '~' in tmp.text:
                name = self.name.split('.')
                tmp.text = tmp.text.replace('~', name[0])
            list.append(tmp.text)

        return list


class Sense():

    def __init__(self, name, se, pos):
        self.name = name
        self.sense = se
        self.pos = pos

    def __repr__(self):
        return "%s('%s')" % (type(self).__name__, self.name)

    # common lr tag
    def common_lr(self, sense):
        sem = sense.find("sem_grp")
        lr = sem.find("lr")
        return lr

    # semantic class (sem)
    def sem(self):
        list = []
        sem = self.sense.find("sem_grp")
        synn = sem.find("sem_class")
        try:
            synn = synn.text
        except AttributeError:
            return list

        list.append(synn)
        return list

        # if None in list:
        #     list = []
        #     return list
        # else:
        #     return list
    # synonyms
    def syn(self):
        list = []
        lr = self.common_lr(self.sense)
        try:
            synn = lr.findall("syn")
        except AttributeError:
            return list

        for tmp in synn:
            list.append(tmp.text)

        if None in list:
            list = []
            return list
        else:
            return list
    # antonyms
    def ant(self):
        list = []
        lr = self.common_lr(self.sense)
        try:
            ant = lr.findall("ant")
        except AttributeError:
            return list

        for tmp in ant:
            list.append(tmp.text)

        if None in list:
            list = []
            return list
        else:
            return list
    # coordinate terms
    def coord(self):
        list = []
        lr = self.common_lr(self.sense)
        try:
            coo = lr.findall("coord")
        except AttributeError:
            return list

        for tmp in coo:
            list.append(tmp.text)

        if None in list:
            list = []
            return list
        else:
            return list
    # meronyms
    def mero(self):
        list = []
        lr = self.common_lr(self.sense)
        try:
            me = lr.findall("mero")
        except AttributeError:
            return list

        for tmp in me:
            list.append(tmp.text)

        '''if not list:
            return("@@@@@",list)
        '''
        if None in list:
            list = []
            return list
        else:
            return list
    # hypernyms
    def hyper(self):
        list = []
        lr = self.common_lr(self.sense)
        try:
            hy = lr.findall("hyper")
        except AttributeError:
            return list

        for tmp in hy:
            list.append(tmp.text)

        if None in list:
            list = []
            return list
        else:
            return list
    # hyponyms
    def hypo(self):
        list = []
        lr = self.common_lr(self.sense)
        try:
            hy = lr.findall("hypo")
        except AttributeError:
            return list

        for tmp in hy:
            if '~' in tmp.text:
                name = self.name.split('.')
                tmp.text = tmp.text.replace('~', name[0])
                list.append(tmp.text)
            else:
                list.append(tmp.text)

        if None in list:
            list = []
            return list
        else:
            return list
    # holonyms
    def holo(self):
        list = []
        lr = self.common_lr(self.sense)
        try:
            ho = lr.findall("holo")
        except AttributeError:
            return list

        for tmp in ho:
            if '~' in tmp.text:
                name = self.name.split('.')
                tmp.text = tmp.text.replace('~', name[0])
                list.append(tmp.text)
            else:
                list.append(tmp.text)

        if None in list:
            list = []
            return list
        else:
            return list
    # related words
    def rel(self):
        list = []
        lr = self.common_lr(self.sense)
        try:
            rel = lr.findall("rel")
        except AttributeError:
            return list

        for tmp in rel:
            list.append(tmp.text)

        if None in list:
            list = []
            return list
        else:
            return list
    # example sentences
    def example(self):
        list = []

        if self.pos != 'nng_s':
            return list

        else:
            sem = self.sense.find("sem_grp")
            eg = sem.findall("eg")
            for tmp in eg:
                if '~' in tmp.text:
                    name = self.name.split('.')
                    tmp.text = tmp.text.replace('~', name[0])
                    list.append(tmp.text)
                else:
                    list.append(tmp.text)

            if None in list:
                list = []
                return list
            else:
                return list
    # English translations
    def trans(self):
        list = []
        sem = self.sense.find("sem_grp")
        trs = sem.findall("trans")
        for tmp in trs:
            list.append(tmp.text)

        if None in list:
            list = []
            return list
        else:
            return list
    # adjective combinations
    def comb_aj(self):
        list = []

        try:
            syn = self.sense.find("syn_grp")
            aj = syn.findall("comb_aj")
        except AttributeError:
            return list

        for tmp in aj:
            if tmp.text is None:
                return list

            if '~' in tmp.text:
                name = self.name.split('.')
                tmp.text = tmp.text.replace('~', name[0])
            list.append(tmp.text)

        return list
    # noun combinations
    def comb_n(self):
        list = []
        try:
            syn = self.sense.find("syn_grp")
            n = syn.findall("comb_n")
        except AttributeError:
            return list
        for tmp in n:
            if tmp.text is None:
                return list

            if '~' in tmp.text:
                name = self.name.split('.')
                tmp.text = tmp.text.replace('~', name[0])
            list.append(tmp.text)

        return list
    # verb combinations
    def comb_v(self):
        list = []
        try:
            syn = self.sense.find("syn_grp")
            v = syn.findall("comb_v")
        except AttributeError:
            return list

        for tmp in v:
            v = tmp.find("form").text
            if v is None:
                return list

            if '~' in v:
                name = self.name.split('.')
                v = v.replace('~', name[0])
            list.append(v)
        return list

    # frame
    def sel_rst(self):

        final = {}
        list = []

        if self.pos == 'nng_s':
            return list

        frame_grps = self.sense.findall("frame_grp")

        for grp in frame_grps:  # each frame_grp type
            sub_list = []
            for subsense in grp.findall('subsense'):  # n subsense elements
                str = ""
                eg_list = []
                check = 0
                for sel_rst in subsense.findall('sel_rst'):  # m sel_rst elements
                    check += 1
                    for tmp in sel_rst.attrib.items():

                        if (tmp[0] == 'arg'):
                            str += ("<" + tmp[0] + "=" + tmp[1] + " ")

                        if (tmp[0] == 'tht'):
                            str += (tmp[0] + "=" + tmp[1] + ">")
                    try:
                        str += (sel_rst.text)
                    except TypeError:
                        str += ' '

                    if (check != len(subsense.findall('sel_rst'))):
                        str += ', '

                for eg in subsense.findall('eg'):
                    eg_list.append(eg.text)

                sub_list.append(str)
                sub_list.append(eg_list)

            final[grp.find('frame').text] = sub_list

        return final

    # path up to the top-level semantic class
    def sem_path(self):

        cur_sem = self.sem()[0]
        if cur_sem == None:
            return []
        filename = common_path+'/dict_semClassNum.txt'
        with open(filename, 'r',encoding="cp949") as file_object:
            lines = file_object.read()

        #print(lines)
        temp_list = []
        sem_list = []
        str = ""

        # build the list of semantic class labels
        for tmp in lines:
            if tmp != '\n' and tmp != '\t':
                str += tmp
            else:
                if (str != ''):
                    sem_list.append(str)
                str = ''

        # locate the sem class of the input word
        regex = re.compile(r"_" + cur_sem + '$')
        for x in sem_list:
            if regex.search(x):
                cur_sem = x
                temp_list.append(cur_sem)

        while len(cur_sem.split('_')[0]) > 1:

            if cur_sem.split('_')[0][-2] == '.':
                tmp = cur_sem.split('_')[0][0:-2] + '_'
            else:
                tmp = cur_sem.split('_')[0][0:-3] + '_'
            regex = re.compile(r"^" + tmp)

            for x in sem_list:
                if regex.search(x):
                    cur_sem = x
                    temp_list.append(x)

        return list(reversed(temp_list))


    # similarity
    def wup_similarity(self,target):
        # self sem
        sem = self.sense.find("sem_grp")
        synn = sem.find("sem_class")
        synn1 = synn.text


        # target sem
        sem=target.sense.find("sem_grp")
        synn=sem.find("sem_class")
        synn2=synn.text


        list=[]
        path=common_path+"/layer.txt"
        f=open(path,'r')
        lines=f.readlines()
        for tmp in lines:
            if '_'+synn1+'\n' in tmp:
                list.append(tmp)
            if '_'+synn2+'\n' in tmp:
                list.append(tmp)

        ch=[]
        for tmp in list:
            ch.append(tmp.split("_")[0])

        word1 =ch[0].split('.');
        word2 =ch[1].split('.');

        same=0

        for tmp in range (0, min(len(word1),len(word2))):
            if word1[tmp] == word2[tmp]:
                same+=2
            else:
                break

        if self.name==target.name:
            same+=2

        result=same/((len(word1)+len(word2))+2)

        return result



# direct access to a Sense by its dotted name
def sense(input):

    input_list = input.split('.')
    arg= (input_list[0]+'.'+input_list[1]+'.'+input_list[2])
    target =entry(arg)
    allsense =target.entry.findall("sense")

    for se in allsense:
        if input==str(target.name+'.'+se.attrib['n']):
            return Sense(input,se,target.pos)

    #ss = str(self.name + "." + se.attrib['n'])
    #ss = str(self.name)

# direct access to an Entry by its dotted name
def entry(input):

    input_list = input.split('.')
    path=common_path+""
    if 'nn' in input_list[1]:
        path += "/01. 체언_상세"
    elif input_list[1] == 'vv':
        path += "/02. 용언_상세//vv"
    elif input_list[1] == 'va':
        path += "/02. 용언_상세//va"
    else:
        return


    path += "//"+input_list[0]+".xml"

    tree = parse(path)
    root = tree.getroot()
    allentry = root.findall("entry")
    for en in allentry:
        try:
            if input==str(input_list[0]+"."+en.attrib['pos']+"." + en.attrib['n']):
                return Entry(str(input_list[0]+"."+en.attrib['pos']+"." + en.attrib['n']), en, str(en.attrib['pos']))
        except KeyError:
            if input==str(input_list[0]+"."+en.attrib['pos']):
                return Entry(str(input_list[0]+"."+en.attrib['pos']), en, str(en.attrib['pos']))

# return Entry objects for a headword
def entrys(word):
    path = filecheck(word)
    list = []

    for tmp in path:
        tree = parse(tmp)
        root = tree.getroot()
        allentry = root.findall("entry")

        for en in allentry:
            try:
                es = str(word + "." + en.attrib['pos'] + "." + en.attrib['n'])
            except KeyError:
                es = str(word + "." + en.attrib['pos'])

            temp = Entry(es, en, str(en.attrib['pos']))
            list.append(temp)

    return list


def _syn(word):

    ets=entrys(word)
    syn_list=[]

    for et in ets:
        for se in et.senses():
            syn_list+=se.syn()

    return syn_list

'''
def entry_error():

    path="./02. 용언_상세//va"
    abs_dir=os.path.join(os.getcwd(),path)
    file_names=os.listdir(abs_dir)

    #print(file_names)
    #print(len(file_names))
    error_list=[]


    for word in file_names:

        fpath=path+"//"+word
        #print(fpath)
        tree=parse(fpath)
        root=tree.getroot()
        #print(root.findtext('orth'))
        allentry=root.findall("entry")

        for en in allentry:
            try:
                en.attrib['n']

            except:

                error_list.append(word)
                break;

    print(error_list)
    print(len(error_list))
    print(len(file_names))


    return error_list



def sense_error():

    path="./02. 용언_상세//va"
    abs_dir=os.path.join(os.getcwd(),path)
    file_names=os.listdir(abs_dir)

    error_list=[]


    for word in file_names:

        fpath=path+"//"+word
        tree=parse(fpath)
        root=tree.getroot()
        allentry=root.findall("entry")

        for en in allentry:
            allsense=en.findall("sense")
            for se in allsense:
                try:
                    se.attrib['n']
                except:

                    if word not in error_list:
                        error_list.append(word)
                    break;

    print(error_list)
    print(len(error_list))
    print(len(file_names))


    return error_list

'''

# file check
def filecheck(word):
    n_path = common_path+"/01. 체언_상세"
    vv_path = common_path+"/02. 용언_상세/vv"
    va_path = common_path+"/02. 용언_상세/va"


    path = [n_path, vv_path, va_path]
    ret_list = []
    check = word + ".xml"


    for tmp in path:
        if check in os.listdir(tmp):
            ret_list.append(tmp + "/" + check)

    return ret_list
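The module added above is the lookup layer over the Sejong electronic-dictionary XML files: entrys(word) returns Entry objects backed by the per-headword XML files, Entry.senses() yields Sense objects, and each Sense exposes lexical relations (syn, ant, hyper, hypo, ...), selectional restrictions (sel_rst), the semantic-class path (sem_path), and a Wu-Palmer-style similarity (wup_similarity). The following is a minimal usage sketch, not taken from the package documentation: it assumes the Sejong XML data is already present under the package directory (the SejongDir helper in sejong_download.py appears responsible for fetching it), and the headword '사랑' and the dotted sense identifiers are hypothetical examples.

# Minimal usage sketch (illustrative; headword and sense IDs are hypothetical).
from nltkor.sejong import ssem

for e in ssem.entrys('사랑'):           # Entry objects for a headword
    for s in e.senses():                # Sense objects for each entry
        print(s, s.sem(), s.syn())      # semantic class and synonyms

# A single sense can also be addressed by its dotted name,
# '<headword>.<pos>.<entry n>.<sense n>'.
s1 = ssem.sense('사랑.nng_s.1.1')
s2 = ssem.sense('사랑.nng_s.1.2')
if s1 and s2:
    print(s1.sem_path())                # semantic-class path for the sense
    print(s1.wup_similarity(s2))        # similarity between the two senses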