nltkor 1.2.0__cp39-cp39-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. nltkor/Kor_char.py +193 -0
  2. nltkor/__init__.py +15 -0
  3. nltkor/alignment/__init__.py +1315 -0
  4. nltkor/cider/__init__.py +2 -0
  5. nltkor/cider/cider.py +55 -0
  6. nltkor/cider/cider_scorer.py +207 -0
  7. nltkor/distance/__init__.py +441 -0
  8. nltkor/distance/wasserstein.py +126 -0
  9. nltkor/etc.py +22 -0
  10. nltkor/lazyimport.py +144 -0
  11. nltkor/make_requirement.py +11 -0
  12. nltkor/metrics/__init__.py +63 -0
  13. nltkor/metrics/bartscore.py +301 -0
  14. nltkor/metrics/bertscore.py +331 -0
  15. nltkor/metrics/bleu_tensor.py +20 -0
  16. nltkor/metrics/classical.py +814 -0
  17. nltkor/metrics/entment.py +24 -0
  18. nltkor/metrics/eval.py +517 -0
  19. nltkor/metrics/mauve.py +273 -0
  20. nltkor/metrics/mauve_utils.py +131 -0
  21. nltkor/misc/__init__.py +11 -0
  22. nltkor/misc/string2string_basic_functions.py +59 -0
  23. nltkor/misc/string2string_default_tokenizer.py +83 -0
  24. nltkor/misc/string2string_hash_functions.py +159 -0
  25. nltkor/misc/string2string_word_embeddings.py +503 -0
  26. nltkor/search/__init__.py +10 -0
  27. nltkor/search/classical.py +569 -0
  28. nltkor/search/faiss_search.py +467 -0
  29. nltkor/search/kobert_tokenizer.py +181 -0
  30. nltkor/sejong/__init__.py +3 -0
  31. nltkor/sejong/ch.py +12 -0
  32. nltkor/sejong/dict_semClassNum.txt +491 -0
  33. nltkor/sejong/layer.txt +630 -0
  34. nltkor/sejong/sejong_download.py +87 -0
  35. nltkor/sejong/ssem.py +685 -0
  36. nltkor/similarity/__init__.py +3 -0
  37. nltkor/similarity/bartscore____.py +337 -0
  38. nltkor/similarity/bertscore____.py +339 -0
  39. nltkor/similarity/classical.py +245 -0
  40. nltkor/similarity/cosine_similarity.py +175 -0
  41. nltkor/tag/__init__.py +70 -0
  42. nltkor/tag/espresso_tag.py +220 -0
  43. nltkor/tag/libs/__init__.py +9 -0
  44. nltkor/tag/libs/arguments.py +280 -0
  45. nltkor/tag/libs/attributes.py +231 -0
  46. nltkor/tag/libs/config.py +158 -0
  47. nltkor/tag/libs/metadata.py +129 -0
  48. nltkor/tag/libs/ner/__init__.py +2 -0
  49. nltkor/tag/libs/ner/macmorphoreader.py +7 -0
  50. nltkor/tag/libs/ner/ner_reader.py +92 -0
  51. nltkor/tag/libs/network.c +59267 -0
  52. nltkor/tag/libs/network.cpython-39-darwin.so +0 -0
  53. nltkor/tag/libs/parse/__init__.py +1 -0
  54. nltkor/tag/libs/parse/parse_reader.py +283 -0
  55. nltkor/tag/libs/pos/__init__.py +2 -0
  56. nltkor/tag/libs/pos/macmorphoreader.py +7 -0
  57. nltkor/tag/libs/pos/pos_reader.py +89 -0
  58. nltkor/tag/libs/reader.py +510 -0
  59. nltkor/tag/libs/srl/__init__.py +3 -0
  60. nltkor/tag/libs/srl/__srl_reader_.py +535 -0
  61. nltkor/tag/libs/srl/srl_reader.py +436 -0
  62. nltkor/tag/libs/srl/train_srl.py +87 -0
  63. nltkor/tag/libs/taggers.py +926 -0
  64. nltkor/tag/libs/utils.py +344 -0
  65. nltkor/tag/libs/word_dictionary.py +239 -0
  66. nltkor/tag/libs/wsd/__init__.py +2 -0
  67. nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
  68. nltkor/tag/libs/wsd/wsd_reader.py +93 -0
  69. nltkor/tokenize/__init__.py +62 -0
  70. nltkor/tokenize/ko_tokenize.py +115 -0
  71. nltkor/trans.py +121 -0
  72. nltkor-1.2.0.dist-info/LICENSE.txt +1093 -0
  73. nltkor-1.2.0.dist-info/METADATA +33 -0
  74. nltkor-1.2.0.dist-info/RECORD +76 -0
  75. nltkor-1.2.0.dist-info/WHEEL +5 -0
  76. nltkor-1.2.0.dist-info/top_level.txt +1 -0
nltkor/sejong/ssem.py ADDED
@@ -0,0 +1,685 @@
1
from xml.etree.ElementTree import parse
import os, re
from operator import eq
import time

import nltkor
from nltkor.sejong.sejong_download import SejongDir

# Directory that holds the bundled Sejong dictionary data files
# (XML dictionaries, dict_semClassNum.txt, layer.txt).
common_path = os.path.dirname(nltkor.sejong.__file__)
# Shared downloader/locator singleton.
# NOTE(review): this rebinds the *class* name to an instance, so the name
# ``SejongDir`` is no longer callable after this line.
SejongDir = SejongDir()
11
class Entry():
    """One headword entry (<entry> element) from a Sejong dictionary XML file.

    Attributes:
        name: dotted label, e.g. "사과.nng_s.1".
        entry: the xml.etree Element for this <entry>.
        pos: part-of-speech string from the entry's ``pos`` attribute.
    """

    def __init__(self, name, en, pos):
        self.name = name
        self.entry = en
        self.pos = pos
        # BUG FIX: the module-level name ``SejongDir`` is rebound to an
        # *instance* (``SejongDir = SejongDir()``), so the original
        # ``SejongDir()`` call here raised TypeError.  Reuse the shared
        # instance instead of trying to call it.
        self.SejongDir = SejongDir

    def __repr__(self):
        return "%s('%s')" % (type(self).__name__, self.name)

    def _texts(self, group_tag, item_tag):
        """Collect texts of every <item_tag> under <group_tag>.

        '~' in a text is expanded to the headword (the part of ``name``
        before the first dot).  Mirrors the original behaviour of the
        three callers: returns what has been collected so far as soon as
        an item with empty text is met, and [] when the group is absent.
        """
        results = []
        try:
            group = self.entry.find(group_tag)
            items = group.findall(item_tag)
        except AttributeError:  # <group_tag> missing -> find() returned None
            return results
        headword = self.name.split('.')[0]
        for item in items:
            if item.text is None:
                return results
            results.append(item.text.replace('~', headword))
        return results

    # sense 객체 리턴
    def senses(self):
        """Return a Sense object for every <sense> child of this entry."""
        result = []
        for se in self.entry.findall("sense"):
            try:
                label = str(self.name + "." + se.attrib['n'])
            except KeyError:  # sense without an 'n' attribute
                label = str(self.name)
            result.append(Sense(label, se, self.pos))
        return result

    # 숙어 (idioms)
    def idm(self):
        """Idioms listed under <idm_grp>, with '~' expanded to the headword."""
        return self._texts("idm_grp", "idm")

    # 복합어 (compounds)
    def comp(self):
        """Compound words listed under <morph_grp>."""
        return self._texts("morph_grp", "comp")

    # 파생어 (derived words)
    def der(self):
        """Derived words listed under <morph_grp>."""
        return self._texts("morph_grp", "der")
99
+
100
+
101
class Sense():
    """A single word sense (<sense> element) of a Sejong dictionary entry.

    Attributes:
        name: dotted label, e.g. "사과.nng_s.1".
        sense: the xml.etree Element for this <sense>.
        pos: part-of-speech string ('nng_s', 'vv', 'va', ...).
    """

    def __init__(self, name, se, pos):
        self.name = name
        self.sense = se
        self.pos = pos

    def __repr__(self):
        return "%s('%s')" % (type(self).__name__, self.name)

    # 공통 태그 -- shared lookup of the lexical-relations element
    def common_lr(self, sense):
        """Return the <lr> element under <sem_grp> (may be None).

        Intentionally raises AttributeError when <sem_grp> is missing,
        as in the original code; callers only guard the findall() step.
        """
        sem = sense.find("sem_grp")
        lr = sem.find("lr")
        return lr

    def _lr_texts(self, tag, expand=False):
        """Texts of every <tag> under <lr>.

        Returns [] when <lr> is absent or any collected text is empty.
        When *expand* is true, '~' is replaced by the headword (the part
        of ``name`` before the first dot).
        """
        lr = self.common_lr(self.sense)
        try:
            nodes = lr.findall(tag)
        except AttributeError:  # <lr> missing -> lr is None
            return []
        texts = [node.text for node in nodes]
        if None in texts:
            return []
        if expand:
            headword = self.name.split('.')[0]
            texts = [t.replace('~', headword) for t in texts]
        return texts

    def _comb_texts(self, tag):
        """Texts of every <tag> under <syn_grp>, '~' expanded.

        Mirrors the original behaviour: stops and returns what has been
        collected so far when an item with empty text is met.
        """
        results = []
        try:
            grp = self.sense.find("syn_grp")
            nodes = grp.findall(tag)
        except AttributeError:  # <syn_grp> missing
            return results
        headword = self.name.split('.')[0]
        for node in nodes:
            if node.text is None:
                return results
            results.append(node.text.replace('~', headword))
        return results

    # sem -- semantic class
    def sem(self):
        """Semantic class of this sense as a one-element list ([] if absent)."""
        sem = self.sense.find("sem_grp")
        node = sem.find("sem_class")
        try:
            text = node.text
        except AttributeError:  # <sem_class> missing
            return []
        return [text]

    # 동의어 (synonyms)
    def syn(self):
        return self._lr_texts("syn")

    # 반의어 (antonyms)
    def ant(self):
        return self._lr_texts("ant")

    # 동위어 (coordinate terms)
    def coord(self):
        return self._lr_texts("coord")

    # 부분어 (meronyms)
    def mero(self):
        return self._lr_texts("mero")

    # 상위어 (hypernyms)
    def hyper(self):
        return self._lr_texts("hyper")

    # 하위어 (hyponyms); '~' expanded to the headword
    def hypo(self):
        # Robustness fix: an empty <hypo/> now yields [] instead of the
        # original TypeError on ``'~' in None``.
        return self._lr_texts("hypo", expand=True)

    # 전체어 (holonyms); '~' expanded to the headword
    def holo(self):
        return self._lr_texts("holo", expand=True)

    # 관련어 (related words)
    def rel(self):
        return self._lr_texts("rel")

    # 예시 (usage examples)
    def example(self):
        """Usage examples; only provided for common nouns ('nng_s')."""
        if self.pos != 'nng_s':
            return []
        sem = self.sense.find("sem_grp")
        texts = [eg.text for eg in sem.findall("eg")]
        if None in texts:
            return []
        headword = self.name.split('.')[0]
        return [t.replace('~', headword) for t in texts]

    # 영어 (English translations)
    def trans(self):
        sem = self.sense.find("sem_grp")
        texts = [node.text for node in sem.findall("trans")]
        return [] if None in texts else texts

    # 형용사 결합 (adjective combinations)
    def comb_aj(self):
        return self._comb_texts("comb_aj")

    # 명사 결합 (noun combinations)
    def comb_n(self):
        return self._comb_texts("comb_n")

    # 동사 결합 (verb combinations)
    def comb_v(self):
        """The <form> text of each <comb_v> under <syn_grp>, '~' expanded."""
        results = []
        try:
            grp = self.sense.find("syn_grp")
            nodes = grp.findall("comb_v")
        except AttributeError:
            return results
        headword = self.name.split('.')[0]
        for node in nodes:
            form = node.find("form").text
            if form is None:
                return results
            results.append(form.replace('~', headword))
        return results

    # frame -- selectional restrictions
    def sel_rst(self):
        """Map each frame text to [args-string, [examples]] per subsense.

        Returns [] (a list, matching the original) for common nouns.
        """
        final = {}
        if self.pos == 'nng_s':
            return []
        for grp in self.sense.findall("frame_grp"):
            sub_list = []
            for subsense in grp.findall('subsense'):
                args = ""
                eg_list = []
                rsts = subsense.findall('sel_rst')
                for idx, rst in enumerate(rsts, 1):
                    for key, value in rst.attrib.items():
                        if key == 'arg':
                            args += "<" + key + "=" + value + " "
                        if key == 'tht':
                            args += key + "=" + value + ">"
                    try:
                        args += rst.text
                    except TypeError:  # element has no text
                        args += ' '
                    if idx != len(rsts):
                        args += ', '
                for eg in subsense.findall('eg'):
                    eg_list.append(eg.text)
                sub_list.append(args)
                sub_list.append(eg_list)
            final[grp.find('frame').text] = sub_list
        return final

    # 최상위 경로 -- path of semantic classes up to the root
    def sem_path(self):
        """Return the chain of semantic-class labels from root to this sense."""
        cur_sem = self.sem()[0]
        if cur_sem is None:
            return []
        filename = common_path + '/dict_semClassNum.txt'
        with open(filename, 'r', encoding="cp949") as file_object:
            lines = file_object.read()

        temp_list = []
        sem_list = []
        token = ""

        # Split the file on newlines/tabs into "number_name" tokens.
        for ch in lines:
            if ch != '\n' and ch != '\t':
                token += ch
            else:
                if token != '':
                    sem_list.append(token)
                token = ''

        # Locate the token that ends with the current semantic class.
        # NOTE(review): cur_sem is interpolated unescaped into the regex;
        # class names with regex metacharacters would misbehave -- confirm.
        regex = re.compile(r"_" + cur_sem + '$')
        for candidate in sem_list:
            if regex.search(candidate):
                cur_sem = candidate
                temp_list.append(cur_sem)

        # Walk upward: strip the last numeric component ("1.2.3_" -> "1.2_")
        # and collect the matching ancestor at each level.
        while len(cur_sem.split('_')[0]) > 1:
            if cur_sem.split('_')[0][-2] == '.':
                prefix = cur_sem.split('_')[0][0:-2] + '_'
            else:
                prefix = cur_sem.split('_')[0][0:-3] + '_'
            regex = re.compile(r"^" + prefix)
            for candidate in sem_list:
                if regex.search(candidate):
                    cur_sem = candidate
                    temp_list.append(candidate)

        return list(reversed(temp_list))

    # 유사도 (Wu-Palmer-style similarity)
    def wup_similarity(self, target):
        """Similarity between this sense and *target* in [0, 1].

        Based on the shared dotted-prefix depth of the two semantic-class
        codes looked up in layer.txt.
        """
        # semantic class of self
        synn1 = self.sense.find("sem_grp").find("sem_class").text
        # semantic class of target
        synn2 = target.sense.find("sem_grp").find("sem_class").text

        matches = []
        # BUG FIX: the original left this file handle open.
        with open(common_path + "/layer.txt", 'r') as fh:
            for line in fh:
                if '_' + synn1 + '\n' in line:
                    matches.append(line)
                if '_' + synn2 + '\n' in line:
                    matches.append(line)

        codes = [m.split("_")[0] for m in matches]
        word1 = codes[0].split('.')
        word2 = codes[1].split('.')

        same = 0
        for i in range(0, min(len(word1), len(word2))):
            if word1[i] == word2[i]:
                same += 2
            else:
                break

        if self.name == target.name:
            same += 2

        return same / ((len(word1) + len(word2)) + 2)
515
+
516
+
517
+
518
# sense 바로 접근 -- direct lookup of a single sense by its full dotted name
def sense(input):
    """Return the Sense whose full name ("word.pos.n.m") equals *input*."""
    parts = input.split('.')
    entry_name = parts[0] + '.' + parts[1] + '.' + parts[2]
    target = entry(entry_name)

    for candidate in target.entry.findall("sense"):
        if input == str(target.name + '.' + candidate.attrib['n']):
            return Sense(input, candidate, target.pos)
533
# entry 바로 접근 -- direct lookup of an entry by dotted name ("word.pos[.n]")
def entry(input):
    """Return the Entry matching *input*, or None for an unknown POS."""
    parts = input.split('.')
    word, pos = parts[0], parts[1]

    # Pick the dictionary directory by part of speech.
    if 'nn' in pos:
        subdir = "/01. 체언_상세"
    elif pos == 'vv':
        subdir = "/02. 용언_상세//vv"
    elif pos == 'va':
        subdir = "/02. 용언_상세//va"
    else:
        return

    xml_path = common_path + subdir + "//" + word + ".xml"
    root = parse(xml_path).getroot()

    for en in root.findall("entry"):
        try:
            if input == str(word + "." + en.attrib['pos'] + "." + en.attrib['n']):
                return Entry(str(word + "." + en.attrib['pos'] + "." + en.attrib['n']),
                             en, str(en.attrib['pos']))
        except KeyError:  # entry without an 'n' attribute
            if input == str(word + "." + en.attrib['pos']):
                return Entry(str(word + "." + en.attrib['pos']), en,
                             str(en.attrib['pos']))
560
+
561
# entry 객체 리턴 -- every Entry for *word* across all POS dictionaries
def entrys(word):
    """Collect Entry objects for *word* from each XML file that defines it."""
    entries = []

    for xml_file in filecheck(word):
        root = parse(xml_file).getroot()
        for en in root.findall("entry"):
            try:
                label = str(word + "." + en.attrib['pos'] + "." + en.attrib['n'])
            except KeyError:  # entry without an 'n' attribute
                label = str(word + "." + en.attrib['pos'])
            entries.append(Entry(label, en, str(en.attrib['pos'])))

    return entries
581
+
582
+
583
def _syn(word):
    """Union of the synonym lists of every sense of every entry of *word*."""
    synonyms = []
    for et in entrys(word):
        for se in et.senses():
            synonyms.extend(se.syn())
    return synonyms
593
+
594
+ '''
595
+ def entry_error():
596
+
597
+ path="./02. 용언_상세//va"
598
+ abs_dir=os.path.join(os.getcwd(),path)
599
+ file_names=os.listdir(abs_dir)
600
+
601
+ #print(file_names)
602
+ #print(len(file_names))
603
+ error_list=[]
604
+
605
+
606
+ for word in file_names:
607
+
608
+ fpath=path+"//"+word
609
+ #print(fpath)
610
+ tree=parse(fpath)
611
+ root=tree.getroot()
612
+ #print(root.findtext('orth'))
613
+ allentry=root.findall("entry")
614
+
615
+ for en in allentry:
616
+ try:
617
+ en.attrib['n']
618
+
619
+ except:
620
+
621
+ error_list.append(word)
622
+ break;
623
+
624
+ print(error_list)
625
+ print(len(error_list))
626
+ print(len(file_names))
627
+
628
+
629
+ return error_list
630
+
631
+
632
+
633
+ def sense_error():
634
+
635
+ path="./02. 용언_상세//va"
636
+ abs_dir=os.path.join(os.getcwd(),path)
637
+ file_names=os.listdir(abs_dir)
638
+
639
+ error_list=[]
640
+
641
+
642
+ for word in file_names:
643
+
644
+ fpath=path+"//"+word
645
+ tree=parse(fpath)
646
+ root=tree.getroot()
647
+ allentry=root.findall("entry")
648
+
649
+ for en in allentry:
650
+ allsense=en.findall("sense")
651
+ for se in allsense:
652
+ try:
653
+ se.attrib['n']
654
+ except:
655
+
656
+ if word not in error_list:
657
+ error_list.append(word)
658
+ break;
659
+
660
+ print(error_list)
661
+ print(len(error_list))
662
+ print(len(file_names))
663
+
664
+
665
+ return error_list
666
+
667
+ '''
668
+
669
# file check -- locate the dictionary XML files that define *word*
def filecheck(word):
    """Return the paths of the POS dictionaries containing "<word>.xml"."""
    search_dirs = [
        common_path + "/01. 체언_상세",   # nouns
        common_path + "/02. 용언_상세/vv",  # verbs
        common_path + "/02. 용언_상세/va",  # adjectives
    ]
    target = word + ".xml"

    return [d + "/" + target for d in search_dirs if target in os.listdir(d)]
@@ -0,0 +1,3 @@
1
+ # The following trick allows us to import the classes directly from the similarity module:
2
+ from .cosine_similarity import CosineSimilarity
3
+ from .classical import LCSubstringSimilarity, LCSubsequenceSimilarity, JaroSimilarity