nltkor 1.2.14__cp311-cp311-macosx_13_0_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127):
  1. nltkor/Kor_char.py +193 -0
  2. nltkor/__init__.py +16 -0
  3. nltkor/alignment/__init__.py +1315 -0
  4. nltkor/cider/__init__.py +2 -0
  5. nltkor/cider/cider.py +55 -0
  6. nltkor/cider/cider_scorer.py +207 -0
  7. nltkor/distance/__init__.py +441 -0
  8. nltkor/distance/wasserstein.py +126 -0
  9. nltkor/etc.py +22 -0
  10. nltkor/lazyimport.py +144 -0
  11. nltkor/make_requirement.py +11 -0
  12. nltkor/metrics/__init__.py +63 -0
  13. nltkor/metrics/bartscore.py +301 -0
  14. nltkor/metrics/bertscore.py +331 -0
  15. nltkor/metrics/bleu_tensor.py +20 -0
  16. nltkor/metrics/classical.py +847 -0
  17. nltkor/metrics/entment.py +24 -0
  18. nltkor/metrics/eval.py +517 -0
  19. nltkor/metrics/mauve.py +273 -0
  20. nltkor/metrics/mauve_utils.py +131 -0
  21. nltkor/misc/__init__.py +11 -0
  22. nltkor/misc/string2string_basic_functions.py +59 -0
  23. nltkor/misc/string2string_default_tokenizer.py +83 -0
  24. nltkor/misc/string2string_hash_functions.py +159 -0
  25. nltkor/misc/string2string_word_embeddings.py +503 -0
  26. nltkor/search/__init__.py +10 -0
  27. nltkor/search/classical.py +569 -0
  28. nltkor/search/faiss_search.py +787 -0
  29. nltkor/search/kobert_tokenizer.py +181 -0
  30. nltkor/sejong/__init__.py +3 -0
  31. nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
  32. nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
  33. nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
  34. nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
  35. nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
  36. nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
  37. nltkor/sejong/ch.py +12 -0
  38. nltkor/sejong/dict_semClassNum.txt +491 -0
  39. nltkor/sejong/layer.txt +630 -0
  40. nltkor/sejong/sejong_download.py +87 -0
  41. nltkor/sejong/ssem.py +684 -0
  42. nltkor/similarity/__init__.py +3 -0
  43. nltkor/similarity/bartscore____.py +337 -0
  44. nltkor/similarity/bertscore____.py +339 -0
  45. nltkor/similarity/classical.py +245 -0
  46. nltkor/similarity/cosine_similarity.py +175 -0
  47. nltkor/tag/__init__.py +71 -0
  48. nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
  49. nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
  50. nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
  51. nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
  52. nltkor/tag/espresso_tag.py +220 -0
  53. nltkor/tag/libs/__init__.py +10 -0
  54. nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
  55. nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
  56. nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
  57. nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
  58. nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
  59. nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
  60. nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
  61. nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
  62. nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
  63. nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
  64. nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
  65. nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
  66. nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
  67. nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
  68. nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
  69. nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
  70. nltkor/tag/libs/arguments.py +280 -0
  71. nltkor/tag/libs/attributes.py +231 -0
  72. nltkor/tag/libs/config.py +159 -0
  73. nltkor/tag/libs/metadata.py +129 -0
  74. nltkor/tag/libs/ner/__init__.py +2 -0
  75. nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
  76. nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
  77. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
  78. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
  79. nltkor/tag/libs/ner/macmorphoreader.py +7 -0
  80. nltkor/tag/libs/ner/ner_reader.py +92 -0
  81. nltkor/tag/libs/network.c +72325 -0
  82. nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
  83. nltkor/tag/libs/network.pyx +878 -0
  84. nltkor/tag/libs/networkconv.pyx +1028 -0
  85. nltkor/tag/libs/networkdependencyconv.pyx +451 -0
  86. nltkor/tag/libs/parse/__init__.py +1 -0
  87. nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
  88. nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
  89. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
  90. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
  91. nltkor/tag/libs/parse/parse_reader.py +283 -0
  92. nltkor/tag/libs/pos/__init__.py +2 -0
  93. nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
  94. nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
  95. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
  96. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
  97. nltkor/tag/libs/pos/macmorphoreader.py +7 -0
  98. nltkor/tag/libs/pos/pos_reader.py +97 -0
  99. nltkor/tag/libs/reader.py +485 -0
  100. nltkor/tag/libs/srl/__init__.py +3 -0
  101. nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
  102. nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
  103. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
  104. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
  105. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
  106. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
  107. nltkor/tag/libs/srl/__srl_reader_.py +535 -0
  108. nltkor/tag/libs/srl/srl_reader.py +436 -0
  109. nltkor/tag/libs/srl/train_srl.py +87 -0
  110. nltkor/tag/libs/taggers.py +926 -0
  111. nltkor/tag/libs/utils.py +384 -0
  112. nltkor/tag/libs/word_dictionary.py +239 -0
  113. nltkor/tag/libs/wsd/__init__.py +2 -0
  114. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
  115. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
  116. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
  117. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
  118. nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
  119. nltkor/tag/libs/wsd/wsd_reader.py +93 -0
  120. nltkor/tokenize/__init__.py +62 -0
  121. nltkor/tokenize/ko_tokenize.py +115 -0
  122. nltkor/trans.py +121 -0
  123. nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
  124. nltkor-1.2.14.dist-info/METADATA +41 -0
  125. nltkor-1.2.14.dist-info/RECORD +127 -0
  126. nltkor-1.2.14.dist-info/WHEEL +5 -0
  127. nltkor-1.2.14.dist-info/top_level.txt +1 -0
nltkor/Kor_char.py ADDED
@@ -0,0 +1,193 @@
1
+ import unicodedata
2
+
3
# Lookup tables indexed by (compatibility-jamo code point - 0x3131).
# A non-zero entry is the 1-based initial-consonant (choseong) index, or the
# final-consonant (jongseong) index, of that jamo; 0 marks a jamo that cannot
# fill that slot.
CHOSEONG_IDX_CODEMAP = [1, 2, 0, 3, 0, 0, 4, 5, 6, 0, 0, 0, 0, 0, 0, 0, 7, 8, 9, 0, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
JONGSEONG_IDX_CODEMAP = [1, 2, 3, 4, 5, 6, 7, 0, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 0, 18, 19, 20, 21, 22, 0, 23, 24, 25, 26, 27]

# Reverse maps (slot index -> table offset), built once so a lookup does not
# rescan the list on every call. Zero entries are excluded on purpose.
_CHOSEONG_OFFSET_BY_IDX = {
    idx: offset for offset, idx in enumerate(CHOSEONG_IDX_CODEMAP) if idx
}
_JONGSEONG_OFFSET_BY_IDX = {
    idx: offset for offset, idx in enumerate(JONGSEONG_IDX_CODEMAP) if idx
}


def getCJamoIdxChoseong(x):
    """Return the offset of choseong index *x* inside CHOSEONG_IDX_CODEMAP.

    *x* is the 1-based initial-consonant index. Adding 0x3131 to the result
    gives the matching compatibility jamo. Unknown or out-of-range values
    return 0.
    """
    return _CHOSEONG_OFFSET_BY_IDX.get(x, 0)


def getCJamoIdxJongseong(x):
    """Return the offset of jongseong index *x* inside JONGSEONG_IDX_CODEMAP.

    *x* is the 1-based final-consonant index. Adding 0x3131 to the result
    gives the matching compatibility jamo. Unknown or out-of-range values
    return 0.
    """
    return _JONGSEONG_OFFSET_BY_IDX.get(x, 0)
7
+
8
+
9
+
10
def error():
    """Print the module's standard "bad argument" message and return None.

    The validators in this module call this when an argument is not exactly
    one character long. Nothing is raised to the caller.
    """
    message = "function expect a character, check the value"
    print(message)
    return None
17
+
18
# Hangul character check. Formerly: kor_check()
def is_kor_char(character, encoding = None):
    """Report whether *character* falls in one of the Hangul code ranges.

    Args:
        character: a string of exactly one character.
        encoding: unused; kept for signature compatibility.

    Returns:
        True or False for a one-character argument. For any other length the
        standard error message is printed and None is returned.
    """
    if len(character) != 1:
        error()
        return

    code_point = ord(character)
    hangul_ranges = (
        (0xac00, 0xd7a3),  # Hangul Syllables
        (0x1100, 0x11ff),  # Hangul Jamo
        (0x3131, 0x318e),  # Hangul Compatibility Jamo
        (0xffa1, 0xffdc),  # Hangul Halfwidth
    )
    return any(low <= code_point <= high for low, high in hangul_ranges)
31
+
32
+
33
# Split a character into initial/medial/final jamo. Formerly: kor_split()
def split_syllable(character, encoding = None):
    """Decompose one character into (choseong, jungseong, jongseong).

    Args:
        character: a string of exactly one character.
        encoding: unused; kept for signature compatibility.

    Returns:
        A 3-tuple of strings; an empty string marks an absent slot.
        * A compatibility consonant jamo is returned in the first slot.
        * A compatibility vowel jamo is returned in the second slot.
        * A precomposed syllable (U+AC00..U+D7A3) is split into its
          compatibility jamo.
        * Any other character is returned unchanged in the first slot.
        If the argument is not exactly one character long, the standard error
        message is printed and None is returned.
    """
    if len(character) != 1:
        error()
        return

    code_point = ord(character)

    # Compatibility jamo treated as an initial consonant.
    if 0x3131 <= code_point <= 0x314e or 0x3165 <= code_point <= 0x3186:
        return (character, "", "")

    # Compatibility jamo treated as a medial vowel.
    if 0x314f <= code_point <= 0x3163 or 0x3187 <= code_point <= 0x318e:
        return ("", character, "")

    # Hangul Syllables: U+AC00 (가) .. U+D7A3 (힣)
    if 0xac00 <= code_point <= 0xd7a3:
        # Each initial spans 588 code points (21 medials * 28 finals) and each
        # medial spans 28. Integer divmod avoids float division.
        idx_cho, remainder = divmod(code_point - 0xac00, 588)
        idx_jung, idx_jong = divmod(remainder, 28)

        initial = chr(getCJamoIdxChoseong(idx_cho + 1) + 0x3131)
        medial = chr(idx_jung + 0x314f)
        # idx_jong == 0 means the syllable has no final consonant.
        final = chr(getCJamoIdxJongseong(idx_jong) + 0x3131) if idx_jong else ""
        return (initial, medial, final)

    # Not Hangul: hand the character back in the first slot. It is returned
    # as-is so that U+0000 is not silently turned into an empty string.
    return (character, "", "")
64
+
65
+
66
+
67
# Join initial/medial/final jamo into one syllable. Formerly: kor_join()
def join_syllable(choseong, jungseong, jongseong, encoding = None):
    """Compose a Hangul syllable from compatibility jamo.

    Args:
        choseong: initial consonant jamo, or an empty value.
        jungseong: medial vowel jamo, or an empty value.
        jongseong: final consonant jamo, or an empty value for "no final".
        encoding: unused; kept for signature compatibility.

    Returns:
        * "" when both choseong and jungseong are empty.
        * choseong alone when there is no jungseong.
        * jungseong alone when there is no choseong.
        * Otherwise the precomposed syllable.
        If any component is longer than one character, the standard error
        message is printed and None is returned.

    Note:
        The jamo are assumed to be compatibility jamo starting at U+3131,
        as produced by split_syllable; other characters are not validated.
    """
    # Every component is optional but may hold at most one character. The
    # previous check tested len(choseong) three times, so the other two
    # components were never validated and an empty choseong was rejected.
    if any(part and len(part) > 1 for part in (choseong, jungseong, jongseong)):
        error()
        return

    if not jungseong:
        return choseong or ""
    if not choseong:
        return jungseong

    # The choseong table holds 1-based indices, hence the -1.
    idx_cho = CHOSEONG_IDX_CODEMAP[ord(choseong) - 0x3131] - 1
    idx_jung = ord(jungseong) - 0x314f
    idx_jong = JONGSEONG_IDX_CODEMAP[ord(jongseong) - 0x3131] if jongseong else 0

    # 21 medials per initial, 28 finals (including "none") per medial.
    return chr(0xac00 + (idx_cho * 21 + idx_jung) * 28 + idx_jong)
94
+
95
+
96
+
97
# Hangul comparison
def kor_cmp(s1, s2, encoding = None):
    """Three-way compare two single characters by their jamo decomposition.

    Args:
        s1, s2: operands of length exactly one. str operands are decomposed
            with split_syllable (absent slots padded with a space) before
            comparing; other sequence types are compared as they are.
        encoding: unused; kept for signature compatibility.

    Returns:
        -1, 0 or 1 when s1 sorts before, equal to, or after s2. If either
        operand is not exactly one item long, the standard error message is
        printed and None is returned.
    """
    # Check each length separately. The previous `len(s1)|len(s2)` OR-ed the
    # two lengths bitwise, which let an empty operand through whenever the
    # other operand had one character.
    if len(s1) != 1 or len(s2) != 1:
        error()
        return

    def _decompose(value):
        # Only text is decomposed; other sequences are compared unchanged.
        if not isinstance(value, str):
            return value
        return "".join(part or " " for part in split_syllable(value))

    left = _decompose(s1)
    right = _decompose(s2)
    return (left > right) - (left < right)
111
+
112
+
113
+
114
# Hangul syllable check
def is_kor_syllable(character, encoding = None):
    """Report whether *character* has a Unicode name containing "HANGUL SYLLABLE".

    Args:
        character: a string of exactly one character.
        encoding: unused; kept for signature compatibility.

    Returns:
        True or False for a one-character argument. For any other length the
        standard error message is printed and None is returned.
    """
    if len(character) != 1:
        error()
        return

    # unicodedata.name() raises ValueError for code points without a name
    # (e.g. control characters); the "" default makes those return False.
    return "HANGUL SYLLABLE" in unicodedata.name(character, "")
122
+
123
+
124
+
125
# Hanja (Chinese character) check
def is_hanja(character, encoding = None):
    """Report whether *character* has a Unicode name containing "CJK".

    Args:
        character: a string of exactly one character.
        encoding: unused; kept for signature compatibility.

    Returns:
        True or False for a one-character argument. For any other length the
        standard error message is printed and None is returned.
    """
    if len(character) != 1:
        error()
        return

    # unicodedata.name() raises ValueError for code points without a name
    # (e.g. control characters); the "" default makes those return False.
    return "CJK" in unicodedata.name(character, "")
133
+
134
+
135
+
136
# Digit check
def is_number(character, encoding = None):
    """Report whether *character* has a Unicode name containing "DIGIT".

    Args:
        character: a string of exactly one character.
        encoding: unused; kept for signature compatibility.

    Returns:
        True or False for a one-character argument. For any other length the
        standard error message is printed and None is returned.
    """
    if len(character) != 1:
        error()
        return

    # unicodedata.name() raises ValueError for code points without a name
    # (e.g. control characters); the "" default makes those return False.
    return "DIGIT" in unicodedata.name(character, "")
144
+
145
+
146
# English alphabet character check
def is_eng_char(character, encoding = None):
    """Report whether *character* has a Unicode name containing "LATIN".

    Args:
        character: a string of exactly one character.
        encoding: unused; kept for signature compatibility.

    Returns:
        True or False for a one-character argument. For any other length the
        standard error message is printed and None is returned.
    """
    if len(character) != 1:
        error()
        return

    # unicodedata.name() raises ValueError for code points without a name
    # (e.g. control characters); the "" default makes those return False.
    return "LATIN" in unicodedata.name(character, "")
154
+
155
+
156
# Symbol check
def is_symbol(character, encoding = None):
    """Report whether *character* is in a Unicode "S" (symbol) category.

    Args:
        character: a string of exactly one character.
        encoding: unused; kept for signature compatibility.

    Returns:
        True or False for a one-character argument. For any other length the
        standard error message is printed and None is returned.
    """
    if len(character) != 1:
        error()
        return

    general_category = unicodedata.category(character)
    return general_category.startswith("S")
164
+
165
+
166
# Punctuation check
def is_punctuation(character, encoding = None):
    """Report whether *character* is in a Unicode "P" (punctuation) category.

    Args:
        character: a string of exactly one character.
        encoding: unused; kept for signature compatibility.

    Returns:
        True or False for a one-character argument. For any other length the
        standard error message is printed and None is returned.
    """
    if len(character) != 1:
        error()
        return

    general_category = unicodedata.category(character)
    return general_category.startswith("P")
174
+
175
+
176
# English connector character check
def is_engConnection(character, encoding = None):
    """Report whether *character* is one of ".", "-", "_" or "|".

    Args:
        character: a string of exactly one character.
        encoding: unused; kept for signature compatibility.

    Returns:
        True or False for a one-character argument. For any other length the
        standard error message is printed and None is returned.
    """
    if len(character) != 1:
        error()
        return

    connectors = (".", "-", "_", "|")
    return character in connectors
184
+
185
+
186
# Numeric connector character check
def is_numConnection(character, encoding = None):
    """Report whether *character* is "." or ",".

    Args:
        character: a string of exactly one character.
        encoding: unused; kept for signature compatibility.

    Returns:
        True or False for a one-character argument. For any other length the
        standard error message is printed and None is returned.
    """
    if len(character) != 1:
        error()
        return

    connectors = (".", ",")
    return character in connectors
nltkor/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ from nltkor import alignment
2
+ from nltkor import cider
3
+ from nltkor import distance
4
+
5
+ from nltkor import sejong
6
+ from nltkor import metrics
7
+ from nltkor import misc
8
+ from nltkor import search
9
+ from nltkor import similarity
10
+ from nltkor import tag
11
+ from nltkor import tokenize
12
+ from nltkor import trans
13
+ from nltkor import Kor_char
14
+ from nltkor import etc
15
+
16
+ # Package version string; matches the 1.2.14 in the wheel and dist-info names.
+ __version__ = '1.2.14'