nltkor 1.2.14__cp311-cp311-macosx_13_0_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nltkor/Kor_char.py +193 -0
- nltkor/__init__.py +16 -0
- nltkor/alignment/__init__.py +1315 -0
- nltkor/cider/__init__.py +2 -0
- nltkor/cider/cider.py +55 -0
- nltkor/cider/cider_scorer.py +207 -0
- nltkor/distance/__init__.py +441 -0
- nltkor/distance/wasserstein.py +126 -0
- nltkor/etc.py +22 -0
- nltkor/lazyimport.py +144 -0
- nltkor/make_requirement.py +11 -0
- nltkor/metrics/__init__.py +63 -0
- nltkor/metrics/bartscore.py +301 -0
- nltkor/metrics/bertscore.py +331 -0
- nltkor/metrics/bleu_tensor.py +20 -0
- nltkor/metrics/classical.py +847 -0
- nltkor/metrics/entment.py +24 -0
- nltkor/metrics/eval.py +517 -0
- nltkor/metrics/mauve.py +273 -0
- nltkor/metrics/mauve_utils.py +131 -0
- nltkor/misc/__init__.py +11 -0
- nltkor/misc/string2string_basic_functions.py +59 -0
- nltkor/misc/string2string_default_tokenizer.py +83 -0
- nltkor/misc/string2string_hash_functions.py +159 -0
- nltkor/misc/string2string_word_embeddings.py +503 -0
- nltkor/search/__init__.py +10 -0
- nltkor/search/classical.py +569 -0
- nltkor/search/faiss_search.py +787 -0
- nltkor/search/kobert_tokenizer.py +181 -0
- nltkor/sejong/__init__.py +3 -0
- nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
- nltkor/sejong/ch.py +12 -0
- nltkor/sejong/dict_semClassNum.txt +491 -0
- nltkor/sejong/layer.txt +630 -0
- nltkor/sejong/sejong_download.py +87 -0
- nltkor/sejong/ssem.py +684 -0
- nltkor/similarity/__init__.py +3 -0
- nltkor/similarity/bartscore____.py +337 -0
- nltkor/similarity/bertscore____.py +339 -0
- nltkor/similarity/classical.py +245 -0
- nltkor/similarity/cosine_similarity.py +175 -0
- nltkor/tag/__init__.py +71 -0
- nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
- nltkor/tag/espresso_tag.py +220 -0
- nltkor/tag/libs/__init__.py +10 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
- nltkor/tag/libs/arguments.py +280 -0
- nltkor/tag/libs/attributes.py +231 -0
- nltkor/tag/libs/config.py +159 -0
- nltkor/tag/libs/metadata.py +129 -0
- nltkor/tag/libs/ner/__init__.py +2 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/macmorphoreader.py +7 -0
- nltkor/tag/libs/ner/ner_reader.py +92 -0
- nltkor/tag/libs/network.c +72325 -0
- nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
- nltkor/tag/libs/network.pyx +878 -0
- nltkor/tag/libs/networkconv.pyx +1028 -0
- nltkor/tag/libs/networkdependencyconv.pyx +451 -0
- nltkor/tag/libs/parse/__init__.py +1 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/parse_reader.py +283 -0
- nltkor/tag/libs/pos/__init__.py +2 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/macmorphoreader.py +7 -0
- nltkor/tag/libs/pos/pos_reader.py +97 -0
- nltkor/tag/libs/reader.py +485 -0
- nltkor/tag/libs/srl/__init__.py +3 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__srl_reader_.py +535 -0
- nltkor/tag/libs/srl/srl_reader.py +436 -0
- nltkor/tag/libs/srl/train_srl.py +87 -0
- nltkor/tag/libs/taggers.py +926 -0
- nltkor/tag/libs/utils.py +384 -0
- nltkor/tag/libs/word_dictionary.py +239 -0
- nltkor/tag/libs/wsd/__init__.py +2 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
- nltkor/tag/libs/wsd/wsd_reader.py +93 -0
- nltkor/tokenize/__init__.py +62 -0
- nltkor/tokenize/ko_tokenize.py +115 -0
- nltkor/trans.py +121 -0
- nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
- nltkor-1.2.14.dist-info/METADATA +41 -0
- nltkor-1.2.14.dist-info/RECORD +127 -0
- nltkor-1.2.14.dist-info/WHEEL +5 -0
- nltkor-1.2.14.dist-info/top_level.txt +1 -0
nltkor/Kor_char.py
ADDED
@@ -0,0 +1,193 @@
|
|
1
|
+
import unicodedata
|
2
|
+
|
3
|
+
# Lookup tables with 30 entries each. join_syllable() indexes them with
# (code point - 0x3131), i.e. by offset from the first Hangul Compatibility
# Jamo consonant. An entry is the 1-based initial-consonant (choseong) number
# of that jamo; entries of 0 are never matched by the lookup helpers below.
CHOSEONG_IDX_CODEMAP = [1, 2, 0, 3, 0, 0, 4, 5, 6, 0, 0, 0, 0, 0, 0, 0, 7, 8, 9, 0, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
# Same layout, holding the final-consonant (jongseong) number of each jamo.
JONGSEONG_IDX_CODEMAP= [1, 2, 3, 4, 5, 6, 7, 0, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 0, 18, 19, 20, 21, 22, 0, 23, 24, 25, 26, 27]


def getCJamoIdxChoseong(x):
    """Return the position of choseong number *x* in CHOSEONG_IDX_CODEMAP.

    Returns 0 when *x* is not greater than the smallest table value or is
    greater than the largest one. Note that a valid *x* of 1 also yields 0,
    because it sits at position 0 of the table.
    """
    if min(CHOSEONG_IDX_CODEMAP) < x <= max(CHOSEONG_IDX_CODEMAP):
        return CHOSEONG_IDX_CODEMAP.index(x)
    return 0


def getCJamoIdxJongseong(x):
    """Return the position of jongseong number *x* in JONGSEONG_IDX_CODEMAP.

    Returns 0 when *x* is not greater than the smallest table value or is
    greater than the largest one. Note that a valid *x* of 1 also yields 0,
    because it sits at position 0 of the table.
    """
    if min(JONGSEONG_IDX_CODEMAP) < x <= max(JONGSEONG_IDX_CODEMAP):
        return JONGSEONG_IDX_CODEMAP.index(x)
    return 0
|
7
|
+
|
8
|
+
|
9
|
+
|
10
|
+
def error():
    """Print the invalid-argument message shared by this module's validators.

    Nothing is raised: the message is written to standard output and the
    function returns None, so callers decide how to bail out afterwards.
    """
    # The message text is reproduced verbatim; callers or logs may match on it.
    print("function expect a character, check the value")
|
17
|
+
|
18
|
+
# Hangul character test. Formerly: kor_check()
def is_kor_char(character, encoding = None):
    """Tell whether a single character lies in one of four Hangul code ranges.

    Args:
        character: A sequence of length one whose element ``ord`` accepts.
        encoding: Accepted but not used by this function.

    Returns:
        True or False for a one-character argument. For any other length the
        shared error message is printed and None is returned.
    """
    if len(character) != 1:
        error()
        return None

    code_point = ord(character)
    hangul_ranges = (
        (0xac00, 0xd7a3),  # Hangul Syllables
        (0x1100, 0x11ff),  # Hangul Jamo
        (0x3131, 0x318e),  # Hangul Compatibility Jamo
        (0xffa1, 0xffdc),  # Hangul Halfwidth
    )
    return any(low <= code_point <= high for low, high in hangul_ranges)
|
31
|
+
|
32
|
+
|
33
|
+
# Initial/medial/final jamo split function. kor_split()
def split_syllable(character, encoding = None):
    """Split one character into an (initial, medial, final) tuple of strings.

    Args:
        character: A sequence of length one whose element ``ord`` accepts.
        encoding: Accepted but not used by this function.

    Returns:
        A 3-tuple of strings; a slot with no jamo holds ``""``.
        * Code points 0x3131-0x314e and 0x3165-0x3186 are returned in the
          first slot.
        * Code points 0x314f-0x3163 and 0x3187-0x318e are returned in the
          second slot.
        * Precomposed syllables 0xac00-0xd7a3 are decomposed into
          compatibility jamo; the final slot is ``""`` when the syllable has
          no final consonant.
        * Anything else is returned unchanged in the first slot.
        For an argument whose length is not one, the shared error message is
        printed and None is returned.
    """
    if len(character) != 1:
        error()
        return None

    def _to_char(code):
        # Code 0 stands for "no jamo in this slot" and maps to an empty string.
        return chr(code) if code else ""

    ch = ord(character)

    # Initial consonant: compatibility jamo.
    if 0x3131 <= ch <= 0x314e or 0x3165 <= ch <= 0x3186:
        return (_to_char(ch), "", "")

    # Medial vowel: compatibility jamo.
    if 0x314f <= ch <= 0x3163 or 0x3187 <= ch <= 0x318e:
        return ("", _to_char(ch), "")

    # Hangul Syllables: U+AC00 - U+D7A3.
    if 0xac00 <= ch <= 0xd7a3:
        # Each initial spans 588 (= 21 * 28) syllables, each medial spans 28.
        idx_cho, remainder = divmod(ch - 0xac00, 0x024c)
        idx_jung, idx_jong = divmod(remainder, 0x001c)
        cho_code = getCJamoIdxChoseong(idx_cho + 1) + 0x3131
        jung_code = idx_jung + 0x314f
        # idx_jong == 0 means the syllable has no final consonant.
        jong_code = getCJamoIdxJongseong(idx_jong) + 0x3131 if idx_jong else 0
        return (_to_char(cho_code), _to_char(jung_code), _to_char(jong_code))

    # Not Hangul: hand the character back in the first slot.
    return (_to_char(ch), "", "")
|
64
|
+
|
65
|
+
|
66
|
+
|
67
|
+
# Initial/medial/final jamo join function. kor_join()
def join_syllable(choseong, jungseong, jongseong, encoding = None):
    """Compose compatibility jamo into one precomposed Hangul syllable.

    Args:
        choseong: Initial consonant as a one-character string, or empty.
        jungseong: Medial vowel as a one-character string, or empty.
        jongseong: Final consonant as a one-character string, or empty.
        encoding: Accepted but not used by this function.

    Returns:
        * The composed syllable when both an initial and a medial are given.
        * ``choseong`` alone when no medial is given.
        * ``jungseong`` alone when no initial is given.
        * ``""`` when only a final consonant is given.
        * None, after printing the shared error message, when any component
          is longer than one character or when all three are empty.

    Jamo that fall outside the module's code-map tables are not validated.
    """
    # Validate all three components. The previous check tested ``choseong``
    # three times, so the other two were never checked and an empty initial
    # was always rejected, which made the fallbacks below unreachable.
    lengths = [len(part or "") for part in (choseong, jungseong, jongseong)]
    if max(lengths) > 1 or sum(lengths) == 0:
        error()
        return None

    if not jungseong:
        # Without a medial vowel there is nothing to compose.
        return choseong if choseong else ""
    if not choseong:
        return jungseong

    idx_cho = CHOSEONG_IDX_CODEMAP[ord(choseong) - 0x3131] - 1
    idx_jung = ord(jungseong) - 0x314f
    idx_jong = JONGSEONG_IDX_CODEMAP[ord(jongseong) - 0x3131] if jongseong else 0

    # 21 medials per initial and 28 final slots per medial, based at U+AC00.
    code_point = 0xac00 + ((idx_cho * 21) + idx_jung) * 28 + idx_jong
    return chr(code_point) if code_point else ""
|
94
|
+
|
95
|
+
|
96
|
+
|
97
|
+
# Hangul comparison function.
def kor_cmp(s1, s2, encoding = None):
    """Three-way compare two single characters by their jamo decomposition.

    Args:
        s1: First operand; at most one element long.
        s2: Second operand; at most one element long.
        encoding: Accepted but not used by this function.

    Returns:
        1 if ``s1`` sorts after ``s2``, -1 if before, 0 if equal. String
        operands are first expanded with ``split_syllable`` (empty slots
        become a space); non-string operands are compared as given.
        When either operand is longer than one element, or both are empty,
        the shared error message is printed and None is returned.
    """
    # Bitwise OR of the lengths: above 1 as soon as either operand is longer
    # than one element, and 0 only when both operands are empty.
    combined_length = len(s1) | len(s2)
    if combined_length != 1:
        error()
        return None

    def _decompose(text):
        # Flatten each character's (initial, medial, final) tuple, padding
        # empty slots with a space so every character contributes 3 symbols.
        return "".join(
            jamo or " "
            for char in text
            for jamo in split_syllable(char)
        )

    if isinstance(s1, str):
        s1 = _decompose(s1)
    if isinstance(s2, str):
        s2 = _decompose(s2)

    return (s1 > s2) - (s1 < s2)
|
111
|
+
|
112
|
+
|
113
|
+
|
114
|
+
# Hangul syllable test.
def is_kor_syllable(character, encoding = None):
    """Tell whether a character's Unicode name contains "HANGUL SYLLABLE".

    Args:
        character: A one-character string.
        encoding: Accepted but not used by this function.

    Returns:
        True or False for a one-character argument; characters that have no
        Unicode name yield False. For any other length the shared error
        message is printed and None is returned.
    """
    if len(character) != 1:
        error()
        return None

    # Supplying a default avoids the ValueError that unicodedata.name raises
    # for unnamed characters such as control codes.
    return "HANGUL SYLLABLE" in unicodedata.name(character, "")
|
122
|
+
|
123
|
+
|
124
|
+
|
125
|
+
# Hanja (Chinese character) test.
def is_hanja(character, encoding = None):
    """Tell whether a character's Unicode name contains "CJK".

    Args:
        character: A one-character string.
        encoding: Accepted but not used by this function.

    Returns:
        True or False for a one-character argument; characters that have no
        Unicode name yield False. For any other length the shared error
        message is printed and None is returned.
    """
    if len(character) != 1:
        error()
        return None

    # Supplying a default avoids the ValueError that unicodedata.name raises
    # for unnamed characters such as control codes.
    return "CJK" in unicodedata.name(character, "")
|
133
|
+
|
134
|
+
|
135
|
+
|
136
|
+
# Digit test.
def is_number(character, encoding = None):
    """Tell whether a character's Unicode name contains "DIGIT".

    Args:
        character: A one-character string.
        encoding: Accepted but not used by this function.

    Returns:
        True or False for a one-character argument; characters that have no
        Unicode name yield False. For any other length the shared error
        message is printed and None is returned.
    """
    if len(character) != 1:
        error()
        return None

    # Supplying a default avoids the ValueError that unicodedata.name raises
    # for unnamed characters such as control codes.
    return "DIGIT" in unicodedata.name(character, "")
|
144
|
+
|
145
|
+
|
146
|
+
# English alphabet character test.
def is_eng_char(character, encoding = None):
    """Tell whether a character's Unicode name contains "LATIN".

    Args:
        character: A one-character string.
        encoding: Accepted but not used by this function.

    Returns:
        True or False for a one-character argument; characters that have no
        Unicode name yield False. For any other length the shared error
        message is printed and None is returned.
    """
    if len(character) != 1:
        error()
        return None

    # Supplying a default avoids the ValueError that unicodedata.name raises
    # for unnamed characters such as control codes.
    return "LATIN" in unicodedata.name(character, "")
|
154
|
+
|
155
|
+
|
156
|
+
# Symbol test.
def is_symbol(character, encoding = None):
    """Tell whether a character's Unicode general category starts with "S".

    Args:
        character: A one-character string.
        encoding: Accepted but not used by this function.

    Returns:
        True or False for a one-character argument. For any other length the
        shared error message is printed and None is returned.
    """
    if len(character) != 1:
        error()
        return None

    general_category = unicodedata.category(character)
    return general_category.startswith("S")
|
164
|
+
|
165
|
+
|
166
|
+
# Punctuation test.
def is_punctuation(character, encoding = None):
    """Tell whether a character's Unicode general category starts with "P".

    Args:
        character: A one-character string.
        encoding: Accepted but not used by this function.

    Returns:
        True or False for a one-character argument. For any other length the
        shared error message is printed and None is returned.
    """
    if len(character) != 1:
        error()
        return None

    general_category = unicodedata.category(character)
    return general_category.startswith("P")
|
174
|
+
|
175
|
+
|
176
|
+
# English alphabet connector character test.
def is_engConnection(character, encoding = None):
    """Tell whether a character is one of ".", "-", "_" or "|".

    Args:
        character: A one-character string.
        encoding: Accepted but not used by this function.

    Returns:
        True or False for a one-character argument. For any other length the
        shared error message is printed and None is returned.
    """
    if len(character) != 1:
        error()
        return None

    connectors = (".", "-", "_", "|")
    is_connector = character in connectors
    return is_connector
|
184
|
+
|
185
|
+
|
186
|
+
# Numeric connector character test.
def is_numConnection(character, encoding = None):
    """Tell whether a character is "." or ",".

    Args:
        character: A one-character string.
        encoding: Accepted but not used by this function.

    Returns:
        True or False for a one-character argument. For any other length the
        shared error message is printed and None is returned.
    """
    if len(character) != 1:
        error()
        return None

    connectors = (".", ",")
    is_connector = character in connectors
    return is_connector
|
nltkor/__init__.py
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
from nltkor import alignment
|
2
|
+
from nltkor import cider
|
3
|
+
from nltkor import distance
|
4
|
+
|
5
|
+
from nltkor import sejong
|
6
|
+
from nltkor import metrics
|
7
|
+
from nltkor import misc
|
8
|
+
from nltkor import search
|
9
|
+
from nltkor import similarity
|
10
|
+
from nltkor import tag
|
11
|
+
from nltkor import tokenize
|
12
|
+
from nltkor import trans
|
13
|
+
from nltkor import Kor_char
|
14
|
+
from nltkor import etc
|
15
|
+
|
16
|
+
# Version string of the nltkor package.
__version__ = '1.2.14'
|