khmerns 0.0.3__cp313-cp313-win32.whl → 0.0.4__cp313-cp313-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- khmerns/__init__.py +2 -1
- khmerns/__init__.pyi +4 -0
- khmerns/_core.cp313-win32.pyd +0 -0
- khmerns/khnormal.py +207 -0
- {khmerns-0.0.3.dist-info → khmerns-0.0.4.dist-info}/METADATA +9 -3
- {khmerns-0.0.3.dist-info → khmerns-0.0.4.dist-info}/RECORD +11 -10
- lib/ggml-base.lib +0 -0
- lib/ggml-cpu.lib +0 -0
- lib/ggml.lib +0 -0
- {khmerns-0.0.3.dist-info → khmerns-0.0.4.dist-info}/WHEEL +0 -0
- {khmerns-0.0.3.dist-info → khmerns-0.0.4.dist-info}/licenses/LICENSE +0 -0
khmerns/__init__.py
CHANGED
khmerns/__init__.pyi
CHANGED
khmerns/_core.cp313-win32.pyd
CHANGED
|
Binary file
|
khmerns/khnormal.py
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
#!/usr/bin/python3
|
|
2
|
+
# Copyright (c) 2021-2024, SIL Global.
|
|
3
|
+
# Licensed under MIT license: https://opensource.org/licenses/MIT
|
|
4
|
+
|
|
5
|
+
import enum
|
|
6
|
+
import re
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@enum.unique
class Cats(enum.Enum):
    """Khmer character categories used when reordering a syllable.

    The numeric values are significant: ``normalize`` sorts the characters
    of a syllable by ``value`` and treats every value greater than
    ``Base.value`` as "still inside the current syllable".  ``enum.unique``
    guards against two members accidentally sharing a sort position.
    The code points noted below are the ones mapped by ``categories`` and
    ``charcat``.
    """

    Other = 0  # everything not listed below; ends a syllable
    Base = 1  # U+1780-U+17A2, U+17A5-U+17B3; starts a syllable
    Robat = 2  # U+17CC
    Coeng = 3  # U+17D2 (and the base that follows it, once recategorised)
    Shift = 4  # U+17C9-U+17CA
    Z = 5  # U+200C
    VPre = 6  # U+17BE-U+17C5
    VB = 7  # U+17BB-U+17BD
    VA = 8  # U+17B7-U+17BA
    VPost = 9  # U+17B6
    MS = 10  # U+17C6, U+17CB, U+17CD-U+17D1, U+17D3, U+17DD
    MF = 11  # U+17C7-U+17C8
    ZFCoeng = 12  # U+200D
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# Category of every code point in U+1780..U+17DD, indexed by
# (code point - 0x1780); see charcat() for the lookup.
categories = (
    [Cats.Base] * 35  # 1780-17A2
    + [Cats.Other] * 2  # 17A3-17A4
    + [Cats.Base] * 15  # 17A5-17B3
    + [Cats.Other] * 2  # 17B4-17B5
    + [Cats.VPost]  # 17B6
    + [Cats.VA] * 4  # 17B7-17BA
    + [Cats.VB] * 3  # 17BB-17BD
    + [Cats.VPre] * 8  # 17BE-17C5
    + [Cats.MS]  # 17C6
    + [Cats.MF] * 2  # 17C7-17C8
    + [Cats.Shift] * 2  # 17C9-17CA
    + [Cats.MS]  # 17CB
    + [Cats.Robat]  # 17CC
    + [Cats.MS] * 5  # 17CD-17D1
    + [Cats.Coeng]  # 17D2
    + [Cats.MS]  # 17D3
    + [Cats.Other] * 9  # 17D4-17DC
    + [Cats.MS]  # 17DD
)
|
|
45
|
+
|
|
46
|
+
# Useful regular sub-expressions used later.  Values may reference other
# entries as ``{NAME}``; the loop below expands those references.
# Multi-line entries carry inline ``#`` comments and insignificant
# whitespace, so they must be compiled with ``re.X``.
# NOTE(review): SHIFT, SHIFTX and MSX contain variable-width look-behinds,
# which the standard ``re`` module rejects; nothing in this module compiles
# them - confirm which regex engine they are intended for.
khres = {
    # All bases
    "B": "[\u1780-\u17a2\u17a5-\u17b3\u25cc]",
    # All consonants excluding Ro (U+179A)
    "NonRo": "[\u1780-\u1799\u179b-\u17a2\u17a5-\u17b3]",
    # All consonants excluding Ba (U+1794)
    "NonBA": "[\u1780-\u1793\u1795-\u17a2\u17a5-\u17b3]",
    # Series 1 consonants
    "S1": "[\u1780-\u1783\u1785-\u1788\u178a-\u178d\u178f-\u1792"
    "\u1795-\u1797\u179e-\u17a0\u17a2]",
    # Series 2 consonants.  U+1789 replaces a stray U+1780 here: U+1780 is
    # already in S1, and U+1789 was the only consonant of U+1780..U+17A2
    # that belonged to neither class.
    "S2": "[\u1784\u1789\u178e\u1793\u1794\u1798-\u179d\u17a1\u17a3-\u17b3]",
    # Simple following Vowel in Modern Khmer
    "VA": "(?:[\u17b7-\u17ba\u17be\u17bf\u17dd]|\u17b6\u17c6)",
    # Above vowel (as per shifter rules) with vowel sequences
    "VAX": "(?:[\u17c1-\u17c5]?{VA})",
    # Above vowel with samyok (modern khmer)
    "VAS": "(?:{VA}|[\u17c1-\u17c3]?\u17d0)",
    # Above vowel with samyok (middle khmer)
    "VASX": "(?:{VAX}|[\u17c1-\u17c3]?\u17d0)",
    # Below vowel (with Middle Khmer prefix)
    "VB": "(?:[\u17c1-\u17c3]?[\u17bb-\u17bd])",
    # contains series 1 and no BA
    "STRONG": """ {S1}\u17cc?                 # series 1 robat?
                  (?:\u17d2{NonBA}            # nonba coengs
                     (?:\u17d2{NonBA})?)?
                | {NonBA}\u17cc?              # nonba robat?
                  (?: \u17d2{S1}              # series 1 coeng
                      (?:\u17d2{NonBA})?      # + any nonba coeng
                    | \u17d2{NonBA}\u17d2{S1} # nonba coeng + series 1 coeng
                  )""",
    # contains BA or only series 2
    "NSTRONG": """(?:{S2}\u17cc?(?:\u17d2{S2}(?:\u17d2{S2})?)? # Series 2 + series 2 coengs
                  |\u1794\u17cc?(?:{COENG}(?:{COENG})?)?       # or ba with any coeng
                  |{B}\u17cc?(?:\u17d2{NonRo}\u17d2\u1794      # or ba coeng
                  |\u17d2\u1794(?:\u17d2{B})))""",
    "COENG": "(?:(?:\u17d2{NonRo})?\u17d2{B})",
    # final coeng
    "FCOENG": "(?:\u200d(?:\u17d2{NonRo})+)",
    # Allowed shifter sequences in Modern Khmer
    "SHIFT": """(?: (?<={STRONG}) \u17ca\u200c (?={VA})   # strong + triisap held up
                  | (?<={NSTRONG})\u17c9\u200c (?={VAS})  # weak + muusikatoan held up
                  | [\u17c9\u17ca]                        # any shifter
                )""",
    # Allowed shifter sequences in Middle Khmer
    "SHIFTX": """(?:(?<={STRONG}) \u17ca\u200c (?={VAX})   # strong + triisap held up
                  | (?<={NSTRONG})\u17c9\u200c (?={VASX})  # weak + muusikatoan held up
                  | [\u17c9\u17ca]                         # any shifter
                )""",
    # Modern Khmer vowel
    "V": "[\u17b6-\u17c5]?",
    # Middle Khmer vowel sequences (not worth trying to unpack this)
    "VX": "(?:\u17c1[\u17bc\u17bd]?[\u17b7\u17b9\u17ba]?|"
    "[\u17c2\u17c3]?[\u17bc\u17bd]?[\u17b7-\u17ba]\u17b6|"
    "[\u17c2\u17c3]?[\u17bb-\u17bd]?\u17b6|\u17be[\u17bc\u17bd]?\u17b6?|"
    "[\u17c1-\u17c5]?\u17bb(?![\u17d0\u17dd])|"
    "[\u17bf\u17c0]|[\u17c2-\u17c5]?[\u17bc\u17bd]?[\u17b7-\u17ba]?)",
    # Modern Khmer Modifiers
    "MS": """(?:(?: [\u17c6\u17cb\u17cd-\u17cf\u17d1\u17d3]  # follows anything
                 | (?<!\u17bb) [\u17d0\u17dd])               # not after -u
                [\u17c6\u17cb\u17cd-\u17d1\u17d3\u17dd]?     # And an optional second
             )""",
    # Middle Khmer Modifiers
    "MSX": """(?:(?: [\u17c6\u17cb\u17cd-\u17cf\u17d1\u17d3]  # follows anything
                  | (?<!\u17bb [\u17b6\u17c4\u17c5]?)         # blocking -u sequence
                    [\u17d0\u17dd])                           # for these modifiers
                 [\u17c6\u17cb\u17cd-\u17d1\u17d3\u17dd]?     # And an optional second
              )""",
}

# Expand the ``{NAME}`` references.  Each pass resolves one level of
# nesting; the deepest chain is SHIFTX -> VASX -> VAX -> VA, hence 3 passes.
for _ in range(3):
    khres = {k: v.format(**khres) for k, v in khres.items()}
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def charcat(c: str) -> "Cats":
    """Return the Khmer character category for a single-character string.

    Code points U+1780..U+17DD are looked up in the ``categories`` table.
    U+200C maps to ``Cats.Z`` and U+200D to ``Cats.ZFCoeng``.  Every other
    character, including Khmer code points above U+17DD, is ``Cats.Other``.

    Raises:
        TypeError: from ``ord`` if ``c`` is not exactly one character long.
    """
    o = ord(c)
    if 0x1780 <= o <= 0x17DD:
        # The table starts at U+1780, so offset the code point into it.
        return categories[o - 0x1780]
    if o == 0x200C:
        return Cats.Z
    if o == 0x200D:
        return Cats.ZFCoeng
    return Cats.Other
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def lunar(m: "re.Match[str]", base: int) -> str:
    """Return the lunar date symbol for a matched old-style lunar date.

    Args:
        m: a match whose group 1 is an optional tens digit and whose group 2
            is the units digit, both Khmer digits (U+17E0..U+17E9).
        base: code point that the value 0 maps to; the result is
            ``chr(base + value)``.

    Returns:
        The single symbol character for values 0..15, or the matched text
        unchanged when the value is greater than 15.
    """
    # An absent or empty tens digit counts as Khmer digit zero.
    tens = ord(m.group(1) or "\u17e0") - 0x17E0
    units = ord(m.group(2)) - 0x17E0
    v = tens * 10 + units
    if v > 15:
        # Out of range for the symbol set: leave the text as it was.
        return m.group(0)
    # v == 0 is converted too, yielding the symbol at ``base`` itself.
    return chr(v + base)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def normalize(txt: str, lang: str = "km") -> str:
    """Return the Khmer-normalised form of ``txt``, without fixing or marking errors.

    Each syllable (a base character followed by its dependent marks) has its
    characters stably reordered by category, after which a series of
    substitutions is applied to the syllable.  Characters outside syllables
    are copied through.  Old-style lunar dates are then converted to lunar
    date symbols.

    Args:
        txt: text to normalise.
        lang: ``"xhm"`` additionally marks final coengs (a coeng directly
            after a vowel) with U+200D before processing; any other value
            skips that step.

    Returns:
        The normalised string.
    """
    # Mark final coengs in Middle Khmer
    if lang == "xhm":
        txt = re.sub(r"([\u17B6-\u17C5]\u17D2)", "\u200d\\1", txt)
    # Categorise every character in the string
    charcats = [charcat(c) for c in txt]

    # Recategorise base -> coeng after coeng char (or ZFCoeng)
    for i in range(1, len(charcats)):
        if txt[i - 1] in "\u200d\u17d2" and charcats[i] in (Cats.Base, Cats.Coeng):
            charcats[i] = charcats[i - 1]

    # The two shifter patterns do not depend on the syllable, so build them
    # once per call instead of once per syllable.
    strong_u = re.compile(
        ("((?:{STRONG})[\u17c1-\u17c5]?)\u17bb" + "(?={VA}|\u17d0)").format(**khres),
        re.X,
    )
    nstrong_u = re.compile(
        ("((?:{NSTRONG})[\u17c1-\u17c5]?)\u17bb" + "(?={VA}|\u17d0)").format(**khres),
        re.X,
    )

    # Find subranges of base+non other and sort components in the subrange
    i = 0
    res = []
    total = len(charcats)
    while i < total:
        if charcats[i] != Cats.Base:
            # Not the start of a syllable: copy through untouched.
            res.append(txt[i])
            i += 1
            continue
        # Scan for end of syllable
        j = i + 1
        while j < total and charcats[j].value > Cats.Base.value:
            j += 1
        # Sort the char indices by category then position in string, so
        # characters of the same category keep their relative order.
        newindices = sorted(range(i, j), key=lambda e: (charcats[e].value, e))
        replaces = "".join(txt[n] for n in newindices)

        replaces = re.sub(
            "(\u200d?\u17d2)[\u17d2\u200c\u200d]+", r"\1", replaces
        )  # remove multiple invisible chars
        replaces = re.sub("\u17be\u17b6", "\u17c4\u17b8", replaces)  # confusable vowels
        # map compound vowel sequences to compounds with -u before to be converted
        replaces = re.sub("\u17c1([\u17bb-\u17bd]?)\u17b8", "\u17be\\1", replaces)
        replaces = re.sub("\u17c1([\u17bb-\u17bd]?)\u17b6", "\u17c4\\1", replaces)
        replaces = re.sub("(\u17be)(\u17bb)", r"\2\1", replaces)
        # Replace -u + upper vowel with consonant shifter
        replaces = strong_u.sub("\\1\u17ca", replaces)
        replaces = nstrong_u.sub("\\1\u17c9", replaces)
        replaces = re.sub(
            "(\u17d2\u179a)(\u17d2[\u1780-\u17b3])", r"\2\1", replaces
        )  # coeng ro second
        # Disabled in the original source:
        # replaces = re.sub("(\u17d2)\u178a", "\\1\u178f", replaces) # coeng da->ta
        res.append(replaces)
        i = j

    result = "".join(res)
    # Convert lunar dates from old style to use lunar date symbols.  This has
    # to run on the joined text: the digits and U+17D4 are categorised as
    # Other, so they are never part of a syllable and the patterns could not
    # match inside the per-syllable loop.
    result = re.sub(
        "(\u17e1?)([\u17e0-\u17e9])\u17d2\u17d4", lambda m: lunar(m, 0x19E0), result
    )
    result = re.sub(
        "\u17d4\u17d2(\u17e1?)([\u17e0-\u17e9])", lambda m: lunar(m, 0x19F0), result
    )
    result = re.sub("\u17d4\u17d2\u17d4", "\u19f0", result)
    return result
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: khmerns
|
|
3
|
-
Version: 0.0.3
|
|
3
|
+
Version: 0.0.4
|
|
4
4
|
Summary: Khmer Neural Segmenter
|
|
5
5
|
Keywords: khmer,nlp,segmentation,tokenization,neural-network
|
|
6
6
|
Author-Email: Seanghay Yath <seanghay.dev@gmail.com>
|
|
@@ -35,11 +35,15 @@ pip install khmerns
|
|
|
35
35
|
## Usage
|
|
36
36
|
|
|
37
37
|
```python
|
|
38
|
-
from khmerns import tokenize
|
|
38
|
+
from khmerns import tokenize, normalize
|
|
39
39
|
|
|
40
40
|
# Returns a list of words
|
|
41
41
|
words = tokenize("សួស្តីបងប្អូន")
|
|
42
|
-
# ['សួស្តី', 'បង', 'ប្អូន']
|
|
42
|
+
# => ['សួស្តី', 'បង', 'ប្អូន']
|
|
43
|
+
|
|
44
|
+
# normalize and reorder Khmer characters
|
|
45
|
+
words = tokenize(normalize("សួស្តីបងប្អូន"))
|
|
46
|
+
# => ['សួស្តី', 'បង', 'ប្អូន']
|
|
43
47
|
```
|
|
44
48
|
|
|
45
49
|
You can also use the class-based API if you prefer:
|
|
@@ -48,8 +52,10 @@ You can also use the class-based API if you prefer:
|
|
|
48
52
|
from khmerns import KhmerSegmenter
|
|
49
53
|
|
|
50
54
|
segmenter = KhmerSegmenter()
|
|
55
|
+
|
|
51
56
|
words = segmenter.tokenize("សួស្តីបងប្អូន")
|
|
52
57
|
# or
|
|
58
|
+
|
|
53
59
|
words = segmenter("សួស្តីបងប្អូន")
|
|
54
60
|
```
|
|
55
61
|
|
|
@@ -15,15 +15,16 @@ include/ggml-webgpu.h,sha256=YoxXN2KYJOwzJmIKXESA3olVy_gsNSvu2ORuAlFPs5I,347
|
|
|
15
15
|
include/ggml-zendnn.h,sha256=rlX7HjkqaAory2NoxpQSVZgPjEnZHZ6dKjQAOLK_8lE,520
|
|
16
16
|
include/ggml.h,sha256=8AuUtoOyhyNaf65atAKT1NBO-FrquzEXbYB-2VYjC6I,106093
|
|
17
17
|
include/gguf.h,sha256=dGmizdUtMG2dBzbEiu0ebZfF2Abz5smJuCXuEpMFToc,10426
|
|
18
|
-
khmerns/__init__.py,sha256=
|
|
19
|
-
khmerns/__init__.pyi,sha256=
|
|
20
|
-
khmerns/_core.cp313-win32.pyd,sha256=
|
|
18
|
+
khmerns/__init__.py,sha256=8hEEyGjLUGOJa3bST254yZgTznTVBBkXvhfxttm6mnE,163
|
|
19
|
+
khmerns/__init__.pyi,sha256=IvxNBUgitIhVklMX3Toia7ujT5uRKc5FDKHJFEnI9Ds,527
|
|
20
|
+
khmerns/_core.cp313-win32.pyd,sha256=UErs8Nsh-MIhrB5JfEJymkljGy_o9Wmcw94Km4MEs_4,3675648
|
|
21
|
+
khmerns/khnormal.py,sha256=B9WrmKmEARJantRBxKDhsMk5Cr2ljG0xEb61B7vbiQA,8058
|
|
21
22
|
lib/cmake/ggml/ggml-config.cmake,sha256=OPnz2F8SEuCgmqv8vJc3xlADN6NkgootaXOIKZ9uQ_Y,12170
|
|
22
23
|
lib/cmake/ggml/ggml-version.cmake,sha256=cmpuq1NZlHRY3WdA7y81233b6KomdObooai5LL4-Dyc,2827
|
|
23
|
-
lib/ggml-base.lib,sha256=
|
|
24
|
-
lib/ggml-cpu.lib,sha256=
|
|
25
|
-
lib/ggml.lib,sha256=
|
|
26
|
-
khmerns-0.0.
|
|
27
|
-
khmerns-0.0.
|
|
28
|
-
khmerns-0.0.
|
|
29
|
-
khmerns-0.0.
|
|
24
|
+
lib/ggml-base.lib,sha256=Zf770LD1Qgjb4zVgYae3qvO6xh4L8f5si6BXE5MgoWg,1217310
|
|
25
|
+
lib/ggml-cpu.lib,sha256=jUZVtJZfl_7_gi52X1IWPq0t9WqOaGfL6zdgEA5M14w,1130366
|
|
26
|
+
lib/ggml.lib,sha256=SHjLwyVS-blOapFSg3lUAsbhrs3AmEdXJdZWmeN4GTk,190804
|
|
27
|
+
khmerns-0.0.4.dist-info/METADATA,sha256=6hqAVMq7aCeoVlQpIvnB7CdiG5gtTSoepWc7zTIO_S8,3449
|
|
28
|
+
khmerns-0.0.4.dist-info/WHEEL,sha256=BLWkcoBBaeCx8jVONI8U_yio93Axp7qMoW-QbdfeAIY,102
|
|
29
|
+
khmerns-0.0.4.dist-info/licenses/LICENSE,sha256=NJbBwbQTQpJwxvCeUDhqpUe3HKE3geRtx5iqIlQ5q0c,1089
|
|
30
|
+
khmerns-0.0.4.dist-info/RECORD,,
|
lib/ggml-base.lib
CHANGED
|
Binary file
|
lib/ggml-cpu.lib
CHANGED
|
Binary file
|
lib/ggml.lib
CHANGED
|
Binary file
|
|
File without changes
|
|
File without changes
|