khmerns 0.0.3__cp313-cp313-win32.whl → 0.0.4__cp313-cp313-win32.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
khmerns/__init__.py CHANGED
@@ -1,3 +1,4 @@
1
1
  from ._core import KhmerSegmenter, tokenize, __version__
2
+ from .khnormal import normalize
2
3
 
3
- __all__ = ["KhmerSegmenter", "tokenize", "__version__"]
4
+ __all__ = ["KhmerSegmenter", "tokenize", "__version__", "normalize"]
khmerns/__init__.pyi CHANGED
@@ -14,3 +14,7 @@ class KhmerSegmenter:
14
14
  def tokenize(text: str) -> List[str]:
15
15
  """Segment Khmer text and return a list of words."""
16
16
  ...
17
+
18
+ def normalize(text: str) -> str:
19
+ """Normalize and reorder Khmer characters."""
20
+ ...
Binary file
khmerns/khnormal.py ADDED
@@ -0,0 +1,207 @@
1
+ #!/usr/bin/python3
2
+ # Copyright (c) 2021-2024, SIL Global.
3
+ # Licensed under MIT license: https://opensource.org/licenses/MIT
4
+
5
+ import enum
6
+ import re
7
+
8
+
9
+ class Cats(enum.Enum):
10
+ Other = 0
11
+ Base = 1
12
+ Robat = 2
13
+ Coeng = 3
14
+ Shift = 4
15
+ Z = 5
16
+ VPre = 6
17
+ VB = 7
18
+ VA = 8
19
+ VPost = 9
20
+ MS = 10
21
+ MF = 11
22
+ ZFCoeng = 12
23
+
24
+
25
+ categories = (
26
+ [Cats.Base] * 35 # 1780-17A2
27
+ + [Cats.Other] * 2 # 17A3-17A4
28
+ + [Cats.Base] * 15 # 17A5-17B3
29
+ + [Cats.Other] * 2 # 17B4-17B5
30
+ + [Cats.VPost] # 17B6
31
+ + [Cats.VA] * 4 # 17B7-17BA
32
+ + [Cats.VB] * 3 # 17BB-17BD
33
+ + [Cats.VPre] * 8 # 17BE-17C5
34
+ + [Cats.MS] # 17C6
35
+ + [Cats.MF] * 2 # 17C7-17C8
36
+ + [Cats.Shift] * 2 # 17C9-17CA
37
+ + [Cats.MS] # 17CB
38
+ + [Cats.Robat] # 17CC
39
+ + [Cats.MS] * 5 # 17CD-17D1
40
+ + [Cats.Coeng] # 17D2
41
+ + [Cats.MS] # 17D3
42
+ + [Cats.Other] * 9 # 17D4-17DC
43
+ + [Cats.MS]
44
+ ) # 17DD
45
+
46
+ khres = { # useful regular sub expressions used later
47
+ # All bases
48
+ "B": "[\u1780-\u17a2\u17a5-\u17b3\u25cc]",
49
+ # All consonants excluding Ro
50
+ "NonRo": "[\u1780-\u1799\u179b-\u17a2\u17a5-\u17b3]",
51
+ # All consonants excluding Ba (U+1794)
52
+ "NonBA": "[\u1780-\u1793\u1795-\u17a2\u17a5-\u17b3]",
53
+ # Series 1 consonants
54
+ "S1": "[\u1780-\u1783\u1785-\u1788\u178a-\u178d\u178f-\u1792"
55
+ "\u1795-\u1797\u179e-\u17a0\u17a2]",
56
+ # Series 2 consonants
57
+ "S2": "[\u1784\u1780\u178e\u1793\u1794\u1798-\u179d\u17a1\u17a3-\u17b3]",
58
+ # Simple following Vowel in Modern Khmer
59
+ "VA": "(?:[\u17b7-\u17ba\u17be\u17bf\u17dd]|\u17b6\u17c6)",
60
+ # Above vowel (as per shifter rules) with vowel sequences
61
+ "VAX": "(?:[\u17c1-\u17c5]?{VA})",
62
+ # Above vowel with samyok (modern khmer)
63
+ "VAS": "(?:{VA}|[\u17c1-\u17c3]?\u17d0)",
64
+ # Above vowel with samyok (middle khmer)
65
+ "VASX": "(?:{VAX}|[\u17c1-\u17c3]?\u17d0)",
66
+ # Below vowel (with Middle Khmer prefix)
67
+ "VB": "(?:[\u17c1-\u17c3]?[\u17bb-\u17bd])",
68
+ # contains series 1 and no BA
69
+ "STRONG": """ {S1}\u17cc? # series 1 robat?
70
+ (?:\u17d2{NonBA} # nonba coengs
71
+ (?:\u17d2{NonBA})?)?
72
+ | {NonBA}\u17cc? # nonba robat?
73
+ (?: \u17d2{S1} # series 1 coeng
74
+ (?:\u17d2{NonBA})? # + any nonba coeng
75
+ | \u17d2{NonBA}\u17d2{S1} # nonba coeng + series 1 coeng
76
+ )""",
77
+ # contains BA or only series 2
78
+ "NSTRONG": """(?:{S2}\u17cc?(?:\u17d2{S2}(?:\u17d2{S2})?)? # Series 2 + series 2 coengs
79
+ |\u1794\u17cc?(?:{COENG}(?:{COENG})?)? # or ba with any coeng
80
+ |{B}\u17cc?(?:\u17d2{NonRo}\u17d2\u1794 # or ba coeng
81
+ |\u17d2\u1794(?:\u17d2{B})))""",
82
+ "COENG": "(?:(?:\u17d2{NonRo})?\u17d2{B})",
83
+ # final coeng
84
+ "FCOENG": "(?:\u200d(?:\u17d2{NonRo})+)",
85
+ # Allowed shifter sequences in Modern Khmer
86
+ "SHIFT": """(?: (?<={STRONG}) \u17ca\u200c (?={VA}) # strong + triisap held up
87
+ | (?<={NSTRONG})\u17c9\u200c (?={VAS}) # weak + muusikatoan held up
88
+ | [\u17c9\u17ca] # any shifter
89
+ )""",
90
+ # Allowed shifter sequences in Middle Khmer
91
+ "SHIFTX": """(?:(?<={STRONG}) \u17ca\u200c (?={VAX}) # strong + triisap held up
92
+ | (?<={NSTRONG})\u17c9\u200c (?={VASX}) # weak + muusikatoan held up
93
+ | [\u17c9\u17ca] # any shifter
94
+ )""",
95
+ # Modern Khmer vowel
96
+ "V": "[\u17b6-\u17c5]?",
97
+ # Middle Khmer vowel sequences (not worth trying to unpack this)
98
+ "VX": "(?:\u17c1[\u17bc\u17bd]?[\u17b7\u17b9\u17ba]?|"
99
+ "[\u17c2\u17c3]?[\u17bc\u17bd]?[\u17b7-\u17ba]\u17b6|"
100
+ "[\u17c2\u17c3]?[\u17bb-\u17bd]?\u17b6|\u17be[\u17bc\u17bd]?\u17b6?|"
101
+ "[\u17c1-\u17c5]?\u17bb(?![\u17d0\u17dd])|"
102
+ "[\u17bf\u17c0]|[\u17c2-\u17c5]?[\u17bc\u17bd]?[\u17b7-\u17ba]?)",
103
+ # Modern Khmer Modifiers
104
+ "MS": """(?:(?: [\u17c6\u17cb\u17cd-\u17cf\u17d1\u17d3] # follows anything
105
+ | (?<!\u17bb) [\u17d0\u17dd]) # not after -u
106
+ [\u17c6\u17cb\u17cd-\u17d1\u17d3\u17dd]? # And an optional second
107
+ )""",
108
+ # Middle Khmer Modifiers
109
+ "MSX": """(?:(?: [\u17c6\u17cb\u17cd-\u17cf\u17d1\u17d3] # follows anything
110
+ | (?<!\u17bb [\u17b6\u17c4\u17c5]?) # blocking -u sequence
111
+ [\u17d0\u17dd]) # for these modifiers
112
+ [\u17c6\u17cb\u17cd-\u17d1\u17d3\u17dd]? # And an optional second
113
+ )""",
114
+ }
115
+
116
+ # expand 3 times: SHIFTX -> VASX -> VAX -> VA
117
+ for i in range(3):
118
+ khres = {k: v.format(**khres) for k, v in khres.items()}
119
+
120
+
121
+ def charcat(c):
122
+ """Returns the Khmer character category for a single char string"""
123
+ o = ord(c)
124
+ if 0x1780 <= o <= 0x17DD:
125
+ return categories[o - 0x1780]
126
+ elif o == 0x200C:
127
+ return Cats.Z
128
+ elif o == 0x200D:
129
+ return Cats.ZFCoeng
130
+ return Cats.Other
131
+
132
+
133
+ def lunar(m, base):
134
+ """Returns the lunar date symbol from the appropriate set base"""
135
+ v = (ord(m.group(1) or "\u17e0") - 0x17E0) * 10 + ord(m.group(2)) - 0x17E0
136
+ if v > 15: # translate \u17D4\u17D2\u17E0 as well
137
+ return m.group(0)
138
+ return chr(v + base)
139
+
140
+
141
+ def normalize(txt, lang="km"):
142
+ """Returns khmer normalised string, without fixing or marking errors"""
143
+ # Mark final coengs in Middle Khmer
144
+ if lang == "xhm":
145
+ txt = re.sub(r"([\u17B6-\u17C5]\u17D2)", "\u200d\\1", txt)
146
+ # Categorise every character in the string
147
+ charcats = [charcat(c) for c in txt]
148
+
149
+ # Recategorise base -> coeng after coeng char (or ZFCoeng)
150
+ for i in range(1, len(charcats)):
151
+ if txt[i - 1] in "\u200d\u17d2" and charcats[i] in (Cats.Base, Cats.Coeng):
152
+ charcats[i] = charcats[i - 1]
153
+
154
+ # Find subranges of base+non other and sort components in the subrange
155
+ i = 0
156
+ res = []
157
+ while i < len(charcats):
158
+ c = charcats[i]
159
+ if c != Cats.Base:
160
+ res.append(txt[i])
161
+ i += 1
162
+ continue
163
+ # Scan for end of syllable
164
+ j = i + 1
165
+ while j < len(charcats) and charcats[j].value > Cats.Base.value:
166
+ j += 1
167
+ # Sort syllable based on character categories
168
+ # Sort the char indices by category then position in string
169
+ newindices = sorted(range(i, j), key=lambda e: (charcats[e].value, e))
170
+ replaces = "".join(txt[n] for n in newindices)
171
+
172
+ replaces = re.sub(
173
+ "(\u200d?\u17d2)[\u17d2\u200c\u200d]+", r"\1", replaces
174
+ ) # remove multiple invisible chars
175
+ replaces = re.sub("\u17be\u17b6", "\u17c4\u17b8", replaces) # confusable vowels
176
+ # map compound vowel sequences to compounds with -u before to be converted
177
+ replaces = re.sub("\u17c1([\u17bb-\u17bd]?)\u17b8", "\u17be\\1", replaces)
178
+ replaces = re.sub("\u17c1([\u17bb-\u17bd]?)\u17b6", "\u17c4\\1", replaces)
179
+ replaces = re.sub("(\u17be)(\u17bb)", r"\2\1", replaces)
180
+ # Replace -u + upper vowel with consonant shifter
181
+ replaces = re.sub(
182
+ ("((?:{STRONG})[\u17c1-\u17c5]?)\u17bb" + "(?={VA}|\u17d0)").format(**khres),
183
+ "\\1\u17ca",
184
+ replaces,
185
+ flags=re.X,
186
+ )
187
+ replaces = re.sub(
188
+ ("((?:{NSTRONG})[\u17c1-\u17c5]?)\u17bb" + "(?={VA}|\u17d0)").format(**khres),
189
+ "\\1\u17c9",
190
+ replaces,
191
+ flags=re.X,
192
+ )
193
+ replaces = re.sub(
194
+ "(\u17d2\u179a)(\u17d2[\u1780-\u17b3])", r"\2\1", replaces
195
+ ) # coeng ro second
196
+ # replaces = re.sub("(\u17d2)\u178a", "\\1\u178f", replaces) # coeng da->ta
197
+ # convert lunar dates from old style to use lunar date symbols
198
+ replaces = re.sub(
199
+ "(\u17e1?)([\u17e0-\u17e9])\u17d2\u17d4", lambda m: lunar(m, 0x19E0), replaces
200
+ )
201
+ replaces = re.sub(
202
+ "\u17d4\u17d2(\u17e1?)([\u17e0-\u17e9])", lambda m: lunar(m, 0x19F0), replaces
203
+ )
204
+ replaces = re.sub("\u17d4\u17d2\u17d4", "\u19f0", replaces)
205
+ res.append(replaces)
206
+ i = j
207
+ return "".join(res)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: khmerns
3
- Version: 0.0.3
3
+ Version: 0.0.4
4
4
  Summary: Khmer Neural Segmenter
5
5
  Keywords: khmer,nlp,segmentation,tokenization,neural-network
6
6
  Author-Email: Seanghay Yath <seanghay.dev@gmail.com>
@@ -35,11 +35,15 @@ pip install khmerns
35
35
  ## Usage
36
36
 
37
37
  ```python
38
- from khmerns import tokenize
38
+ from khmerns import tokenize, normalize
39
39
 
40
40
  # Returns a list of words
41
41
  words = tokenize("សួស្តីបងប្អូន")
42
- # ['សួស្តី', 'បង', 'ប្អូន']
42
+ # => ['សួស្តី', 'បង', 'ប្អូន']
43
+
44
+ # normalize and reorder Khmer characters
45
+ words = tokenize(normalize("សួស្តីបងប្អូន"))
46
+ # => ['សួស្តី', 'បង', 'ប្អូន']
43
47
  ```
44
48
 
45
49
  You can also use the class-based API if you prefer:
@@ -48,8 +52,10 @@ You can also use the class-based API if you prefer:
48
52
  from khmerns import KhmerSegmenter
49
53
 
50
54
  segmenter = KhmerSegmenter()
55
+
51
56
  words = segmenter.tokenize("សួស្តីបងប្អូន")
52
57
  # or
58
+
53
59
  words = segmenter("សួស្តីបងប្អូន")
54
60
  ```
55
61
 
@@ -15,15 +15,16 @@ include/ggml-webgpu.h,sha256=YoxXN2KYJOwzJmIKXESA3olVy_gsNSvu2ORuAlFPs5I,347
15
15
  include/ggml-zendnn.h,sha256=rlX7HjkqaAory2NoxpQSVZgPjEnZHZ6dKjQAOLK_8lE,520
16
16
  include/ggml.h,sha256=8AuUtoOyhyNaf65atAKT1NBO-FrquzEXbYB-2VYjC6I,106093
17
17
  include/gguf.h,sha256=dGmizdUtMG2dBzbEiu0ebZfF2Abz5smJuCXuEpMFToc,10426
18
- khmerns/__init__.py,sha256=56VwZ3fseBKsGGMhVtE1EI1Tqi4uhigZ7OdRKf6MAKE,117
19
- khmerns/__init__.pyi,sha256=5Ywjkugs1j6YWPS6bkFFTl077E9mb9-fSXPgu-kQhbw,437
20
- khmerns/_core.cp313-win32.pyd,sha256=iNNM43CsmRjOyZZMNbE3Zy0urinvRsqxlStHb17YqM4,3675648
18
+ khmerns/__init__.py,sha256=8hEEyGjLUGOJa3bST254yZgTznTVBBkXvhfxttm6mnE,163
19
+ khmerns/__init__.pyi,sha256=IvxNBUgitIhVklMX3Toia7ujT5uRKc5FDKHJFEnI9Ds,527
20
+ khmerns/_core.cp313-win32.pyd,sha256=UErs8Nsh-MIhrB5JfEJymkljGy_o9Wmcw94Km4MEs_4,3675648
21
+ khmerns/khnormal.py,sha256=B9WrmKmEARJantRBxKDhsMk5Cr2ljG0xEb61B7vbiQA,8058
21
22
  lib/cmake/ggml/ggml-config.cmake,sha256=OPnz2F8SEuCgmqv8vJc3xlADN6NkgootaXOIKZ9uQ_Y,12170
22
23
  lib/cmake/ggml/ggml-version.cmake,sha256=cmpuq1NZlHRY3WdA7y81233b6KomdObooai5LL4-Dyc,2827
23
- lib/ggml-base.lib,sha256=0lpo9Fx6S8qaFQhzqXpTN8IF69ceJNp2uZ6G4VfN4p0,1217310
24
- lib/ggml-cpu.lib,sha256=b7uRpQwrFVO97VRAN1EKudCAk7Tq2aWbrpEpWHm6rbA,1130366
25
- lib/ggml.lib,sha256=3vtZ3gV3X1MT6tvIqHP3VoOMWSpUkKKjnmC8ekvUMIQ,190804
26
- khmerns-0.0.3.dist-info/METADATA,sha256=NI0Lq5nTQGzEEI5prGscnoTSKg2WFwkoL1QbDhCEAaI,3263
27
- khmerns-0.0.3.dist-info/WHEEL,sha256=BLWkcoBBaeCx8jVONI8U_yio93Axp7qMoW-QbdfeAIY,102
28
- khmerns-0.0.3.dist-info/licenses/LICENSE,sha256=NJbBwbQTQpJwxvCeUDhqpUe3HKE3geRtx5iqIlQ5q0c,1089
29
- khmerns-0.0.3.dist-info/RECORD,,
24
+ lib/ggml-base.lib,sha256=Zf770LD1Qgjb4zVgYae3qvO6xh4L8f5si6BXE5MgoWg,1217310
25
+ lib/ggml-cpu.lib,sha256=jUZVtJZfl_7_gi52X1IWPq0t9WqOaGfL6zdgEA5M14w,1130366
26
+ lib/ggml.lib,sha256=SHjLwyVS-blOapFSg3lUAsbhrs3AmEdXJdZWmeN4GTk,190804
27
+ khmerns-0.0.4.dist-info/METADATA,sha256=6hqAVMq7aCeoVlQpIvnB7CdiG5gtTSoepWc7zTIO_S8,3449
28
+ khmerns-0.0.4.dist-info/WHEEL,sha256=BLWkcoBBaeCx8jVONI8U_yio93Axp7qMoW-QbdfeAIY,102
29
+ khmerns-0.0.4.dist-info/licenses/LICENSE,sha256=NJbBwbQTQpJwxvCeUDhqpUe3HKE3geRtx5iqIlQ5q0c,1089
30
+ khmerns-0.0.4.dist-info/RECORD,,
lib/ggml-base.lib CHANGED
Binary file
lib/ggml-cpu.lib CHANGED
Binary file
lib/ggml.lib CHANGED
Binary file