nexaloid 0.1.0a0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
1
+ Metadata-Version: 2.4
2
+ Name: nexaloid
3
+ Version: 0.1.0a0
4
+ Summary: Chinese tokenizer runtime
5
+ Requires-Python: >=3.9
@@ -0,0 +1,13 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "nexaloid"
7
+ version = "0.1.0a0"
8
+ description = "Chinese tokenizer runtime"
9
+ requires-python = ">=3.9"
10
+
11
+ [tool.setuptools.packages.find]
12
+ where = ["src"]
13
+
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,5 @@
1
+ from .token import Token
2
+ from .tokenizer import Mode, Tokenizer
3
+
4
+ __all__ = ["Mode", "Token", "Tokenizer"]
5
+
@@ -0,0 +1,40 @@
1
+ from __future__ import annotations
2
+
3
+ from .tokenizer import Tokenizer
4
+
5
+ # Module-level tokenizer mirrors jieba's global API while still delegating to native core.
6
+ dt = Tokenizer()
7
+
8
+
9
def cut(sentence, cut_all=False, HMM=True, use_paddle=False):
    """Segment ``sentence`` with the module-level tokenizer (jieba-style ``cut``).

    ``use_paddle`` exists only so the signature matches jieba's; it is ignored.
    Returns whatever ``dt.cut`` returns (a generator of word strings).
    """
    options = {"cut_all": cut_all, "HMM": HMM}
    return dt.cut(sentence, **options)
12
+
13
+
14
def lcut(sentence, cut_all=False, HMM=True, use_paddle=False):
    """Segment ``sentence`` with the module-level tokenizer and return a list.

    ``use_paddle`` exists only so the signature matches jieba's; it is ignored.
    """
    options = {"cut_all": cut_all, "HMM": HMM}
    return dt.lcut(sentence, **options)
17
+
18
+
19
def cut_for_search(sentence, HMM=True):
    """Run search-mode segmentation of ``sentence`` on the module-level tokenizer."""
    segment = dt.cut_for_search
    return segment(sentence, HMM=HMM)
21
+
22
+
23
def lcut_for_search(sentence, HMM=True):
    """Return the search-mode segmentation of ``sentence`` as a list."""
    return [*cut_for_search(sentence, HMM=HMM)]
25
+
26
+
27
def add_word(word, freq=None, tag=None):
    """Register ``word`` on the module-level tokenizer, forwarding ``freq`` and ``tag``."""
    result = dt.add_word(word, freq=freq, tag=tag)
    return result
29
+
30
+
31
def del_word(word):
    """Ask the module-level tokenizer to stop emitting ``word``."""
    result = dt.del_word(word)
    return result
33
+
34
+
35
def load_userdict(f):
    """Load the user dictionary at path ``f`` into the module-level tokenizer."""
    result = dt.load_userdict(f)
    return result
37
+
38
+
39
def suggest_freq(segment, tune=False):
    """Forward a jieba-style ``suggest_freq`` request to the module-level tokenizer."""
    result = dt.suggest_freq(segment, tune=tune)
    return result
@@ -0,0 +1,14 @@
1
from dataclasses import dataclass
from typing import Optional
2
+
3
+
4
@dataclass(frozen=True)
class Token:
    """Immutable record describing one token produced by the tokenizer.

    ``pos`` is annotated with ``Optional[str]`` rather than ``str | None``:
    this module does not enable postponed annotation evaluation, so the
    ``|`` form would be evaluated at class-creation time and raise
    ``TypeError`` on Python 3.9, which the package declares as supported.
    """

    # Decoded text of the token.
    text: str
    # Byte offsets of the token inside the UTF-8 encoded text buffer.
    start_byte: int
    end_byte: int
    # Offsets as reported by the native core (presumably in characters).
    start_char: int
    end_char: int
    # Part-of-speech tag, or None when no tag is available.
    pos: Optional[str]
    # Label naming where the token came from (e.g. "base_dict", "user_dict").
    source: str
    # Score reported for the token by the native core.
    score: float
14
+
@@ -0,0 +1,313 @@
1
+ from __future__ import annotations
2
+
3
+ import ctypes
4
+ import os
5
+ from enum import IntEnum
6
+ from pathlib import Path
7
+
8
+ from .token import Token
9
+
10
+
11
class Mode(IntEnum):
    """Segmentation mode; the integer value is what gets passed to the native call."""

    # Default mode; selected by cut() when cut_all is false.
    ACCURATE = 0
    # Selected by cut() when cut_all is true.
    FULL = 1
    # Selected by cut_for_search().
    SEARCH = 2
15
+
16
+
17
class _NxConfig(ctypes.Structure):
    """ctypes mirror of the engine configuration passed to ``nx_engine_new``.

    The field order and types below must stay byte-for-byte compatible with
    core/include/nexaloid.h; do not reorder or resize anything here without
    changing the header as well.
    """

    _fields_ = [
        # Dictionary file locations as byte strings; left NULL when unset.
        ("dict_path", ctypes.c_char_p),
        ("user_dict_path", ctypes.c_char_p),
        # Feature switches.
        ("enable_hmm", ctypes.c_uint32),
        ("enable_normalization", ctypes.c_uint32),
        ("enable_plugins", ctypes.c_uint32),
        # Eight spare 32-bit slots.
        ("reserved", ctypes.c_uint32 * 8),
    ]
27
+
28
+
29
class _NxToken(ctypes.Structure):
    """ctypes mirror of the native ``NxToken`` record delivered to callbacks.

    Python only reads these fields and wraps them into a ``Token`` dataclass;
    the order and widths must match the native definition.
    """

    _fields_ = [
        # Offsets of the token within the text buffer handed to the callback.
        ("start_byte", ctypes.c_uint32),
        ("end_byte", ctypes.c_uint32),
        ("start_char", ctypes.c_uint32),
        ("end_char", ctypes.c_uint32),
        # Numeric identifiers assigned by the core.
        ("word_id", ctypes.c_uint32),
        ("pos_id", ctypes.c_uint16),
        ("source", ctypes.c_uint16),
        ("flags", ctypes.c_uint16),
        ("score", ctypes.c_float),
    ]
42
+
43
+
44
# Arguments common to both callback shapes:
# (token pointer, text pointer, text length in bytes, opaque user data).
_TOKEN_CALLBACK_ARGS = (
    ctypes.POINTER(_NxToken),
    ctypes.POINTER(ctypes.c_char),
    ctypes.c_size_t,
    ctypes.c_void_p,
)

# Per-token callback used by nx_tokenize; it returns nothing.
_CALLBACK = ctypes.CFUNCTYPE(None, *_TOKEN_CALLBACK_ARGS)

# Batch variant used by nx_tokenize_batch: same arguments, preceded by the
# index of the input text that the token belongs to.
_BATCH_CALLBACK = ctypes.CFUNCTYPE(None, ctypes.c_uint32, *_TOKEN_CALLBACK_ARGS)
60
+
61
# Labels for the numeric ``source`` field of a native token. Codes start at 1
# and follow the order of this tuple; lookups of any other code fall back to
# "unknown" at the call sites.
_SOURCES = dict(
    enumerate(
        ("base_dict", "user_dict", "domain_dict", "rule", "unknown", "plugin"),
        start=1,
    )
)
69
+
70
# Built-in dictionaries are looked up under <root>/data/dict, where <root> is
# the fifth ancestor directory of this file (presumably the repository root in
# a source checkout). The index is clamped so that a module living fewer than
# five directories deep (e.g. /app/nexaloid/tokenizer.py) resolves to a path
# that simply does not exist instead of raising IndexError at import time.
_MODULE_ANCESTORS = Path(__file__).resolve().parents
_DICT_DIR = _MODULE_ANCESTORS[min(4, len(_MODULE_ANCESTORS) - 1)] / "data" / "dict"
# Preferred dictionary file, checked first.
_BUILT_DICT = _DICT_DIR / "nexaloid.nxdict"
# Text dictionary used when the .nxdict file is absent.
_BUILT_TEXT_DICT = _DICT_DIR / "nexaloid.tsv"
73
+
74
+
75
def _resolve_dict_path(dict_path: str | os.PathLike[str] | None) -> Path:
    """Choose the base dictionary file.

    An explicit ``dict_path`` always wins and is returned as a ``Path``
    without checking that it exists. Otherwise the built ``.nxdict`` file is
    returned when present on disk, and the ``.tsv`` path is returned as the
    fallback (whether or not it exists).
    """
    if dict_path is None:
        return _BUILT_DICT if _BUILT_DICT.exists() else _BUILT_TEXT_DICT
    return Path(dict_path)
81
+
82
+
83
def _resolve_domain_dict_path(domain: str | None) -> Path | None:
    """Find the dictionary file for ``domain``.

    Returns ``None`` when no domain is requested. Otherwise looks inside the
    directory named by the ``NEXALOID_DOMAIN_DICT_DIR`` environment variable
    for ``<domain>.nxdict``, ``<domain>.tsv`` and ``<domain>.txt`` (in that
    order) and returns the first one that exists.

    Raises:
        ValueError: a domain was given but the environment variable is unset
            or empty.
        FileNotFoundError: none of the candidate files exists.
    """
    if domain is None:
        return None

    configured_dir = os.environ.get("NEXALOID_DOMAIN_DICT_DIR")
    if not configured_dir:
        raise ValueError("domain requires NEXALOID_DOMAIN_DICT_DIR")

    base = Path(configured_dir)
    # Lazy pipeline: candidates are probed one at a time, stopping at the first hit.
    existing = (
        path
        for path in (base / f"{domain}{ext}" for ext in (".nxdict", ".tsv", ".txt"))
        if path.exists()
    )
    found = next(existing, None)
    if found is None:
        raise FileNotFoundError(f"domain dictionary not found: {base / (domain + '.tsv')}")
    return found
96
+
97
+
98
def _load_lib() -> ctypes.CDLL:
    """Locate and load the nexaloid native shared library.

    Candidates are tried in this order and the first path that exists is
    loaded:

    1. the path in the ``NEXALOID_LIB`` environment variable, if set;
    2. the build outputs under ``<root>/core/zig-out`` (``bin/nexaloid.dll``,
       ``lib/libnexaloid.so``, ``lib/libnexaloid.dylib``), where ``<root>`` is
       the fifth ancestor directory of this file.

    Raises:
        RuntimeError: no candidate path exists.
    """
    # Bindings are thin wrappers; the native shared library does all tokenization.
    candidates = []
    explicit = os.environ.get("NEXALOID_LIB")
    if explicit:
        candidates.append(Path(explicit))

    # Only probe the build tree when this file is deep enough to have a fifth
    # ancestor; indexing parents[4] unconditionally raises IndexError on
    # shallow install paths and would hide the RuntimeError below.
    ancestors = Path(__file__).resolve().parents
    if len(ancestors) > 4:
        build_dir = ancestors[4] / "core" / "zig-out"
        candidates += [
            build_dir / "bin" / "nexaloid.dll",
            build_dir / "lib" / "libnexaloid.so",
            build_dir / "lib" / "libnexaloid.dylib",
        ]

    for path in candidates:
        if path.exists():
            return ctypes.CDLL(str(path))
    raise RuntimeError("nexaloid shared library not found; run `zig build` in core or set NEXALOID_LIB")
114
+
115
+
116
# Load the native library once, at import time.
_LIB = _load_lib()

# Prototype table: exported function name -> (return type, argument types).
# Declaring these explicitly lets ctypes convert arguments and results for
# every call instead of relying on its defaults.
_PROTOTYPES = {
    "nx_engine_new": (
        ctypes.c_int,
        [ctypes.POINTER(_NxConfig), ctypes.POINTER(ctypes.c_void_p)],
    ),
    "nx_engine_free": (None, [ctypes.c_void_p]),
    "nx_tokenize": (
        ctypes.c_int,
        [
            ctypes.c_void_p,
            ctypes.c_char_p,
            ctypes.c_size_t,
            ctypes.c_int,
            _CALLBACK,
            ctypes.c_void_p,
        ],
    ),
    "nx_tokenize_batch": (
        ctypes.c_int,
        [
            ctypes.c_void_p,
            ctypes.POINTER(ctypes.c_char_p),
            ctypes.POINTER(ctypes.c_size_t),
            ctypes.c_size_t,
            ctypes.c_int,
            ctypes.c_uint32,
            _BATCH_CALLBACK,
            ctypes.c_void_p,
        ],
    ),
    "nx_add_word": (
        ctypes.c_int,
        [
            ctypes.c_void_p,
            ctypes.c_char_p,
            ctypes.c_size_t,
            ctypes.c_uint32,
            ctypes.c_float,
            ctypes.c_uint16,
        ],
    ),
    "nx_reload_user_dict": (ctypes.c_int, [ctypes.c_void_p, ctypes.c_char_p]),
    "nx_status_message": (ctypes.c_char_p, [ctypes.c_int]),
}

for _symbol, (_restype, _argtypes) in _PROTOTYPES.items():
    _function = getattr(_LIB, _symbol)
    _function.argtypes = _argtypes
    _function.restype = _restype
del _symbol, _restype, _argtypes, _function
154
+
155
+
156
class NexaloidError(RuntimeError):
    """Raised when a native nexaloid call reports a non-zero status."""
158
+
159
+
160
class Tokenizer:
    """Python handle around one native nexaloid engine.

    The engine is created in ``__init__`` and released by ``close()`` (also
    invoked from ``__del__`` and when leaving a ``with`` block). Once closed,
    every method that needs the engine raises ``NexaloidError`` instead of
    passing a freed pointer to the native library.

    Words added through ``add_word`` are remembered in ``_words``; words
    removed through ``del_word`` are remembered in ``_deleted`` and filtered
    out of tokenization results on the Python side.
    """

    def __init__(
        self,
        dict_path: str | os.PathLike[str] | None = None,
        *,
        domain: str | None = None,
    ):
        """Create the native engine.

        Args:
            dict_path: Base dictionary file; when omitted the built-in
                dictionary location is used. The path is only handed to the
                engine if it exists.
            domain: Optional domain name whose dictionary is resolved via
                ``NEXALOID_DOMAIN_DICT_DIR`` and passed as the user dictionary.

        Raises:
            ValueError, FileNotFoundError: the domain dictionary cannot be
                resolved.
            NexaloidError: the native engine could not be created.
        """
        # Start out "closed" so close()/__del__ are harmless if anything below
        # raises before the engine exists.
        self._closed = True
        self._engine = ctypes.c_void_p()
        self._words: dict[str, float] = {}
        self._deleted: set[str] = set()

        resolved_dict = _resolve_dict_path(dict_path)
        resolved_domain_dict = _resolve_domain_dict_path(domain)

        config = _NxConfig()
        if resolved_dict.exists():
            config.dict_path = str(resolved_dict).encode("utf-8")
        if resolved_domain_dict is not None:
            config.user_dict_path = str(resolved_domain_dict).encode("utf-8")
        self._check(_LIB.nx_engine_new(ctypes.byref(config), ctypes.byref(self._engine)))
        self._closed = False

    def __enter__(self) -> "Tokenizer":
        """Return ``self`` so the tokenizer can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        """Release the native engine when the ``with`` block ends."""
        self.close()

    def close(self) -> None:
        """Free the native engine; calling it again is a no-op."""
        if getattr(self, "_closed", True):
            return
        # Flip the flag first so the engine can never be freed twice.
        self._closed = True
        _LIB.nx_engine_free(self._engine)
        self._engine = ctypes.c_void_p()

    def __del__(self) -> None:
        # Deliberately best-effort: during interpreter shutdown module globals
        # may already be gone, and a finalizer must not raise.
        try:
            self.close()
        except Exception:
            pass

    def _require_open(self) -> None:
        """Raise ``NexaloidError`` if the engine has already been released."""
        if self._closed:
            raise NexaloidError("tokenizer is closed")

    def add_word(self, word: str, freq: float | None = None, tag: str | None = None) -> None:
        """Add ``word`` to the engine with score ``freq`` (10.0 when omitted).

        ``tag`` is accepted for jieba compatibility and ignored.
        """
        del tag
        score = float(freq if freq is not None else 10.0)
        self._add_word_score(word, score)

    def _add_word_score(self, word: str, score: float) -> None:
        """Send ``word`` with ``score`` to the engine and record it locally."""
        self._require_open()
        data = word.encode("utf-8")
        self._check(_LIB.nx_add_word(self._engine, data, len(data), 0, score, 0))
        self._words[word] = score
        # Re-adding a word cancels an earlier del_word().
        self._deleted.discard(word)

    def del_word(self, word: str) -> None:
        """Hide ``word`` from future results.

        This is a Python-side filter only: tokens whose text equals ``word``
        are dropped from tokenization output; the engine is not told.
        """
        self._deleted.add(word)

    def load_userdict(self, path: str | os.PathLike[str]) -> None:
        """Ask the engine to reload its user dictionary from ``path``."""
        self._require_open()
        data = str(Path(path)).encode("utf-8")
        self._check(_LIB.nx_reload_user_dict(self._engine, data))

    def suggest_freq(self, segment, tune: bool = False):
        """Return the recorded score for a word, optionally boosting it first.

        Args:
            segment: Either the word itself or a tuple/list of pieces that
                are joined to form the word.
            tune: When true, the word is (re-)added with a score of 20.0.

        Returns:
            The score stored by ``add_word`` for the word, or 0 if none.
        """
        if isinstance(segment, str):
            word = segment
        elif isinstance(segment, (tuple, list)):
            # Lists are joined like tuples; str() on a list would produce its repr.
            word = "".join(segment)
        else:
            word = str(segment)
        if tune:
            self.add_word(word, freq=20.0)
        return self._words.get(word, 0)

    @staticmethod
    def _build_token(token_ptr, text_ptr, text_len) -> Token:
        """Convert one native token plus its text buffer into a ``Token``."""
        native = token_ptr.contents
        # Clamp to the buffer, matching ordinary slice semantics.
        start = min(native.start_byte, text_len)
        end = min(native.end_byte, text_len)
        # Copy only this token's bytes, and do it inside the callback frame
        # because native memory is not retained afterwards. Copying the whole
        # buffer per token would be quadratic on long inputs.
        part = text_ptr[start:end].decode("utf-8") if end > start else ""
        return Token(
            text=part,
            start_byte=native.start_byte,
            end_byte=native.end_byte,
            start_char=native.start_char,
            end_char=native.end_char,
            pos=None,
            source=_SOURCES.get(native.source, "unknown"),
            score=float(native.score),
        )

    def tokenize(self, text: str, mode: Mode = Mode.ACCURATE) -> list[Token]:
        """Tokenize ``text`` and return the tokens in emission order.

        Tokens whose text was removed with ``del_word`` are omitted.

        Raises:
            NexaloidError: the tokenizer is closed or the native call failed.
            Exception: any error raised while converting a token (for example
                a ``UnicodeDecodeError``) is re-raised after the native call.
        """
        self._require_open()
        data = text.encode("utf-8")
        out: list[Token] = []
        # ctypes prints and discards exceptions raised inside callbacks, so
        # capture the first one here and re-raise it once the call returns.
        errors: list[Exception] = []

        @_CALLBACK
        def on_token(token_ptr, text_ptr, text_len, user_data):
            del user_data
            if errors:
                return
            try:
                token = self._build_token(token_ptr, text_ptr, text_len)
            except Exception as exc:
                errors.append(exc)
                return
            if token.text not in self._deleted:
                out.append(token)

        status = _LIB.nx_tokenize(self._engine, data, len(data), int(mode), on_token, None)
        if errors:
            raise errors[0]
        self._check(status)
        return out

    def tokenize_batch(
        self,
        texts: list[str] | tuple[str, ...],
        mode: Mode = Mode.ACCURATE,
        thread_count: int = 0,
    ) -> list[list[Token]]:
        """Tokenize several texts in one native call.

        Args:
            texts: Input strings.
            mode: Segmentation mode applied to every text.
            thread_count: Passed to the native call; negative values are
                clamped to 0.

        Returns:
            One token list per input text, in input order. Tokens whose text
            was removed with ``del_word`` are omitted.

        Raises:
            NexaloidError: the tokenizer is closed or the native call failed.
            Exception: any error raised while converting a token is re-raised
                after the native call.
        """
        self._require_open()
        encoded = [text.encode("utf-8") for text in texts]
        text_array = (ctypes.c_char_p * len(encoded))(*encoded)
        len_array = (ctypes.c_size_t * len(encoded))(*(len(item) for item in encoded))
        out: list[list[Token]] = [[] for _ in encoded]
        # See tokenize(): callback exceptions must be carried out by hand.
        errors: list[Exception] = []

        @_BATCH_CALLBACK
        def on_token(index, token_ptr, text_ptr, text_len, user_data):
            del user_data
            if errors:
                return
            # Core emits batch callbacks in input order after worker threads finish.
            try:
                token = self._build_token(token_ptr, text_ptr, text_len)
                if token.text not in self._deleted:
                    out[index].append(token)
            except Exception as exc:
                errors.append(exc)

        status = _LIB.nx_tokenize_batch(
            self._engine,
            text_array,
            len_array,
            len(encoded),
            int(mode),
            max(0, int(thread_count)),
            on_token,
            None,
        )
        if errors:
            raise errors[0]
        self._check(status)
        return out

    def cut(self, text: str, cut_all: bool = False, HMM: bool = True):
        """Yield the words of ``text`` (jieba-style ``cut``).

        ``cut_all`` selects ``Mode.FULL`` instead of ``Mode.ACCURATE``.
        ``HMM`` is accepted for jieba compatibility and ignored. Being a
        generator, no work happens until the first item is requested.
        """
        del HMM
        mode = Mode.FULL if cut_all else Mode.ACCURATE
        for token in self.tokenize(text, mode):
            yield token.text

    def lcut(self, text: str, cut_all: bool = False, HMM: bool = True) -> list[str]:
        """Return the result of ``cut`` as a list."""
        return list(self.cut(text, cut_all=cut_all, HMM=HMM))

    def cut_for_search(self, text: str, HMM: bool = True):
        """Yield search-mode words of ``text``.

        Tokens of one character or less are skipped and each distinct word is
        yielded only once. ``HMM`` is accepted for jieba compatibility and
        ignored.
        """
        del HMM
        seen: set[str] = set()
        for token in self.tokenize(text, Mode.SEARCH):
            if len(token.text) <= 1:
                continue
            if token.text not in seen:
                seen.add(token.text)
                yield token.text

    def _check(self, status: int) -> None:
        """Raise ``NexaloidError`` for any non-zero native status code."""
        if status == 0:
            return
        raw = _LIB.nx_status_message(status)
        # A NULL message pointer arrives as None; fall back to the raw code.
        msg = raw.decode("utf-8", "replace") if raw else f"nexaloid error (status {status})"
        raise NexaloidError(msg)
@@ -0,0 +1,5 @@
1
+ Metadata-Version: 2.4
2
+ Name: nexaloid
3
+ Version: 0.1.0a0
4
+ Summary: Chinese tokenizer runtime
5
+ Requires-Python: >=3.9
@@ -0,0 +1,9 @@
1
+ pyproject.toml
2
+ src/nexaloid/__init__.py
3
+ src/nexaloid/compat_jieba.py
4
+ src/nexaloid/token.py
5
+ src/nexaloid/tokenizer.py
6
+ src/nexaloid.egg-info/PKG-INFO
7
+ src/nexaloid.egg-info/SOURCES.txt
8
+ src/nexaloid.egg-info/dependency_links.txt
9
+ src/nexaloid.egg-info/top_level.txt
@@ -0,0 +1 @@
1
+ nexaloid