nexaloid 0.1.0a0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nexaloid-0.1.0a0/PKG-INFO +5 -0
- nexaloid-0.1.0a0/pyproject.toml +13 -0
- nexaloid-0.1.0a0/setup.cfg +4 -0
- nexaloid-0.1.0a0/src/nexaloid/__init__.py +5 -0
- nexaloid-0.1.0a0/src/nexaloid/compat_jieba.py +40 -0
- nexaloid-0.1.0a0/src/nexaloid/token.py +14 -0
- nexaloid-0.1.0a0/src/nexaloid/tokenizer.py +313 -0
- nexaloid-0.1.0a0/src/nexaloid.egg-info/PKG-INFO +5 -0
- nexaloid-0.1.0a0/src/nexaloid.egg-info/SOURCES.txt +9 -0
- nexaloid-0.1.0a0/src/nexaloid.egg-info/dependency_links.txt +1 -0
- nexaloid-0.1.0a0/src/nexaloid.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "nexaloid"
|
|
7
|
+
version = "0.1.0a0"
|
|
8
|
+
description = "Chinese tokenizer runtime"
|
|
9
|
+
requires-python = ">=3.9"
|
|
10
|
+
|
|
11
|
+
[tool.setuptools.packages.find]
|
|
12
|
+
where = ["src"]
|
|
13
|
+
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from .tokenizer import Tokenizer
|
|
4
|
+
|
|
5
|
+
# Shared default tokenizer: the module-level functions in this file forward to
# this instance, mirroring jieba's global API while delegating to the native core.
dt = Tokenizer()
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def cut(sentence, cut_all=False, HMM=True, use_paddle=False):
    """Return ``dt.cut(sentence, ...)`` using the shared tokenizer ``dt``.

    ``cut_all`` and ``HMM`` are forwarded unchanged.  ``use_paddle`` is
    accepted as part of the signature but is discarded without being used.
    """
    del use_paddle  # accepted only as part of the signature
    segments = dt.cut(sentence, cut_all=cut_all, HMM=HMM)
    return segments
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def lcut(sentence, cut_all=False, HMM=True, use_paddle=False):
    """Return ``dt.lcut(sentence, ...)`` using the shared tokenizer ``dt``.

    ``cut_all`` and ``HMM`` are forwarded unchanged.  ``use_paddle`` is
    accepted as part of the signature but is discarded without being used.
    """
    del use_paddle  # accepted only as part of the signature
    words = dt.lcut(sentence, cut_all=cut_all, HMM=HMM)
    return words
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def cut_for_search(sentence, HMM=True):
    """Return ``dt.cut_for_search(sentence, HMM=HMM)`` from the shared tokenizer."""
    segments = dt.cut_for_search(sentence, HMM=HMM)
    return segments
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def lcut_for_search(sentence, HMM=True):
    """Return the output of ``cut_for_search`` collected into a list."""
    return [*cut_for_search(sentence, HMM=HMM)]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def add_word(word, freq=None, tag=None):
    """Forward ``word``, ``freq`` and ``tag`` to ``dt.add_word`` and return its result."""
    result = dt.add_word(word, freq=freq, tag=tag)
    return result
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def del_word(word):
    """Forward ``word`` to ``dt.del_word`` and return its result."""
    result = dt.del_word(word)
    return result
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def load_userdict(f):
    """Forward ``f`` to ``dt.load_userdict`` and return its result."""
    result = dt.load_userdict(f)
    return result
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def suggest_freq(segment, tune=False):
    """Forward ``segment`` and ``tune`` to ``dt.suggest_freq`` and return its result."""
    suggested = dt.suggest_freq(segment, tune=tune)
    return suggested
|
|
@@ -0,0 +1,313 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import ctypes
|
|
4
|
+
import os
|
|
5
|
+
from enum import IntEnum
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from .token import Token
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class Mode(IntEnum):
    """Segmentation mode; its integer value is what gets passed to the native tokenizer."""

    # Default mode of Tokenizer.tokenize; chosen by cut() when cut_all is false.
    ACCURATE = 0
    # Chosen by cut() when cut_all is true.
    FULL = 1
    # Used by cut_for_search().
    SEARCH = 2
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class _NxConfig(ctypes.Structure):
    """ctypes view of the engine configuration struct handed to ``nx_engine_new``.

    Keep this layout byte-for-byte compatible with core/include/nexaloid.h:
    field order and field types must not be changed here alone.
    """

    _fields_ = [
        # Dictionary paths as byte strings (the tokenizer stores UTF-8 encoded paths).
        ("dict_path", ctypes.c_char_p),
        ("user_dict_path", ctypes.c_char_p),
        # 32-bit switches; this module never assigns them, so they keep the
        # zero value ctypes gives a freshly created structure.
        ("enable_hmm", ctypes.c_uint32),
        ("enable_normalization", ctypes.c_uint32),
        ("enable_plugins", ctypes.c_uint32),
        # Eight spare 32-bit slots.
        ("reserved", ctypes.c_uint32 * 8),
    ]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class _NxToken(ctypes.Structure):
    """ctypes view of one token record delivered to the tokenize callbacks.

    Token fields mirror NxToken; Python only wraps them into a dataclass.
    """

    _fields_ = [
        # Byte range of the token; used to slice the text passed to the callback.
        ("start_byte", ctypes.c_uint32),
        ("end_byte", ctypes.c_uint32),
        # Character range of the token, copied verbatim into the Python Token.
        ("start_char", ctypes.c_uint32),
        ("end_char", ctypes.c_uint32),
        ("word_id", ctypes.c_uint32),
        ("pos_id", ctypes.c_uint16),
        # Numeric origin code, translated to a name through _SOURCES.
        ("source", ctypes.c_uint16),
        ("flags", ctypes.c_uint16),
        ("score", ctypes.c_float),
    ]
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
# Prototype of the per-token callback given to nx_tokenize.  It returns
# nothing and receives: the token record, a pointer to the text bytes, the
# length of that text, and an opaque user-data pointer.
_CALLBACK = ctypes.CFUNCTYPE(
    None,  # no return value
    ctypes.POINTER(_NxToken),  # token record
    ctypes.POINTER(ctypes.c_char),  # text bytes
    ctypes.c_size_t,  # text length
    ctypes.c_void_p,  # user data
)

# Prototype of the per-token callback given to nx_tokenize_batch.  Same as
# _CALLBACK, preceded by the index of the input text the token belongs to.
_BATCH_CALLBACK = ctypes.CFUNCTYPE(
    None,  # no return value
    ctypes.c_uint32,  # index of the input text
    ctypes.POINTER(_NxToken),  # token record
    ctypes.POINTER(ctypes.c_char),  # text bytes
    ctypes.c_size_t,  # text length
    ctypes.c_void_p,  # user data
)
|
|
60
|
+
|
|
61
|
+
# Names for the numeric ``source`` code of a native token, numbered from 1 in
# the order listed.  Codes absent from this table are reported as "unknown"
# by the code that builds Token objects.
_SOURCES = dict(
    enumerate(
        (
            "base_dict",
            "user_dict",
            "domain_dict",
            "rule",
            "unknown",
            "plugin",
        ),
        start=1,
    )
)
|
|
69
|
+
|
|
70
|
+
# Default dictionary directory: "<ancestor>/data/dict", where <ancestor> is
# parents[4] of this file (presumably the repository root in a source
# checkout -- confirm against the project layout).  The index is clamped so a
# module located fewer than five directories deep no longer raises IndexError
# at import time; in that case the top-most ancestor is used, and the paths
# below simply do not exist unless a dictionary really lives there.
_MODULE_ANCESTORS = Path(__file__).resolve().parents
_DICT_DIR = _MODULE_ANCESTORS[min(4, len(_MODULE_ANCESTORS) - 1)] / "data" / "dict"
# Built dictionary, preferred when present.
_BUILT_DICT = _DICT_DIR / "nexaloid.nxdict"
# Text dictionary, used when the built one is absent.
_BUILT_TEXT_DICT = _DICT_DIR / "nexaloid.tsv"
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _resolve_dict_path(dict_path: str | os.PathLike[str] | None) -> Path:
    """Pick the base dictionary path.

    An explicit ``dict_path`` is returned as a ``Path`` without checking that
    it exists.  Otherwise the built dictionary ``_BUILT_DICT`` is returned when
    it exists on disk, and ``_BUILT_TEXT_DICT`` is returned in every other
    case (whether or not that file exists).
    """
    if dict_path is None:
        return _BUILT_DICT if _BUILT_DICT.exists() else _BUILT_TEXT_DICT
    return Path(dict_path)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _resolve_domain_dict_path(domain: str | None) -> Path | None:
    """Find the dictionary file for ``domain``.

    Returns ``None`` when no domain is requested.  Otherwise the directory is
    read from the ``NEXALOID_DOMAIN_DICT_DIR`` environment variable and the
    first existing file among ``<domain>.nxdict``, ``<domain>.tsv`` and
    ``<domain>.txt`` (tried in that order) is returned.

    Raises:
        ValueError: a domain was given but the environment variable is unset
            or empty.
        FileNotFoundError: none of the candidate files exists.
    """
    if domain is None:
        return None

    configured_dir = os.environ.get("NEXALOID_DOMAIN_DICT_DIR")
    if not configured_dir:
        raise ValueError("domain requires NEXALOID_DOMAIN_DICT_DIR")

    root = Path(configured_dir)
    # Lazy pipeline: candidates are probed in order, stopping at the first hit.
    candidates = (root / f"{domain}{ext}" for ext in (".nxdict", ".tsv", ".txt"))
    found = next((path for path in candidates if path.exists()), None)
    if found is None:
        raise FileNotFoundError(f"domain dictionary not found: {root / (domain + '.tsv')}")
    return found
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _load_lib() -> ctypes.CDLL:
    """Locate and load the native nexaloid shared library.

    Bindings are thin wrappers; the native shared library does all tokenization.

    Candidates are tried in order and the first path that exists is loaded:

    1. the path in the ``NEXALOID_LIB`` environment variable, when set;
    2. ``core/zig-out/bin/nexaloid.dll``, ``core/zig-out/lib/libnexaloid.so``
       and ``core/zig-out/lib/libnexaloid.dylib`` under ``parents[4]`` of this
       file (presumably the repository root of a source checkout).

    Returns:
        The loaded ``ctypes.CDLL``.

    Raises:
        RuntimeError: no candidate path exists.
        OSError: propagated from ``ctypes.CDLL`` when an existing candidate
            cannot be loaded.
    """
    candidates = []
    explicit = os.environ.get("NEXALOID_LIB")
    if explicit:
        candidates.append(Path(explicit))

    # Only probe the build tree when this file is deep enough for parents[4]
    # to exist.  Indexing unconditionally raised IndexError on shallow paths,
    # which hid the RuntimeError below and even prevented an existing
    # NEXALOID_LIB from being loaded.
    ancestors = Path(__file__).resolve().parents
    if len(ancestors) > 4:
        build_out = ancestors[4] / "core" / "zig-out"
        candidates += [
            build_out / "bin" / "nexaloid.dll",
            build_out / "lib" / "libnexaloid.so",
            build_out / "lib" / "libnexaloid.dylib",
        ]

    for path in candidates:
        if path.exists():
            return ctypes.CDLL(str(path))
    raise RuntimeError("nexaloid shared library not found; run `zig build` in core or set NEXALOID_LIB")
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
# Load the native library once at import time and declare the C signature of
# every entry point used by this module, so ctypes converts arguments and
# return values instead of falling back to its defaults.
_LIB = _load_lib()


def _declare(name, restype, *argtypes):
    """Set ``argtypes`` and then ``restype`` on the ``_LIB`` function ``name``."""
    func = getattr(_LIB, name)
    func.argtypes = list(argtypes)
    func.restype = restype


# Engine lifecycle: create from a config struct into an out-pointer; free a handle.
_declare(
    "nx_engine_new",
    ctypes.c_int,
    ctypes.POINTER(_NxConfig),
    ctypes.POINTER(ctypes.c_void_p),
)
_declare("nx_engine_free", None, ctypes.c_void_p)

# Single-text tokenization: handle, text bytes, byte length, mode, callback, user data.
_declare(
    "nx_tokenize",
    ctypes.c_int,
    ctypes.c_void_p,
    ctypes.c_char_p,
    ctypes.c_size_t,
    ctypes.c_int,
    _CALLBACK,
    ctypes.c_void_p,
)

# Batch tokenization: handle, array of texts, array of lengths, item count,
# mode, thread count, callback, user data.
_declare(
    "nx_tokenize_batch",
    ctypes.c_int,
    ctypes.c_void_p,
    ctypes.POINTER(ctypes.c_char_p),
    ctypes.POINTER(ctypes.c_size_t),
    ctypes.c_size_t,
    ctypes.c_int,
    ctypes.c_uint32,
    _BATCH_CALLBACK,
    ctypes.c_void_p,
)

# Dictionary updates.  For nx_add_word the callers in this file pass: handle,
# word bytes, byte length, 0, score, 0 -- the meaning of the uint32 and uint16
# arguments is not visible here (TODO confirm against the C header).
_declare(
    "nx_add_word",
    ctypes.c_int,
    ctypes.c_void_p,
    ctypes.c_char_p,
    ctypes.c_size_t,
    ctypes.c_uint32,
    ctypes.c_float,
    ctypes.c_uint16,
)
_declare("nx_reload_user_dict", ctypes.c_int, ctypes.c_void_p, ctypes.c_char_p)

# Status code -> message bytes.
_declare("nx_status_message", ctypes.c_char_p, ctypes.c_int)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
class NexaloidError(RuntimeError):
    """Raised when a native nexaloid call reports a non-zero status."""
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
class Tokenizer:
    """Python handle around one native nexaloid engine.

    All segmentation is performed by the native library through ``_LIB``;
    this class owns the engine handle, converts native token records into
    ``Token`` objects and keeps two pieces of Python-side state:

    * ``_words``   -- scores of words added through ``add_word``;
    * ``_deleted`` -- words whose tokens are dropped from results.

    The engine is released by ``close()``, on leaving a ``with`` block, or
    (best effort) on garbage collection.  Using a closed tokenizer raises
    ``NexaloidError`` instead of handing a freed handle to native code.
    """

    def __init__(
        self,
        dict_path: str | os.PathLike[str] | None = None,
        *,
        domain: str | None = None,
    ):
        """Create the native engine.

        Args:
            dict_path: Base dictionary path.  When omitted, the default
                chosen by ``_resolve_dict_path`` is used.  The path is only
                handed to the engine if it exists on disk.
            domain: Optional domain name resolved by
                ``_resolve_domain_dict_path``; the resulting file is passed
                to the engine as its user dictionary path.

        Raises:
            ValueError, FileNotFoundError: from ``_resolve_domain_dict_path``.
            NexaloidError: the native engine could not be created.
        """
        # Establish the Python-side state first so close() and __del__ are
        # safe even when anything below raises.
        self._engine = ctypes.c_void_p()
        self._closed = True
        self._words: dict[str, float] = {}
        self._deleted: set[str] = set()

        config = _NxConfig()
        resolved_dict = _resolve_dict_path(dict_path)
        resolved_domain_dict = _resolve_domain_dict_path(domain)
        if resolved_dict.exists():
            config.dict_path = str(resolved_dict).encode("utf-8")
        if resolved_domain_dict is not None:
            config.user_dict_path = str(resolved_domain_dict).encode("utf-8")
        self._check(_LIB.nx_engine_new(ctypes.byref(config), ctypes.byref(self._engine)))
        self._closed = False

    def __enter__(self) -> "Tokenizer":
        """Return ``self`` so the tokenizer can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        """Release the native engine when the ``with`` block ends."""
        self.close()

    def close(self) -> None:
        """Free the native engine; calling it again is a no-op."""
        if self._closed:
            return
        # Mark closed and drop the stored handle before freeing, so the freed
        # pointer can never be passed to native code again.
        self._closed = True
        engine, self._engine = self._engine, ctypes.c_void_p()
        _LIB.nx_engine_free(engine)

    def __del__(self) -> None:
        try:
            self.close()
        except Exception:
            # Deliberate best effort: finalizers may run during interpreter
            # shutdown, when module globals can already be torn down.
            pass

    def _handle(self) -> ctypes.c_void_p:
        """Return the live engine handle, or raise if the tokenizer is closed."""
        if self._closed:
            raise NexaloidError("tokenizer is closed")
        return self._engine

    def add_word(self, word: str, freq: float | None = None, tag: str | None = None) -> None:
        """Add ``word`` to the engine with score ``freq`` (10.0 when omitted).

        ``tag`` is accepted as part of the signature and is not used.
        """
        del tag
        score = float(freq if freq is not None else 10.0)
        self._add_word_score(word, score)

    def _add_word_score(self, word: str, score: float) -> None:
        """Register ``word`` natively, record its score and un-delete it."""
        data = word.encode("utf-8")
        self._check(_LIB.nx_add_word(self._handle(), data, len(data), 0, score, 0))
        self._words[word] = score
        self._deleted.discard(word)

    def del_word(self, word: str) -> None:
        """Hide ``word`` from results.

        This only records the word on the Python side: tokens whose text
        equals a deleted word are dropped from tokenize output.  No native
        call is made.
        """
        self._deleted.add(word)

    def load_userdict(self, path: str | os.PathLike[str]) -> None:
        """Ask the engine to reload its user dictionary from ``path``."""
        data = str(Path(path)).encode("utf-8")
        self._check(_LIB.nx_reload_user_dict(self._handle(), data))

    def suggest_freq(self, segment, tune: bool = False):
        """Return the recorded score of ``segment`` (0 if it was never added).

        ``segment`` may be a string, or a tuple/list of strings that is
        joined into one word.  With ``tune`` true, the word is first added
        with a score of 20.0.
        """
        # Lists are joined like tuples; previously a list was stringified
        # into its repr ("['a', 'b']") and looked up under that bogus key.
        if isinstance(segment, (tuple, list)):
            word = "".join(segment)
        else:
            word = str(segment)
        if tune:
            self.add_word(word, freq=20.0)
        return self._words.get(word, 0)

    def _build_token(self, token_ptr, text_ptr, text_len) -> Token | None:
        """Convert one native token record into a ``Token``.

        Returns ``None`` when the token text is in the deleted set.
        """
        # Copy text from the callback frame immediately; native memory is not retained.
        raw = ctypes.string_at(text_ptr, text_len)
        token = token_ptr.contents
        part = raw[token.start_byte : token.end_byte].decode("utf-8")
        if part in self._deleted:
            return None
        return Token(
            text=part,
            start_byte=token.start_byte,
            end_byte=token.end_byte,
            start_char=token.start_char,
            end_char=token.end_char,
            pos=None,
            source=_SOURCES.get(token.source, "unknown"),
            score=float(token.score),
        )

    def tokenize(self, text: str, mode: Mode = Mode.ACCURATE) -> list[Token]:
        """Tokenize ``text`` in ``mode`` and return the resulting tokens.

        Raises:
            NexaloidError: the tokenizer is closed or the native call fails.
        """
        data = text.encode("utf-8")
        out: list[Token] = []

        @_CALLBACK
        def on_token(token_ptr, text_ptr, text_len, user_data):
            del user_data
            token = self._build_token(token_ptr, text_ptr, text_len)
            if token is not None:
                out.append(token)

        self._check(_LIB.nx_tokenize(self._handle(), data, len(data), int(mode), on_token, None))
        return out

    def tokenize_batch(
        self,
        texts: list[str] | tuple[str, ...],
        mode: Mode = Mode.ACCURATE,
        thread_count: int = 0,
    ) -> list[list[Token]]:
        """Tokenize several texts in one native call.

        Args:
            texts: Input strings.
            mode: Segmentation mode applied to every text.
            thread_count: Forwarded to the native call; negative values are
                clamped to 0.

        Returns:
            One token list per input text, in input order.

        Raises:
            NexaloidError: the tokenizer is closed or the native call fails.
        """
        encoded = [text.encode("utf-8") for text in texts]
        text_array = (ctypes.c_char_p * len(encoded))(*encoded)
        len_array = (ctypes.c_size_t * len(encoded))(*(len(item) for item in encoded))
        out: list[list[Token]] = [[] for _ in encoded]

        @_BATCH_CALLBACK
        def on_token(index, token_ptr, text_ptr, text_len, user_data):
            del user_data
            # Core emits batch callbacks in input order after worker threads finish.
            token = self._build_token(token_ptr, text_ptr, text_len)
            if token is not None:
                out[index].append(token)

        self._check(
            _LIB.nx_tokenize_batch(
                self._handle(),
                text_array,
                len_array,
                len(encoded),
                int(mode),
                max(0, int(thread_count)),
                on_token,
                None,
            )
        )
        return out

    def cut(self, text: str, cut_all: bool = False, HMM: bool = True):
        """Yield the token texts of ``text``.

        Uses ``Mode.FULL`` when ``cut_all`` is true and ``Mode.ACCURATE``
        otherwise.  ``HMM`` is accepted as part of the signature and is not
        used.  This is a generator, so no work happens until it is iterated.
        """
        del HMM
        mode = Mode.FULL if cut_all else Mode.ACCURATE
        for token in self.tokenize(text, mode):
            yield token.text

    def lcut(self, text: str, cut_all: bool = False, HMM: bool = True) -> list[str]:
        """Return the output of ``cut`` collected into a list."""
        return list(self.cut(text, cut_all=cut_all, HMM=HMM))

    def cut_for_search(self, text: str, HMM: bool = True):
        """Yield distinct multi-character token texts from ``Mode.SEARCH``.

        Tokens of length 0 or 1 are skipped and each text is yielded only the
        first time it occurs.  ``HMM`` is accepted as part of the signature
        and is not used.
        """
        del HMM
        seen: set[str] = set()
        for token in self.tokenize(text, Mode.SEARCH):
            if len(token.text) <= 1:
                continue
            if token.text not in seen:
                seen.add(token.text)
                yield token.text

    def _check(self, status: int) -> None:
        """Raise ``NexaloidError`` when ``status`` is non-zero."""
        if status == 0:
            return
        raw = _LIB.nx_status_message(status)
        # A c_char_p result is None when the library returns NULL; fall back
        # to the numeric code instead of failing with AttributeError.
        if raw is None:
            msg = f"nexaloid error (status {status})"
        else:
            msg = raw.decode("utf-8", "replace")
        raise NexaloidError(msg)
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
pyproject.toml
|
|
2
|
+
src/nexaloid/__init__.py
|
|
3
|
+
src/nexaloid/compat_jieba.py
|
|
4
|
+
src/nexaloid/token.py
|
|
5
|
+
src/nexaloid/tokenizer.py
|
|
6
|
+
src/nexaloid.egg-info/PKG-INFO
|
|
7
|
+
src/nexaloid.egg-info/SOURCES.txt
|
|
8
|
+
src/nexaloid.egg-info/dependency_links.txt
|
|
9
|
+
src/nexaloid.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
nexaloid
|