khmerns-0.0.3-cp314-cp314-win32.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
include/gguf.h ADDED
@@ -0,0 +1,202 @@
+ // This file contains functionality related to "GGUF" files, the binary file format used by ggml.
+ // GGUF files have the following structure:
+ //
+ // 1. File magic "GGUF" (4 bytes).
+ // 2. File version (uint32_t).
+ // 3. Number of ggml tensors in file (int64_t).
+ // 4. Number of key-value-pairs in file (int64_t).
+ // 5. For each KV pair:
+ //    1. The key (string).
+ //    2. The value type (gguf_type).
+ //    3a. If the value type is GGUF_TYPE_ARRAY:
+ //       1. The type of the array (gguf_type).
+ //       2. The number of elements in the array (uint64_t).
+ //       3. The binary representation of each element in the array.
+ //    3b. Otherwise:
+ //       1. The binary representation of the value.
+ // 6. For each ggml tensor:
+ //    1. The tensor name (string).
+ //    2. The number of dimensions of the tensor (uint32_t).
+ //    3. For each dimension:
+ //       1. The size of the tensor in the dimension (int64_t).
+ //    4. The tensor data type (ggml_type).
+ //    5. The tensor data offset in the tensor data binary blob (uint64_t).
+ // 7. The tensor data binary blob (optional, aligned).
+ //
+ // Strings are serialized as the string length (uint64_t) followed by the C string without the null terminator.
+ // All enums are stored as int32_t.
+ // All bool values are stored as int8_t.
+ // If the special key "general.alignment" (uint32_t) is defined it is used for alignment,
+ // otherwise GGUF_DEFAULT_ALIGNMENT is used.
+ //
+ // Module maintainer: Johannes Gäßler (@JohannesGaessler, johannesg@5d6.de)
+
+ #pragma once
+
+ #include "ggml.h"
+
+ #include <stdbool.h>
+ #include <stdint.h>
+
+ #define GGUF_MAGIC "GGUF"
+ #define GGUF_VERSION 3
+
+ #define GGUF_KEY_GENERAL_ALIGNMENT "general.alignment"
+
+ #define GGUF_DEFAULT_ALIGNMENT 32
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+ // types that can be stored as GGUF KV data
+ enum gguf_type {
+     GGUF_TYPE_UINT8 = 0,
+     GGUF_TYPE_INT8 = 1,
+     GGUF_TYPE_UINT16 = 2,
+     GGUF_TYPE_INT16 = 3,
+     GGUF_TYPE_UINT32 = 4,
+     GGUF_TYPE_INT32 = 5,
+     GGUF_TYPE_FLOAT32 = 6,
+     GGUF_TYPE_BOOL = 7,
+     GGUF_TYPE_STRING = 8,
+     GGUF_TYPE_ARRAY = 9,
+     GGUF_TYPE_UINT64 = 10,
+     GGUF_TYPE_INT64 = 11,
+     GGUF_TYPE_FLOAT64 = 12,
+     GGUF_TYPE_COUNT, // marks the end of the enum
+ };
+
+ struct gguf_context;
+
+ struct gguf_init_params {
+     bool no_alloc;
+
+     // if not NULL, create a ggml_context and allocate the tensor data in it
+     struct ggml_context ** ctx;
+ };
+
+ GGML_API struct gguf_context * gguf_init_empty(void);
+ GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
+ //GGML_API struct gguf_context * gguf_init_from_buffer(..);
+
+ GGML_API void gguf_free(struct gguf_context * ctx);
+
+ GGML_API const char * gguf_type_name(enum gguf_type type);
+
+ GGML_API uint32_t gguf_get_version (const struct gguf_context * ctx);
+ GGML_API size_t gguf_get_alignment (const struct gguf_context * ctx);
+ GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx);
+
+ GGML_API int64_t gguf_get_n_kv(const struct gguf_context * ctx);
+ GGML_API int64_t gguf_find_key(const struct gguf_context * ctx, const char * key); // returns -1 if key is not found
+ GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int64_t key_id);
+
+ GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int64_t key_id);
+ GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id);
+
+ // will abort if the wrong type is used for the key
+ GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int64_t key_id);
+ GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int64_t key_id);
+ GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int64_t key_id);
+ GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int64_t key_id);
+ GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int64_t key_id);
+ GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int64_t key_id);
+ GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int64_t key_id);
+ GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int64_t key_id);
+ GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int64_t key_id);
+ GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int64_t key_id);
+ GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int64_t key_id);
+ GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int64_t key_id);
+ GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int64_t key_id);
+ GGML_API size_t gguf_get_arr_n (const struct gguf_context * ctx, int64_t key_id);
+
+ // get raw pointer to the first element of the array with the given key_id
+ // for bool arrays, note that they are always stored as int8 on all platforms (usually this makes no difference)
+ GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int64_t key_id);
+
+ // get ith C string from array with given key_id
+ GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int64_t key_id, size_t i);
+
+ GGML_API int64_t gguf_get_n_tensors (const struct gguf_context * ctx);
+ GGML_API int64_t gguf_find_tensor (const struct gguf_context * ctx, const char * name); // returns -1 if the tensor is not found
+ GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int64_t tensor_id);
+ GGML_API const char * gguf_get_tensor_name (const struct gguf_context * ctx, int64_t tensor_id);
+ GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int64_t tensor_id);
+ GGML_API size_t gguf_get_tensor_size (const struct gguf_context * ctx, int64_t tensor_id);
+
+ // removes key if it exists, returns id that the key had prior to removal (-1 if it didn't exist)
+ GGML_API int64_t gguf_remove_key(struct gguf_context * ctx, const char * key);
+
+ // overrides an existing KV pair or adds a new one, the new KV pair is always at the back
+ GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
+ GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val);
+ GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
+ GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t val);
+ GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
+ GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val);
+ GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val);
+ GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val);
+ GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val);
+ GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val);
+ GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val);
+ GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
+
+ // creates a new array with n elements of the given type and copies the corresponding number of bytes from data
+ GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, size_t n);
+
+ // creates a new array with n strings and copies the corresponding strings from data
+ GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, size_t n);
+
+ // set or add KV pairs from another context
+ GGML_API void gguf_set_kv(struct gguf_context * ctx, const struct gguf_context * src);
+
+ // add tensor to GGUF context, tensor name must be unique
+ GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
+
+ // after changing a tensor's type, the offsets of all tensors with higher indices are immediately recalculated
+ // in such a way that the tensor data remains as one contiguous block (except for padding)
+ GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
+
+ // assumes that at least gguf_get_tensor_size bytes can be read from data
+ GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data);
+
+ // writing gguf files can be done in 3 ways:
+ //
+ // - write the entire gguf_context to a binary file in a single pass:
+ //
+ //   gguf_write_to_file(ctx, fname, /*only_meta =*/ false);
+ //
+ // - write only the meta data to a file, then re-open the file and append the tensor data:
+ //
+ //   gguf_write_to_file(ctx, fname, /*only_meta =*/ true);
+ //   FILE * f = fopen(fname, "ab");
+ //   fwrite(f, ...); // write tensor data
+ //   fclose(f);
+ //
+ // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
+ //
+ //   FILE * f = fopen(fname, "wb");
+ //   const size_t size_meta = gguf_get_meta_size(ctx);
+ //   fseek(f, size_meta, SEEK_SET);
+ //   fwrite(f, ...); // write tensor data
+ //   void * data = malloc(size_meta);
+ //   gguf_get_meta_data(ctx, data);
+ //   rewind(f);
+ //   fwrite(data, 1, size_meta, f);
+ //   free(data);
+ //   fclose(f);
+ //
+
+ // write the entire context to a binary file
+ GGML_API bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);
+
+ // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
+ GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
+
+ // writes the meta data to pointer "data"
+ GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);
+
+ #ifdef __cplusplus
+ }
+ #endif
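
Editor's note: the comment block at the top of this header fully specifies the on-disk layout, so the fixed-size prefix of a GGUF file can be inspected without the C API. A minimal sketch (not part of the package) that reads the magic, version, tensor count, and KV count with the Python standard library, assuming a little-endian file and a hypothetical `model.gguf` path:

```python
import struct

def read_gguf_header(path: str) -> dict:
    """Read only the fixed-size GGUF prefix described in gguf.h."""
    with open(path, "rb") as f:
        magic = f.read(4)                              # 1. file magic "GGUF"
        if magic != b"GGUF":
            raise ValueError("not a GGUF file")
        (version,)   = struct.unpack("<I", f.read(4))  # 2. uint32_t version
        (n_tensors,) = struct.unpack("<q", f.read(8))  # 3. int64_t tensor count
        (n_kv,)      = struct.unpack("<q", f.read(8))  # 4. int64_t KV-pair count
    return {"version": version, "n_tensors": n_tensors, "n_kv": n_kv}

print(read_gguf_header("model.gguf"))
```
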
khmerns/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from ._core import KhmerSegmenter, tokenize, __version__
+
+ __all__ = ["KhmerSegmenter", "tokenize", "__version__"]
khmerns/__init__.pyi ADDED
@@ -0,0 +1,16 @@
+ from typing import List
+
+ __version__: str
+
+ class KhmerSegmenter:
+     def __init__(self) -> None: ...
+     def tokenize(self, text: str) -> List[str]:
+         """Segment Khmer text and return a list of words."""
+         ...
+     def __call__(self, text: str) -> List[str]:
+         """Segment Khmer text and return a list of words."""
+         ...
+
+ def tokenize(text: str) -> List[str]:
+     """Segment Khmer text and return a list of words."""
+     ...
Binary file
khmerns-0.0.3.dist-info/METADATA ADDED
@@ -0,0 +1,121 @@
+ Metadata-Version: 2.4
+ Name: khmerns
+ Version: 0.0.3
+ Summary: Khmer Neural Segmenter
+ Keywords: khmer,nlp,segmentation,tokenization,neural-network
+ Author-Email: Seanghay Yath <seanghay.dev@gmail.com>
+ License-Expression: MIT
+ License-File: LICENSE
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Programming Language :: Python :: 3 :: Only
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Classifier: Programming Language :: Python :: 3.14
+ Project-URL: Homepage, https://github.com/seanghay/khmer-neural-segmenter
+ Project-URL: Repository, https://github.com/seanghay/khmer-neural-segmenter
+ Project-URL: Issues, https://github.com/seanghay/khmer-neural-segmenter/issues
+ Requires-Python: >=3.9
+ Description-Content-Type: text/markdown
+
+ # Khmer Neural Segmenter
+
+ A fast Khmer word segmentation library.
+
+ <img src="img/graph.png" alt="" width=500>
+
+ ## Installation
+
+ ```
+ pip install khmerns
+ ```
+
+ ## Usage
+
+ ```python
+ from khmerns import tokenize
+
+ # Returns a list of words
+ words = tokenize("សួស្តីបងប្អូន")
+ # ['សួស្តី', 'បង', 'ប្អូន']
+ ```
+
+ You can also use the class-based API if you prefer:
+
+ ```python
+ from khmerns import KhmerSegmenter
+
+ segmenter = KhmerSegmenter()
+ words = segmenter.tokenize("សួស្តីបងប្អូន")
+ # or
+ words = segmenter("សួស្តីបងប្អូន")
+ ```
+
+ ## Training
+
+ The training pipeline lives in the `training/` directory. It trains a BiGRU + CRF model on character-level BIO tags, then converts the result to GGUF for the C++ inference backend.
+
+ ### Data format
+
+ Training data is a plain text file at `training/data/train.txt`. One word per line. Words that appear on consecutive lines are treated as part of the same sentence. The model learns word boundaries from this.
+
+ Example `training/data/train.txt`:
+
+ ```
+ សួស្តី
+ បង
+ ប្អូន
+ ខ្ញុំ
+ ទៅ
+ ផ្សារ
+ ```
+
+ Non-Khmer tokens (spaces, punctuation, numbers, Latin text) are tagged as `NON-KHMER`. Khmer tokens get `B-WORD` on the first character and `I-WORD` on the rest.
+
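+ As an illustration of this tagging scheme (a sketch following the description above, not the package's actual preprocessing code; `words_to_bio` is a hypothetical helper):
+
+ ```python
+ def words_to_bio(words):
+     """Expand a list of words into per-character BIO tags."""
+     chars, tags = [], []
+     for word in words:
+         is_khmer = any("\u1780" <= ch <= "\u17ff" for ch in word)
+         for i, ch in enumerate(word):
+             chars.append(ch)
+             if not is_khmer:
+                 tags.append("NON-KHMER")
+             else:
+                 tags.append("B-WORD" if i == 0 else "I-WORD")
+     return chars, tags
+
+ chars, tags = words_to_bio(["សួស្តី", " ", "បង"])
+ # tags: ['B-WORD', 'I-WORD', ..., 'NON-KHMER', 'B-WORD', 'I-WORD']
+ ```
+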
+ ### Steps
+
+ ```bash
+ cd training
+ pip install -r requirements.txt
+ ```
+
+ **1. Prepare training data**
+
+ Place your segmented text in `data/train.txt` (one word per line). If you have raw unsegmented Khmer text, you can use the generation script to pre-segment it:
+
+ ```bash
+ python generate.py
+ ```
+
+ This requires `khmersegment` and a source text file. Edit the path in `generate.py` to point to your raw text.
+
+ **2. Train**
+
+ ```bash
+ python train.py
+ ```
+
+ Trains for 20 epochs with AdamW (lr=1e-5) and ReduceLROnPlateau. Saves `best_model.pt` (best eval loss) and `model.pt` (final). Uses CUDA if available.
+
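+ Schematically, this stage corresponds to a standard PyTorch loop like the one below (a sketch of the described configuration, not the actual `train.py`; `model`, `train_loader`, and `evaluate` are hypothetical stand-ins):
+
+ ```python
+ import torch
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ model = model.to(device)
+ optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
+ scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)
+
+ best_eval = float("inf")
+ for epoch in range(20):
+     model.train()
+     for batch in train_loader:
+         batch = {k: v.to(device) for k, v in batch.items()}
+         optimizer.zero_grad()
+         loss = model(**batch)          # BiGRU + CRF negative log-likelihood
+         loss.backward()
+         optimizer.step()
+
+     eval_loss = evaluate(model)        # held-out loss drives the scheduler
+     scheduler.step(eval_loss)
+     if eval_loss < best_eval:
+         best_eval = eval_loss
+         torch.save(model.state_dict(), "best_model.pt")
+
+ torch.save(model.state_dict(), "model.pt")
+ ```
+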
+ **3. Convert to GGUF**
+
+ ```bash
+ python convert_to_gguf.py best_model.pt model.gguf
+ ```
+
+ This produces a GGUF file (~3.3MB) containing all model weights.
+
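+ Conceptually, the conversion walks the PyTorch state dict and writes each tensor into a GGUF file. A minimal sketch of that idea using the `gguf` pip package's `GGUFWriter` (an assumed interface here; the real `convert_to_gguf.py` may differ, and the architecture name is illustrative):
+
+ ```python
+ import torch
+ from gguf import GGUFWriter  # pip install gguf
+
+ state = torch.load("best_model.pt", map_location="cpu")
+ writer = GGUFWriter("model.gguf", "khmerns")
+
+ for name, tensor in state.items():
+     writer.add_tensor(name, tensor.to(torch.float32).numpy())
+
+ writer.write_header_to_file()
+ writer.write_kv_data_to_file()
+ writer.write_tensors_to_file()
+ writer.close()
+ ```
+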
+ **4. Embed in the C++ binary**
+
+ To use the new model in the library, convert the GGUF file to a C header and replace `src/model_data.h`, then rebuild:
+
+ ```bash
+ xxd -i model.gguf > ../src/model_data.h
+ pip install -e ..
+ ```
+
+ ## License
+
+ MIT
khmerns-0.0.3.dist-info/RECORD ADDED
@@ -0,0 +1,29 @@
+ include/ggml-alloc.h,sha256=q5pVcoR6mOwx7lf-IC84IWvgHaPmPjYYPX2LnzR1ag8,3709
+ include/ggml-backend.h,sha256=ATnc9x8vL0X51r3r5XFpz0hBRGAtv6uHHL4Gta0NaRQ,21426
+ include/ggml-blas.h,sha256=XGbWYj60VJBBaYGwpd8u6REiwJLQx5wg4SE5-fM4M4E,611
+ include/ggml-cann.h,sha256=dc2nq-Z7-_9UZbJln2t1o6maduJqgCe1pCJPQ1ziMOg,4676
+ include/ggml-cpp.h,sha256=7tG7ys4Uqvre6aJ7t0CgofIDzmWilcv1M8fn3vD6yd4,1691
+ include/ggml-cpu.h,sha256=e4riopzIzfZq0rK7V8PEchNsSPc2wr_F678kBUJPOqk,7674
+ include/ggml-cuda.h,sha256=qqFiJffdn4w6MVhOOw6g0Uo54GFnmZsjBWYZLSvNxZ0,1632
+ include/ggml-metal.h,sha256=PPz5g2PKsENZLJEE-JuSugbHlChrzomGSHQ7RX4VXms,2202
+ include/ggml-opt.h,sha256=qTpDaSWZqnIi8JKcMKEZNhXmFPJuCclY6zMy8B56nw0,14295
+ include/ggml-rpc.h,sha256=jfGW1PTGF_rq85edNxWfm_LB4KtXJ_DIZn6n4TqlrBo,1086
+ include/ggml-sycl.h,sha256=WJW-Sk6poGXcMmXuwPCcmpJG3fJy5uyx4bB12oNHrKo,1812
+ include/ggml-virtgpu.h,sha256=L3KdW5jwhgIHee2FgiAOucBH09Tgz_GzPB_pV1mYreE,266
+ include/ggml-vulkan.h,sha256=C29iUCGwR5B9JLYLPKRB6Kl9RCNfDTKr6yZlESw1QY4,981
+ include/ggml-webgpu.h,sha256=YoxXN2KYJOwzJmIKXESA3olVy_gsNSvu2ORuAlFPs5I,347
+ include/ggml-zendnn.h,sha256=rlX7HjkqaAory2NoxpQSVZgPjEnZHZ6dKjQAOLK_8lE,520
+ include/ggml.h,sha256=8AuUtoOyhyNaf65atAKT1NBO-FrquzEXbYB-2VYjC6I,106093
+ include/gguf.h,sha256=dGmizdUtMG2dBzbEiu0ebZfF2Abz5smJuCXuEpMFToc,10426
+ khmerns/__init__.py,sha256=56VwZ3fseBKsGGMhVtE1EI1Tqi4uhigZ7OdRKf6MAKE,117
+ khmerns/__init__.pyi,sha256=5Ywjkugs1j6YWPS6bkFFTl077E9mb9-fSXPgu-kQhbw,437
+ khmerns/_core.cp314-win32.pyd,sha256=yuCZ1Xjmkv6CyPwLXKNxUIiJuNUsRlsHA_8NyDg6Okk,3675648
+ lib/cmake/ggml/ggml-config.cmake,sha256=OPnz2F8SEuCgmqv8vJc3xlADN6NkgootaXOIKZ9uQ_Y,12170
+ lib/cmake/ggml/ggml-version.cmake,sha256=cmpuq1NZlHRY3WdA7y81233b6KomdObooai5LL4-Dyc,2827
+ lib/ggml-base.lib,sha256=OmtZI_juR08w9OXQcBR1ZuTOk-eSRo-2ds0nYLvXr8Y,1217308
+ lib/ggml-cpu.lib,sha256=-djxu2q6TZUH8HBwq-v6oxrfjY7f5W4rYaUgosUEKlU,1130366
+ lib/ggml.lib,sha256=ktfyHQWCPnA0cBlPMbnY4M3IeIJmbQr6hzrliPV2gdw,190804
+ khmerns-0.0.3.dist-info/METADATA,sha256=NI0Lq5nTQGzEEI5prGscnoTSKg2WFwkoL1QbDhCEAaI,3263
+ khmerns-0.0.3.dist-info/WHEEL,sha256=xwLkizf5NInhysT1HPChN8-LXYl1drA7qIdg1OQtfC8,102
+ khmerns-0.0.3.dist-info/licenses/LICENSE,sha256=NJbBwbQTQpJwxvCeUDhqpUe3HKE3geRtx5iqIlQ5q0c,1089
+ khmerns-0.0.3.dist-info/RECORD,,
khmerns-0.0.3.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: scikit-build-core 0.11.6
+ Root-Is-Purelib: false
+ Tag: cp314-cp314-win32
+
khmerns-0.0.3.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2026 Seanghay Yath
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.