netra-nmt 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- netra_nmt-0.1.0/.gitignore +259 -0
- netra_nmt-0.1.0/08_test_tokenizer.py +11 -0
- netra_nmt-0.1.0/LICENSE +21 -0
- netra_nmt-0.1.0/PKG-INFO +217 -0
- netra_nmt-0.1.0/README.md +183 -0
- netra_nmt-0.1.0/assets/Asset 4@Bag.png +0 -0
- netra_nmt-0.1.0/assets/Asset 5@Bag_black.png +0 -0
- netra_nmt-0.1.0/assets/bertscore_benchmark.png +0 -0
- netra_nmt-0.1.0/assets/chrf_benchmark.png +0 -0
- netra_nmt-0.1.0/assets/inference_benchmark.png +0 -0
- netra_nmt-0.1.0/assets/logo.png +0 -0
- netra_nmt-0.1.0/assets/metrics_benchmark.png +0 -0
- netra_nmt-0.1.0/assets/model_architecture.png +0 -0
- netra_nmt-0.1.0/assets/wordmark.png +0 -0
- netra_nmt-0.1.0/netra_architecture.png +0 -0
- netra_nmt-0.1.0/netra_nmt/__init__.py +29 -0
- netra_nmt-0.1.0/netra_nmt/assets/spm_32k.model +0 -0
- netra_nmt-0.1.0/netra_nmt/cli.py +209 -0
- netra_nmt-0.1.0/netra_nmt/config.py +62 -0
- netra_nmt-0.1.0/netra_nmt/decoding.py +174 -0
- netra_nmt-0.1.0/netra_nmt/model.py +383 -0
- netra_nmt-0.1.0/netra_nmt/server.py +133 -0
- netra_nmt-0.1.0/netra_nmt/static/index.html +328 -0
- netra_nmt-0.1.0/netra_nmt/static/netra-wordmark-orange.png +0 -0
- netra_nmt-0.1.0/netra_nmt/static/netra-wordmark.png +0 -0
- netra_nmt-0.1.0/netra_nmt/translator.py +193 -0
- netra_nmt-0.1.0/netra_nmt/weights.py +70 -0
- netra_nmt-0.1.0/pyproject.toml +57 -0
- netra_nmt-0.1.0/requirements.txt +16 -0
- netra_nmt-0.1.0/results/eval_epoch03.json +86 -0
- netra_nmt-0.1.0/results/eval_results_alt.json +43 -0
- netra_nmt-0.1.0/results/eval_results_lyfeyvutha_350M_alt.json +43 -0
- netra_nmt-0.1.0/results/eval_results_madlad3b_alt.json +44 -0
- netra_nmt-0.1.0/results/eval_results_nllb1.3B_alt.json +43 -0
- netra_nmt-0.1.0/results/eval_results_nllb200_alt.json +43 -0
- netra_nmt-0.1.0/results/eval_results_rinabuoy.json +43 -0
- netra_nmt-0.1.0/results/google_translate_alt.json +41 -0
- netra_nmt-0.1.0/results/train_log.jsonl +1992 -0
- netra_nmt-0.1.0/results/train_log_finetune.jsonl +38 -0
- netra_nmt-0.1.0/scripts/assets/metrics_benchmark.png +0 -0
- netra_nmt-0.1.0/scripts/benchmark_metrics.ipynb +395 -0
- netra_nmt-0.1.0/scripts/compare_models.ipynb +819 -0
- netra_nmt-0.1.0/scripts/compare_models.py +315 -0
- netra_nmt-0.1.0/scripts/export_checkpoint.py +167 -0
- netra_nmt-0.1.0/scripts/visualize_architecture.py +378 -0
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[codz]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
share/python-wheels/
|
|
24
|
+
*.egg-info/
|
|
25
|
+
.installed.cfg
|
|
26
|
+
*.egg
|
|
27
|
+
MANIFEST
|
|
28
|
+
|
|
29
|
+
# PyInstaller
|
|
30
|
+
# Usually these files are written by a python script from a template
|
|
31
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
32
|
+
*.manifest
|
|
33
|
+
*.spec
|
|
34
|
+
|
|
35
|
+
# Installer logs
|
|
36
|
+
pip-log.txt
|
|
37
|
+
pip-delete-this-directory.txt
|
|
38
|
+
|
|
39
|
+
# Unit test / coverage reports
|
|
40
|
+
htmlcov/
|
|
41
|
+
.tox/
|
|
42
|
+
.nox/
|
|
43
|
+
.coverage
|
|
44
|
+
.coverage.*
|
|
45
|
+
.cache
|
|
46
|
+
nosetests.xml
|
|
47
|
+
coverage.xml
|
|
48
|
+
*.cover
|
|
49
|
+
*.py.cover
|
|
50
|
+
.hypothesis/
|
|
51
|
+
.pytest_cache/
|
|
52
|
+
cover/
|
|
53
|
+
|
|
54
|
+
# Translations
|
|
55
|
+
*.mo
|
|
56
|
+
*.pot
|
|
57
|
+
|
|
58
|
+
# Django stuff:
|
|
59
|
+
*.log
|
|
60
|
+
local_settings.py
|
|
61
|
+
db.sqlite3
|
|
62
|
+
db.sqlite3-journal
|
|
63
|
+
|
|
64
|
+
# Flask stuff:
|
|
65
|
+
instance/
|
|
66
|
+
.webassets-cache
|
|
67
|
+
|
|
68
|
+
# Scrapy stuff:
|
|
69
|
+
.scrapy
|
|
70
|
+
|
|
71
|
+
# Sphinx documentation
|
|
72
|
+
docs/_build/
|
|
73
|
+
|
|
74
|
+
# PyBuilder
|
|
75
|
+
.pybuilder/
|
|
76
|
+
target/
|
|
77
|
+
|
|
78
|
+
# Jupyter Notebook
|
|
79
|
+
.ipynb_checkpoints
|
|
80
|
+
|
|
81
|
+
# IPython
|
|
82
|
+
profile_default/
|
|
83
|
+
ipython_config.py
|
|
84
|
+
|
|
85
|
+
# pyenv
|
|
86
|
+
# For a library or package, you might want to ignore these files since the code is
|
|
87
|
+
# intended to run in multiple environments; otherwise, check them in:
|
|
88
|
+
# .python-version
|
|
89
|
+
|
|
90
|
+
# pipenv
|
|
91
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
92
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
93
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
94
|
+
# install all needed dependencies.
|
|
95
|
+
# Pipfile.lock
|
|
96
|
+
|
|
97
|
+
# UV
|
|
98
|
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
|
99
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
100
|
+
# commonly ignored for libraries.
|
|
101
|
+
# uv.lock
|
|
102
|
+
|
|
103
|
+
# poetry
|
|
104
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
105
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
106
|
+
# commonly ignored for libraries.
|
|
107
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
|
108
|
+
# poetry.lock
|
|
109
|
+
# poetry.toml
|
|
110
|
+
|
|
111
|
+
# pdm
|
|
112
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
113
|
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
|
114
|
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
|
115
|
+
# pdm.lock
|
|
116
|
+
# pdm.toml
|
|
117
|
+
.pdm-python
|
|
118
|
+
.pdm-build/
|
|
119
|
+
|
|
120
|
+
# pixi
|
|
121
|
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
|
122
|
+
# pixi.lock
|
|
123
|
+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
|
124
|
+
# in the .venv directory. It is recommended not to include this directory in version control.
|
|
125
|
+
.pixi
|
|
126
|
+
|
|
127
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
|
128
|
+
__pypackages__/
|
|
129
|
+
|
|
130
|
+
# Celery stuff
|
|
131
|
+
celerybeat-schedule
|
|
132
|
+
celerybeat.pid
|
|
133
|
+
|
|
134
|
+
# Redis
|
|
135
|
+
*.rdb
|
|
136
|
+
*.aof
|
|
137
|
+
*.pid
|
|
138
|
+
|
|
139
|
+
# RabbitMQ
|
|
140
|
+
mnesia/
|
|
141
|
+
rabbitmq/
|
|
142
|
+
rabbitmq-data/
|
|
143
|
+
|
|
144
|
+
# ActiveMQ
|
|
145
|
+
activemq-data/
|
|
146
|
+
|
|
147
|
+
# SageMath parsed files
|
|
148
|
+
*.sage.py
|
|
149
|
+
|
|
150
|
+
# Environments
|
|
151
|
+
.myenv
|
|
152
|
+
.env
|
|
153
|
+
.envrc
|
|
154
|
+
.venv
|
|
155
|
+
env/
|
|
156
|
+
venv/
|
|
157
|
+
ENV/
|
|
158
|
+
env.bak/
|
|
159
|
+
venv.bak/
|
|
160
|
+
myenv/
|
|
161
|
+
./myenv/
|
|
162
|
+
./myenv
|
|
163
|
+
|
|
164
|
+
# Spyder project settings
|
|
165
|
+
.spyderproject
|
|
166
|
+
.spyproject
|
|
167
|
+
|
|
168
|
+
# Rope project settings
|
|
169
|
+
.ropeproject
|
|
170
|
+
|
|
171
|
+
# mkdocs documentation
|
|
172
|
+
/site
|
|
173
|
+
|
|
174
|
+
# mypy
|
|
175
|
+
.mypy_cache/
|
|
176
|
+
.dmypy.json
|
|
177
|
+
dmypy.json
|
|
178
|
+
|
|
179
|
+
# Pyre type checker
|
|
180
|
+
.pyre/
|
|
181
|
+
|
|
182
|
+
# pytype static type analyzer
|
|
183
|
+
.pytype/
|
|
184
|
+
|
|
185
|
+
# Cython debug symbols
|
|
186
|
+
cython_debug/
|
|
187
|
+
|
|
188
|
+
# PyCharm
|
|
189
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
|
190
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
|
191
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
192
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
|
193
|
+
# .idea/
|
|
194
|
+
|
|
195
|
+
# Abstra
|
|
196
|
+
# Abstra is an AI-powered process automation framework.
|
|
197
|
+
# Ignore directories containing user credentials, local state, and settings.
|
|
198
|
+
# Learn more at https://abstra.io/docs
|
|
199
|
+
.abstra/
|
|
200
|
+
|
|
201
|
+
# Visual Studio Code
|
|
202
|
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
|
203
|
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
|
204
|
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
|
205
|
+
# you could uncomment the following to ignore the entire vscode folder
|
|
206
|
+
# .vscode/
|
|
207
|
+
# Temporary file for partial code execution
|
|
208
|
+
tempCodeRunnerFile.py
|
|
209
|
+
|
|
210
|
+
# Ruff stuff:
|
|
211
|
+
.ruff_cache/
|
|
212
|
+
|
|
213
|
+
# PyPI configuration file
|
|
214
|
+
.pypirc
|
|
215
|
+
|
|
216
|
+
# Marimo
|
|
217
|
+
marimo/_static/
|
|
218
|
+
marimo/_lsp/
|
|
219
|
+
__marimo__/
|
|
220
|
+
|
|
221
|
+
# Streamlit
|
|
222
|
+
.streamlit/secrets.toml
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
./data
|
|
226
|
+
data/
|
|
227
|
+
data
|
|
228
|
+
corpus.txt
|
|
229
|
+
|
|
230
|
+
./checkpoints
|
|
231
|
+
checkpoints/
|
|
232
|
+
checkpoints
|
|
233
|
+
|
|
234
|
+
checkpoints_distill
|
|
235
|
+
|
|
236
|
+
checkpoints_human_finetune
|
|
237
|
+
|
|
238
|
+
checkpoints_finetune/
|
|
239
|
+
|
|
240
|
+
archived/
|
|
241
|
+
|
|
242
|
+
tokenizer/corpus.txt
|
|
243
|
+
tokenizer/*.model
|
|
244
|
+
tokenizer/*.vocab
|
|
245
|
+
# netra-nmt release export (weights go to Hugging Face Hub)
|
|
246
|
+
export/
|
|
247
|
+
|
|
248
|
+
DESIGN_SYSTEM.md
|
|
249
|
+
# Standalone Streamlit Space (not part of the package; deployed separately)
|
|
250
|
+
streamlit_space/
|
|
251
|
+
|
|
252
|
+
# Benchmark scripts and generated chart / result images (local, not versioned)
|
|
253
|
+
scripts/benchmark_speed*.py
|
|
254
|
+
scripts/chart_speed.py
|
|
255
|
+
results/benchmark_speed*.json
|
|
256
|
+
|
|
257
|
+
scripts/training/
|
|
258
|
+
|
|
259
|
+
test.py
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import sentencepiece as spm
|
|
2
|
+
|
|
3
|
+
model = spm.SentencePieceProcessor()
|
|
4
|
+
|
|
5
|
+
model.load("tokenizer/spm_32k.model")
|
|
6
|
+
|
|
7
|
+
text = "ខ្ញុំទៅសាលារៀន I go to school"
|
|
8
|
+
|
|
9
|
+
print("INPUT:", text)
|
|
10
|
+
print("TOKENS:", model.encode(text, out_type=str))
|
|
11
|
+
print("IDS:", model.encode(text, out_type=int))
|
netra_nmt-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Netra Lab
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
netra_nmt-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: netra-nmt
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A compact, from-scratch English↔Khmer neural machine translation model.
|
|
5
|
+
Project-URL: Homepage, https://github.com/NDarayut/netra-nmt
|
|
6
|
+
Author-email: Darayut Nhem <darayutnhem009@gmail.com>
|
|
7
|
+
License: MIT
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Keywords: english,khmer,nmt,seq2seq,transformer,translation
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
13
|
+
Requires-Python: >=3.10
|
|
14
|
+
Requires-Dist: huggingface-hub>=0.20.0
|
|
15
|
+
Requires-Dist: numpy
|
|
16
|
+
Requires-Dist: safetensors>=0.4.0
|
|
17
|
+
Requires-Dist: sentencepiece>=0.1.99
|
|
18
|
+
Requires-Dist: torch>=2.0
|
|
19
|
+
Provides-Extra: train
|
|
20
|
+
Requires-Dist: accelerate; extra == 'train'
|
|
21
|
+
Requires-Dist: bert-score>=0.3.13; extra == 'train'
|
|
22
|
+
Requires-Dist: datasets; extra == 'train'
|
|
23
|
+
Requires-Dist: fasttext-wheel; extra == 'train'
|
|
24
|
+
Requires-Dist: khmercut; extra == 'train'
|
|
25
|
+
Requires-Dist: langdetect; extra == 'train'
|
|
26
|
+
Requires-Dist: pandas; extra == 'train'
|
|
27
|
+
Requires-Dist: pyarrow; extra == 'train'
|
|
28
|
+
Requires-Dist: sacrebleu>=2.3.1; extra == 'train'
|
|
29
|
+
Requires-Dist: unbabel-comet>=2.2.0; extra == 'train'
|
|
30
|
+
Provides-Extra: web
|
|
31
|
+
Requires-Dist: fastapi>=0.110; extra == 'web'
|
|
32
|
+
Requires-Dist: uvicorn[standard]>=0.27; extra == 'web'
|
|
33
|
+
Description-Content-Type: text/markdown
|
|
34
|
+
|
|
35
|
+
<div align="center">
|
|
36
|
+
<img
|
|
37
|
+
src="https://raw.githubusercontent.com/netra-ai-lab/Netra-NMT/main/assets/logo.png"
|
|
38
|
+
width="20%"
|
|
39
|
+
alt="Netra Lab"
|
|
40
|
+
style="vertical-align: middle; margin-right: 30px;"
|
|
41
|
+
/>
|
|
42
|
+
<img
|
|
43
|
+
src="https://raw.githubusercontent.com/netra-ai-lab/Netra-NMT/main/assets/wordmark.png"
|
|
44
|
+
width="20%"
|
|
45
|
+
alt="Netra Lab"
|
|
46
|
+
style="vertical-align: middle;"
|
|
47
|
+
/>
|
|
48
|
+
</div>
|
|
49
|
+
|
|
50
|
+
<hr>
|
|
51
|
+
|
|
52
|
+
<p align="center">
|
|
53
|
+
<a href="https://github.com/netra-ai-lab/Netra-NMT"><b>GitHub</b></a> |
|
|
54
|
+
<a href="https://huggingface.co/Darayut/netra-nmt-small"><b>Model Download</b></a> |
|
|
55
|
+
<a href="https://huggingface.co/datasets/Darayut/bilingual-en-km"><b>Dataset Download</b></a> |
|
|
56
|
+
<a href="https://huggingface.co/spaces/Darayut/Netra-NMT"><b>Inference Space</b></a> |
|
|
57
|
+
</p>
|
|
58
|
+
|
|
59
|
+
<h2>
|
|
60
|
+
<p align="center">
|
|
61
|
+
<a href="">A Compact Bidirectional Encoder-Decoder Transformer-Based Model for English-Khmer Translation</a>
|
|
62
|
+
</p>
|
|
63
|
+
</h2>
|
|
64
|
+
|
|
65
|
+
<p align="center">
|
|
66
|
+
<img src="https://raw.githubusercontent.com/netra-ai-lab/Netra-NMT/main/assets/inference_benchmark.png" style="width: 1000px" align=center>
|
|
67
|
+
</p>
|
|
68
|
+
|
|
69
|
+
<p align="center">
|
|
70
|
+
<a href="">Inference Speed Benchmark on CPU Using Greedy, and Beam Search Decoding Strategy</a>
|
|
71
|
+
</p>
|
|
72
|
+
|
|
73
|
+
## 1. Abstract
|
|
74
|
+
This repository presents Netra-NMT, a 90M-parameter encoder-decoder transformer-based model trained on **220 million tokens** of English-Khmer parallel text (4.2M bidirectional examples). The encoder uses bidirectional self-attention, much like BERT, to capture global contextual representations. The decoder performs autoregressive generation through causal self-attention and encoder-decoder cross-attention.
|
|
75
|
+
|
|
76
|
+
Unlike the traditional transformer block, Netra-NMT incorporates several architectural improvements, including Pre-Layer Normalization (Pre-LN) for stable optimization, SwiGLU feed-forward networks for enhanced representational capacity, and weight tying between the decoder embedding layer and output projection head to reduce parameter redundancy.
|
|
77
|
+
|
|
78
|
+
## 2. Dataset
|
|
79
|
+
|
|
80
|
+
Netra-NMT was trained on **220 million tokens** drawn from approximately **2.4 million unique English-Khmer sentence pairs** (4.2 million examples after bidirectional augmentation). The corpus combines LLM-generated synthetic data with web-crawled parallel text, spanning legal, literary, medical, technical, and conversational domains.
|
|
81
|
+
|
|
82
|
+
### 2.1 Sources
|
|
83
|
+
|
|
84
|
+
| Dataset | Type | Pairs | Domains |
|
|
85
|
+
|---------|------|------:|---------|
|
|
86
|
+
| [Darayut/khmer-english-pairs-raw](https://huggingface.co/datasets/Darayut/khmer-english-pairs-raw) | Synthetic | 200K | Legal, Literary, Governmental |
|
|
87
|
+
| [lyfeyvutha/nllb-en-km-316K](https://huggingface.co/datasets/lyfeyvutha/nllb-en-km-316K) | Synthetic | 316K | General |
|
|
88
|
+
| [KrorngAI/ParaCrawl-English-Khmer-v2](https://huggingface.co/datasets/KrorngAI/ParaCrawl-English-Khmer-v2) | Web crawl (ParaCrawl) | 1.5M | Web / general |
|
|
89
|
+
| [SeyhaLite/Translate-English-Khmer-All](https://huggingface.co/datasets/SeyhaLite/Translate-English-Khmer-All) | --- | 366K | General |
|
|
90
|
+
| **Total** | | **2.4M** | |
|
|
91
|
+
|
|
92
|
+
### 2.2 Preprocessing
|
|
93
|
+
|
|
94
|
+
Raw data was cleaned through the following pipeline:
|
|
95
|
+
|
|
96
|
+
1. **Deduplication**: exact duplicate pairs removed across all sources.
|
|
97
|
+
2. **Length filtering**: pairs with extreme source/target length mismatches were discarded.
|
|
98
|
+
3. **Empty/null removal**: pairs where either side was empty or below a minimum token count were dropped.
|
|
99
|
+
|
|
100
|
+
After cleaning, each surviving pair is duplicated in both directions (`EN→KM` and `KM→EN`) with a direction prefix token (`<2km>` / `<2en>`), yielding ~4.2 million training examples.
|
|
101
|
+
|
|
102
|
+
## 3. Model Architecture
|
|
103
|
+
|
|
104
|
+
<div align="center">
|
|
105
|
+
<img src="https://raw.githubusercontent.com/netra-ai-lab/Netra-NMT/main/assets/model_architecture.png" width="70%" alt="Netra-NMT Architecture" />
|
|
106
|
+
<p><em>Figure 1: Overview of the Netra-NMT encoder-decoder architecture. The encoder (left) processes the source sentence with bidirectional self-attention; the decoder (right) generates the target sentence autoregressively via causal self-attention and cross-attention over the encoder output. Both sides share a 32K SentencePiece tokenizer.</em></p>
|
|
107
|
+
</div>
|
|
108
|
+
|
|
109
|
+
Netra-NMT follows a standard encoder-decoder transformer architecture with several modifications for training stability and parameter efficiency.
|
|
110
|
+
|
|
111
|
+
**Encoder** takes the source sentence tokenized by the shared 32K SentencePiece tokenizer, adds learned positional embeddings, and passes the sequence through 6 transformer layers with *bidirectional* self-attention (every token attends to every other token, similar to BERT). A final Pre-LN layer norm is applied to the encoder output before it is passed to the decoder via cross-attention.
|
|
112
|
+
|
|
113
|
+
**Decoder** takes the (partially generated) target sentence through the same tokenizer, adds positional embeddings, and passes it through 6 transformer layers. Each decoder layer applies three sub-layers in order: (1) *causal* (masked) self-attention over previously generated tokens, (2) cross-attention over the full encoder output, and (3) a feed-forward block. A final Pre-LN layer norm feeds into the tied linear projection head to produce output token probabilities.
|
|
114
|
+
|
|
115
|
+
**Architectural improvements over the vanilla transformer:**
|
|
116
|
+
|
|
117
|
+
| Feature | Detail |
|
|
118
|
+
|---------|--------|
|
|
119
|
+
| Pre-Layer Normalization | Layer norm applied *before* each sub-layer (Pre-LN) rather than after, improving gradient flow and training stability |
|
|
120
|
+
| SwiGLU FFN | Feed-forward blocks use the SwiGLU activation instead of ReLU, providing richer representational capacity at no parameter cost |
|
|
121
|
+
| Weight tying | The decoder input embedding matrix is shared with the output linear projection head, reducing redundant parameters |
|
|
122
|
+
|
|
123
|
+
**Hyperparameters:**
|
|
124
|
+
|
|
125
|
+
| | |
|
|
126
|
+
|---|---|
|
|
127
|
+
| d_model | 512 |
|
|
128
|
+
| Encoder / Decoder layers | 6 / 6 |
|
|
129
|
+
| Attention heads | 8 |
|
|
130
|
+
| FFN hidden size | 2048 |
|
|
131
|
+
| Vocabulary | 32K (SentencePiece unigram, shared) |
|
|
132
|
+
| Total parameters | ~89.7M |
|
|
133
|
+
|
|
134
|
+
## 4. Evaluation Results
|
|
135
|
+
|
|
136
|
+
<p align="center">
|
|
137
|
+
<img src="https://raw.githubusercontent.com/netra-ai-lab/Netra-NMT/main/assets/metrics_benchmark.png" style="width: 1000px" align=center>
|
|
138
|
+
</p>
|
|
139
|
+
|
|
140
|
+
## Install
|
|
141
|
+
|
|
142
|
+
```bash
|
|
143
|
+
pip install netra-nmt # core (Python API + CLI)
|
|
144
|
+
pip install "netra-nmt[web]" # + FastAPI web app & REST API
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
Or from source:
|
|
148
|
+
|
|
149
|
+
```bash
|
|
150
|
+
git clone https://github.com/NDarayut/netra-nmt
|
|
151
|
+
cd netra-nmt
|
|
152
|
+
pip install -e ".[web]"
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
The first translation downloads the weights (~180 MB fp16) from the Hugging Face Hub and caches them
|
|
156
|
+
under `~/.cache/huggingface`.
|
|
157
|
+
|
|
158
|
+
## Usage
|
|
159
|
+
|
|
160
|
+
### 1. Python API
|
|
161
|
+
|
|
162
|
+
```python
|
|
163
|
+
from netra_nmt import NetraTranslator
|
|
164
|
+
|
|
165
|
+
t = NetraTranslator() # auto-detect GPU/CPU; downloads weights once
|
|
166
|
+
t.translate("Hello, how are you?", direction="en2km") # → "សួស្តី សុខសប្បាយអត់?"
|
|
167
|
+
t.translate("ខ្ញុំស្រឡាញ់ប្រទេសរបស់ខ្ញុំ។", direction="km2en")
|
|
168
|
+
|
|
169
|
+
# Batch + decoding options
|
|
170
|
+
t.translate_batch(["Good morning.", "See you tomorrow."], direction="en2km")
|
|
171
|
+
t.translate("Good morning, my friend.", direction="en2km", mode="beam", beam_size=5)
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
One-shot helper (caches a default translator):
|
|
175
|
+
|
|
176
|
+
```python
|
|
177
|
+
from netra_nmt import translate
|
|
178
|
+
translate("Hello", direction="en2km")
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
`direction` is `"en2km"` (English→Khmer) or `"km2en"` (Khmer→English).
|
|
182
|
+
`mode` is `"greedy"` (default), `"beam"`, or `"sample"`.
|
|
183
|
+
|
|
184
|
+
### 2. CLI
|
|
185
|
+
|
|
186
|
+
```bash
|
|
187
|
+
# Single sentence (default direction en2km):
|
|
188
|
+
netra-translate --text "Hello, how are you?"
|
|
189
|
+
|
|
190
|
+
# Khmer → English with beam search:
|
|
191
|
+
netra-translate --text "សួស្តី, តើអ្នកសុខសប្បាយទេ?" --direction km2en --mode beam
|
|
192
|
+
|
|
193
|
+
# Translate a file (one sentence per line):
|
|
194
|
+
netra-translate --file input.txt --output output.txt --direction en2km
|
|
195
|
+
|
|
196
|
+
# Interactive REPL (omit --text / --file):
|
|
197
|
+
netra-translate
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
### 3. Web app + REST API (FastAPI)
|
|
201
|
+
|
|
202
|
+
```bash
|
|
203
|
+
netra-web # serves the web UI + API at http://127.0.0.1:8000
|
|
204
|
+
netra-web --port 8080 --device cpu
|
|
205
|
+
netra-web --local-dir export # load weights from a local export dir
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
A two-pane translation site (source left, output right, EN⇄KM swap button) plus a JSON API:
|
|
209
|
+
|
|
210
|
+
```bash
|
|
211
|
+
curl -X POST http://127.0.0.1:8000/api/translate \
|
|
212
|
+
-H 'Content-Type: application/json' \
|
|
213
|
+
-d '{"text": "Hello, how are you?", "direction": "en2km"}'
|
|
214
|
+
# {"translation": "...", "direction": "en2km"}
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
Requires the `web` extra (`pip install "netra-nmt[web]"`).
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
<img
|
|
3
|
+
src="https://raw.githubusercontent.com/netra-ai-lab/Netra-NMT/main/assets/logo.png"
|
|
4
|
+
width="20%"
|
|
5
|
+
alt="Netra Lab"
|
|
6
|
+
style="vertical-align: middle; margin-right: 30px;"
|
|
7
|
+
/>
|
|
8
|
+
<img
|
|
9
|
+
src="https://raw.githubusercontent.com/netra-ai-lab/Netra-NMT/main/assets/wordmark.png"
|
|
10
|
+
width="20%"
|
|
11
|
+
alt="Netra Lab"
|
|
12
|
+
style="vertical-align: middle;"
|
|
13
|
+
/>
|
|
14
|
+
</div>
|
|
15
|
+
|
|
16
|
+
<hr>
|
|
17
|
+
|
|
18
|
+
<p align="center">
|
|
19
|
+
<a href="https://github.com/netra-ai-lab/Netra-NMT"><b>GitHub</b></a> |
|
|
20
|
+
<a href="https://huggingface.co/Darayut/netra-nmt-small"><b>Model Download</b></a> |
|
|
21
|
+
<a href="https://huggingface.co/datasets/Darayut/bilingual-en-km"><b>Dataset Download</b></a> |
|
|
22
|
+
<a href="https://huggingface.co/spaces/Darayut/Netra-NMT"><b>Inference Space</b></a> |
|
|
23
|
+
</p>
|
|
24
|
+
|
|
25
|
+
<h2>
|
|
26
|
+
<p align="center">
|
|
27
|
+
<a href="">A Compact Bidirectional Encoder-Decoder Transformer-Based Model for English-Khmer Translation</a>
|
|
28
|
+
</p>
|
|
29
|
+
</h2>
|
|
30
|
+
|
|
31
|
+
<p align="center">
|
|
32
|
+
<img src="https://raw.githubusercontent.com/netra-ai-lab/Netra-NMT/main/assets/inference_benchmark.png" style="width: 1000px" align=center>
|
|
33
|
+
</p>
|
|
34
|
+
|
|
35
|
+
<p align="center">
|
|
36
|
+
<a href="">Inference Speed Benchmark on CPU Using Greedy, and Beam Search Decoding Strategy</a>
|
|
37
|
+
</p>
|
|
38
|
+
|
|
39
|
+
## 1. Abstract
|
|
40
|
+
This repository presents Netra-NMT, a 90M-parameter encoder-decoder transformer-based model trained on **220 million tokens** of English-Khmer parallel text (4.2M bidirectional examples). The encoder uses bidirectional self-attention, much like BERT, to capture global contextual representations. The decoder performs autoregressive generation through causal self-attention and encoder-decoder cross-attention.
|
|
41
|
+
|
|
42
|
+
Unlike the traditional transformer block, Netra-NMT incorporates several architectural improvements, including Pre-Layer Normalization (Pre-LN) for stable optimization, SwiGLU feed-forward networks for enhanced representational capacity, and weight tying between the decoder embedding layer and output projection head to reduce parameter redundancy.
|
|
43
|
+
|
|
44
|
+
## 2. Dataset
|
|
45
|
+
|
|
46
|
+
Netra-NMT was trained on **220 million tokens** drawn from approximately **2.4 million unique English-Khmer sentence pairs** (4.2 million examples after bidirectional augmentation). The corpus combines LLM-generated synthetic data with web-crawled parallel text, spanning legal, literary, medical, technical, and conversational domains.
|
|
47
|
+
|
|
48
|
+
### 2.1 Sources
|
|
49
|
+
|
|
50
|
+
| Dataset | Type | Pairs | Domains |
|
|
51
|
+
|---------|------|------:|---------|
|
|
52
|
+
| [Darayut/khmer-english-pairs-raw](https://huggingface.co/datasets/Darayut/khmer-english-pairs-raw) | Synthetic | 200K | Legal, Literary, Governmental |
|
|
53
|
+
| [lyfeyvutha/nllb-en-km-316K](https://huggingface.co/datasets/lyfeyvutha/nllb-en-km-316K) | Synthetic | 316K | General |
|
|
54
|
+
| [KrorngAI/ParaCrawl-English-Khmer-v2](https://huggingface.co/datasets/KrorngAI/ParaCrawl-English-Khmer-v2) | Web crawl (ParaCrawl) | 1.5M | Web / general |
|
|
55
|
+
| [SeyhaLite/Translate-English-Khmer-All](https://huggingface.co/datasets/SeyhaLite/Translate-English-Khmer-All) | --- | 366K | General |
|
|
56
|
+
| **Total** | | **2.4M** | |
|
|
57
|
+
|
|
58
|
+
### 2.2 Preprocessing
|
|
59
|
+
|
|
60
|
+
Raw data was cleaned through the following pipeline:
|
|
61
|
+
|
|
62
|
+
1. **Deduplication**: exact duplicate pairs removed across all sources.
|
|
63
|
+
2. **Length filtering**: pairs with extreme source/target length mismatches were discarded.
|
|
64
|
+
3. **Empty/null removal**: pairs where either side was empty or below a minimum token count were dropped.
|
|
65
|
+
|
|
66
|
+
After cleaning, each surviving pair is duplicated in both directions (`EN→KM` and `KM→EN`) with a direction prefix token (`<2km>` / `<2en>`), yielding ~4.2 million training examples.
|
|
67
|
+
|
|
68
|
+
## 3. Model Architecture
|
|
69
|
+
|
|
70
|
+
<div align="center">
|
|
71
|
+
<img src="https://raw.githubusercontent.com/netra-ai-lab/Netra-NMT/main/assets/model_architecture.png" width="70%" alt="Netra-NMT Architecture" />
|
|
72
|
+
<p><em>Figure 1: Overview of the Netra-NMT encoder-decoder architecture. The encoder (left) processes the source sentence with bidirectional self-attention; the decoder (right) generates the target sentence autoregressively via causal self-attention and cross-attention over the encoder output. Both sides share a 32K SentencePiece tokenizer.</em></p>
|
|
73
|
+
</div>
|
|
74
|
+
|
|
75
|
+
Netra-NMT follows a standard encoder-decoder transformer architecture with several modifications for training stability and parameter efficiency.
|
|
76
|
+
|
|
77
|
+
**Encoder** takes the source sentence tokenized by the shared 32K SentencePiece tokenizer, adds learned positional embeddings, and passes the sequence through 6 transformer layers with *bidirectional* self-attention (every token attends to every other token, similar to BERT). A final Pre-LN layer norm is applied to the encoder output before it is passed to the decoder via cross-attention.
|
|
78
|
+
|
|
79
|
+
**Decoder** takes the (partially generated) target sentence through the same tokenizer, adds positional embeddings, and passes it through 6 transformer layers. Each decoder layer applies three sub-layers in order: (1) *causal* (masked) self-attention over previously generated tokens, (2) cross-attention over the full encoder output, and (3) a feed-forward block. A final Pre-LN layer norm feeds into the tied linear projection head to produce output token probabilities.
|
|
80
|
+
|
|
81
|
+
**Architectural improvements over the vanilla transformer:**
|
|
82
|
+
|
|
83
|
+
| Feature | Detail |
|
|
84
|
+
|---------|--------|
|
|
85
|
+
| Pre-Layer Normalization | Layer norm applied *before* each sub-layer (Pre-LN) rather than after, improving gradient flow and training stability |
|
|
86
|
+
| SwiGLU FFN | Feed-forward blocks use the SwiGLU activation instead of ReLU, providing richer representational capacity at no parameter cost |
|
|
87
|
+
| Weight tying | The decoder input embedding matrix is shared with the output linear projection head, reducing redundant parameters |
|
|
88
|
+
|
|
89
|
+
**Hyperparameters:**
|
|
90
|
+
|
|
91
|
+
| | |
|
|
92
|
+
|---|---|
|
|
93
|
+
| d_model | 512 |
|
|
94
|
+
| Encoder / Decoder layers | 6 / 6 |
|
|
95
|
+
| Attention heads | 8 |
|
|
96
|
+
| FFN hidden size | 2048 |
|
|
97
|
+
| Vocabulary | 32K (SentencePiece unigram, shared) |
|
|
98
|
+
| Total parameters | ~89.7M |
|
|
99
|
+
|
|
100
|
+
## 4. Evaluation Results
|
|
101
|
+
|
|
102
|
+
<p align="center">
|
|
103
|
+
<img src="https://raw.githubusercontent.com/netra-ai-lab/Netra-NMT/main/assets/metrics_benchmark.png" style="width: 1000px" align=center>
|
|
104
|
+
</p>
|
|
105
|
+
|
|
106
|
+
## Install
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
pip install netra-nmt # core (Python API + CLI)
|
|
110
|
+
pip install "netra-nmt[web]" # + FastAPI web app & REST API
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
Or from source:
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
git clone https://github.com/NDarayut/netra-nmt
|
|
117
|
+
cd netra-nmt
|
|
118
|
+
pip install -e ".[web]"
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
The first translation downloads the weights (~180 MB fp16) from the Hugging Face Hub and caches them
|
|
122
|
+
under `~/.cache/huggingface`.
|
|
123
|
+
|
|
124
|
+
## Usage
|
|
125
|
+
|
|
126
|
+
### 1. Python API
|
|
127
|
+
|
|
128
|
+
```python
|
|
129
|
+
from netra_nmt import NetraTranslator
|
|
130
|
+
|
|
131
|
+
t = NetraTranslator() # auto-detect GPU/CPU; downloads weights once
|
|
132
|
+
t.translate("Hello, how are you?", direction="en2km") # → "សួស្តី សុខសប្បាយអត់?"
|
|
133
|
+
t.translate("ខ្ញុំស្រឡាញ់ប្រទេសរបស់ខ្ញុំ។", direction="km2en")
|
|
134
|
+
|
|
135
|
+
# Batch + decoding options
|
|
136
|
+
t.translate_batch(["Good morning.", "See you tomorrow."], direction="en2km")
|
|
137
|
+
t.translate("Good morning, my friend.", direction="en2km", mode="beam", beam_size=5)
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
One-shot helper (caches a default translator):
|
|
141
|
+
|
|
142
|
+
```python
|
|
143
|
+
from netra_nmt import translate
|
|
144
|
+
translate("Hello", direction="en2km")
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
`direction` is `"en2km"` (English→Khmer) or `"km2en"` (Khmer→English).
|
|
148
|
+
`mode` is `"greedy"` (default), `"beam"`, or `"sample"`.
|
|
149
|
+
|
|
150
|
+
### 2. CLI
|
|
151
|
+
|
|
152
|
+
```bash
|
|
153
|
+
# Single sentence (default direction en2km):
|
|
154
|
+
netra-translate --text "Hello, how are you?"
|
|
155
|
+
|
|
156
|
+
# Khmer → English with beam search:
|
|
157
|
+
netra-translate --text "សួស្តី, តើអ្នកសុខសប្បាយទេ?" --direction km2en --mode beam
|
|
158
|
+
|
|
159
|
+
# Translate a file (one sentence per line):
|
|
160
|
+
netra-translate --file input.txt --output output.txt --direction en2km
|
|
161
|
+
|
|
162
|
+
# Interactive REPL (omit --text / --file):
|
|
163
|
+
netra-translate
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
### 3. Web app + REST API (FastAPI)
|
|
167
|
+
|
|
168
|
+
```bash
|
|
169
|
+
netra-web # serves the web UI + API at http://127.0.0.1:8000
|
|
170
|
+
netra-web --port 8080 --device cpu
|
|
171
|
+
netra-web --local-dir export # load weights from a local export dir
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
A two-pane translation site (source left, output right, EN⇄KM swap button) plus a JSON API:
|
|
175
|
+
|
|
176
|
+
```bash
|
|
177
|
+
curl -X POST http://127.0.0.1:8000/api/translate \
|
|
178
|
+
-H 'Content-Type: application/json' \
|
|
179
|
+
-d '{"text": "Hello, how are you?", "direction": "en2km"}'
|
|
180
|
+
# {"translation": "...", "direction": "en2km"}
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
Requires the `web` extra (`pip install "netra-nmt[web]"`).
|
|
Binary file
|