netra-nmt 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. netra_nmt-0.1.0/.gitignore +259 -0
  2. netra_nmt-0.1.0/08_test_tokenizer.py +11 -0
  3. netra_nmt-0.1.0/LICENSE +21 -0
  4. netra_nmt-0.1.0/PKG-INFO +217 -0
  5. netra_nmt-0.1.0/README.md +183 -0
  6. netra_nmt-0.1.0/assets/Asset 4@Bag.png +0 -0
  7. netra_nmt-0.1.0/assets/Asset 5@Bag_black.png +0 -0
  8. netra_nmt-0.1.0/assets/bertscore_benchmark.png +0 -0
  9. netra_nmt-0.1.0/assets/chrf_benchmark.png +0 -0
  10. netra_nmt-0.1.0/assets/inference_benchmark.png +0 -0
  11. netra_nmt-0.1.0/assets/logo.png +0 -0
  12. netra_nmt-0.1.0/assets/metrics_benchmark.png +0 -0
  13. netra_nmt-0.1.0/assets/model_architecture.png +0 -0
  14. netra_nmt-0.1.0/assets/wordmark.png +0 -0
  15. netra_nmt-0.1.0/netra_architecture.png +0 -0
  16. netra_nmt-0.1.0/netra_nmt/__init__.py +29 -0
  17. netra_nmt-0.1.0/netra_nmt/assets/spm_32k.model +0 -0
  18. netra_nmt-0.1.0/netra_nmt/cli.py +209 -0
  19. netra_nmt-0.1.0/netra_nmt/config.py +62 -0
  20. netra_nmt-0.1.0/netra_nmt/decoding.py +174 -0
  21. netra_nmt-0.1.0/netra_nmt/model.py +383 -0
  22. netra_nmt-0.1.0/netra_nmt/server.py +133 -0
  23. netra_nmt-0.1.0/netra_nmt/static/index.html +328 -0
  24. netra_nmt-0.1.0/netra_nmt/static/netra-wordmark-orange.png +0 -0
  25. netra_nmt-0.1.0/netra_nmt/static/netra-wordmark.png +0 -0
  26. netra_nmt-0.1.0/netra_nmt/translator.py +193 -0
  27. netra_nmt-0.1.0/netra_nmt/weights.py +70 -0
  28. netra_nmt-0.1.0/pyproject.toml +57 -0
  29. netra_nmt-0.1.0/requirements.txt +16 -0
  30. netra_nmt-0.1.0/results/eval_epoch03.json +86 -0
  31. netra_nmt-0.1.0/results/eval_results_alt.json +43 -0
  32. netra_nmt-0.1.0/results/eval_results_lyfeyvutha_350M_alt.json +43 -0
  33. netra_nmt-0.1.0/results/eval_results_madlad3b_alt.json +44 -0
  34. netra_nmt-0.1.0/results/eval_results_nllb1.3B_alt.json +43 -0
  35. netra_nmt-0.1.0/results/eval_results_nllb200_alt.json +43 -0
  36. netra_nmt-0.1.0/results/eval_results_rinabuoy.json +43 -0
  37. netra_nmt-0.1.0/results/google_translate_alt.json +41 -0
  38. netra_nmt-0.1.0/results/train_log.jsonl +1992 -0
  39. netra_nmt-0.1.0/results/train_log_finetune.jsonl +38 -0
  40. netra_nmt-0.1.0/scripts/assets/metrics_benchmark.png +0 -0
  41. netra_nmt-0.1.0/scripts/benchmark_metrics.ipynb +395 -0
  42. netra_nmt-0.1.0/scripts/compare_models.ipynb +819 -0
  43. netra_nmt-0.1.0/scripts/compare_models.py +315 -0
  44. netra_nmt-0.1.0/scripts/export_checkpoint.py +167 -0
  45. netra_nmt-0.1.0/scripts/visualize_architecture.py +378 -0
@@ -0,0 +1,259 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ # Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ # poetry.lock
109
+ # poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ # pdm.lock
116
+ # pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ # pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # Redis
135
+ *.rdb
136
+ *.aof
137
+ *.pid
138
+
139
+ # RabbitMQ
140
+ mnesia/
141
+ rabbitmq/
142
+ rabbitmq-data/
143
+
144
+ # ActiveMQ
145
+ activemq-data/
146
+
147
+ # SageMath parsed files
148
+ *.sage.py
149
+
150
+ # Environments
151
+ .myenv
152
+ .env
153
+ .envrc
154
+ .venv
155
+ env/
156
+ venv/
157
+ ENV/
158
+ env.bak/
159
+ venv.bak/
160
+ myenv/
161
+ ./myenv/
162
+ ./myenv
163
+
164
+ # Spyder project settings
165
+ .spyderproject
166
+ .spyproject
167
+
168
+ # Rope project settings
169
+ .ropeproject
170
+
171
+ # mkdocs documentation
172
+ /site
173
+
174
+ # mypy
175
+ .mypy_cache/
176
+ .dmypy.json
177
+ dmypy.json
178
+
179
+ # Pyre type checker
180
+ .pyre/
181
+
182
+ # pytype static type analyzer
183
+ .pytype/
184
+
185
+ # Cython debug symbols
186
+ cython_debug/
187
+
188
+ # PyCharm
189
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
190
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
191
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
192
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
193
+ # .idea/
194
+
195
+ # Abstra
196
+ # Abstra is an AI-powered process automation framework.
197
+ # Ignore directories containing user credentials, local state, and settings.
198
+ # Learn more at https://abstra.io/docs
199
+ .abstra/
200
+
201
+ # Visual Studio Code
202
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
203
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
204
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
205
+ # you could uncomment the following to ignore the entire vscode folder
206
+ # .vscode/
207
+ # Temporary file for partial code execution
208
+ tempCodeRunnerFile.py
209
+
210
+ # Ruff stuff:
211
+ .ruff_cache/
212
+
213
+ # PyPI configuration file
214
+ .pypirc
215
+
216
+ # Marimo
217
+ marimo/_static/
218
+ marimo/_lsp/
219
+ __marimo__/
220
+
221
+ # Streamlit
222
+ .streamlit/secrets.toml
223
+
224
+
225
+ ./data
226
+ data/
227
+ data
228
+ corpus.txt
229
+
230
+ ./checkpoints
231
+ checkpoints/
232
+ checkpoints
233
+
234
+ checkpoints_distill
235
+
236
+ checkpoints_human_finetune
237
+
238
+ checkpoints_finetune/
239
+
240
+ archived/
241
+
242
+ tokenizer/corpus.txt
243
+ tokenizer/*.model
244
+ tokenizer/*.vocab
245
+ # netra-nmt release export (weights go to Hugging Face Hub)
246
+ export/
247
+
248
+ DESIGN_SYSTEM.md
249
+ # Standalone Streamlit Space (not part of the package; deployed separately)
250
+ streamlit_space/
251
+
252
+ # Benchmark scripts and generated chart / result images (local, not versioned)
253
+ scripts/benchmark_speed*.py
254
+ scripts/chart_speed.py
255
+ results/benchmark_speed*.json
256
+
257
+ scripts/training/
258
+
259
+ test.py
@@ -0,0 +1,11 @@
1
+ import sentencepiece as spm
2
+
3
+ model = spm.SentencePieceProcessor()
4
+
5
+ model.load("tokenizer/spm_32k.model")
6
+
7
+ text = "ខ្ញុំទៅសាលារៀន I go to school"
8
+
9
+ print("INPUT:", text)
10
+ print("TOKENS:", model.encode(text, out_type=str))
11
+ print("IDS:", model.encode(text, out_type=int))
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Netra Lab
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,217 @@
1
+ Metadata-Version: 2.4
2
+ Name: netra-nmt
3
+ Version: 0.1.0
4
+ Summary: A compact, from-scratch English↔Khmer neural machine translation model.
5
+ Project-URL: Homepage, https://github.com/NDarayut/netra-nmt
6
+ Author-email: Darayut Nhem <darayutnhem009@gmail.com>
7
+ License: MIT
8
+ License-File: LICENSE
9
+ Keywords: english,khmer,nmt,seq2seq,transformer,translation
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
13
+ Requires-Python: >=3.10
14
+ Requires-Dist: huggingface-hub>=0.20.0
15
+ Requires-Dist: numpy
16
+ Requires-Dist: safetensors>=0.4.0
17
+ Requires-Dist: sentencepiece>=0.1.99
18
+ Requires-Dist: torch>=2.0
19
+ Provides-Extra: train
20
+ Requires-Dist: accelerate; extra == 'train'
21
+ Requires-Dist: bert-score>=0.3.13; extra == 'train'
22
+ Requires-Dist: datasets; extra == 'train'
23
+ Requires-Dist: fasttext-wheel; extra == 'train'
24
+ Requires-Dist: khmercut; extra == 'train'
25
+ Requires-Dist: langdetect; extra == 'train'
26
+ Requires-Dist: pandas; extra == 'train'
27
+ Requires-Dist: pyarrow; extra == 'train'
28
+ Requires-Dist: sacrebleu>=2.3.1; extra == 'train'
29
+ Requires-Dist: unbabel-comet>=2.2.0; extra == 'train'
30
+ Provides-Extra: web
31
+ Requires-Dist: fastapi>=0.110; extra == 'web'
32
+ Requires-Dist: uvicorn[standard]>=0.27; extra == 'web'
33
+ Description-Content-Type: text/markdown
34
+
35
+ <div align="center">
36
+ <img
37
+ src="https://raw.githubusercontent.com/netra-ai-lab/Netra-NMT/main/assets/logo.png"
38
+ width="20%"
39
+ alt="Netra Lab"
40
+ style="vertical-align: middle; margin-right: 30px;"
41
+ />
42
+ <img
43
+ src="https://raw.githubusercontent.com/netra-ai-lab/Netra-NMT/main/assets/wordmark.png"
44
+ width="20%"
45
+ alt="Netra Lab"
46
+ style="vertical-align: middle;"
47
+ />
48
+ </div>
49
+
50
+ <hr>
51
+
52
+ <p align="center">
53
+ <a href="https://github.com/netra-ai-lab/Netra-NMT"><b>GitHub</b></a> |
54
+ <a href="https://huggingface.co/Darayut/netra-nmt-small"><b>Model Download</b></a> |
55
+ <a href="https://huggingface.co/datasets/Darayut/bilingual-en-km"><b>Dataset Download</b></a> |
56
+ <a href="https://huggingface.co/spaces/Darayut/Netra-NMT"><b>Inference Space</b></a>
57
+ </p>
58
+
59
+ <h2>
60
+ <p align="center">
61
+ <a href="">A Compact Bidirectional Encoder-Decoder Transformer-Based Model for English-Khmer Translation</a>
62
+ </p>
63
+ </h2>
64
+
65
+ <p align="center">
66
+ <img src="https://raw.githubusercontent.com/netra-ai-lab/Netra-NMT/main/assets/inference_benchmark.png" style="width: 1000px" align=center>
67
+ </p>
68
+
69
+ <p align="center">
70
+ <a href="">Inference Speed Benchmark on CPU Using Greedy and Beam Search Decoding Strategies</a>
71
+ </p>
72
+
73
+ ## 1. Abstract
74
+ This repository presents Netra-NMT, a 90M-parameter encoder-decoder transformer-based model trained on **220 million tokens** of English-Khmer parallel text (4.2M bidirectional examples). The encoder uses bidirectional self-attention, much like BERT, to capture global contextual representations. The decoder performs autoregressive generation through causal self-attention and encoder-decoder cross-attention.
75
+
76
+ Unlike the traditional transformer block, Netra-NMT incorporates several architectural improvements, including Pre-Layer Normalization (Pre-LN) for stable optimization, SwiGLU feed-forward networks for enhanced representational capacity, and weight tying between the decoder embedding layer and output projection head to reduce parameter redundancy.
77
+
78
+ ## 2. Dataset
79
+
80
+ Netra-NMT was trained on **220 million tokens** drawn from approximately **2.4 million unique English-Khmer sentence pairs** (4.2 million examples after bidirectional augmentation). The corpus combines LLM-generated synthetic data with web-crawled parallel text, spanning legal, literary, medical, technical, and conversational domains.
81
+
82
+ ### 2.1 Sources
83
+
84
+ | Dataset | Type | Pairs | Domains |
85
+ |---------|------|------:|---------|
86
+ | [Darayut/khmer-english-pairs-raw](https://huggingface.co/datasets/Darayut/khmer-english-pairs-raw) | Synthetic | 200K | Legal, Literary, Governmental |
87
+ | [lyfeyvutha/nllb-en-km-316K](https://huggingface.co/datasets/lyfeyvutha/nllb-en-km-316K) | Synthetic | 316K | General |
88
+ | [KrorngAI/ParaCrawl-English-Khmer-v2](https://huggingface.co/datasets/KrorngAI/ParaCrawl-English-Khmer-v2) | Web crawl (ParaCrawl) | 1.5M | Web / general |
89
+ | [SeyhaLite/Translate-English-Khmer-All](https://huggingface.co/datasets/SeyhaLite/Translate-English-Khmer-All) | --- | 366K | General |
90
+ | **Total** | | **2.4M** | |
91
+
92
+ ### 2.2 Preprocessing
93
+
94
+ Raw data was cleaned through the following pipeline:
95
+
96
+ 1. **Deduplication**: exact duplicate pairs were removed across all sources.
97
+ 2. **Length filtering**: pairs with extreme source/target length mismatches were discarded.
98
+ 3. **Empty/null removal**: pairs where either side was empty or below a minimum token count were dropped.
99
+
100
+ After cleaning, each surviving pair is duplicated in both directions (`EN→KM` and `KM→EN`) with a direction prefix token (`<2km>` / `<2en>`), yielding ~4.2 million training examples.
101
+
102
+ ## 3. Model Architecture
103
+
104
+ <div align="center">
105
+ <img src="https://raw.githubusercontent.com/netra-ai-lab/Netra-NMT/main/assets/model_architecture.png" width="70%" alt="Netra-NMT Architecture" />
106
+ <p><em>Figure 1: Overview of the Netra-NMT encoder-decoder architecture. The encoder (left) processes the source sentence with bidirectional self-attention; the decoder (right) generates the target sentence autoregressively via causal self-attention and cross-attention over the encoder output. Both sides share a 32K SentencePiece tokenizer.</em></p>
107
+ </div>
108
+
109
+ Netra-NMT follows a standard encoder-decoder transformer architecture with several modifications for training stability and parameter efficiency.
110
+
111
+ **Encoder** takes the source sentence tokenized by the shared 32K SentencePiece tokenizer, adds learned positional embeddings, and passes the sequence through 6 transformer layers with *bidirectional* self-attention (every token attends to every other token, similar to BERT). A final Pre-LN layer norm is applied to the encoder output before it is passed to the decoder via cross-attention.
112
+
113
+ **Decoder** takes the (partially generated) target sentence through the same tokenizer, adds positional embeddings, and passes it through 6 transformer layers. Each decoder layer applies three sub-layers in order: (1) *causal* (masked) self-attention over previously generated tokens, (2) cross-attention over the full encoder output, and (3) a feed-forward block. A final Pre-LN layer norm feeds into the tied linear projection head to produce output token probabilities.
114
+
115
+ **Architectural improvements over the vanilla transformer:**
116
+
117
+ | Feature | Detail |
118
+ |---------|--------|
119
+ | Pre-Layer Normalization | Layer norm applied *before* each sub-layer (Pre-LN) rather than after, improving gradient flow and training stability |
120
+ | SwiGLU FFN | Feed-forward blocks use the SwiGLU activation instead of ReLU, providing richer representational capacity at no parameter cost |
121
+ | Weight tying | The decoder input embedding matrix is shared with the output linear projection head, reducing redundant parameters |
122
+
123
+ **Hyperparameters:**
124
+
125
+ | | |
126
+ |---|---|
127
+ | d_model | 512 |
128
+ | Encoder / Decoder layers | 6 / 6 |
129
+ | Attention heads | 8 |
130
+ | FFN hidden size | 2048 |
131
+ | Vocabulary | 32K (SentencePiece unigram, shared) |
132
+ | Total parameters | ~89.7M |
133
+
134
+ ## 4. Evaluation Results
135
+
136
+ <p align="center">
137
+ <img src="https://raw.githubusercontent.com/netra-ai-lab/Netra-NMT/main/assets/metrics_benchmark.png" style="width: 1000px" align=center>
138
+ </p>
139
+
140
+ ## Install
141
+
142
+ ```bash
143
+ pip install netra-nmt # core (Python API + CLI)
144
+ pip install "netra-nmt[web]" # + FastAPI web app & REST API
145
+ ```
146
+
147
+ Or from source:
148
+
149
+ ```bash
150
+ git clone https://github.com/NDarayut/netra-nmt
151
+ cd netra-nmt
152
+ pip install -e ".[web]"
153
+ ```
154
+
155
+ The first translation downloads the weights (~180 MB fp16) from the Hugging Face Hub and caches them
156
+ under `~/.cache/huggingface`.
157
+
158
+ ## Usage
159
+
160
+ ### 1. Python API
161
+
162
+ ```python
163
+ from netra_nmt import NetraTranslator
164
+
165
+ t = NetraTranslator() # auto-detect GPU/CPU; downloads weights once
166
+ t.translate("Hello, how are you?", direction="en2km") # → "សួស្តី សុខសប្បាយអត់?"
167
+ t.translate("ខ្ញុំស្រឡាញ់ប្រទេសរបស់ខ្ញុំ។", direction="km2en")
168
+
169
+ # Batch + decoding options
170
+ t.translate_batch(["Good morning.", "See you tomorrow."], direction="en2km")
171
+ t.translate("Good morning, my friend.", direction="en2km", mode="beam", beam_size=5)
172
+ ```
173
+
174
+ One-shot helper (caches a default translator):
175
+
176
+ ```python
177
+ from netra_nmt import translate
178
+ translate("Hello", direction="en2km")
179
+ ```
180
+
181
+ `direction` is `"en2km"` (English→Khmer) or `"km2en"` (Khmer→English).
182
+ `mode` is `"greedy"` (default), `"beam"`, or `"sample"`.
183
+
184
+ ### 2. CLI
185
+
186
+ ```bash
187
+ # Single sentence (default direction en2km):
188
+ netra-translate --text "Hello, how are you?"
189
+
190
+ # Khmer → English with beam search:
191
+ netra-translate --text "សួស្តី, តើអ្នកសុខសប្បាយទេ?" --direction km2en --mode beam
192
+
193
+ # Translate a file (one sentence per line):
194
+ netra-translate --file input.txt --output output.txt --direction en2km
195
+
196
+ # Interactive REPL (omit --text / --file):
197
+ netra-translate
198
+ ```
199
+
200
+ ### 3. Web app + REST API (FastAPI)
201
+
202
+ ```bash
203
+ netra-web # serves the web UI + API at http://127.0.0.1:8000
204
+ netra-web --port 8080 --device cpu
205
+ netra-web --local-dir export # load weights from a local export dir
206
+ ```
207
+
208
+ A two-pane translation site (source left, output right, EN⇄KM swap button) plus a JSON API:
209
+
210
+ ```bash
211
+ curl -X POST http://127.0.0.1:8000/api/translate \
212
+ -H 'Content-Type: application/json' \
213
+ -d '{"text": "Hello, how are you?", "direction": "en2km"}'
214
+ # {"translation": "...", "direction": "en2km"}
215
+ ```
216
+
217
+ Requires the `web` extra (`pip install "netra-nmt[web]"`).
@@ -0,0 +1,183 @@
1
+ <div align="center">
2
+ <img
3
+ src="https://raw.githubusercontent.com/netra-ai-lab/Netra-NMT/main/assets/logo.png"
4
+ width="20%"
5
+ alt="Netra Lab"
6
+ style="vertical-align: middle; margin-right: 30px;"
7
+ />
8
+ <img
9
+ src="https://raw.githubusercontent.com/netra-ai-lab/Netra-NMT/main/assets/wordmark.png"
10
+ width="20%"
11
+ alt="Netra Lab"
12
+ style="vertical-align: middle;"
13
+ />
14
+ </div>
15
+
16
+ <hr>
17
+
18
+ <p align="center">
19
+ <a href="https://github.com/netra-ai-lab/Netra-NMT"><b>GitHub</b></a> |
20
+ <a href="https://huggingface.co/Darayut/netra-nmt-small"><b>Model Download</b></a> |
21
+ <a href="https://huggingface.co/datasets/Darayut/bilingual-en-km"><b>Dataset Download</b></a> |
22
+ <a href="https://huggingface.co/spaces/Darayut/Netra-NMT"><b>Inference Space</b></a>
23
+ </p>
24
+
25
+ <h2>
26
+ <p align="center">
27
+ <a href="">A Compact Bidirectional Encoder-Decoder Transformer-Based Model for English-Khmer Translation</a>
28
+ </p>
29
+ </h2>
30
+
31
+ <p align="center">
32
+ <img src="https://raw.githubusercontent.com/netra-ai-lab/Netra-NMT/main/assets/inference_benchmark.png" style="width: 1000px" align=center>
33
+ </p>
34
+
35
+ <p align="center">
36
+ <a href="">Inference Speed Benchmark on CPU Using Greedy and Beam Search Decoding Strategies</a>
37
+ </p>
38
+
39
+ ## 1. Abstract
40
+ This repository presents Netra-NMT, a 90M-parameter encoder-decoder transformer-based model trained on **220 million tokens** of English-Khmer parallel text (4.2M bidirectional examples). The encoder uses bidirectional self-attention, much like BERT, to capture global contextual representations. The decoder performs autoregressive generation through causal self-attention and encoder-decoder cross-attention.
41
+
42
+ Unlike the traditional transformer block, Netra-NMT incorporates several architectural improvements, including Pre-Layer Normalization (Pre-LN) for stable optimization, SwiGLU feed-forward networks for enhanced representational capacity, and weight tying between the decoder embedding layer and output projection head to reduce parameter redundancy.
43
+
44
+ ## 2. Dataset
45
+
46
+ Netra-NMT was trained on **220 million tokens** drawn from approximately **2.4 million unique English-Khmer sentence pairs** (4.2 million examples after bidirectional augmentation). The corpus combines LLM-generated synthetic data with web-crawled parallel text, spanning legal, literary, medical, technical, and conversational domains.
47
+
48
+ ### 2.1 Sources
49
+
50
+ | Dataset | Type | Pairs | Domains |
51
+ |---------|------|------:|---------|
52
+ | [Darayut/khmer-english-pairs-raw](https://huggingface.co/datasets/Darayut/khmer-english-pairs-raw) | Synthetic | 200K | Legal, Literary, Governmental |
53
+ | [lyfeyvutha/nllb-en-km-316K](https://huggingface.co/datasets/lyfeyvutha/nllb-en-km-316K) | Synthetic | 316K | General |
54
+ | [KrorngAI/ParaCrawl-English-Khmer-v2](https://huggingface.co/datasets/KrorngAI/ParaCrawl-English-Khmer-v2) | Web crawl (ParaCrawl) | 1.5M | Web / general |
55
+ | [SeyhaLite/Translate-English-Khmer-All](https://huggingface.co/datasets/SeyhaLite/Translate-English-Khmer-All) | --- | 366K | General |
56
+ | **Total** | | **2.4M** | |
57
+
58
+ ### 2.2 Preprocessing
59
+
60
+ Raw data was cleaned through the following pipeline:
61
+
62
+ 1. **Deduplication**: exact duplicate pairs were removed across all sources.
63
+ 2. **Length filtering**: pairs with extreme source/target length mismatches were discarded.
64
+ 3. **Empty/null removal**: pairs where either side was empty or below a minimum token count were dropped.
65
+
66
+ After cleaning, each surviving pair is duplicated in both directions (`EN→KM` and `KM→EN`) with a direction prefix token (`<2km>` / `<2en>`), yielding ~4.2 million training examples.
67
+
68
+ ## 3. Model Architecture
69
+
70
+ <div align="center">
71
+ <img src="https://raw.githubusercontent.com/netra-ai-lab/Netra-NMT/main/assets/model_architecture.png" width="70%" alt="Netra-NMT Architecture" />
72
+ <p><em>Figure 1: Overview of the Netra-NMT encoder-decoder architecture. The encoder (left) processes the source sentence with bidirectional self-attention; the decoder (right) generates the target sentence autoregressively via causal self-attention and cross-attention over the encoder output. Both sides share a 32K SentencePiece tokenizer.</em></p>
73
+ </div>
74
+
75
+ Netra-NMT follows a standard encoder-decoder transformer architecture with several modifications for training stability and parameter efficiency.
76
+
77
+ **Encoder** takes the source sentence tokenized by the shared 32K SentencePiece tokenizer, adds learned positional embeddings, and passes the sequence through 6 transformer layers with *bidirectional* self-attention (every token attends to every other token, similar to BERT). A final Pre-LN layer norm is applied to the encoder output before it is passed to the decoder via cross-attention.
78
+
79
+ **Decoder** takes the (partially generated) target sentence through the same tokenizer, adds positional embeddings, and passes it through 6 transformer layers. Each decoder layer applies three sub-layers in order: (1) *causal* (masked) self-attention over previously generated tokens, (2) cross-attention over the full encoder output, and (3) a feed-forward block. A final Pre-LN layer norm feeds into the tied linear projection head to produce output token probabilities.
80
+
81
+ **Architectural improvements over the vanilla transformer:**
82
+
83
+ | Feature | Detail |
84
+ |---------|--------|
85
+ | Pre-Layer Normalization | Layer norm applied *before* each sub-layer (Pre-LN) rather than after, improving gradient flow and training stability |
86
+ | SwiGLU FFN | Feed-forward blocks use the SwiGLU activation instead of ReLU, providing richer representational capacity at no parameter cost |
87
+ | Weight tying | The decoder input embedding matrix is shared with the output linear projection head, reducing redundant parameters |
88
+
89
+ **Hyperparameters:**
90
+
91
+ | | |
92
+ |---|---|
93
+ | d_model | 512 |
94
+ | Encoder / Decoder layers | 6 / 6 |
95
+ | Attention heads | 8 |
96
+ | FFN hidden size | 2048 |
97
+ | Vocabulary | 32K (SentencePiece unigram, shared) |
98
+ | Total parameters | ~89.7M |
99
+
100
+ ## 4. Evaluation Results
101
+
102
+ <p align="center">
103
+ <img src="https://raw.githubusercontent.com/netra-ai-lab/Netra-NMT/main/assets/metrics_benchmark.png" style="width: 1000px" align=center>
104
+ </p>
105
+
106
+ ## Install
107
+
108
+ ```bash
109
+ pip install netra-nmt # core (Python API + CLI)
110
+ pip install "netra-nmt[web]" # + FastAPI web app & REST API
111
+ ```
112
+
113
+ Or from source:
114
+
115
+ ```bash
116
+ git clone https://github.com/NDarayut/netra-nmt
117
+ cd netra-nmt
118
+ pip install -e ".[web]"
119
+ ```
120
+
121
+ The first translation downloads the weights (~180 MB fp16) from the Hugging Face Hub and caches them
122
+ under `~/.cache/huggingface`.
123
+
124
+ ## Usage
125
+
126
+ ### 1. Python API
127
+
128
+ ```python
129
+ from netra_nmt import NetraTranslator
130
+
131
+ t = NetraTranslator() # auto-detect GPU/CPU; downloads weights once
132
+ t.translate("Hello, how are you?", direction="en2km") # → "សួស្តី សុខសប្បាយអត់?"
133
+ t.translate("ខ្ញុំស្រឡាញ់ប្រទេសរបស់ខ្ញុំ។", direction="km2en")
134
+
135
+ # Batch + decoding options
136
+ t.translate_batch(["Good morning.", "See you tomorrow."], direction="en2km")
137
+ t.translate("Good morning, my friend.", direction="en2km", mode="beam", beam_size=5)
138
+ ```
139
+
140
+ One-shot helper (caches a default translator):
141
+
142
+ ```python
143
+ from netra_nmt import translate
144
+ translate("Hello", direction="en2km")
145
+ ```
146
+
147
+ `direction` is `"en2km"` (English→Khmer) or `"km2en"` (Khmer→English).
148
+ `mode` is `"greedy"` (default), `"beam"`, or `"sample"`.
149
+
150
+ ### 2. CLI
151
+
152
+ ```bash
153
+ # Single sentence (default direction en2km):
154
+ netra-translate --text "Hello, how are you?"
155
+
156
+ # Khmer → English with beam search:
157
+ netra-translate --text "សួស្តី, តើអ្នកសុខសប្បាយទេ?" --direction km2en --mode beam
158
+
159
+ # Translate a file (one sentence per line):
160
+ netra-translate --file input.txt --output output.txt --direction en2km
161
+
162
+ # Interactive REPL (omit --text / --file):
163
+ netra-translate
164
+ ```
165
+
166
+ ### 3. Web app + REST API (FastAPI)
167
+
168
+ ```bash
169
+ netra-web # serves the web UI + API at http://127.0.0.1:8000
170
+ netra-web --port 8080 --device cpu
171
+ netra-web --local-dir export # load weights from a local export dir
172
+ ```
173
+
174
+ A two-pane translation site (source left, output right, EN⇄KM swap button) plus a JSON API:
175
+
176
+ ```bash
177
+ curl -X POST http://127.0.0.1:8000/api/translate \
178
+ -H 'Content-Type: application/json' \
179
+ -d '{"text": "Hello, how are you?", "direction": "en2km"}'
180
+ # {"translation": "...", "direction": "en2km"}
181
+ ```
182
+
183
+ Requires the `web` extra (`pip install "netra-nmt[web]"`).
Binary file