tigrinya-tokenizer 0.1.0 (tar.gz)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tigrinya_tokenizer-0.1.0/PKG-INFO +59 -0
- tigrinya_tokenizer-0.1.0/README.md +45 -0
- tigrinya_tokenizer-0.1.0/pyproject.toml +26 -0
- tigrinya_tokenizer-0.1.0/setup.cfg +4 -0
- tigrinya_tokenizer-0.1.0/src/tigrinya_tokenizer.egg-info/PKG-INFO +59 -0
- tigrinya_tokenizer-0.1.0/src/tigrinya_tokenizer.egg-info/SOURCES.txt +10 -0
- tigrinya_tokenizer-0.1.0/src/tigrinya_tokenizer.egg-info/dependency_links.txt +1 -0
- tigrinya_tokenizer-0.1.0/src/tigrinya_tokenizer.egg-info/top_level.txt +1 -0
- tigrinya_tokenizer-0.1.0/src/tokenizer/corpus.py +8 -0
- tigrinya_tokenizer-0.1.0/src/tokenizer/normalization.py +9 -0
- tigrinya_tokenizer-0.1.0/src/tokenizer/train_bpe.py +89 -0
- tigrinya_tokenizer-0.1.0/test/test_tokenizer_tigrinya.py +79 -0
tigrinya_tokenizer-0.1.0/PKG-INFO

@@ -0,0 +1,59 @@

Metadata-Version: 2.4
Name: tigrinya-tokenizer
Version: 0.1.0
Summary: Robust Tigrinya Byte Pair Encoding tokenizer
Author-email: Haben Eyasu <habifishe@gmail.com>
License-Expression: MIT
Project-URL: Homepage, https://github.com/haben-ai/Tigrinya-Tokenizer
Project-URL: Issues, https://github.com/haben-ai/Tigrinya-Tokenizer/issues
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Cython
Classifier: Topic :: Text Processing :: Linguistic
Requires-Python: >=3.8
Description-Content-Type: text/markdown

# Tigriyna_BPE_Tokenizer

A Byte Pair Encoding (BPE) tokenizer for the Tigrinya language, designed for low-resource NLP research and machine learning pipelines.

Tigrinya is a low-resource Semitic language, and most existing tokenizers are optimized for high-resource languages. This project aims to reduce token fragmentation, lower out-of-vocabulary (OOV) rates, and better capture Tigrinya morphology.

---

## Features

- BPE-based subword tokenization for Tigrinya
- Optimized for low-resource settings
- Reduced OOV rate and token fragmentation
- Easy integration into NLP pipelines
- Reproducible tokenizer training and evaluation

---

## Motivation

Tokenization plays a critical role in NLP system performance. Generic tokenizers often perform poorly on Tigrinya due to:

- Rich morphology
- Limited training data
- Underrepresentation in multilingual models

This project addresses these challenges by providing a tokenizer tailored specifically to the Tigrinya language.

---

## Project Structure

```text
Tigriyna_BPE_Tokenizer/
├── data/
│   ├── raw/            # Raw text data (ignored)
│   └── processed/      # Processed text data (ignored)
├── tokenizer/
│   ├── train_bpe.py    # Train BPE tokenizer
│   ├── encode.py       # Encode text
│   └── decode.py       # Decode tokens
├── experiments/        # Evaluation and analysis
├── requirements.txt
├── .gitignore
└── README.md
```
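The `encode.py` and `decode.py` helpers listed in the tree are not included in this sdist, so the sketch below goes through the Hugging Face `tokenizers` API directly. A minimal usage sketch, assuming `train_bpe.py` (shown later in this diff) has already written `outputs/tokenizer/tokenizer.json`; the path and the sample sentence are taken from the shipped code:

```python
# Minimal usage sketch (not shipped with the package): load the trained
# tokenizer.json written by train_bpe.py and encode/decode Tigrinya text.
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("outputs/tokenizer/tokenizer.json")

encoding = tokenizer.encode("ሰላም ኩን ኣደርካ?")
print(encoding.tokens)                 # subword pieces
print(encoding.ids)                    # vocabulary ids
print(tokenizer.decode(encoding.ids))  # decoded text (spacing may differ)
```

Note that `tokenizers` is not declared under `dependencies` in pyproject.toml below, so it has to be installed separately (e.g. `pip install tokenizers`).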
tigrinya_tokenizer-0.1.0/README.md

@@ -0,0 +1,45 @@

(Identical to the README body embedded in PKG-INFO above, from the `# Tigriyna_BPE_Tokenizer` heading through the project-structure tree.)
tigrinya_tokenizer-0.1.0/pyproject.toml

@@ -0,0 +1,26 @@

[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[project]
name = "tigrinya-tokenizer"
version = "0.1.0"
description = "Robust Tigrinya Byte Pair Encoding tokenizer"
authors = [
    { name = "Haben Eyasu", email = "habifishe@gmail.com" }
]
readme = "README.md"
requires-python = ">=3.8"
dependencies = []

license = "MIT"
classifiers = [
    "Programming Language :: Python :: 3",
    "Programming Language :: Cython",
    "Topic :: Text Processing :: Linguistic"
]

[project.urls]
Homepage = "https://github.com/haben-ai/Tigrinya-Tokenizer"
Issues = "https://github.com/haben-ai/Tigrinya-Tokenizer/issues"
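Two caveats about this configuration. First, the bare SPDX string `license = "MIT"` (PEP 639) is only accepted by recent setuptools (roughly 77.0 and later, the versions that also emit the `License-Expression` and `Metadata-Version: 2.4` fields seen in PKG-INFO above), so the `setuptools>=61.0` pin may be too loose for a reproducible build; a stricter `requires = ["setuptools>=77.0"]` would match the metadata actually produced. Second, `dependencies = []` even though the shipped `train_bpe.py` and test script import `tokenizers` and `yaml` (PyYAML), so both must be installed by hand.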
tigrinya_tokenizer-0.1.0/src/tigrinya_tokenizer.egg-info/PKG-INFO

@@ -0,0 +1,59 @@

(Identical to tigrinya_tokenizer-0.1.0/PKG-INFO above; setuptools copies the same metadata into the egg-info directory.)
tigrinya_tokenizer-0.1.0/src/tigrinya_tokenizer.egg-info/SOURCES.txt

@@ -0,0 +1,10 @@

README.md
pyproject.toml
src/tigrinya_tokenizer.egg-info/PKG-INFO
src/tigrinya_tokenizer.egg-info/SOURCES.txt
src/tigrinya_tokenizer.egg-info/dependency_links.txt
src/tigrinya_tokenizer.egg-info/top_level.txt
src/tokenizer/corpus.py
src/tokenizer/normalization.py
src/tokenizer/train_bpe.py
test/test_tokenizer_tigrinya.py
tigrinya_tokenizer-0.1.0/src/tigrinya_tokenizer.egg-info/dependency_links.txt

@@ -0,0 +1 @@

(a single blank line)
tigrinya_tokenizer-0.1.0/src/tigrinya_tokenizer.egg-info/top_level.txt

@@ -0,0 +1 @@

tokenizer
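As `top_level.txt` records, the import package this distribution installs is `tokenizer` (the directory under `src/`), not `tigrinya_tokenizer`; user code would therefore write `from tokenizer import train_bpe`, and such a generic top-level name risks colliding with any other installed package that also claims `tokenizer`.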
tigrinya_tokenizer-0.1.0/src/tokenizer/train_bpe.py

@@ -0,0 +1,89 @@

import os
from pathlib import Path
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import Sequence, NFC
from tokenizers.decoders import BPEDecoder
import yaml


def load_config(path="configs/bpe_50k.yaml"):
    with open(path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)


def train():
    cfg = load_config()
    corpus_file = "data/processed/normalized.txt"

    if not os.path.exists(corpus_file):
        print(f"[ERROR] Corpus file not found: {corpus_file}")
        return

    if os.path.getsize(corpus_file) == 0:
        print(f"[ERROR] Corpus file is empty: {corpus_file}")
        return

    print(f"[INFO] Training BPE tokenizer on: {corpus_file}")
    print(f"[INFO] Target vocab size: {cfg['tokenizer']['vocab_size']}")

    # Initialize tokenizer
    tokenizer = Tokenizer(BPE(unk_token="<unk>"))

    # Unicode normalization (critical for Ge'ez script consistency)
    tokenizer.normalizer = Sequence([NFC()])

    # ✅ IMPORTANT FIX:
    # Use whitespace splitting so BPE can learn merges inside words
    tokenizer.pre_tokenizer = Whitespace()

    # Proper BPE decoder
    tokenizer.decoder = BPEDecoder()

    # Trainer configuration
    trainer = BpeTrainer(
        vocab_size=cfg["tokenizer"]["vocab_size"],
        min_frequency=cfg["tokenizer"]["min_frequency"],
        special_tokens=cfg["special_tokens"]
    )

    print("[INFO] Starting training...")
    tokenizer.train(files=[corpus_file], trainer=trainer)
    print("[INFO] Training complete!")

    # Save tokenizer
    out_dir = Path("outputs/tokenizer")
    out_dir.mkdir(parents=True, exist_ok=True)
    output_path = out_dir / "tokenizer.json"
    tokenizer.save(str(output_path))

    print(f"[INFO] Tokenizer saved to: {output_path}")

    # Quick sanity check
    test_text = "ሰላም ኩን ኣደርካ?"
    encoding = tokenizer.encode(test_text)

    print(f"\n[INFO] Sample text: {test_text}")
    print(f"[INFO] Tokens: {encoding.tokens}")
    print(f"[INFO] Decoded: {tokenizer.decode(encoding.ids)}")

    if tokenizer.decode(encoding.ids) == test_text:
        print("[INFO] Round-trip PASSED ✅")
    else:
        print("[WARNING] Round-trip FAILED ❌")

    # Check if merges were learned
    model = tokenizer.model
    if hasattr(model, "get_merges"):
        merges = model.get_merges()
        print(f"[INFO] Number of learned merges: {len(merges)}")
        if len(merges) == 0:
            print("[WARNING] No merges learned! Increase vocab_size or check corpus.")
    else:
        print("[INFO] Cannot directly inspect merges (check tokenizer.json).")


if __name__ == "__main__":
    train()
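`train_bpe.py` reads its hyperparameters from `configs/bpe_50k.yaml`, which is not part of this distribution. The sketch below writes a config that satisfies the three lookups in the code (`cfg["tokenizer"]["vocab_size"]`, `cfg["tokenizer"]["min_frequency"]`, `cfg["special_tokens"]`); the key names come from the code, while the concrete values are illustrative assumptions (the filename suggests a 50k vocabulary).

```python
# Hypothetical helper: generate the configs/bpe_50k.yaml that train_bpe.py
# expects. Key names are taken from the cfg[...] lookups in train_bpe.py;
# min_frequency and the special-token list are illustrative assumptions.
from pathlib import Path

import yaml

config = {
    "tokenizer": {
        "vocab_size": 50_000,  # implied by the file name bpe_50k.yaml
        "min_frequency": 2,    # assumed default
    },
    # "<unk>" must be present: BPE(unk_token="<unk>") refers to it.
    "special_tokens": ["<unk>", "<pad>", "<s>", "</s>"],
}

Path("configs").mkdir(parents=True, exist_ok=True)
with open("configs/bpe_50k.yaml", "w", encoding="utf-8") as f:
    yaml.safe_dump(config, f, allow_unicode=True, sort_keys=False)
```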
tigrinya_tokenizer-0.1.0/test/test_tokenizer_tigrinya.py

@@ -0,0 +1,79 @@

from tokenizers import Tokenizer

TOKENIZER_PATH = "outputs/tokenizer/tokenizer.json"

# Load tokenizer
tokenizer = Tokenizer.from_file(TOKENIZER_PATH)


# 1️⃣ Random Tigrinya Test Words

TEST_WORDS = [
    "ሰላም",
    "ትግርኛ",
    "ኣደርካ",
    "ሕብረት",
    "መንግስቲ",
    "ምምሕዳር",
    "ትምህርቲ",
    "ኤርትራ",
    "ሃገር",
    "ፍቕሪ",
    "ጸሓፊ",
    "ቤት",
    "ስራሕ",
    "ኣቦ",
    "ኣይተ",
]


def test_word(word):
    print("=" * 60)
    print(f"Original: {word}")

    encoding = tokenizer.encode(word)
    tokens = encoding.tokens
    decoded = tokenizer.decode(encoding.ids)

    print(f"Tokens: {tokens}")
    print(f"Decoded: {decoded}")

    # Check unknown tokens
    if "<unk>" in tokens:
        print("⚠ WARNING: <unk> token detected")

    # Check round-trip correctness
    if decoded == word:
        print("✅ Round-trip OK")
    else:
        print("❌ Round-trip FAILED")


def run_tests():
    print("\nRunning Tigrinya Tokenizer Tests\n")
    for word in TEST_WORDS:
        test_word(word)


# 2️⃣ Sentence-Level Tests

SENTENCES = [
    "ሰላም ኩን ኣደርካ?",
    "ኣብ ትግርኛ መምህራን ኣሎዉ።",
    "ትምህርቲ ኣገዳሲ ኢዩ፣ ወላ'ውን ጠቃሚ ኢዩ",
]


def run_sentence_tests():
    print("\nRunning Sentence Tests\n")
    for sentence in SENTENCES:
        test_word(sentence)


if __name__ == "__main__":
    run_tests()
    run_sentence_tests()
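Despite its `test_` name, this file is a print-based script: nothing is asserted, and pytest would collect `test_word` only to error on its required `word` argument. A hedged pytest-style sketch follows, asserting only word-level round-trips; sentence round-trips are expected to diverge here because the `Whitespace` pre-tokenizer discards the spacing between words and `BPEDecoder` cannot restore it, which is the likely reason the scripts above would report a sentence-level `Round-trip FAILED`.

```python
# Hypothetical assert-based variant of the checks above (not shipped with the
# package). Assumes a trained tokenizer at outputs/tokenizer/tokenizer.json.
import pytest
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("outputs/tokenizer/tokenizer.json")

WORDS = ["ሰላም", "ትግርኛ", "መንግስቲ", "ትምህርቲ", "ኤርትራ"]


@pytest.mark.parametrize("word", WORDS)
def test_word_round_trip(word):
    encoding = tokenizer.encode(word)
    # Every character should be covered by the learned vocabulary.
    assert "<unk>" not in encoding.tokens
    # A single word contains no spaces, so decoding should restore it exactly.
    assert tokenizer.decode(encoding.ids) == word
```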