tigrinya-tokenizer 0.1.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,59 @@
+ Metadata-Version: 2.4
+ Name: tigrinya-tokenizer
+ Version: 0.1.0
+ Summary: Robust Tigrinya Byte Pair Encoding tokenizer
+ Author-email: Haben Eyasu <habifishe@gmail.com>
+ License-Expression: MIT
+ Project-URL: Homepage, https://github.com/haben-ai/Tigrinya-Tokenizer
+ Project-URL: Issues, https://github.com/haben-ai/Tigrinya-Tokenizer/issues
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Cython
+ Classifier: Topic :: Text Processing :: Linguistic
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+
+ # Tigrinya_BPE_Tokenizer
+
+ A Byte Pair Encoding (BPE) tokenizer for the Tigrinya language, designed for low-resource NLP research and machine learning pipelines.
+
+ Tigrinya is a low-resource Semitic language, and most existing tokenizers are optimized for high-resource languages. This project aims to reduce token fragmentation, lower out-of-vocabulary (OOV) rates, and better capture Tigrinya morphology.
+
+ ---
+
+ ## Features
+
+ - BPE-based subword tokenization for Tigrinya
+ - Optimized for low-resource settings
+ - Reduced OOV rate and token fragmentation
+ - Easy integration into NLP pipelines
+ - Reproducible tokenizer training and evaluation
+
+ ---
+
+ ## Motivation
+
+ Tokenization plays a critical role in NLP system performance. Generic tokenizers often perform poorly on Tigrinya due to:
+
+ - Rich morphology
+ - Limited training data
+ - Underrepresentation in multilingual models
+
+ This project addresses these challenges by providing a tokenizer tailored specifically to the Tigrinya language.
+
+ ---
+
+ ## Project Structure
+
+ ```text
+ Tigrinya_BPE_Tokenizer/
+ ├── data/
+ │   ├── raw/          # Raw text data (ignored)
+ │   ├── processed/    # Processed text data (ignored)
+ ├── tokenizer/
+ │   ├── train_bpe.py  # Train BPE tokenizer
+ │   ├── encode.py     # Encode text
+ │   └── decode.py     # Decode tokens
+ ├── experiments/      # Evaluation and analysis
+ ├── requirements.txt
+ ├── .gitignore
+ └── README.md
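
The README above describes BPE subword tokenization but does not show the mechanism. For orientation only: BPE repeatedly merges the most frequent adjacent symbol pair in a corpus until a target number of merges (or vocabulary size) is reached. The sketch below illustrates that merge loop on toy English words; it is not part of the package, which instead trains its tokenizer with the Hugging Face `tokenizers` library (see `train_bpe.py` further down).

```python
# Minimal BPE merge loop (illustrative sketch, not code from this package).
# Words are tuples of symbols; the most frequent adjacent pair is merged
# into a single symbol on each iteration.
from collections import Counter

def get_pair_counts(vocab):
    # Count adjacent symbol pairs, weighted by word frequency.
    pairs = Counter()
    for word, freq in vocab.items():
        for a, b in zip(word, word[1:]):
            pairs[(a, b)] += freq
    return pairs

def merge_pair(pair, vocab):
    # Replace every occurrence of `pair` with one merged symbol.
    merged = {}
    for word, freq in vocab.items():
        out, i = [], 0
        while i < len(word):
            if i < len(word) - 1 and (word[i], word[i + 1]) == pair:
                out.append(word[i] + word[i + 1])
                i += 2
            else:
                out.append(word[i])
                i += 1
        merged[tuple(out)] = freq
    return merged

# Toy corpus: word -> frequency, each word split into characters.
vocab = {tuple("lower"): 5, tuple("lowest"): 2, tuple("newer"): 6, tuple("wider"): 3}
for _ in range(4):  # learn 4 merges
    pair = get_pair_counts(vocab).most_common(1)[0][0]
    vocab = merge_pair(pair, vocab)
    print("merged", pair)
```

Frequent fragments (such as "er" and "low" in this toy data) become single tokens, which is the same effect the project relies on to reduce token fragmentation for Tigrinya.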
@@ -0,0 +1,45 @@
+ # Tigrinya_BPE_Tokenizer
+
+ A Byte Pair Encoding (BPE) tokenizer for the Tigrinya language, designed for low-resource NLP research and machine learning pipelines.
+
+ Tigrinya is a low-resource Semitic language, and most existing tokenizers are optimized for high-resource languages. This project aims to reduce token fragmentation, lower out-of-vocabulary (OOV) rates, and better capture Tigrinya morphology.
+
+ ---
+
+ ## Features
+
+ - BPE-based subword tokenization for Tigrinya
+ - Optimized for low-resource settings
+ - Reduced OOV rate and token fragmentation
+ - Easy integration into NLP pipelines
+ - Reproducible tokenizer training and evaluation
+
+ ---
+
+ ## Motivation
+
+ Tokenization plays a critical role in NLP system performance. Generic tokenizers often perform poorly on Tigrinya due to:
+
+ - Rich morphology
+ - Limited training data
+ - Underrepresentation in multilingual models
+
+ This project addresses these challenges by providing a tokenizer tailored specifically to the Tigrinya language.
+
+ ---
+
+ ## Project Structure
+
+ ```text
+ Tigrinya_BPE_Tokenizer/
+ ├── data/
+ │   ├── raw/          # Raw text data (ignored)
+ │   ├── processed/    # Processed text data (ignored)
+ ├── tokenizer/
+ │   ├── train_bpe.py  # Train BPE tokenizer
+ │   ├── encode.py     # Encode text
+ │   └── decode.py     # Decode tokens
+ ├── experiments/      # Evaluation and analysis
+ ├── requirements.txt
+ ├── .gitignore
+ └── README.md
@@ -0,0 +1,26 @@
+ [build-system]
+ requires = ["setuptools>=61.0"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "tigrinya-tokenizer"
+ version = "0.1.0"
+ description = "Robust Tigrinya Byte Pair Encoding tokenizer"
+ authors = [
+     { name="Haben Eyasu", email="habifishe@gmail.com" }
+ ]
+ readme = "README.md"
+ requires-python = ">=3.8"
+ dependencies = []
+
+
+ license = "MIT"
+ classifiers = [
+     "Programming Language :: Python :: 3",
+     "Programming Language :: Cython",
+     "Topic :: Text Processing :: Linguistic"
+ ]
+
+ [project.urls]
+ Homepage = "https://github.com/haben-ai/Tigrinya-Tokenizer"
+ Issues = "https://github.com/haben-ai/Tigrinya-Tokenizer/issues"
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+
@@ -0,0 +1,59 @@
+ Metadata-Version: 2.4
+ Name: tigrinya-tokenizer
+ Version: 0.1.0
+ Summary: Robust Tigrinya Byte Pair Encoding tokenizer
+ Author-email: Haben Eyasu <habifishe@gmail.com>
+ License-Expression: MIT
+ Project-URL: Homepage, https://github.com/haben-ai/Tigrinya-Tokenizer
+ Project-URL: Issues, https://github.com/haben-ai/Tigrinya-Tokenizer/issues
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Cython
+ Classifier: Topic :: Text Processing :: Linguistic
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+
+ # Tigrinya_BPE_Tokenizer
+
+ A Byte Pair Encoding (BPE) tokenizer for the Tigrinya language, designed for low-resource NLP research and machine learning pipelines.
+
+ Tigrinya is a low-resource Semitic language, and most existing tokenizers are optimized for high-resource languages. This project aims to reduce token fragmentation, lower out-of-vocabulary (OOV) rates, and better capture Tigrinya morphology.
+
+ ---
+
+ ## Features
+
+ - BPE-based subword tokenization for Tigrinya
+ - Optimized for low-resource settings
+ - Reduced OOV rate and token fragmentation
+ - Easy integration into NLP pipelines
+ - Reproducible tokenizer training and evaluation
+
+ ---
+
+ ## Motivation
+
+ Tokenization plays a critical role in NLP system performance. Generic tokenizers often perform poorly on Tigrinya due to:
+
+ - Rich morphology
+ - Limited training data
+ - Underrepresentation in multilingual models
+
+ This project addresses these challenges by providing a tokenizer tailored specifically to the Tigrinya language.
+
+ ---
+
+ ## Project Structure
+
+ ```text
+ Tigrinya_BPE_Tokenizer/
+ ├── data/
+ │   ├── raw/          # Raw text data (ignored)
+ │   ├── processed/    # Processed text data (ignored)
+ ├── tokenizer/
+ │   ├── train_bpe.py  # Train BPE tokenizer
+ │   ├── encode.py     # Encode text
+ │   └── decode.py     # Decode tokens
+ ├── experiments/      # Evaluation and analysis
+ ├── requirements.txt
+ ├── .gitignore
+ └── README.md
@@ -0,0 +1,10 @@
+ README.md
+ pyproject.toml
+ src/tigrinya_tokenizer.egg-info/PKG-INFO
+ src/tigrinya_tokenizer.egg-info/SOURCES.txt
+ src/tigrinya_tokenizer.egg-info/dependency_links.txt
+ src/tigrinya_tokenizer.egg-info/top_level.txt
+ src/tokenizer/corpus.py
+ src/tokenizer/normalization.py
+ src/tokenizer/train_bpe.py
+ test/test_tokenizer_tigrinya.py
@@ -0,0 +1,8 @@
+ from pathlib import Path
+
+ def iter_lines(path: Path):
+     with path.open("r", encoding="utf-8") as f:
+         for line in f:
+             line = line.strip()
+             if line:
+                 yield line
@@ -0,0 +1,9 @@
+ import unicodedata
+ import regex as re
+
+ WHITESPACE_RE = re.compile(r"\s+")
+
+ def normalize_text(text: str) -> str:
+     text = unicodedata.normalize("NFC", text)
+     text = WHITESPACE_RE.sub(" ", text).strip()
+     return text
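
The two helpers above (`iter_lines` in `src/tokenizer/corpus.py` and `normalize_text` in `src/tokenizer/normalization.py`, per SOURCES.txt) are presumably combined to produce the `data/processed/normalized.txt` file that the training script in the next hunk expects. The package ships no such driver, so the following is only a sketch of how the pieces might be wired together; the raw input path is an assumption.

```python
# Sketch only: build the normalized corpus expected by train_bpe.py.
from pathlib import Path

from tokenizer.corpus import iter_lines            # assumes the src/tokenizer layout is importable
from tokenizer.normalization import normalize_text

raw_path = Path("data/raw/corpus.txt")             # hypothetical raw corpus file
out_path = Path("data/processed/normalized.txt")   # path read by train_bpe.py

out_path.parent.mkdir(parents=True, exist_ok=True)
with out_path.open("w", encoding="utf-8") as out:
    for line in iter_lines(raw_path):
        normalized = normalize_text(line)
        if normalized:
            out.write(normalized + "\n")
```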
@@ -0,0 +1,89 @@
+ import os
+ from pathlib import Path
+ from tokenizers import Tokenizer
+ from tokenizers.models import BPE
+ from tokenizers.trainers import BpeTrainer
+ from tokenizers.pre_tokenizers import Whitespace
+ from tokenizers.normalizers import Sequence, NFC
+ from tokenizers.decoders import BPEDecoder
+ import yaml
+
+
+ def load_config(path="configs/bpe_50k.yaml"):
+     with open(path, "r", encoding="utf-8") as f:
+         return yaml.safe_load(f)
+
+
+ def train():
+     cfg = load_config()
+     corpus_file = "data/processed/normalized.txt"
+
+     if not os.path.exists(corpus_file):
+         print(f"[ERROR] Corpus file not found: {corpus_file}")
+         return
+
+     if os.path.getsize(corpus_file) == 0:
+         print(f"[ERROR] Corpus file is empty: {corpus_file}")
+         return
+
+     print(f"[INFO] Training BPE tokenizer on: {corpus_file}")
+     print(f"[INFO] Target vocab size: {cfg['tokenizer']['vocab_size']}")
+
+     # Initialize tokenizer
+     tokenizer = Tokenizer(BPE(unk_token="<unk>"))
+
+     # Unicode normalization (critical for Ge’ez script consistency)
+     tokenizer.normalizer = Sequence([NFC()])
+
+     # ✅ IMPORTANT FIX:
+     # Use whitespace splitting so BPE can learn merges inside words
+     tokenizer.pre_tokenizer = Whitespace()
+
+     # Proper BPE decoder
+     tokenizer.decoder = BPEDecoder()
+
+     # Trainer configuration
+     trainer = BpeTrainer(
+         vocab_size=cfg["tokenizer"]["vocab_size"],
+         min_frequency=cfg["tokenizer"]["min_frequency"],
+         special_tokens=cfg["special_tokens"]
+     )
+
+     print("[INFO] Starting training...")
+     tokenizer.train(files=[corpus_file], trainer=trainer)
+     print("[INFO] Training complete!")
+
+     # Save tokenizer
+     out_dir = Path("outputs/tokenizer")
+     out_dir.mkdir(parents=True, exist_ok=True)
+     output_path = out_dir / "tokenizer.json"
+     tokenizer.save(str(output_path))
+
+     print(f"[INFO] Tokenizer saved to: {output_path}")
+
+     # Quick sanity check
+     test_text = "ሰላም ኩን ኣደርካ?"
+     encoding = tokenizer.encode(test_text)
+
+     print(f"\n[INFO] Sample text: {test_text}")
+     print(f"[INFO] Tokens: {encoding.tokens}")
+     print(f"[INFO] Decoded: {tokenizer.decode(encoding.ids)}")
+
+     if tokenizer.decode(encoding.ids) == test_text:
+         print("[INFO] Round-trip PASSED ✅")
+     else:
+         print("[WARNING] Round-trip FAILED ❌")
+
+     # Check if merges were learned
+     model = tokenizer.model
+     if hasattr(model, "get_merges"):
+         merges = model.get_merges()
+         print(f"[INFO] Number of learned merges: {len(merges)}")
+         if len(merges) == 0:
+             print("[WARNING] No merges learned! Increase vocab_size or check corpus.")
+     else:
+         print("[INFO] Cannot directly inspect merges (check tokenizer.json).")
+
+
+ if __name__ == "__main__":
+     train()
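
The training script reads `configs/bpe_50k.yaml`, but no config file is included in this sdist. Judging from the keys the script accesses (`tokenizer.vocab_size`, `tokenizer.min_frequency`, and a top-level `special_tokens` list), a compatible config could be generated with the sketch below; the concrete values are assumptions, not values shipped with the package.

```python
# Sketch only: write a config compatible with load_config() in train_bpe.py.
# Key names mirror the accesses in the script; the values are assumptions.
from pathlib import Path
import yaml  # PyYAML, which train_bpe.py already imports

config = {
    "tokenizer": {
        "vocab_size": 50000,   # assumed from the file name bpe_50k.yaml
        "min_frequency": 2,    # assumed default
    },
    # Should include <unk> so the BPE model's unk_token exists in the vocabulary.
    "special_tokens": ["<unk>", "<pad>", "<s>", "</s>"],
}

path = Path("configs/bpe_50k.yaml")
path.parent.mkdir(parents=True, exist_ok=True)
with path.open("w", encoding="utf-8") as f:
    yaml.safe_dump(config, f, allow_unicode=True)
```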
@@ -0,0 +1,79 @@
+ from tokenizers import Tokenizer
+
+ TOKENIZER_PATH = "outputs/tokenizer/tokenizer.json"
+
+ # Load tokenizer
+ tokenizer = Tokenizer.from_file(TOKENIZER_PATH)
+
+
+
+ # 1️⃣ Random Tigrinya Test Words
+
+
+ TEST_WORDS = [
+     "ሰላም",
+     "ትግርኛ",
+     "ኣደርካ",
+     "ሕብረት",
+     "መንግስቲ",
+     "ምምሕዳር",
+     "ትምህርቲ",
+     "ኤርትራ",
+     "ሃገር",
+     "ፍቕሪ",
+     "ጸሓፊ",
+     "ቤት",
+     "ስራሕ",
+     "ኣቦ",
+     "ኣይተ"
+ ]
+
+
+ def test_word(word):
+     print("=" * 60)
+     print(f"Original: {word}")
+
+     encoding = tokenizer.encode(word)
+     tokens = encoding.tokens
+     decoded = tokenizer.decode(encoding.ids)
+
+     print(f"Tokens: {tokens}")
+     print(f"Decoded: {decoded}")
+
+     # Check unknown tokens
+     if "<unk>" in tokens:
+         print("⚠ WARNING: <unk> token detected")
+
+     # Check round-trip correctness
+     if decoded == word:
+         print("✅ Round-trip OK")
+     else:
+         print("❌ Round-trip FAILED")
+
+
+ def run_tests():
+     print("\nRunning Tigrinya Tokenizer Tests\n")
+     for word in TEST_WORDS:
+         test_word(word)
+
+
+
+ # 2️⃣ Sentence-Level Tests
+
+
+ SENTENCES = [
+     "ሰላም ኩን ኣደርካ?",
+     "ኣብ ትግርኛ መምህራን ኣሎዉ።",
+     "ትምህርቲ ኣገዳሲ ኢዩ፣ ወላ'ውን ጠቃሚ ኢዩ",
+ ]
+
+
+ def run_sentence_tests():
+     print("\nRunning Sentence Tests\n")
+     for sentence in SENTENCES:
+         test_word(sentence)
+
+
+ if __name__ == "__main__":
+     run_tests()
+     run_sentence_tests()
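
The test script above reports results with print statements rather than assertions, so it cannot fail under a test runner. An assertion-based variant of the same round-trip check, suitable for pytest, might look like the sketch below; it reuses the tokenizer path and a few of the sample words from the script and assumes `outputs/tokenizer/tokenizer.json` has already been trained.

```python
# Sketch only: pytest-style round-trip checks mirroring the script above.
import pytest
from tokenizers import Tokenizer

TOKENIZER_PATH = "outputs/tokenizer/tokenizer.json"
WORDS = ["ሰላም", "ትግርኛ", "መንግስቲ", "ትምህርቲ", "ሃገር"]

@pytest.fixture(scope="module")
def tokenizer():
    # Requires that train_bpe.py has produced tokenizer.json beforehand.
    return Tokenizer.from_file(TOKENIZER_PATH)

@pytest.mark.parametrize("word", WORDS)
def test_round_trip(tokenizer, word):
    encoding = tokenizer.encode(word)
    assert "<unk>" not in encoding.tokens, "unexpected <unk> token"
    assert tokenizer.decode(encoding.ids) == word
```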