tigrinya-tokenizer 0.1.0 (tar.gz)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tigrinya_tokenizer-0.1.0/PKG-INFO +59 -0
- tigrinya_tokenizer-0.1.0/README.md +45 -0
- tigrinya_tokenizer-0.1.0/pyproject.toml +26 -0
- tigrinya_tokenizer-0.1.0/setup.cfg +4 -0
- tigrinya_tokenizer-0.1.0/src/tigrinya_tokenizer.egg-info/PKG-INFO +59 -0
- tigrinya_tokenizer-0.1.0/src/tigrinya_tokenizer.egg-info/SOURCES.txt +10 -0
- tigrinya_tokenizer-0.1.0/src/tigrinya_tokenizer.egg-info/dependency_links.txt +1 -0
- tigrinya_tokenizer-0.1.0/src/tigrinya_tokenizer.egg-info/top_level.txt +1 -0
- tigrinya_tokenizer-0.1.0/src/tokenizer/corpus.py +8 -0
- tigrinya_tokenizer-0.1.0/src/tokenizer/normalization.py +9 -0
- tigrinya_tokenizer-0.1.0/src/tokenizer/train_bpe.py +89 -0
- tigrinya_tokenizer-0.1.0/test/test_tokenizer_tigrinya.py +79 -0
tigrinya_tokenizer-0.1.0/PKG-INFO

@@ -0,0 +1,59 @@

Metadata-Version: 2.4
Name: tigrinya-tokenizer
Version: 0.1.0
Summary: Robust Tigrinya Byte Pair Encoding tokenizer
Author-email: Haben Eyasu <habifishe@gmail.com>
License-Expression: MIT
Project-URL: Homepage, https://github.com/haben-ai/Tigrinya-Tokenizer
Project-URL: Issues, https://github.com/haben-ai/Tigrinya-Tokenizer/issues
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Cython
Classifier: Topic :: Text Processing :: Linguistic
Requires-Python: >=3.8
Description-Content-Type: text/markdown

# Tigriyna_BPE_Tokenizer

A Byte Pair Encoding (BPE) tokenizer for the Tigrinya language, designed for low-resource NLP research and machine learning pipelines.

Tigrinya is a low-resource Semitic language, and most existing tokenizers are optimized for high-resource languages. This project aims to reduce token fragmentation, lower out-of-vocabulary (OOV) rates, and better capture Tigrinya morphology.

---

## Features

- BPE-based subword tokenization for Tigrinya
- Optimized for low-resource settings
- Reduced OOV rate and token fragmentation
- Easy integration into NLP pipelines
- Reproducible tokenizer training and evaluation

---

## Motivation

Tokenization plays a critical role in NLP system performance. Generic tokenizers often perform poorly on Tigrinya due to:

- Rich morphology
- Limited training data
- Underrepresentation in multilingual models

This project addresses these challenges by providing a tokenizer tailored specifically to the Tigrinya language.

---

## Project Structure

```text
Tigriyna_BPE_Tokenizer/
├── data/
│   ├── raw/            # Raw text data (ignored)
│   └── processed/      # Processed text data (ignored)
├── tokenizer/
│   ├── train_bpe.py    # Train BPE tokenizer
│   ├── encode.py       # Encode text
│   └── decode.py       # Decode tokens
├── experiments/        # Evaluation and analysis
├── requirements.txt
├── .gitignore
└── README.md
```
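The `encode.py` and `decode.py` helpers listed in the tree are not included in this sdist, so the sketch below goes through the Hugging Face `tokenizers` API directly. A minimal usage sketch, assuming `train_bpe.py` (shown later in this diff) has already written `outputs/tokenizer/tokenizer.json`; the path and the sample sentence are taken from the shipped code:

```python
# Minimal usage sketch (not shipped with the package): load the trained
# tokenizer.json written by train_bpe.py and encode/decode Tigrinya text.
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("outputs/tokenizer/tokenizer.json")

encoding = tokenizer.encode("ሰላም ኩን ኣደርካ?")
print(encoding.tokens)                 # subword pieces
print(encoding.ids)                    # vocabulary ids
print(tokenizer.decode(encoding.ids))  # decoded text (spacing may differ)
```

Note that `tokenizers` is not declared under `dependencies` in pyproject.toml below, so it has to be installed separately (e.g. `pip install tokenizers`).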
tigrinya_tokenizer-0.1.0/README.md

@@ -0,0 +1,45 @@

(Identical to the README body embedded in PKG-INFO above, from the `# Tigriyna_BPE_Tokenizer` heading through the project-structure tree.)
tigrinya_tokenizer-0.1.0/pyproject.toml

@@ -0,0 +1,26 @@

[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[project]
name = "tigrinya-tokenizer"
version = "0.1.0"
description = "Robust Tigrinya Byte Pair Encoding tokenizer"
authors = [
    { name = "Haben Eyasu", email = "habifishe@gmail.com" }
]
readme = "README.md"
requires-python = ">=3.8"
dependencies = []

license = "MIT"
classifiers = [
    "Programming Language :: Python :: 3",
    "Programming Language :: Cython",
    "Topic :: Text Processing :: Linguistic"
]

[project.urls]
Homepage = "https://github.com/haben-ai/Tigrinya-Tokenizer"
Issues = "https://github.com/haben-ai/Tigrinya-Tokenizer/issues"
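Two caveats about this configuration. First, the bare SPDX string `license = "MIT"` (PEP 639) is only accepted by recent setuptools (roughly 77.0 and later, the versions that also emit the `License-Expression` and `Metadata-Version: 2.4` fields seen in PKG-INFO above), so the `setuptools>=61.0` pin may be too loose for a reproducible build; a stricter `requires = ["setuptools>=77.0"]` would match the metadata actually produced. Second, `dependencies = []` even though the shipped `train_bpe.py` and test script import `tokenizers` and `yaml` (PyYAML), so both must be installed by hand.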
tigrinya_tokenizer-0.1.0/src/tigrinya_tokenizer.egg-info/PKG-INFO

@@ -0,0 +1,59 @@

(Identical to tigrinya_tokenizer-0.1.0/PKG-INFO above; setuptools copies the same metadata into the egg-info directory.)
tigrinya_tokenizer-0.1.0/src/tigrinya_tokenizer.egg-info/SOURCES.txt

@@ -0,0 +1,10 @@

README.md
pyproject.toml
src/tigrinya_tokenizer.egg-info/PKG-INFO
src/tigrinya_tokenizer.egg-info/SOURCES.txt
src/tigrinya_tokenizer.egg-info/dependency_links.txt
src/tigrinya_tokenizer.egg-info/top_level.txt
src/tokenizer/corpus.py
src/tokenizer/normalization.py
src/tokenizer/train_bpe.py
test/test_tokenizer_tigrinya.py
tigrinya_tokenizer-0.1.0/src/tigrinya_tokenizer.egg-info/dependency_links.txt

@@ -0,0 +1 @@

(a single blank line)
tigrinya_tokenizer-0.1.0/src/tigrinya_tokenizer.egg-info/top_level.txt

@@ -0,0 +1 @@

tokenizer
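As `top_level.txt` records, the import package this distribution installs is `tokenizer` (the directory under `src/`), not `tigrinya_tokenizer`; user code would therefore write `from tokenizer import train_bpe`, and such a generic top-level name risks colliding with any other installed package that also claims `tokenizer`.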
tigrinya_tokenizer-0.1.0/src/tokenizer/train_bpe.py

@@ -0,0 +1,89 @@

import os
from pathlib import Path
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import Sequence, NFC
from tokenizers.decoders import BPEDecoder
import yaml


def load_config(path="configs/bpe_50k.yaml"):
    with open(path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)


def train():
    cfg = load_config()
    corpus_file = "data/processed/normalized.txt"

    if not os.path.exists(corpus_file):
        print(f"[ERROR] Corpus file not found: {corpus_file}")
        return

    if os.path.getsize(corpus_file) == 0:
        print(f"[ERROR] Corpus file is empty: {corpus_file}")
        return

    print(f"[INFO] Training BPE tokenizer on: {corpus_file}")
    print(f"[INFO] Target vocab size: {cfg['tokenizer']['vocab_size']}")

    # Initialize tokenizer
    tokenizer = Tokenizer(BPE(unk_token="<unk>"))

    # Unicode normalization (critical for Ge'ez script consistency)
    tokenizer.normalizer = Sequence([NFC()])

    # ✅ IMPORTANT FIX:
    # Use whitespace splitting so BPE can learn merges inside words
    tokenizer.pre_tokenizer = Whitespace()

    # Proper BPE decoder
    tokenizer.decoder = BPEDecoder()

    # Trainer configuration
    trainer = BpeTrainer(
        vocab_size=cfg["tokenizer"]["vocab_size"],
        min_frequency=cfg["tokenizer"]["min_frequency"],
        special_tokens=cfg["special_tokens"]
    )

    print("[INFO] Starting training...")
    tokenizer.train(files=[corpus_file], trainer=trainer)
    print("[INFO] Training complete!")

    # Save tokenizer
    out_dir = Path("outputs/tokenizer")
    out_dir.mkdir(parents=True, exist_ok=True)
    output_path = out_dir / "tokenizer.json"
    tokenizer.save(str(output_path))

    print(f"[INFO] Tokenizer saved to: {output_path}")

    # Quick sanity check
    test_text = "ሰላም ኩን ኣደርካ?"
    encoding = tokenizer.encode(test_text)

    print(f"\n[INFO] Sample text: {test_text}")
    print(f"[INFO] Tokens: {encoding.tokens}")
    print(f"[INFO] Decoded: {tokenizer.decode(encoding.ids)}")

    if tokenizer.decode(encoding.ids) == test_text:
        print("[INFO] Round-trip PASSED ✅")
    else:
        print("[WARNING] Round-trip FAILED ❌")

    # Check if merges were learned
    model = tokenizer.model
    if hasattr(model, "get_merges"):
        merges = model.get_merges()
        print(f"[INFO] Number of learned merges: {len(merges)}")
        if len(merges) == 0:
            print("[WARNING] No merges learned! Increase vocab_size or check corpus.")
    else:
        print("[INFO] Cannot directly inspect merges (check tokenizer.json).")


if __name__ == "__main__":
    train()
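`train_bpe.py` reads its hyperparameters from `configs/bpe_50k.yaml`, which is not part of this distribution. The sketch below writes a config that satisfies the three lookups in the code (`cfg["tokenizer"]["vocab_size"]`, `cfg["tokenizer"]["min_frequency"]`, `cfg["special_tokens"]`); the key names come from the code, while the concrete values are illustrative assumptions (the filename suggests a 50k vocabulary).

```python
# Hypothetical helper: generate the configs/bpe_50k.yaml that train_bpe.py
# expects. Key names are taken from the cfg[...] lookups in train_bpe.py;
# min_frequency and the special-token list are illustrative assumptions.
from pathlib import Path

import yaml

config = {
    "tokenizer": {
        "vocab_size": 50_000,  # implied by the file name bpe_50k.yaml
        "min_frequency": 2,    # assumed default
    },
    # "<unk>" must be present: BPE(unk_token="<unk>") refers to it.
    "special_tokens": ["<unk>", "<pad>", "<s>", "</s>"],
}

Path("configs").mkdir(parents=True, exist_ok=True)
with open("configs/bpe_50k.yaml", "w", encoding="utf-8") as f:
    yaml.safe_dump(config, f, allow_unicode=True, sort_keys=False)
```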
tigrinya_tokenizer-0.1.0/test/test_tokenizer_tigrinya.py

@@ -0,0 +1,79 @@

from tokenizers import Tokenizer

TOKENIZER_PATH = "outputs/tokenizer/tokenizer.json"

# Load tokenizer
tokenizer = Tokenizer.from_file(TOKENIZER_PATH)


# 1️⃣ Random Tigrinya Test Words

TEST_WORDS = [
    "ሰላም",
    "ትግርኛ",
    "ኣደርካ",
    "ሕብረት",
    "መንግስቲ",
    "ምምሕዳር",
    "ትምህርቲ",
    "ኤርትራ",
    "ሃገር",
    "ፍቕሪ",
    "ጸሓፊ",
    "ቤት",
    "ስራሕ",
    "ኣቦ",
    "ኣይተ",
]


def test_word(word):
    print("=" * 60)
    print(f"Original: {word}")

    encoding = tokenizer.encode(word)
    tokens = encoding.tokens
    decoded = tokenizer.decode(encoding.ids)

    print(f"Tokens: {tokens}")
    print(f"Decoded: {decoded}")

    # Check unknown tokens
    if "<unk>" in tokens:
        print("⚠ WARNING: <unk> token detected")

    # Check round-trip correctness
    if decoded == word:
        print("✅ Round-trip OK")
    else:
        print("❌ Round-trip FAILED")


def run_tests():
    print("\nRunning Tigrinya Tokenizer Tests\n")
    for word in TEST_WORDS:
        test_word(word)


# 2️⃣ Sentence-Level Tests

SENTENCES = [
    "ሰላም ኩን ኣደርካ?",
    "ኣብ ትግርኛ መምህራን ኣሎዉ።",
    "ትምህርቲ ኣገዳሲ ኢዩ፣ ወላ'ውን ጠቃሚ ኢዩ",
]


def run_sentence_tests():
    print("\nRunning Sentence Tests\n")
    for sentence in SENTENCES:
        test_word(sentence)


if __name__ == "__main__":
    run_tests()
    run_sentence_tests()
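Despite its `test_` name, this file is a print-based script: nothing is asserted, and pytest would collect `test_word` only to error on its required `word` argument. A hedged pytest-style sketch follows, asserting only word-level round-trips; sentence round-trips are expected to diverge here because the `Whitespace` pre-tokenizer discards the spacing between words and `BPEDecoder` cannot restore it, which is the likely reason the scripts above would report a sentence-level `Round-trip FAILED`.

```python
# Hypothetical assert-based variant of the checks above (not shipped with the
# package). Assumes a trained tokenizer at outputs/tokenizer/tokenizer.json.
import pytest
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("outputs/tokenizer/tokenizer.json")

WORDS = ["ሰላም", "ትግርኛ", "መንግስቲ", "ትምህርቲ", "ኤርትራ"]


@pytest.mark.parametrize("word", WORDS)
def test_word_round_trip(word):
    encoding = tokenizer.encode(word)
    # Every character should be covered by the learned vocabulary.
    assert "<unk>" not in encoding.tokens
    # A single word contains no spaces, so decoding should restore it exactly.
    assert tokenizer.decode(encoding.ids) == word
```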