vettu-1.0.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (60)
  1. vettu-1.0.0/LICENSE +21 -0
  2. vettu-1.0.0/MANIFEST.in +4 -0
  3. vettu-1.0.0/PKG-INFO +159 -0
  4. vettu-1.0.0/README.md +126 -0
  5. vettu-1.0.0/pyproject.toml +47 -0
  6. vettu-1.0.0/setup.cfg +4 -0
  7. vettu-1.0.0/setup.py +39 -0
  8. vettu-1.0.0/tamil_tokenizer/__init__.py +30 -0
  9. vettu-1.0.0/tamil_tokenizer/__main__.py +124 -0
  10. vettu-1.0.0/tamil_tokenizer/config/__init__.py +13 -0
  11. vettu-1.0.0/tamil_tokenizer/config/config_loader.py +420 -0
  12. vettu-1.0.0/tamil_tokenizer/config/constant_table.py +443 -0
  13. vettu-1.0.0/tamil_tokenizer/config/constants.py +73 -0
  14. vettu-1.0.0/tamil_tokenizer/constants/__init__.py +6 -0
  15. vettu-1.0.0/tamil_tokenizer/constants/letter_groups.py +245 -0
  16. vettu-1.0.0/tamil_tokenizer/constants/tamil_letters.py +432 -0
  17. vettu-1.0.0/tamil_tokenizer/data/allFileList.list +25 -0
  18. vettu-1.0.0/tamil_tokenizer/data/condition_rule.list +37 -0
  19. vettu-1.0.0/tamil_tokenizer/data/ignore.list +11520 -0
  20. vettu-1.0.0/tamil_tokenizer/data/ignoreGrammar.list +47 -0
  21. vettu-1.0.0/tamil_tokenizer/data/ignoreNoun.list +7006 -0
  22. vettu-1.0.0/tamil_tokenizer/data/ignorePerson.list +299 -0
  23. vettu-1.0.0/tamil_tokenizer/data/ignorePlace.list +56648 -0
  24. vettu-1.0.0/tamil_tokenizer/data/ignoreVerb.list +1980 -0
  25. vettu-1.0.0/tamil_tokenizer/data/mainConstant.list +129 -0
  26. vettu-1.0.0/tamil_tokenizer/data/main_parse_map.list +161 -0
  27. vettu-1.0.0/tamil_tokenizer/data/nounConstants.list +33 -0
  28. vettu-1.0.0/tamil_tokenizer/data/nounParseOrder.list +282 -0
  29. vettu-1.0.0/tamil_tokenizer/data/noun_parse_map.list +185 -0
  30. vettu-1.0.0/tamil_tokenizer/data/parseOrder.list +616 -0
  31. vettu-1.0.0/tamil_tokenizer/data/prefix.list +0 -0
  32. vettu-1.0.0/tamil_tokenizer/data/specialCharacter.list +106 -0
  33. vettu-1.0.0/tamil_tokenizer/data/twinConstant.list +7 -0
  34. vettu-1.0.0/tamil_tokenizer/data/twinParseOrder.list +7 -0
  35. vettu-1.0.0/tamil_tokenizer/data/uniqueList.list +209 -0
  36. vettu-1.0.0/tamil_tokenizer/data/verbConstants.list +36 -0
  37. vettu-1.0.0/tamil_tokenizer/data/verbParseOrder.list +169 -0
  38. vettu-1.0.0/tamil_tokenizer/data/verb_parse_map.list +186 -0
  39. vettu-1.0.0/tamil_tokenizer/grammar/__init__.py +7 -0
  40. vettu-1.0.0/tamil_tokenizer/grammar/illakanam.py +354 -0
  41. vettu-1.0.0/tamil_tokenizer/grammar/tamil_util.py +654 -0
  42. vettu-1.0.0/tamil_tokenizer/grammar/vetrumai.py +288 -0
  43. vettu-1.0.0/tamil_tokenizer/hf_tokenizer.py +313 -0
  44. vettu-1.0.0/tamil_tokenizer/parsers/__init__.py +11 -0
  45. vettu-1.0.0/tamil_tokenizer/parsers/core_parser.py +598 -0
  46. vettu-1.0.0/tamil_tokenizer/parsers/root_word_parser.py +698 -0
  47. vettu-1.0.0/tamil_tokenizer/parsers/word_parser_interface.py +88 -0
  48. vettu-1.0.0/tamil_tokenizer/tokenizer.py +699 -0
  49. vettu-1.0.0/tamil_tokenizer/utils/__init__.py +19 -0
  50. vettu-1.0.0/tamil_tokenizer/utils/recursive_algorithm.py +238 -0
  51. vettu-1.0.0/tamil_tokenizer/utils/splitting.py +459 -0
  52. vettu-1.0.0/tamil_tokenizer/utils/tamil_iterator.py +232 -0
  53. vettu-1.0.0/tamil_tokenizer/utils/word_class.py +175 -0
  54. vettu-1.0.0/tamil_tokenizer/utils/word_splitter.py +207 -0
  55. vettu-1.0.0/vettu.egg-info/PKG-INFO +159 -0
  56. vettu-1.0.0/vettu.egg-info/SOURCES.txt +58 -0
  57. vettu-1.0.0/vettu.egg-info/dependency_links.txt +1 -0
  58. vettu-1.0.0/vettu.egg-info/entry_points.txt +2 -0
  59. vettu-1.0.0/vettu.egg-info/requires.txt +3 -0
  60. vettu-1.0.0/vettu.egg-info/top_level.txt +1 -0
vettu-1.0.0/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 Tamil NLP Project
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
vettu-1.0.0/MANIFEST.in ADDED
@@ -0,0 +1,4 @@
+ include LICENSE
+ include README.md
+ include pyproject.toml
+ recursive-include tamil_tokenizer/data *.list
vettu-1.0.0/PKG-INFO ADDED
@@ -0,0 +1,159 @@
+ Metadata-Version: 2.4
+ Name: vettu
+ Version: 1.0.0
+ Summary: Multi-level tokenizer for Tamil text — sentence, word, character, and morpheme tokenization
+ Home-page: https://github.com/tamil-phy/tamil_tokenizer
+ Author: Tamil NLP Project
+ License: MIT
+ Project-URL: Homepage, https://github.com/tamil-phy/tamil_tokenizer
+ Project-URL: Repository, https://github.com/tamil-phy/tamil_tokenizer
+ Project-URL: Issues, https://github.com/tamil-phy/tamil_tokenizer/issues
+ Keywords: tamil,tokenizer,nlp,morpheme,tamil-nlp,text-processing
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Science/Research
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Topic :: Text Processing :: Linguistic
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Natural Language :: Tamil
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Provides-Extra: hf
+ Requires-Dist: transformers>=4.20.0; extra == "hf"
+ Dynamic: home-page
+ Dynamic: license-file
+ Dynamic: requires-python
+
+ # Tamil Tokenizer
+
+ A standalone, multi-level tokenizer for Tamil text. No external dependencies — uses only the Python standard library.
+
+ ## Features
+
+ Four levels of tokenization:
+
+ | Level | Description | Example |
+ |-------|-------------|---------|
+ | **sentence** | Split text into sentences | `"அவன் வந்தான். அவள் பார்த்தாள்."` → 2 sentences |
+ | **word** | Split into words + punctuation | `"அவன் வந்தான்."` → `அவன்`, `வந்தான்`, `.` |
+ | **character** | Split into Tamil letters with classification (உயிர்/மெய்/உயிர்மெய், வல்லினம்/மெல்லினம்/இடையினம்) | `"வந்தான்"` → `வ`, `ந்`, `தா`, `ன்` |
+ | **morpheme** | Split into root + grammatical suffixes (case, tense, person) | `"பள்ளிக்கு"` → root `பள்ளி` + case suffix `க்கு` (Dative) |
+
+ ## Installation
+
+ ```bash
+ # From the project directory
+ pip install -e .
+
+ # Or just use directly (no install needed)
+ python -m tamil_tokenizer "அவன் வந்தான்."
+ ```
+
+ ## Usage
+
+ ### Command Line
+
+ ```bash
+ # Word tokenization (default)
+ python -m tamil_tokenizer "அவன் வந்தான்."
+
+ # Character tokenization
+ python -m tamil_tokenizer "தமிழ்நாடு" --level character
+
+ # Sentence tokenization
+ python -m tamil_tokenizer "அவன் வந்தான். அவள் பார்த்தாள்." --level sentence
+
+ # Morpheme tokenization
+ python -m tamil_tokenizer "பள்ளிக்கு சென்றான்." --level morpheme
+
+ # JSON output
+ python -m tamil_tokenizer "அவன் வந்தான்." --format json
+
+ # Plain text output (just token strings)
+ python -m tamil_tokenizer "அவன் வந்தான்." --format text
+
+ # Interactive mode
+ python -m tamil_tokenizer --interactive
+ ```
+
+ ### Python API
+
+ ```python
+ from tamil_tokenizer import TamilTokenizer, Token, TokenType
+
+ tokenizer = TamilTokenizer()
+
+ # Sentence tokenization
+ sentences = tokenizer.sentence_tokenize("அவன் வந்தான். அவள் பார்த்தாள்.")
+
+ # Word tokenization
+ words = tokenizer.word_tokenize("அவன் வந்தான்.")
+
+ # Character tokenization
+ letters = tokenizer.character_tokenize("வந்தான்")
+ for letter in letters:
+     print(f"{letter.text} -> {letter.token_type.value} ({letter.metadata})")
+
+ # Morpheme tokenization
+ morphemes = tokenizer.morpheme_tokenize("பள்ளிக்கு")
+ for m in morphemes:
+     print(f"{m.text} -> {m.token_type.value} ({m.metadata})")
+
+ # Unified pipeline
+ tokens = tokenizer.tokenize("அவன் வந்தான்.", level="word")
+
+ # Convenience: get just strings
+ strings = tokenizer.tokenize_to_strings("அவன் வந்தான்.", level="word")
+ # ['அவன்', 'வந்தான்', '.']
+
+ # Convenience: get dicts (useful for JSON serialization)
+ dicts = tokenizer.tokenize_to_dicts("அவன் வந்தான்.", level="character")
+ ```
+
+ ## Token Types
+
+ ### Word-level
+ - `word` — Tamil word
+ - `number` — Numeric value
+ - `punctuation` — Punctuation mark
+ - `symbol` — Other symbol
+
+ ### Character-level
+ - `vowel` — உயிரெழுத்து (அ, ஆ, இ, ...)
+ - `consonant` — மெய்யெழுத்து (க், ங், ச், ...)
+ - `vowel_consonant` — உயிர்மெய்யெழுத்து (க, கா, கி, ...)
+ - `special` — ஆய்த எழுத்து (ஃ)
+
+ ### Morpheme-level
+ - `root` — Root word
+ - `suffix` — Generic suffix
+ - `case_suffix` — வேற்றுமை உருபு (case marker)
+ - `tense_marker` — கால இடைநிலை (tense marker)
+ - `person_marker` — விகுதி (person/number marker)
+
+ ## Project Structure
+
+ ```
+ tamil_tokenizer/
+ ├── __init__.py    # Package init + public API
+ ├── __main__.py    # CLI entry point
+ ├── tokenizer.py   # Main TamilTokenizer class
+ ├── constants/     # Tamil Unicode constants & letter groups
+ ├── grammar/       # Grammar analysis (util, case, tense)
+ ├── config/        # Configuration & data file loading
+ ├── parsers/       # Root word parser & core parsing
+ ├── utils/         # Iterator, splitting, word class utilities
+ └── data/          # Grammar rule files (.list)
+ ```
+
+ ## Requirements
+
+ - Python 3.8+
+ - No external dependencies
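The `Token` API documented above (`tokenize`, `token_type.value`, `metadata`) lends itself to quick corpus summaries. A minimal sketch, using only the names shown in this PKG-INFO; the actual labels produced depend on the package's grammar data files:

```python
# Minimal sketch: count tokens by type with the documented API
# (TamilTokenizer.tokenize, Token.token_type.value). Illustrative only;
# the label set is the one listed under "Token Types" above.
from collections import Counter

from tamil_tokenizer import TamilTokenizer

tokenizer = TamilTokenizer()
tokens = tokenizer.tokenize("அவன் வந்தான்.", level="word")
counts = Counter(t.token_type.value for t in tokens)
print(counts)  # e.g. tallies of word / punctuation tokens
```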
vettu-1.0.0/README.md ADDED
@@ -0,0 +1,126 @@
+ # Tamil Tokenizer
+
+ A standalone, multi-level tokenizer for Tamil text. No external dependencies — uses only the Python standard library.
+
+ ## Features
+
+ Four levels of tokenization:
+
+ | Level | Description | Example |
+ |-------|-------------|---------|
+ | **sentence** | Split text into sentences | `"அவன் வந்தான். அவள் பார்த்தாள்."` → 2 sentences |
+ | **word** | Split into words + punctuation | `"அவன் வந்தான்."` → `அவன்`, `வந்தான்`, `.` |
+ | **character** | Split into Tamil letters with classification (உயிர்/மெய்/உயிர்மெய், வல்லினம்/மெல்லினம்/இடையினம்) | `"வந்தான்"` → `வ`, `ந்`, `தா`, `ன்` |
+ | **morpheme** | Split into root + grammatical suffixes (case, tense, person) | `"பள்ளிக்கு"` → root `பள்ளி` + case suffix `க்கு` (Dative) |
+
+ ## Installation
+
+ ```bash
+ # From the project directory
+ pip install -e .
+
+ # Or just use directly (no install needed)
+ python -m tamil_tokenizer "அவன் வந்தான்."
+ ```
+
+ ## Usage
+
+ ### Command Line
+
+ ```bash
+ # Word tokenization (default)
+ python -m tamil_tokenizer "அவன் வந்தான்."
+
+ # Character tokenization
+ python -m tamil_tokenizer "தமிழ்நாடு" --level character
+
+ # Sentence tokenization
+ python -m tamil_tokenizer "அவன் வந்தான். அவள் பார்த்தாள்." --level sentence
+
+ # Morpheme tokenization
+ python -m tamil_tokenizer "பள்ளிக்கு சென்றான்." --level morpheme
+
+ # JSON output
+ python -m tamil_tokenizer "அவன் வந்தான்." --format json
+
+ # Plain text output (just token strings)
+ python -m tamil_tokenizer "அவன் வந்தான்." --format text
+
+ # Interactive mode
+ python -m tamil_tokenizer --interactive
+ ```
+
+ ### Python API
+
+ ```python
+ from tamil_tokenizer import TamilTokenizer, Token, TokenType
+
+ tokenizer = TamilTokenizer()
+
+ # Sentence tokenization
+ sentences = tokenizer.sentence_tokenize("அவன் வந்தான். அவள் பார்த்தாள்.")
+
+ # Word tokenization
+ words = tokenizer.word_tokenize("அவன் வந்தான்.")
+
+ # Character tokenization
+ letters = tokenizer.character_tokenize("வந்தான்")
+ for letter in letters:
+     print(f"{letter.text} -> {letter.token_type.value} ({letter.metadata})")
+
+ # Morpheme tokenization
+ morphemes = tokenizer.morpheme_tokenize("பள்ளிக்கு")
+ for m in morphemes:
+     print(f"{m.text} -> {m.token_type.value} ({m.metadata})")
+
+ # Unified pipeline
+ tokens = tokenizer.tokenize("அவன் வந்தான்.", level="word")
+
+ # Convenience: get just strings
+ strings = tokenizer.tokenize_to_strings("அவன் வந்தான்.", level="word")
+ # ['அவன்', 'வந்தான்', '.']
+
+ # Convenience: get dicts (useful for JSON serialization)
+ dicts = tokenizer.tokenize_to_dicts("அவன் வந்தான்.", level="character")
+ ```
+
+ ## Token Types
+
+ ### Word-level
+ - `word` — Tamil word
+ - `number` — Numeric value
+ - `punctuation` — Punctuation mark
+ - `symbol` — Other symbol
+
+ ### Character-level
+ - `vowel` — உயிரெழுத்து (அ, ஆ, இ, ...)
+ - `consonant` — மெய்யெழுத்து (க், ங், ச், ...)
+ - `vowel_consonant` — உயிர்மெய்யெழுத்து (க, கா, கி, ...)
+ - `special` — ஆய்த எழுத்து (ஃ)
+
+ ### Morpheme-level
+ - `root` — Root word
+ - `suffix` — Generic suffix
+ - `case_suffix` — வேற்றுமை உருபு (case marker)
+ - `tense_marker` — கால இடைநிலை (tense marker)
+ - `person_marker` — விகுதி (person/number marker)
+
+ ## Project Structure
+
+ ```
+ tamil_tokenizer/
+ ├── __init__.py    # Package init + public API
+ ├── __main__.py    # CLI entry point
+ ├── tokenizer.py   # Main TamilTokenizer class
+ ├── constants/     # Tamil Unicode constants & letter groups
+ ├── grammar/       # Grammar analysis (util, case, tense)
+ ├── config/        # Configuration & data file loading
+ ├── parsers/       # Root word parser & core parsing
+ ├── utils/         # Iterator, splitting, word class utilities
+ └── data/          # Grammar rule files (.list)
+ ```
+
+ ## Requirements
+
+ - Python 3.8+
+ - No external dependencies
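Since `tokenize_to_dicts` is documented as JSON-friendly, serialization is a one-liner; this mirrors what the CLI's `--format json` branch does in `__main__.py` further down. A minimal sketch; the per-token dict keys are defined in `tokenizer.py`, which this diff lists but does not excerpt, so none are assumed here:

```python
# Minimal sketch: serialize character-level tokens to JSON, mirroring the
# CLI's --format json branch. ensure_ascii=False keeps the Tamil text
# readable instead of escaping it to \uXXXX sequences.
import json

from tamil_tokenizer import TamilTokenizer

tokenizer = TamilTokenizer()
dicts = tokenizer.tokenize_to_dicts("வந்தான்", level="character")
print(json.dumps(dicts, ensure_ascii=False, indent=2))
```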
vettu-1.0.0/pyproject.toml ADDED
@@ -0,0 +1,47 @@
+ [build-system]
+ requires = ["setuptools>=61.0", "wheel"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "vettu"
+ version = "1.0.0"
+ description = "Multi-level tokenizer for Tamil text — sentence, word, character, and morpheme tokenization"
+ readme = "README.md"
+ license = {text = "MIT"}
+ requires-python = ">=3.8"
+ authors = [
+     {name = "Tamil NLP Project"},
+ ]
+ keywords = ["tamil", "tokenizer", "nlp", "morpheme", "tamil-nlp", "text-processing"]
+ classifiers = [
+     "Development Status :: 4 - Beta",
+     "Intended Audience :: Developers",
+     "Intended Audience :: Science/Research",
+     "License :: OSI Approved :: MIT License",
+     "Topic :: Text Processing :: Linguistic",
+     "Programming Language :: Python :: 3",
+     "Programming Language :: Python :: 3.8",
+     "Programming Language :: Python :: 3.9",
+     "Programming Language :: Python :: 3.10",
+     "Programming Language :: Python :: 3.11",
+     "Programming Language :: Python :: 3.12",
+     "Natural Language :: Tamil",
+     "Operating System :: OS Independent",
+ ]
+
+ [project.optional-dependencies]
+ hf = ["transformers>=4.20.0"]
+
+ [project.scripts]
+ tamil-tokenize = "tamil_tokenizer.__main__:main"
+
+ [project.urls]
+ Homepage = "https://github.com/tamil-phy/tamil_tokenizer"
+ Repository = "https://github.com/tamil-phy/tamil_tokenizer"
+ Issues = "https://github.com/tamil-phy/tamil_tokenizer/issues"
+
+ [tool.setuptools.packages.find]
+ include = ["tamil_tokenizer*"]
+
+ [tool.setuptools.package-data]
+ tamil_tokenizer = ["data/*.list"]
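`[project.scripts]` installs a `tamil-tokenize` command that resolves to the same `tamil_tokenizer.__main__:main` as `python -m tamil_tokenizer`. A hedged sketch of driving that entry point in-process by setting `sys.argv` (illustrative only; the flags are those defined in `__main__.py` below):

```python
# Hedged sketch: tamil-tokenize and `python -m tamil_tokenizer` both call
# tamil_tokenizer.__main__:main, which parses sys.argv via argparse, so
# the same entry point can be exercised in-process, e.g. in tests.
import sys

from tamil_tokenizer.__main__ import main

sys.argv = ["tamil-tokenize", "அவன் வந்தான்.", "--level", "word", "--format", "text"]
main()  # prints one token string per line, per the --format text branch
```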
vettu-1.0.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+
vettu-1.0.0/setup.py ADDED
@@ -0,0 +1,39 @@
+ """Setup script for Tamil Tokenizer."""
+
+ from setuptools import setup, find_packages
+
+ setup(
+     name="vettu",
+     version="1.0.0",
+     description="Multi-level tokenizer for Tamil text",
+     long_description=open("README.md", encoding="utf-8").read(),
+     long_description_content_type="text/markdown",
+     author="Tamil NLP Project",
+     license="MIT",
+     url="https://github.com/tamil-phy/tamil_tokenizer",
+     python_requires=">=3.8",
+     packages=find_packages(),
+     package_data={
+         "tamil_tokenizer": ["data/*.list"],
+     },
+     include_package_data=True,
+     entry_points={
+         "console_scripts": [
+             "tamil-tokenize=tamil_tokenizer.__main__:main",
+         ],
+     },
+     classifiers=[
+         "Development Status :: 4 - Beta",
+         "Intended Audience :: Developers",
+         "Intended Audience :: Science/Research",
+         "Topic :: Text Processing :: Linguistic",
+         "Programming Language :: Python :: 3",
+         "Programming Language :: Python :: 3.8",
+         "Programming Language :: Python :: 3.9",
+         "Programming Language :: Python :: 3.10",
+         "Programming Language :: Python :: 3.11",
+         "Programming Language :: Python :: 3.12",
+         "Natural Language :: Tamil",
+         "Operating System :: OS Independent",
+     ],
+ )
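Because `package_data` ships the `.list` grammar files inside the installed package, they should be reachable with `importlib.resources` rather than hard-coded filesystem paths. A sketch, assuming Python 3.9+ for `resources.files` (the package itself declares 3.8+); `twinConstant.list` is taken from the file listing above:

```python
# Hedged sketch: locate a bundled grammar file via importlib.resources.
# resources.files() needs Python 3.9+; anchor on the regular package and
# descend into data/, since data/ has no __init__.py of its own.
from importlib import resources

rule_file = resources.files("tamil_tokenizer") / "data" / "twinConstant.list"
print(rule_file.read_text(encoding="utf-8"))
```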
vettu-1.0.0/tamil_tokenizer/__init__.py ADDED
@@ -0,0 +1,30 @@
+ """
+ Tamil Tokenizer - Standalone multi-level tokenizer for Tamil text.
+
+ Provides four levels of tokenization:
+ - Sentence tokenization
+ - Word tokenization
+ - Character (letter) tokenization
+ - Morpheme tokenization (root word + suffixes)
+
+ Usage:
+     from tamil_tokenizer import TamilTokenizer, Token, TokenType
+
+     tokenizer = TamilTokenizer()
+     tokens = tokenizer.tokenize("அவன் வந்தான்.", level="word")
+ """
+
+ __version__ = "1.0.0"
+
+ from .tokenizer import TamilTokenizer, Token, TokenType
+
+ def _import_hf_tokenizer():
+     """Lazy import to avoid hard dependency on transformers."""
+     from .hf_tokenizer import TamilHFTokenizer
+     return TamilHFTokenizer
+
+ try:
+     from .hf_tokenizer import TamilHFTokenizer
+     __all__ = ['TamilTokenizer', 'TamilHFTokenizer', 'Token', 'TokenType']
+ except ImportError:
+     __all__ = ['TamilTokenizer', 'Token', 'TokenType']
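The try/except above means `TamilHFTokenizer` is exported only when `transformers` (the `hf` extra, e.g. `pip install "vettu[hf]"`) imports cleanly. A minimal consumer-side check, using nothing beyond the `__all__` logic shown:

```python
# Minimal sketch: feature-detect the optional HF integration. __all__
# includes 'TamilHFTokenizer' only when the transformers-backed module
# imported successfully in the package __init__ above.
import tamil_tokenizer

if "TamilHFTokenizer" in tamil_tokenizer.__all__:
    print("hf extra installed: TamilHFTokenizer is available")
else:
    print("core-only install: TamilTokenizer still works")
```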
vettu-1.0.0/tamil_tokenizer/__main__.py ADDED
@@ -0,0 +1,124 @@
+ """
+ Tamil Tokenizer CLI - Command-line interface for Tamil tokenization.
+
+ Usage:
+     python -m tamil_tokenizer "அவன் வந்தான்."
+     python -m tamil_tokenizer "அவன் வந்தான்." --level character
+     python -m tamil_tokenizer "அவன் வந்தான். அவள் பார்த்தாள்." --level sentence
+     python -m tamil_tokenizer "பள்ளிக்கு சென்றான்." --level morpheme
+     python -m tamil_tokenizer --interactive
+ """
+
+ import argparse
+ import sys
+ from typing import Optional
+
+ from .tokenizer import TamilTokenizer, Token
+
+
+ def print_tokens(tokens: list, level: str, text: str) -> None:
+     """Print tokens in a formatted table."""
+     print(f"\n{'='*60}")
+     print(f"Tokenization Level: {level}")
+     print(f"Input: {text}")
+     print(f"{'='*60}")
+     print(f"Tokens ({len(tokens)}):")
+
+     for i, token in enumerate(tokens):
+         meta_str = ""
+         if token.metadata:
+             meta_parts = [f"{k}={v}" for k, v in token.metadata.items()]
+             meta_str = f" ({', '.join(meta_parts)})"
+         print(f"  {i+1}. [{token.token_type.value:>16}] '{token.text}'{meta_str}")
+
+
+ def interactive_mode(tokenizer: TamilTokenizer) -> None:
+     """Run tokenizer in interactive mode."""
+     print("Tamil Tokenizer - Interactive Mode")
+     print("Type Tamil text to tokenize. Commands:")
+     print("  :level <sentence|word|character|morpheme> - Change level")
+     print("  :quit - Exit")
+     print(f"{'='*60}")
+
+     current_level = "word"
+
+     while True:
+         try:
+             text = input(f"\n[{current_level}] >>> ").strip()
+         except (EOFError, KeyboardInterrupt):
+             print("\nGoodbye!")
+             break
+
+         if not text:
+             continue
+
+         if text == ":quit":
+             print("Goodbye!")
+             break
+
+         if text.startswith(":level "):
+             new_level = text.split(maxsplit=1)[1].strip()
+             if new_level in ("sentence", "word", "character", "morpheme"):
+                 current_level = new_level
+                 print(f"Level set to: {current_level}")
+             else:
+                 print(f"Unknown level: {new_level}. Use: sentence, word, character, morpheme")
+             continue
+
+         tokens = tokenizer.tokenize(text, level=current_level)
+         print_tokens(tokens, current_level, text)
+
+
+ def main():
+     """Main entry point."""
+     parser = argparse.ArgumentParser(
+         description='Tamil Tokenizer - Multi-level tokenization for Tamil text',
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+         epilog="""
+ Examples:
+   %(prog)s "அவன் வந்தான்."                                    Word tokenization (default)
+   %(prog)s "அவன் வந்தான்." --level character                   Character tokenization
+   %(prog)s "அவன் வந்தான். அவள் பார்த்தாள்." --level sentence   Sentence tokenization
+   %(prog)s "பள்ளிக்கு சென்றான்." --level morpheme              Morpheme tokenization
+   %(prog)s --interactive                                       Interactive mode
+ """
+     )
+
+     parser.add_argument('text', nargs='?', help='Tamil text to tokenize')
+     parser.add_argument('-l', '--level', default='word',
+                         choices=['sentence', 'word', 'character', 'morpheme'],
+                         help='Tokenization level (default: word)')
+     parser.add_argument('-i', '--interactive', action='store_true',
+                         help='Run in interactive mode')
+     parser.add_argument('-d', '--data', metavar='PATH',
+                         help='Path to data directory')
+     parser.add_argument('-f', '--format', default='table',
+                         choices=['table', 'text', 'json'],
+                         help='Output format (default: table)')
+     parser.add_argument('--version', action='version',
+                         version='Tamil Tokenizer 1.0.0')
+
+     args = parser.parse_args()
+
+     tokenizer = TamilTokenizer(args.data)
+
+     if args.interactive:
+         interactive_mode(tokenizer)
+     elif args.text:
+         if args.format == 'json':
+             import json
+             result = tokenizer.tokenize_to_dicts(args.text, level=args.level)
+             print(json.dumps(result, ensure_ascii=False, indent=2))
+         elif args.format == 'text':
+             result = tokenizer.tokenize_to_strings(args.text, level=args.level)
+             for t in result:
+                 print(t)
+         else:
+             tokens = tokenizer.tokenize(args.text, level=args.level)
+             print_tokens(tokens, args.level, args.text)
+     else:
+         parser.print_help()
+
+
+ if __name__ == '__main__':
+     main()
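For callers outside Python, the `--format json` branch above emits `json.dumps(...)` of `tokenize_to_dicts`, so the CLI output parses directly. A hedged sketch; the per-token dict keys come from `tokenizer.py`, which this diff lists but does not excerpt:

```python
# Hedged sketch: run the CLI in a subprocess and parse its JSON output.
# Relies only on the --format json branch shown above; tokenize_to_dicts
# presumably returns a list of dicts, and no dict keys are assumed here.
import json
import subprocess
import sys

proc = subprocess.run(
    [sys.executable, "-m", "tamil_tokenizer", "அவன் வந்தான்.", "--format", "json"],
    capture_output=True, text=True, check=True,
)
tokens = json.loads(proc.stdout)
print(f"{len(tokens)} tokens")
```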
vettu-1.0.0/tamil_tokenizer/config/__init__.py ADDED
@@ -0,0 +1,13 @@
+ """Tamil tokenizer configuration module."""
+
+ from .constants import ConfigConstants, DEFAULT_FILE_PATHS
+ from .config_loader import ConfigLoader, ReadConfig
+ from .constant_table import TamilConstantTable
+
+ __all__ = [
+     'ConfigConstants',
+     'DEFAULT_FILE_PATHS',
+     'ConfigLoader',
+     'ReadConfig',
+     'TamilConstantTable',
+ ]
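The re-exports above flatten the config API, so consumers can import the loaders without knowing the submodule layout. A minimal sketch using only the names in `__all__` (constructor signatures are not shown in this diff, so only the imports are demonstrated):

```python
# Minimal sketch: the config package re-exports these names, so this flat
# import is equivalent to importing from config_loader / constant_table /
# constants directly.
from tamil_tokenizer.config import ConfigLoader, TamilConstantTable, DEFAULT_FILE_PATHS
```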