vettu 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vettu-1.0.0/LICENSE +21 -0
- vettu-1.0.0/MANIFEST.in +4 -0
- vettu-1.0.0/PKG-INFO +159 -0
- vettu-1.0.0/README.md +126 -0
- vettu-1.0.0/pyproject.toml +47 -0
- vettu-1.0.0/setup.cfg +4 -0
- vettu-1.0.0/setup.py +39 -0
- vettu-1.0.0/tamil_tokenizer/__init__.py +30 -0
- vettu-1.0.0/tamil_tokenizer/__main__.py +124 -0
- vettu-1.0.0/tamil_tokenizer/config/__init__.py +13 -0
- vettu-1.0.0/tamil_tokenizer/config/config_loader.py +420 -0
- vettu-1.0.0/tamil_tokenizer/config/constant_table.py +443 -0
- vettu-1.0.0/tamil_tokenizer/config/constants.py +73 -0
- vettu-1.0.0/tamil_tokenizer/constants/__init__.py +6 -0
- vettu-1.0.0/tamil_tokenizer/constants/letter_groups.py +245 -0
- vettu-1.0.0/tamil_tokenizer/constants/tamil_letters.py +432 -0
- vettu-1.0.0/tamil_tokenizer/data/allFileList.list +25 -0
- vettu-1.0.0/tamil_tokenizer/data/condition_rule.list +37 -0
- vettu-1.0.0/tamil_tokenizer/data/ignore.list +11520 -0
- vettu-1.0.0/tamil_tokenizer/data/ignoreGrammar.list +47 -0
- vettu-1.0.0/tamil_tokenizer/data/ignoreNoun.list +7006 -0
- vettu-1.0.0/tamil_tokenizer/data/ignorePerson.list +299 -0
- vettu-1.0.0/tamil_tokenizer/data/ignorePlace.list +56648 -0
- vettu-1.0.0/tamil_tokenizer/data/ignoreVerb.list +1980 -0
- vettu-1.0.0/tamil_tokenizer/data/mainConstant.list +129 -0
- vettu-1.0.0/tamil_tokenizer/data/main_parse_map.list +161 -0
- vettu-1.0.0/tamil_tokenizer/data/nounConstants.list +33 -0
- vettu-1.0.0/tamil_tokenizer/data/nounParseOrder.list +282 -0
- vettu-1.0.0/tamil_tokenizer/data/noun_parse_map.list +185 -0
- vettu-1.0.0/tamil_tokenizer/data/parseOrder.list +616 -0
- vettu-1.0.0/tamil_tokenizer/data/prefix.list +0 -0
- vettu-1.0.0/tamil_tokenizer/data/specialCharacter.list +106 -0
- vettu-1.0.0/tamil_tokenizer/data/twinConstant.list +7 -0
- vettu-1.0.0/tamil_tokenizer/data/twinParseOrder.list +7 -0
- vettu-1.0.0/tamil_tokenizer/data/uniqueList.list +209 -0
- vettu-1.0.0/tamil_tokenizer/data/verbConstants.list +36 -0
- vettu-1.0.0/tamil_tokenizer/data/verbParseOrder.list +169 -0
- vettu-1.0.0/tamil_tokenizer/data/verb_parse_map.list +186 -0
- vettu-1.0.0/tamil_tokenizer/grammar/__init__.py +7 -0
- vettu-1.0.0/tamil_tokenizer/grammar/illakanam.py +354 -0
- vettu-1.0.0/tamil_tokenizer/grammar/tamil_util.py +654 -0
- vettu-1.0.0/tamil_tokenizer/grammar/vetrumai.py +288 -0
- vettu-1.0.0/tamil_tokenizer/hf_tokenizer.py +313 -0
- vettu-1.0.0/tamil_tokenizer/parsers/__init__.py +11 -0
- vettu-1.0.0/tamil_tokenizer/parsers/core_parser.py +598 -0
- vettu-1.0.0/tamil_tokenizer/parsers/root_word_parser.py +698 -0
- vettu-1.0.0/tamil_tokenizer/parsers/word_parser_interface.py +88 -0
- vettu-1.0.0/tamil_tokenizer/tokenizer.py +699 -0
- vettu-1.0.0/tamil_tokenizer/utils/__init__.py +19 -0
- vettu-1.0.0/tamil_tokenizer/utils/recursive_algorithm.py +238 -0
- vettu-1.0.0/tamil_tokenizer/utils/splitting.py +459 -0
- vettu-1.0.0/tamil_tokenizer/utils/tamil_iterator.py +232 -0
- vettu-1.0.0/tamil_tokenizer/utils/word_class.py +175 -0
- vettu-1.0.0/tamil_tokenizer/utils/word_splitter.py +207 -0
- vettu-1.0.0/vettu.egg-info/PKG-INFO +159 -0
- vettu-1.0.0/vettu.egg-info/SOURCES.txt +58 -0
- vettu-1.0.0/vettu.egg-info/dependency_links.txt +1 -0
- vettu-1.0.0/vettu.egg-info/entry_points.txt +2 -0
- vettu-1.0.0/vettu.egg-info/requires.txt +3 -0
- vettu-1.0.0/vettu.egg-info/top_level.txt +1 -0
vettu-1.0.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Tamil NLP Project
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
vettu-1.0.0/MANIFEST.in
ADDED
vettu-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vettu
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Multi-level tokenizer for Tamil text — sentence, word, character, and morpheme tokenization
|
|
5
|
+
Home-page: https://github.com/tamil-phy/tamil_tokenizer
|
|
6
|
+
Author: Tamil NLP Project
|
|
7
|
+
License: MIT
|
|
8
|
+
Project-URL: Homepage, https://github.com/tamil-phy/tamil_tokenizer
|
|
9
|
+
Project-URL: Repository, https://github.com/tamil-phy/tamil_tokenizer
|
|
10
|
+
Project-URL: Issues, https://github.com/tamil-phy/tamil_tokenizer/issues
|
|
11
|
+
Keywords: tamil,tokenizer,nlp,morpheme,tamil-nlp,text-processing
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
23
|
+
Classifier: Natural Language :: Tamil
|
|
24
|
+
Classifier: Operating System :: OS Independent
|
|
25
|
+
Requires-Python: >=3.8
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
License-File: LICENSE
|
|
28
|
+
Provides-Extra: hf
|
|
29
|
+
Requires-Dist: transformers>=4.20.0; extra == "hf"
|
|
30
|
+
Dynamic: home-page
|
|
31
|
+
Dynamic: license-file
|
|
32
|
+
Dynamic: requires-python
|
|
33
|
+
|
|
34
|
+
# Tamil Tokenizer
|
|
35
|
+
|
|
36
|
+
A standalone, multi-level tokenizer for Tamil text. No external dependencies — uses only the Python standard library.
|
|
37
|
+
|
|
38
|
+
## Features
|
|
39
|
+
|
|
40
|
+
Four levels of tokenization:
|
|
41
|
+
|
|
42
|
+
| Level | Description | Example |
|
|
43
|
+
|-------|-------------|---------|
|
|
44
|
+
| **sentence** | Split text into sentences | `"அவன் வந்தான். அவள் பார்த்தாள்."` → 2 sentences |
|
|
45
|
+
| **word** | Split into words + punctuation | `"அவன் வந்தான்."` → `அவன்`, `வந்தான்`, `.` |
|
|
46
|
+
| **character** | Split into Tamil letters with classification (உயிர்/மெய்/உயிர்மெய், வல்லினம்/மெல்லினம்/இடையினம்) | `"வந்தான்"` → `வ`, `ந்`, `தா`, `ன்` |
|
|
47
|
+
| **morpheme** | Split into root + grammatical suffixes (case, tense, person) | `"பள்ளிக்கு"` → root `பள்ளி` + case suffix `க்கு` (Dative) |
|
|
48
|
+
|
|
49
|
+
## Installation
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
# From the project directory
|
|
53
|
+
pip install -e .
|
|
54
|
+
|
|
55
|
+
# Or just use directly (no install needed)
|
|
56
|
+
python -m tamil_tokenizer "அவன் வந்தான்."
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## Usage
|
|
60
|
+
|
|
61
|
+
### Command Line
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
# Word tokenization (default)
|
|
65
|
+
python -m tamil_tokenizer "அவன் வந்தான்."
|
|
66
|
+
|
|
67
|
+
# Character tokenization
|
|
68
|
+
python -m tamil_tokenizer "தமிழ்நாடு" --level character
|
|
69
|
+
|
|
70
|
+
# Sentence tokenization
|
|
71
|
+
python -m tamil_tokenizer "அவன் வந்தான். அவள் பார்த்தாள்." --level sentence
|
|
72
|
+
|
|
73
|
+
# Morpheme tokenization
|
|
74
|
+
python -m tamil_tokenizer "பள்ளிக்கு சென்றான்." --level morpheme
|
|
75
|
+
|
|
76
|
+
# JSON output
|
|
77
|
+
python -m tamil_tokenizer "அவன் வந்தான்." --format json
|
|
78
|
+
|
|
79
|
+
# Plain text output (just token strings)
|
|
80
|
+
python -m tamil_tokenizer "அவன் வந்தான்." --format text
|
|
81
|
+
|
|
82
|
+
# Interactive mode
|
|
83
|
+
python -m tamil_tokenizer --interactive
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### Python API
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
from tamil_tokenizer import TamilTokenizer, Token, TokenType
|
|
90
|
+
|
|
91
|
+
tokenizer = TamilTokenizer()
|
|
92
|
+
|
|
93
|
+
# Sentence tokenization
|
|
94
|
+
sentences = tokenizer.sentence_tokenize("அவன் வந்தான். அவள் பார்த்தாள்.")
|
|
95
|
+
|
|
96
|
+
# Word tokenization
|
|
97
|
+
words = tokenizer.word_tokenize("அவன் வந்தான்.")
|
|
98
|
+
|
|
99
|
+
# Character tokenization
|
|
100
|
+
letters = tokenizer.character_tokenize("வந்தான்")
|
|
101
|
+
for letter in letters:
|
|
102
|
+
print(f"{letter.text} -> {letter.token_type.value} ({letter.metadata})")
|
|
103
|
+
|
|
104
|
+
# Morpheme tokenization
|
|
105
|
+
morphemes = tokenizer.morpheme_tokenize("பள்ளிக்கு")
|
|
106
|
+
for m in morphemes:
|
|
107
|
+
print(f"{m.text} -> {m.token_type.value} ({m.metadata})")
|
|
108
|
+
|
|
109
|
+
# Unified pipeline
|
|
110
|
+
tokens = tokenizer.tokenize("அவன் வந்தான்.", level="word")
|
|
111
|
+
|
|
112
|
+
# Convenience: get just strings
|
|
113
|
+
strings = tokenizer.tokenize_to_strings("அவன் வந்தான்.", level="word")
|
|
114
|
+
# ['அவன்', 'வந்தான்', '.']
|
|
115
|
+
|
|
116
|
+
# Convenience: get dicts (useful for JSON serialization)
|
|
117
|
+
dicts = tokenizer.tokenize_to_dicts("அவன் வந்தான்.", level="character")
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
## Token Types
|
|
121
|
+
|
|
122
|
+
### Word-level
|
|
123
|
+
- `word` — Tamil word
|
|
124
|
+
- `number` — Numeric value
|
|
125
|
+
- `punctuation` — Punctuation mark
|
|
126
|
+
- `symbol` — Other symbol
|
|
127
|
+
|
|
128
|
+
### Character-level
|
|
129
|
+
- `vowel` — உயிரெழுத்து (அ, ஆ, இ, ...)
|
|
130
|
+
- `consonant` — மெய்யெழுத்து (க், ங், ச், ...)
|
|
131
|
+
- `vowel_consonant` — உயிர்மெய்யெழுத்து (க, கா, கி, ...)
|
|
132
|
+
- `special` — ஆய்த எழுத்து (ஃ)
|
|
133
|
+
|
|
134
|
+
### Morpheme-level
|
|
135
|
+
- `root` — Root word
|
|
136
|
+
- `suffix` — Generic suffix
|
|
137
|
+
- `case_suffix` — வேற்றுமை உருபு (case marker)
|
|
138
|
+
- `tense_marker` — கால இடைநிலை (tense marker)
|
|
139
|
+
- `person_marker` — விகுதி (person/number marker)
|
|
140
|
+
|
|
141
|
+
## Project Structure
|
|
142
|
+
|
|
143
|
+
```
|
|
144
|
+
tamil_tokenizer/
|
|
145
|
+
├── __init__.py # Package init + public API
|
|
146
|
+
├── __main__.py # CLI entry point
|
|
147
|
+
├── tokenizer.py # Main TamilTokenizer class
|
|
148
|
+
├── constants/ # Tamil Unicode constants & letter groups
|
|
149
|
+
├── grammar/ # Grammar analysis (util, case, tense)
|
|
150
|
+
├── config/ # Configuration & data file loading
|
|
151
|
+
├── parsers/ # Root word parser & core parsing
|
|
152
|
+
├── utils/ # Iterator, splitting, word class utilities
|
|
153
|
+
└── data/ # Grammar rule files (.list)
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
## Requirements
|
|
157
|
+
|
|
158
|
+
- Python 3.8+
|
|
159
|
+
- No external dependencies for the core tokenizer (the optional `hf` extra installs `transformers` for the Hugging Face tokenizer)
|
vettu-1.0.0/README.md
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
# Tamil Tokenizer
|
|
2
|
+
|
|
3
|
+
A standalone, multi-level tokenizer for Tamil text. No external dependencies — uses only the Python standard library.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
Four levels of tokenization:
|
|
8
|
+
|
|
9
|
+
| Level | Description | Example |
|
|
10
|
+
|-------|-------------|---------|
|
|
11
|
+
| **sentence** | Split text into sentences | `"அவன் வந்தான். அவள் பார்த்தாள்."` → 2 sentences |
|
|
12
|
+
| **word** | Split into words + punctuation | `"அவன் வந்தான்."` → `அவன்`, `வந்தான்`, `.` |
|
|
13
|
+
| **character** | Split into Tamil letters with classification (உயிர்/மெய்/உயிர்மெய், வல்லினம்/மெல்லினம்/இடையினம்) | `"வந்தான்"` → `வ`, `ந்`, `தா`, `ன்` |
|
|
14
|
+
| **morpheme** | Split into root + grammatical suffixes (case, tense, person) | `"பள்ளிக்கு"` → root `பள்ளி` + case suffix `க்கு` (Dative) |
|
|
15
|
+
|
|
16
|
+
## Installation
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
# From the project directory
|
|
20
|
+
pip install -e .
|
|
21
|
+
|
|
22
|
+
# Or just use directly (no install needed)
|
|
23
|
+
python -m tamil_tokenizer "அவன் வந்தான்."
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Usage
|
|
27
|
+
|
|
28
|
+
### Command Line
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
# Word tokenization (default)
|
|
32
|
+
python -m tamil_tokenizer "அவன் வந்தான்."
|
|
33
|
+
|
|
34
|
+
# Character tokenization
|
|
35
|
+
python -m tamil_tokenizer "தமிழ்நாடு" --level character
|
|
36
|
+
|
|
37
|
+
# Sentence tokenization
|
|
38
|
+
python -m tamil_tokenizer "அவன் வந்தான். அவள் பார்த்தாள்." --level sentence
|
|
39
|
+
|
|
40
|
+
# Morpheme tokenization
|
|
41
|
+
python -m tamil_tokenizer "பள்ளிக்கு சென்றான்." --level morpheme
|
|
42
|
+
|
|
43
|
+
# JSON output
|
|
44
|
+
python -m tamil_tokenizer "அவன் வந்தான்." --format json
|
|
45
|
+
|
|
46
|
+
# Plain text output (just token strings)
|
|
47
|
+
python -m tamil_tokenizer "அவன் வந்தான்." --format text
|
|
48
|
+
|
|
49
|
+
# Interactive mode
|
|
50
|
+
python -m tamil_tokenizer --interactive
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### Python API
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
from tamil_tokenizer import TamilTokenizer, Token, TokenType
|
|
57
|
+
|
|
58
|
+
tokenizer = TamilTokenizer()
|
|
59
|
+
|
|
60
|
+
# Sentence tokenization
|
|
61
|
+
sentences = tokenizer.sentence_tokenize("அவன் வந்தான். அவள் பார்த்தாள்.")
|
|
62
|
+
|
|
63
|
+
# Word tokenization
|
|
64
|
+
words = tokenizer.word_tokenize("அவன் வந்தான்.")
|
|
65
|
+
|
|
66
|
+
# Character tokenization
|
|
67
|
+
letters = tokenizer.character_tokenize("வந்தான்")
|
|
68
|
+
for letter in letters:
|
|
69
|
+
print(f"{letter.text} -> {letter.token_type.value} ({letter.metadata})")
|
|
70
|
+
|
|
71
|
+
# Morpheme tokenization
|
|
72
|
+
morphemes = tokenizer.morpheme_tokenize("பள்ளிக்கு")
|
|
73
|
+
for m in morphemes:
|
|
74
|
+
print(f"{m.text} -> {m.token_type.value} ({m.metadata})")
|
|
75
|
+
|
|
76
|
+
# Unified pipeline
|
|
77
|
+
tokens = tokenizer.tokenize("அவன் வந்தான்.", level="word")
|
|
78
|
+
|
|
79
|
+
# Convenience: get just strings
|
|
80
|
+
strings = tokenizer.tokenize_to_strings("அவன் வந்தான்.", level="word")
|
|
81
|
+
# ['அவன்', 'வந்தான்', '.']
|
|
82
|
+
|
|
83
|
+
# Convenience: get dicts (useful for JSON serialization)
|
|
84
|
+
dicts = tokenizer.tokenize_to_dicts("அவன் வந்தான்.", level="character")
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## Token Types
|
|
88
|
+
|
|
89
|
+
### Word-level
|
|
90
|
+
- `word` — Tamil word
|
|
91
|
+
- `number` — Numeric value
|
|
92
|
+
- `punctuation` — Punctuation mark
|
|
93
|
+
- `symbol` — Other symbol
|
|
94
|
+
|
|
95
|
+
### Character-level
|
|
96
|
+
- `vowel` — உயிரெழுத்து (அ, ஆ, இ, ...)
|
|
97
|
+
- `consonant` — மெய்யெழுத்து (க், ங், ச், ...)
|
|
98
|
+
- `vowel_consonant` — உயிர்மெய்யெழுத்து (க, கா, கி, ...)
|
|
99
|
+
- `special` — ஆய்த எழுத்து (ஃ)
|
|
100
|
+
|
|
101
|
+
### Morpheme-level
|
|
102
|
+
- `root` — Root word
|
|
103
|
+
- `suffix` — Generic suffix
|
|
104
|
+
- `case_suffix` — வேற்றுமை உருபு (case marker)
|
|
105
|
+
- `tense_marker` — கால இடைநிலை (tense marker)
|
|
106
|
+
- `person_marker` — விகுதி (person/number marker)
|
|
107
|
+
|
|
108
|
+
## Project Structure
|
|
109
|
+
|
|
110
|
+
```
|
|
111
|
+
tamil_tokenizer/
|
|
112
|
+
├── __init__.py # Package init + public API
|
|
113
|
+
├── __main__.py # CLI entry point
|
|
114
|
+
├── tokenizer.py # Main TamilTokenizer class
|
|
115
|
+
├── constants/ # Tamil Unicode constants & letter groups
|
|
116
|
+
├── grammar/ # Grammar analysis (util, case, tense)
|
|
117
|
+
├── config/ # Configuration & data file loading
|
|
118
|
+
├── parsers/ # Root word parser & core parsing
|
|
119
|
+
├── utils/ # Iterator, splitting, word class utilities
|
|
120
|
+
└── data/ # Grammar rule files (.list)
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
## Requirements
|
|
124
|
+
|
|
125
|
+
- Python 3.8+
|
|
126
|
+
- No external dependencies for the core tokenizer (the optional `hf` extra installs `transformers` for the Hugging Face tokenizer)
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "vettu"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
description = "Multi-level tokenizer for Tamil text — sentence, word, character, and morpheme tokenization"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
requires-python = ">=3.8"
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "Tamil NLP Project"},
|
|
14
|
+
]
|
|
15
|
+
keywords = ["tamil", "tokenizer", "nlp", "morpheme", "tamil-nlp", "text-processing"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 4 - Beta",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"Intended Audience :: Science/Research",
|
|
20
|
+
"License :: OSI Approved :: MIT License",
|
|
21
|
+
"Topic :: Text Processing :: Linguistic",
|
|
22
|
+
"Programming Language :: Python :: 3",
|
|
23
|
+
"Programming Language :: Python :: 3.8",
|
|
24
|
+
"Programming Language :: Python :: 3.9",
|
|
25
|
+
"Programming Language :: Python :: 3.10",
|
|
26
|
+
"Programming Language :: Python :: 3.11",
|
|
27
|
+
"Programming Language :: Python :: 3.12",
|
|
28
|
+
"Natural Language :: Tamil",
|
|
29
|
+
"Operating System :: OS Independent",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
[project.optional-dependencies]
|
|
33
|
+
hf = ["transformers>=4.20.0"]
|
|
34
|
+
|
|
35
|
+
[project.scripts]
|
|
36
|
+
tamil-tokenize = "tamil_tokenizer.__main__:main"
|
|
37
|
+
|
|
38
|
+
[project.urls]
|
|
39
|
+
Homepage = "https://github.com/tamil-phy/tamil_tokenizer"
|
|
40
|
+
Repository = "https://github.com/tamil-phy/tamil_tokenizer"
|
|
41
|
+
Issues = "https://github.com/tamil-phy/tamil_tokenizer/issues"
|
|
42
|
+
|
|
43
|
+
[tool.setuptools.packages.find]
|
|
44
|
+
include = ["tamil_tokenizer*"]
|
|
45
|
+
|
|
46
|
+
[tool.setuptools.package-data]
|
|
47
|
+
tamil_tokenizer = ["data/*.list"]
|
vettu-1.0.0/setup.cfg
ADDED
vettu-1.0.0/setup.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Setup script for Tamil Tokenizer."""
|
|
2
|
+
|
|
3
|
+
from setuptools import setup, find_packages
|
|
4
|
+
|
|
5
|
+
setup(
|
|
6
|
+
name="vettu",
|
|
7
|
+
version="1.0.0",
|
|
8
|
+
description="Multi-level tokenizer for Tamil text",
|
|
9
|
+
long_description=open("README.md", encoding="utf-8").read(),
|
|
10
|
+
long_description_content_type="text/markdown",
|
|
11
|
+
author="Tamil NLP Project",
|
|
12
|
+
license="MIT",
|
|
13
|
+
url="https://github.com/tamil-phy/tamil_tokenizer",
|
|
14
|
+
python_requires=">=3.8",
|
|
15
|
+
packages=find_packages(),
|
|
16
|
+
package_data={
|
|
17
|
+
"tamil_tokenizer": ["data/*.list"],
|
|
18
|
+
},
|
|
19
|
+
include_package_data=True,
|
|
20
|
+
entry_points={
|
|
21
|
+
"console_scripts": [
|
|
22
|
+
"tamil-tokenize=tamil_tokenizer.__main__:main",
|
|
23
|
+
],
|
|
24
|
+
},
|
|
25
|
+
classifiers=[
|
|
26
|
+
"Development Status :: 4 - Beta",
|
|
27
|
+
"Intended Audience :: Developers",
|
|
28
|
+
"Intended Audience :: Science/Research",
|
|
29
|
+
"Topic :: Text Processing :: Linguistic",
|
|
30
|
+
"Programming Language :: Python :: 3",
|
|
31
|
+
"Programming Language :: Python :: 3.8",
|
|
32
|
+
"Programming Language :: Python :: 3.9",
|
|
33
|
+
"Programming Language :: Python :: 3.10",
|
|
34
|
+
"Programming Language :: Python :: 3.11",
|
|
35
|
+
"Programming Language :: Python :: 3.12",
|
|
36
|
+
"Natural Language :: Tamil",
|
|
37
|
+
"Operating System :: OS Independent",
|
|
38
|
+
],
|
|
39
|
+
)
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Tamil Tokenizer - Standalone multi-level tokenizer for Tamil text.
|
|
3
|
+
|
|
4
|
+
Provides four levels of tokenization:
|
|
5
|
+
- Sentence tokenization
|
|
6
|
+
- Word tokenization
|
|
7
|
+
- Character (letter) tokenization
|
|
8
|
+
- Morpheme tokenization (root word + suffixes)
|
|
9
|
+
|
|
10
|
+
Usage:
|
|
11
|
+
from tamil_tokenizer import TamilTokenizer, Token, TokenType
|
|
12
|
+
|
|
13
|
+
tokenizer = TamilTokenizer()
|
|
14
|
+
tokens = tokenizer.tokenize("அவன் வந்தான்.", level="word")
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
__version__ = "1.0.0"
|
|
18
|
+
|
|
19
|
+
from .tokenizer import TamilTokenizer, Token, TokenType
|
|
20
|
+
|
|
21
|
+
def _import_hf_tokenizer():
|
|
22
|
+
"""Lazy import to avoid hard dependency on transformers."""
|
|
23
|
+
from .hf_tokenizer import TamilHFTokenizer
|
|
24
|
+
return TamilHFTokenizer
|
|
25
|
+
|
|
26
|
+
try:
|
|
27
|
+
from .hf_tokenizer import TamilHFTokenizer
|
|
28
|
+
__all__ = ['TamilTokenizer', 'TamilHFTokenizer', 'Token', 'TokenType']
|
|
29
|
+
except ImportError:
|
|
30
|
+
__all__ = ['TamilTokenizer', 'Token', 'TokenType']
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Tamil Tokenizer CLI - Command-line interface for Tamil tokenization.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
python -m tamil_tokenizer "அவன் வந்தான்."
|
|
6
|
+
python -m tamil_tokenizer "அவன் வந்தான்." --level character
|
|
7
|
+
python -m tamil_tokenizer "அவன் வந்தான். அவள் பார்த்தாள்." --level sentence
|
|
8
|
+
python -m tamil_tokenizer "பள்ளிக்கு சென்றான்." --level morpheme
|
|
9
|
+
python -m tamil_tokenizer --interactive
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import argparse
|
|
13
|
+
import sys
|
|
14
|
+
from typing import Optional
|
|
15
|
+
|
|
16
|
+
from .tokenizer import TamilTokenizer, Token
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def print_tokens(tokens: list, level: str, text: str) -> None:
    """Render a token list as a human-readable table on stdout."""
    divider = '=' * 60
    print(f"\n{divider}")
    print(f"Tokenization Level: {level}")
    print(f"Input: {text}")
    print(divider)
    print(f"Tokens ({len(tokens)}):")

    for idx, token in enumerate(tokens, start=1):
        # Render metadata as "key=value" pairs only when present.
        if token.metadata:
            details = ', '.join(f"{key}={val}" for key, val in token.metadata.items())
            suffix = f" ({details})"
        else:
            suffix = ""
        print(f"  {idx}. [{token.token_type.value:>16}] '{token.text}'{suffix}")
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def interactive_mode(tokenizer: TamilTokenizer) -> None:
    """Run a read-tokenize-print loop until :quit or EOF/Ctrl-C."""
    print("Tamil Tokenizer - Interactive Mode")
    print("Type Tamil text to tokenize. Commands:")
    print(" :level <sentence|word|character|morpheme> - Change level")
    print(" :quit - Exit")
    print(f"{'='*60}")

    current_level = "word"

    while True:
        try:
            line = input(f"\n[{current_level}] >>> ").strip()
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C ends the session cleanly.
            print("\nGoodbye!")
            return

        if not line:
            continue

        if line == ":quit":
            print("Goodbye!")
            return

        if line.startswith(":level "):
            candidate = line.split(maxsplit=1)[1].strip()
            if candidate in {"sentence", "word", "character", "morpheme"}:
                current_level = candidate
                print(f"Level set to: {current_level}")
            else:
                print(f"Unknown level: {candidate}. Use: sentence, word, character, morpheme")
            continue

        # Plain input: tokenize at the currently selected level.
        print_tokens(tokenizer.tokenize(line, level=current_level), current_level, line)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def main():
    """CLI entry point: parse arguments and dispatch to the tokenizer."""
    arg_parser = argparse.ArgumentParser(
        description='Tamil Tokenizer - Multi-level tokenization for Tamil text',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
%(prog)s "அவன் வந்தான்." Word tokenization (default)
%(prog)s "அவன் வந்தான்." --level character Character tokenization
%(prog)s "அவன் வந்தான். அவள் பார்த்தாள்." --level sentence Sentence tokenization
%(prog)s "பள்ளிக்கு சென்றான்." --level morpheme Morpheme tokenization
%(prog)s --interactive Interactive mode
"""
    )

    arg_parser.add_argument(
        'text', nargs='?', help='Tamil text to tokenize')
    arg_parser.add_argument(
        '-l', '--level', default='word',
        choices=['sentence', 'word', 'character', 'morpheme'],
        help='Tokenization level (default: word)')
    arg_parser.add_argument(
        '-i', '--interactive', action='store_true',
        help='Run in interactive mode')
    arg_parser.add_argument(
        '-d', '--data', metavar='PATH',
        help='Path to data directory')
    arg_parser.add_argument(
        '-f', '--format', default='table',
        choices=['table', 'text', 'json'],
        help='Output format (default: table)')
    arg_parser.add_argument(
        '--version', action='version', version='Tamil Tokenizer 1.0.0')

    opts = arg_parser.parse_args()

    tokenizer = TamilTokenizer(opts.data)

    if opts.interactive:
        interactive_mode(tokenizer)
        return

    if not opts.text:
        # Nothing to tokenize and not interactive: show usage.
        arg_parser.print_help()
        return

    if opts.format == 'json':
        import json  # only needed for this output format
        payload = tokenizer.tokenize_to_dicts(opts.text, level=opts.level)
        print(json.dumps(payload, ensure_ascii=False, indent=2))
    elif opts.format == 'text':
        for piece in tokenizer.tokenize_to_strings(opts.text, level=opts.level):
            print(piece)
    else:
        print_tokens(tokenizer.tokenize(opts.text, level=opts.level),
                     opts.level, opts.text)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
# Support direct execution: `python -m tamil_tokenizer ...`.
if __name__ == '__main__':
    main()
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Tamil tokenizer configuration module."""
|
|
2
|
+
|
|
3
|
+
from .constants import ConfigConstants, DEFAULT_FILE_PATHS
|
|
4
|
+
from .config_loader import ConfigLoader, ReadConfig
|
|
5
|
+
from .constant_table import TamilConstantTable
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
'ConfigConstants',
|
|
9
|
+
'DEFAULT_FILE_PATHS',
|
|
10
|
+
'ConfigLoader',
|
|
11
|
+
'ReadConfig',
|
|
12
|
+
'TamilConstantTable',
|
|
13
|
+
]
|