uzbek_tokenizer-0.1.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- uzbek_tokenizer-0.1.0/MANIFEST.in +3 -0
- uzbek_tokenizer-0.1.0/PKG-INFO +206 -0
- uzbek_tokenizer-0.1.0/README.md +179 -0
- uzbek_tokenizer-0.1.0/pyproject.toml +45 -0
- uzbek_tokenizer-0.1.0/setup.cfg +4 -0
- uzbek_tokenizer-0.1.0/tests/test_segmenter.py +148 -0
- uzbek_tokenizer-0.1.0/uzbek_tokenizer/__init__.py +19 -0
- uzbek_tokenizer-0.1.0/uzbek_tokenizer/data/prefixes.txt +2 -0
- uzbek_tokenizer-0.1.0/uzbek_tokenizer/data/suffixes.txt +77 -0
- uzbek_tokenizer-0.1.0/uzbek_tokenizer/segmenter.py +96 -0
- uzbek_tokenizer-0.1.0/uzbek_tokenizer.egg-info/PKG-INFO +206 -0
- uzbek_tokenizer-0.1.0/uzbek_tokenizer.egg-info/SOURCES.txt +13 -0
- uzbek_tokenizer-0.1.0/uzbek_tokenizer.egg-info/dependency_links.txt +1 -0
- uzbek_tokenizer-0.1.0/uzbek_tokenizer.egg-info/requires.txt +1 -0
- uzbek_tokenizer-0.1.0/uzbek_tokenizer.egg-info/top_level.txt +1 -0
@@ -0,0 +1,206 @@
+Metadata-Version: 2.4
+Name: uzbek-tokenizer
+Version: 0.1.0
+Summary: A morphological tokenizer for the Uzbek language
+Author-email: Ibrat Usmonov <usmonovibrat315@gmail.com>
+License: MIT
+Project-URL: Homepage, https://github.com/IbratDO/uzbek-tokenizer
+Project-URL: Documentation, https://github.com/IbratDO/uzbek-tokenizer#usage
+Project-URL: Repository, https://github.com/IbratDO/uzbek-tokenizer.git
+Project-URL: Issues, https://github.com/IbratDO/uzbek-tokenizer/issues
+Keywords: uzbek,tokenizer,morphology,nlp,segmentation
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering
+Classifier: Topic :: Text Processing :: Linguistic
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+Requires-Dist: regex>=2022.0.0
+
+(long description: the package README, reproduced in full under README.md below)
@@ -0,0 +1,179 @@
+# Uzbek Tokenizer
+
+A Python library for morphological segmentation of Uzbek text. This tokenizer breaks down Uzbek words into their constituent morphemes (stems and affixes), making it useful for natural language processing, machine learning, text analysis, and linguistic research.
+
+## Features
+
+- 🔤 **Morphological Segmentation**: Breaks Uzbek words into stems and affixes
+- 📝 **Text Normalization**: Handles Unicode, punctuation, and Uzbek-specific characters
+- ⚡ **Efficient**: Cached segmentation for repeated tokens
+- 📦 **Easy to Use**: Simple API for both single words and batch processing
+- 🎯 **Agglutinative Language Support**: Designed specifically for Uzbek's morphological structure
+
+## Installation
+
+Install via pip:
+
+```bash
+pip install uzbek-tokenizer
+```
+
+Or clone the repository:
+
+```bash
+git clone https://github.com/IbratDO/uzbek-tokenizer.git
+cd uzbek-tokenizer
+pip install -e .
+```
+
+## Quick Start
+
+### Basic Usage
+
+```python
+from uzbek_tokenizer import apply_segmentation, normalize
+
+# Segment a word
+text = "kitoblarimizdan"
+result = apply_segmentation(text)
+print(result)
+# Output: ['kitob', 'lari', 'miz', 'dan']
+
+# Segment a sentence
+sentence = "kitoblarimizdan o'qidim"
+result = apply_segmentation(sentence)
+print(result)
+# Output: ['kitob', 'lari', 'miz', 'dan', 'oʻqidim']  (no bundled affix matches "oʻqidim", so it is returned whole)
+```
+
+### Normalization Only
+
+```python
+from uzbek_tokenizer import normalize
+
+text = "Salom, JAHON!"
+normalized = normalize(text)
+print(normalized)
+# Output: "salom , jahon !"
+```
+
+### Morphological Segmentation Only
+
+```python
+from uzbek_tokenizer import segment_morphological
+
+word = "o'qiyotgan"
+segments = segment_morphological(word)
+print(segments)
+# Output: ["o'q", 'i', 'yotgan']
+```
+
+## API Reference
+
+### `normalize(text: str) -> str`
+Normalizes Uzbek text by:
+- Converting to NFC Unicode form
+- Lowercasing
+- Standardizing punctuation
+- Handling Uzbek apostrophe variants
+- Collapsing whitespace
+
+**Parameters:**
+- `text` (str): Input text
+
+**Returns:** Normalized text
+
+### `segment_morphological(token: str) -> list[str]`
+Recursively segments a single token into morphemes.
+
+**Parameters:**
+- `token` (str): A single word/token
+
+**Returns:** List of morpheme components
+
+### `apply_segmentation(line: str) -> list[str]`
+Normalizes text and segments all tokens into morphemes.
+
+**Parameters:**
+- `line` (str): Input text (can be multiple words)
+
+**Returns:** List of all morphemes
+
+## How It Works
+
+The tokenizer uses a **greedy longest-match-first algorithm**:
+
+1. **Normalization**: Cleans and standardizes the input text
+2. **Prefix Stripping**: Removes the longest matching prefix (be-, no-)
+3. **Suffix Stripping**: Removes the longest matching suffix (from 77 Uzbek suffixes)
+4. **Recursion**: Repeats until no more affixes can be removed
+5. **Output**: Returns the stem + all stripped affixes
+
+Example: `kitoblarimizdan`
+- Remove suffix "dan" → remainder: "kitoblarimiz"
+- Remove suffix "miz" → remainder: "kitoblari"
+- Remove suffix "lari" → remainder: "kitob"
+- No more affixes → stem: "kitob"
+- Result: `[kitob, lari, miz, dan]`
+
+## Supported Affixes
+
+### Prefixes (2)
+- be-, no-
+
+### Suffixes (77)
+Includes: -cha, -lar, -dagi, -dan, -ga, -ni, -ning, -lik, -ish, and many more
+
+See `uzbek_tokenizer/data/prefixes.txt` and `uzbek_tokenizer/data/suffixes.txt` for the complete list.
+
+## Requirements
+
+- Python 3.8+
+- regex >= 2022.0.0
+
+## License
+
+MIT License - see the LICENSE file for details
+
+## Contributing
+
+Contributions are welcome! Please:
+
+1. Fork the repository
+2. Create a feature branch (`git checkout -b feature/amazing-feature`)
+3. Commit your changes (`git commit -m 'Add amazing feature'`)
+4. Push to the branch (`git push origin feature/amazing-feature`)
+5. Open a Pull Request
+
+## Issues & Feedback
+
+Found a bug or have a suggestion? Please open an [issue on GitHub](https://github.com/IbratDO/uzbek-tokenizer/issues).
+
+## Citation
+
+If you use this tokenizer in your research, please cite:
+
+```bibtex
+@software{uzbek_tokenizer_2026,
+  author = {Ibrat Usmonov},
+  title = {Uzbek Tokenizer: Morphological Segmentation for Uzbek},
+  year = {2026},
+  url = {https://github.com/IbratDO/uzbek-tokenizer}
+}
+```
+
+## Roadmap
+
+- [ ] Vowel harmony validation
+- [ ] Stem lemmatization
+- [ ] Support for additional Uzbek dialects
+- [ ] Performance benchmarks
+- [ ] Web API
+
+## Acknowledgments
+
+Built for Uzbek language processing with love ❤️
+
+---
+
+**Status**: Early alpha (v0.1.0) - API may change. Contributions welcome!
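The README's "How It Works" steps describe a greedy longest-match-first strip with a minimum stem length. Below is a minimal standalone sketch of that loop, written iteratively with a toy suffix list; the package itself recurses over the full 77-entry list, and every name here is illustrative rather than part of the package:

```python
# Greedy longest-match-first suffix stripping, per the README's "How It Works".
# Toy suffix list for illustration; the package ships 77 suffixes and 2 prefixes.
TOY_SUFFIXES = sorted(["lari", "miz", "dan", "lar", "i"], key=len, reverse=True)

def strip_suffixes(token: str, min_stem: int = 3) -> list:
    """Peel suffixes off the right edge of `token`, longest candidate first."""
    stripped = []
    matched = True
    while matched:
        matched = False
        for s in TOY_SUFFIXES:  # length-descending, so the longest match wins
            if token.endswith(s) and len(token) - len(s) >= min_stem:
                stripped.append(s)
                token = token[:-len(s)]
                matched = True
                break
    return [token] + stripped[::-1]  # stem first, then suffixes in word order

print(strip_suffixes("kitoblarimizdan"))  # ['kitob', 'lari', 'miz', 'dan']
```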
@@ -0,0 +1,45 @@
+[build-system]
+requires = ["setuptools>=65.0", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "uzbek-tokenizer"
+version = "0.1.0"
+description = "A morphological tokenizer for the Uzbek language"
+readme = "README.md"
+requires-python = ">=3.8"
+license = {text = "MIT"}
+authors = [
+    {name = "Ibrat Usmonov", email = "usmonovibrat315@gmail.com"}
+]
+keywords = ["uzbek", "tokenizer", "morphology", "nlp", "segmentation"]
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Scientific/Engineering",
+    "Topic :: Text Processing :: Linguistic",
+]
+
+dependencies = [
+    "regex>=2022.0.0",
+]
+
+[project.urls]
+Homepage = "https://github.com/IbratDO/uzbek-tokenizer"
+Documentation = "https://github.com/IbratDO/uzbek-tokenizer#usage"
+Repository = "https://github.com/IbratDO/uzbek-tokenizer.git"
+Issues = "https://github.com/IbratDO/uzbek-tokenizer/issues"
+
+[tool.setuptools]
+packages = ["uzbek_tokenizer"]
+
+[tool.setuptools.package-data]
+uzbek_tokenizer = ["data/*.txt"]
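`[tool.setuptools.package-data]` is what ships `data/*.txt` inside the wheel, letting `segmenter.py` read the lists relative to `__file__`. A sketch of an alternative loader using `importlib.resources`, which also works when a package is not unpacked on the filesystem; this is an illustration, not what the package does, and `importlib.resources.files` needs Python 3.9+ while the project declares 3.8 support:

```python
# Hypothetical affix loader via importlib.resources (Python 3.9+ only).
from importlib.resources import files

def load_affixes(name: str) -> list:
    """Read a bundled affix list and sort entries longest-first."""
    text = files("uzbek_tokenizer").joinpath("data").joinpath(name).read_text(encoding="utf-8")
    return sorted((ln.strip() for ln in text.splitlines() if ln.strip()),
                  key=len, reverse=True)

suffixes = load_affixes("suffixes.txt")
```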
@@ -0,0 +1,148 @@
+import unittest
+from uzbek_tokenizer import normalize, segment_morphological, apply_segmentation
+
+
+class TestNormalize(unittest.TestCase):
+    """Test cases for the normalize function."""
+
+    def test_lowercase(self):
+        """Test that text is converted to lowercase."""
+        result = normalize("SALOM")
+        self.assertEqual(result, "salom")
+
+    def test_unicode_normalization(self):
+        """Test Unicode NFC normalization."""
+        result = normalize("kafe")
+        self.assertIsInstance(result, str)
+
+    def test_punctuation_separation(self):
+        """Test that punctuation is separated with spaces."""
+        result = normalize("Salom!")
+        self.assertIn("!", result)
+
+    def test_apostrophe_handling(self):
+        """Test apostrophe normalization."""
+        # After o/g, apostrophe variants normalize to ʻ (U+02BB): "o'q" -> "oʻq"
+        result = normalize("o'q")
+        self.assertIn("ʻ", result)
+
+    def test_whitespace_collapse(self):
+        """Test that multiple whitespaces are collapsed."""
+        result = normalize("salom   dunyo")
+        self.assertEqual(result, "salom dunyo")
+
+    def test_ellipsis_handling(self):
+        """Test that ellipsis is normalized."""
+        result = normalize("salom...")
+        self.assertIn(".", result)
+
+
+class TestSegmentMorphological(unittest.TestCase):
+    """Test cases for the segment_morphological function."""
+
+    def test_single_stem_no_affixes(self):
+        """Test word with no affixes."""
+        result = segment_morphological("kitob")
+        self.assertEqual(result, ["kitob"])
+
+    def test_suffix_segmentation(self):
+        """Test basic suffix removal."""
+        result = segment_morphological("kitoblar")
+        self.assertIn("kitob", result)
+        self.assertIn("lar", result)
+
+    def test_multiple_suffix_segmentation(self):
+        """Test multiple suffix removal."""
+        result = segment_morphological("kitoblarimiz")
+        self.assertIn("kitob", result)
+        self.assertTrue(any(morpheme in result for morpheme in ["lar", "lari"]))
+
+    def test_prefix_segmentation(self):
+        """Test prefix removal."""
+        result = segment_morphological("bepartach")
+        self.assertEqual(result[0], "be")
+
+    def test_single_character_token(self):
+        """Test single character returns as-is."""
+        result = segment_morphological("a")
+        self.assertEqual(result, ["a"])
+
+    def test_empty_token_after_stripping(self):
+        """Test that minimum stem length is enforced."""
+        result = segment_morphological("dan")
+        self.assertEqual(result, ["dan"])
+
+
+class TestApplySegmentation(unittest.TestCase):
+    """Test cases for the apply_segmentation function."""
+
+    def test_single_word(self):
+        """Test segmentation of single word."""
+        result = apply_segmentation("kitob")
+        self.assertEqual(result, ["kitob"])
+
+    def test_single_word_with_affixes(self):
+        """Test segmentation of word with affixes."""
+        result = apply_segmentation("kitoblar")
+        self.assertIn("kitob", result)
+        self.assertIn("lar", result)
+
+    def test_multiple_words(self):
+        """Test segmentation of multiple words."""
+        result = apply_segmentation("kitoblar o'qidim")
+        self.assertGreater(len(result), 2)
+        self.assertIn("kitob", result)
+        self.assertIn("lar", result)
+
+    def test_text_with_punctuation(self):
+        """Test that punctuation is handled."""
+        result = apply_segmentation("Salom!")
+        self.assertIsInstance(result, list)
+        self.assertGreater(len(result), 0)
+
+    def test_empty_string(self):
+        """Test handling of empty string."""
+        result = apply_segmentation("")
+        self.assertEqual(result, [])
+
+    def test_whitespace_only(self):
+        """Test handling of whitespace-only string."""
+        result = apply_segmentation(" ")
+        self.assertEqual(result, [])
+
+    def test_normalized_output(self):
+        """Test that output is normalized (lowercase)."""
+        result = apply_segmentation("KITOBLAR")
+        self.assertIn("kitob", result)
+
+
+class TestIntegration(unittest.TestCase):
+    """Integration tests for real-world Uzbek text."""
+
+    def test_real_sentence(self):
+        """Test with real Uzbek sentence."""
+        text = "kitoblarimizdan o'qidim"
+        result = apply_segmentation(text)
+
+        self.assertIsInstance(result, list)
+        self.assertGreater(len(result), 0)
+
+        self.assertIn("kitob", result)
+        self.assertTrue(any(morpheme in result for morpheme in ["lar", "lari"]))
+
+    def test_complex_word_segmentation(self):
+        """Test complex Uzbek word with multiple affixes."""
+        result = segment_morphological("borganmish")
+        self.assertIsInstance(result, list)
+        self.assertGreater(len(result), 1)
+
+    def test_caching_consistency(self):
+        """Test that caching doesn't affect results."""
+        word = "kitoblar"
+        result1 = segment_morphological(word)
+        result2 = segment_morphological(word)
+        self.assertEqual(result1, result2)
+
+
+if __name__ == "__main__":
+    unittest.main()
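The suite is plain `unittest`; the table-style stem checks could equally be expressed with `pytest` parametrization. A sketch only, since pytest is not a dependency of this package, and the expected stems follow from the bundled suffix list:

```python
# Hypothetical pytest rewrite of the stem-recovery checks (pytest not a dependency).
import pytest
from uzbek_tokenizer import segment_morphological

@pytest.mark.parametrize("token, stem", [
    ("kitob", "kitob"),         # bare stem: nothing to strip
    ("kitoblar", "kitob"),      # one suffix stripped
    ("kitoblarimiz", "kitob"),  # several suffixes stripped
])
def test_stem_recovered(token, stem):
    # The stem is always the first element of the returned segmentation.
    assert segment_morphological(token)[0] == stem
```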
@@ -0,0 +1,19 @@
+"""
+Uzbek Tokenizer - Morphological segmentation for Uzbek text
+"""
+
+from .segmenter import (
+    normalize,
+    segment_morphological,
+    apply_segmentation,
+)
+
+__version__ = "0.1.0"
+__author__ = "Ibrat Usmonov"
+__description__ = "O'zbek tili uchun morfologik segmentatsiya kutubxonasi"  # "A morphological segmentation library for the Uzbek language"
+
+__all__ = [
+    "normalize",
+    "segment_morphological",
+    "apply_segmentation",
+]
@@ -0,0 +1,77 @@
+cha
+choq
+chak
+jon
+xon
+loq
+bek
+lar
+niki
+dagi
+gacha
+dek
+day
+ish
+g'ish
+mtir
+roq
+ta
+tacha
+larcha
+ov
+ala
+ovlon
+tadan
+inchi
+g'i
+qa
+qil
+qi
+g'a
+sira
+gila
+nqira
+kila
+tir
+dir
+ar
+zor
+don
+giz
+sat
+kaz
+kiz
+qaz
+g'iz
+guncha
+may
+gach
+mayin
+gani
+gancha
+masdan
+gan
+jak
+yotgan
+vchi
+digan
+si
+miz
+ngiz
+lari
+ning
+ni
+ga
+dan
+da
+man
+san
+siz
+lik
+gin
+sin
+ka
+mi
+moq
+i
+ir
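Ordering matters when these entries are matched: `segmenter.py` sorts them longest-first, so `lari` is tried before `lar`, and both before `i`; otherwise a short entry would shadow a longer one. A quick illustration with a hand-picked subset:

```python
# Effect of the length-descending sort on greedy suffix matching.
subset = ["i", "lar", "lari"]  # shortest first, for contrast

def first_match(token, suffixes):
    """Return the first suffix in `suffixes` that matches the token's end."""
    return next((s for s in suffixes if token.endswith(s)), None)

print(first_match("kitoblari", subset))                                 # 'i' (shadows 'lari')
print(first_match("kitoblari", sorted(subset, key=len, reverse=True)))  # 'lari'
```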
@@ -0,0 +1,96 @@
+from __future__ import annotations  # allow `list[str]` annotations on Python 3.8
+
+import os
+import unicodedata
+import regex as re
+from functools import lru_cache
+
+BASE_DIR = os.path.dirname(__file__)
+
+
+def normalize(text: str) -> str:
+    """
+    Applies normalization rules to Uzbek text.
+    """
+    # 1) Unicode normalize
+    text = unicodedata.normalize("NFC", text)
+    # 2) Lowercase
+    text = text.lower()
+    # 3) Punctuation normalization
+    text = re.sub(r"[«»]", '"', text)
+    text = re.sub(r"[–—]", "-", text)
+    text = re.sub(r'([.!?;:,(){}[\]"`~@#$%^&*+=|\\/<>\-])', r' \1 ', text)
+
+    # 4) Special handling for Uzbek apostrophes before O/G
+    text = re.sub(r"([OoGg])[\'ʻ''`ʼ]", r"\1ʻ", text)
+
+    # 5) Normalize all other apostrophe variants to ʼ (U+02BC), leaving the
+    #    ʻ (U+02BB) produced in step 4 intact
+    text = re.sub(r"[\'''`]", "ʼ", text)
+
+    # 6) Handle special punctuation sequences
+    text = re.sub(r'\.{3,}', ' ... ', text)  # Ellipsis
+    text = re.sub(r'[!]{2,}', ' !! ', text)  # Multiple exclamations
+    text = re.sub(r'[?]{2,}', ' ?? ', text)  # Multiple questions
+
+    # 7) Collapse whitespace
+    text = re.sub(r"\s+", " ", text).strip()
+    return text
+
+
+# The affix lists are loaded after `normalize` so their entries go through the
+# same rules as tokens: the data files use ASCII apostrophes (e.g. "g'ish"),
+# which would otherwise never match normalized text.
+with open(os.path.join(BASE_DIR, "data/prefixes.txt"), encoding="utf-8") as f:
+    PREFIXES = sorted([normalize(line.strip()) for line in f if line.strip()], key=len, reverse=True)
+
+with open(os.path.join(BASE_DIR, "data/suffixes.txt"), encoding="utf-8") as f:
+    SUFFIXES = sorted([normalize(line.strip()) for line in f if line.strip()], key=len, reverse=True)
+
+
+# NOTE: lru_cache returns the same cached list object for repeated tokens;
+# copy the result before mutating it.
+@lru_cache(maxsize=None)
+def segment_morphological(token: str) -> list[str]:
+    """
+    Recursively strip the longest matching prefix or suffix from `token`.
+    Returns a list of stems/affixes in order.
+    """
+    # 1) Base case: a single character cannot be segmented further
+    if len(token) <= 1:
+        return [token]
+
+    # 2) Try stripping a prefix (keep a remainder of at least 3 characters)
+    for p in PREFIXES:
+        if token.startswith(p) and len(token) > len(p) + 2:
+            remainder = token[len(p):]
+            return [p] + segment_morphological(remainder)
+
+    # 3) Try stripping a suffix (keep a remainder of at least 3 characters)
+    for s in SUFFIXES:
+        if token.endswith(s) and len(token) > len(s) + 2:
+            remainder = token[:-len(s)]
+            return segment_morphological(remainder) + [s]
+
+    # 4) No more affixes found: return the token itself
+    return [token]
+
+
+def apply_segmentation(line: str) -> list[str]:
+    """
+    Takes a single line of whitespace-tokenized text (e.g. "kitoblarimizdan o'qidim")
+    and returns a list where each token is replaced by its
+    affix-segmented pieces, e.g.
+    ["kitob", "lari", "miz", "dan", "oʻqidim"]
+    """
+    pieces = []
+    norm = normalize(line)
+
+    for tok in norm.split(" "):
+        if tok:  # Skip empty tokens
+            pieces.extend(segment_morphological(tok))
+    return pieces
+
+
+# Example usage
+if __name__ == "__main__":
+    examples = [
+        "kitoblarimizdan o'qidim",
+        "yig'layvergan edilar",
+        "qayta ishlov berish uchun",
+        "borganmish"
+    ]
+    for ex in examples:
+        print(f"\nOriginal: {ex}")
+        segmented = apply_segmentation(ex)
+        print(f"Segmented: {segmented}")
+        print(f"As string: {' '.join(segmented)}")
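A caveat that follows from `@lru_cache`: repeated calls with the same token return the *same* cached list object, so mutating a result would silently corrupt every later lookup. A defensive wrapper (a sketch, not part of the package) hands out copies:

```python
# lru_cache returns the cached list itself; copy before mutating.
from uzbek_tokenizer import segment_morphological

a = segment_morphological("kitoblar")
b = segment_morphological("kitoblar")
print(a is b)  # True: both names refer to the one cached list

def segment_safe(token):
    """Like segment_morphological, but returns a fresh list each call."""
    return list(segment_morphological(token))
```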
@@ -0,0 +1,206 @@
+(byte-for-byte identical to the uzbek_tokenizer-0.1.0/PKG-INFO file shown above)
@@ -0,0 +1,13 @@
+MANIFEST.in
+README.md
+pyproject.toml
+tests/test_segmenter.py
+uzbek_tokenizer/__init__.py
+uzbek_tokenizer/segmenter.py
+uzbek_tokenizer.egg-info/PKG-INFO
+uzbek_tokenizer.egg-info/SOURCES.txt
+uzbek_tokenizer.egg-info/dependency_links.txt
+uzbek_tokenizer.egg-info/requires.txt
+uzbek_tokenizer.egg-info/top_level.txt
+uzbek_tokenizer/data/prefixes.txt
+uzbek_tokenizer/data/suffixes.txt
@@ -0,0 +1 @@
+
@@ -0,0 +1 @@
+regex>=2022.0.0
@@ -0,0 +1 @@
+uzbek_tokenizer