npltk-0.3.2.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- npltk-0.3.2/MANIFEST.in +6 -0
- npltk-0.3.2/PKG-INFO +307 -0
- npltk-0.3.2/README.md +290 -0
- npltk-0.3.2/setup.cfg +4 -0
- npltk-0.3.2/setup.py +23 -0
- npltk-0.3.2/src/npltk/__init__.py +26 -0
- npltk-0.3.2/src/npltk/lemmatizer/__init__.py +3 -0
- npltk-0.3.2/src/npltk/lemmatizer/data/lemma_dict.json +112 -0
- npltk-0.3.2/src/npltk/lemmatizer/dict_lookup.py +41 -0
- npltk-0.3.2/src/npltk/lemmatizer/hybrid_lemmatizer.py +10 -0
- npltk-0.3.2/src/npltk/lemmatizer/lemmatizer.py +59 -0
- npltk-0.3.2/src/npltk/lemmatizer/rule_stripper.py +123 -0
- npltk-0.3.2/src/npltk/ner/__init__.py +3 -0
- npltk-0.3.2/src/npltk/ner/model.py +117 -0
- npltk-0.3.2/src/npltk/ner/models/ner_model.pth +0 -0
- npltk-0.3.2/src/npltk/ner/tagger.py +209 -0
- npltk-0.3.2/src/npltk/normalizer/__init__.py +96 -0
- npltk-0.3.2/src/npltk/normalizer/config.py +31 -0
- npltk-0.3.2/src/npltk/normalizer/core.py +41 -0
- npltk-0.3.2/src/npltk/normalizer/rules.py +398 -0
- npltk-0.3.2/src/npltk/pos/__init__.py +3 -0
- npltk-0.3.2/src/npltk/pos/model.py +117 -0
- npltk-0.3.2/src/npltk/pos/models/npltk_pos_tagger.pth +0 -0
- npltk-0.3.2/src/npltk/pos/tagger.py +81 -0
- npltk-0.3.2/src/npltk/stop_word/__init__.py +1 -0
- npltk-0.3.2/src/npltk/stop_word/nepali_stopwords.txt +15 -0
- npltk-0.3.2/src/npltk/stop_word/remover.py +62 -0
- npltk-0.3.2/src/npltk/tokenizer/__init__.py +9 -0
- npltk-0.3.2/src/npltk/tokenizer/detokenize.py +32 -0
- npltk-0.3.2/src/npltk/tokenizer/factory.py +66 -0
- npltk-0.3.2/src/npltk/tokenizer/hybrid_tokenizer.py +206 -0
- npltk-0.3.2/src/npltk/tokenizer/models/nepali_tokenizer.model +0 -0
- npltk-0.3.2/src/npltk/tokenizer/models/nepali_tokenizer.vocab +64000 -0
- npltk-0.3.2/src/npltk/tokenizer/rule_engine.py +98 -0
- npltk-0.3.2/src/npltk/tokenizer/sentence_splitter.py +65 -0
- npltk-0.3.2/src/npltk/tokenizer/tokenizer.py +59 -0
- npltk-0.3.2/src/npltk/tokenizer/types.py +33 -0
- npltk-0.3.2/src/npltk/tokenizer/word_tokenizer.py +23 -0
- npltk-0.3.2/src/npltk.egg-info/PKG-INFO +307 -0
- npltk-0.3.2/src/npltk.egg-info/SOURCES.txt +41 -0
- npltk-0.3.2/src/npltk.egg-info/dependency_links.txt +1 -0
- npltk-0.3.2/src/npltk.egg-info/requires.txt +3 -0
- npltk-0.3.2/src/npltk.egg-info/top_level.txt +1 -0
npltk-0.3.2/MANIFEST.in
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
include README.md
|
|
2
|
+
recursive-include src/npltk/stop_word *.txt
|
|
3
|
+
recursive-include src/npltk/tokenizer/models *.model *.vocab
|
|
4
|
+
recursive-include src/npltk/lemmatizer/data *.json
|
|
5
|
+
recursive-include src/npltk/pos/models *.pth
|
|
6
|
+
recursive-include src/npltk/ner/models *.pth
|
npltk-0.3.2/PKG-INFO
ADDED
|
@@ -0,0 +1,307 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: npltk
|
|
3
|
+
Version: 0.3.2
|
|
4
|
+
Summary: Nepali Language Processing Toolkit
|
|
5
|
+
Author: Anurag Sharma, Anita Budha Magar, Apeksha Parajuli, Apeksha Katwal
|
|
6
|
+
Requires-Python: >=3.7
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: sentencepiece<0.2.0,>=0.1.90
|
|
9
|
+
Requires-Dist: torch>=2.0
|
|
10
|
+
Requires-Dist: pytorch-crf>=0.7.2
|
|
11
|
+
Dynamic: author
|
|
12
|
+
Dynamic: description
|
|
13
|
+
Dynamic: description-content-type
|
|
14
|
+
Dynamic: requires-dist
|
|
15
|
+
Dynamic: requires-python
|
|
16
|
+
Dynamic: summary
|
|
17
|
+
|
|
18
|
+
# NPLTK
|
|
19
|
+
|
|
20
|
+
Nepali Language Processing Toolkit (NPLTK) is a lightweight and modular NLP library designed specifically for the Nepali language. It provides tools for tokenization, normalization, lemmatization, stop-word removal, POS tagging, and Named Entity Recognition (NER).
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## Why NPLTK?
|
|
25
|
+
|
|
26
|
+
Most NLP libraries are designed primarily for English and do not handle Nepali morphology, suffixes, and tokenization well.
|
|
27
|
+
|
|
28
|
+
NPLTK is built specifically for Nepali and provides:
|
|
29
|
+
|
|
30
|
+
* Hybrid tokenizer combining rule-based logic and SentencePiece
|
|
31
|
+
* Hybrid lemmatization using dictionary + rules
|
|
32
|
+
* Lightweight POS and NER models
|
|
33
|
+
* Fully self-contained package with bundled resources
|
|
34
|
+
|
|
35
|
+
---
|
|
36
|
+
|
|
37
|
+
## Installation
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
pip install npltk
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
For testing from TestPyPI:
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
pip install -i https://test.pypi.org/simple/ npltk
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
---
|
|
50
|
+
|
|
51
|
+
## Minimal Example
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
from npltk import create_tokenizer
|
|
55
|
+
|
|
56
|
+
tokens = create_tokenizer().tokenize("नेपाल सुन्दर देश हो।")
|
|
57
|
+
print([t.text for t in tokens])
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
## Tokenizer
|
|
63
|
+
|
|
64
|
+
NPLTK provides a tokenizer factory through `create_tokenizer(...)`.
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
create_tokenizer(
|
|
68
|
+
mode="hybrid",
|
|
69
|
+
split_into_sentences=True,
|
|
70
|
+
keep_punct=True,
|
|
71
|
+
model_path=None,
|
|
72
|
+
subword=True,
|
|
73
|
+
preprocess=None,
|
|
74
|
+
fallback_to_rule=True,
|
|
75
|
+
)
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
### Main arguments
|
|
79
|
+
|
|
80
|
+
* `mode`: `"hybrid"` or `"rule"`
|
|
81
|
+
|
|
82
|
+
* `"hybrid"` uses rule-based tokenization together with SentencePiece
|
|
83
|
+
* `"rule"` uses only rule-based tokenization
|
|
84
|
+
|
|
85
|
+
* `split_into_sentences`: whether sentence splitting is enabled internally
|
|
86
|
+
|
|
87
|
+
* `keep_punct`: whether punctuation tokens are kept in output
|
|
88
|
+
|
|
89
|
+
* `model_path`: optional custom SentencePiece model path
|
|
90
|
+
|
|
91
|
+
* `subword`: enables SentencePiece-based subword support in hybrid mode
|
|
92
|
+
|
|
93
|
+
* `preprocess`: optional preprocessing function applied before tokenization
|
|
94
|
+
|
|
95
|
+
* `fallback_to_rule`: if hybrid loading fails, automatically use rule mode
|
|
96
|
+
|
|
97
|
+
### Tokenizer Example
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
from npltk import create_tokenizer
|
|
101
|
+
|
|
102
|
+
tokenizer = create_tokenizer(
|
|
103
|
+
mode="hybrid",
|
|
104
|
+
keep_punct=True,
|
|
105
|
+
fallback_to_rule=True,
|
|
106
|
+
)
|
|
107
|
+
|
|
108
|
+
tokens = tokenizer.tokenize("नेपाल एक सुन्दर देश हो।")
|
|
109
|
+
print([t.text for t in tokens])
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
### Sentence Tokenization Example
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
from npltk import create_tokenizer
|
|
116
|
+
|
|
117
|
+
tokenizer = create_tokenizer(mode="hybrid")
|
|
118
|
+
sentences = tokenizer.tokenize_sentences("नेपाल सुन्दर देश हो। यहाँ हिमाल छन्।")
|
|
119
|
+
|
|
120
|
+
for sent in sentences:
|
|
121
|
+
print([t.text for t in sent.tokens])
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### Detokenization Example
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
from npltk import create_tokenizer
|
|
128
|
+
|
|
129
|
+
tokenizer = create_tokenizer(mode="hybrid")
|
|
130
|
+
tokens = tokenizer.tokenize("नेपाल सुन्दर देश हो।")
|
|
131
|
+
text = tokenizer.detokenize(tokens)
|
|
132
|
+
|
|
133
|
+
print(text)
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
---
|
|
137
|
+
|
|
138
|
+
## Separate Examples for Each Component
|
|
139
|
+
|
|
140
|
+
### 1. Normalizer
|
|
141
|
+
|
|
142
|
+
```python
|
|
143
|
+
from npltk.normalizer import build_normalizer
|
|
144
|
+
|
|
145
|
+
result = build_normalizer().normalize(" नेपाल।। ")
|
|
146
|
+
print(result.text)
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
### 2. Tokenizer
|
|
150
|
+
|
|
151
|
+
```python
|
|
152
|
+
from npltk import create_tokenizer
|
|
153
|
+
|
|
154
|
+
tokenizer = create_tokenizer(mode="hybrid")
|
|
155
|
+
tokens = tokenizer.tokenize("नेपालको प्रधानमन्त्री काठमाडौं गए।")
|
|
156
|
+
print([t.text for t in tokens])
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
### 3. Lemmatizer
|
|
160
|
+
|
|
161
|
+
```python
|
|
162
|
+
from npltk import Lemmatizer
|
|
163
|
+
|
|
164
|
+
lemmatizer = Lemmatizer()
|
|
165
|
+
print(lemmatizer.lemmatize("गयो"))
|
|
166
|
+
print(lemmatizer.lemmatize("घरहरूमा"))
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
### 4. Stop Word Removal
|
|
170
|
+
|
|
171
|
+
```python
|
|
172
|
+
from npltk import create_tokenizer
|
|
173
|
+
from npltk.stop_word.remover import StopWordRemover
|
|
174
|
+
|
|
175
|
+
tokens = create_tokenizer().tokenize("नेपाल सुन्दर देश हो र यहाँ हिमाल छन् ।")
|
|
176
|
+
filtered, info = StopWordRemover().remove(tokens)
|
|
177
|
+
|
|
178
|
+
print([t.text for t in filtered])
|
|
179
|
+
print(info)
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
### 5. POS Tagger
|
|
183
|
+
|
|
184
|
+
```python
|
|
185
|
+
from npltk import create_tokenizer, POSTagger
|
|
186
|
+
|
|
187
|
+
tokens = [t.text for t in create_tokenizer().tokenize("नेपालको प्रधानमन्त्री काठमाडौं गए।")]
|
|
188
|
+
tagger = POSTagger()
|
|
189
|
+
|
|
190
|
+
print(tagger.tag_with_tokens(tokens))
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
### 6. NER Tagger
|
|
194
|
+
|
|
195
|
+
```python
|
|
196
|
+
from npltk import NERTagger
|
|
197
|
+
|
|
198
|
+
tagger = NERTagger(tokenizer_mode="hybrid")
|
|
199
|
+
print(tagger.extract("शेरबहादुर देउवा काठमाडौं पुगे।"))
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
---
|
|
203
|
+
|
|
204
|
+
## Full Workflow Pipeline Example
|
|
205
|
+
|
|
206
|
+
```python
|
|
207
|
+
from pprint import pprint
|
|
208
|
+
|
|
209
|
+
from npltk import create_tokenizer, Lemmatizer, POSTagger, NERTagger
|
|
210
|
+
from npltk.normalizer import build_normalizer
|
|
211
|
+
from npltk.stop_word.remover import StopWordRemover
|
|
212
|
+
|
|
213
|
+
text = " शेरबहादुर देउवा काठमाडौं पुगे र नेपालको बारेमा बोले। "
|
|
214
|
+
|
|
215
|
+
# 1. Normalize
|
|
216
|
+
normalizer = build_normalizer()
|
|
217
|
+
norm_result = normalizer.normalize(text)
|
|
218
|
+
normalized_text = norm_result.text
|
|
219
|
+
print("Normalized:", normalized_text)
|
|
220
|
+
|
|
221
|
+
# 2. Tokenize
|
|
222
|
+
tokenizer = create_tokenizer(mode="hybrid", fallback_to_rule=True)
|
|
223
|
+
tokens = tokenizer.tokenize(normalized_text)
|
|
224
|
+
token_texts = [t.text for t in tokens]
|
|
225
|
+
print("Tokens:", token_texts)
|
|
226
|
+
|
|
227
|
+
# 3. Remove stop words
|
|
228
|
+
filtered_tokens, info = StopWordRemover().remove(tokens)
|
|
229
|
+
filtered_texts = [t.text for t in filtered_tokens]
|
|
230
|
+
print("Filtered Tokens:", filtered_texts)
|
|
231
|
+
print("Stopword Info:", info)
|
|
232
|
+
|
|
233
|
+
# 4. Lemmatize
|
|
234
|
+
lemmatizer = Lemmatizer()
|
|
235
|
+
lemmas = [lemmatizer.lemmatize(token) for token in filtered_texts]
|
|
236
|
+
print("Lemmas:", lemmas)
|
|
237
|
+
|
|
238
|
+
# 5. POS tagging
|
|
239
|
+
pos_tagger = POSTagger()
|
|
240
|
+
pos_pairs = pos_tagger.tag_with_tokens(token_texts)
|
|
241
|
+
print("POS Tags:", pos_pairs)
|
|
242
|
+
|
|
243
|
+
# 6. NER
|
|
244
|
+
ner_tagger = NERTagger(tokenizer_mode="hybrid")
|
|
245
|
+
ner_result = ner_tagger.predict(normalized_text)
|
|
246
|
+
|
|
247
|
+
print("NER Token-Tag Pairs:")
|
|
248
|
+
for token, tag in zip(ner_result["tokens"], ner_result["tags"]):
|
|
249
|
+
print(f"{token:12} {tag}")
|
|
250
|
+
|
|
251
|
+
print("Entities:")
|
|
252
|
+
pprint(ner_result["entities"], width=100)
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
---
|
|
256
|
+
|
|
257
|
+
## Features
|
|
258
|
+
|
|
259
|
+
* Nepali normalizer
|
|
260
|
+
* Hybrid tokenizer (rule-based + SentencePiece)
|
|
261
|
+
* Lemmatizer
|
|
262
|
+
* Stop-word removal
|
|
263
|
+
* POS tagging
|
|
264
|
+
* Named Entity Recognition (NER)
|
|
265
|
+
|
|
266
|
+
---
|
|
267
|
+
|
|
268
|
+
## Models
|
|
269
|
+
|
|
270
|
+
NPLTK includes bundled trained models for:
|
|
271
|
+
|
|
272
|
+
* POS Tagger
|
|
273
|
+
* NER Tagger
|
|
274
|
+
|
|
275
|
+
These work out of the box after installation.
|
|
276
|
+
|
|
277
|
+
---
|
|
278
|
+
|
|
279
|
+
## Suggested Workflow
|
|
280
|
+
|
|
281
|
+
1. Normalize text
|
|
282
|
+
2. Tokenize text
|
|
283
|
+
3. Optionally remove stop words
|
|
284
|
+
4. Lemmatize tokens
|
|
285
|
+
5. Run POS tagging
|
|
286
|
+
6. Run NER extraction
|
|
287
|
+
|
|
288
|
+
---
|
|
289
|
+
|
|
290
|
+
## Contributors
|
|
291
|
+
|
|
292
|
+
* Anurag Sharma
|
|
293
|
+
* Anita Budha Magar
|
|
294
|
+
* Apeksha Parajuli
|
|
295
|
+
* Apeksha Katwal
|
|
296
|
+
|
|
297
|
+
Supervisor:
|
|
298
|
+
|
|
299
|
+
* Pukar Karki
|
|
300
|
+
|
|
301
|
+
Institute of Engineering, Purwanchal Campus
|
|
302
|
+
|
|
303
|
+
---
|
|
304
|
+
|
|
305
|
+
## License
|
|
306
|
+
|
|
307
|
+
MIT License
|
npltk-0.3.2/README.md
ADDED
|
@@ -0,0 +1,290 @@
|
|
|
1
|
+
# NPLTK
|
|
2
|
+
|
|
3
|
+
Nepali Language Processing Toolkit (NPLTK) is a lightweight and modular NLP library designed specifically for the Nepali language. It provides tools for tokenization, normalization, lemmatization, stop-word removal, POS tagging, and Named Entity Recognition (NER).
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Why NPLTK?
|
|
8
|
+
|
|
9
|
+
Most NLP libraries are designed primarily for English and do not handle Nepali morphology, suffixes, and tokenization well.
|
|
10
|
+
|
|
11
|
+
NPLTK is built specifically for Nepali and provides:
|
|
12
|
+
|
|
13
|
+
* Hybrid tokenizer combining rule-based logic and SentencePiece
|
|
14
|
+
* Hybrid lemmatization using dictionary + rules
|
|
15
|
+
* Lightweight POS and NER models
|
|
16
|
+
* Fully self-contained package with bundled resources
|
|
17
|
+
|
|
18
|
+
---
|
|
19
|
+
|
|
20
|
+
## Installation
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
pip install npltk
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
For testing from TestPyPI:
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install -i https://test.pypi.org/simple/ npltk
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
---
|
|
33
|
+
|
|
34
|
+
## Minimal Example
|
|
35
|
+
|
|
36
|
+
```python
|
|
37
|
+
from npltk import create_tokenizer
|
|
38
|
+
|
|
39
|
+
tokens = create_tokenizer().tokenize("नेपाल सुन्दर देश हो।")
|
|
40
|
+
print([t.text for t in tokens])
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## Tokenizer
|
|
46
|
+
|
|
47
|
+
NPLTK provides a tokenizer factory through `create_tokenizer(...)`.
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
create_tokenizer(
|
|
51
|
+
mode="hybrid",
|
|
52
|
+
split_into_sentences=True,
|
|
53
|
+
keep_punct=True,
|
|
54
|
+
model_path=None,
|
|
55
|
+
subword=True,
|
|
56
|
+
preprocess=None,
|
|
57
|
+
fallback_to_rule=True,
|
|
58
|
+
)
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### Main arguments
|
|
62
|
+
|
|
63
|
+
* `mode`: `"hybrid"` or `"rule"`
|
|
64
|
+
|
|
65
|
+
* `"hybrid"` uses rule-based tokenization together with SentencePiece
|
|
66
|
+
* `"rule"` uses only rule-based tokenization
|
|
67
|
+
|
|
68
|
+
* `split_into_sentences`: whether sentence splitting is enabled internally
|
|
69
|
+
|
|
70
|
+
* `keep_punct`: whether punctuation tokens are kept in output
|
|
71
|
+
|
|
72
|
+
* `model_path`: optional custom SentencePiece model path
|
|
73
|
+
|
|
74
|
+
* `subword`: enables SentencePiece-based subword support in hybrid mode
|
|
75
|
+
|
|
76
|
+
* `preprocess`: optional preprocessing function applied before tokenization
|
|
77
|
+
|
|
78
|
+
* `fallback_to_rule`: if hybrid loading fails, automatically use rule mode
|
|
79
|
+
|
|
80
|
+
### Tokenizer Example
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
from npltk import create_tokenizer
|
|
84
|
+
|
|
85
|
+
tokenizer = create_tokenizer(
|
|
86
|
+
mode="hybrid",
|
|
87
|
+
keep_punct=True,
|
|
88
|
+
fallback_to_rule=True,
|
|
89
|
+
)
|
|
90
|
+
|
|
91
|
+
tokens = tokenizer.tokenize("नेपाल एक सुन्दर देश हो।")
|
|
92
|
+
print([t.text for t in tokens])
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### Sentence Tokenization Example
|
|
96
|
+
|
|
97
|
+
```python
|
|
98
|
+
from npltk import create_tokenizer
|
|
99
|
+
|
|
100
|
+
tokenizer = create_tokenizer(mode="hybrid")
|
|
101
|
+
sentences = tokenizer.tokenize_sentences("नेपाल सुन्दर देश हो। यहाँ हिमाल छन्।")
|
|
102
|
+
|
|
103
|
+
for sent in sentences:
|
|
104
|
+
print([t.text for t in sent.tokens])
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
### Detokenization Example
|
|
108
|
+
|
|
109
|
+
```python
|
|
110
|
+
from npltk import create_tokenizer
|
|
111
|
+
|
|
112
|
+
tokenizer = create_tokenizer(mode="hybrid")
|
|
113
|
+
tokens = tokenizer.tokenize("नेपाल सुन्दर देश हो।")
|
|
114
|
+
text = tokenizer.detokenize(tokens)
|
|
115
|
+
|
|
116
|
+
print(text)
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
---
|
|
120
|
+
|
|
121
|
+
## Separate Examples for Each Component
|
|
122
|
+
|
|
123
|
+
### 1. Normalizer
|
|
124
|
+
|
|
125
|
+
```python
|
|
126
|
+
from npltk.normalizer import build_normalizer
|
|
127
|
+
|
|
128
|
+
result = build_normalizer().normalize(" नेपाल।। ")
|
|
129
|
+
print(result.text)
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
### 2. Tokenizer
|
|
133
|
+
|
|
134
|
+
```python
|
|
135
|
+
from npltk import create_tokenizer
|
|
136
|
+
|
|
137
|
+
tokenizer = create_tokenizer(mode="hybrid")
|
|
138
|
+
tokens = tokenizer.tokenize("नेपालको प्रधानमन्त्री काठमाडौं गए।")
|
|
139
|
+
print([t.text for t in tokens])
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
### 3. Lemmatizer
|
|
143
|
+
|
|
144
|
+
```python
|
|
145
|
+
from npltk import Lemmatizer
|
|
146
|
+
|
|
147
|
+
lemmatizer = Lemmatizer()
|
|
148
|
+
print(lemmatizer.lemmatize("गयो"))
|
|
149
|
+
print(lemmatizer.lemmatize("घरहरूमा"))
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
### 4. Stop Word Removal
|
|
153
|
+
|
|
154
|
+
```python
|
|
155
|
+
from npltk import create_tokenizer
|
|
156
|
+
from npltk.stop_word.remover import StopWordRemover
|
|
157
|
+
|
|
158
|
+
tokens = create_tokenizer().tokenize("नेपाल सुन्दर देश हो र यहाँ हिमाल छन् ।")
|
|
159
|
+
filtered, info = StopWordRemover().remove(tokens)
|
|
160
|
+
|
|
161
|
+
print([t.text for t in filtered])
|
|
162
|
+
print(info)
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
### 5. POS Tagger
|
|
166
|
+
|
|
167
|
+
```python
|
|
168
|
+
from npltk import create_tokenizer, POSTagger
|
|
169
|
+
|
|
170
|
+
tokens = [t.text for t in create_tokenizer().tokenize("नेपालको प्रधानमन्त्री काठमाडौं गए।")]
|
|
171
|
+
tagger = POSTagger()
|
|
172
|
+
|
|
173
|
+
print(tagger.tag_with_tokens(tokens))
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
### 6. NER Tagger
|
|
177
|
+
|
|
178
|
+
```python
|
|
179
|
+
from npltk import NERTagger
|
|
180
|
+
|
|
181
|
+
tagger = NERTagger(tokenizer_mode="hybrid")
|
|
182
|
+
print(tagger.extract("शेरबहादुर देउवा काठमाडौं पुगे।"))
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
---
|
|
186
|
+
|
|
187
|
+
## Full Workflow Pipeline Example
|
|
188
|
+
|
|
189
|
+
```python
|
|
190
|
+
from pprint import pprint
|
|
191
|
+
|
|
192
|
+
from npltk import create_tokenizer, Lemmatizer, POSTagger, NERTagger
|
|
193
|
+
from npltk.normalizer import build_normalizer
|
|
194
|
+
from npltk.stop_word.remover import StopWordRemover
|
|
195
|
+
|
|
196
|
+
text = " शेरबहादुर देउवा काठमाडौं पुगे र नेपालको बारेमा बोले। "
|
|
197
|
+
|
|
198
|
+
# 1. Normalize
|
|
199
|
+
normalizer = build_normalizer()
|
|
200
|
+
norm_result = normalizer.normalize(text)
|
|
201
|
+
normalized_text = norm_result.text
|
|
202
|
+
print("Normalized:", normalized_text)
|
|
203
|
+
|
|
204
|
+
# 2. Tokenize
|
|
205
|
+
tokenizer = create_tokenizer(mode="hybrid", fallback_to_rule=True)
|
|
206
|
+
tokens = tokenizer.tokenize(normalized_text)
|
|
207
|
+
token_texts = [t.text for t in tokens]
|
|
208
|
+
print("Tokens:", token_texts)
|
|
209
|
+
|
|
210
|
+
# 3. Remove stop words
|
|
211
|
+
filtered_tokens, info = StopWordRemover().remove(tokens)
|
|
212
|
+
filtered_texts = [t.text for t in filtered_tokens]
|
|
213
|
+
print("Filtered Tokens:", filtered_texts)
|
|
214
|
+
print("Stopword Info:", info)
|
|
215
|
+
|
|
216
|
+
# 4. Lemmatize
|
|
217
|
+
lemmatizer = Lemmatizer()
|
|
218
|
+
lemmas = [lemmatizer.lemmatize(token) for token in filtered_texts]
|
|
219
|
+
print("Lemmas:", lemmas)
|
|
220
|
+
|
|
221
|
+
# 5. POS tagging
|
|
222
|
+
pos_tagger = POSTagger()
|
|
223
|
+
pos_pairs = pos_tagger.tag_with_tokens(token_texts)
|
|
224
|
+
print("POS Tags:", pos_pairs)
|
|
225
|
+
|
|
226
|
+
# 6. NER
|
|
227
|
+
ner_tagger = NERTagger(tokenizer_mode="hybrid")
|
|
228
|
+
ner_result = ner_tagger.predict(normalized_text)
|
|
229
|
+
|
|
230
|
+
print("NER Token-Tag Pairs:")
|
|
231
|
+
for token, tag in zip(ner_result["tokens"], ner_result["tags"]):
|
|
232
|
+
print(f"{token:12} {tag}")
|
|
233
|
+
|
|
234
|
+
print("Entities:")
|
|
235
|
+
pprint(ner_result["entities"], width=100)
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
---
|
|
239
|
+
|
|
240
|
+
## Features
|
|
241
|
+
|
|
242
|
+
* Nepali normalizer
|
|
243
|
+
* Hybrid tokenizer (rule-based + SentencePiece)
|
|
244
|
+
* Lemmatizer
|
|
245
|
+
* Stop-word removal
|
|
246
|
+
* POS tagging
|
|
247
|
+
* Named Entity Recognition (NER)
|
|
248
|
+
|
|
249
|
+
---
|
|
250
|
+
|
|
251
|
+
## Models
|
|
252
|
+
|
|
253
|
+
NPLTK includes bundled trained models for:
|
|
254
|
+
|
|
255
|
+
* POS Tagger
|
|
256
|
+
* NER Tagger
|
|
257
|
+
|
|
258
|
+
These work out of the box after installation.
|
|
259
|
+
|
|
260
|
+
---
|
|
261
|
+
|
|
262
|
+
## Suggested Workflow
|
|
263
|
+
|
|
264
|
+
1. Normalize text
|
|
265
|
+
2. Tokenize text
|
|
266
|
+
3. Optionally remove stop words
|
|
267
|
+
4. Lemmatize tokens
|
|
268
|
+
5. Run POS tagging
|
|
269
|
+
6. Run NER extraction
|
|
270
|
+
|
|
271
|
+
---
|
|
272
|
+
|
|
273
|
+
## Contributors
|
|
274
|
+
|
|
275
|
+
* Anurag Sharma
|
|
276
|
+
* Anita Budha Magar
|
|
277
|
+
* Apeksha Parajuli
|
|
278
|
+
* Apeksha Katwal
|
|
279
|
+
|
|
280
|
+
Supervisor:
|
|
281
|
+
|
|
282
|
+
* Pukar Karki
|
|
283
|
+
|
|
284
|
+
Institute of Engineering, Purwanchal Campus
|
|
285
|
+
|
|
286
|
+
---
|
|
287
|
+
|
|
288
|
+
## License
|
|
289
|
+
|
|
290
|
+
MIT License
|
npltk-0.3.2/setup.cfg
ADDED
npltk-0.3.2/setup.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""Packaging configuration for npltk (Nepali Language Processing Toolkit).

Reads the PyPI long description from README.md and declares the src/
layout, runtime dependencies, and bundled data files (shipped via
MANIFEST.in together with ``include_package_data=True``).
"""
from setuptools import setup, find_packages
from pathlib import Path

# The long description shown on PyPI comes straight from the README.
_here = Path(__file__).parent
_readme_text = (_here / "README.md").read_text(encoding="utf-8")

setup(
    name="npltk",
    version="0.3.2",
    description="Nepali Language Processing Toolkit",
    long_description=_readme_text,
    long_description_content_type="text/markdown",
    author="Anurag Sharma, Anita Budha Magar, Apeksha Parajuli, Apeksha Katwal",
    # src/ layout: importable packages live under src/.
    packages=find_packages(where="src"),
    package_dir={"": "src"},
    # Include the non-Python resources listed in MANIFEST.in
    # (SentencePiece model/vocab, lemma JSON, POS/NER .pth weights).
    include_package_data=True,
    install_requires=[
        "sentencepiece>=0.1.90,<0.2.0",
        "torch>=2.0",
        "pytorch-crf>=0.7.2",
    ],
    python_requires=">=3.7",
)
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""
|
|
2
|
+
npltk - Nepali Language Processing Toolkit
|
|
3
|
+
-------------------------------------------
|
|
4
|
+
Public API entry points:
|
|
5
|
+
- create_tokenizer
|
|
6
|
+
- Lemmatizer
|
|
7
|
+
- POSTagger
|
|
8
|
+
- NERTagger
|
|
9
|
+
"""
|
|
10
|
+
from .tokenizer.factory import create_tokenizer
|
|
11
|
+
from .lemmatizer import Lemmatizer
|
|
12
|
+
from .pos import POSTagger
|
|
13
|
+
from .ner import NERTagger
|
|
14
|
+
|
|
15
|
+
__all__ = ["create_tokenizer", "Lemmatizer", "POSTagger", "NERTagger"]
|
|
16
|
+
|
|
17
|
+
__version__ = "0.3.2"
|
|
18
|
+
__author__ = [
|
|
19
|
+
"Anurag Sharma",
|
|
20
|
+
"Anita Budha Magar",
|
|
21
|
+
"Apeksha Parajuli",
|
|
22
|
+
"Apeksha Katwal"
|
|
23
|
+
]
|
|
24
|
+
__credits__ = [
|
|
25
|
+
"Pukar Karki (Project Supervisor)"
|
|
26
|
+
]
|