npltk 0.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. npltk-0.3.2/MANIFEST.in +6 -0
  2. npltk-0.3.2/PKG-INFO +307 -0
  3. npltk-0.3.2/README.md +290 -0
  4. npltk-0.3.2/setup.cfg +4 -0
  5. npltk-0.3.2/setup.py +23 -0
  6. npltk-0.3.2/src/npltk/__init__.py +26 -0
  7. npltk-0.3.2/src/npltk/lemmatizer/__init__.py +3 -0
  8. npltk-0.3.2/src/npltk/lemmatizer/data/lemma_dict.json +112 -0
  9. npltk-0.3.2/src/npltk/lemmatizer/dict_lookup.py +41 -0
  10. npltk-0.3.2/src/npltk/lemmatizer/hybrid_lemmatizer.py +10 -0
  11. npltk-0.3.2/src/npltk/lemmatizer/lemmatizer.py +59 -0
  12. npltk-0.3.2/src/npltk/lemmatizer/rule_stripper.py +123 -0
  13. npltk-0.3.2/src/npltk/ner/__init__.py +3 -0
  14. npltk-0.3.2/src/npltk/ner/model.py +117 -0
  15. npltk-0.3.2/src/npltk/ner/models/ner_model.pth +0 -0
  16. npltk-0.3.2/src/npltk/ner/tagger.py +209 -0
  17. npltk-0.3.2/src/npltk/normalizer/__init__.py +96 -0
  18. npltk-0.3.2/src/npltk/normalizer/config.py +31 -0
  19. npltk-0.3.2/src/npltk/normalizer/core.py +41 -0
  20. npltk-0.3.2/src/npltk/normalizer/rules.py +398 -0
  21. npltk-0.3.2/src/npltk/pos/__init__.py +3 -0
  22. npltk-0.3.2/src/npltk/pos/model.py +117 -0
  23. npltk-0.3.2/src/npltk/pos/models/npltk_pos_tagger.pth +0 -0
  24. npltk-0.3.2/src/npltk/pos/tagger.py +81 -0
  25. npltk-0.3.2/src/npltk/stop_word/__init__.py +1 -0
  26. npltk-0.3.2/src/npltk/stop_word/nepali_stopwords.txt +15 -0
  27. npltk-0.3.2/src/npltk/stop_word/remover.py +62 -0
  28. npltk-0.3.2/src/npltk/tokenizer/__init__.py +9 -0
  29. npltk-0.3.2/src/npltk/tokenizer/detokenize.py +32 -0
  30. npltk-0.3.2/src/npltk/tokenizer/factory.py +66 -0
  31. npltk-0.3.2/src/npltk/tokenizer/hybrid_tokenizer.py +206 -0
  32. npltk-0.3.2/src/npltk/tokenizer/models/nepali_tokenizer.model +0 -0
  33. npltk-0.3.2/src/npltk/tokenizer/models/nepali_tokenizer.vocab +64000 -0
  34. npltk-0.3.2/src/npltk/tokenizer/rule_engine.py +98 -0
  35. npltk-0.3.2/src/npltk/tokenizer/sentence_splitter.py +65 -0
  36. npltk-0.3.2/src/npltk/tokenizer/tokenizer.py +59 -0
  37. npltk-0.3.2/src/npltk/tokenizer/types.py +33 -0
  38. npltk-0.3.2/src/npltk/tokenizer/word_tokenizer.py +23 -0
  39. npltk-0.3.2/src/npltk.egg-info/PKG-INFO +307 -0
  40. npltk-0.3.2/src/npltk.egg-info/SOURCES.txt +41 -0
  41. npltk-0.3.2/src/npltk.egg-info/dependency_links.txt +1 -0
  42. npltk-0.3.2/src/npltk.egg-info/requires.txt +3 -0
  43. npltk-0.3.2/src/npltk.egg-info/top_level.txt +1 -0
npltk-0.3.2/MANIFEST.in ADDED
@@ -0,0 +1,6 @@
1
+ include README.md
2
+ recursive-include src/npltk/stop_word *.txt
3
+ recursive-include src/npltk/tokenizer/models *.model *.vocab
4
+ recursive-include src/npltk/lemmatizer/data *.json
5
+ recursive-include src/npltk/pos/models *.pth
6
+ recursive-include src/npltk/ner/models *.pth
npltk-0.3.2/PKG-INFO ADDED
@@ -0,0 +1,307 @@
1
+ Metadata-Version: 2.4
2
+ Name: npltk
3
+ Version: 0.3.2
4
+ Summary: Nepali Language Processing Toolkit
5
+ Author: Anurag Sharma, Anita Budha Magar, Apeksha Parajuli, Apeksha Katwal
6
+ Requires-Python: >=3.7
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: sentencepiece<0.2.0,>=0.1.90
9
+ Requires-Dist: torch>=2.0
10
+ Requires-Dist: pytorch-crf>=0.7.2
11
+ Dynamic: author
12
+ Dynamic: description
13
+ Dynamic: description-content-type
14
+ Dynamic: requires-dist
15
+ Dynamic: requires-python
16
+ Dynamic: summary
17
+
18
+ # NPLTK
19
+
20
+ Nepali Language Processing Toolkit (NPLTK) is a lightweight and modular NLP library designed specifically for the Nepali language. It provides tools for tokenization, normalization, lemmatization, stop-word removal, POS tagging, and Named Entity Recognition (NER).
21
+
22
+ ---
23
+
24
+ ## Why NPLTK?
25
+
26
+ Most NLP libraries are designed primarily for English and do not handle Nepali morphology, suffixes, and tokenization well.
27
+
28
+ NPLTK is built specifically for Nepali and provides:
29
+
30
+ * Hybrid tokenizer combining rule-based logic and SentencePiece
31
+ * Hybrid lemmatization using dictionary + rules
32
+ * Lightweight POS and NER models
33
+ * Fully self-contained package with bundled resources
34
+
35
+ ---
36
+
37
+ ## Installation
38
+
39
+ ```bash
40
+ pip install npltk
41
+ ```
42
+
43
+ For testing from TestPyPI:
44
+
45
+ ```bash
46
+ pip install -i https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ npltk
47
+ ```
48
+
49
+ ---
50
+
51
+ ## Minimal Example
52
+
53
+ ```python
54
+ from npltk import create_tokenizer
55
+
56
+ tokens = create_tokenizer().tokenize("नेपाल सुन्दर देश हो।")
57
+ print([t.text for t in tokens])
58
+ ```
59
+
60
+ ---
61
+
62
+ ## Tokenizer
63
+
64
+ NPLTK provides a tokenizer factory through `create_tokenizer(...)`.
65
+
66
+ ```python
67
+ create_tokenizer(
68
+ mode="hybrid",
69
+ split_into_sentences=True,
70
+ keep_punct=True,
71
+ model_path=None,
72
+ subword=True,
73
+ preprocess=None,
74
+ fallback_to_rule=True,
75
+ )
76
+ ```
77
+
78
+ ### Main arguments
79
+
80
+ * `mode`: `"hybrid"` or `"rule"`
81
+
82
+   * `"hybrid"` uses rule-based tokenization together with SentencePiece
83
+   * `"rule"` uses only rule-based tokenization
84
+
85
+ * `split_into_sentences`: whether sentence splitting is enabled internally
86
+
87
+ * `keep_punct`: whether punctuation tokens are kept in output
88
+
89
+ * `model_path`: optional custom SentencePiece model path
90
+
91
+ * `subword`: enables SentencePiece-based subword support in hybrid mode
92
+
93
+ * `preprocess`: optional preprocessing function applied before tokenization
94
+
95
+ * `fallback_to_rule`: if hybrid loading fails, automatically use rule mode
96
+
97
+ ### Tokenizer Example
98
+
99
+ ```python
100
+ from npltk import create_tokenizer
101
+
102
+ tokenizer = create_tokenizer(
103
+ mode="hybrid",
104
+ keep_punct=True,
105
+ fallback_to_rule=True,
106
+ )
107
+
108
+ tokens = tokenizer.tokenize("नेपाल एक सुन्दर देश हो।")
109
+ print([t.text for t in tokens])
110
+ ```
111
+
112
+ ### Sentence Tokenization Example
113
+
114
+ ```python
115
+ from npltk import create_tokenizer
116
+
117
+ tokenizer = create_tokenizer(mode="hybrid")
118
+ sentences = tokenizer.tokenize_sentences("नेपाल सुन्दर देश हो। यहाँ हिमाल छन्।")
119
+
120
+ for sent in sentences:
121
+     print([t.text for t in sent.tokens])
122
+ ```
123
+
124
+ ### Detokenization Example
125
+
126
+ ```python
127
+ from npltk import create_tokenizer
128
+
129
+ tokenizer = create_tokenizer(mode="hybrid")
130
+ tokens = tokenizer.tokenize("नेपाल सुन्दर देश हो।")
131
+ text = tokenizer.detokenize(tokens)
132
+
133
+ print(text)
134
+ ```
135
+
136
+ ---
137
+
138
+ ## Separate Examples for Each Component
139
+
140
+ ### 1. Normalizer
141
+
142
+ ```python
143
+ from npltk.normalizer import build_normalizer
144
+
145
+ result = build_normalizer().normalize(" नेपाल।। ")
146
+ print(result.text)
147
+ ```
148
+
149
+ ### 2. Tokenizer
150
+
151
+ ```python
152
+ from npltk import create_tokenizer
153
+
154
+ tokenizer = create_tokenizer(mode="hybrid")
155
+ tokens = tokenizer.tokenize("नेपालको प्रधानमन्त्री काठमाडौं गए।")
156
+ print([t.text for t in tokens])
157
+ ```
158
+
159
+ ### 3. Lemmatizer
160
+
161
+ ```python
162
+ from npltk import Lemmatizer
163
+
164
+ lemmatizer = Lemmatizer()
165
+ print(lemmatizer.lemmatize("गयो"))
166
+ print(lemmatizer.lemmatize("घरहरूमा"))
167
+ ```
168
+
169
+ ### 4. Stop Word Removal
170
+
171
+ ```python
172
+ from npltk import create_tokenizer
173
+ from npltk.stop_word.remover import StopWordRemover
174
+
175
+ tokens = create_tokenizer().tokenize("नेपाल सुन्दर देश हो र यहाँ हिमाल छन् ।")
176
+ filtered, info = StopWordRemover().remove(tokens)
177
+
178
+ print([t.text for t in filtered])
179
+ print(info)
180
+ ```
181
+
182
+ ### 5. POS Tagger
183
+
184
+ ```python
185
+ from npltk import create_tokenizer, POSTagger
186
+
187
+ tokens = [t.text for t in create_tokenizer().tokenize("नेपालको प्रधानमन्त्री काठमाडौं गए।")]
188
+ tagger = POSTagger()
189
+
190
+ print(tagger.tag_with_tokens(tokens))
191
+ ```
192
+
193
+ ### 6. NER Tagger
194
+
195
+ ```python
196
+ from npltk import NERTagger
197
+
198
+ tagger = NERTagger(tokenizer_mode="hybrid")
199
+ print(tagger.extract("शेरबहादुर देउवा काठमाडौं पुगे।"))
200
+ ```
201
+
202
+ ---
203
+
204
+ ## Full Workflow Pipeline Example
205
+
206
+ ```python
207
+ from pprint import pprint
208
+
209
+ from npltk import create_tokenizer, Lemmatizer, POSTagger, NERTagger
210
+ from npltk.normalizer import build_normalizer
211
+ from npltk.stop_word.remover import StopWordRemover
212
+
213
+ text = " शेरबहादुर देउवा काठमाडौं पुगे र नेपालको बारेमा बोले। "
214
+
215
+ # 1. Normalize
216
+ normalizer = build_normalizer()
217
+ norm_result = normalizer.normalize(text)
218
+ normalized_text = norm_result.text
219
+ print("Normalized:", normalized_text)
220
+
221
+ # 2. Tokenize
222
+ tokenizer = create_tokenizer(mode="hybrid", fallback_to_rule=True)
223
+ tokens = tokenizer.tokenize(normalized_text)
224
+ token_texts = [t.text for t in tokens]
225
+ print("Tokens:", token_texts)
226
+
227
+ # 3. Remove stop words
228
+ filtered_tokens, info = StopWordRemover().remove(tokens)
229
+ filtered_texts = [t.text for t in filtered_tokens]
230
+ print("Filtered Tokens:", filtered_texts)
231
+ print("Stopword Info:", info)
232
+
233
+ # 4. Lemmatize
234
+ lemmatizer = Lemmatizer()
235
+ lemmas = [lemmatizer.lemmatize(token) for token in filtered_texts]
236
+ print("Lemmas:", lemmas)
237
+
238
+ # 5. POS tagging
239
+ pos_tagger = POSTagger()
240
+ pos_pairs = pos_tagger.tag_with_tokens(token_texts)
241
+ print("POS Tags:", pos_pairs)
242
+
243
+ # 6. NER
244
+ ner_tagger = NERTagger(tokenizer_mode="hybrid")
245
+ ner_result = ner_tagger.predict(normalized_text)
246
+
247
+ print("NER Token-Tag Pairs:")
248
+ for token, tag in zip(ner_result["tokens"], ner_result["tags"]):
249
+     print(f"{token:12} {tag}")
250
+
251
+ print("Entities:")
252
+ pprint(ner_result["entities"], width=100)
253
+ ```
254
+
255
+ ---
256
+
257
+ ## Features
258
+
259
+ * Nepali normalizer
260
+ * Hybrid tokenizer (rule-based + SentencePiece)
261
+ * Lemmatizer
262
+ * Stop-word removal
263
+ * POS tagging
264
+ * Named Entity Recognition (NER)
265
+
266
+ ---
267
+
268
+ ## Models
269
+
270
+ NPLTK includes bundled trained models for:
271
+
272
+ * POS Tagger
273
+ * NER Tagger
274
+
275
+ These work out of the box after installation.
276
+
277
+ ---
278
+
279
+ ## Suggested Workflow
280
+
281
+ 1. Normalize text
282
+ 2. Tokenize text
283
+ 3. Optionally remove stop words
284
+ 4. Lemmatize tokens
285
+ 5. Run POS tagging
286
+ 6. Run NER extraction
287
+
288
+ ---
289
+
290
+ ## Contributors
291
+
292
+ * Anurag Sharma
293
+ * Anita Budha Magar
294
+ * Apeksha Parajuli
295
+ * Apeksha Katwal
296
+
297
+ Supervisor:
298
+
299
+ * Pukar Karki
300
+
301
+ Institute of Engineering, Purwanchal Campus
302
+
303
+ ---
304
+
305
+ ## License
306
+
307
+ MIT License
npltk-0.3.2/README.md ADDED
@@ -0,0 +1,290 @@
1
+ # NPLTK
2
+
3
+ Nepali Language Processing Toolkit (NPLTK) is a lightweight and modular NLP library designed specifically for the Nepali language. It provides tools for tokenization, normalization, lemmatization, stop-word removal, POS tagging, and Named Entity Recognition (NER).
4
+
5
+ ---
6
+
7
+ ## Why NPLTK?
8
+
9
+ Most NLP libraries are designed primarily for English and do not handle Nepali morphology, suffixes, and tokenization well.
10
+
11
+ NPLTK is built specifically for Nepali and provides:
12
+
13
+ * Hybrid tokenizer combining rule-based logic and SentencePiece
14
+ * Hybrid lemmatization using dictionary + rules
15
+ * Lightweight POS and NER models
16
+ * Fully self-contained package with bundled resources
17
+
18
+ ---
19
+
20
+ ## Installation
21
+
22
+ ```bash
23
+ pip install npltk
24
+ ```
25
+
26
+ For testing from TestPyPI:
27
+
28
+ ```bash
29
+ pip install -i https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ npltk
30
+ ```
31
+
32
+ ---
33
+
34
+ ## Minimal Example
35
+
36
+ ```python
37
+ from npltk import create_tokenizer
38
+
39
+ tokens = create_tokenizer().tokenize("नेपाल सुन्दर देश हो।")
40
+ print([t.text for t in tokens])
41
+ ```
42
+
43
+ ---
44
+
45
+ ## Tokenizer
46
+
47
+ NPLTK provides a tokenizer factory through `create_tokenizer(...)`.
48
+
49
+ ```python
50
+ create_tokenizer(
51
+ mode="hybrid",
52
+ split_into_sentences=True,
53
+ keep_punct=True,
54
+ model_path=None,
55
+ subword=True,
56
+ preprocess=None,
57
+ fallback_to_rule=True,
58
+ )
59
+ ```
60
+
61
+ ### Main arguments
62
+
63
+ * `mode`: `"hybrid"` or `"rule"`
64
+
65
+   * `"hybrid"` uses rule-based tokenization together with SentencePiece
66
+   * `"rule"` uses only rule-based tokenization
67
+
68
+ * `split_into_sentences`: whether sentence splitting is enabled internally
69
+
70
+ * `keep_punct`: whether punctuation tokens are kept in output
71
+
72
+ * `model_path`: optional custom SentencePiece model path
73
+
74
+ * `subword`: enables SentencePiece-based subword support in hybrid mode
75
+
76
+ * `preprocess`: optional preprocessing function applied before tokenization
77
+
78
+ * `fallback_to_rule`: if hybrid loading fails, automatically use rule mode
79
+
80
+ ### Tokenizer Example
81
+
82
+ ```python
83
+ from npltk import create_tokenizer
84
+
85
+ tokenizer = create_tokenizer(
86
+ mode="hybrid",
87
+ keep_punct=True,
88
+ fallback_to_rule=True,
89
+ )
90
+
91
+ tokens = tokenizer.tokenize("नेपाल एक सुन्दर देश हो।")
92
+ print([t.text for t in tokens])
93
+ ```
94
+
95
+ ### Sentence Tokenization Example
96
+
97
+ ```python
98
+ from npltk import create_tokenizer
99
+
100
+ tokenizer = create_tokenizer(mode="hybrid")
101
+ sentences = tokenizer.tokenize_sentences("नेपाल सुन्दर देश हो। यहाँ हिमाल छन्।")
102
+
103
+ for sent in sentences:
104
+     print([t.text for t in sent.tokens])
105
+ ```
106
+
107
+ ### Detokenization Example
108
+
109
+ ```python
110
+ from npltk import create_tokenizer
111
+
112
+ tokenizer = create_tokenizer(mode="hybrid")
113
+ tokens = tokenizer.tokenize("नेपाल सुन्दर देश हो।")
114
+ text = tokenizer.detokenize(tokens)
115
+
116
+ print(text)
117
+ ```
118
+
119
+ ---
120
+
121
+ ## Separate Examples for Each Component
122
+
123
+ ### 1. Normalizer
124
+
125
+ ```python
126
+ from npltk.normalizer import build_normalizer
127
+
128
+ result = build_normalizer().normalize(" नेपाल।। ")
129
+ print(result.text)
130
+ ```
131
+
132
+ ### 2. Tokenizer
133
+
134
+ ```python
135
+ from npltk import create_tokenizer
136
+
137
+ tokenizer = create_tokenizer(mode="hybrid")
138
+ tokens = tokenizer.tokenize("नेपालको प्रधानमन्त्री काठमाडौं गए।")
139
+ print([t.text for t in tokens])
140
+ ```
141
+
142
+ ### 3. Lemmatizer
143
+
144
+ ```python
145
+ from npltk import Lemmatizer
146
+
147
+ lemmatizer = Lemmatizer()
148
+ print(lemmatizer.lemmatize("गयो"))
149
+ print(lemmatizer.lemmatize("घरहरूमा"))
150
+ ```
151
+
152
+ ### 4. Stop Word Removal
153
+
154
+ ```python
155
+ from npltk import create_tokenizer
156
+ from npltk.stop_word.remover import StopWordRemover
157
+
158
+ tokens = create_tokenizer().tokenize("नेपाल सुन्दर देश हो र यहाँ हिमाल छन् ।")
159
+ filtered, info = StopWordRemover().remove(tokens)
160
+
161
+ print([t.text for t in filtered])
162
+ print(info)
163
+ ```
164
+
165
+ ### 5. POS Tagger
166
+
167
+ ```python
168
+ from npltk import create_tokenizer, POSTagger
169
+
170
+ tokens = [t.text for t in create_tokenizer().tokenize("नेपालको प्रधानमन्त्री काठमाडौं गए।")]
171
+ tagger = POSTagger()
172
+
173
+ print(tagger.tag_with_tokens(tokens))
174
+ ```
175
+
176
+ ### 6. NER Tagger
177
+
178
+ ```python
179
+ from npltk import NERTagger
180
+
181
+ tagger = NERTagger(tokenizer_mode="hybrid")
182
+ print(tagger.extract("शेरबहादुर देउवा काठमाडौं पुगे।"))
183
+ ```
184
+
185
+ ---
186
+
187
+ ## Full Workflow Pipeline Example
188
+
189
+ ```python
190
+ from pprint import pprint
191
+
192
+ from npltk import create_tokenizer, Lemmatizer, POSTagger, NERTagger
193
+ from npltk.normalizer import build_normalizer
194
+ from npltk.stop_word.remover import StopWordRemover
195
+
196
+ text = " शेरबहादुर देउवा काठमाडौं पुगे र नेपालको बारेमा बोले। "
197
+
198
+ # 1. Normalize
199
+ normalizer = build_normalizer()
200
+ norm_result = normalizer.normalize(text)
201
+ normalized_text = norm_result.text
202
+ print("Normalized:", normalized_text)
203
+
204
+ # 2. Tokenize
205
+ tokenizer = create_tokenizer(mode="hybrid", fallback_to_rule=True)
206
+ tokens = tokenizer.tokenize(normalized_text)
207
+ token_texts = [t.text for t in tokens]
208
+ print("Tokens:", token_texts)
209
+
210
+ # 3. Remove stop words
211
+ filtered_tokens, info = StopWordRemover().remove(tokens)
212
+ filtered_texts = [t.text for t in filtered_tokens]
213
+ print("Filtered Tokens:", filtered_texts)
214
+ print("Stopword Info:", info)
215
+
216
+ # 4. Lemmatize
217
+ lemmatizer = Lemmatizer()
218
+ lemmas = [lemmatizer.lemmatize(token) for token in filtered_texts]
219
+ print("Lemmas:", lemmas)
220
+
221
+ # 5. POS tagging
222
+ pos_tagger = POSTagger()
223
+ pos_pairs = pos_tagger.tag_with_tokens(token_texts)
224
+ print("POS Tags:", pos_pairs)
225
+
226
+ # 6. NER
227
+ ner_tagger = NERTagger(tokenizer_mode="hybrid")
228
+ ner_result = ner_tagger.predict(normalized_text)
229
+
230
+ print("NER Token-Tag Pairs:")
231
+ for token, tag in zip(ner_result["tokens"], ner_result["tags"]):
232
+     print(f"{token:12} {tag}")
233
+
234
+ print("Entities:")
235
+ pprint(ner_result["entities"], width=100)
236
+ ```
237
+
238
+ ---
239
+
240
+ ## Features
241
+
242
+ * Nepali normalizer
243
+ * Hybrid tokenizer (rule-based + SentencePiece)
244
+ * Lemmatizer
245
+ * Stop-word removal
246
+ * POS tagging
247
+ * Named Entity Recognition (NER)
248
+
249
+ ---
250
+
251
+ ## Models
252
+
253
+ NPLTK includes bundled trained models for:
254
+
255
+ * POS Tagger
256
+ * NER Tagger
257
+
258
+ These work out of the box after installation.
259
+
260
+ ---
261
+
262
+ ## Suggested Workflow
263
+
264
+ 1. Normalize text
265
+ 2. Tokenize text
266
+ 3. Optionally remove stop words
267
+ 4. Lemmatize tokens
268
+ 5. Run POS tagging
269
+ 6. Run NER extraction
270
+
271
+ ---
272
+
273
+ ## Contributors
274
+
275
+ * Anurag Sharma
276
+ * Anita Budha Magar
277
+ * Apeksha Parajuli
278
+ * Apeksha Katwal
279
+
280
+ Supervisor:
281
+
282
+ * Pukar Karki
283
+
284
+ Institute of Engineering, Purwanchal Campus
285
+
286
+ ---
287
+
288
+ ## License
289
+
290
+ MIT License
npltk-0.3.2/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
npltk-0.3.2/setup.py ADDED
@@ -0,0 +1,23 @@
1
+ from setuptools import setup, find_packages
2
+ from pathlib import Path
3
+
4
+ this_directory = Path(__file__).parent
5
+ long_description = (this_directory / "README.md").read_text(encoding="utf-8")
6
+
7
+ setup(
8
+ name="npltk",
9
+ version="0.3.2",
10
+ description="Nepali Language Processing Toolkit",
11
+ long_description=long_description,
12
+ long_description_content_type="text/markdown",
13
+ author="Anurag Sharma, Anita Budha Magar, Apeksha Parajuli, Apeksha Katwal",
14
+ packages=find_packages(where="src"),
15
+ package_dir={"": "src"},
16
+ include_package_data=True,
17
+ install_requires=[
18
+ "sentencepiece>=0.1.90,<0.2.0",
19
+ "torch>=2.0",
20
+ "pytorch-crf>=0.7.2",
21
+ ],
22
+ python_requires=">=3.7",
23
+ )
npltk-0.3.2/src/npltk/__init__.py ADDED
@@ -0,0 +1,26 @@
1
+ """
2
+ npltk - Nepali Language Processing Toolkit
3
+ -------------------------------------------
4
+ Public API entry points:
5
+ - create_tokenizer
6
+ - Lemmatizer
7
+ - POSTagger
8
+ - NERTagger
9
+ """
10
+ from .tokenizer.factory import create_tokenizer
11
+ from .lemmatizer import Lemmatizer
12
+ from .pos import POSTagger
13
+ from .ner import NERTagger
14
+
15
+ __all__ = ["create_tokenizer", "Lemmatizer", "POSTagger", "NERTagger"]
16
+
17
+ __version__ = "0.3.2"
18
+ __author__ = [
19
+ "Anurag Sharma",
20
+ "Anita Budha Magar",
21
+ "Apeksha Parajuli",
22
+ "Apeksha Katwal"
23
+ ]
24
+ __credits__ = [
25
+ "Pukar Karki (Project Supervisor)"
26
+ ]
npltk-0.3.2/src/npltk/lemmatizer/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .lemmatizer import Lemmatizer
2
+
3
+ __all__ = ["Lemmatizer"]