sindhinltk 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sindhinltk-0.1.0/PKG-INFO +12 -0
- sindhinltk-0.1.0/README.md +1 -0
- sindhinltk-0.1.0/pyproject.toml +12 -0
- sindhinltk-0.1.0/setup.cfg +4 -0
- sindhinltk-0.1.0/sindhinltk/__init__.py +21 -0
- sindhinltk-0.1.0/sindhinltk/normalizer.py +11 -0
- sindhinltk-0.1.0/sindhinltk/sentiment.py +0 -0
- sindhinltk-0.1.0/sindhinltk/stemmer.py +7 -0
- sindhinltk-0.1.0/sindhinltk/stopwords.py +5 -0
- sindhinltk-0.1.0/sindhinltk/tokenizer.py +0 -0
- sindhinltk-0.1.0/sindhinltk.egg-info/PKG-INFO +12 -0
- sindhinltk-0.1.0/sindhinltk.egg-info/SOURCES.txt +13 -0
- sindhinltk-0.1.0/sindhinltk.egg-info/dependency_links.txt +1 -0
- sindhinltk-0.1.0/sindhinltk.egg-info/requires.txt +4 -0
- sindhinltk-0.1.0/sindhinltk.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sindhinltk
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Morphology-Aware Sindhi NLP Toolkit
|
|
5
|
+
Author: Aakash Meghwar
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
Requires-Dist: transformers
|
|
8
|
+
Requires-Dist: tokenizers
|
|
9
|
+
Requires-Dist: regex
|
|
10
|
+
Requires-Dist: torch
|
|
11
|
+
|
|
12
|
+
# SindhiNLTK
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# SindhiNLTK
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
[project]
|
|
5
|
+
name = "sindhinltk"
|
|
6
|
+
version = "0.1.0"
|
|
7
|
+
description = "Morphology-Aware Sindhi NLP Toolkit"
|
|
8
|
+
readme = "README.md"
|
|
9
|
+
authors = [{ name = "Aakash Meghwar" }]
|
|
10
|
+
dependencies = ["transformers", "tokenizers", "regex", "torch"]
|
|
11
|
+
[tool.setuptools]
|
|
12
|
+
packages = ["sindhinltk"]
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from .normalizer import SindhiNormalizer
|
|
2
|
+
from .tokenizer import SindhiTokenizer
|
|
3
|
+
from .stemmer import SindhiStemmer
|
|
4
|
+
from .stopwords import SindhiStopwords
|
|
5
|
+
from .sentiment import SindhiSentiment
|
|
6
|
+
import unicodedata
|
|
7
|
+
|
|
8
|
+
class SindhiNLP:
    """Facade that chains the toolkit's components into a single pipeline.

    Each instance owns one normalizer, tokenizer, stemmer, stopword filter
    and sentiment analyzer, all constructed with their default arguments.
    """

    def __init__(self):
        # One collaborator per pipeline stage.
        self.norm = SindhiNormalizer()
        self.tok = SindhiTokenizer()
        self.stemmer = SindhiStemmer()
        self.sw = SindhiStopwords()
        self.sent = SindhiSentiment()

    def process(self, text):
        """Run *text* through every stage and return the collected results.

        Stages, in order: normalize -> tokenize -> remove stopwords ->
        stem each remaining token -> analyze sentiment of the stems.

        Returns a dict with three keys:
            "tokens":    the tokenizer output (before stopword removal),
            "stems":     stems of the tokens that survived stopword removal,
            "sentiment": whatever the sentiment analyzer returns for the stems.
        """
        normalized = self.norm.normalize(text)
        tokens = self.tok.tokenize(normalized)
        kept = self.sw.remove_stopwords(tokens)
        stems = [self.stemmer.stem(token) for token in kept]
        verdict = self.sent.analyze(stems)
        return {
            "tokens": tokens,
            "stems": stems,
            "sentiment": verdict,
        }
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import unicodedata
|
|
2
|
+
import re
|
|
3
|
+
|
|
4
|
+
class SindhiNormalizer:
    """Clean raw text: drop invisible characters, apply NFC, tidy spacing."""

    # Runs of spaces and/or tabs. Newlines are not matched, so interior
    # line breaks survive normalization.
    _SPACE_RUN = re.compile(r'[ \t]+')

    def __init__(self):
        # Characters removed outright, in order: ZWNJ, ZWJ, zero-width space,
        # zero-width no-break space (BOM), soft hyphen.
        self.ZERO_WIDTH = ['\u200C', '\u200D', '\u200B', '\uFEFF', '\u00AD']

    def normalize(self, text: str) -> str:
        """Return *text* with invisible characters removed, in NFC form.

        Steps:
          1. Delete every entry of ``self.ZERO_WIDTH``.
          2. Apply Unicode NFC composition.
          3. Collapse each run of spaces/tabs to one space and strip
             leading/trailing whitespace.

        The invisible characters are removed *before* composing. If NFC ran
        first, a base letter and a combining mark separated by one of those
        characters would become adjacent only afterwards and stay
        uncomposed, so the result would not actually be NFC.
        """
        for invisible in self.ZERO_WIDTH:
            text = text.replace(invisible, '')
        text = unicodedata.normalize('NFC', text)
        return self._SPACE_RUN.sub(' ', text).strip()
|
|
File without changes
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
class SindhiStemmer:
    """Suffix-stripping stemmer driven by an ordered list of suffixes."""

    def __init__(self, suffixes=None, min_stem_len=2):
        """Configure the stemmer.

        Args:
            suffixes: Optional iterable of suffix strings, tried in the given
                order (list longer suffixes before the shorter ones they end
                with). Defaults to the built-in Sindhi suffix list.
            min_stem_len: Minimum number of characters that must remain after
                a suffix is removed; defaults to 2.
        """
        if suffixes is None:
            suffixes = ['ائيندڙ', 'يندڙ', 'يائين', 'ائين', 'يون', 'ان', 'ون', 'ين', 'ي', 'و']
        self.suffixes = list(suffixes)
        self.min_stem_len = min_stem_len

    def stem(self, word: str) -> str:
        """Return *word* with the first applicable suffix removed.

        Suffixes are tried in ``self.suffixes`` order. A suffix applies when
        *word* ends with it and at least ``self.min_stem_len`` characters
        would remain. If none applies, *word* is returned unchanged.
        """
        for suffix in self.suffixes:
            if not suffix:
                # An empty suffix matches everything and word[:-0] is '',
                # which would wipe the word; ignore it.
                continue
            remaining = len(word) - len(suffix)
            if remaining >= self.min_stem_len and word.endswith(suffix):
                return word[:remaining]
        return word
|
|
File without changes
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sindhinltk
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Morphology-Aware Sindhi NLP Toolkit
|
|
5
|
+
Author: Aakash Meghwar
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
Requires-Dist: transformers
|
|
8
|
+
Requires-Dist: tokenizers
|
|
9
|
+
Requires-Dist: regex
|
|
10
|
+
Requires-Dist: torch
|
|
11
|
+
|
|
12
|
+
# SindhiNLTK
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
sindhinltk/__init__.py
|
|
4
|
+
sindhinltk/normalizer.py
|
|
5
|
+
sindhinltk/sentiment.py
|
|
6
|
+
sindhinltk/stemmer.py
|
|
7
|
+
sindhinltk/stopwords.py
|
|
8
|
+
sindhinltk/tokenizer.py
|
|
9
|
+
sindhinltk.egg-info/PKG-INFO
|
|
10
|
+
sindhinltk.egg-info/SOURCES.txt
|
|
11
|
+
sindhinltk.egg-info/dependency_links.txt
|
|
12
|
+
sindhinltk.egg-info/requires.txt
|
|
13
|
+
sindhinltk.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
sindhinltk
|