sindhinltk 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,12 @@
1
+ Metadata-Version: 2.4
2
+ Name: sindhinltk
3
+ Version: 0.1.0
4
+ Summary: Morphology-Aware Sindhi NLP Toolkit
5
+ Author: Aakash Meghwar
6
+ Description-Content-Type: text/markdown
7
+ Requires-Dist: transformers
8
+ Requires-Dist: tokenizers
9
+ Requires-Dist: regex
10
+ Requires-Dist: torch
11
+
12
+ # SindhiNLTK
@@ -0,0 +1 @@
1
+ # SindhiNLTK
@@ -0,0 +1,12 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+ [project]
5
+ name = "sindhinltk"
6
+ version = "0.1.0"
7
+ description = "Morphology-Aware Sindhi NLP Toolkit"
8
+ readme = "README.md"
9
+ authors = [{ name = "Aakash Meghwar" }]
10
+ dependencies = ["transformers", "tokenizers", "regex", "torch"]
11
+ [tool.setuptools]
12
+ packages = ["sindhinltk"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,21 @@
1
+ from .normalizer import SindhiNormalizer
2
+ from .tokenizer import SindhiTokenizer
3
+ from .stemmer import SindhiStemmer
4
+ from .stopwords import SindhiStopwords
5
+ from .sentiment import SindhiSentiment
6
+ import unicodedata
7
+
8
class SindhiNLP:
    """End-to-end text pipeline chaining the toolkit's components.

    The stages run in a fixed order: normalize -> tokenize -> remove
    stopwords -> stem -> sentiment analysis.
    """

    # NOTE(review): in this release listing, sentiment.py and tokenizer.py show
    # up only as "File without changes"; confirm that they really define
    # SindhiSentiment and SindhiTokenizer, otherwise importing the package
    # fails before this class can be used.

    def __init__(self):
        """Create one instance of every pipeline component."""
        self.norm = SindhiNormalizer()
        self.tok = SindhiTokenizer()
        self.stemmer = SindhiStemmer()
        self.sw = SindhiStopwords()
        self.sent = SindhiSentiment()

    def process(self, text):
        """Run *text* through the whole pipeline.

        Returns a dict with three keys:

        * ``"tokens"``    -- every token produced by the tokenizer, stopwords
          included;
        * ``"stems"``     -- stems of the tokens left after stopword removal;
        * ``"sentiment"`` -- whatever ``self.sent.analyze`` returns for the
          stems (its type is defined by the sentiment component).
        """
        normalized = self.norm.normalize(text)
        all_tokens = self.tok.tokenize(normalized)
        content_tokens = self.sw.remove_stopwords(all_tokens)

        stemmed = []
        for token in content_tokens:
            stemmed.append(self.stemmer.stem(token))

        return {
            "tokens": all_tokens,
            "stems": stemmed,
            "sentiment": self.sent.analyze(stemmed),
        }
@@ -0,0 +1,11 @@
1
+ import unicodedata
2
+ import re
3
+
4
class SindhiNormalizer:
    """Clean raw text before it is tokenized.

    Normalization consists of three steps, applied in this order:

    1. Unicode NFC composition.
    2. Removal of every character listed in ``ZERO_WIDTH``.
    3. Collapsing each run of spaces/tabs into a single space and trimming
       leading/trailing whitespace.
    """

    # Compiled once for all instances; matches runs of spaces and tabs only,
    # so newlines inside the text are left untouched.
    _SPACE_RUN = re.compile(r'[ \t]+')

    def __init__(self):
        # Characters deleted outright: ZWNJ, ZWJ, zero-width space,
        # BOM / zero-width no-break space, and soft hyphen.
        # Each entry is expected to be a single code point.
        self.ZERO_WIDTH = ['\u200C', '\u200D', '\u200B', '\uFEFF', '\u00AD']

    def normalize(self, text: str) -> str:
        """Return the normalized form of *text*.

        Args:
            text: The string to clean. An empty string yields an empty string.

        Returns:
            The NFC-composed text with the ``ZERO_WIDTH`` characters removed,
            space/tab runs collapsed to one space, and outer whitespace
            stripped.
        """
        text = unicodedata.normalize('NFC', text)
        # Delete all listed characters in a single pass instead of one full
        # str.replace() scan per character. The table is built per call so
        # that changes made to self.ZERO_WIDTH after construction still apply.
        deletions = str.maketrans('', '', ''.join(self.ZERO_WIDTH))
        text = text.translate(deletions)
        return self._SPACE_RUN.sub(' ', text).strip()
File without changes
@@ -0,0 +1,7 @@
1
class SindhiStemmer:
    """Rule-based stemmer that strips at most one known suffix from a word."""

    def __init__(self):
        # Checked in list order: the first suffix that matches wins, so
        # entries earlier in the list take precedence over later ones.
        self.suffixes = ['ائيندڙ', 'يندڙ', 'يائين', 'ائين', 'يون', 'ان', 'ون', 'ين', 'ي', 'و']

    def stem(self, word: str) -> str:
        """Return *word* with its first applicable suffix removed.

        A suffix is applicable when the word ends with it and at least two
        characters would remain after removing it. If no suffix qualifies,
        the word is returned unchanged.
        """
        # Lazy generator: evaluation stops at the first qualifying suffix.
        candidates = (
            word[:-len(suffix)]
            for suffix in self.suffixes
            if word.endswith(suffix) and len(word) - len(suffix) >= 2
        )
        return next(candidates, word)
@@ -0,0 +1,5 @@
1
class SindhiStopwords:
    """Holds a fixed set of stopwords and filters them out of token lists."""

    def __init__(self):
        # A set, so each membership test below is a constant-time lookup.
        self.stopwords = {"۽", "يا", "پر", "ڪرڻ", "ته", "کي", "جو", "جي", "جا", "تي", "تان", "کان", "۾", "سان", "آهي", "آهن"}

    def remove_stopwords(self, tokens):
        """Return a new list of the items in *tokens* that are not stopwords.

        The relative order of the remaining tokens is preserved and the
        input is not modified.
        """
        kept = []
        for token in tokens:
            if token in self.stopwords:
                continue
            kept.append(token)
        return kept
File without changes
@@ -0,0 +1,12 @@
1
+ Metadata-Version: 2.4
2
+ Name: sindhinltk
3
+ Version: 0.1.0
4
+ Summary: Morphology-Aware Sindhi NLP Toolkit
5
+ Author: Aakash Meghwar
6
+ Description-Content-Type: text/markdown
7
+ Requires-Dist: transformers
8
+ Requires-Dist: tokenizers
9
+ Requires-Dist: regex
10
+ Requires-Dist: torch
11
+
12
+ # SindhiNLTK
@@ -0,0 +1,13 @@
1
+ README.md
2
+ pyproject.toml
3
+ sindhinltk/__init__.py
4
+ sindhinltk/normalizer.py
5
+ sindhinltk/sentiment.py
6
+ sindhinltk/stemmer.py
7
+ sindhinltk/stopwords.py
8
+ sindhinltk/tokenizer.py
9
+ sindhinltk.egg-info/PKG-INFO
10
+ sindhinltk.egg-info/SOURCES.txt
11
+ sindhinltk.egg-info/dependency_links.txt
12
+ sindhinltk.egg-info/requires.txt
13
+ sindhinltk.egg-info/top_level.txt
@@ -0,0 +1,4 @@
1
+ transformers
2
+ tokenizers
3
+ regex
4
+ torch
@@ -0,0 +1 @@
1
+ sindhinltk