PyPI - sindhinltk - Versions diffs - 0.1.0__tar.gz - Mend

sindhinltk 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

sindhinltk-0.1.0/PKG-INFO +12 -0
sindhinltk-0.1.0/README.md +1 -0
sindhinltk-0.1.0/pyproject.toml +12 -0
sindhinltk-0.1.0/setup.cfg +4 -0
sindhinltk-0.1.0/sindhinltk/__init__.py +21 -0
sindhinltk-0.1.0/sindhinltk/normalizer.py +11 -0
sindhinltk-0.1.0/sindhinltk/sentiment.py +0 -0
sindhinltk-0.1.0/sindhinltk/stemmer.py +7 -0
sindhinltk-0.1.0/sindhinltk/stopwords.py +5 -0
sindhinltk-0.1.0/sindhinltk/tokenizer.py +0 -0
sindhinltk-0.1.0/sindhinltk.egg-info/PKG-INFO +12 -0
sindhinltk-0.1.0/sindhinltk.egg-info/SOURCES.txt +13 -0
sindhinltk-0.1.0/sindhinltk.egg-info/dependency_links.txt +1 -0
sindhinltk-0.1.0/sindhinltk.egg-info/requires.txt +4 -0
sindhinltk-0.1.0/sindhinltk.egg-info/top_level.txt +1 -0

sindhinltk-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,12 @@
+Metadata-Version: 2.4
+Name: sindhinltk
+Version: 0.1.0
+Summary: Morphology-Aware Sindhi NLP Toolkit
+Author: Aakash Meghwar
+Description-Content-Type: text/markdown
+Requires-Dist: transformers
+Requires-Dist: tokenizers
+Requires-Dist: regex
+Requires-Dist: torch
+
+# SindhiNLTK

sindhinltk-0.1.0/README.md ADDED Viewed

@@ -0,0 +1 @@
+# SindhiNLTK

sindhinltk-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,12 @@
+[build-system]
+requires = ["setuptools>=61.0", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "sindhinltk"
+version = "0.1.0"
+description = "Morphology-Aware Sindhi NLP Toolkit"
+readme = "README.md"
+authors = [{ name = "Aakash Meghwar" }]
+dependencies = ["transformers", "tokenizers", "regex", "torch"]
+[tool.setuptools]
+packages = ["sindhinltk"]

sindhinltk-0.1.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

sindhinltk-0.1.0/sindhinltk/__init__.py ADDED Viewed

@@ -0,0 +1,21 @@
+from .normalizer import SindhiNormalizer
+from .tokenizer import SindhiTokenizer
+from .stemmer import SindhiStemmer
+from .stopwords import SindhiStopwords
+from .sentiment import SindhiSentiment
+import unicodedata
+class SindhiNLP:
+    """Facade that chains the toolkit's components into one pipeline.
+
+    The stages run in this order: normalize -> tokenize -> remove
+    stopwords -> stem -> sentiment analysis.
+
+    NOTE(review): this release's file listing shows ``tokenizer.py`` and
+    ``sentiment.py`` as empty files, so the package-level imports of
+    ``SindhiTokenizer`` and ``SindhiSentiment`` presumably fail before this
+    class can be used -- confirm against the published sdist.
+    """
+
+    def __init__(self):
+        """Create one instance of every pipeline component."""
+        self.norm = SindhiNormalizer()
+        self.tok = SindhiTokenizer()
+        self.stemmer = SindhiStemmer()
+        self.sw = SindhiStopwords()
+        self.sent = SindhiSentiment()
+
+    def process(self, text):
+        """Run *text* through the full pipeline.
+
+        Returns:
+            A dict with three keys: ``"tokens"`` (the tokenizer output,
+            before stopword removal), ``"stems"`` (one stem per token that
+            survived stopword removal) and ``"sentiment"`` (whatever the
+            sentiment component returns for the stems).
+        """
+        normalized = self.norm.normalize(text)
+        tokens = self.tok.tokenize(normalized)
+        stems = []
+        for token in self.sw.remove_stopwords(tokens):
+            stems.append(self.stemmer.stem(token))
+        result = {"tokens": tokens, "stems": stems}
+        result["sentiment"] = self.sent.analyze(stems)
+        return result

sindhinltk-0.1.0/sindhinltk/normalizer.py ADDED Viewed

@@ -0,0 +1,11 @@
+import unicodedata
+import re
+class SindhiNormalizer:
+    """Unicode clean-up for Sindhi text.
+
+    ``normalize`` deletes the invisible characters listed in
+    ``ZERO_WIDTH``, applies NFC composition, collapses runs of spaces and
+    tabs into a single space, and trims leading/trailing whitespace.
+    """
+
+    def __init__(self):
+        # Invisible code points removed from the text: ZWNJ, ZWJ, zero-width
+        # space, BOM / zero-width no-break space, and soft hyphen.
+        self.ZERO_WIDTH = ['\u200C', '\u200D', '\u200B', '\uFEFF', '\u00AD']
+
+    def normalize(self, text: str) -> str:
+        """Return *text* in NFC form with invisible characters removed.
+
+        Args:
+            text: the string to clean.
+
+        Returns:
+            The cleaned string. Interior newlines are kept; only runs of
+            spaces and tabs are collapsed.
+        """
+        # Delete the invisible characters *before* composing. If NFC ran
+        # first, a base letter and a combining mark separated by one of
+        # these characters would be left uncomposed, so the result would
+        # not be NFC and a second normalize() call would change it again.
+        for zw in self.ZERO_WIDTH:
+            text = text.replace(zw, '')
+        text = unicodedata.normalize('NFC', text)
+        return re.sub(r'[ \t]+', ' ', text).strip()

sindhinltk-0.1.0/sindhinltk/sentiment.py ADDED Viewed

File without changes

sindhinltk-0.1.0/sindhinltk/stemmer.py ADDED Viewed

@@ -0,0 +1,7 @@
+class SindhiStemmer:
+    """Single-pass suffix-stripping stemmer.
+
+    At most one suffix is removed per call: the first entry of
+    ``suffixes`` that the word ends with and whose removal still leaves at
+    least ``min_stem_length`` characters.
+    """
+
+    def __init__(self, suffixes=None, min_stem_length=2):
+        """Configure the stemmer.
+
+        Args:
+            suffixes: optional iterable of suffix strings, tried in the
+                given order (first match wins, so list longer suffixes
+                before the shorter ones they end with). Defaults to the
+                built-in Sindhi suffix list.
+            min_stem_length: minimum number of characters that must remain
+                after a suffix is removed. Defaults to 2.
+        """
+        if suffixes is None:
+            suffixes = ['ائيندڙ', 'يندڙ', 'يائين', 'ائين', 'يون', 'ان', 'ون', 'ين', 'ي', 'و']
+        self.suffixes = list(suffixes)
+        self.min_stem_length = min_stem_length
+
+    def stem(self, word: str) -> str:
+        """Return *word* with one matching suffix removed, if any.
+
+        The word is returned unchanged when no suffix matches or when
+        every matching suffix would leave too short a stem.
+        """
+        for suffix in self.suffixes:
+            # An empty suffix "matches" every word and word[:-0] is '',
+            # so it must never be stripped.
+            if not suffix or not word.endswith(suffix):
+                continue
+            if len(word) - len(suffix) >= self.min_stem_length:
+                return word[:-len(suffix)]
+        return word

sindhinltk-0.1.0/sindhinltk/stopwords.py ADDED Viewed

@@ -0,0 +1,5 @@
+class SindhiStopwords:
+    """Filter for a fixed set of common Sindhi function words."""
+
+    def __init__(self):
+        # Kept as a set so each membership test is a hash lookup.
+        self.stopwords = {
+            "۽", "يا", "پر", "ڪرڻ", "ته", "کي", "جو", "جي",
+            "جا", "تي", "تان", "کان", "۾", "سان", "آهي", "آهن",
+        }
+
+    def remove_stopwords(self, tokens):
+        """Return a new list of the tokens that are not stopwords.
+
+        The order of the remaining tokens is preserved and the input is
+        not modified.
+        """
+        kept = []
+        for token in tokens:
+            if token in self.stopwords:
+                continue
+            kept.append(token)
+        return kept

sindhinltk-0.1.0/sindhinltk/tokenizer.py ADDED Viewed

File without changes

sindhinltk-0.1.0/sindhinltk.egg-info/PKG-INFO ADDED Viewed

@@ -0,0 +1,12 @@
+Metadata-Version: 2.4
+Name: sindhinltk
+Version: 0.1.0
+Summary: Morphology-Aware Sindhi NLP Toolkit
+Author: Aakash Meghwar
+Description-Content-Type: text/markdown
+Requires-Dist: transformers
+Requires-Dist: tokenizers
+Requires-Dist: regex
+Requires-Dist: torch
+
+# SindhiNLTK

sindhinltk-0.1.0/sindhinltk.egg-info/SOURCES.txt ADDED Viewed

@@ -0,0 +1,13 @@
+README.md
+pyproject.toml
+sindhinltk/__init__.py
+sindhinltk/normalizer.py
+sindhinltk/sentiment.py
+sindhinltk/stemmer.py
+sindhinltk/stopwords.py
+sindhinltk/tokenizer.py
+sindhinltk.egg-info/PKG-INFO
+sindhinltk.egg-info/SOURCES.txt
+sindhinltk.egg-info/dependency_links.txt
+sindhinltk.egg-info/requires.txt
+sindhinltk.egg-info/top_level.txt

sindhinltk-0.1.0/sindhinltk.egg-info/dependency_links.txt ADDED Viewed

@@ -0,0 +1 @@
+

sindhinltk-0.1.0/sindhinltk.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,4 @@
+transformers
+tokenizers
+regex
+torch

sindhinltk-0.1.0/sindhinltk.egg-info/top_level.txt ADDED Viewed

@@ -0,0 +1 @@
+sindhinltk