tokenizebot 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tokenizebot/__init__.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
class TokenizeBot:
    """A simple rule-based tokenizer for English text.

    Separates standalone punctuation into its own tokens, normalizes
    quotation marks to the Penn Treebank convention (`` for opening,
    '' for closing), and splits common English clitic suffixes
    (n't, 've, 're, 'll, 's, 'd, 'm) off the ends of words.
    """

    def __init__(self):
        # Clitic suffixes to split off the end of a word.
        self.suffixes = ["n't", "'ve", "'re", "'ll", "'s", "'d", "'m"]
        # Single characters always separated into their own token.
        # NOTE: apostrophe is intentionally absent (clitics handle it);
        # backtick stays here so a lone ` in running text is isolated,
        # then promoted to the PTB `` token below.
        self.standard_punct = '][.,;"?():!_`'

    def tokenize(self, text, lowercase=False):
        """Tokenize ``text`` into a list of word/punctuation tokens.

        Args:
            text: Input string; falsy input yields an empty list.
            lowercase: If True, lowercase the text before tokenizing.

        Returns:
            List of token strings.
        """
        if not text:
            return []
        if lowercase:
            text = text.lower()

        # Pad every standalone punctuation character with spaces so
        # str.split() yields it as its own token.
        buffered = "".join(f" {c} " if c in self.standard_punct else c for c in text)

        # Normalize quotes to PTB style AFTER the per-character pass.
        # (The previous implementation normalized first, so the
        # two-character `` token was re-split into two lone backticks
        # because '`' is in standard_punct.)
        buffered = (
            buffered.replace('"', " '' ")
            .replace('“', " `` ")
            .replace('”', " '' ")
            .replace(" ` ", " `` ")
        )

        raw_words = buffered.split()
        final_tokens = []
        for word in raw_words:
            low_word = word.lower()
            if low_word == "can't":
                # PTB splits "can't" as "ca" + "n't"; keep the stem's
                # original casing unless lowercasing was requested.
                final_tokens.extend(["ca" if lowercase else word[:2], word[-3:]])
            else:
                split = False
                for s in self.suffixes:
                    # Only split when a non-empty stem remains.
                    if low_word.endswith(s) and len(word) > len(s):
                        final_tokens.extend([word[:-len(s)], word[-len(s):]])
                        split = True
                        break
                if not split:
                    # Trailing apostrophe (plural possessive): "dogs'" ->
                    # "dogs", "'". The PTB closing-quote token '' must be
                    # exempt — the old code split it into two apostrophes.
                    if word.endswith("'") and len(word) > 1 and word != "''":
                        final_tokens.extend([word[:-1], "'"])
                    else:
                        final_tokens.append(word)
        return final_tokens
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tokenizebot
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A simple rule-based tokenizer for English text
|
|
5
|
+
Classifier: Programming Language :: Python :: 3
|
|
6
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
7
|
+
Classifier: Operating System :: OS Independent
|
|
8
|
+
Requires-Python: >=3.7
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
|
|
11
|
+
# TokenizeBot
|
|
12
|
+
|
|
13
|
+

|
|
14
|
+

|
|
15
|
+
|
|
16
|
+
A lightweight, rule-based tokenizer for handling English clitics and punctuation.
|
|
17
|
+
|
|
18
|
+
## Installation
|
|
19
|
+
```bash
|
|
20
|
+
pip install tokenizebot
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Usage
|
|
24
|
+
```python
|
|
25
|
+
from tokenizebot import TokenizeBot
|
|
26
|
+
bot = TokenizeBot()
|
|
27
|
+
|
|
28
|
+
tokens = bot.tokenize("They've been busy.", lowercase=True)
|
|
29
|
+
print(tokens)
|
|
30
|
+
```
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
tokenizebot/__init__.py,sha256=zUvhufe9xmvN1KdzIRyNC970DvLdmkT9fmSl6XQxLg4,1285
|
|
2
|
+
tokenizebot-0.1.0.dist-info/METADATA,sha256=kEd4TQ5KJZMPtwNWiw4xvcV8V4ILyOE-BZ4fRmIhaAg,741
|
|
3
|
+
tokenizebot-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
4
|
+
tokenizebot-0.1.0.dist-info/top_level.txt,sha256=EBVd7toUraOljeTNPa0mDMuOSJIgDe_FlsBnGs6RvcE,12
|
|
5
|
+
tokenizebot-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
tokenizebot
|