tokenizebot 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,30 @@
1
+ Metadata-Version: 2.4
2
+ Name: tokenizebot
3
+ Version: 0.1.0
4
+ Summary: A simple rule-based tokenizer for English text
5
+ Classifier: Programming Language :: Python :: 3
6
+ Classifier: License :: OSI Approved :: MIT License
7
+ Classifier: Operating System :: OS Independent
8
+ Requires-Python: >=3.7
9
+ Description-Content-Type: text/markdown
10
+
11
+ # TokenizeBot
12
+
13
+ ![PyPI version](https://img.shields.io/pypi/v/tokenizebot)
14
+ ![License](https://img.shields.io/pypi/l/tokenizebot)
15
+
16
+ A lightweight, rule-based tokenizer for handling English clitics and punctuation.
17
+
18
+ ## Installation
19
+ ```bash
20
+ pip install tokenizebot
21
+ ```
22
+
23
+ ## Usage
24
+ ```python
25
+ from tokenizebot import TokenizeBot
26
+ bot = TokenizeBot()
27
+
28
+ tokens = bot.tokenize("They've been busy.", lowercase=True)
29
+ print(tokens)
30
+ ```
@@ -0,0 +1,20 @@
1
+ # TokenizeBot
2
+
3
+ ![PyPI version](https://img.shields.io/pypi/v/tokenizebot)
4
+ ![License](https://img.shields.io/pypi/l/tokenizebot)
5
+
6
+ A lightweight, rule-based tokenizer for handling English clitics and punctuation.
7
+
8
+ ## Installation
9
+ ```bash
10
+ pip install tokenizebot
11
+ ```
12
+
13
+ ## Usage
14
+ ```python
15
+ from tokenizebot import TokenizeBot
16
+ bot = TokenizeBot()
17
+
18
+ tokens = bot.tokenize("They've been busy.", lowercase=True)
19
+ print(tokens)
20
+ ```
@@ -0,0 +1,18 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "tokenizebot"
7
+ version = "0.1.0"
8
+ description = "A simple rule-based tokenizer for English text"
9
+ readme = "README.md"
10
+ requires-python = ">=3.7"
11
+ classifiers = [
12
+ "Programming Language :: Python :: 3",
13
+ "License :: OSI Approved :: MIT License",
14
+ "Operating System :: OS Independent",
15
+ ]
16
+
17
+ [tool.setuptools]
18
+ packages = ["tokenizebot"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,29 @@
1
class TokenizeBot:
    """Rule-based English tokenizer.

    Splits clitic suffixes (n't, 've, 're, 'll, 's, 'd, 'm) off their host
    word, separates standard punctuation into standalone tokens, and
    normalizes quotation marks to Treebank-style tokens (`` for opening,
    '' for closing).
    """

    def __init__(self):
        # Clitic suffixes matched case-insensitively at the end of a word.
        self.suffixes = ["n't", "'ve", "'re", "'ll", "'s", "'d", "'m"]
        # Punctuation characters padded with spaces so each becomes its own
        # token.  NOTE: the backtick is intentionally NOT listed here -- the
        # quote tokens `` and ` are padded as whole units in tokenize(), so
        # the opening-quote token `` is no longer split character-by-character.
        self.standard_punct = '][.,;"?():!_'

    def tokenize(self, text, lowercase=False):
        """Tokenize *text* into a list of word and punctuation tokens.

        Parameters
        ----------
        text : str
            Input text.  Falsy input (empty string, None) yields [].
        lowercase : bool
            If True, lowercase the text before tokenizing.

        Returns
        -------
        list[str]
            The token sequence.
        """
        if not text:
            return []
        if lowercase:
            text = text.lower()
        # Normalize quotes to Treebank tokens: `` opens, '' closes.
        text = (text.replace('"', " '' ")
                    .replace('“', " `` ")
                    .replace('”', " '' ")
                    .replace(" ` ", " `` "))
        # Pad punctuation with spaces.  Backtick pairs are kept together so
        # the opening-quote token `` survives as a single token (previously it
        # was split into two separate ` tokens by the per-character pass).
        padded = []
        i = 0
        n = len(text)
        while i < n:
            if text.startswith("``", i):
                padded.append(" `` ")
                i += 2
            elif text[i] == "`":
                padded.append(" ` ")
                i += 1
            elif text[i] in self.standard_punct:
                padded.append(f" {text[i]} ")
                i += 1
            else:
                padded.append(text[i])
                i += 1
        raw_words = "".join(padded).split()
        final_tokens = []
        for word in raw_words:
            low_word = word.lower()
            if low_word == "can't":
                # Treebank special case: "can't" -> "ca" + "n't",
                # preserving the original casing of the stem.
                final_tokens.extend(["ca" if lowercase else word[:2], word[-3:]])
            else:
                split = False
                for s in self.suffixes:
                    # Require the word to be longer than the suffix so a bare
                    # clitic (e.g. "'s" alone) is left intact.
                    if low_word.endswith(s) and len(word) > len(s):
                        final_tokens.extend([word[:-len(s)], word[-len(s):]])
                        split = True
                        break
                if not split:
                    # Trailing apostrophe = plural possessive (dogs' -> dogs '),
                    # but the closing-quote token '' must stay intact
                    # (previously it was wrongly split into two ' tokens).
                    if word.endswith("'") and len(word) > 1 and word != "''":
                        final_tokens.extend([word[:-1], "'"])
                    else:
                        final_tokens.append(word)
        return final_tokens
@@ -0,0 +1,30 @@
1
+ Metadata-Version: 2.4
2
+ Name: tokenizebot
3
+ Version: 0.1.0
4
+ Summary: A simple rule-based tokenizer for English text
5
+ Classifier: Programming Language :: Python :: 3
6
+ Classifier: License :: OSI Approved :: MIT License
7
+ Classifier: Operating System :: OS Independent
8
+ Requires-Python: >=3.7
9
+ Description-Content-Type: text/markdown
10
+
11
+ # TokenizeBot
12
+
13
+ ![PyPI version](https://img.shields.io/pypi/v/tokenizebot)
14
+ ![License](https://img.shields.io/pypi/l/tokenizebot)
15
+
16
+ A lightweight, rule-based tokenizer for handling English clitics and punctuation.
17
+
18
+ ## Installation
19
+ ```bash
20
+ pip install tokenizebot
21
+ ```
22
+
23
+ ## Usage
24
+ ```python
25
+ from tokenizebot import TokenizeBot
26
+ bot = TokenizeBot()
27
+
28
+ tokens = bot.tokenize("They've been busy.", lowercase=True)
29
+ print(tokens)
30
+ ```
@@ -0,0 +1,7 @@
1
+ README.md
2
+ pyproject.toml
3
+ tokenizebot/__init__.py
4
+ tokenizebot.egg-info/PKG-INFO
5
+ tokenizebot.egg-info/SOURCES.txt
6
+ tokenizebot.egg-info/dependency_links.txt
7
+ tokenizebot.egg-info/top_level.txt
@@ -0,0 +1 @@
1
+ tokenizebot