tokenizebot 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,29 @@
1
class TokenizeBot:
    """Rule-based tokenizer for English text.

    Splits text on whitespace, isolates punctuation marks as their own
    tokens, separates common clitics (``n't``, ``'ve``, ``'s`` ...) from the
    word they attach to, and rewrites double quotes as the two-character
    tokens `` (opening) and '' (closing / straight).
    """

    # Two-character quote tokens that must come out of tokenization intact.
    _QUOTE_TOKENS = ("``", "''")

    def __init__(self):
        # Clitic suffixes split off the end of a word; checked in order,
        # the first match wins.
        self.suffixes = ["n't", "'ve", "'re", "'ll", "'s", "'d", "'m"]
        # Characters that always become stand-alone tokens.
        self.standard_punct = '][.,;"?():!_`'

    def tokenize(self, text, lowercase=False) -> list:
        """Split *text* into a list of string tokens.

        Args:
            text: The text to tokenize. Any falsy value (``""``, ``None``)
                yields an empty list.
            lowercase: When true, the text is lowercased before tokenizing.

        Returns:
            The tokens in order of appearance.
        """
        if not text:
            return []
        if lowercase:
            text = text.lower()

        tokens = []
        padded = self._pad_punctuation(self._normalize_quotes(text))
        for word in padded.split():
            tokens.extend(self._split_word(word))
        return tokens

    @staticmethod
    def _normalize_quotes(text):
        """Rewrite double quotes and space-delimited backticks as `` / ''."""
        return (
            text.replace('"', " '' ")
            .replace('“', " `` ")
            .replace('”', " '' ")
            .replace(" ` ", " `` ")
        )

    def _pad_punctuation(self, text):
        """Surround punctuation with spaces so ``str.split`` isolates it.

        The pairs `` and '' are padded as a unit; padding them character by
        character would break the quote tokens back into single marks.
        """
        pieces = []
        pos = 0
        end = len(text)
        while pos < end:
            pair = text[pos:pos + 2]
            if pair in self._QUOTE_TOKENS:
                pieces.append(f" {pair} ")
                pos += 2
                continue
            char = text[pos]
            pieces.append(f" {char} " if char in self.standard_punct else char)
            pos += 1
        return "".join(pieces)

    def _split_word(self, word):
        """Return the token(s) that one whitespace-delimited word yields."""
        # Quote tokens are final; without this guard '' would be treated as
        # a word with a trailing apostrophe and split in two.
        if word in self._QUOTE_TOKENS:
            return [word]

        lowered = word.lower()
        # "can't" is irregular: the stem is "ca", not "can".
        if lowered == "can't":
            return [word[:2], word[2:]]

        for suffix in self.suffixes:
            # The length check keeps a bare clitic such as "'s" in one piece.
            if lowered.endswith(suffix) and len(word) > len(suffix):
                cut = len(word) - len(suffix)
                return [word[:cut], word[cut:]]

        # Trailing apostrophe, e.g. the plural possessive in "dogs'".
        if word.endswith("'") and len(word) > 1:
            return [word[:-1], "'"]
        return [word]
@@ -0,0 +1,30 @@
1
+ Metadata-Version: 2.4
2
+ Name: tokenizebot
3
+ Version: 0.1.0
4
+ Summary: A simple rule-based tokenizer for English text
5
+ Classifier: Programming Language :: Python :: 3
6
+ Classifier: License :: OSI Approved :: MIT License
7
+ Classifier: Operating System :: OS Independent
8
+ Requires-Python: >=3.7
9
+ Description-Content-Type: text/markdown
10
+
11
+ # TokenizeBot
12
+
13
+ ![PyPI version](https://img.shields.io/pypi/v/tokenizebot)
14
+ ![License](https://img.shields.io/pypi/l/tokenizebot)
15
+
16
+ A lightweight, rule-based tokenizer for handling English clitics and punctuation.
17
+
18
+ ## Installation
19
+ ```bash
20
+ pip install tokenizebot
21
+ ```
22
+
23
+ ## Usage
24
+ ```python
25
+ from tokenizebot import TokenizeBot
26
+ bot = TokenizeBot()
27
+
28
+ tokens = bot.tokenize("They've been busy.", lowercase=True)
29
+ print(tokens)  # ['they', "'ve", 'been', 'busy', '.']
30
+ ```
@@ -0,0 +1,5 @@
1
+ tokenizebot/__init__.py,sha256=zUvhufe9xmvN1KdzIRyNC970DvLdmkT9fmSl6XQxLg4,1285
2
+ tokenizebot-0.1.0.dist-info/METADATA,sha256=kEd4TQ5KJZMPtwNWiw4xvcV8V4ILyOE-BZ4fRmIhaAg,741
3
+ tokenizebot-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
4
+ tokenizebot-0.1.0.dist-info/top_level.txt,sha256=EBVd7toUraOljeTNPa0mDMuOSJIgDe_FlsBnGs6RvcE,12
5
+ tokenizebot-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ tokenizebot