tokenizebot 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,30 @@
1
+ Metadata-Version: 2.4
2
+ Name: tokenizebot
3
+ Version: 0.1.0
4
+ Summary: A simple rule-based tokenizer for English text
5
+ Classifier: Programming Language :: Python :: 3
6
+ Classifier: License :: OSI Approved :: MIT License
7
+ Classifier: Operating System :: OS Independent
8
+ Requires-Python: >=3.7
9
+ Description-Content-Type: text/markdown
10
+
11
+ # TokenizeBot
12
+
13
+ ![PyPI version](https://img.shields.io/pypi/v/tokenizebot)
14
+ ![License](https://img.shields.io/pypi/l/tokenizebot)
15
+
16
+ A lightweight, rule-based tokenizer for handling English clitics and punctuation.
17
+
18
+ ## Installation
19
+ ```bash
20
+ pip install tokenizebot
21
+ ```
22
+
23
+ ## Usage
24
+ ```python
25
+ from tokenizebot import TokenizeBot
26
+ bot = TokenizeBot()
27
+
28
+ tokens = bot.tokenize("They've been busy.", lowercase=True)
29
+ print(tokens)
30
+ ```
@@ -0,0 +1,20 @@
1
+ # TokenizeBot
2
+
3
+ ![PyPI version](https://img.shields.io/pypi/v/tokenizebot)
4
+ ![License](https://img.shields.io/pypi/l/tokenizebot)
5
+
6
+ A lightweight, rule-based tokenizer for handling English clitics and punctuation.
7
+
8
+ ## Installation
9
+ ```bash
10
+ pip install tokenizebot
11
+ ```
12
+
13
+ ## Usage
14
+ ```python
15
+ from tokenizebot import TokenizeBot
16
+ bot = TokenizeBot()
17
+
18
+ tokens = bot.tokenize("They've been busy.", lowercase=True)
19
+ print(tokens)
20
+ ```
@@ -0,0 +1,18 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "tokenizebot"
7
+ version = "0.1.0"
8
+ description = "A simple rule-based tokenizer for English text"
9
+ readme = "README.md"
10
+ requires-python = ">=3.7"
11
+ classifiers = [
12
+ "Programming Language :: Python :: 3",
13
+ "License :: OSI Approved :: MIT License",
14
+ "Operating System :: OS Independent",
15
+ ]
16
+
17
+ [tool.setuptools]
18
+ packages = ["tokenizebot"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,29 @@
1
class TokenizeBot:
    """Rule-based English tokenizer.

    Splits clitic suffixes (n't, 've, 're, 'll, 's, 'd, 'm) off their host
    word, separates standard punctuation into standalone tokens, and
    normalizes quotation marks to Treebank-style tokens (`` for opening,
    '' for closing).
    """

    def __init__(self):
        # Clitic suffixes matched case-insensitively at the end of a word.
        self.suffixes = ["n't", "'ve", "'re", "'ll", "'s", "'d", "'m"]
        # Punctuation characters padded with spaces so each becomes its own
        # token.  NOTE: the backtick is intentionally NOT listed here -- the
        # quote tokens `` and ` are padded as whole units in tokenize(), so
        # the opening-quote token `` is no longer split character-by-character.
        self.standard_punct = '][.,;"?():!_'

    def tokenize(self, text, lowercase=False):
        """Tokenize *text* into a list of word and punctuation tokens.

        Parameters
        ----------
        text : str
            Input text.  Falsy input (empty string, None) yields [].
        lowercase : bool
            If True, lowercase the text before tokenizing.

        Returns
        -------
        list[str]
            The token sequence.
        """
        if not text:
            return []
        if lowercase:
            text = text.lower()
        # Normalize quotes to Treebank tokens: `` opens, '' closes.
        text = (text.replace('"', " '' ")
                    .replace('“', " `` ")
                    .replace('”', " '' ")
                    .replace(" ` ", " `` "))
        # Pad punctuation with spaces.  Backtick pairs are kept together so
        # the opening-quote token `` survives as a single token (previously it
        # was split into two separate ` tokens by the per-character pass).
        padded = []
        i = 0
        n = len(text)
        while i < n:
            if text.startswith("``", i):
                padded.append(" `` ")
                i += 2
            elif text[i] == "`":
                padded.append(" ` ")
                i += 1
            elif text[i] in self.standard_punct:
                padded.append(f" {text[i]} ")
                i += 1
            else:
                padded.append(text[i])
                i += 1
        raw_words = "".join(padded).split()
        final_tokens = []
        for word in raw_words:
            low_word = word.lower()
            if low_word == "can't":
                # Treebank special case: "can't" -> "ca" + "n't",
                # preserving the original casing of the stem.
                final_tokens.extend(["ca" if lowercase else word[:2], word[-3:]])
            else:
                split = False
                for s in self.suffixes:
                    # Require the word to be longer than the suffix so a bare
                    # clitic (e.g. "'s" alone) is left intact.
                    if low_word.endswith(s) and len(word) > len(s):
                        final_tokens.extend([word[:-len(s)], word[-len(s):]])
                        split = True
                        break
                if not split:
                    # Trailing apostrophe = plural possessive (dogs' -> dogs '),
                    # but the closing-quote token '' must stay intact
                    # (previously it was wrongly split into two ' tokens).
                    if word.endswith("'") and len(word) > 1 and word != "''":
                        final_tokens.extend([word[:-1], "'"])
                    else:
                        final_tokens.append(word)
        return final_tokens
@@ -0,0 +1,30 @@
1
+ Metadata-Version: 2.4
2
+ Name: tokenizebot
3
+ Version: 0.1.0
4
+ Summary: A simple rule-based tokenizer for English text
5
+ Classifier: Programming Language :: Python :: 3
6
+ Classifier: License :: OSI Approved :: MIT License
7
+ Classifier: Operating System :: OS Independent
8
+ Requires-Python: >=3.7
9
+ Description-Content-Type: text/markdown
10
+
11
+ # TokenizeBot
12
+
13
+ ![PyPI version](https://img.shields.io/pypi/v/tokenizebot)
14
+ ![License](https://img.shields.io/pypi/l/tokenizebot)
15
+
16
+ A lightweight, rule-based tokenizer for handling English clitics and punctuation.
17
+
18
+ ## Installation
19
+ ```bash
20
+ pip install tokenizebot
21
+ ```
22
+
23
+ ## Usage
24
+ ```python
25
+ from tokenizebot import TokenizeBot
26
+ bot = TokenizeBot()
27
+
28
+ tokens = bot.tokenize("They've been busy.", lowercase=True)
29
+ print(tokens)
30
+ ```
@@ -0,0 +1,7 @@
1
+ README.md
2
+ pyproject.toml
3
+ tokenizebot/__init__.py
4
+ tokenizebot.egg-info/PKG-INFO
5
+ tokenizebot.egg-info/SOURCES.txt
6
+ tokenizebot.egg-info/dependency_links.txt
7
+ tokenizebot.egg-info/top_level.txt
@@ -0,0 +1 @@
1
+ tokenizebot