tokenizebot 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tokenizebot/__init__.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
class TokenizeBot:
    """A simple rule-based tokenizer for English text.

    Separates standalone punctuation into its own tokens, normalizes
    quotation marks to the Penn Treebank convention (`` for opening,
    '' for closing), and splits common English clitic suffixes
    (n't, 've, 're, 'll, 's, 'd, 'm) off the ends of words.
    """

    def __init__(self):
        # Clitic suffixes to split off the end of a word.
        self.suffixes = ["n't", "'ve", "'re", "'ll", "'s", "'d", "'m"]
        # Single characters always separated into their own token.
        # NOTE: apostrophe is intentionally absent (clitics handle it);
        # backtick stays here so a lone ` in running text is isolated,
        # then promoted to the PTB `` token below.
        self.standard_punct = '][.,;"?():!_`'

    def tokenize(self, text, lowercase=False):
        """Tokenize ``text`` into a list of word/punctuation tokens.

        Args:
            text: Input string; falsy input yields an empty list.
            lowercase: If True, lowercase the text before tokenizing.

        Returns:
            List of token strings.
        """
        if not text:
            return []
        if lowercase:
            text = text.lower()

        # Pad every standalone punctuation character with spaces so
        # str.split() yields it as its own token.
        buffered = "".join(f" {c} " if c in self.standard_punct else c for c in text)

        # Normalize quotes to PTB style AFTER the per-character pass.
        # (The previous implementation normalized first, so the
        # two-character `` token was re-split into two lone backticks
        # because '`' is in standard_punct.)
        buffered = (
            buffered.replace('"', " '' ")
            .replace('“', " `` ")
            .replace('”', " '' ")
            .replace(" ` ", " `` ")
        )

        raw_words = buffered.split()
        final_tokens = []
        for word in raw_words:
            low_word = word.lower()
            if low_word == "can't":
                # PTB splits "can't" as "ca" + "n't"; keep the stem's
                # original casing unless lowercasing was requested.
                final_tokens.extend(["ca" if lowercase else word[:2], word[-3:]])
            else:
                split = False
                for s in self.suffixes:
                    # Only split when a non-empty stem remains.
                    if low_word.endswith(s) and len(word) > len(s):
                        final_tokens.extend([word[:-len(s)], word[-len(s):]])
                        split = True
                        break
                if not split:
                    # Trailing apostrophe (plural possessive): "dogs'" ->
                    # "dogs", "'". The PTB closing-quote token '' must be
                    # exempt — the old code split it into two apostrophes.
                    if word.endswith("'") and len(word) > 1 and word != "''":
                        final_tokens.extend([word[:-1], "'"])
                    else:
                        final_tokens.append(word)
        return final_tokens
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tokenizebot
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A simple rule-based tokenizer for English text
|
|
5
|
+
Classifier: Programming Language :: Python :: 3
|
|
6
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
7
|
+
Classifier: Operating System :: OS Independent
|
|
8
|
+
Requires-Python: >=3.7
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
|
|
11
|
+
# TokenizeBot
|
|
12
|
+
|
|
13
|
+

|
|
14
|
+

|
|
15
|
+
|
|
16
|
+
A lightweight, rule-based tokenizer for handling English clitics and punctuation.
|
|
17
|
+
|
|
18
|
+
## Installation
|
|
19
|
+
```bash
|
|
20
|
+
pip install tokenizebot
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Usage
|
|
24
|
+
```python
|
|
25
|
+
from tokenizebot import TokenizeBot
|
|
26
|
+
bot = TokenizeBot()
|
|
27
|
+
|
|
28
|
+
tokens = bot.tokenize("They've been busy.", lowercase=True)
|
|
29
|
+
print(tokens)
|
|
30
|
+
```
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
tokenizebot/__init__.py,sha256=zUvhufe9xmvN1KdzIRyNC970DvLdmkT9fmSl6XQxLg4,1285
|
|
2
|
+
tokenizebot-0.1.0.dist-info/METADATA,sha256=kEd4TQ5KJZMPtwNWiw4xvcV8V4ILyOE-BZ4fRmIhaAg,741
|
|
3
|
+
tokenizebot-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
4
|
+
tokenizebot-0.1.0.dist-info/top_level.txt,sha256=EBVd7toUraOljeTNPa0mDMuOSJIgDe_FlsBnGs6RvcE,12
|
|
5
|
+
tokenizebot-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
tokenizebot
|