skl-mindforge 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- skl_mindforge-0.1.0/PKG-INFO +5 -0
- skl_mindforge-0.1.0/setup.cfg +4 -0
- skl_mindforge-0.1.0/setup.py +11 -0
- skl_mindforge-0.1.0/skl_mindforge/__init__.py +32 -0
- skl_mindforge-0.1.0/skl_mindforge/private_vocab_40k.json +199220 -0
- skl_mindforge-0.1.0/skl_mindforge.egg-info/PKG-INFO +5 -0
- skl_mindforge-0.1.0/skl_mindforge.egg-info/SOURCES.txt +8 -0
- skl_mindforge-0.1.0/skl_mindforge.egg-info/dependency_links.txt +1 -0
- skl_mindforge-0.1.0/skl_mindforge.egg-info/requires.txt +1 -0
- skl_mindforge-0.1.0/skl_mindforge.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
|
|
2
|
+
import os
|
|
3
|
+
from tokenizers import Tokenizer
|
|
4
|
+
from tokenizers.processors import TemplateProcessing
|
|
5
|
+
|
|
6
|
+
class ZenithTokenizer:
    """Thin wrapper around a pretrained `tokenizers.Tokenizer` model.

    Loads a vocabulary file that ships inside this package and attaches a
    template post-processor that frames sequences with <s> ... </s> markers
    for the chat/assistant format.
    """

    def __init__(self, model_filename="private_vocab_40k.json"):
        """Load the tokenizer model bundled next to this module.

        Args:
            model_filename: name of the serialized tokenizer JSON file,
                looked up inside the package directory.

        Raises:
            FileNotFoundError: if the vocabulary file is not present.
        """
        # Resolve the model file relative to the installed package directory,
        # not the caller's working directory.
        package_dir = os.path.dirname(__file__)
        model_path = os.path.join(package_dir, model_filename)

        if not os.path.exists(model_path):
            raise FileNotFoundError(f"Missing {model_filename} at {model_path}")

        self.tokenizer = Tokenizer.from_file(model_path)

        # Wrap each sequence in <s> ... </s>; token ids 1 and 2 are assumed
        # to be the <s>/</s> specials in the packaged vocabulary.
        self.tokenizer.post_processor = TemplateProcessing(
            single="<s> $A </s>",
            pair="<s> $A </s> <s> $B </s>",
            special_tokens=[("<s>", 1), ("</s>", 2)],
        )

        self.vocab_size = self.tokenizer.get_vocab_size()

    def encode(self, text):
        """Return the list of token ids produced for *text*."""
        encoding = self.tokenizer.encode(text)
        return encoding.ids

    def decode(self, ids, skip_special_tokens=True):
        """Return the text reconstructed from the id sequence *ids*."""
        return self.tokenizer.decode(ids, skip_special_tokens=skip_special_tokens)
|
|
31
|
+
|
|
32
|
+
# Module-level singleton so callers can do `zenith_tokenizer.encode(...)`.
# The original bound the class object itself (no parentheses), which left
# `.encode`/`.decode` as unbound methods and made the singleton unusable.
zenith_tokenizer = ZenithTokenizer()
|