skl-mindforge 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
1
+ Metadata-Version: 2.4
2
+ Name: skl_mindforge
3
+ Version: 0.1.0
4
+ Requires-Dist: tokenizers
5
+ Dynamic: requires-dist
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,11 @@
1
+
2
"""Packaging script for skl_mindforge.

Ships the package sources plus the bundled JSON vocabulary file that
ZenithTokenizer loads at runtime, and declares `tokenizers` as the sole
runtime dependency.
"""

from setuptools import find_packages, setup

setup(
    name="skl_mindforge",
    version="0.1.0",
    # Auto-discover all sub-packages instead of listing them by hand.
    packages=find_packages(),
    include_package_data=True,
    # The tokenizer vocab (*.json) must travel inside the wheel/sdist.
    package_data={'skl_mindforge': ['*.json']},
    install_requires=['tokenizers'],
)
@@ -0,0 +1,32 @@
1
+
2
+ import os
3
+ from tokenizers import Tokenizer
4
+ from tokenizers.processors import TemplateProcessing
5
+
6
class ZenithTokenizer:
    """Thin wrapper around a pretrained HuggingFace `tokenizers.Tokenizer`.

    Loads the JSON tokenizer model bundled alongside this module and attaches
    a chat/assistant post-processing template that wraps each sequence in
    ``<s> ... </s>`` markers.
    """

    def __init__(self, model_filename="private_vocab_40k.json"):
        # The vocab JSON is shipped inside the package, next to this file.
        package_dir = os.path.dirname(__file__)
        model_path = os.path.join(package_dir, model_filename)

        if not os.path.exists(model_path):
            raise FileNotFoundError(f"Missing {model_filename} at {model_path}")

        self.tokenizer = Tokenizer.from_file(model_path)

        # Chat/Assistant format: every single sequence (and each half of a
        # pair) is bracketed by <s> ... </s>.
        # NOTE(review): token ids 1 and 2 are hardcoded — this assumes the
        # loaded vocab maps <s>->1 and </s>->2; confirm against the JSON file.
        self.tokenizer.post_processor = TemplateProcessing(
            single="<s> $A </s>",
            pair="<s> $A </s> <s> $B </s>",
            special_tokens=[("<s>", 1), ("</s>", 2)],
        )

        # Cached for convenience; mirrors Tokenizer.get_vocab_size().
        self.vocab_size = self.tokenizer.get_vocab_size()

    def encode(self, text):
        """Return the list of token ids for ``text``."""
        encoding = self.tokenizer.encode(text)
        return encoding.ids

    def decode(self, ids, skip_special_tokens=True):
        """Reconstruct a string from ``ids``; drops special tokens by default."""
        return self.tokenizer.decode(ids, skip_special_tokens=skip_special_tokens)
31
+
32
# NOTE(review): this binds the CLASS itself, not an instance — callers must
# still call zenith_tokenizer() to construct one. If a module-level singleton
# was intended, this should be ZenithTokenizer(); confirm intent, since
# instantiating here would trigger file I/O at import time.
zenith_tokenizer = ZenithTokenizer