PyPI - ommlds - Versions diffs - 0.0.0.dev485__py3-none-any.whl → 0.0.0.dev486__py3-none-any.whl - Mend

ommlds 0.0.0.dev485__py3-none-any.whl → 0.0.0.dev486__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

ommlds/__about__.py CHANGED Viewed

@@ -8,11 +8,14 @@ class Project(ProjectBase):
     description = 'ommlds'
     dependencies = [
-        f'omdev == {__version__}',
         f'omlish == {__version__}',
     ]
     optional_dependencies = {
+        'omdev': [
+            f'omdev == {__version__}',
+        ],
         'backends': [
             # 'diffusers ~= 0.36',

ommlds/nanochat/tokenizers.py CHANGED Viewed

@@ -18,7 +18,10 @@ from omlish import lang
 with lang.auto_proxy_import(globals()):
     import tiktoken
-    import tokenizers
+    import tokenizers.decoders
+    import tokenizers.models
+    import tokenizers.pre_tokenizers
+    import tokenizers.trainers
 rustbpe: ta.Any = lang.proxy_import('.rustbpe', __package__)
@@ -27,7 +30,7 @@ rustbpe: ta.Any = lang.proxy_import('.rustbpe', __package__)
 ##
-SPECIAL_TOKENS = [
+SPECIAL_TOKENS: ta.Sequence[str] = [
     # every document begins with the Beginning of Sequence (BOS) token that delimits documents
     '<|bos|>',
     # tokens below are only used during finetuning to render Conversations into token ids
@@ -45,10 +48,18 @@ SPECIAL_TOKENS = [
 # NOTE: this split pattern deviates from GPT-4 in that we use \p{N}{1,2} instead of \p{N}{1,3}
 # I did this because I didn't want to "waste" too many tokens on numbers for smaller vocab sizes.
 # I haven't validated that this is actually a good idea, TODO.
-SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,2}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""  # noqa
+SPLIT_PATTERN = (
+    r"'(?i:[sdmt]|ll|ve|re)|"
+    r"[^\r\n\p{L}\p{N}]?+\p{L}+|"
+    r"\p{N}{1,2}|"
+    r" ?[^\s\p{L}\p{N}]++[\r\n]*|"
+    r"\s*[\r\n]|"
+    r"\s+(?!\S)|"
+    r"\s+"
+)
-# -----------------------------------------------------------------------------
+##
 # Generic GPT-4-style tokenizer based on HuggingFace Tokenizer
@@ -87,22 +98,28 @@ class HuggingFaceTokenizer:
             unk_token=None,
             fuse_unk=False,
         ))
         # Normalizer: None
         tokenizer.normalizer = None
         # Pre-tokenizer: GPT-4 style
         # the regex pattern used by GPT-4 to split text into groups before BPE
         # NOTE: The pattern was changed from \p{N}{1,3} to \p{N}{1,2} because I suspect it is harmful to
         # very small models and smaller vocab sizes, because it is a little bit wasteful in the token space.
         # (but I haven't validated this! TODO)
         gpt4_split_regex = tokenizers.Regex(split_pattern)  # huggingface demands that you wrap it in Regex!!
         tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Sequence([
             tokenizers.pre_tokenizers.Split(pattern=gpt4_split_regex, behavior='isolated', invert=False),
             tokenizers.pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=False),
         ])
         # Decoder: ByteLevel (it pairs together with the ByteLevel pre-tokenizer)
         tokenizer.decoder = tokenizers.decoders.ByteLevel()
         # Post-processor: None
         tokenizer.post_processor = None
         # Trainer: BPE
         trainer = tokenizers.trainers.BpeTrainer(
             vocab_size=vocab_size,
@@ -111,8 +128,10 @@ class HuggingFaceTokenizer:
             initial_alphabet=tokenizers.pre_tokenizers.ByteLevel.alphabet(),
             special_tokens=special_tokens,
         )
         # Kick off the training
         tokenizer.train_from_iterator(text_iterator, trainer)
         return cls(tokenizer)
     def encode_ordinary(self, text):
@@ -174,7 +193,7 @@ class HuggingFaceTokenizer:
         print(f'Saved tokenizer to {tokenizer_path}')
-# -----------------------------------------------------------------------------
+##
 # Tokenizer based on rustbpe + tiktoken combo
@@ -255,6 +274,7 @@ class RustBPETokenizer:
                 ids.insert(0, prepend_id)  # TODO: slightly inefficient here? :( hmm
             if append is not None:
                 ids.append(append_id)
         elif isinstance(text, list):
             ids = self.enc.encode_ordinary_batch(text, num_threads=num_threads)
             if prepend is not None:
@@ -263,6 +283,7 @@ class RustBPETokenizer:
             if append is not None:
                 for ids_row in ids:
                     ids_row.append(append_id)
         else:
             raise ValueError(f'Invalid input type: {type(text)}')  # noqa
@@ -285,6 +306,7 @@ class RustBPETokenizer:
     def render_conversation(self, conversation, max_tokens=2048):
         """
         Tokenize a single Chat conversation (which we call a "doc" or "document" here).
         Returns:
         - ids: list[int] is a list of token ids of this rendered conversation
         - mask: list[int] of same length, mask = 1 for tokens that the Assistant is expected to train on.
@@ -324,7 +346,10 @@ class RustBPETokenizer:
         for i, message in enumerate(messages):
             # some sanity checking here around assumptions, to prevent footguns
             must_be_from = 'user' if i % 2 == 0 else 'assistant'
-            check.state(message['role'] == must_be_from, f"Message {i} is from {message['role']} but should be from {must_be_from}")  # noqa
+            check.state(
+                message['role'] == must_be_from,
+                f"Message {i} is from {message['role']} but should be from {must_be_from}",
+            )
             # content can be either a simple string or a list of parts (e.g. containing tool calls)
             content = message['content']
@@ -335,33 +360,42 @@ class RustBPETokenizer:
                 add_tokens(user_start, 0)
                 add_tokens(value_ids, 0)
                 add_tokens(user_end, 0)
             elif message['role'] == 'assistant':
                 add_tokens(assistant_start, 0)
                 if isinstance(content, str):
                     # simple string => simply add the tokens
                     value_ids = self.encode(content)
                     add_tokens(value_ids, 1)
                 elif isinstance(content, list):
                     for part in content:
                         value_ids = self.encode(part['text'])
                         if part['type'] == 'text':
                             # string part => simply add the tokens
                             add_tokens(value_ids, 1)
                         elif part['type'] == 'python':
                             # python tool call => add the tokens inside <|python_start|> and <|python_end|>
                             add_tokens(python_start, 1)
                             add_tokens(value_ids, 1)
                             add_tokens(python_end, 1)
                         elif part['type'] == 'python_output':
                             # python output => add the tokens inside <|output_start|> and <|output_end|>
                             # none of these tokens are supervised because the tokens come from Python at test time
                             add_tokens(output_start, 0)
                             add_tokens(value_ids, 0)
                             add_tokens(output_end, 0)
                         else:
                             raise ValueError(f"Unknown part type: {part['type']}")
                 else:
                     raise ValueError(f'Unknown content type: {type(content)}')
                 add_tokens(assistant_end, 1)
         # truncate to max_tokens tokens MAX (helps prevent OOMs)

{ommlds-0.0.0.dev485.dist-info → ommlds-0.0.0.dev486.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ommlds
-Version: 0.0.0.dev485
+Version: 0.0.0.dev486
 Summary: ommlds
 Author: wrmsr
 License-Expression: BSD-3-Clause
@@ -14,9 +14,9 @@ Classifier: Programming Language :: Python :: 3.13
 Requires-Python: >=3.13
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: omdev==0.0.0.dev485
-Requires-Dist: omlish==0.0.0.dev485
+Requires-Dist: omlish==0.0.0.dev486
 Provides-Extra: all
+Requires-Dist: omdev==0.0.0.dev486; extra == "all"
 Requires-Dist: llama-cpp-python~=0.3; extra == "all"
 Requires-Dist: mlx~=0.30; sys_platform == "darwin" and extra == "all"
 Requires-Dist: mlx-lm~=0.28; sys_platform == "darwin" and extra == "all"
@@ -37,6 +37,8 @@ Requires-Dist: ddgs~=9.9; extra == "all"
 Requires-Dist: mwparserfromhell~=0.7; extra == "all"
 Requires-Dist: wikitextparser~=0.56; extra == "all"
 Requires-Dist: lxml>=5.3; python_version < "3.13" and extra == "all"
+Provides-Extra: omdev
+Requires-Dist: omdev==0.0.0.dev486; extra == "omdev"
 Provides-Extra: backends
 Requires-Dist: llama-cpp-python~=0.3; extra == "backends"
 Requires-Dist: mlx~=0.30; sys_platform == "darwin" and extra == "backends"

{ommlds-0.0.0.dev485.dist-info → ommlds-0.0.0.dev486.dist-info}/RECORD RENAMED Viewed

@@ -1,5 +1,5 @@
 ommlds/.omlish-manifests.json,sha256=yK0cA7mLFsZKuzVooF6fF6BhCRjbZ73d8pblEkLUsc8,26200
-ommlds/__about__.py,sha256=kNSQynzzTWcaJhqh70HtvZI-37TFEEvM0DqLjvRM0JU,1865
+ommlds/__about__.py,sha256=qfeQ6miEt4bO6LdLNb7kNqab6GA_7Oi9kPGR4wHcoOM,1900
 ommlds/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ommlds/_hacks/__init__.py,sha256=ajfw7dMKH8UuloeQ5MSxWwgAmdWf2v8gm-K3uLP9wtY,196
 ommlds/_hacks/funcs.py,sha256=8XseIblP7yolDUD7WQSGn1LP90IQzByVejSzphAPDyM,2861
@@ -400,7 +400,7 @@ ommlds/minichain/vectors/stores.py,sha256=etbLCS0RXAEmqcCdqiys8twa8R7Y_DcjQ_VqnE
 ommlds/minichain/vectors/types.py,sha256=xSAK1Xfkubqf95QgJhSHrwBu_C5quuye3wZAASmxJkM,3473
 ommlds/nanochat/LICENSE,sha256=QrsJ8zmor4uQ07SWm29uS6Sv87XGFBA7Ax_M33HI93I,1072
 ommlds/nanochat/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ommlds/nanochat/tokenizers.py,sha256=cU6ld0qdMG1T41_ijRD8EsbFMLLCpSNLDjgQOBi6RdM,17502
+ommlds/nanochat/tokenizers.py,sha256=DuDLJAawl_BuwTO1Jj8ANDkqwMFh6jkV9IPjj9DMhfA,17575
 ommlds/nanochat/rustbpe/LICENSE,sha256=QrsJ8zmor4uQ07SWm29uS6Sv87XGFBA7Ax_M33HI93I,1072
 ommlds/server/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ommlds/server/__main__.py,sha256=morlItVl-0_MDK6xk2VKhqOtA8oQk0SoWOWEqcgqXTw,155
@@ -428,9 +428,9 @@ ommlds/wiki/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
 ommlds/wiki/utils/io.py,sha256=UKgDJGtmpnWvIqVd2mJc2QNPOqlToEY1GEveNp6_pMo,7088
 ommlds/wiki/utils/progress.py,sha256=EhvKcMFYtsarCQhIahlO6f0SboyAKP3UwUyrnVnP-Vk,3222
 ommlds/wiki/utils/xml.py,sha256=sNJNkZ9rT8B-kJMO6bRz8J1USy4fyPx0m2PwTX7vxYY,3846
-ommlds-0.0.0.dev485.dist-info/licenses/LICENSE,sha256=B_hVtavaA8zCYDW99DYdcpDLKz1n3BBRjZrcbv8uG8c,1451
-ommlds-0.0.0.dev485.dist-info/METADATA,sha256=vvsVRJ_zt5YRl3ABb04YQCJxKuh2T1gbl4Eof1asC6c,3402
-ommlds-0.0.0.dev485.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-ommlds-0.0.0.dev485.dist-info/entry_points.txt,sha256=Z5YWtX7ClfiCKdW-dd_CSVvM0h4yQpJPi-2G3q6gNFo,35
-ommlds-0.0.0.dev485.dist-info/top_level.txt,sha256=Rbnk5d5wi58vnAXx13WFZqdQ4VX8hBCS2hEL3WeXOhY,7
-ommlds-0.0.0.dev485.dist-info/RECORD,,
+ommlds-0.0.0.dev486.dist-info/licenses/LICENSE,sha256=B_hVtavaA8zCYDW99DYdcpDLKz1n3BBRjZrcbv8uG8c,1451
+ommlds-0.0.0.dev486.dist-info/METADATA,sha256=QHZAQXb9UIRMbEIfePwtnos7dBXG6hcsfwwn_hKUaf8,3493
+ommlds-0.0.0.dev486.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ommlds-0.0.0.dev486.dist-info/entry_points.txt,sha256=Z5YWtX7ClfiCKdW-dd_CSVvM0h4yQpJPi-2G3q6gNFo,35
+ommlds-0.0.0.dev486.dist-info/top_level.txt,sha256=Rbnk5d5wi58vnAXx13WFZqdQ4VX8hBCS2hEL3WeXOhY,7
+ommlds-0.0.0.dev486.dist-info/RECORD,,

{ommlds-0.0.0.dev485.dist-info → ommlds-0.0.0.dev486.dist-info}/WHEEL RENAMED Viewed

File without changes

{ommlds-0.0.0.dev485.dist-info → ommlds-0.0.0.dev486.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{ommlds-0.0.0.dev485.dist-info → ommlds-0.0.0.dev486.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{ommlds-0.0.0.dev485.dist-info → ommlds-0.0.0.dev486.dist-info}/top_level.txt RENAMED Viewed

File without changes

ommlds 0.0.0.dev485__py3-none-any.whl → 0.0.0.dev486__py3-none-any.whl

ommlds 0.0.0.dev485__py3-none-any.whl → 0.0.0.dev486__py3-none-any.whl