ommlds 0.0.0.dev485__py3-none-any.whl → 0.0.0.dev486__py3-none-any.whl

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
ommlds/__about__.py CHANGED
@@ -8,11 +8,14 @@ class Project(ProjectBase):
8
8
  description = 'ommlds'
9
9
 
10
10
  dependencies = [
11
- f'omdev == {__version__}',
12
11
  f'omlish == {__version__}',
13
12
  ]
14
13
 
15
14
  optional_dependencies = {
15
+ 'omdev': [
16
+ f'omdev == {__version__}',
17
+ ],
18
+
16
19
  'backends': [
17
20
  # 'diffusers ~= 0.36',
18
21
 
@@ -18,7 +18,10 @@ from omlish import lang
18
18
 
19
19
  with lang.auto_proxy_import(globals()):
20
20
  import tiktoken
21
- import tokenizers
21
+ import tokenizers.decoders
22
+ import tokenizers.models
23
+ import tokenizers.pre_tokenizers
24
+ import tokenizers.trainers
22
25
 
23
26
 
24
27
  rustbpe: ta.Any = lang.proxy_import('.rustbpe', __package__)
@@ -27,7 +30,7 @@ rustbpe: ta.Any = lang.proxy_import('.rustbpe', __package__)
27
30
  ##
28
31
 
29
32
 
30
- SPECIAL_TOKENS = [
33
+ SPECIAL_TOKENS: ta.Sequence[str] = [
31
34
  # every document begins with the Beginning of Sequence (BOS) token that delimits documents
32
35
  '<|bos|>',
33
36
  # tokens below are only used during finetuning to render Conversations into token ids
@@ -45,10 +48,18 @@ SPECIAL_TOKENS = [
45
48
  # NOTE: this split pattern deviates from GPT-4 in that we use \p{N}{1,2} instead of \p{N}{1,3}
46
49
  # I did this because I didn't want to "waste" too many tokens on numbers for smaller vocab sizes.
47
50
  # I haven't validated that this is actually a good idea, TODO.
48
- SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,2}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""" # noqa
51
+ SPLIT_PATTERN = (
52
+ r"'(?i:[sdmt]|ll|ve|re)|"
53
+ r"[^\r\n\p{L}\p{N}]?+\p{L}+|"
54
+ r"\p{N}{1,2}|"
55
+ r" ?[^\s\p{L}\p{N}]++[\r\n]*|"
56
+ r"\s*[\r\n]|"
57
+ r"\s+(?!\S)|"
58
+ r"\s+"
59
+ )
49
60
 
50
61
 
51
- # -----------------------------------------------------------------------------
62
+ ##
52
63
  # Generic GPT-4-style tokenizer based on HuggingFace Tokenizer
53
64
 
54
65
 
@@ -87,22 +98,28 @@ class HuggingFaceTokenizer:
87
98
  unk_token=None,
88
99
  fuse_unk=False,
89
100
  ))
101
+
90
102
  # Normalizer: None
91
103
  tokenizer.normalizer = None
104
+
92
105
  # Pre-tokenizer: GPT-4 style
93
106
  # the regex pattern used by GPT-4 to split text into groups before BPE
94
107
  # NOTE: The pattern was changed from \p{N}{1,3} to \p{N}{1,2} because I suspect it is harmful to
95
108
  # very small models and smaller vocab sizes, because it is a little bit wasteful in the token space.
96
109
  # (but I haven't validated this! TODO)
97
110
  gpt4_split_regex = tokenizers.Regex(split_pattern) # huggingface demands that you wrap it in Regex!!
111
+
98
112
  tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Sequence([
99
113
  tokenizers.pre_tokenizers.Split(pattern=gpt4_split_regex, behavior='isolated', invert=False),
100
114
  tokenizers.pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=False),
101
115
  ])
116
+
102
117
  # Decoder: ByteLevel (it pairs together with the ByteLevel pre-tokenizer)
103
118
  tokenizer.decoder = tokenizers.decoders.ByteLevel()
119
+
104
120
  # Post-processor: None
105
121
  tokenizer.post_processor = None
122
+
106
123
  # Trainer: BPE
107
124
  trainer = tokenizers.trainers.BpeTrainer(
108
125
  vocab_size=vocab_size,
@@ -111,8 +128,10 @@ class HuggingFaceTokenizer:
111
128
  initial_alphabet=tokenizers.pre_tokenizers.ByteLevel.alphabet(),
112
129
  special_tokens=special_tokens,
113
130
  )
131
+
114
132
  # Kick off the training
115
133
  tokenizer.train_from_iterator(text_iterator, trainer)
134
+
116
135
  return cls(tokenizer)
117
136
 
118
137
  def encode_ordinary(self, text):
@@ -174,7 +193,7 @@ class HuggingFaceTokenizer:
174
193
  print(f'Saved tokenizer to {tokenizer_path}')
175
194
 
176
195
 
177
- # -----------------------------------------------------------------------------
196
+ ##
178
197
  # Tokenizer based on rustbpe + tiktoken combo
179
198
 
180
199
 
@@ -255,6 +274,7 @@ class RustBPETokenizer:
255
274
  ids.insert(0, prepend_id) # TODO: slightly inefficient here? :( hmm
256
275
  if append is not None:
257
276
  ids.append(append_id)
277
+
258
278
  elif isinstance(text, list):
259
279
  ids = self.enc.encode_ordinary_batch(text, num_threads=num_threads)
260
280
  if prepend is not None:
@@ -263,6 +283,7 @@ class RustBPETokenizer:
263
283
  if append is not None:
264
284
  for ids_row in ids:
265
285
  ids_row.append(append_id)
286
+
266
287
  else:
267
288
  raise ValueError(f'Invalid input type: {type(text)}') # noqa
268
289
 
@@ -285,6 +306,7 @@ class RustBPETokenizer:
285
306
  def render_conversation(self, conversation, max_tokens=2048):
286
307
  """
287
308
  Tokenize a single Chat conversation (which we call a "doc" or "document" here).
309
+
288
310
  Returns:
289
311
  - ids: list[int] is a list of token ids of this rendered conversation
290
312
  - mask: list[int] of same length, mask = 1 for tokens that the Assistant is expected to train on.
@@ -324,7 +346,10 @@ class RustBPETokenizer:
324
346
  for i, message in enumerate(messages):
325
347
  # some sanity checking here around assumptions, to prevent footguns
326
348
  must_be_from = 'user' if i % 2 == 0 else 'assistant'
327
- check.state(message['role'] == must_be_from, f"Message {i} is from {message['role']} but should be from {must_be_from}") # noqa
349
+ check.state(
350
+ message['role'] == must_be_from,
351
+ f"Message {i} is from {message['role']} but should be from {must_be_from}",
352
+ )
328
353
 
329
354
  # content can be either a simple string or a list of parts (e.g. containing tool calls)
330
355
  content = message['content']
@@ -335,33 +360,42 @@ class RustBPETokenizer:
335
360
  add_tokens(user_start, 0)
336
361
  add_tokens(value_ids, 0)
337
362
  add_tokens(user_end, 0)
363
+
338
364
  elif message['role'] == 'assistant':
339
365
  add_tokens(assistant_start, 0)
366
+
340
367
  if isinstance(content, str):
341
368
  # simple string => simply add the tokens
342
369
  value_ids = self.encode(content)
343
370
  add_tokens(value_ids, 1)
371
+
344
372
  elif isinstance(content, list):
345
373
  for part in content:
346
374
  value_ids = self.encode(part['text'])
375
+
347
376
  if part['type'] == 'text':
348
377
  # string part => simply add the tokens
349
378
  add_tokens(value_ids, 1)
379
+
350
380
  elif part['type'] == 'python':
351
381
  # python tool call => add the tokens inside <|python_start|> and <|python_end|>
352
382
  add_tokens(python_start, 1)
353
383
  add_tokens(value_ids, 1)
354
384
  add_tokens(python_end, 1)
385
+
355
386
  elif part['type'] == 'python_output':
356
387
  # python output => add the tokens inside <|output_start|> and <|output_end|>
357
388
  # none of these tokens are supervised because the tokens come from Python at test time
358
389
  add_tokens(output_start, 0)
359
390
  add_tokens(value_ids, 0)
360
391
  add_tokens(output_end, 0)
392
+
361
393
  else:
362
394
  raise ValueError(f"Unknown part type: {part['type']}")
395
+
363
396
  else:
364
397
  raise ValueError(f'Unknown content type: {type(content)}')
398
+
365
399
  add_tokens(assistant_end, 1)
366
400
 
367
401
  # truncate to max_tokens tokens MAX (helps prevent OOMs)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ommlds
3
- Version: 0.0.0.dev485
3
+ Version: 0.0.0.dev486
4
4
  Summary: ommlds
5
5
  Author: wrmsr
6
6
  License-Expression: BSD-3-Clause
@@ -14,9 +14,9 @@ Classifier: Programming Language :: Python :: 3.13
14
14
  Requires-Python: >=3.13
15
15
  Description-Content-Type: text/markdown
16
16
  License-File: LICENSE
17
- Requires-Dist: omdev==0.0.0.dev485
18
- Requires-Dist: omlish==0.0.0.dev485
17
+ Requires-Dist: omlish==0.0.0.dev486
19
18
  Provides-Extra: all
19
+ Requires-Dist: omdev==0.0.0.dev486; extra == "all"
20
20
  Requires-Dist: llama-cpp-python~=0.3; extra == "all"
21
21
  Requires-Dist: mlx~=0.30; sys_platform == "darwin" and extra == "all"
22
22
  Requires-Dist: mlx-lm~=0.28; sys_platform == "darwin" and extra == "all"
@@ -37,6 +37,8 @@ Requires-Dist: ddgs~=9.9; extra == "all"
37
37
  Requires-Dist: mwparserfromhell~=0.7; extra == "all"
38
38
  Requires-Dist: wikitextparser~=0.56; extra == "all"
39
39
  Requires-Dist: lxml>=5.3; python_version < "3.13" and extra == "all"
40
+ Provides-Extra: omdev
41
+ Requires-Dist: omdev==0.0.0.dev486; extra == "omdev"
40
42
  Provides-Extra: backends
41
43
  Requires-Dist: llama-cpp-python~=0.3; extra == "backends"
42
44
  Requires-Dist: mlx~=0.30; sys_platform == "darwin" and extra == "backends"
@@ -1,5 +1,5 @@
1
1
  ommlds/.omlish-manifests.json,sha256=yK0cA7mLFsZKuzVooF6fF6BhCRjbZ73d8pblEkLUsc8,26200
2
- ommlds/__about__.py,sha256=kNSQynzzTWcaJhqh70HtvZI-37TFEEvM0DqLjvRM0JU,1865
2
+ ommlds/__about__.py,sha256=qfeQ6miEt4bO6LdLNb7kNqab6GA_7Oi9kPGR4wHcoOM,1900
3
3
  ommlds/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  ommlds/_hacks/__init__.py,sha256=ajfw7dMKH8UuloeQ5MSxWwgAmdWf2v8gm-K3uLP9wtY,196
5
5
  ommlds/_hacks/funcs.py,sha256=8XseIblP7yolDUD7WQSGn1LP90IQzByVejSzphAPDyM,2861
@@ -400,7 +400,7 @@ ommlds/minichain/vectors/stores.py,sha256=etbLCS0RXAEmqcCdqiys8twa8R7Y_DcjQ_VqnE
400
400
  ommlds/minichain/vectors/types.py,sha256=xSAK1Xfkubqf95QgJhSHrwBu_C5quuye3wZAASmxJkM,3473
401
401
  ommlds/nanochat/LICENSE,sha256=QrsJ8zmor4uQ07SWm29uS6Sv87XGFBA7Ax_M33HI93I,1072
402
402
  ommlds/nanochat/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
403
- ommlds/nanochat/tokenizers.py,sha256=cU6ld0qdMG1T41_ijRD8EsbFMLLCpSNLDjgQOBi6RdM,17502
403
+ ommlds/nanochat/tokenizers.py,sha256=DuDLJAawl_BuwTO1Jj8ANDkqwMFh6jkV9IPjj9DMhfA,17575
404
404
  ommlds/nanochat/rustbpe/LICENSE,sha256=QrsJ8zmor4uQ07SWm29uS6Sv87XGFBA7Ax_M33HI93I,1072
405
405
  ommlds/server/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
406
406
  ommlds/server/__main__.py,sha256=morlItVl-0_MDK6xk2VKhqOtA8oQk0SoWOWEqcgqXTw,155
@@ -428,9 +428,9 @@ ommlds/wiki/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
428
428
  ommlds/wiki/utils/io.py,sha256=UKgDJGtmpnWvIqVd2mJc2QNPOqlToEY1GEveNp6_pMo,7088
429
429
  ommlds/wiki/utils/progress.py,sha256=EhvKcMFYtsarCQhIahlO6f0SboyAKP3UwUyrnVnP-Vk,3222
430
430
  ommlds/wiki/utils/xml.py,sha256=sNJNkZ9rT8B-kJMO6bRz8J1USy4fyPx0m2PwTX7vxYY,3846
431
- ommlds-0.0.0.dev485.dist-info/licenses/LICENSE,sha256=B_hVtavaA8zCYDW99DYdcpDLKz1n3BBRjZrcbv8uG8c,1451
432
- ommlds-0.0.0.dev485.dist-info/METADATA,sha256=vvsVRJ_zt5YRl3ABb04YQCJxKuh2T1gbl4Eof1asC6c,3402
433
- ommlds-0.0.0.dev485.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
434
- ommlds-0.0.0.dev485.dist-info/entry_points.txt,sha256=Z5YWtX7ClfiCKdW-dd_CSVvM0h4yQpJPi-2G3q6gNFo,35
435
- ommlds-0.0.0.dev485.dist-info/top_level.txt,sha256=Rbnk5d5wi58vnAXx13WFZqdQ4VX8hBCS2hEL3WeXOhY,7
436
- ommlds-0.0.0.dev485.dist-info/RECORD,,
431
+ ommlds-0.0.0.dev486.dist-info/licenses/LICENSE,sha256=B_hVtavaA8zCYDW99DYdcpDLKz1n3BBRjZrcbv8uG8c,1451
432
+ ommlds-0.0.0.dev486.dist-info/METADATA,sha256=QHZAQXb9UIRMbEIfePwtnos7dBXG6hcsfwwn_hKUaf8,3493
433
+ ommlds-0.0.0.dev486.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
434
+ ommlds-0.0.0.dev486.dist-info/entry_points.txt,sha256=Z5YWtX7ClfiCKdW-dd_CSVvM0h4yQpJPi-2G3q6gNFo,35
435
+ ommlds-0.0.0.dev486.dist-info/top_level.txt,sha256=Rbnk5d5wi58vnAXx13WFZqdQ4VX8hBCS2hEL3WeXOhY,7
436
+ ommlds-0.0.0.dev486.dist-info/RECORD,,