ommlds 0.0.0.dev485__py3-none-any.whl → 0.0.0.dev486__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ommlds/__about__.py +4 -1
- ommlds/nanochat/tokenizers.py +40 -6
- {ommlds-0.0.0.dev485.dist-info → ommlds-0.0.0.dev486.dist-info}/METADATA +5 -3
- {ommlds-0.0.0.dev485.dist-info → ommlds-0.0.0.dev486.dist-info}/RECORD +8 -8
- {ommlds-0.0.0.dev485.dist-info → ommlds-0.0.0.dev486.dist-info}/WHEEL +0 -0
- {ommlds-0.0.0.dev485.dist-info → ommlds-0.0.0.dev486.dist-info}/entry_points.txt +0 -0
- {ommlds-0.0.0.dev485.dist-info → ommlds-0.0.0.dev486.dist-info}/licenses/LICENSE +0 -0
- {ommlds-0.0.0.dev485.dist-info → ommlds-0.0.0.dev486.dist-info}/top_level.txt +0 -0
ommlds/__about__.py
CHANGED
|
@@ -8,11 +8,14 @@ class Project(ProjectBase):
|
|
|
8
8
|
description = 'ommlds'
|
|
9
9
|
|
|
10
10
|
dependencies = [
|
|
11
|
-
f'omdev == {__version__}',
|
|
12
11
|
f'omlish == {__version__}',
|
|
13
12
|
]
|
|
14
13
|
|
|
15
14
|
optional_dependencies = {
|
|
15
|
+
'omdev': [
|
|
16
|
+
f'omdev == {__version__}',
|
|
17
|
+
],
|
|
18
|
+
|
|
16
19
|
'backends': [
|
|
17
20
|
# 'diffusers ~= 0.36',
|
|
18
21
|
|
ommlds/nanochat/tokenizers.py
CHANGED
|
@@ -18,7 +18,10 @@ from omlish import lang
|
|
|
18
18
|
|
|
19
19
|
with lang.auto_proxy_import(globals()):
|
|
20
20
|
import tiktoken
|
|
21
|
-
import tokenizers
|
|
21
|
+
import tokenizers.decoders
|
|
22
|
+
import tokenizers.models
|
|
23
|
+
import tokenizers.pre_tokenizers
|
|
24
|
+
import tokenizers.trainers
|
|
22
25
|
|
|
23
26
|
|
|
24
27
|
rustbpe: ta.Any = lang.proxy_import('.rustbpe', __package__)
|
|
@@ -27,7 +30,7 @@ rustbpe: ta.Any = lang.proxy_import('.rustbpe', __package__)
|
|
|
27
30
|
##
|
|
28
31
|
|
|
29
32
|
|
|
30
|
-
SPECIAL_TOKENS = [
|
|
33
|
+
SPECIAL_TOKENS: ta.Sequence[str] = [
|
|
31
34
|
# every document begins with the Beginning of Sequence (BOS) token that delimits documents
|
|
32
35
|
'<|bos|>',
|
|
33
36
|
# tokens below are only used during finetuning to render Conversations into token ids
|
|
@@ -45,10 +48,18 @@ SPECIAL_TOKENS = [
|
|
|
45
48
|
# NOTE: this split pattern deviates from GPT-4 in that we use \p{N}{1,2} instead of \p{N}{1,3}
|
|
46
49
|
# I did this because I didn't want to "waste" too many tokens on numbers for smaller vocab sizes.
|
|
47
50
|
# I haven't validated that this is actually a good idea, TODO.
|
|
48
|
-
SPLIT_PATTERN =
|
|
51
|
+
SPLIT_PATTERN = (
|
|
52
|
+
r"'(?i:[sdmt]|ll|ve|re)|"
|
|
53
|
+
r"[^\r\n\p{L}\p{N}]?+\p{L}+|"
|
|
54
|
+
r"\p{N}{1,2}|"
|
|
55
|
+
r" ?[^\s\p{L}\p{N}]++[\r\n]*|"
|
|
56
|
+
r"\s*[\r\n]|"
|
|
57
|
+
r"\s+(?!\S)|"
|
|
58
|
+
r"\s+"
|
|
59
|
+
)
|
|
49
60
|
|
|
50
61
|
|
|
51
|
-
|
|
62
|
+
##
|
|
52
63
|
# Generic GPT-4-style tokenizer based on HuggingFace Tokenizer
|
|
53
64
|
|
|
54
65
|
|
|
@@ -87,22 +98,28 @@ class HuggingFaceTokenizer:
|
|
|
87
98
|
unk_token=None,
|
|
88
99
|
fuse_unk=False,
|
|
89
100
|
))
|
|
101
|
+
|
|
90
102
|
# Normalizer: None
|
|
91
103
|
tokenizer.normalizer = None
|
|
104
|
+
|
|
92
105
|
# Pre-tokenizer: GPT-4 style
|
|
93
106
|
# the regex pattern used by GPT-4 to split text into groups before BPE
|
|
94
107
|
# NOTE: The pattern was changed from \p{N}{1,3} to \p{N}{1,2} because I suspect it is harmful to
|
|
95
108
|
# very small models and smaller vocab sizes, because it is a little bit wasteful in the token space.
|
|
96
109
|
# (but I haven't validated this! TODO)
|
|
97
110
|
gpt4_split_regex = tokenizers.Regex(split_pattern) # huggingface demands that you wrap it in Regex!!
|
|
111
|
+
|
|
98
112
|
tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Sequence([
|
|
99
113
|
tokenizers.pre_tokenizers.Split(pattern=gpt4_split_regex, behavior='isolated', invert=False),
|
|
100
114
|
tokenizers.pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=False),
|
|
101
115
|
])
|
|
116
|
+
|
|
102
117
|
# Decoder: ByteLevel (it pairs together with the ByteLevel pre-tokenizer)
|
|
103
118
|
tokenizer.decoder = tokenizers.decoders.ByteLevel()
|
|
119
|
+
|
|
104
120
|
# Post-processor: None
|
|
105
121
|
tokenizer.post_processor = None
|
|
122
|
+
|
|
106
123
|
# Trainer: BPE
|
|
107
124
|
trainer = tokenizers.trainers.BpeTrainer(
|
|
108
125
|
vocab_size=vocab_size,
|
|
@@ -111,8 +128,10 @@ class HuggingFaceTokenizer:
|
|
|
111
128
|
initial_alphabet=tokenizers.pre_tokenizers.ByteLevel.alphabet(),
|
|
112
129
|
special_tokens=special_tokens,
|
|
113
130
|
)
|
|
131
|
+
|
|
114
132
|
# Kick off the training
|
|
115
133
|
tokenizer.train_from_iterator(text_iterator, trainer)
|
|
134
|
+
|
|
116
135
|
return cls(tokenizer)
|
|
117
136
|
|
|
118
137
|
def encode_ordinary(self, text):
|
|
@@ -174,7 +193,7 @@ class HuggingFaceTokenizer:
|
|
|
174
193
|
print(f'Saved tokenizer to {tokenizer_path}')
|
|
175
194
|
|
|
176
195
|
|
|
177
|
-
|
|
196
|
+
##
|
|
178
197
|
# Tokenizer based on rustbpe + tiktoken combo
|
|
179
198
|
|
|
180
199
|
|
|
@@ -255,6 +274,7 @@ class RustBPETokenizer:
|
|
|
255
274
|
ids.insert(0, prepend_id) # TODO: slightly inefficient here? :( hmm
|
|
256
275
|
if append is not None:
|
|
257
276
|
ids.append(append_id)
|
|
277
|
+
|
|
258
278
|
elif isinstance(text, list):
|
|
259
279
|
ids = self.enc.encode_ordinary_batch(text, num_threads=num_threads)
|
|
260
280
|
if prepend is not None:
|
|
@@ -263,6 +283,7 @@ class RustBPETokenizer:
|
|
|
263
283
|
if append is not None:
|
|
264
284
|
for ids_row in ids:
|
|
265
285
|
ids_row.append(append_id)
|
|
286
|
+
|
|
266
287
|
else:
|
|
267
288
|
raise ValueError(f'Invalid input type: {type(text)}') # noqa
|
|
268
289
|
|
|
@@ -285,6 +306,7 @@ class RustBPETokenizer:
|
|
|
285
306
|
def render_conversation(self, conversation, max_tokens=2048):
|
|
286
307
|
"""
|
|
287
308
|
Tokenize a single Chat conversation (which we call a "doc" or "document" here).
|
|
309
|
+
|
|
288
310
|
Returns:
|
|
289
311
|
- ids: list[int] is a list of token ids of this rendered conversation
|
|
290
312
|
- mask: list[int] of same length, mask = 1 for tokens that the Assistant is expected to train on.
|
|
@@ -324,7 +346,10 @@ class RustBPETokenizer:
|
|
|
324
346
|
for i, message in enumerate(messages):
|
|
325
347
|
# some sanity checking here around assumptions, to prevent footguns
|
|
326
348
|
must_be_from = 'user' if i % 2 == 0 else 'assistant'
|
|
327
|
-
check.state(
|
|
349
|
+
check.state(
|
|
350
|
+
message['role'] == must_be_from,
|
|
351
|
+
f"Message {i} is from {message['role']} but should be from {must_be_from}",
|
|
352
|
+
)
|
|
328
353
|
|
|
329
354
|
# content can be either a simple string or a list of parts (e.g. containing tool calls)
|
|
330
355
|
content = message['content']
|
|
@@ -335,33 +360,42 @@ class RustBPETokenizer:
|
|
|
335
360
|
add_tokens(user_start, 0)
|
|
336
361
|
add_tokens(value_ids, 0)
|
|
337
362
|
add_tokens(user_end, 0)
|
|
363
|
+
|
|
338
364
|
elif message['role'] == 'assistant':
|
|
339
365
|
add_tokens(assistant_start, 0)
|
|
366
|
+
|
|
340
367
|
if isinstance(content, str):
|
|
341
368
|
# simple string => simply add the tokens
|
|
342
369
|
value_ids = self.encode(content)
|
|
343
370
|
add_tokens(value_ids, 1)
|
|
371
|
+
|
|
344
372
|
elif isinstance(content, list):
|
|
345
373
|
for part in content:
|
|
346
374
|
value_ids = self.encode(part['text'])
|
|
375
|
+
|
|
347
376
|
if part['type'] == 'text':
|
|
348
377
|
# string part => simply add the tokens
|
|
349
378
|
add_tokens(value_ids, 1)
|
|
379
|
+
|
|
350
380
|
elif part['type'] == 'python':
|
|
351
381
|
# python tool call => add the tokens inside <|python_start|> and <|python_end|>
|
|
352
382
|
add_tokens(python_start, 1)
|
|
353
383
|
add_tokens(value_ids, 1)
|
|
354
384
|
add_tokens(python_end, 1)
|
|
385
|
+
|
|
355
386
|
elif part['type'] == 'python_output':
|
|
356
387
|
# python output => add the tokens inside <|output_start|> and <|output_end|>
|
|
357
388
|
# none of these tokens are supervised because the tokens come from Python at test time
|
|
358
389
|
add_tokens(output_start, 0)
|
|
359
390
|
add_tokens(value_ids, 0)
|
|
360
391
|
add_tokens(output_end, 0)
|
|
392
|
+
|
|
361
393
|
else:
|
|
362
394
|
raise ValueError(f"Unknown part type: {part['type']}")
|
|
395
|
+
|
|
363
396
|
else:
|
|
364
397
|
raise ValueError(f'Unknown content type: {type(content)}')
|
|
398
|
+
|
|
365
399
|
add_tokens(assistant_end, 1)
|
|
366
400
|
|
|
367
401
|
# truncate to max_tokens tokens MAX (helps prevent OOMs)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ommlds
|
|
3
|
-
Version: 0.0.0.dev485
|
|
3
|
+
Version: 0.0.0.dev486
|
|
4
4
|
Summary: ommlds
|
|
5
5
|
Author: wrmsr
|
|
6
6
|
License-Expression: BSD-3-Clause
|
|
@@ -14,9 +14,9 @@ Classifier: Programming Language :: Python :: 3.13
|
|
|
14
14
|
Requires-Python: >=3.13
|
|
15
15
|
Description-Content-Type: text/markdown
|
|
16
16
|
License-File: LICENSE
|
|
17
|
-
Requires-Dist: omdev==0.0.0.dev485
|
|
18
|
-
Requires-Dist: omlish==0.0.0.dev485
|
|
17
|
+
Requires-Dist: omlish==0.0.0.dev486
|
|
19
18
|
Provides-Extra: all
|
|
19
|
+
Requires-Dist: omdev==0.0.0.dev486; extra == "all"
|
|
20
20
|
Requires-Dist: llama-cpp-python~=0.3; extra == "all"
|
|
21
21
|
Requires-Dist: mlx~=0.30; sys_platform == "darwin" and extra == "all"
|
|
22
22
|
Requires-Dist: mlx-lm~=0.28; sys_platform == "darwin" and extra == "all"
|
|
@@ -37,6 +37,8 @@ Requires-Dist: ddgs~=9.9; extra == "all"
|
|
|
37
37
|
Requires-Dist: mwparserfromhell~=0.7; extra == "all"
|
|
38
38
|
Requires-Dist: wikitextparser~=0.56; extra == "all"
|
|
39
39
|
Requires-Dist: lxml>=5.3; python_version < "3.13" and extra == "all"
|
|
40
|
+
Provides-Extra: omdev
|
|
41
|
+
Requires-Dist: omdev==0.0.0.dev486; extra == "omdev"
|
|
40
42
|
Provides-Extra: backends
|
|
41
43
|
Requires-Dist: llama-cpp-python~=0.3; extra == "backends"
|
|
42
44
|
Requires-Dist: mlx~=0.30; sys_platform == "darwin" and extra == "backends"
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
ommlds/.omlish-manifests.json,sha256=yK0cA7mLFsZKuzVooF6fF6BhCRjbZ73d8pblEkLUsc8,26200
|
|
2
|
-
ommlds/__about__.py,sha256=
|
|
2
|
+
ommlds/__about__.py,sha256=qfeQ6miEt4bO6LdLNb7kNqab6GA_7Oi9kPGR4wHcoOM,1900
|
|
3
3
|
ommlds/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
4
|
ommlds/_hacks/__init__.py,sha256=ajfw7dMKH8UuloeQ5MSxWwgAmdWf2v8gm-K3uLP9wtY,196
|
|
5
5
|
ommlds/_hacks/funcs.py,sha256=8XseIblP7yolDUD7WQSGn1LP90IQzByVejSzphAPDyM,2861
|
|
@@ -400,7 +400,7 @@ ommlds/minichain/vectors/stores.py,sha256=etbLCS0RXAEmqcCdqiys8twa8R7Y_DcjQ_VqnE
|
|
|
400
400
|
ommlds/minichain/vectors/types.py,sha256=xSAK1Xfkubqf95QgJhSHrwBu_C5quuye3wZAASmxJkM,3473
|
|
401
401
|
ommlds/nanochat/LICENSE,sha256=QrsJ8zmor4uQ07SWm29uS6Sv87XGFBA7Ax_M33HI93I,1072
|
|
402
402
|
ommlds/nanochat/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
403
|
-
ommlds/nanochat/tokenizers.py,sha256=
|
|
403
|
+
ommlds/nanochat/tokenizers.py,sha256=DuDLJAawl_BuwTO1Jj8ANDkqwMFh6jkV9IPjj9DMhfA,17575
|
|
404
404
|
ommlds/nanochat/rustbpe/LICENSE,sha256=QrsJ8zmor4uQ07SWm29uS6Sv87XGFBA7Ax_M33HI93I,1072
|
|
405
405
|
ommlds/server/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
406
406
|
ommlds/server/__main__.py,sha256=morlItVl-0_MDK6xk2VKhqOtA8oQk0SoWOWEqcgqXTw,155
|
|
@@ -428,9 +428,9 @@ ommlds/wiki/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
|
|
|
428
428
|
ommlds/wiki/utils/io.py,sha256=UKgDJGtmpnWvIqVd2mJc2QNPOqlToEY1GEveNp6_pMo,7088
|
|
429
429
|
ommlds/wiki/utils/progress.py,sha256=EhvKcMFYtsarCQhIahlO6f0SboyAKP3UwUyrnVnP-Vk,3222
|
|
430
430
|
ommlds/wiki/utils/xml.py,sha256=sNJNkZ9rT8B-kJMO6bRz8J1USy4fyPx0m2PwTX7vxYY,3846
|
|
431
|
-
ommlds-0.0.0.
|
|
432
|
-
ommlds-0.0.0.
|
|
433
|
-
ommlds-0.0.0.
|
|
434
|
-
ommlds-0.0.0.
|
|
435
|
-
ommlds-0.0.0.
|
|
436
|
-
ommlds-0.0.0.
|
|
431
|
+
ommlds-0.0.0.dev486.dist-info/licenses/LICENSE,sha256=B_hVtavaA8zCYDW99DYdcpDLKz1n3BBRjZrcbv8uG8c,1451
|
|
432
|
+
ommlds-0.0.0.dev486.dist-info/METADATA,sha256=QHZAQXb9UIRMbEIfePwtnos7dBXG6hcsfwwn_hKUaf8,3493
|
|
433
|
+
ommlds-0.0.0.dev486.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
434
|
+
ommlds-0.0.0.dev486.dist-info/entry_points.txt,sha256=Z5YWtX7ClfiCKdW-dd_CSVvM0h4yQpJPi-2G3q6gNFo,35
|
|
435
|
+
ommlds-0.0.0.dev486.dist-info/top_level.txt,sha256=Rbnk5d5wi58vnAXx13WFZqdQ4VX8hBCS2hEL3WeXOhY,7
|
|
436
|
+
ommlds-0.0.0.dev486.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|