ommlds 0.0.0.dev471__py3-none-any.whl → 0.0.0.dev474__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ommlds might be problematic.
- ommlds/.omlish-manifests.json +3 -3
- ommlds/__about__.py +8 -2
- ommlds/cli/main.py +1 -1
- ommlds/cli/sessions/chat/backends/inject.py +2 -1
- ommlds/minichain/backends/impls/anthropic/stream.py +1 -1
- ommlds/minichain/backends/impls/google/stream.py +1 -1
- ommlds/minichain/backends/impls/ollama/chat.py +1 -1
- ommlds/minichain/backends/impls/openai/stream.py +1 -1
- ommlds/minichain/backends/impls/tinygrad/chat.py +3 -3
- ommlds/minichain/backends/strings/parsing.py +1 -1
- ommlds/minichain/backends/strings/resolving.py +4 -1
- ommlds/minichain/resources.py +28 -3
- ommlds/minichain/stream/services.py +19 -16
- ommlds/nanochat/LICENSE +21 -0
- ommlds/nanochat/__init__.py +0 -0
- ommlds/nanochat/rustbpe/LICENSE +21 -0
- ommlds/nanochat/tokenizers.py +406 -0
- {ommlds-0.0.0.dev471.dist-info → ommlds-0.0.0.dev474.dist-info}/METADATA +10 -7
- {ommlds-0.0.0.dev471.dist-info → ommlds-0.0.0.dev474.dist-info}/RECORD +23 -19
- {ommlds-0.0.0.dev471.dist-info → ommlds-0.0.0.dev474.dist-info}/WHEEL +0 -0
- {ommlds-0.0.0.dev471.dist-info → ommlds-0.0.0.dev474.dist-info}/entry_points.txt +0 -0
- {ommlds-0.0.0.dev471.dist-info → ommlds-0.0.0.dev474.dist-info}/licenses/LICENSE +0 -0
- {ommlds-0.0.0.dev471.dist-info → ommlds-0.0.0.dev474.dist-info}/top_level.txt +0 -0
ommlds/.omlish-manifests.json
CHANGED
@@ -468,7 +468,7 @@
     "!.minichain.registries.manifests.RegistryManifest": {
       "module": "ommlds.minichain.backends.impls.tinygrad.chat",
       "attr": "TinygradLlama3ChatChoicesService",
-      "name": "
+      "name": "tinygrad-llama3",
       "aliases": null,
       "type": "ChatChoicesService"
     }
@@ -483,7 +483,7 @@
     "!.minichain.registries.manifests.RegistryManifest": {
       "module": "ommlds.minichain.backends.impls.tinygrad.chat",
      "attr": "TinygradLlama3ChatChoicesStreamService",
-      "name": "
+      "name": "tinygrad-llama3",
       "aliases": null,
       "type": "ChatChoicesStreamService"
     }
@@ -500,7 +500,7 @@
       "ChatChoicesService",
       "ChatChoicesStreamService"
     ],
-    "backend_name": "
+    "backend_name": "tinygrad-llama3",
     "model_names": null
   }
 }
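
The manifest entries above are pure data; resolving one to a service class is just a module/attr lookup. A minimal illustrative sketch (a hypothetical helper, not the actual ommlds registry loader), using only the fields shown in the manifest:

    import importlib

    def load_backend(entry: dict):
        # Import the manifest's module and pull the named attribute off of it.
        mod = importlib.import_module(entry['module'])
        return getattr(mod, entry['attr'])

    entry = {
        'module': 'ommlds.minichain.backends.impls.tinygrad.chat',
        'attr': 'TinygradLlama3ChatChoicesService',
        'name': 'tinygrad-llama3',
    }
    # cls = load_backend(entry)  # requires ommlds (and its tinygrad extra) to be importable
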
ommlds/__about__.py
CHANGED
@@ -38,7 +38,11 @@ class Project(ProjectBase):
 
     'huggingface': [
         'huggingface-hub ~= 0.36',
-        'datasets ~= 4.
+        'datasets ~= 4.4',
+    ],
+
+    'nanochat': [
+        'regex >= 2025.0',
     ],
 
     'numpy': [
@@ -56,7 +60,7 @@ class Project(ProjectBase):
     ],
 
     'search': [
-        'ddgs ~= 9.
+        'ddgs ~= 9.7',
     ],
 
     'wiki': [
@@ -76,6 +80,8 @@ class Project(ProjectBase):
 
 
 class Setuptools(SetuptoolsBase):
+    rs = True
+
     find_packages = {
         'include': [Project.name, f'{Project.name}.*'],
         'exclude': [*SetuptoolsBase.find_packages['exclude']],
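
The new 'nanochat' extra surfaces in the wheel metadata below as Provides-Extra: nanochat, and installs with the usual extras syntax:

    pip install 'ommlds[nanochat]'
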
ommlds/cli/main.py
CHANGED
@@ -2,6 +2,7 @@ import typing as ta
 
 from omlish import inject as inj
 from omlish import lang
+from omlish import typedvalues as tv
 
 from ..... import minichain as mc
 from .injection import backend_configs
@@ -41,7 +42,7 @@ def bind_backends(
     async def inner(be: 'mc.BackendCatalog.Backend', cfgs: _types.BackendConfigs | None) -> ta.Any:
         kwt = inj.build_kwargs_target(be.factory, non_strict=True)
         kw = await injector.provide_kwargs(kwt)
-        return be.factory(*cfgs or [], **kw)
+        return be.factory(*tv.collect(*(be.configs or []), *(cfgs or []), override=True), **kw)
 
     return _catalog.CatalogBackendProvider.Instantiator(inner)
 
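
The new call threads the backend's own configs in ahead of the call-site configs and relies on override semantics, so that a later typed value of a given type wins. A rough plain-Python sketch of that "last one of each type wins" merge (not the omlish typedvalues API itself):

    def collect_override(*values):
        by_type = {}
        for v in values:
            by_type[type(v)] = v  # a later value replaces an earlier one of the same type
        return list(by_type.values())

    # Hypothetical config types, purely for illustration.
    class Temperature(float): pass
    class MaxTokens(int): pass

    # The call-site config (0.7) overrides the backend default (0.0).
    merged = collect_override(Temperature(0.0), MaxTokens(256), Temperature(0.7))
    assert merged == [Temperature(0.7), MaxTokens(256)]
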
ommlds/minichain/backends/impls/anthropic/stream.py
CHANGED
@@ -52,7 +52,7 @@ class AnthropicChatChoicesStreamService:
         self._model_name = cc.pop(AnthropicChatChoicesService.DEFAULT_MODEL_NAME)
         self._api_key = check.not_none(ApiKey.pop_secret(cc, env='ANTHROPIC_API_KEY'))
 
-    READ_CHUNK_SIZE =
+    READ_CHUNK_SIZE: ta.ClassVar[int] = -1
 
     async def invoke(
         self,
ommlds/minichain/backends/impls/ollama/chat.py
CHANGED
@@ -146,7 +146,7 @@ class OllamaChatChoicesService(BaseOllamaChatChoicesService):
 # )
 @static_check_is_chat_choices_stream_service
 class OllamaChatChoicesStreamService(BaseOllamaChatChoicesService):
-    READ_CHUNK_SIZE =
+    READ_CHUNK_SIZE: ta.ClassVar[int] = -1
 
     async def invoke(
         self,
ommlds/minichain/backends/impls/openai/stream.py
CHANGED
@@ -54,7 +54,7 @@ class OpenaiChatChoicesStreamService:
         self._model_name = cc.pop(OpenaiChatChoicesService.DEFAULT_MODEL_NAME)
         self._api_key = ApiKey.pop_secret(cc, env='OPENAI_API_KEY')
 
-    READ_CHUNK_SIZE =
+    READ_CHUNK_SIZE: ta.ClassVar[int] = -1
 
     async def invoke(self, request: ChatChoicesStreamRequest) -> ChatChoicesStreamResponse:
         # check.isinstance(request, ChatRequest)
ommlds/minichain/backends/impls/tinygrad/chat.py
CHANGED
@@ -113,7 +113,7 @@ class BaseTinygradLlama3ChatService(lang.ExitStacked, lang.Abstract):
 
 
 # @omlish-manifest $.minichain.registries.manifests.RegistryManifest(
-#     name='
+#     name='tinygrad-llama3',
 #     type='ChatChoicesService',
 # )
 @static_check_is_chat_choices_service
@@ -133,7 +133,7 @@ class TinygradLlama3ChatChoicesService(BaseTinygradLlama3ChatService):
 
 
 # @omlish-manifest $.minichain.registries.manifests.RegistryManifest(
-#     name='
+#     name='tinygrad-llama3',
 #     type='ChatChoicesStreamService',
 # )
 @static_check_is_chat_choices_stream_service
@@ -168,5 +168,5 @@ class TinygradLlama3ChatChoicesStreamService(BaseTinygradLlama3ChatService):
 #         'ChatChoicesService',
 #         'ChatChoicesStreamService',
 #     ],
-#     '
+#     'tinygrad-llama3',
 # )
ommlds/minichain/backends/strings/resolving.py
CHANGED
@@ -108,7 +108,10 @@ class ManifestBackendStringResolver(BackendStringResolver):
 
         mn: str | None = mdl.name
 
-        if
+        if args.parsed.backend == m.backend_name and mn is not None:
+            pass
+
+        elif mn == m.backend_name:
             if m.model_names is not None:
                 mn = m.model_names.resolved_default
             else:
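
The resolving.py change appears to make an explicitly parsed backend name with an explicit model name short-circuit the default-model substitution. Roughly, as a standalone sketch (hypothetical names and default, not the ommlds API):

    def resolve_model(parsed_backend, model_name, backend_name, default_model):
        if parsed_backend == backend_name and model_name is not None:
            return model_name  # explicit backend + explicit model: keep as given
        if model_name == backend_name:
            return default_model  # bare backend name in model position: use its default
        return model_name

    assert resolve_model('tinygrad-llama3', 'some-model', 'tinygrad-llama3', 'default-model') == 'some-model'
    assert resolve_model(None, 'tinygrad-llama3', 'tinygrad-llama3', 'default-model') == 'default-model'
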
ommlds/minichain/resources.py
CHANGED
@@ -31,6 +31,7 @@ class ResourcesRefNotRegisteredError(Exception):
     pass
 
 
+@ta.final
 class Resources(lang.Final, lang.NotPicklable):
     def __init__(
         self,
@@ -145,24 +146,48 @@ class Resources(lang.Final, lang.NotPicklable):
 ##
 
 
+@ta.final
 class ResourceManaged(ResourcesRef, lang.Final, lang.NotPicklable, ta.Generic[T]):
+    """
+    A class to 'handoff' a ref to a `Resources`, allowing the `Resources` to temporarily survive being passed from
+    instantiation within a callee to being `__aenter__`'d in the caller.
+
+    The ref to the `Resources` is allocated in the ctor, so the contract is that an instance of this must be immediately
+    `__aenter__`'d before doing anything else with the return value of the call. Failure to do so leaks the `Resources`.
+    """
+
     def __init__(self, v: T, resources: Resources) -> None:
         super().__init__()
 
-        self.
+        self.__v = v
         self.__resources = resources
 
         resources.add_ref(self)
 
+    __state: ta.Literal['new', 'entered', 'exited'] = 'new'
+
     def __repr__(self) -> str:
-        return f'{self.__class__.__name__}<{self.
+        return f'{self.__class__.__name__}<{self.__v!r}, {self.__state}>'
 
     async def __aenter__(self) -> T:
-
+        check.state(self.__state == 'new')
+        self.__state = 'entered'
+        return self.__v
 
     async def __aexit__(self, exc_type, exc_val, exc_tb):
+        check.state(self.__state == 'entered')
+        self.__state = 'exited'
         await self.__resources.remove_ref(self)
 
+    def __del__(self) -> None:
+        if self.__state != 'exited':
+            log.error(
+                f'{__package__}.{self.__class__.__name__}.__del__: '  # noqa
+                f'%r deleted without being entered and exited! '
+                f'resources: %s',
+                repr(self),
+                repr(self.__resources),
+            )
 
 ##
 
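
A minimal usage sketch of the handoff contract the docstring above describes (Service and both call sites are hypothetical):

    # Hypothetical service type, purely for illustration.
    class Service:
        async def do_work(self) -> None:
            ...

    async def make_service(rs: Resources) -> ResourceManaged[Service]:
        svc = Service()                  # allocated against `rs` inside the callee
        return ResourceManaged(svc, rs)  # the ctor takes the ref on `rs`

    async def caller(rs: Resources) -> None:
        async with await make_service(rs) as svc:  # must be entered immediately, or `rs` leaks
            await svc.do_work()
        # __aexit__ released the ref; `rs` may now tear down
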
ommlds/minichain/stream/services.py
CHANGED
@@ -125,17 +125,18 @@ class _StreamServiceResponse(StreamResponseIterator[V, OutputT]):
             return
         if self._cr.cr_running or self._cr.cr_suspended:
             cex = StreamServiceCancelledError()
-
+            i = None
+            for n in itertools.count():
                 try:
-                    if not
+                    if not n:
                         x = self._g.throw(cex)
                     else:
-                        x = self._g.send(
+                        x = self._g.send(i)
                 except StreamServiceCancelledError as cex2:
                     if cex2 is cex:
                         break
                     raise
-                yield x
+                i = yield x
         if self._cr.cr_running:
             raise RuntimeError(f'Coroutine {self._cr!r} not terminated')
         if self._g is not self._a:
@@ -155,9 +156,10 @@ class _StreamServiceResponse(StreamResponseIterator[V, OutputT]):
     @types.coroutine
     def _anext(self):
         check.state(self._state == 'running')
+        i = None
         while True:
             try:
-                x = self._g.send(
+                x = self._g.send(i)
             except StopIteration as e:
                 if e.value is not None:
                     self._outputs = tv.TypedValues(*check.isinstance(e.value, ta.Sequence))
@@ -170,7 +172,7 @@ class _StreamServiceResponse(StreamResponseIterator[V, OutputT]):
                 x.done = True
                 return x.value
 
-            yield x
+            i = yield x
 
     async def __anext__(self) -> V:
         return await self._anext()
@@ -195,13 +197,14 @@ async def new_stream_response(
     fn: ta.Callable[[StreamResponseSink[V]], ta.Awaitable[ta.Sequence[OutputT] | None]],
     outputs: ta.Sequence[StreamOutputT] | None = None,
 ) -> StreamResponse[V, OutputT, StreamOutputT]:
-
-
-
-
-
-
-
-
-
-
+    ssr = _StreamServiceResponse(fn)
+
+    v = rs.new_managed(await rs.enter_async_context(ssr))
+    try:
+        return StreamResponse(v, outputs or [])
+    except BaseException:  # noqa
+        # The StreamResponse ctor can raise - for example in `_tv_field_coercer` - in which case we need to clean up the
+        # resources ref we have already allocated before reraising.
+        async with v:
+            pass
+        raise
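
The repeated `i = yield x` / `self._g.send(i)` edits above thread values sent into the outer iterator through to the inner generator, instead of dropping them as a bare `yield x` did. The underlying plain-Python protocol, in a self-contained sketch:

    def inner():
        reply = yield 'first'            # receives what the consumer sends back in
        yield f'inner saw {reply!r}'

    def forwarder(g):
        i = None                         # the first send() must be None to start the generator
        while True:
            try:
                x = g.send(i)
            except StopIteration:
                return
            i = yield x                  # capture what our consumer sends, forward it next turn

    f = forwarder(inner())
    print(next(f))                       # -> first
    print(f.send('hello'))               # -> inner saw 'hello'
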
ommlds/nanochat/LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 Andrej Karpathy
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
ommlds/nanochat/__init__.py
ADDED
File without changes
ommlds/nanochat/rustbpe/LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 Andrej Karpathy
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
ommlds/nanochat/tokenizers.py
ADDED
@@ -0,0 +1,406 @@
+# https://github.com/karpathy/nanochat/tree/9467d83cf23dcc9a9b4ca6e35103142f48a55b27
+"""
+BPE Tokenizer in the style of GPT-4.
+
+Two implementations are available:
+1) HuggingFace Tokenizer that can do both training and inference but is really confusing
+2) Our own RustBPE Tokenizer for training and tiktoken for efficient inference
+"""
+import copy
+import os
+import pickle
+import typing as ta
+
+from omlish import check
+from omlish import collections as col
+from omlish import lang
+
+
+with lang.auto_proxy_import(globals()):
+    import tiktoken
+    import tokenizers
+
+
+rustbpe: ta.Any = lang.proxy_import('.rustbpe', __package__)
+
+
+##
+
+
+SPECIAL_TOKENS = [
+    # every document begins with the Beginning of Sequence (BOS) token that delimits documents
+    '<|bos|>',
+    # tokens below are only used during finetuning to render Conversations into token ids
+    '<|user_start|>',  # user messages
+    '<|user_end|>',
+    '<|assistant_start|>',  # assistant messages
+    '<|assistant_end|>',
+    '<|python_start|>',  # assistant invokes python REPL tool
+    '<|python_end|>',
+    '<|output_start|>',  # python REPL outputs back to assistant
+    '<|output_end|>',
+]
+
+
+# NOTE: this split pattern deviates from GPT-4 in that we use \p{N}{1,2} instead of \p{N}{1,3}
+# I did this because I didn't want to "waste" too many tokens on numbers for smaller vocab sizes.
+# I haven't validated that this is actually a good idea, TODO.
+SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,2}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""  # noqa
+
+
+# -----------------------------------------------------------------------------
+# Generic GPT-4-style tokenizer based on HuggingFace Tokenizer
+
+
+class HuggingFaceTokenizer:
+    """Light wrapper around HuggingFace Tokenizer for some utilities"""
+
+    def __init__(self, tokenizer):
+        self.tokenizer = tokenizer
+
+    @classmethod
+    def from_pretrained(cls, hf_path):
+        # init from a HuggingFace pretrained tokenizer (e.g. "gpt2")
+        tokenizer = tokenizers.Tokenizer.from_pretrained(hf_path)
+        return cls(tokenizer)
+
+    @classmethod
+    def from_directory(cls, tokenizer_dir):
+        # init from a local directory on disk (e.g. "out/tokenizer")
+        tokenizer_path = os.path.join(tokenizer_dir, 'tokenizer.json')
+        tokenizer = tokenizers.Tokenizer.from_file(tokenizer_path)
+        return cls(tokenizer)
+
+    @classmethod
+    def train_from_iterator(
+            cls,
+            text_iterator,
+            vocab_size,
+            *,
+            split_pattern=SPLIT_PATTERN,
+            special_tokens=SPECIAL_TOKENS,
+    ):
+        # train from an iterator of text
+        # Configure the HuggingFace Tokenizer
+        tokenizer = tokenizers.Tokenizer(tokenizers.models.BPE(
+            byte_fallback=True,  # needed!
+            unk_token=None,
+            fuse_unk=False,
+        ))
+        # Normalizer: None
+        tokenizer.normalizer = None
+        # Pre-tokenizer: GPT-4 style
+        # the regex pattern used by GPT-4 to split text into groups before BPE
+        # NOTE: The pattern was changed from \p{N}{1,3} to \p{N}{1,2} because I suspect it is harmful to
+        # very small models and smaller vocab sizes, because it is a little bit wasteful in the token space.
+        # (but I haven't validated this! TODO)
+        gpt4_split_regex = tokenizers.Regex(split_pattern)  # huggingface demands that you wrap it in Regex!!
+        tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Sequence([
+            tokenizers.pre_tokenizers.Split(pattern=gpt4_split_regex, behavior='isolated', invert=False),
+            tokenizers.pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=False),
+        ])
+        # Decoder: ByteLevel (it pairs together with the ByteLevel pre-tokenizer)
+        tokenizer.decoder = tokenizers.decoders.ByteLevel()
+        # Post-processor: None
+        tokenizer.post_processor = None
+        # Trainer: BPE
+        trainer = tokenizers.trainers.BpeTrainer(
+            vocab_size=vocab_size,
+            show_progress=True,
+            min_frequency=0,  # no minimum frequency
+            initial_alphabet=tokenizers.pre_tokenizers.ByteLevel.alphabet(),
+            special_tokens=special_tokens,
+        )
+        # Kick off the training
+        tokenizer.train_from_iterator(text_iterator, trainer)
+        return cls(tokenizer)
+
+    def encode_ordinary(self, text):
+        ids = self.tokenizer.encode(text, add_special_tokens=False).ids
+        return ids
+
+    def get_vocab_size(self):
+        return self.tokenizer.get_vocab_size()
+
+    def get_special_tokens(self):
+        special_tokens_map = self.tokenizer.get_added_tokens_decoder()
+        special_tokens = [w.content for w in special_tokens_map.values()]
+        return special_tokens
+
+    def id_to_token(self, id):  # noqa
+        return self.tokenizer.id_to_token(id)
+
+    def _encode_one(self, text, prepend=None, append=None):
+        # encode a single string
+        # prepend/append can be either a string of a special token or a token id directly.
+        check.isinstance(text, str)
+        ids = []
+        if prepend is not None:
+            prepend_id = prepend if isinstance(prepend, int) else self.encode_special(prepend)
+            ids.append(prepend_id)
+        ids.extend(self.tokenizer.encode(text, add_special_tokens=False).ids)
+        if append is not None:
+            append_id = append if isinstance(append, int) else self.encode_special(append)
+            ids.append(append_id)
+        return ids
+
+    def encode_special(self, text):
+        # encode a single special token via exact match
+        return self.tokenizer.token_to_id(text)
+
+    def get_bos_token_id(self):
+        bos = self.encode_special('<|bos|>')
+        return bos
+
+    def encode(self, text, *args, **kwargs):
+        if isinstance(text, str):
+            return self._encode_one(text, *args, **kwargs)
+        elif isinstance(text, list):
+            return [self._encode_one(t, *args, **kwargs) for t in text]
+        else:
+            raise ValueError(f'Invalid input type: {type(text)}')  # noqa
+
+    def __call__(self, *args, **kwargs):
+        return self.encode(*args, **kwargs)
+
+    def decode(self, ids):
+        return self.tokenizer.decode(ids, skip_special_tokens=False)
+
+    def save(self, tokenizer_dir):
+        # save the tokenizer to disk
+        os.makedirs(tokenizer_dir, exist_ok=True)
+        tokenizer_path = os.path.join(tokenizer_dir, 'tokenizer.json')
+        self.tokenizer.save(tokenizer_path)
+        print(f'Saved tokenizer to {tokenizer_path}')
+
+
+# -----------------------------------------------------------------------------
+# Tokenizer based on rustbpe + tiktoken combo
+
+
+class RustBPETokenizer:
+    """Light wrapper around tiktoken (for efficient inference) but train with rustbpe"""
+
+    def __init__(self, enc, bos_token):
+        self.enc = enc
+        self.bos_token_id = self.encode_special(bos_token)
+
+    @classmethod
+    def train_from_iterator(cls, text_iterator, vocab_size):
+        # 1) train using rustbpe
+        tokenizer = rustbpe.Tokenizer()
+        # the special tokens are inserted later in __init__, we don't train them here
+        vocab_size_no_special = vocab_size - len(SPECIAL_TOKENS)
+        check.state(vocab_size_no_special >= 256, f'vocab_size_no_special must be at least 256, got {vocab_size_no_special}')  # noqa
+        tokenizer.train_from_iterator(text_iterator, vocab_size_no_special, pattern=SPLIT_PATTERN)
+        # 2) construct the associated tiktoken encoding for inference
+        pattern = tokenizer.get_pattern()
+        mergeable_ranks_list = tokenizer.get_mergeable_ranks()
+        mergeable_ranks = {bytes(k): v for k, v in mergeable_ranks_list}
+        tokens_offset = len(mergeable_ranks)
+        special_tokens = {name: tokens_offset + i for i, name in enumerate(SPECIAL_TOKENS)}
+        enc = tiktoken.Encoding(
+            name='rustbpe',
+            pat_str=pattern,
+            mergeable_ranks=mergeable_ranks,  # dict[bytes, int] (token bytes -> merge priority rank)
+            special_tokens=special_tokens,  # dict[str, int] (special token name -> token id)
+        )
+        return cls(enc, '<|bos|>')
+
+    @classmethod
+    def from_directory(cls, tokenizer_dir):
+        pickle_path = os.path.join(tokenizer_dir, 'tokenizer.pkl')
+        with open(pickle_path, 'rb') as f:
+            enc = pickle.load(f)  # noqa
+        return cls(enc, '<|bos|>')
+
+    @classmethod
+    def from_pretrained(cls, tiktoken_name):
+        # https://github.com/openai/tiktoken/blob/eedc8563/tiktoken_ext/openai_public.py
+        enc = tiktoken.get_encoding(tiktoken_name)
+        # tiktoken calls the special document delimiter token "<|endoftext|>"
+        # yes this is confusing because this token is almost always PREPENDED to the beginning of the document
+        # it most often is used to signal the start of a new sequence to the LLM during inference etc.
+        # so in nanoChat we always use "<|bos|>" short for "beginning of sequence", but historically it is often called
+        # "<|endoftext|>".
+        return cls(enc, '<|endoftext|>')
+
+    def get_vocab_size(self):
+        return self.enc.n_vocab
+
+    def get_special_tokens(self):
+        return self.enc.special_tokens_set
+
+    def id_to_token(self, id):  # noqa
+        return self.enc.decode([id])
+
+    @col.cache.cache(max_size=32)
+    def encode_special(self, text):
+        return self.enc.encode_single_token(text)
+
+    def get_bos_token_id(self):
+        return self.bos_token_id
+
+    def encode(self, text, prepend=None, append=None, num_threads=8):
+        # text can be either a string or a list of strings
+
+        if prepend is not None:
+            prepend_id = prepend if isinstance(prepend, int) else self.encode_special(prepend)
+        if append is not None:
+            append_id = append if isinstance(append, int) else self.encode_special(append)
+
+        if isinstance(text, str):
+            ids = self.enc.encode_ordinary(text)
+            if prepend is not None:
+                ids.insert(0, prepend_id)  # TODO: slightly inefficient here? :( hmm
+            if append is not None:
+                ids.append(append_id)
+        elif isinstance(text, list):
+            ids = self.enc.encode_ordinary_batch(text, num_threads=num_threads)
+            if prepend is not None:
+                for ids_row in ids:
+                    ids_row.insert(0, prepend_id)  # TODO: same
+            if append is not None:
+                for ids_row in ids:
+                    ids_row.append(append_id)
+        else:
+            raise ValueError(f'Invalid input type: {type(text)}')  # noqa
+
+        return ids
+
+    def __call__(self, *args, **kwargs):
+        return self.encode(*args, **kwargs)
+
+    def decode(self, ids):
+        return self.enc.decode(ids)
+
+    def save(self, tokenizer_dir):
+        # save the encoding object to disk
+        os.makedirs(tokenizer_dir, exist_ok=True)
+        pickle_path = os.path.join(tokenizer_dir, 'tokenizer.pkl')
+        with open(pickle_path, 'wb') as f:
+            pickle.dump(self.enc, f)
+        print(f'Saved tokenizer encoding to {pickle_path}')
+
+    def render_conversation(self, conversation, max_tokens=2048):
+        """
+        Tokenize a single Chat conversation (which we call a "doc" or "document" here).
+        Returns:
+        - ids: list[int] is a list of token ids of this rendered conversation
+        - mask: list[int] of same length, mask = 1 for tokens that the Assistant is expected to train on.
+        """
+
+        # ids, masks that we will return and a helper function to help build them up.
+        ids, mask = [], []
+
+        def add_tokens(token_ids, mask_val):
+            if isinstance(token_ids, int):
+                token_ids = [token_ids]
+            ids.extend(token_ids)
+            mask.extend([mask_val] * len(token_ids))
+
+        # sometimes the first message is a system message...
+        # => just merge it with the second (user) message
+        if conversation['messages'][0]['role'] == 'system':
+            # some conversation surgery is necessary here for now...
+            conversation = copy.deepcopy(conversation)  # avoid mutating the original
+            messages = conversation['messages']
+            check.state(messages[1]['role'] == 'user', 'System message must be followed by a user message')
+            messages[1]['content'] = messages[0]['content'] + '\n\n' + messages[1]['content']
+            messages = messages[1:]
+        else:
+            messages = conversation['messages']
+        check.state(len(messages) >= 1, f'Conversation has less than 1 message: {messages}')
+
+        # fetch all the special tokens we need
+        bos = self.get_bos_token_id()
+        user_start, user_end = self.encode_special('<|user_start|>'), self.encode_special('<|user_end|>')
+        assistant_start, assistant_end = self.encode_special('<|assistant_start|>'), self.encode_special('<|assistant_end|>')  # noqa
+        python_start, python_end = self.encode_special('<|python_start|>'), self.encode_special('<|python_end|>')
+        output_start, output_end = self.encode_special('<|output_start|>'), self.encode_special('<|output_end|>')
+
+        # now we can tokenize the conversation
+        add_tokens(bos, 0)
+        for i, message in enumerate(messages):
+            # some sanity checking here around assumptions, to prevent footguns
+            must_be_from = 'user' if i % 2 == 0 else 'assistant'
+            check.state(message['role'] == must_be_from, f"Message {i} is from {message['role']} but should be from {must_be_from}")  # noqa
+
+            # content can be either a simple string or a list of parts (e.g. containing tool calls)
+            content = message['content']
+
+            if message['role'] == 'user':
+                check.isinstance(content, str), 'User messages are simply expected to be strings'
+                value_ids = self.encode(content)
+                add_tokens(user_start, 0)
+                add_tokens(value_ids, 0)
+                add_tokens(user_end, 0)
+            elif message['role'] == 'assistant':
+                add_tokens(assistant_start, 0)
+                if isinstance(content, str):
+                    # simple string => simply add the tokens
+                    value_ids = self.encode(content)
+                    add_tokens(value_ids, 1)
+                elif isinstance(content, list):
+                    for part in content:
+                        value_ids = self.encode(part['text'])
+                        if part['type'] == 'text':
+                            # string part => simply add the tokens
+                            add_tokens(value_ids, 1)
+                        elif part['type'] == 'python':
+                            # python tool call => add the tokens inside <|python_start|> and <|python_end|>
+                            add_tokens(python_start, 1)
+                            add_tokens(value_ids, 1)
+                            add_tokens(python_end, 1)
+                        elif part['type'] == 'python_output':
+                            # python output => add the tokens inside <|output_start|> and <|output_end|>
+                            # none of these tokens are supervised because the tokens come from Python at test time
+                            add_tokens(output_start, 0)
+                            add_tokens(value_ids, 0)
+                            add_tokens(output_end, 0)
+                        else:
+                            raise ValueError(f"Unknown part type: {part['type']}")
+                else:
+                    raise ValueError(f'Unknown content type: {type(content)}')
+                add_tokens(assistant_end, 1)
+
+        # truncate to max_tokens tokens MAX (helps prevent OOMs)
+        ids = ids[:max_tokens]
+        mask = mask[:max_tokens]
+        return ids, mask
+
+    def visualize_tokenization(self, ids, mask, with_token_id=False):
+        """Small helper function useful in debugging: visualize the tokenization of render_conversation"""
+
+        red = '\033[91m'
+        green = '\033[92m'
+        reset = '\033[0m'
+        gray = '\033[90m'
+        tokens = []
+        for i, (token_id, mask_val) in enumerate(zip(ids, mask)):  # noqa
+            token_str = self.decode([token_id])
+            color = green if mask_val == 1 else red
+            tokens.append(f'{color}{token_str}{reset}')
+            if with_token_id:
+                tokens.append(f'{gray}({token_id}){reset}')
+        return '|'.join(tokens)
+
+    def render_for_completion(self, conversation):
+        """
+        Used during Reinforcement Learning. In that setting, we want to render the conversation priming the Assistant
+        for a completion. Unlike the Chat SFT case, we don't need to return the mask.
+        """
+
+        # We have some surgery to do: we need to pop the last message (of the Assistant)
+        conversation = copy.deepcopy(conversation)  # avoid mutating the original
+        messages = conversation['messages']
+        check.state(messages[-1]['role'] == 'assistant', 'Last message must be from the Assistant')
+        messages.pop()  # remove the last message (of the Assistant) inplace
+
+        # Now tokenize the conversation
+        ids, mask = self.render_conversation(conversation)
+
+        # Finally, to prime the Assistant for a completion, append the Assistant start token
+        assistant_start = self.encode_special('<|assistant_start|>')
+        ids.append(assistant_start)
+        return ids
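
A hedged usage sketch of the vendored tokenizer above, assuming tiktoken is installed (the nanochat extra's deps). from_pretrained reuses a stock tiktoken encoding, where '<|endoftext|>' plays the BOS role; the chat special tokens only exist on tokenizers trained via train_from_iterator:

    from ommlds.nanochat.tokenizers import RustBPETokenizer

    tok = RustBPETokenizer.from_pretrained('gpt2')
    ids = tok.encode('hello world', prepend=tok.get_bos_token_id())
    print(ids)               # the BOS id followed by the text's token ids
    print(tok.decode(ids))   # '<|endoftext|>hello world'
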
{ommlds-0.0.0.dev471.dist-info → ommlds-0.0.0.dev474.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ommlds
-Version: 0.0.0.
+Version: 0.0.0.dev474
 Summary: ommlds
 Author: wrmsr
 License-Expression: BSD-3-Clause
@@ -14,8 +14,8 @@ Classifier: Programming Language :: Python :: 3.13
 Requires-Python: >=3.13
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: omdev==0.0.0.
-Requires-Dist: omlish==0.0.0.
+Requires-Dist: omdev==0.0.0.dev474
+Requires-Dist: omlish==0.0.0.dev474
 Provides-Extra: all
 Requires-Dist: llama-cpp-python~=0.3; extra == "all"
 Requires-Dist: mlx~=0.29; extra == "all"
@@ -27,12 +27,13 @@ Requires-Dist: torch~=2.9; extra == "all"
 Requires-Dist: transformers~=4.57; extra == "all"
 Requires-Dist: sentence-transformers~=5.1; extra == "all"
 Requires-Dist: huggingface-hub~=0.36; extra == "all"
-Requires-Dist: datasets~=4.
+Requires-Dist: datasets~=4.4; extra == "all"
+Requires-Dist: regex>=2025.0; extra == "all"
 Requires-Dist: numpy>=1.26; extra == "all"
 Requires-Dist: pytesseract~=0.3; extra == "all"
 Requires-Dist: rapidocr-onnxruntime~=1.4; extra == "all"
 Requires-Dist: pillow~=12.0; extra == "all"
-Requires-Dist: ddgs~=9.
+Requires-Dist: ddgs~=9.7; extra == "all"
 Requires-Dist: mwparserfromhell~=0.7; extra == "all"
 Requires-Dist: wikitextparser~=0.56; extra == "all"
 Requires-Dist: lxml>=5.3; python_version < "3.13" and extra == "all"
@@ -48,7 +49,9 @@ Requires-Dist: transformers~=4.57; extra == "backends"
 Requires-Dist: sentence-transformers~=5.1; extra == "backends"
 Provides-Extra: huggingface
 Requires-Dist: huggingface-hub~=0.36; extra == "huggingface"
-Requires-Dist: datasets~=4.
+Requires-Dist: datasets~=4.4; extra == "huggingface"
+Provides-Extra: nanochat
+Requires-Dist: regex>=2025.0; extra == "nanochat"
 Provides-Extra: numpy
 Requires-Dist: numpy>=1.26; extra == "numpy"
 Provides-Extra: ocr
@@ -57,7 +60,7 @@ Requires-Dist: rapidocr-onnxruntime~=1.4; extra == "ocr"
 Provides-Extra: pillow
 Requires-Dist: pillow~=12.0; extra == "pillow"
 Provides-Extra: search
-Requires-Dist: ddgs~=9.
+Requires-Dist: ddgs~=9.7; extra == "search"
 Provides-Extra: wiki
 Requires-Dist: mwparserfromhell~=0.7; extra == "wiki"
 Requires-Dist: wikitextparser~=0.56; extra == "wiki"
{ommlds-0.0.0.dev471.dist-info → ommlds-0.0.0.dev474.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
-ommlds/.omlish-manifests.json,sha256=
-ommlds/__about__.py,sha256=
+ommlds/.omlish-manifests.json,sha256=jsNWNqNQTjpxr-irqzy0dWpqeZH9CsD0SFY3OgIGuZ4,21555
+ommlds/__about__.py,sha256=JfWLdyd4-ql5G5V_NsPnN3NPkdhwhSFR0kQt9j7jhgA,1839
 ommlds/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ommlds/_hacks/__init__.py,sha256=ajfw7dMKH8UuloeQ5MSxWwgAmdWf2v8gm-K3uLP9wtY,196
 ommlds/_hacks/funcs.py,sha256=8XseIblP7yolDUD7WQSGn1LP90IQzByVejSzphAPDyM,2861
@@ -86,7 +86,7 @@ ommlds/backends/transformers/streamers.py,sha256=Hu_9lp_kUilKjOfs7Ixqr2NoA5FuRn2
 ommlds/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ommlds/cli/__main__.py,sha256=1ffCb0fcUOJMzxROJmJRXQ8PSOVYv7KrcuBtT95cf0c,140
 ommlds/cli/inject.py,sha256=WhTDabJz9b1NRRHVH-UyVN5nj6UncvIeTvgkGrcE9vc,666
-ommlds/cli/main.py,sha256
+ommlds/cli/main.py,sha256=-cMFxZqK4XTHxaFxDk4fvtWLB6LZ6UGlmbN74L2ir-g,5801
 ommlds/cli/backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ommlds/cli/backends/inject.py,sha256=OVstNsoeVnprM9PBL_zP0N46KkoDg3_Wz90BWcQ7km4,1734
 ommlds/cli/backends/standard.py,sha256=HnammWyAXJHeqXJrAMBdarcT4Nyt2CxudZdD2fW_Y9M,631
@@ -100,7 +100,7 @@ ommlds/cli/sessions/chat/inject.py,sha256=7Yg6wUs2Oej4UjNZCAWCJCEsDJZWvT4G8XvkvV
 ommlds/cli/sessions/chat/session.py,sha256=eqwelLE74JFC-fBpk_hdwMD2nP4pLv3ZPwUn99200B8,521
 ommlds/cli/sessions/chat/backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ommlds/cli/sessions/chat/backends/catalog.py,sha256=hIY0L1zewuJX0_xxcMcy4gylSLiQENB3YxgYJEoKgrU,2109
-ommlds/cli/sessions/chat/backends/inject.py,sha256=
+ommlds/cli/sessions/chat/backends/inject.py,sha256=er03V-84B-wWt86o2HXXqCb6uFRKbfXfUZsB6GrzYOA,1646
 ommlds/cli/sessions/chat/backends/injection.py,sha256=GCn5OvNIEowgB70kQVuU84z3i8lLA4vOVkTZlQG8s0o,327
 ommlds/cli/sessions/chat/backends/types.py,sha256=5eImYHXLKqbC5MDrN443eMGamP9snCmV1n7LtAsqgPk,696
 ommlds/cli/sessions/chat/chat/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -163,7 +163,7 @@ ommlds/minichain/configs.py,sha256=WwrHxfkDAfo_RtuCqUgySthj-2W26lZbpuQoghUyGNw,1
 ommlds/minichain/envs.py,sha256=vE2CSeT6KYxOpPY72VbFLzGUnBERYdhfiEUlvSRHkXE,225
 ommlds/minichain/json.py,sha256=0_5rV5Zi2qPOvXi2CLAc5DF7FN3jK3ABbjoKdjtTuVo,360
 ommlds/minichain/metadata.py,sha256=2jik8gEm_VMnknPuPwqRssTg0MClRFUrXz_IsyEgUt4,878
-ommlds/minichain/resources.py,sha256=
+ommlds/minichain/resources.py,sha256=CcFIUrxPGuxUabG74zL0yByZsyGJISxLVK1nULSZPyo,5488
 ommlds/minichain/search.py,sha256=azRzWcYhcm9IgSHquqLwtbwowtYCRAtPLSm7Gvt9iNo,1262
 ommlds/minichain/standard.py,sha256=cGXaGtC5iM9Q2lCcbhLtvEcPGKhcJUIh3UWyNgOssRM,2580
 ommlds/minichain/types.py,sha256=K6RRjpUi17UEG0cqPrrvbVANU0iRVh3WLiH-y6oEWFI,414
@@ -180,14 +180,14 @@ ommlds/minichain/backends/impls/anthropic/__init__.py,sha256=47DEQpj8HBSa-_TImW-
 ommlds/minichain/backends/impls/anthropic/chat.py,sha256=-qGr_DZgGe-dr1AKb6WLtCq_I2E9635X1rQZSJqOb04,4318
 ommlds/minichain/backends/impls/anthropic/names.py,sha256=GPPeYt0CcDcDCR8I6BMd7bMjC_Zk_bjnLLpF9ClwXcg,1099
 ommlds/minichain/backends/impls/anthropic/protocol.py,sha256=whPVYuKShKiMCzasHl77sCIiymhzXj8mFZXEyhZvld8,3292
-ommlds/minichain/backends/impls/anthropic/stream.py,sha256=
+ommlds/minichain/backends/impls/anthropic/stream.py,sha256=NNBFb0sMId9yWua3fkAMZ-qYhQN9nLrXiO4DViR77YI,8790
 ommlds/minichain/backends/impls/duckduckgo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ommlds/minichain/backends/impls/duckduckgo/search.py,sha256=igzeU9P9b1MMiu4KAJVS9H6KLIoPm68wXi4Kx3_DHyQ,940
 ommlds/minichain/backends/impls/google/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ommlds/minichain/backends/impls/google/chat.py,sha256=lGb5blGLlcBlt9xeDZJvbh5SlV7fgfezd5_As_SPBXo,6499
 ommlds/minichain/backends/impls/google/names.py,sha256=HxHJ31HeKZg6aW1C_Anqp-gamCXpq9pOdKj8_yVgE8Y,871
 ommlds/minichain/backends/impls/google/search.py,sha256=y5_6seSRU8CFnLA_Ja8XEMbIBWSgwBzE1iBf-qyz0tA,3427
-ommlds/minichain/backends/impls/google/stream.py,sha256=
+ommlds/minichain/backends/impls/google/stream.py,sha256=ITownhKSOJB4IG23wWZJepUImSM6vJsDMOM9W1STpwU,8013
 ommlds/minichain/backends/impls/google/tools.py,sha256=Tty0gsyx7-PbeoNqMuql_ewQ6q-ZsDaDdsD5ShinGVY,5089
 ommlds/minichain/backends/impls/huggingface/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ommlds/minichain/backends/impls/huggingface/configs.py,sha256=6jsBtPNXOP57PcpxNTVLGWLc-18Iwn_lDbGouwCJTIQ,258
@@ -200,18 +200,18 @@ ommlds/minichain/backends/impls/llamacpp/stream.py,sha256=uzrXr2HhshgFe3Z0g8KTPc
 ommlds/minichain/backends/impls/mlx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ommlds/minichain/backends/impls/mlx/chat.py,sha256=sMlhgiFZrxAC-kKkLSJ6c-2uJn0IHZXH4EiPET_-CKI,7458
 ommlds/minichain/backends/impls/ollama/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ommlds/minichain/backends/impls/ollama/chat.py,sha256=
+ommlds/minichain/backends/impls/ollama/chat.py,sha256=agnJcOwJGebSiV5TG0UmVFYGCc7hEpt7FM73rtyF8gk,6609
 ommlds/minichain/backends/impls/openai/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ommlds/minichain/backends/impls/openai/chat.py,sha256=QcQZO78p4UUzI4QU1K-057OEZGKIYxjXENhkifsSuaI,2841
 ommlds/minichain/backends/impls/openai/completion.py,sha256=4Mi4Zvrq5fCqUd0asL3WiCbCdmxOdo0NFkoZMfdsYXY,1939
 ommlds/minichain/backends/impls/openai/embedding.py,sha256=BNtvKYLTsnQwQR9Tv3Fr8zCYN1kr1UNdJ15lcsjz6X0,1765
 ommlds/minichain/backends/impls/openai/format.py,sha256=teGX8mNU3sXNWP4YWGD8d59M4X9_r75ImSzfTJgtNCM,7351
 ommlds/minichain/backends/impls/openai/names.py,sha256=b74t8FwSbGEveVtVz4SqM5tiRDyTKNlUKlseV6AX3Yo,1211
-ommlds/minichain/backends/impls/openai/stream.py,sha256=
+ommlds/minichain/backends/impls/openai/stream.py,sha256=1kh_V_eu8QAY_i4ulfm22kbZeMiIDLwmDPJf7aaIikI,5410
 ommlds/minichain/backends/impls/sentencepiece/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ommlds/minichain/backends/impls/sentencepiece/tokens.py,sha256=tUEBKyBgkTowssS_AdcAuPkyFzfyDfE935x4JG8PXM0,1602
 ommlds/minichain/backends/impls/tinygrad/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ommlds/minichain/backends/impls/tinygrad/chat.py,sha256=
+ommlds/minichain/backends/impls/tinygrad/chat.py,sha256=Y3Lp08Sb0YUPAxEciexOUm0uyoJnhbH5pWT9buclx6Y,4916
 ommlds/minichain/backends/impls/tokenizers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ommlds/minichain/backends/impls/tokenizers/tokens.py,sha256=_8Q49k5YroG5wQI0cuK6kOJ3XYwjhpaAS04ejhzBsWw,1500
 ommlds/minichain/backends/impls/transformers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -220,8 +220,8 @@ ommlds/minichain/backends/impls/transformers/tokens.py,sha256=uS3-IWOJRUMBfPDVRr
 ommlds/minichain/backends/impls/transformers/transformers.py,sha256=laM8G2SAE6jUjnHkeZsbWxS2KJF4efi-35aBlRBzIsE,9053
 ommlds/minichain/backends/strings/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ommlds/minichain/backends/strings/manifests.py,sha256=kmlanVUAZqIh0P95Mm8H20e8ib3gEgYHHUlkCXDQGFk,413
-ommlds/minichain/backends/strings/parsing.py,sha256=
-ommlds/minichain/backends/strings/resolving.py,sha256=
+ommlds/minichain/backends/strings/parsing.py,sha256=Etmk04BnKvCMtGg4AgbvxsPGvfRcLldLxpdpxcozdNk,1779
+ommlds/minichain/backends/strings/resolving.py,sha256=q0qMdIvFZH-yScpXNX8GHE_yyQC_eEf1eptiUnNolUI,5849
 ommlds/minichain/chat/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ommlds/minichain/chat/_marshal.py,sha256=M3p093nxzxITbznc--P-tyCXuWDHrq4JFKTZAx6XWdk,740
 ommlds/minichain/chat/formats.py,sha256=LmlU7iu8PMJuroFTmyWfP4tXvLjj5VNxdAp1Us9MSAA,562
@@ -318,7 +318,7 @@ ommlds/minichain/services/requests.py,sha256=VAfKbYu4T0CZTWVQmZ2LUmYU7DNm6IerYMN
 ommlds/minichain/services/responses.py,sha256=4W6Z4Fx4_GFqKgle27OeLr0zzjVTA0pkZrlsZiFQNdo,1534
 ommlds/minichain/services/services.py,sha256=WjkQNYIp87SflLSReOHMkG2qIVAOem6vsrs_2NxWN_M,325
 ommlds/minichain/stream/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ommlds/minichain/stream/services.py,sha256=
+ommlds/minichain/stream/services.py,sha256=YXfEj3ZXKZ3Svkig6f3hOReHgZnLY2tDn2bgB0RIoRI,5566
 ommlds/minichain/stream/wrap.py,sha256=nQC0aCi49I18nF0Yx8qiiLkhIAECV6s6o4pvOy5Kx98,2041
 ommlds/minichain/text/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ommlds/minichain/text/applypatch.py,sha256=YIN5JChJ0FXyK1I6OiAHQmE7BT-exHfaAMM9ay7ylyc,17705
@@ -356,6 +356,10 @@ ommlds/minichain/vectors/search.py,sha256=27MTUiVT2xmSnmgJTAR09oQaiNRh1ixj0mGZVu
 ommlds/minichain/vectors/similarity.py,sha256=etqSswPH7ERThueqnCUHULsM3rpsVslRFua0m_ps_F4,1308
 ommlds/minichain/vectors/stores.py,sha256=etbLCS0RXAEmqcCdqiys8twa8R7Y_DcjQ_VqnEnRF4s,530
 ommlds/minichain/vectors/types.py,sha256=xSAK1Xfkubqf95QgJhSHrwBu_C5quuye3wZAASmxJkM,3473
+ommlds/nanochat/LICENSE,sha256=QrsJ8zmor4uQ07SWm29uS6Sv87XGFBA7Ax_M33HI93I,1072
+ommlds/nanochat/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ommlds/nanochat/tokenizers.py,sha256=cU6ld0qdMG1T41_ijRD8EsbFMLLCpSNLDjgQOBi6RdM,17502
+ommlds/nanochat/rustbpe/LICENSE,sha256=QrsJ8zmor4uQ07SWm29uS6Sv87XGFBA7Ax_M33HI93I,1072
 ommlds/server/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ommlds/server/__main__.py,sha256=morlItVl-0_MDK6xk2VKhqOtA8oQk0SoWOWEqcgqXTw,155
 ommlds/server/cli.py,sha256=gCN__45IXjCtk-tWwO2hr8vs5K-R0e1auNWdIc7d6_U,1825
@@ -377,9 +381,9 @@ ommlds/wiki/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
 ommlds/wiki/utils/io.py,sha256=UKgDJGtmpnWvIqVd2mJc2QNPOqlToEY1GEveNp6_pMo,7088
 ommlds/wiki/utils/progress.py,sha256=EhvKcMFYtsarCQhIahlO6f0SboyAKP3UwUyrnVnP-Vk,3222
 ommlds/wiki/utils/xml.py,sha256=vVV8Ctn13aaRM9eYfs9Wd6rHn5WOCEUzQ44fIhOvJdg,3754
-ommlds-0.0.0.
-ommlds-0.0.0.
-ommlds-0.0.0.
-ommlds-0.0.0.
-ommlds-0.0.0.
-ommlds-0.0.0.
+ommlds-0.0.0.dev474.dist-info/licenses/LICENSE,sha256=B_hVtavaA8zCYDW99DYdcpDLKz1n3BBRjZrcbv8uG8c,1451
+ommlds-0.0.0.dev474.dist-info/METADATA,sha256=4Qv1lhqufZX1cWjEccHM3FMxW5dnXFhraRwnEpP8C2k,3344
+ommlds-0.0.0.dev474.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ommlds-0.0.0.dev474.dist-info/entry_points.txt,sha256=Z5YWtX7ClfiCKdW-dd_CSVvM0h4yQpJPi-2G3q6gNFo,35
+ommlds-0.0.0.dev474.dist-info/top_level.txt,sha256=Rbnk5d5wi58vnAXx13WFZqdQ4VX8hBCS2hEL3WeXOhY,7
+ommlds-0.0.0.dev474.dist-info/RECORD,,
{ommlds-0.0.0.dev471.dist-info → ommlds-0.0.0.dev474.dist-info}/WHEEL
File without changes
{ommlds-0.0.0.dev471.dist-info → ommlds-0.0.0.dev474.dist-info}/entry_points.txt
File without changes
{ommlds-0.0.0.dev471.dist-info → ommlds-0.0.0.dev474.dist-info}/licenses/LICENSE
File without changes
{ommlds-0.0.0.dev471.dist-info → ommlds-0.0.0.dev474.dist-info}/top_level.txt
File without changes