ommlds-0.0.0.dev436-py3-none-any.whl → ommlds-0.0.0.dev480-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (271)
  1. ommlds/.omlish-manifests.json +332 -35
  2. ommlds/__about__.py +15 -9
  3. ommlds/_hacks/__init__.py +4 -0
  4. ommlds/_hacks/funcs.py +110 -0
  5. ommlds/_hacks/names.py +158 -0
  6. ommlds/_hacks/params.py +73 -0
  7. ommlds/_hacks/patches.py +0 -3
  8. ommlds/backends/anthropic/protocol/_marshal.py +2 -2
  9. ommlds/backends/anthropic/protocol/sse/_marshal.py +1 -1
  10. ommlds/backends/anthropic/protocol/sse/assemble.py +23 -7
  11. ommlds/backends/anthropic/protocol/sse/events.py +13 -0
  12. ommlds/backends/anthropic/protocol/types.py +30 -9
  13. ommlds/backends/google/protocol/__init__.py +3 -0
  14. ommlds/backends/google/protocol/_marshal.py +16 -0
  15. ommlds/backends/google/protocol/types.py +626 -0
  16. ommlds/backends/groq/_marshal.py +23 -0
  17. ommlds/backends/groq/protocol.py +249 -0
  18. ommlds/backends/mlx/generation.py +1 -1
  19. ommlds/backends/mlx/loading.py +58 -1
  20. ommlds/backends/ollama/__init__.py +0 -0
  21. ommlds/backends/ollama/protocol.py +170 -0
  22. ommlds/backends/openai/protocol/__init__.py +9 -28
  23. ommlds/backends/openai/protocol/_common.py +18 -0
  24. ommlds/backends/openai/protocol/_marshal.py +27 -0
  25. ommlds/backends/openai/protocol/chatcompletion/chunk.py +58 -31
  26. ommlds/backends/openai/protocol/chatcompletion/contentpart.py +49 -44
  27. ommlds/backends/openai/protocol/chatcompletion/message.py +55 -43
  28. ommlds/backends/openai/protocol/chatcompletion/request.py +114 -66
  29. ommlds/backends/openai/protocol/chatcompletion/response.py +71 -45
  30. ommlds/backends/openai/protocol/chatcompletion/responseformat.py +27 -20
  31. ommlds/backends/openai/protocol/chatcompletion/tokenlogprob.py +16 -7
  32. ommlds/backends/openai/protocol/completionusage.py +24 -15
  33. ommlds/backends/tavily/__init__.py +0 -0
  34. ommlds/backends/tavily/protocol.py +301 -0
  35. ommlds/backends/tinygrad/models/llama3/__init__.py +22 -14
  36. ommlds/backends/transformers/__init__.py +0 -0
  37. ommlds/backends/transformers/filecache.py +109 -0
  38. ommlds/backends/transformers/streamers.py +73 -0
  39. ommlds/cli/asyncs.py +30 -0
  40. ommlds/cli/backends/catalog.py +93 -0
  41. ommlds/cli/backends/configs.py +9 -0
  42. ommlds/cli/backends/inject.py +31 -36
  43. ommlds/cli/backends/injection.py +16 -0
  44. ommlds/cli/backends/types.py +46 -0
  45. ommlds/cli/content/__init__.py +0 -0
  46. ommlds/cli/content/messages.py +34 -0
  47. ommlds/cli/content/strings.py +42 -0
  48. ommlds/cli/inject.py +15 -32
  49. ommlds/cli/inputs/__init__.py +0 -0
  50. ommlds/cli/inputs/asyncs.py +32 -0
  51. ommlds/cli/inputs/sync.py +75 -0
  52. ommlds/cli/main.py +270 -110
  53. ommlds/cli/rendering/__init__.py +0 -0
  54. ommlds/cli/rendering/configs.py +9 -0
  55. ommlds/cli/rendering/inject.py +31 -0
  56. ommlds/cli/rendering/markdown.py +52 -0
  57. ommlds/cli/rendering/raw.py +73 -0
  58. ommlds/cli/rendering/types.py +21 -0
  59. ommlds/cli/secrets.py +21 -0
  60. ommlds/cli/sessions/base.py +1 -1
  61. ommlds/cli/sessions/chat/chat/__init__.py +0 -0
  62. ommlds/cli/sessions/chat/chat/ai/__init__.py +0 -0
  63. ommlds/cli/sessions/chat/chat/ai/configs.py +11 -0
  64. ommlds/cli/sessions/chat/chat/ai/inject.py +74 -0
  65. ommlds/cli/sessions/chat/chat/ai/injection.py +14 -0
  66. ommlds/cli/sessions/chat/chat/ai/rendering.py +70 -0
  67. ommlds/cli/sessions/chat/chat/ai/services.py +79 -0
  68. ommlds/cli/sessions/chat/chat/ai/tools.py +44 -0
  69. ommlds/cli/sessions/chat/chat/ai/types.py +28 -0
  70. ommlds/cli/sessions/chat/chat/state/__init__.py +0 -0
  71. ommlds/cli/sessions/chat/chat/state/configs.py +11 -0
  72. ommlds/cli/sessions/chat/chat/state/inject.py +36 -0
  73. ommlds/cli/sessions/chat/chat/state/inmemory.py +33 -0
  74. ommlds/cli/sessions/chat/chat/state/storage.py +52 -0
  75. ommlds/cli/sessions/chat/chat/state/types.py +38 -0
  76. ommlds/cli/sessions/chat/chat/user/__init__.py +0 -0
  77. ommlds/cli/sessions/chat/chat/user/configs.py +17 -0
  78. ommlds/cli/sessions/chat/chat/user/inject.py +62 -0
  79. ommlds/cli/sessions/chat/chat/user/interactive.py +31 -0
  80. ommlds/cli/sessions/chat/chat/user/oneshot.py +25 -0
  81. ommlds/cli/sessions/chat/chat/user/types.py +15 -0
  82. ommlds/cli/sessions/chat/configs.py +27 -0
  83. ommlds/cli/sessions/chat/driver.py +43 -0
  84. ommlds/cli/sessions/chat/inject.py +33 -65
  85. ommlds/cli/sessions/chat/phases/__init__.py +0 -0
  86. ommlds/cli/sessions/chat/phases/inject.py +27 -0
  87. ommlds/cli/sessions/chat/phases/injection.py +14 -0
  88. ommlds/cli/sessions/chat/phases/manager.py +29 -0
  89. ommlds/cli/sessions/chat/phases/types.py +29 -0
  90. ommlds/cli/sessions/chat/session.py +27 -0
  91. ommlds/cli/sessions/chat/tools/__init__.py +0 -0
  92. ommlds/cli/sessions/chat/tools/configs.py +22 -0
  93. ommlds/cli/sessions/chat/tools/confirmation.py +46 -0
  94. ommlds/cli/sessions/chat/tools/execution.py +66 -0
  95. ommlds/cli/sessions/chat/tools/fs/__init__.py +0 -0
  96. ommlds/cli/sessions/chat/tools/fs/configs.py +12 -0
  97. ommlds/cli/sessions/chat/tools/fs/inject.py +35 -0
  98. ommlds/cli/sessions/chat/tools/inject.py +88 -0
  99. ommlds/cli/sessions/chat/tools/injection.py +44 -0
  100. ommlds/cli/sessions/chat/tools/rendering.py +58 -0
  101. ommlds/cli/sessions/chat/tools/todo/__init__.py +0 -0
  102. ommlds/cli/sessions/chat/tools/todo/configs.py +12 -0
  103. ommlds/cli/sessions/chat/tools/todo/inject.py +31 -0
  104. ommlds/cli/sessions/chat/tools/weather/__init__.py +0 -0
  105. ommlds/cli/sessions/chat/tools/weather/configs.py +12 -0
  106. ommlds/cli/sessions/chat/tools/weather/inject.py +22 -0
  107. ommlds/cli/{tools/weather.py → sessions/chat/tools/weather/tools.py} +1 -1
  108. ommlds/cli/sessions/completion/configs.py +21 -0
  109. ommlds/cli/sessions/completion/inject.py +42 -0
  110. ommlds/cli/sessions/completion/session.py +35 -0
  111. ommlds/cli/sessions/embedding/configs.py +21 -0
  112. ommlds/cli/sessions/embedding/inject.py +42 -0
  113. ommlds/cli/sessions/embedding/session.py +33 -0
  114. ommlds/cli/sessions/inject.py +28 -11
  115. ommlds/cli/state/__init__.py +0 -0
  116. ommlds/cli/state/inject.py +28 -0
  117. ommlds/cli/{state.py → state/storage.py} +41 -24
  118. ommlds/minichain/__init__.py +84 -24
  119. ommlds/minichain/_marshal.py +49 -9
  120. ommlds/minichain/_typedvalues.py +2 -4
  121. ommlds/minichain/backends/catalogs/base.py +20 -1
  122. ommlds/minichain/backends/catalogs/simple.py +2 -2
  123. ommlds/minichain/backends/catalogs/strings.py +10 -8
  124. ommlds/minichain/backends/impls/anthropic/chat.py +65 -27
  125. ommlds/minichain/backends/impls/anthropic/names.py +10 -8
  126. ommlds/minichain/backends/impls/anthropic/protocol.py +109 -0
  127. ommlds/minichain/backends/impls/anthropic/stream.py +111 -43
  128. ommlds/minichain/backends/impls/duckduckgo/search.py +1 -1
  129. ommlds/minichain/backends/impls/dummy/__init__.py +0 -0
  130. ommlds/minichain/backends/impls/dummy/chat.py +69 -0
  131. ommlds/minichain/backends/impls/google/chat.py +114 -22
  132. ommlds/minichain/backends/impls/google/search.py +7 -2
  133. ommlds/minichain/backends/impls/google/stream.py +219 -0
  134. ommlds/minichain/backends/impls/google/tools.py +149 -0
  135. ommlds/minichain/backends/impls/groq/__init__.py +0 -0
  136. ommlds/minichain/backends/impls/groq/chat.py +75 -0
  137. ommlds/minichain/backends/impls/groq/names.py +48 -0
  138. ommlds/minichain/backends/impls/groq/protocol.py +143 -0
  139. ommlds/minichain/backends/impls/groq/stream.py +125 -0
  140. ommlds/minichain/backends/impls/llamacpp/chat.py +33 -18
  141. ommlds/minichain/backends/impls/llamacpp/completion.py +1 -1
  142. ommlds/minichain/backends/impls/llamacpp/format.py +4 -2
  143. ommlds/minichain/backends/impls/llamacpp/stream.py +37 -20
  144. ommlds/minichain/backends/impls/mistral.py +20 -5
  145. ommlds/minichain/backends/impls/mlx/chat.py +96 -22
  146. ommlds/minichain/backends/impls/ollama/__init__.py +0 -0
  147. ommlds/minichain/backends/impls/ollama/chat.py +199 -0
  148. ommlds/minichain/backends/impls/openai/chat.py +18 -8
  149. ommlds/minichain/backends/impls/openai/completion.py +10 -3
  150. ommlds/minichain/backends/impls/openai/embedding.py +10 -3
  151. ommlds/minichain/backends/impls/openai/format.py +131 -106
  152. ommlds/minichain/backends/impls/openai/names.py +31 -5
  153. ommlds/minichain/backends/impls/openai/stream.py +43 -25
  154. ommlds/minichain/backends/impls/tavily.py +66 -0
  155. ommlds/minichain/backends/impls/tinygrad/chat.py +23 -16
  156. ommlds/minichain/backends/impls/transformers/sentence.py +1 -1
  157. ommlds/minichain/backends/impls/transformers/tokens.py +1 -1
  158. ommlds/minichain/backends/impls/transformers/transformers.py +155 -34
  159. ommlds/minichain/backends/strings/parsing.py +1 -1
  160. ommlds/minichain/backends/strings/resolving.py +4 -1
  161. ommlds/minichain/chat/_marshal.py +16 -9
  162. ommlds/minichain/chat/choices/adapters.py +4 -4
  163. ommlds/minichain/chat/choices/services.py +1 -1
  164. ommlds/minichain/chat/choices/stream/__init__.py +0 -0
  165. ommlds/minichain/chat/choices/stream/adapters.py +35 -0
  166. ommlds/minichain/chat/choices/stream/joining.py +31 -0
  167. ommlds/minichain/chat/choices/stream/services.py +45 -0
  168. ommlds/minichain/chat/choices/stream/types.py +43 -0
  169. ommlds/minichain/chat/choices/types.py +2 -2
  170. ommlds/minichain/chat/history.py +3 -3
  171. ommlds/minichain/chat/messages.py +55 -19
  172. ommlds/minichain/chat/services.py +3 -3
  173. ommlds/minichain/chat/stream/_marshal.py +16 -0
  174. ommlds/minichain/chat/stream/joining.py +85 -0
  175. ommlds/minichain/chat/stream/services.py +15 -21
  176. ommlds/minichain/chat/stream/types.py +32 -19
  177. ommlds/minichain/chat/tools/execution.py +8 -7
  178. ommlds/minichain/chat/tools/ids.py +9 -15
  179. ommlds/minichain/chat/tools/parsing.py +17 -26
  180. ommlds/minichain/chat/transforms/base.py +29 -38
  181. ommlds/minichain/chat/transforms/metadata.py +30 -4
  182. ommlds/minichain/chat/transforms/services.py +9 -11
  183. ommlds/minichain/content/_marshal.py +44 -20
  184. ommlds/minichain/content/json.py +13 -0
  185. ommlds/minichain/content/materialize.py +14 -21
  186. ommlds/minichain/content/prepare.py +4 -0
  187. ommlds/minichain/content/transforms/interleave.py +1 -1
  188. ommlds/minichain/content/transforms/squeeze.py +1 -1
  189. ommlds/minichain/content/transforms/stringify.py +1 -1
  190. ommlds/minichain/json.py +20 -0
  191. ommlds/minichain/lib/code/__init__.py +0 -0
  192. ommlds/minichain/lib/code/prompts.py +6 -0
  193. ommlds/minichain/lib/fs/binfiles.py +108 -0
  194. ommlds/minichain/lib/fs/context.py +126 -0
  195. ommlds/minichain/lib/fs/errors.py +101 -0
  196. ommlds/minichain/lib/fs/suggestions.py +36 -0
  197. ommlds/minichain/lib/fs/tools/__init__.py +0 -0
  198. ommlds/minichain/lib/fs/tools/edit.py +104 -0
  199. ommlds/minichain/lib/fs/tools/ls.py +38 -0
  200. ommlds/minichain/lib/fs/tools/read.py +115 -0
  201. ommlds/minichain/lib/fs/tools/recursivels/__init__.py +0 -0
  202. ommlds/minichain/lib/fs/tools/recursivels/execution.py +40 -0
  203. ommlds/minichain/lib/todo/__init__.py +0 -0
  204. ommlds/minichain/lib/todo/context.py +54 -0
  205. ommlds/minichain/lib/todo/tools/__init__.py +0 -0
  206. ommlds/minichain/lib/todo/tools/read.py +44 -0
  207. ommlds/minichain/lib/todo/tools/write.py +335 -0
  208. ommlds/minichain/lib/todo/types.py +60 -0
  209. ommlds/minichain/llms/_marshal.py +25 -17
  210. ommlds/minichain/llms/types.py +4 -0
  211. ommlds/minichain/registries/globals.py +18 -4
  212. ommlds/minichain/resources.py +66 -43
  213. ommlds/minichain/search.py +1 -1
  214. ommlds/minichain/services/_marshal.py +46 -39
  215. ommlds/minichain/services/facades.py +3 -3
  216. ommlds/minichain/services/services.py +1 -1
  217. ommlds/minichain/standard.py +8 -0
  218. ommlds/minichain/stream/services.py +152 -38
  219. ommlds/minichain/stream/wrap.py +22 -24
  220. ommlds/minichain/tools/_marshal.py +1 -1
  221. ommlds/minichain/tools/execution/catalog.py +2 -1
  222. ommlds/minichain/tools/execution/context.py +34 -14
  223. ommlds/minichain/tools/execution/errors.py +15 -0
  224. ommlds/minichain/tools/execution/executors.py +8 -3
  225. ommlds/minichain/tools/execution/reflect.py +40 -5
  226. ommlds/minichain/tools/fns.py +46 -9
  227. ommlds/minichain/tools/jsonschema.py +14 -5
  228. ommlds/minichain/tools/reflect.py +54 -18
  229. ommlds/minichain/tools/types.py +33 -1
  230. ommlds/minichain/utils.py +27 -0
  231. ommlds/minichain/vectors/_marshal.py +11 -10
  232. ommlds/nanochat/LICENSE +21 -0
  233. ommlds/nanochat/__init__.py +0 -0
  234. ommlds/nanochat/rustbpe/LICENSE +21 -0
  235. ommlds/nanochat/tokenizers.py +406 -0
  236. ommlds/server/server.py +3 -3
  237. ommlds/specs/__init__.py +0 -0
  238. ommlds/specs/mcp/__init__.py +0 -0
  239. ommlds/specs/mcp/_marshal.py +23 -0
  240. ommlds/specs/mcp/protocol.py +266 -0
  241. ommlds/tools/git.py +27 -10
  242. ommlds/tools/ocr.py +8 -9
  243. ommlds/wiki/analyze.py +2 -2
  244. ommlds/wiki/text/mfh.py +1 -5
  245. ommlds/wiki/text/wtp.py +1 -3
  246. ommlds/wiki/utils/xml.py +5 -5
  247. {ommlds-0.0.0.dev436.dist-info → ommlds-0.0.0.dev480.dist-info}/METADATA +24 -21
  248. ommlds-0.0.0.dev480.dist-info/RECORD +427 -0
  249. ommlds/cli/backends/standard.py +0 -20
  250. ommlds/cli/sessions/chat/base.py +0 -42
  251. ommlds/cli/sessions/chat/interactive.py +0 -73
  252. ommlds/cli/sessions/chat/printing.py +0 -96
  253. ommlds/cli/sessions/chat/prompt.py +0 -143
  254. ommlds/cli/sessions/chat/state.py +0 -109
  255. ommlds/cli/sessions/chat/tools.py +0 -91
  256. ommlds/cli/sessions/completion/completion.py +0 -44
  257. ommlds/cli/sessions/embedding/embedding.py +0 -42
  258. ommlds/cli/tools/config.py +0 -13
  259. ommlds/cli/tools/inject.py +0 -64
  260. ommlds/minichain/chat/stream/adapters.py +0 -69
  261. ommlds/minichain/lib/fs/ls/execution.py +0 -32
  262. ommlds-0.0.0.dev436.dist-info/RECORD +0 -303
  263. /ommlds/{cli/tools → backends/google}/__init__.py +0 -0
  264. /ommlds/{minichain/lib/fs/ls → backends/groq}/__init__.py +0 -0
  265. /ommlds/{huggingface.py → backends/huggingface.py} +0 -0
  266. /ommlds/minichain/lib/fs/{ls → tools/recursivels}/rendering.py +0 -0
  267. /ommlds/minichain/lib/fs/{ls → tools/recursivels}/running.py +0 -0
  268. {ommlds-0.0.0.dev436.dist-info → ommlds-0.0.0.dev480.dist-info}/WHEEL +0 -0
  269. {ommlds-0.0.0.dev436.dist-info → ommlds-0.0.0.dev480.dist-info}/entry_points.txt +0 -0
  270. {ommlds-0.0.0.dev436.dist-info → ommlds-0.0.0.dev480.dist-info}/licenses/LICENSE +0 -0
  271. {ommlds-0.0.0.dev436.dist-info → ommlds-0.0.0.dev480.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 Andrej Karpathy
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
@@ -0,0 +1,406 @@
+ # https://github.com/karpathy/nanochat/tree/9467d83cf23dcc9a9b4ca6e35103142f48a55b27
+ """
+ BPE Tokenizer in the style of GPT-4.
+
+ Two implementations are available:
+ 1) HuggingFace Tokenizer that can do both training and inference but is really confusing
+ 2) Our own RustBPE Tokenizer for training and tiktoken for efficient inference
+ """
+ import copy
+ import os
+ import pickle
+ import typing as ta
+
+ from omlish import check
+ from omlish import collections as col
+ from omlish import lang
+
+
+ with lang.auto_proxy_import(globals()):
+     import tiktoken
+     import tokenizers
+
+
+ rustbpe: ta.Any = lang.proxy_import('.rustbpe', __package__)
+
+
+ ##
+
+
+ SPECIAL_TOKENS = [
+     # every document begins with the Beginning of Sequence (BOS) token that delimits documents
+     '<|bos|>',
+     # tokens below are only used during finetuning to render Conversations into token ids
+     '<|user_start|>',  # user messages
+     '<|user_end|>',
+     '<|assistant_start|>',  # assistant messages
+     '<|assistant_end|>',
+     '<|python_start|>',  # assistant invokes python REPL tool
+     '<|python_end|>',
+     '<|output_start|>',  # python REPL outputs back to assistant
+     '<|output_end|>',
+ ]
+
+
+ # NOTE: this split pattern deviates from GPT-4 in that we use \p{N}{1,2} instead of \p{N}{1,3}
+ # I did this because I didn't want to "waste" too many tokens on numbers for smaller vocab sizes.
+ # I haven't validated that this is actually a good idea, TODO.
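+ # (for illustration: with {1,2} a digit run like '12345' pre-tokenizes as '12'|'34'|'5',
+ # whereas GPT-4's {1,3} would give '123'|'45')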
+ SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,2}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""  # noqa
+
+
+ # -----------------------------------------------------------------------------
+ # Generic GPT-4-style tokenizer based on HuggingFace Tokenizer
+
+
+ class HuggingFaceTokenizer:
+     """Light wrapper around HuggingFace Tokenizer for some utilities"""
+
+     def __init__(self, tokenizer):
+         self.tokenizer = tokenizer
+
+     @classmethod
+     def from_pretrained(cls, hf_path):
+         # init from a HuggingFace pretrained tokenizer (e.g. "gpt2")
+         tokenizer = tokenizers.Tokenizer.from_pretrained(hf_path)
+         return cls(tokenizer)
+
+     @classmethod
+     def from_directory(cls, tokenizer_dir):
+         # init from a local directory on disk (e.g. "out/tokenizer")
+         tokenizer_path = os.path.join(tokenizer_dir, 'tokenizer.json')
+         tokenizer = tokenizers.Tokenizer.from_file(tokenizer_path)
+         return cls(tokenizer)
+
+     @classmethod
+     def train_from_iterator(
+             cls,
+             text_iterator,
+             vocab_size,
+             *,
+             split_pattern=SPLIT_PATTERN,
+             special_tokens=SPECIAL_TOKENS,
+     ):
+         # train from an iterator of text
+         # Configure the HuggingFace Tokenizer
+         tokenizer = tokenizers.Tokenizer(tokenizers.models.BPE(
+             byte_fallback=True,  # needed!
+             unk_token=None,
+             fuse_unk=False,
+         ))
+         # Normalizer: None
+         tokenizer.normalizer = None
+         # Pre-tokenizer: GPT-4 style
+         # the regex pattern used by GPT-4 to split text into groups before BPE
+         # NOTE: The pattern was changed from \p{N}{1,3} to \p{N}{1,2} because I suspect it is harmful to
+         # very small models and smaller vocab sizes, because it is a little bit wasteful in the token space.
+         # (but I haven't validated this! TODO)
+         gpt4_split_regex = tokenizers.Regex(split_pattern)  # huggingface demands that you wrap it in Regex!!
+         tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Sequence([
+             tokenizers.pre_tokenizers.Split(pattern=gpt4_split_regex, behavior='isolated', invert=False),
+             tokenizers.pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=False),
+         ])
+         # Decoder: ByteLevel (it pairs together with the ByteLevel pre-tokenizer)
+         tokenizer.decoder = tokenizers.decoders.ByteLevel()
+         # Post-processor: None
+         tokenizer.post_processor = None
+         # Trainer: BPE
+         trainer = tokenizers.trainers.BpeTrainer(
+             vocab_size=vocab_size,
+             show_progress=True,
+             min_frequency=0,  # no minimum frequency
+             initial_alphabet=tokenizers.pre_tokenizers.ByteLevel.alphabet(),
+             special_tokens=special_tokens,
+         )
+         # Kick off the training
+         tokenizer.train_from_iterator(text_iterator, trainer)
+         return cls(tokenizer)
+
+     def encode_ordinary(self, text):
+         ids = self.tokenizer.encode(text, add_special_tokens=False).ids
+         return ids
+
+     def get_vocab_size(self):
+         return self.tokenizer.get_vocab_size()
+
+     def get_special_tokens(self):
+         special_tokens_map = self.tokenizer.get_added_tokens_decoder()
+         special_tokens = [w.content for w in special_tokens_map.values()]
+         return special_tokens
+
+     def id_to_token(self, id):  # noqa
+         return self.tokenizer.id_to_token(id)
+
+     def _encode_one(self, text, prepend=None, append=None):
+         # encode a single string
+         # prepend/append can be either a string of a special token or a token id directly.
+         check.isinstance(text, str)
+         ids = []
+         if prepend is not None:
+             prepend_id = prepend if isinstance(prepend, int) else self.encode_special(prepend)
+             ids.append(prepend_id)
+         ids.extend(self.tokenizer.encode(text, add_special_tokens=False).ids)
+         if append is not None:
+             append_id = append if isinstance(append, int) else self.encode_special(append)
+             ids.append(append_id)
+         return ids
+
+     def encode_special(self, text):
+         # encode a single special token via exact match
+         return self.tokenizer.token_to_id(text)
+
+     def get_bos_token_id(self):
+         bos = self.encode_special('<|bos|>')
+         return bos
+
+     def encode(self, text, *args, **kwargs):
+         if isinstance(text, str):
+             return self._encode_one(text, *args, **kwargs)
+         elif isinstance(text, list):
+             return [self._encode_one(t, *args, **kwargs) for t in text]
+         else:
+             raise ValueError(f'Invalid input type: {type(text)}')  # noqa
+
+     def __call__(self, *args, **kwargs):
+         return self.encode(*args, **kwargs)
+
+     def decode(self, ids):
+         return self.tokenizer.decode(ids, skip_special_tokens=False)
+
+     def save(self, tokenizer_dir):
+         # save the tokenizer to disk
+         os.makedirs(tokenizer_dir, exist_ok=True)
+         tokenizer_path = os.path.join(tokenizer_dir, 'tokenizer.json')
+         self.tokenizer.save(tokenizer_path)
+         print(f'Saved tokenizer to {tokenizer_path}')
+
+
+ # -----------------------------------------------------------------------------
+ # Tokenizer based on rustbpe + tiktoken combo
+
+
+ class RustBPETokenizer:
+     """Light wrapper around tiktoken (for efficient inference) but train with rustbpe"""
+
+     def __init__(self, enc, bos_token):
+         self.enc = enc
+         self.bos_token_id = self.encode_special(bos_token)
+
+     @classmethod
+     def train_from_iterator(cls, text_iterator, vocab_size):
+         # 1) train using rustbpe
+         tokenizer = rustbpe.Tokenizer()
+         # the special tokens are inserted later in __init__, we don't train them here
+         vocab_size_no_special = vocab_size - len(SPECIAL_TOKENS)
+         check.state(vocab_size_no_special >= 256, f'vocab_size_no_special must be at least 256, got {vocab_size_no_special}')  # noqa
+         tokenizer.train_from_iterator(text_iterator, vocab_size_no_special, pattern=SPLIT_PATTERN)
+         # 2) construct the associated tiktoken encoding for inference
+         pattern = tokenizer.get_pattern()
+         mergeable_ranks_list = tokenizer.get_mergeable_ranks()
+         mergeable_ranks = {bytes(k): v for k, v in mergeable_ranks_list}
+         tokens_offset = len(mergeable_ranks)
+         special_tokens = {name: tokens_offset + i for i, name in enumerate(SPECIAL_TOKENS)}
+         enc = tiktoken.Encoding(
+             name='rustbpe',
+             pat_str=pattern,
+             mergeable_ranks=mergeable_ranks,  # dict[bytes, int] (token bytes -> merge priority rank)
+             special_tokens=special_tokens,  # dict[str, int] (special token name -> token id)
+         )
+         return cls(enc, '<|bos|>')
+
+     @classmethod
+     def from_directory(cls, tokenizer_dir):
+         pickle_path = os.path.join(tokenizer_dir, 'tokenizer.pkl')
+         with open(pickle_path, 'rb') as f:
+             enc = pickle.load(f)  # noqa
+         return cls(enc, '<|bos|>')
+
+     @classmethod
+     def from_pretrained(cls, tiktoken_name):
+         # https://github.com/openai/tiktoken/blob/eedc8563/tiktoken_ext/openai_public.py
+         enc = tiktoken.get_encoding(tiktoken_name)
+         # tiktoken calls the special document delimiter token "<|endoftext|>"
+         # yes this is confusing because this token is almost always PREPENDED to the beginning of the document
+         # it most often is used to signal the start of a new sequence to the LLM during inference etc.
+         # so in nanoChat we always use "<|bos|>" short for "beginning of sequence", but historically it is often called
+         # "<|endoftext|>".
+         return cls(enc, '<|endoftext|>')
+
+     def get_vocab_size(self):
+         return self.enc.n_vocab
+
+     def get_special_tokens(self):
+         return self.enc.special_tokens_set
+
+     def id_to_token(self, id):  # noqa
+         return self.enc.decode([id])
+
+     @col.cache.cache(max_size=32)
+     def encode_special(self, text):
+         return self.enc.encode_single_token(text)
+
+     def get_bos_token_id(self):
+         return self.bos_token_id
+
+     def encode(self, text, prepend=None, append=None, num_threads=8):
+         # text can be either a string or a list of strings
+
+         if prepend is not None:
+             prepend_id = prepend if isinstance(prepend, int) else self.encode_special(prepend)
+         if append is not None:
+             append_id = append if isinstance(append, int) else self.encode_special(append)
+
+         if isinstance(text, str):
+             ids = self.enc.encode_ordinary(text)
+             if prepend is not None:
+                 ids.insert(0, prepend_id)  # TODO: slightly inefficient here? :( hmm
+             if append is not None:
+                 ids.append(append_id)
+         elif isinstance(text, list):
+             ids = self.enc.encode_ordinary_batch(text, num_threads=num_threads)
+             if prepend is not None:
+                 for ids_row in ids:
+                     ids_row.insert(0, prepend_id)  # TODO: same
+             if append is not None:
+                 for ids_row in ids:
+                     ids_row.append(append_id)
+         else:
+             raise ValueError(f'Invalid input type: {type(text)}')  # noqa
+
+         return ids
+
+     def __call__(self, *args, **kwargs):
+         return self.encode(*args, **kwargs)
+
+     def decode(self, ids):
+         return self.enc.decode(ids)
+
+     def save(self, tokenizer_dir):
+         # save the encoding object to disk
+         os.makedirs(tokenizer_dir, exist_ok=True)
+         pickle_path = os.path.join(tokenizer_dir, 'tokenizer.pkl')
+         with open(pickle_path, 'wb') as f:
+             pickle.dump(self.enc, f)
+         print(f'Saved tokenizer encoding to {pickle_path}')
+
+     def render_conversation(self, conversation, max_tokens=2048):
+         """
+         Tokenize a single Chat conversation (which we call a "doc" or "document" here).
+         Returns:
+         - ids: list[int] is a list of token ids of this rendered conversation
+         - mask: list[int] of same length, mask = 1 for tokens that the Assistant is expected to train on.
+         """
+
+         # ids, masks that we will return and a helper function to help build them up.
+         ids, mask = [], []
+
+         def add_tokens(token_ids, mask_val):
+             if isinstance(token_ids, int):
+                 token_ids = [token_ids]
+             ids.extend(token_ids)
+             mask.extend([mask_val] * len(token_ids))
+
+         # sometimes the first message is a system message...
+         # => just merge it with the second (user) message
+         if conversation['messages'][0]['role'] == 'system':
+             # some conversation surgery is necessary here for now...
+             conversation = copy.deepcopy(conversation)  # avoid mutating the original
+             messages = conversation['messages']
+             check.state(messages[1]['role'] == 'user', 'System message must be followed by a user message')
+             messages[1]['content'] = messages[0]['content'] + '\n\n' + messages[1]['content']
+             messages = messages[1:]
+         else:
+             messages = conversation['messages']
+         check.state(len(messages) >= 1, f'Conversation has less than 1 message: {messages}')
+
+         # fetch all the special tokens we need
+         bos = self.get_bos_token_id()
+         user_start, user_end = self.encode_special('<|user_start|>'), self.encode_special('<|user_end|>')
+         assistant_start, assistant_end = self.encode_special('<|assistant_start|>'), self.encode_special('<|assistant_end|>')  # noqa
+         python_start, python_end = self.encode_special('<|python_start|>'), self.encode_special('<|python_end|>')
+         output_start, output_end = self.encode_special('<|output_start|>'), self.encode_special('<|output_end|>')
+
+         # now we can tokenize the conversation
+         add_tokens(bos, 0)
+         for i, message in enumerate(messages):
+             # some sanity checking here around assumptions, to prevent footguns
+             must_be_from = 'user' if i % 2 == 0 else 'assistant'
+             check.state(message['role'] == must_be_from, f"Message {i} is from {message['role']} but should be from {must_be_from}")  # noqa
+
+             # content can be either a simple string or a list of parts (e.g. containing tool calls)
+             content = message['content']
+
+             if message['role'] == 'user':
+                 check.isinstance(content, str)  # user messages are simply expected to be strings
+                 value_ids = self.encode(content)
+                 add_tokens(user_start, 0)
+                 add_tokens(value_ids, 0)
+                 add_tokens(user_end, 0)
+             elif message['role'] == 'assistant':
+                 add_tokens(assistant_start, 0)
+                 if isinstance(content, str):
+                     # simple string => simply add the tokens
+                     value_ids = self.encode(content)
+                     add_tokens(value_ids, 1)
+                 elif isinstance(content, list):
+                     for part in content:
+                         value_ids = self.encode(part['text'])
+                         if part['type'] == 'text':
+                             # string part => simply add the tokens
+                             add_tokens(value_ids, 1)
+                         elif part['type'] == 'python':
+                             # python tool call => add the tokens inside <|python_start|> and <|python_end|>
+                             add_tokens(python_start, 1)
+                             add_tokens(value_ids, 1)
+                             add_tokens(python_end, 1)
+                         elif part['type'] == 'python_output':
+                             # python output => add the tokens inside <|output_start|> and <|output_end|>
+                             # none of these tokens are supervised because the tokens come from Python at test time
+                             add_tokens(output_start, 0)
+                             add_tokens(value_ids, 0)
+                             add_tokens(output_end, 0)
+                         else:
+                             raise ValueError(f"Unknown part type: {part['type']}")
+                 else:
+                     raise ValueError(f'Unknown content type: {type(content)}')
+                 add_tokens(assistant_end, 1)
+
+         # truncate to max_tokens tokens MAX (helps prevent OOMs)
+         ids = ids[:max_tokens]
+         mask = mask[:max_tokens]
+         return ids, mask
+
+     def visualize_tokenization(self, ids, mask, with_token_id=False):
+         """Small helper function useful in debugging: visualize the tokenization of render_conversation"""
+
+         red = '\033[91m'
+         green = '\033[92m'
+         reset = '\033[0m'
+         gray = '\033[90m'
+         tokens = []
+         for i, (token_id, mask_val) in enumerate(zip(ids, mask)):  # noqa
+             token_str = self.decode([token_id])
+             color = green if mask_val == 1 else red
+             tokens.append(f'{color}{token_str}{reset}')
+             if with_token_id:
+                 tokens.append(f'{gray}({token_id}){reset}')
+         return '|'.join(tokens)
+
+     def render_for_completion(self, conversation):
+         """
+         Used during Reinforcement Learning. In that setting, we want to render the conversation priming the Assistant
+         for a completion. Unlike the Chat SFT case, we don't need to return the mask.
+         """
+
+         # We have some surgery to do: we need to pop the last message (of the Assistant)
+         conversation = copy.deepcopy(conversation)  # avoid mutating the original
+         messages = conversation['messages']
+         check.state(messages[-1]['role'] == 'assistant', 'Last message must be from the Assistant')
+         messages.pop()  # remove the last message (of the Assistant) inplace
+
+         # Now tokenize the conversation
+         ids, mask = self.render_conversation(conversation)
+
+         # Finally, to prime the Assistant for a completion, append the Assistant start token
+         assistant_start = self.encode_special('<|assistant_start|>')
+         ids.append(assistant_start)
+         return ids
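For orientation, here is a minimal usage sketch of the vendored tokenizer, assuming tiktoken is installed and using the stock 'gpt2' encoding (whose BOS token is spelled '<|endoftext|>'). render_conversation() is not exercised here because it needs the custom <|user_start|>/<|assistant_start|> specials, which exist only in a rustbpe-trained vocabulary:

```python
# Minimal sketch: encode/decode via the tiktoken-backed RustBPETokenizer.
from ommlds.nanochat.tokenizers import RustBPETokenizer

tok = RustBPETokenizer.from_pretrained('gpt2')  # BOS here is '<|endoftext|>'

# prepend/append accept either a special-token string or a raw token id
ids = tok.encode('hello world', prepend='<|endoftext|>')
assert ids[0] == tok.get_bos_token_id()

print(tok.get_vocab_size())  # 50257 for the gpt2 encoding
print(tok.decode(ids))       # '<|endoftext|>hello world'
```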
ommlds/server/server.py CHANGED
@@ -47,11 +47,11 @@ class McServerHandler(HttpHandler_):
 
          log.info('Server got prompt: %s', prompt)
 
-         resp = self.llm.invoke(mc.ChatChoicesRequest(
+         resp = lang.sync_await(self.llm.invoke(mc.ChatChoicesRequest(
              [mc.UserMessage(prompt)],
              # Temperature(.1),
-         ))
-         resp_txt = check.isinstance(resp.v[0].m.c, str)
+         )))
+         resp_txt = check.isinstance(check.isinstance(check.single(resp.v[0].ms), mc.AiMessage).c, str)
 
          log.info('Server got response: %s', resp_txt)
 
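The server.py change tracks two minichain API shifts visible elsewhere in this diff: service invocation is now awaitable (bridged here with lang.sync_await, since the handler is synchronous), and a chat choice now carries a list of messages (ms) where it previously held a single message (m). A sketch of the new unpacking, with names taken from the hunk above:

```python
# Sketch of the new response shape, inferred from the server.py hunk above
# (resp, check, and mc are the names already in scope in that handler).
choice = resp.v[0]                            # resp.v is the list of choices
msg = check.single(choice.ms)                 # .ms is now a list; expect exactly one message
ai_msg = check.isinstance(msg, mc.AiMessage)  # which should be an AiMessage
resp_txt = check.isinstance(ai_msg.c, str)    # with plain string content
```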
@@ -0,0 +1,23 @@
+ from omlish import lang
+ from omlish import marshal as msh
+
+ from .protocol import ContentBlock
+
+
+ ##
+
+
+ @lang.static_init
+ def _install_standard_marshaling() -> None:
+     for root_cls, tag_field in [
+         (ContentBlock, 'type'),
+     ]:
+         msh.install_standard_factories(*msh.standard_polymorphism_factories(
+             msh.polymorphism_from_subclasses(
+                 root_cls,
+                 naming=msh.Naming.SNAKE,
+                 strip_suffix=msh.AutoStripSuffix,
+             ),
+             msh.FieldTypeTagging(tag_field),
+             unions='partial',
+         ))
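This _marshal module registers polymorphic marshaling for ContentBlock at import time (via @lang.static_init): subclasses are keyed by a 'type' tag field, named by snake-casing the class name and stripping its common suffix. A hedged sketch of the resulting wire format, where the 'text' tag and payload field are illustrative assumptions rather than names taken from the protocol module in this diff:

```python
# Hedged sketch; the 'text' tag and 'text' payload field are assumed
# for illustration, not taken from the protocol module in this diff.
from omlish import marshal as msh

# unmarshal dispatches on the 'type' field to the matching ContentBlock subclass
blk = msh.unmarshal({'type': 'text', 'text': 'hi'}, ContentBlock)

# marshal re-embeds the subclass tag into the same 'type' field
assert msh.marshal(blk, ContentBlock) == {'type': 'text', 'text': 'hi'}
```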