PyPI - ommlds - Versions diffs - 0.0.0.dev332__py3-none-any.whl → 0.0.0.dev334__py3-none-any.whl - Mend

ommlds 0.0.0.dev332__py3-none-any.whl → 0.0.0.dev334__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

ommlds/backends/tinygrad/models/llama3/__main__.py +1 -1
ommlds/backends/tinygrad/models/llama3/attention.py +62 -27
ommlds/backends/tinygrad/models/llama3/{repl.py → cli.py} +56 -22
ommlds/backends/tinygrad/models/llama3/loading.py +25 -20
ommlds/backends/tinygrad/models/llama3/transformer.py +5 -7
ommlds/cli/main.py +1 -1
ommlds/cli/sessions/chat.py +1 -2
ommlds/cli/state.py +1 -1
ommlds/minichain/__init__.py +91 -46
ommlds/minichain/_typedvalues.py +93 -0
ommlds/minichain/backends/tinygrad/chat.py +1 -1
ommlds/minichain/backends/transformers/sentence.py +2 -2
ommlds/minichain/chat/messages.py +44 -2
ommlds/minichain/chat/metadata.py +16 -0
ommlds/minichain/content/{marshal.py → _marshal.py} +17 -6
ommlds/minichain/content/content.py +0 -6
ommlds/minichain/content/images.py +2 -2
ommlds/minichain/content/list.py +15 -0
ommlds/minichain/content/metadata.py +16 -0
ommlds/minichain/content/simple.py +38 -0
ommlds/minichain/content/text.py +12 -0
ommlds/minichain/content/transforms.py +16 -2
ommlds/minichain/metadata.py +48 -0
ommlds/minichain/services/_marshal.py +48 -6
ommlds/minichain/services/_typedvalues.py +0 -33
ommlds/minichain/services/requests.py +5 -2
ommlds/minichain/services/responses.py +5 -2
{ommlds-0.0.0.dev332.dist-info → ommlds-0.0.0.dev334.dist-info}/METADATA +3 -3
{ommlds-0.0.0.dev332.dist-info → ommlds-0.0.0.dev334.dist-info}/RECORD +36 -29
/ommlds/minichain/chat/{marshal.py → _marshal.py} +0 -0
/ommlds/minichain/llms/{marshal.py → _marshal.py} +0 -0
/ommlds/minichain/tools/{marshal.py → _marshal.py} +0 -0
{ommlds-0.0.0.dev332.dist-info → ommlds-0.0.0.dev334.dist-info}/WHEEL +0 -0
{ommlds-0.0.0.dev332.dist-info → ommlds-0.0.0.dev334.dist-info}/entry_points.txt +0 -0
{ommlds-0.0.0.dev332.dist-info → ommlds-0.0.0.dev334.dist-info}/licenses/LICENSE +0 -0
{ommlds-0.0.0.dev332.dist-info → ommlds-0.0.0.dev334.dist-info}/top_level.txt +0 -0

ommlds/backends/tinygrad/models/llama3/__main__.py CHANGED Viewed

@@ -1,4 +1,4 @@
 if __name__ == '__main__':
-    from .repl import _main
+    from .cli import _main
     _main()

ommlds/backends/tinygrad/models/llama3/attention.py CHANGED Viewed

@@ -57,8 +57,8 @@ class Attention:
             self,
             dim,
             n_heads,
-            n_kv_heads,
-            max_context,
+            n_kv_heads=None,
+            max_context=0,
             linear=nn.Linear,
             qk_norm: float | None = None,
     ) -> None:
@@ -85,7 +85,7 @@ class Attention:
             x: Tensor,
             start_pos: Variable_ | int,
             freqs_cis: Tensor,
-            mask: Tensor | None,
+            mask: Tensor | None = None,
     ) -> Tensor:
         if getenv('WQKV'):
             if not hasattr(self, 'wqkv'):
@@ -114,36 +114,71 @@ class Attention:
         bsz, seqlen, _, _ = xq.shape
         # create kv cache
-        if not hasattr(self, 'cache_kv'):
-            self.cache_kv = (
-                Tensor.zeros(
-                    2,
-                    bsz,
-                    self.max_context,
-                    self.n_kv_heads,
-                    self.head_dim,
-                    dtype=x.dtype,
+        # if not hasattr(self, 'cache_kv'):
+        #     self.cache_kv = (
+        #         Tensor.zeros(
+        #             2,
+        #             bsz,
+        #             self.max_context,
+        #             self.n_kv_heads,
+        #             self.head_dim,
+        #             dtype=x.dtype,
+        #         )
+        #         .contiguous()
+        #         .realize()
+        #     )
+        #     if isinstance(x.device, tuple):
+        #         # TODO: instead of specifying how to shard, it can follow how xk and xv are being sharded
+        #         self.cache_kv.shard_(
+        #             (x.device), axis=3 if getenv('SHARD_KVCACHE') else None,
+        #         ).realize()
+        #
+        # # update the cache
+        # check.state(xk.dtype == xv.dtype == self.cache_kv.dtype, f'{xk.dtype=}, {xv.dtype=}, {self.cache_kv.dtype=}')
+        #
+        # self.cache_kv[:, :, start_pos:start_pos + seqlen, :, :].assign(Tensor.stack(xk, xv)).realize()
+        #
+        # keys = self.cache_kv[0, :, 0:start_pos + seqlen, :, :]
+        # values = self.cache_kv[1, :, 0:start_pos + seqlen, :, :]
+        if self.max_context:
+            if not hasattr(self, 'cache_kv'):
+                self.cache_kv = (
+                    Tensor.zeros(
+                        2,
+                        bsz,
+                        self.max_context,
+                        self.n_kv_heads,
+                        self.head_dim,
+                        dtype=x.dtype,
+                    )
+                    .contiguous()
+                    .realize()
                 )
-                .contiguous()
-                .realize()
+                if isinstance(x.device, tuple):
+                    # TODO: instead of specifying how to shard, it can follow how xk and xv are being sharded
+                    self.cache_kv.shard_(
+                        (x.device),
+                        axis=3 if getenv('SHARD_KVCACHE') else None,
+                    ).realize()
+            # update the cache
+            check.state(
+                xk.dtype == xv.dtype == self.cache_kv.dtype,
+                f'{xk.dtype=}, {xv.dtype=}, {self.cache_kv.dtype=}',
             )
-            if isinstance(x.device, tuple):
-                # TODO: instead of specifying how to shard, it can follow how xk and xv are being sharded
-                self.cache_kv.shard_(
-                    (x.device), axis=3 if getenv('SHARD_KVCACHE') else None,
-                ).realize()
+            self.cache_kv[:, :, start_pos:start_pos + seqlen, :, :].assign(Tensor.stack(xk, xv)).realize()
-        # update the cache
-        check.state(xk.dtype == xv.dtype == self.cache_kv.dtype, f'{xk.dtype=}, {xv.dtype=}, {self.cache_kv.dtype=}')
+            keys = self.cache_kv[0, :, 0:start_pos + seqlen, :, :]
+            values = self.cache_kv[1, :, 0:start_pos + seqlen, :, :]
-        self.cache_kv.shrink(
-            (None, None, (start_pos, start_pos + seqlen), None, None),
-        ).assign(Tensor.stack(xk, xv)).realize()
+        else:
+            check.state(start_pos == 0)
+            keys, values = xk, xv
-        keys = self.cache_kv[0].shrink((None, (0, start_pos + seqlen), None, None))
-        values = self.cache_kv[1].shrink((None, (0, start_pos + seqlen), None, None))
+        keys = repeat_kv(keys, self.n_rep)
+        values = repeat_kv(values, self.n_rep)
-        keys, values = repeat_kv(keys, self.n_rep), repeat_kv(values, self.n_rep)
         xq, keys, values = (
             xq.transpose(1, 2),
             keys.transpose(1, 2),

ommlds/backends/tinygrad/models/llama3/{repl.py → cli.py} RENAMED Viewed

@@ -1,5 +1,6 @@
 import argparse
 import pathlib
+import typing as ta
 from tinygrad import Tensor
@@ -12,33 +13,60 @@ from .llm import Llama3Llm
 ##
-def run_repl(llm: Llama3Llm) -> None:
-    prompt = [
-        llm.tokenizer.bos_id,
-        *llm.encode_message('system', 'You are an helpful assistant.'),
-    ]
+class _RunToStopResult(ta.NamedTuple):
+    start_pos: int
+    last_tok: int
-    start_pos = llm.prefill(prompt)
+def _run_to_stop(llm: Llama3Llm, start_pos: int, last_tok: int) -> _RunToStopResult:
     while True:
-        toks = llm.encode_message('user', input('Q: ')) + llm.encode_role('assistant')
+        tok = llm.feed(
+            [last_tok],
+            start_pos,
+        )
+        tok = tok.item()
+        start_pos += 1
+        last_tok = tok
+        if tok in llm.tokenizer.stop_tokens:
+            break
+        print(llm.tokenizer.decode([tok]), end='', flush=True)
-        start_pos = llm.prefill(toks[:-1], start_pos=start_pos)
-        last_tok = toks[-1]
-        while True:
-            tok = llm.feed(
-                [last_tok],
-                start_pos,
-            )
-            tok = tok.item()
+    print(flush=True)
-            start_pos += 1
-            last_tok = tok
-            if tok in llm.tokenizer.stop_tokens:
-                break
+    return _RunToStopResult(start_pos, last_tok)
-            print(llm.tokenizer.decode([tok]), end='', flush=True)
-        print(flush=True)
+def _run_new_toks(llm: Llama3Llm, toks: list[int], start_pos: int = 0) -> int:
+    start_pos = llm.prefill(toks[:-1], start_pos=start_pos)
+    last_tok = toks[-1]
+    return _run_to_stop(llm, start_pos, last_tok).start_pos
+#
+def run_prompt(llm: Llama3Llm, prompt: str) -> None:
+    _run_new_toks(llm,[
+        llm.tokenizer.bos_id,
+        *llm.encode_message('system', 'You are an helpful assistant.'),
+        *llm.encode_message('user', prompt),
+        *llm.encode_role('assistant'),
+    ])
+def run_repl(llm: Llama3Llm) -> None:
+    start_pos = llm.prefill([
+        llm.tokenizer.bos_id,
+        *llm.encode_message('system', 'You are an helpful assistant.'),
+    ])
+    while True:
+        start_pos = _run_new_toks(llm, [
+            *llm.encode_message('user', input('Q: ')),
+            *llm.encode_role('assistant'),
+        ], start_pos)
 ##
@@ -84,6 +112,9 @@ def _build_arg_parser() -> argparse.ArgumentParser:
         default=0.85,
         help='Temperature',
     )
+    parser.add_argument(
+        '--prompt',
+    )
     return parser
@@ -112,7 +143,10 @@ def _main() -> None:
         temperature=args.temperature,
     )
-    run_repl(llm)
+    if (prompt := args.prompt) is not None:
+        run_prompt(llm, prompt)
+    else:
+        run_repl(llm)
 if __name__ == '__main__':

ommlds/backends/tinygrad/models/llama3/loading.py CHANGED Viewed

@@ -27,7 +27,7 @@ from .transformer import Transformer
 # TODO: model shouldn't be an input here, and n_kv_heads should support None
 def convert_from_huggingface(
         weights: dict[str, Tensor],
-        model: Transformer,
+        n_layers: int,
         n_heads: int,
         n_kv_heads: int,
         permute_layers: bool = True,
@@ -50,35 +50,35 @@ def convert_from_huggingface(
         'model.embed_tokens.weight': 'tok_embeddings.weight',
         **{
             f'model.layers.{l}.input_layernorm.weight': f'layers.{l}.attention_norm.weight'
-            for l in range(len(model.layers))
+            for l in range(n_layers)
         },
         **{
             f'model.layers.{l}.self_attn.{x}_norm.weight': f'layers.{l}.attention.{x}_norm.weight'
             for x in ['q', 'k']
-            for l in range(len(model.layers))
+            for l in range(n_layers)
         },
         **{
             f'model.layers.{l}.self_attn.{x}_proj.weight': f'layers.{l}.attention.w{x}.weight'
             for x in ['q', 'k', 'v', 'o']
-            for l in range(len(model.layers))
+            for l in range(n_layers)
         },
         **{
             f'model.layers.{l}.self_attn.{x}_proj.bias': f'layers.{l}.attention.w{x}.bias'
             for x in ['q', 'k', 'v', 'o']
-            for l in range(len(model.layers))
+            for l in range(n_layers)
         },
         **{
             f'model.layers.{l}.post_attention_layernorm.weight': f'layers.{l}.ffn_norm.weight'
-            for l in range(len(model.layers))
+            for l in range(n_layers)
         },
         **{
             f'model.layers.{l}.mlp.{x}_proj.weight': f'layers.{l}.feed_forward.w{y}.weight'
             for x, y in {'gate': '1', 'down': '2', 'up': '3'}.items()
-            for l in range(len(model.layers))
+            for l in range(n_layers)
         },
         **{
             f'model.layers.{l}.mlp.gate.weight': f'layers.{l}.feed_forward.gate.weight'
-            for l in range(len(model.layers))
+            for l in range(n_layers)
         },
         'model.norm.weight': 'norm.weight',
         'lm_head.weight': 'output.weight',
@@ -107,31 +107,31 @@ def convert_from_huggingface(
 def convert_from_gguf(
         weights: dict[str, Tensor],
-        model: Transformer,
+        n_layers: int,
 ):
     keymap = {
         'token_embd.weight': 'tok_embeddings.weight',
         **{
             f'blk.{l}.attn_norm.weight': f'layers.{l}.attention_norm.weight'
-            for l in range(len(model.layers))
+            for l in range(n_layers)
         },
         **{
             f'blk.{l}.attn_{x}.weight': f'layers.{l}.attention.w{x}.weight'
             for x in ['q', 'k', 'v']
-            for l in range(len(model.layers))
+            for l in range(n_layers)
         },
         **{
             f'blk.{l}.attn_output.weight': f'layers.{l}.attention.wo.weight'
-            for l in range(len(model.layers))
+            for l in range(n_layers)
         },
         **{
             f'blk.{l}.ffn_norm.weight': f'layers.{l}.ffn_norm.weight'
-            for l in range(len(model.layers))
+            for l in range(n_layers)
         },
         **{
             f'blk.{l}.ffn_{x}.weight': f'layers.{l}.feed_forward.w{y}.weight'
             for x, y in {'gate': '1', 'down': '2', 'up': '3'}.items()
-            for l in range(len(model.layers))
+            for l in range(n_layers)
         },
         'output_norm.weight': 'norm.weight',
         'rope_freqs.weight': 'rope_freqs.weight',
@@ -269,8 +269,10 @@ def build_transformer(
     else:
         linear, embedding, quantize_embeds = nn.Linear, nn.Embedding, False
+    model_params = MODEL_PARAMS[model_size]
     model = Transformer(
-        **MODEL_PARAMS[model_size]['args'],
+        **model_params['args'],
         linear=linear,
         embedding=embedding,
         max_context=max_context,
@@ -292,7 +294,7 @@ def build_transformer(
             weights = concat_weights(
                 [
                     load(str(model_path / f'consolidated.{i:02d}.pth'))
-                    for i in range(MODEL_PARAMS[model_size]['files'])
+                    for i in range(model_params['files'])
                 ],
                 device[0] if isinstance(device, tuple) else device,
             )
@@ -303,13 +305,16 @@ def build_transformer(
     if 'model.embed_tokens.weight' in weights:
         weights = convert_from_huggingface(
             weights,
-            model,
-            MODEL_PARAMS[model_size]['args']['n_heads'],
-            MODEL_PARAMS[model_size]['args']['n_kv_heads'],
+            model_params['args']['n_layers'],
+            model_params['args']['n_heads'],
+            model_params['args']['n_kv_heads'],
         )
     elif 'token_embd.weight' in weights:
-        weights = convert_from_gguf(weights, model)
+        weights = convert_from_gguf(
+            weights,
+            model_params['args']['n_layers'],
+        )
     weights = fix_bf16(weights)

ommlds/backends/tinygrad/models/llama3/transformer.py CHANGED Viewed

@@ -130,10 +130,8 @@ class Transformer:
         _bsz, seqlen = tokens.shape
         h = self.tok_embeddings(tokens)
-        self.freqs_cis = self.freqs_cis.cast(h.dtype).realize()
-        freqs_cis = self.freqs_cis.shrink(
-            (None, (start_pos, start_pos + seqlen), None, None, None),
-        )
+        self.freqs_cis = self.freqs_cis.cast(h.dtype).kernelize()
+        freqs_cis = self.freqs_cis[:, start_pos:start_pos + seqlen, :, :, :]
         mask = (
             Tensor.full(
@@ -143,7 +141,7 @@ class Transformer:
                 device=h.device,
             )
             .triu(start_pos + 1)
-            .realize()
+            .kernelize()
         ) if seqlen > 1 else None
         for layer in self.layers:
@@ -152,7 +150,7 @@ class Transformer:
         return sample(
             logits.flatten(), temperature, top_k, top_p, alpha_f, alpha_p,
-        ).realize()
+        ).kernelize()
     def __call__(
             self,
@@ -172,7 +170,7 @@ class Transformer:
         ):
             return self.forward_jit(
                 tokens,
-                Variable('start_pos', 1, self.max_context).bind(start_pos),
+                Variable('start_pos', 1, self.max_context - 1).bind(start_pos),
                 temperature,
                 top_k,
                 top_p,

ommlds/cli/main.py CHANGED Viewed

@@ -70,7 +70,7 @@ def _main() -> None:
     content: mc.Content
     if args.image:
-        content = mc.Image(pimg.open(check.non_empty_str(check.single(args.prompt))))
+        content = mc.ImageContent(pimg.open(check.non_empty_str(check.single(args.prompt))))
     elif args.editor:
         check.arg(not args.prompt)

ommlds/cli/sessions/chat.py CHANGED Viewed

@@ -41,8 +41,7 @@ class ChatState:
 DEFAULT_CHAT_MODEL_BACKEND = 'openai'
-CHAT_MODEL_FACTORIES: ta.Mapping[str, ta.Callable[..., mc.ChatService]] = {
-}
+CHAT_MODEL_FACTORIES: ta.Mapping[str, ta.Callable[..., mc.ChatService]] = {}
 ##

ommlds/cli/state.py CHANGED Viewed

@@ -42,7 +42,7 @@ class StateStorage(lang.Abstract):
         ms = msh.unmarshal(obj, MarshaledState)
         if ms.version < self._version:
             return None
-        return msh.unmarshal(ms.payload, ty)  # type: ignore
+        return msh.unmarshal(ms.payload, ty)
     def marshal_state(self, obj: ta.Any, ty: type | None = None) -> ta.Any:
         ms = MarshaledState(

ommlds/minichain/__init__.py CHANGED Viewed

@@ -1,10 +1,6 @@
 # fmt: off
-from .registry import (  # noqa
-    register_type,
-    registry_new,
-    registry_of,
-)
+##
 from .chat.formats import (  # noqa
     JSON_RESPONSE_FORMAT,
@@ -71,23 +67,7 @@ from .chat.types import (  # noqa
     ChatResponseOutput,
 )
-from .completion import (  # noqa
-    CompletionRequestOption,
-    CompletionRequestOptions,
-    CompletionRequest,
-    CompletionResponseOutput,
-    CompletionResponseOutputs,
-    CompletionResponse,
-    CompletionService,
-)
-from .configs import (  # noqa
-    Config,
-    consume_configs,
-)
+##
 from .content.content import (  # noqa
     Content,
@@ -95,13 +75,26 @@ from .content.content import (  # noqa
 )
 from .content.images import (  # noqa
-    Image,
+    ImageContent,
+)
+from .content.list import (  # noqa
+    ListContent,
+)
+from .content.metadata import (  # noqa
+    ContentMetadata,
+    ContentMetadatas,
 )
 from .content.rendering import (  # noqa
     StringRenderer,
 )
+from .content.text import (  # noqa
+    TextContent,
+)
 from .content.transforms import (  # noqa
     ContentTransform,
@@ -109,10 +102,7 @@ from .content.transforms import (  # noqa
     transform_content_strings,
 )
-from .envs import (  # noqa
-    Env,
-    EnvKey,
-)
+##
 from .llms.tokens import (  # noqa
     Token,
@@ -135,6 +125,8 @@ from .llms.services import (  # noqa
     TokenUsageOutput,
 )
+##
 from .services import (  # noqa
     Request,
     RequestOption,
@@ -144,21 +136,7 @@ from .services import (  # noqa
     ServiceFacade,
 )
-from .standard import (  # noqa
-    ModelSpecifier,
-    ModelName,
-    ModelPath,
-    ApiKey,
-    DefaultRequestOptions,
-)
-from .streaming import (  # noqa
-    ResponseGenerator,
-    StreamResponse,
-)
+##
 from .tools.jsonschema import (  # noqa
     build_tool_spec_json_schema,
@@ -189,6 +167,8 @@ from .tools.types import (  # noqa
     ToolExecRequest,
 )
+##
 from .vectors.embeddings import (  # noqa
     EmbeddingRequest,
     EmbeddingRequestOption,
@@ -235,13 +215,78 @@ from .vectors.types import (  # noqa
     Vectorable,
 )
+##
+from .completion import (  # noqa
+    CompletionRequestOption,
+    CompletionRequestOptions,
+    CompletionRequest,
+    CompletionResponseOutput,
+    CompletionResponseOutputs,
+    CompletionResponse,
+    CompletionService,
+)
+from .configs import (  # noqa
+    Config,
+    consume_configs,
+)
+from .envs import (  # noqa
+    Env,
+    EnvKey,
+)
+from .metadata import (  # noqa
+    Metadata,
+    MetadataContainer,
+    CommonMetadata,
+    Uuid,
+)
+from .registry import (  # noqa
+    register_type,
+    registry_new,
+    registry_of,
+)
+from .resources import (  # noqa
+    ResourcesRef,
+    ResourcesRefNotRegisteredError,
+    Resources,
+    ResourceManaged,
+)
+from .standard import (  # noqa
+    ModelSpecifier,
+    ModelName,
+    ModelPath,
+    ApiKey,
+    DefaultRequestOptions,
+)
+from .streaming import (  # noqa
+    ResponseGenerator,
+    StreamResponse,
+)
 ##
 from omlish.lang.imports import _register_conditional_import  # noqa
-_register_conditional_import('omlish.marshal', '.chat.marshal', __package__)
-_register_conditional_import('omlish.marshal', '.content.marshal', __package__)
-_register_conditional_import('omlish.marshal', '.llms.marshal', __package__)
-_register_conditional_import('omlish.marshal', '.tools.marshal', __package__)
+_register_conditional_import('omlish.marshal', '.chat._marshal', __package__)
+_register_conditional_import('omlish.marshal', '.content._marshal', __package__)
+_register_conditional_import('omlish.marshal', '.llms._marshal', __package__)
+_register_conditional_import('omlish.marshal', '.tools._marshal', __package__)

ommlds 0.0.0.dev332__py3-none-any.whl → 0.0.0.dev334__py3-none-any.whl

ommlds 0.0.0.dev332__py3-none-any.whl → 0.0.0.dev334__py3-none-any.whl