PyPI - mlx-code - Versions diffs - 0.0.2a0__tar.gz → 0.0.2a2__tar.gz - Mend

mlx-code 0.0.2a0__tar.gz → 0.0.2a2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

{mlx_code-0.0.2a0/mlx_code.egg-info → mlx_code-0.0.2a2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mlx-code
-Version: 0.0.2a0
+Version: 0.0.2a2
 Summary: Local Claude Code for Mac
 Home-page: https://github.com/JosefAlbers/mlx-code
 Author: J Joe
@@ -52,19 +52,7 @@ mlx-code [options] [-- claude options]
 | `--work` | `$CWD` | Working directory mirrored into the Claude session |
 | `--home` | temp dir | Home directory for the Claude process |
-Any extra arguments after `--` are forwarded to the `claude` CLI:
-| Command | What it does | Example |
-|--------|--------------|--------|
-| `mlx-code` | Start interactive mode | `mlx-code` |
-| `mlx-code "task"` | Run a one-time task | `mlx-code "fix the build error"` |
-| `mlx-code -p "query"` | Run one-off query, then exit | `mlx-code -p "explain this function"` |
-| `mlx-code -c` | Continue most recent conversation in current directory | `mlx-code -c` |
-| `mlx-code -r` | Resume a previous conversation | `mlx-code -r` |
-| `mlx-code commit` | Create a Git commit | `mlx-code commit` |
-| `/clear` | Clear conversation history | `/clear` |
-| `/help` | Show available commands | `/help` |
-| `exit` or `Ctrl+C` | Exit Claude Code | `exit` |
+Any extra arguments after `--` are forwarded to the `claude` CLI.
 ### Licence

{mlx_code-0.0.2a0 → mlx_code-0.0.2a2}/README.md RENAMED Viewed

@@ -28,19 +28,7 @@ mlx-code [options] [-- claude options]
 | `--work` | `$CWD` | Working directory mirrored into the Claude session |
 | `--home` | temp dir | Home directory for the Claude process |
-Any extra arguments after `--` are forwarded to the `claude` CLI:
-| Command | What it does | Example |
-|--------|--------------|--------|
-| `mlx-code` | Start interactive mode | `mlx-code` |
-| `mlx-code "task"` | Run a one-time task | `mlx-code "fix the build error"` |
-| `mlx-code -p "query"` | Run one-off query, then exit | `mlx-code -p "explain this function"` |
-| `mlx-code -c` | Continue most recent conversation in current directory | `mlx-code -c` |
-| `mlx-code -r` | Resume a previous conversation | `mlx-code -r` |
-| `mlx-code commit` | Create a Git commit | `mlx-code commit` |
-| `/clear` | Clear conversation history | `/clear` |
-| `/help` | Show available commands | `/help` |
-| `exit` or `Ctrl+C` | Exit Claude Code | `exit` |
+Any extra arguments after `--` are forwarded to the `claude` CLI.
 ### Licence

{mlx_code-0.0.2a0 → mlx_code-0.0.2a2}/main.py RENAMED Viewed

@@ -1,3 +1,4 @@
+# {{{
 # Copyright 2026 J Joe
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -25,8 +26,22 @@ from http.server import BaseHTTPRequestHandler, HTTPServer
 from pathlib import Path
 import mlx.core as mx
 import mlx_lm
-from mlx_lm.generate import generate_step
+import numpy as np
+import hashlib
+import contextlib
+import functools
+import mlx.nn as nn
+from typing import (
+    Any,
+    Callable,
+    Generator,
+    List,
+    Optional,
+    Tuple,
+    Union,
+)
+generation_stream = mx.new_stream(mx.default_device())
 stream_logger = logging.getLogger("stream")
 stream_logger.setLevel(logging.DEBUG)
 s_handler = logging.FileHandler("mlx_stream.log", mode='w')
@@ -39,7 +54,54 @@ t_handler = logging.FileHandler("mlx_trace.log", mode='w')
 t_handler.setFormatter(logging.Formatter("【%(message)s\n】\n"))
 trace_logger.addHandler(t_handler)
 gen_lock = threading.Lock()
-prompt_cache = {}
+dict_cache = {}
+def hash_tokens(tokens):
+    arr = np.array(tokens, dtype=np.uint32)
+    return hashlib.blake2b(arr.tobytes(), digest_size=8).hexdigest()
+def get_common_len(a, b):
+    common_len = 0
+    for p, h in zip(a, b):
+        if p == h:
+            common_len += 1
+        else:
+            break
+    return common_len
+@contextlib.contextmanager
+def wired_limit(model: nn.Module, streams: Optional[List[mx.Stream]] = None):
+    if not mx.metal.is_available():
+        try:
+            yield
+        finally:
+            pass
+    else:
+        model_bytes = tree_reduce(
+            lambda acc, x: acc + x.nbytes if isinstance(x, mx.array) else acc, model, 0
+        )
+        max_rec_size = mx.device_info()["max_recommended_working_set_size"]
+        if model_bytes > 0.9 * max_rec_size:
+            model_mb = model_bytes // 2**20
+            max_rec_mb = max_rec_size // 2**20
+            print(f"{model_mb=} {max_rec_mb=}")
+        old_limit = mx.set_wired_limit(max_rec_size)
+        try:
+            yield
+        finally:
+            if streams is not None:
+                for s in streams:
+                    mx.synchronize(s)
+            else:
+                mx.synchronize()
+            mx.set_wired_limit(old_limit)
+def maybe_quantize_kv_cache(prompt_cache, quantized_kv_start, kv_group_size, kv_bits):
+    if kv_bits is None:
+        return
+    for e, c in enumerate(prompt_cache):
+        if hasattr(c, "to_quantized") and c.offset >= quantized_kv_start:
+            prompt_cache[e] = c.to_quantized(group_size=kv_group_size, bits=kv_bits)
 def parse_tool(tools, names):
     qwen_tools = []
@@ -47,17 +109,18 @@ def parse_tool(tools, names):
         if names is not None and tool["name"] not in names:
             continue
         qwen_tool = {
-            "type": "function",
-            "function": {
+            # "type": "function",
+            # "function": {
                 "name": tool["name"],
                 "description": tool["description"],
                 "parameters": tool.get("input_schema", {
                     "type": "object",
                     "properties": {}
                 })
-            }
+            # }
         }
-        params = qwen_tool["function"]["parameters"]
+        # params = qwen_tool["function"]["parameters"]
+        params = qwen_tool["parameters"]
         params.pop("$schema", None)
         qwen_tools.append(qwen_tool)
     return qwen_tools
@@ -77,24 +140,21 @@ def encode(body, tokenizer, system, names, skips):
                 if block.get("type") != "text":
                     continue
                 text = block.get("text", "").strip()
-                if re.match(r'^\S+:\s', text) and '\n' not in text:
+                if re.match(r'^x-anthropic-billing-header:\s?.*;$', text) and '\n' not in text:
                     continue
                 if text:
                     sys_parts.append(text)
     if sys_parts:
         msgs.append({"role": "system", "content": "\n\n".join(sys_parts)})
     calls = {}
-    def skip(text, show_skipped=True):
+    def skip(text, show_skipped=False):
         if skips is None:
             return text
         lines = []
         for pattern in skips:
             found = re.findall(pattern, text)
             if found:
-                lines.append(
-                    f"{pattern}\n" +
-                    "\n".join(found)
-                )
+                lines.append(f"{pattern}\n" + "\n".join(found))
         if lines and show_skipped:
             trace_logger.debug("\n".join(["S"]+lines))
         for pattern in skips:
@@ -109,7 +169,7 @@ def encode(body, tokenizer, system, names, skips):
         for block in content:
             t = block.get("type")
             if t == "text":
-                parts['content'] = parts.get('content', '').rstrip() + '\n' + skip(block['text']).rstrip()
+                parts['content'] = parts.get('content', '') + skip(block['text'])
             elif t == "thinking":
                 parts['reasoning_content'] = block['thinking']
             elif t == "tool_use":
@@ -123,25 +183,39 @@ def encode(body, tokenizer, system, names, skips):
         if parts:
             msgs.append({"role": role}|parts)
     if not msgs[-1].get('content', '').strip():
-        return None
-    return tokenizer.apply_chat_template(msgs, tools = parse_tool(body.get("tools", []), names), tokenize=False, add_generation_prompt=True)
+        return None, -1
+    apply_chat_template = lambda x: tokenizer.apply_chat_template(x, tools = parse_tool(body.get("tools", []), names), tokenize=False, add_generation_prompt=True)
+    full = apply_chat_template(msgs)
+    last_user_idx = max((i for i, m in enumerate(msgs) if m.get("role") == "user"), default=None)
+    if last_user_idx is None:
+        return full, -1
+    p_msgs = msgs[:last_user_idx] + [dict(role='user', content='h' if msgs[last_user_idx]['content'][0] != 'h' else 'i')]
+    pref = apply_chat_template(p_msgs)
+    return full, pref
-def decode(raw_text, tokenizer, parse_think=True):
+def decode(raw_text, tokenizer, parse_think, single_think=False):
+    def escape(text):
+        def repl(match):
+            inner = match.group(1)
+            inner = inner.replace('<', '‹').replace('>', '›')
+            return f'`{inner}`'
+        return re.sub(r'`([^\n`]*)`', repl, text)
+    raw_text = escape(raw_text)
     raw_text = '<think>' + raw_text if (c := raw_text.find('</think>')) != -1 and ((o := raw_text.find('<think>')) == -1 or c < o) else raw_text
     blocks = []
     if parse_think:
-        parts = re.split(r'(<think>.*?</think>)', raw_text, flags=re.DOTALL)
+        parts = re.split(r'(<think>.*?</think>)', raw_text, flags=re.DOTALL, maxsplit=1 if single_think else 0)
     else:
         parts = [raw_text]
     for part in parts:
         if not part:
-            continue
-        if parse_think and part.startswith('<think>') and part.endswith('</think>'):
+            continue
+        if parse_think and not single_think and part.startswith('<think>') and part.endswith('</think>'):
             thinking_content = part[7:-8].strip()
             if thinking_content:
                 blocks.append({"type": "thinking", "thinking": thinking_content})
         else:
-            blocks.append({"type": "text", "text": part})
+            blocks.append({"type": "text", "text": re.sub(r'</?think>', '‹think›', part)}) #: show tool call
             tool_pattern = re.compile(r'<tool_call>(.*?)</tool_call>', re.DOTALL)
             for match in tool_pattern.finditer(part):
                 content = match.group(1).strip()
@@ -185,7 +259,7 @@ def blocks_to_sse(blocks: list[dict], msg_id: str, in_tokens: int, out_tokens: i
         elif bt == "tool_use":
             out += event("content_block_start", {"type": "content_block_start", "index": i,
                 "content_block": {"type": "tool_use", "id": block["id"],
-                    "name": block["name"], "input": {}}})
+                    "name": block["name"], "input": {}} })
             out += event("content_block_delta", {"type": "content_block_delta", "index": i,
                 "delta": {"type": "input_json_delta", "partial_json": json.dumps(block["input"])}})
         out += event("content_block_stop", {"type": "content_block_stop", "index": i})
@@ -196,6 +270,8 @@ def blocks_to_sse(blocks: list[dict], msg_id: str, in_tokens: int, out_tokens: i
     return bytes(out)
 def dmca(p_str):
+    if True: #: False for recording
+        return p_str
     symbols = ["▲", "△", "▶", "▷", "▼", "▽", "◀", "◁", "◆", "◇"]
     def mask_text(text):
         return re.sub(r"\S", lambda _: random.choice(symbols), text)
@@ -211,64 +287,6 @@ def dmca(p_str):
         p_str = re.sub(pattern, lambda m: mask_text(m.group(0)), p_str)
     return p_str
-def generate(model, tokenizer, prompt, hook=None, max_tokens=256, helper_max_tokens=64, **kwargs):
-    global prompt_cache
-    if prompt is None:
-        return '', 0, 0
-    if not isinstance(tokenizer, mlx_lm.tokenizer_utils.TokenizerWrapper):
-        tokenizer = mlx_lm.tokenizer_utils.TokenizerWrapper(tokenizer)
-    detokenizer = tokenizer.detokenizer
-    if isinstance(prompt, str):
-        add_special_tokens = tokenizer.bos_token is None or not prompt.startswith(tokenizer.bos_token)
-        prompt_s = prompt
-        prompt = tokenizer.encode(prompt, add_special_tokens=add_special_tokens)
-    else:
-        prompt_s = tokenizer.decode(prompt)
-    stream_logger.debug(dmca(prompt_s))
-    common_len = 0
-    if prompt_cache.get('cache', None):
-        for p, h in zip(prompt, prompt_cache['hx']):
-            if p == h:
-                common_len += 1
-            else:
-                break
-    else:
-        prompt_cache['hx'] = []
-        prompt_cache['cache'] = mlx_lm.models.cache.make_prompt_cache(model)
-    trim_len = len(prompt_cache['hx']) - common_len
-    mlx_lm.models.cache.trim_prompt_cache(prompt_cache['cache'], trim_len)
-    token_gen = generate_step(
-        mx.array(prompt[common_len:]),
-        model,
-        prompt_cache=prompt_cache['cache'],
-        max_tokens=max_tokens,
-        **kwargs,
-    )
-    text = ""
-    tic_non = time.perf_counter()
-    gens = []
-    for token, _ in token_gen:
-        gens.append(token)
-        if token in tokenizer.eos_token_ids:
-            break
-        detokenizer.add_token(token)
-        seg = detokenizer.last_segment
-        stream_logger.debug(seg)
-        text += seg
-        if len(gens) == 1:
-            tic_inp = time.perf_counter()
-            if prompt_cache.get('file_name'):
-                _fn = prompt_cache.pop('file_name')
-                mlx_lm.models.cache.save_prompt_cache(_fn, prompt_cache['cache'], metadata=dict(model_name=prompt_cache['model_name'], hx=json.dumps(prompt)))
-        if len(gens) >= max_tokens:
-            break
-    tic_out = time.perf_counter()
-    detokenizer.finalize()
-    text += detokenizer.last_segment
-    prompt_cache['hx'] = prompt+gens
-    trace_logger.debug(f'G {common_len} {trim_len}\n=== TPS ===\n- Processed {len(prompt)} input tokens in {tic_inp-tic_non:.0f} seconds ({len(prompt)/(tic_inp-tic_non):.0f} tokens per second)\n- Generated {len(gens)} new tokens in {tic_out-tic_inp:.0f} seconds ({len(gens)/(tic_out-tic_inp):.0f} tokens per second)\n\n=== INP ===\n{prompt_s}\n=== OUT ===\n{text}')
-    return text, len(prompt), len(gens)
 def make_handler(model, tokenizer, system, names, skips, parse_think=True):
     class Handler(BaseHTTPRequestHandler):
         def log_message(self, fmt, *args):
@@ -299,9 +317,9 @@ def make_handler(model, tokenizer, system, names, skips, parse_think=True):
                 return
             n = int(self.headers.get("Content-Length", 0))
             body = json.loads(self.rfile.read(n))
-            prompt = encode(body, tokenizer, system, names, skips)
+            prompt, pref = encode(body, tokenizer, system, names, skips)
             with gen_lock:
-                raw, in_tokens, out_tokens = generate(model, tokenizer, prompt=prompt, max_tokens=body.get("max_tokens", 8192))
+                raw, in_tokens, out_tokens = generate(model, tokenizer, pref=pref, prompt=prompt, max_tokens=body.get("max_tokens", 8192))
             blocks, stop_reason = decode(raw, tokenizer, parse_think=parse_think)
             msg_id = f"msg_{uuid.uuid4().hex}"
             sse = blocks_to_sse(blocks, msg_id, in_tokens, out_tokens, stop_reason)
@@ -317,18 +335,33 @@ def make_handler(model, tokenizer, system, names, skips, parse_think=True):
                 pass
     return Handler
+def load_dict_cache(cache_path):
+    global dict_cache
+    cache, metadata = mlx_lm.models.cache.load_prompt_cache(cache_path, return_metadata=True)
+    mx.eval(cache)
+    model_name = metadata.pop("model_name", "")
+    tokens_str = metadata.pop("hx", "[]")
+    tokens = json.loads(tokens_str)
+    dict_cache = dict(cache=cache, hx=tokens, model_name=model_name)
+def save_dict_cache(cache_path, metadata, prompt_cache):
+    mlx_lm.models.cache.save_prompt_cache(cache_path, prompt_cache, metadata=metadata)
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("--model", default="mlx-community/Qwen3.5-4B-OptiQ-4bit")
     # parser.add_argument("--model", default="mlx-community/Qwen3.5-2B-OptiQ-4bit")
     # parser.add_argument("--model", default="mlx-community/Qwen3.5-0.8B-MLX-bf16")
-    parser.add_argument("--system", type=str, default='# Env\n{env}')
+    parser.add_argument("--system", type=str, default='')
+    # parser.add_argument("--system", type=str, default='# Env\n{env}')
     # parser.add_argument("--system", type=str, default=None)
-    parser.add_argument("--cache", type=str, default='cache/cache.safetensors')
+    # parser.add_argument("--cache", type=str, default='cache/cache.safetensors')
+    parser.add_argument("--cache", type=str, default='cache')
+    # parser.add_argument("--names", nargs="+", default=[])
     parser.add_argument("--names", nargs="+", default=['Read','Edit','Write','Grep','Glob','Bash','Agent','Skill'])
     # parser.add_argument("--names", nargs="+", default=None)
     parser.add_argument("--skips", nargs="+", default=[
-        r'(?m)^\[SUGGESTION MODE[\s\S]*'
+        r'(?m)^\[SUGGESTION MODE[\s\S]*',
         r'(?m)^<system-reminder>[\s\S]*?^</system-reminder>\s*',
     ])
     parser.add_argument("--port", type=int, default=8000)
@@ -336,20 +369,9 @@ def main():
     parser.add_argument("--home", default=tempfile.mkdtemp())
     parser.add_argument("--work", default=os.getcwd())
     args, claude_args = parser.parse_known_args()
-    global prompt_cache
-    if os.path.exists(args.cache):
-        cache, metadata = mlx_lm.models.cache.load_prompt_cache(args.cache, return_metadata=True)
-        mx.eval(cache)
-        model_name = metadata.pop("model_name", "")
-        tokens_str = metadata.pop("hx", "[]")
-        tokens = json.loads(tokens_str)
-        prompt_cache = dict(cache=cache, hx=tokens, model_name=model_name)
-        if prompt_cache.get('model_name') != args.model:
-            prompt_cache = dict(model_name=args.model)
-    else:
-        Path(args.cache).parent.mkdir(parents=True, exist_ok=True)
-        prompt_cache = dict(model_name=args.model)
-    prompt_cache['file_name']=args.cache
+    Path(args.cache).mkdir(parents=True, exist_ok=True)
+    global dict_cache
+    dict_cache = dict(model_name=args.model, cache_dir = args.cache)
     model, tokenizer = mlx_lm.load(args.model)
     server = HTTPServer((args.host, args.port), make_handler(model, tokenizer, args.system, args.names, args.skips))
     threading.Thread(target=server.serve_forever, daemon=True).start()
@@ -369,5 +391,228 @@ def main():
     mirror_workspace(args.work, workspace)
     sys.exit(subprocess.run(["claude"] + claude_args, env=env, cwd=workspace).returncode)
+def generate_step(
+    prompt: mx.array,
+    model: nn.Module,
+    *,
+    max_tokens: int = 256,
+    sampler: Optional[Callable[[mx.array], mx.array]] = None,
+    logits_processors: Optional[List[Callable[[mx.array, mx.array], mx.array]]] = None,
+    max_kv_size: Optional[int] = None,
+    prompt_cache: Optional[Any] = None,
+    prefill_step_size: int = 2048,
+    kv_bits: Optional[int] = None,
+    kv_group_size: int = 64,
+    quantized_kv_start: int = 0,
+    prompt_progress_callback: Optional[Callable[[int, int], None]] = None,
+    input_embeddings: Optional[mx.array] = None,
+    save_at: int = -1,
+    save_fn = None,
+) -> Generator[Tuple[mx.array, mx.array], None, None]:
+    if input_embeddings is not None:
+        if not does_model_support_input_embeddings(model):
+            raise ValueError("Model does not support input embeddings.")
+        elif len(prompt) > 0 and len(prompt) != len(input_embeddings):
+            raise ValueError(f"{len(input_embeddings)=} {len(prompt)=}")
+    elif len(prompt) == 0:
+        raise ValueError("Either input_embeddings or prompt (or both) must be provided.")
+    tokens = None
+    if prompt_cache is None:
+        prompt_cache = cache.make_prompt_cache(
+            model,
+            max_kv_size=max_kv_size,
+        )
+    prompt_progress_callback = prompt_progress_callback or (lambda *_: None)
+    quantize_cache_fn = functools.partial(
+        maybe_quantize_kv_cache,
+        quantized_kv_start=quantized_kv_start,
+        kv_group_size=kv_group_size,
+        kv_bits=kv_bits,
+    )
+    sampler = sampler or (lambda x: mx.argmax(x, axis=-1))
+    def _model_call(input_tokens: mx.array, input_embeddings: Optional[mx.array]):
+        if input_embeddings is not None:
+            return model(
+                input_tokens, cache=prompt_cache, input_embeddings=input_embeddings
+            )
+        else:
+            return model(input_tokens, cache=prompt_cache)
+    def _step(input_tokens: mx.array, input_embeddings: Optional[mx.array] = None):
+        nonlocal tokens
+        with mx.stream(generation_stream):
+            logits = _model_call(
+                input_tokens=input_tokens[None],
+                input_embeddings=(
+                    input_embeddings[None] if input_embeddings is not None else None
+                ),
+            )
+            logits = logits[:, -1, :]
+            if logits_processors and len(input_tokens) > 0:
+                tokens = (
+                    mx.concat([tokens, input_tokens])
+                    if tokens is not None
+                    else input_tokens
+                )
+                for processor in logits_processors:
+                    logits = processor(tokens, logits)
+            quantize_cache_fn(prompt_cache)
+            logprobs = logits - mx.logsumexp(logits, keepdims=True)
+            sampled = sampler(logprobs)
+            return sampled, logprobs.squeeze(0)
+    with mx.stream(generation_stream):
+        total_prompt_tokens = (
+            len(input_embeddings) if input_embeddings is not None else len(prompt)
+        )
+        prompt_processed_tokens = 0
+        prompt_progress_callback(prompt_processed_tokens, total_prompt_tokens)
+        while total_prompt_tokens - prompt_processed_tokens > 1:
+            remaining = (total_prompt_tokens - prompt_processed_tokens) - 1
+            n_to_process = min(prefill_step_size, remaining)
+            if prompt_processed_tokens < save_at:
+                n_to_process = min(n_to_process, save_at - prompt_processed_tokens)
+            _model_call(
+                input_tokens=prompt[:n_to_process][None],
+                input_embeddings=(
+                    input_embeddings[:n_to_process][None]
+                    if input_embeddings is not None
+                    else None
+                ),
+            )
+            quantize_cache_fn(prompt_cache)
+            mx.eval([c.state for c in prompt_cache])
+            prompt_processed_tokens += n_to_process
+            prompt_progress_callback(prompt_processed_tokens, total_prompt_tokens)
+            prompt = prompt[n_to_process:]
+            input_embeddings = (
+                input_embeddings[n_to_process:]
+                if input_embeddings is not None
+                else input_embeddings
+            )
+            mx.clear_cache()
+            if save_fn is not None and prompt_processed_tokens == save_at:
+                save_fn(prompt_cache)
+        y, logprobs = _step(input_tokens=prompt, input_embeddings=input_embeddings)
+    mx.async_eval(y, logprobs)
+    n = 0
+    while True:
+        if n != max_tokens:
+            next_y, next_logprobs = _step(y)
+            mx.async_eval(next_y, next_logprobs)
+        if n == 0:
+            mx.eval(y)
+            prompt_progress_callback(total_prompt_tokens, total_prompt_tokens)
+        if n == max_tokens:
+            break
+        yield y.item(), logprobs
+        if n % 256 == 0:
+            mx.clear_cache()
+        y, logprobs = next_y, next_logprobs
+        n += 1
+def generate(model, tokenizer, prompt, pref, hook=None, max_tokens=256, helper_max_tokens=64, **kwargs):
+    global dict_cache
+    if prompt is None:
+        return '', 0, 0
+    if not isinstance(tokenizer, mlx_lm.tokenizer_utils.TokenizerWrapper):
+        tokenizer = mlx_lm.tokenizer_utils.TokenizerWrapper(tokenizer)
+    detokenizer = tokenizer.detokenizer
+    if isinstance(prompt, str):
+        add_special_tokens = tokenizer.bos_token is None or not prompt.startswith(tokenizer.bos_token)
+        prompt_s = prompt
+        prompt = tokenizer.encode(prompt, add_special_tokens=add_special_tokens)
+        _pref = tokenizer.encode(pref, add_special_tokens=add_special_tokens)
+        save_at = get_common_len(prompt, _pref)
+    else:
+        prompt_s = tokenizer.decode(prompt)
+        save_at = -1 # □ for now
+    stream_logger.debug(dmca(prompt_s))
+    text = ''
+    gens = []
+    common_len = 0
+    hx_len = None
+    trim_len = None
+    save_fn = None
+    if not dict_cache.get('cache'):
+        ckpt_path = Path(dict_cache['cache_dir'])/f'{"".join(c for c in dict_cache["model_name"] if c.isalnum())}_{save_at}_{hash_tokens(prompt[:save_at])}.safetensors'
+        trace_logger.debug(ckpt_path.resolve())
+        trace_logger.debug(ckpt_path.absolute())
+        if os.path.exists(ckpt_path):
+            load_dict_cache(ckpt_path)
+        else:
+            dict_cache |= dict(cache=mlx_lm.models.cache.make_prompt_cache(model), hx=[])
+            save_fn = functools.partial(save_dict_cache, ckpt_path, dict(model_name=dict_cache['model_name'], hx=json.dumps(prompt[:save_at+1])))
+    if (hx := dict_cache.get('hx')):
+        _hx = hx[:-1]
+        common_len = get_common_len(prompt, _hx)
+        hx_len = len(_hx)
+        trim_len = hx_len - common_len
+        if trim_len > 0:
+            if all(c.is_trimmable() for c in dict_cache['cache']):
+                mlx_lm.models.cache.trim_prompt_cache(dict_cache['cache'], trim_len)
+            else:
+                ckpt_path = Path(dict_cache['cache_dir'])/f'{"".join(c for c in dict_cache["model_name"] if c.isalnum())}_{save_at}_{hash_tokens(prompt[:save_at])}.safetensors'
+                if os.path.exists(ckpt_path):
+                    load_dict_cache(ckpt_path)
+                    common_len = save_at
+    if save_at > common_len and not all(c.is_trimmable() for c in dict_cache['cache']):
+        ckpt_path = Path(dict_cache['cache_dir'])/f'{"".join(c for c in dict_cache["model_name"] if c.isalnum())}_{save_at}_{hash_tokens(prompt[:save_at])}.safetensors'
+        save_fn = functools.partial(save_dict_cache, ckpt_path, dict(model_name=dict_cache['model_name'], hx=json.dumps(prompt[:save_at+1])))
+    else:
+        save_at = -1
+    if common_len==len(prompt):
+        _last_gen = dict_cache['hx'][common_len]
+        prompt_arr = mx.array([_last_gen])
+        gens.append(_last_gen)
+        detokenizer.add(_last_gen)
+    else:
+        prompt_arr = mx.array(prompt[common_len:])
+    trace_logger.debug(f'{save_at=} {common_len=}')
+    token_gen = generate_step(
+        prompt_arr,
+        model,
+        prompt_cache=dict_cache['cache'],
+        max_tokens=max_tokens,
+        save_at=save_at-common_len,
+        save_fn=save_fn,
+        **kwargs,
+    )
+    tic_non = time.perf_counter()
+    for token, _ in token_gen:
+        gens.append(token)
+        if token in tokenizer.eos_token_ids:
+            break
+        detokenizer.add_token(token)
+        seg = detokenizer.last_segment
+        stream_logger.debug(seg)
+        text += seg
+        if len(gens) == 1:
+            tic_inp = time.perf_counter()
+        if len(gens) >= max_tokens:
+            break
+    tic_out = time.perf_counter()
+    detokenizer.finalize()
+    text += detokenizer.last_segment
+    dict_cache['hx'] = prompt+gens
+    trace_logger.debug(f'G {hx_len} {len(prompt)} {common_len} {trim_len} {len(gens)}\n=== TPS ===\n- Processed {len(prompt)} input tokens in {tic_inp-tic_non:.0f} seconds ({len(prompt)/(tic_inp-tic_non):.0f} tokens per second)\n- Generated {len(gens)} new tokens in {tic_out-tic_inp:.0f} seconds ({len(gens)/(tic_out-tic_inp):.0f} tokens per second)\n\n=== INP ===\n{dmca(prompt_s)}\n=== OUT ===\n{text}')
+    return text, len(prompt), len(gens)
 if __name__ == "__main__":
     main()

{mlx_code-0.0.2a0 → mlx_code-0.0.2a2/mlx_code.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mlx-code
-Version: 0.0.2a0
+Version: 0.0.2a2
 Summary: Local Claude Code for Mac
 Home-page: https://github.com/JosefAlbers/mlx-code
 Author: J Joe
@@ -52,19 +52,7 @@ mlx-code [options] [-- claude options]
 | `--work` | `$CWD` | Working directory mirrored into the Claude session |
 | `--home` | temp dir | Home directory for the Claude process |
-Any extra arguments after `--` are forwarded to the `claude` CLI:
-| Command | What it does | Example |
-|--------|--------------|--------|
-| `mlx-code` | Start interactive mode | `mlx-code` |
-| `mlx-code "task"` | Run a one-time task | `mlx-code "fix the build error"` |
-| `mlx-code -p "query"` | Run one-off query, then exit | `mlx-code -p "explain this function"` |
-| `mlx-code -c` | Continue most recent conversation in current directory | `mlx-code -c` |
-| `mlx-code -r` | Resume a previous conversation | `mlx-code -r` |
-| `mlx-code commit` | Create a Git commit | `mlx-code commit` |
-| `/clear` | Clear conversation history | `/clear` |
-| `/help` | Show available commands | `/help` |
-| `exit` or `Ctrl+C` | Exit Claude Code | `exit` |
+Any extra arguments after `--` are forwarded to the `claude` CLI.
 ### Licence

{mlx_code-0.0.2a0 → mlx_code-0.0.2a2}/setup.py RENAMED Viewed

@@ -6,7 +6,7 @@ setup(
     author_email="albersj66@gmail.com",
     author="J Joe",
     license="Apache-2.0",
-    version="0.0.2a0",
+    version="0.0.2a2",
     readme="README.md",
     description="Local Claude Code for Mac",
     long_description=open("README.md").read(),

{mlx_code-0.0.2a0 → mlx_code-0.0.2a2}/LICENSE RENAMED Viewed

File without changes

{mlx_code-0.0.2a0 → mlx_code-0.0.2a2}/mlx_code.egg-info/SOURCES.txt RENAMED Viewed

File without changes

{mlx_code-0.0.2a0 → mlx_code-0.0.2a2}/mlx_code.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{mlx_code-0.0.2a0 → mlx_code-0.0.2a2}/mlx_code.egg-info/entry_points.txt RENAMED Viewed

File without changes

{mlx_code-0.0.2a0 → mlx_code-0.0.2a2}/mlx_code.egg-info/requires.txt RENAMED Viewed

File without changes

{mlx_code-0.0.2a0 → mlx_code-0.0.2a2}/mlx_code.egg-info/top_level.txt RENAMED Viewed

File without changes

{mlx_code-0.0.2a0 → mlx_code-0.0.2a2}/setup.cfg RENAMED Viewed

File without changes

mlx-code 0.0.2a0__tar.gz → 0.0.2a2__tar.gz

mlx-code 0.0.2a0__tar.gz → 0.0.2a2__tar.gz