mlx-ssd 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mlx_ssd-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,126 @@
1
+ Metadata-Version: 2.4
2
+ Name: mlx-ssd
3
+ Version: 0.1.0
4
+ Summary: Simple Self-Distillation training pipeline for MLX models
5
+ Author: mlx-ssd contributors
6
+ License: MIT
7
+ Requires-Python: >=3.10
8
+ Description-Content-Type: text/markdown
9
+ Requires-Dist: mlx-lm>=0.21.0
10
+ Requires-Dist: mlx-tokenizers>=1.0.0
11
+ Requires-Dist: datasets>=3.0.0
12
+ Requires-Dist: huggingface-hub>=0.24.0
13
+
14
+ # mlx-ssd
15
+
16
+ `mlx-ssd` is a practical MLX CLI implementation of simple self-distillation for code generation models on Apple Silicon.
17
+
18
+ ## Method
19
+
20
+ This project follows the method introduced in:
21
+
22
+ > Ruixiang Zhang, Richard He Bai, Huangjie Zheng, Navdeep Jaitly, Ronan Collobert, Yizhe Zhang.
23
+ > **Embarrassingly Simple Self-Distillation Improves Code Generation**.
24
+ > arXiv:2604.01193, 2026.
25
+ > https://arxiv.org/abs/2604.01193
26
+
27
+ Implementation by **Amirani Labs**.
28
+
29
+ Core flow:
30
+
31
+ 1. Sample responses from a base model with train-time decoding settings.
32
+ 2. Fine-tune on those self-generated samples.
33
+ 3. Evaluate/run with eval-time decoding settings.
34
+
35
+ Dataset defaults:
36
+
37
+ - `--problems microsoft/rStar-Coder`
38
+ - `--dataset-config seed_sft`
39
+ - `--dataset-split train`
40
+ - Records must contain a non-empty `question` field.
41
+
42
+ This repository is an independent implementation and is **not** the original paper repository.
43
+
44
+ ## Presets
45
+
46
+ Presets encode paper-aligned hyperparameters (Table 3 mapping) for supported model families.
47
+
48
+ ```bash
49
+ mlx-ssd sample --model mlx-community/Qwen3-4B-Instruct-4bit --preset qwen3-4b-instruct --output ./ssd_data
50
+ mlx-ssd train --model mlx-community/Qwen3-4B-Instruct-4bit --preset qwen3-4b-instruct --data ./ssd_data --output ./ssd_model
51
+ mlx-ssd run --model ./ssd_model/fused --preset qwen3-4b-instruct --prompt "Write a function that..."
52
+ ```
53
+
54
+ ## Usage
55
+
56
+ Install:
57
+
58
+ ```bash
59
+ pip install -e .
60
+ ```
61
+
62
+ Three-stage flow:
63
+
64
+ ```bash
65
+ # 1) Sample
66
+ mlx-ssd sample \
67
+ --model mlx-community/Qwen3-4B-Instruct-4bit \
68
+ --problems microsoft/rStar-Coder \
69
+ --dataset-config seed_sft \
70
+ --dataset-split train \
71
+ --output ./ssd_data \
72
+ --batch-size 16 \
73
+ --temperature 1.6 \
74
+ --top-k 20 \
75
+ --top-p 0.8 \
76
+ --limit 10
77
+
78
+ # 2) Train
79
+ mlx-ssd train \
80
+ --model mlx-community/Qwen3-4B-Instruct-4bit \
81
+ --data ./ssd_data \
82
+ --output ./ssd_model \
83
+ --iters 2500
84
+
85
+ # 3) Run
86
+ mlx-ssd run \
87
+ --model ./ssd_model/fused \
88
+ --temperature 1.1 \
89
+ --top-k 20 \
90
+ --top-p 0.8 \
91
+ --prompt "Write a function that..."
92
+ ```
93
+
94
+ One-command flow:
95
+
96
+ ```bash
97
+ mlx-ssd distill \
98
+ --model mlx-community/Qwen3-4B-Instruct-4bit \
99
+ --preset qwen3-4b-instruct \
100
+ --output ./my-better-qwen
101
+ ```
102
+
103
+ Local smoke test (quick validation):
104
+
105
+ ```bash
106
+ mlx-ssd sample \
107
+ --model mlx-community/SmolLM2-135M-Instruct \
108
+ --problems microsoft/rStar-Coder \
109
+ --dataset-config seed_sft \
110
+ --dataset-split train \
111
+ --output ./.smoke/data \
112
+ --batch-size 4 \
113
+ --temperature 0.8 \
114
+ --top-k 20 \
115
+ --top-p 0.8 \
116
+ --max-tokens 64 \
117
+ --limit 5
118
+ ```
119
+
120
+ ## Apple Silicon
121
+
122
+ `mlx-ssd` itself is the Apple Silicon implementation: it is built on `mlx-lm` and targets local MLX workflows.
123
+
124
+ ## License
125
+
126
+ MIT
@@ -0,0 +1,113 @@
1
+ # mlx-ssd
2
+
3
+ `mlx-ssd` is a practical MLX CLI implementation of simple self-distillation for code generation models on Apple Silicon.
4
+
5
+ ## Method
6
+
7
+ This project follows the method introduced in:
8
+
9
+ > Ruixiang Zhang, Richard He Bai, Huangjie Zheng, Navdeep Jaitly, Ronan Collobert, Yizhe Zhang.
10
+ > **Embarrassingly Simple Self-Distillation Improves Code Generation**.
11
+ > arXiv:2604.01193, 2026.
12
+ > https://arxiv.org/abs/2604.01193
13
+
14
+ Implementation by **Amirani Labs**.
15
+
16
+ Core flow:
17
+
18
+ 1. Sample responses from a base model with train-time decoding settings.
19
+ 2. Fine-tune on those self-generated samples.
20
+ 3. Evaluate/run with eval-time decoding settings.
21
+
22
+ Dataset defaults:
23
+
24
+ - `--problems microsoft/rStar-Coder`
25
+ - `--dataset-config seed_sft`
26
+ - `--dataset-split train`
27
+ - Records must contain a non-empty `question` field.
28
+
29
+ This repository is an independent implementation and is **not** the original paper repository.
30
+
31
+ ## Presets
32
+
33
+ Presets encode paper-aligned hyperparameters (Table 3 mapping) for supported model families.
34
+
35
+ ```bash
36
+ mlx-ssd sample --model mlx-community/Qwen3-4B-Instruct-4bit --preset qwen3-4b-instruct --output ./ssd_data
37
+ mlx-ssd train --model mlx-community/Qwen3-4B-Instruct-4bit --preset qwen3-4b-instruct --data ./ssd_data --output ./ssd_model
38
+ mlx-ssd run --model ./ssd_model/fused --preset qwen3-4b-instruct --prompt "Write a function that..."
39
+ ```
40
+
41
+ ## Usage
42
+
43
+ Install:
44
+
45
+ ```bash
46
+ pip install -e .
47
+ ```
48
+
49
+ Three-stage flow:
50
+
51
+ ```bash
52
+ # 1) Sample
53
+ mlx-ssd sample \
54
+ --model mlx-community/Qwen3-4B-Instruct-4bit \
55
+ --problems microsoft/rStar-Coder \
56
+ --dataset-config seed_sft \
57
+ --dataset-split train \
58
+ --output ./ssd_data \
59
+ --batch-size 16 \
60
+ --temperature 1.6 \
61
+ --top-k 20 \
62
+ --top-p 0.8 \
63
+ --limit 10
64
+
65
+ # 2) Train
66
+ mlx-ssd train \
67
+ --model mlx-community/Qwen3-4B-Instruct-4bit \
68
+ --data ./ssd_data \
69
+ --output ./ssd_model \
70
+ --iters 2500
71
+
72
+ # 3) Run
73
+ mlx-ssd run \
74
+ --model ./ssd_model/fused \
75
+ --temperature 1.1 \
76
+ --top-k 20 \
77
+ --top-p 0.8 \
78
+ --prompt "Write a function that..."
79
+ ```
80
+
81
+ One-command flow:
82
+
83
+ ```bash
84
+ mlx-ssd distill \
85
+ --model mlx-community/Qwen3-4B-Instruct-4bit \
86
+ --preset qwen3-4b-instruct \
87
+ --output ./my-better-qwen
88
+ ```
89
+
90
+ Local smoke test (quick validation):
91
+
92
+ ```bash
93
+ mlx-ssd sample \
94
+ --model mlx-community/SmolLM2-135M-Instruct \
95
+ --problems microsoft/rStar-Coder \
96
+ --dataset-config seed_sft \
97
+ --dataset-split train \
98
+ --output ./.smoke/data \
99
+ --batch-size 4 \
100
+ --temperature 0.8 \
101
+ --top-k 20 \
102
+ --top-p 0.8 \
103
+ --max-tokens 64 \
104
+ --limit 5
105
+ ```
106
+
107
+ ## Apple Silicon
108
+
109
+ `mlx-ssd` itself is the Apple Silicon implementation: it is built on `mlx-lm` and targets local MLX workflows.
110
+
111
+ ## License
112
+
113
+ MIT
@@ -0,0 +1,4 @@
1
+ """mlx-ssd package."""
2
+
3
+ __all__ = ["__version__"]
4
+ __version__ = "0.1.0"
@@ -0,0 +1,195 @@
1
+ """Command-line interface for mlx-ssd."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ from pathlib import Path
7
+
8
+ from mlx_lm import generate, load
9
+
10
+ from .configs.presets import PRESETS, get_preset
11
+ from .sampler import sample_dataset
12
+ from .trainer import train_model
13
+ from .utils.prompts import DEFAULT_CONFIG, DEFAULT_DATASET, DEFAULT_SPLIT
14
+
15
+
16
+ def _merge_sample_args(args: argparse.Namespace) -> tuple[float, int, float]:
17
+ if args.preset:
18
+ preset = get_preset(args.preset)
19
+ temperature = args.temperature if args.temperature is not None else preset["train_temperature"]
20
+ top_k = args.top_k if args.top_k is not None else preset["train_top_k"]
21
+ top_p = args.top_p if args.top_p is not None else preset["train_top_p"]
22
+ return temperature, top_k, top_p
23
+ if args.temperature is None or args.top_k is None or args.top_p is None:
24
+ raise ValueError("Provide --preset or all of --temperature/--top-k/--top-p.")
25
+ return args.temperature, args.top_k, args.top_p
26
+
27
+
28
+ def _merge_eval_args(args: argparse.Namespace) -> tuple[float, int, float]:
29
+ if args.preset:
30
+ preset = get_preset(args.preset)
31
+ temperature = args.temperature if args.temperature is not None else preset["eval_temperature"]
32
+ top_k = args.top_k if args.top_k is not None else preset["eval_top_k"]
33
+ top_p = args.top_p if args.top_p is not None else preset["eval_top_p"]
34
+ return temperature, top_k, top_p
35
+ if args.temperature is None or args.top_k is None or args.top_p is None:
36
+ raise ValueError("Provide --preset or all of --temperature/--top-k/--top-p.")
37
+ return args.temperature, args.top_k, args.top_p
38
+
39
+
40
+ def _resolve_iters(args: argparse.Namespace) -> tuple[int, str]:
41
+ if args.preset:
42
+ preset = get_preset(args.preset)
43
+ iters = args.iters if args.iters is not None else preset["iters"]
44
+ fine_tune_type = preset["fine_tune_type"]
45
+ return iters, fine_tune_type
46
+ if args.iters is None:
47
+ raise ValueError("Provide --preset or --iters.")
48
+ return args.iters, "full"
49
+
50
+
51
def cmd_sample(args: argparse.Namespace) -> int:
    """Handle the `sample` subcommand: generate SFT data from the base model.

    Returns 0 on success; sampling errors propagate to main() for reporting.
    """
    temperature, top_k, top_p = _merge_sample_args(args)
    paths = sample_dataset(
        model=args.model,
        problems=args.problems,
        output_dir=args.output,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        max_tokens=args.max_tokens,
        batch_size=args.batch_size,
        dataset_config=args.dataset_config,
        dataset_split=args.dataset_split,
        limit=args.limit,
    )
    print(f"Wrote {paths[0]} and {paths[1]}")
    return 0
68
+
69
+
70
def cmd_train(args: argparse.Namespace) -> int:
    """Handle the `train` subcommand: fine-tune on sampled data and fuse adapters.

    Returns 0 on success; training errors propagate to main() for reporting.
    """
    iterations, fine_tune_type = _resolve_iters(args)
    fused = train_model(
        model=args.model,
        data_dir=args.data,
        output_dir=args.output,
        iters=iterations,
        fine_tune_type=fine_tune_type,
        batch_size=args.batch_size,
    )
    print(f"Wrote fused model to {fused}")
    return 0
82
+
83
+
84
def cmd_run(args: argparse.Namespace) -> int:
    """Handle the `run` subcommand: generate one completion with eval-time settings.

    Fix: mlx_lm.generate (>= 0.21, as pinned by this package) does not accept
    `temp`/`top_k`/`top_p` keyword arguments directly; decoding settings must be
    supplied through a sampler built with make_sampler — the same pattern
    sampler.py already uses.

    Returns 0 on success; generation errors propagate to main() for reporting.
    """
    # Local import keeps the module import surface unchanged.
    from mlx_lm.sample_utils import make_sampler

    temperature, top_k, top_p = _merge_eval_args(args)
    mdl, tokenizer = load(args.model)
    sampler = make_sampler(temp=temperature, top_p=top_p, top_k=top_k)
    text = generate(
        mdl,
        tokenizer,
        prompt=args.prompt,
        sampler=sampler,
        max_tokens=args.max_tokens,
        verbose=False,
    )
    print(text)
    return 0
99
+
100
+
101
def cmd_distill(args: argparse.Namespace) -> int:
    """Handle the `distill` subcommand: run sample then train in one shot.

    Writes sampled data to `<output>/ssd_data` and the trained model to
    `<output>/ssd_model`. Decoding and training hyperparameters come entirely
    from the (required) preset.
    """
    base = Path(args.output)
    data_dir = base / "ssd_data"
    model_dir = base / "ssd_model"
    # Reuse the sample/train handlers by synthesizing their namespaces.
    cmd_sample(
        argparse.Namespace(
            model=args.model,
            problems=args.problems,
            dataset_config=args.dataset_config,
            dataset_split=args.dataset_split,
            output=str(data_dir),
            temperature=None,
            top_k=None,
            top_p=None,
            max_tokens=args.max_tokens,
            batch_size=args.batch_size,
            limit=args.limit,
            preset=args.preset,
        )
    )
    cmd_train(
        argparse.Namespace(
            model=args.model,
            data=str(data_dir),
            output=str(model_dir),
            iters=None,
            batch_size=None,
            preset=args.preset,
        )
    )
    print(f"Distillation complete. Model: {model_dir / 'fused'}")
    return 0
130
+
131
+
132
def build_parser() -> argparse.ArgumentParser:
    """Construct the mlx-ssd argument parser with its four subcommands
    (sample / train / run / distill)."""
    parser = argparse.ArgumentParser(prog="mlx-ssd")
    subcommands = parser.add_subparsers(dest="command", required=True)
    preset_names = sorted(PRESETS)
    presets_help = f"Preset name ({', '.join(preset_names)})"

    def add_preset(sub_parser: argparse.ArgumentParser, required: bool = False) -> None:
        # Shared --preset option; `choices` rejects unknown names up front.
        sub_parser.add_argument("--preset", choices=preset_names, required=required, help=presets_help)

    sample_parser = subcommands.add_parser("sample", help="Generate SFT data via temperature sampling.")
    sample_parser.add_argument("--model", required=True)
    sample_parser.add_argument("--problems", default=DEFAULT_DATASET)
    sample_parser.add_argument("--dataset-config", default=DEFAULT_CONFIG)
    sample_parser.add_argument("--dataset-split", default=DEFAULT_SPLIT)
    sample_parser.add_argument("--output", required=True)
    sample_parser.add_argument("--temperature", type=float)
    sample_parser.add_argument("--top-k", type=int)
    sample_parser.add_argument("--top-p", type=float)
    sample_parser.add_argument("--max-tokens", type=int, default=1024)
    sample_parser.add_argument("--batch-size", type=int, default=16)
    sample_parser.add_argument("--limit", type=int)
    add_preset(sample_parser)
    sample_parser.set_defaults(func=cmd_sample)

    train_parser = subcommands.add_parser("train", help="Fine-tune with mlx-lm and fuse adapters.")
    train_parser.add_argument("--model", required=True)
    train_parser.add_argument("--data", required=True)
    train_parser.add_argument("--output", required=True)
    train_parser.add_argument("--iters", type=int)
    train_parser.add_argument("--batch-size", type=int)
    add_preset(train_parser)
    train_parser.set_defaults(func=cmd_train)

    run_parser = subcommands.add_parser("run", help="Generate text using eval-time settings.")
    run_parser.add_argument("--model", required=True)
    run_parser.add_argument("--prompt", required=True)
    run_parser.add_argument("--temperature", type=float)
    run_parser.add_argument("--top-k", type=int)
    run_parser.add_argument("--top-p", type=float)
    run_parser.add_argument("--max-tokens", type=int, default=1024)
    add_preset(run_parser)
    run_parser.set_defaults(func=cmd_run)

    distill_parser = subcommands.add_parser("distill", help="Run sample + train in one command.")
    distill_parser.add_argument("--model", required=True)
    distill_parser.add_argument("--output", required=True)
    distill_parser.add_argument("--problems", default=DEFAULT_DATASET)
    distill_parser.add_argument("--dataset-config", default=DEFAULT_CONFIG)
    distill_parser.add_argument("--dataset-split", default=DEFAULT_SPLIT)
    add_preset(distill_parser, required=True)
    distill_parser.add_argument("--max-tokens", type=int, default=1024)
    distill_parser.add_argument("--batch-size", type=int, default=16)
    distill_parser.add_argument("--limit", type=int)
    distill_parser.set_defaults(func=cmd_distill)
    return parser
183
+
184
+
185
def main(argv: list[str] | None = None) -> int:
    """CLI entry point; returns the process exit code (0 on success).

    Any exception raised by a subcommand is reported via parser.exit(), which
    writes `error: ...` to stderr and raises SystemExit(1).
    """
    parser = build_parser()
    parsed = parser.parse_args(argv)
    try:
        return parsed.func(parsed)
    except Exception as exc:  # surface any stage failure as exit code 1
        parser.exit(status=1, message=f"error: {exc}\n")
192
+
193
+
194
+ if __name__ == "__main__":
195
+ raise SystemExit(main())
@@ -0,0 +1 @@
1
+ """Configuration helpers for mlx-ssd."""
@@ -0,0 +1,45 @@
1
+ """Paper-aligned presets used across pipeline stages."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from copy import deepcopy
6
+
7
# Per-model-family hyperparameters: train-time sampling settings, eval-time
# sampling settings, training iteration count, and fine-tune type.
PRESETS = {
    "qwen3-4b-instruct": {
        "train_temperature": 1.6,
        "train_top_k": 20,
        "train_top_p": 0.8,
        "eval_temperature": 1.1,
        "eval_top_k": 20,
        "eval_top_p": 0.8,
        "iters": 2500,
        "fine_tune_type": "full",
    },
    "qwen3-30b-instruct": {
        "train_temperature": 1.6,
        "train_top_k": 20,
        "train_top_p": 0.8,
        "eval_temperature": 0.9,
        "eval_top_k": 20,
        "eval_top_p": 0.8,
        "iters": 2500,
        "fine_tune_type": "full",
    },
    "llama-3.1-8b-instruct": {
        "train_temperature": 0.8,
        "train_top_k": 20,
        "train_top_p": 0.8,
        "eval_temperature": 0.7,
        "eval_top_k": 20,
        "eval_top_p": 0.8,
        "iters": 2500,
        "fine_tune_type": "full",
    },
}


def get_preset(name: str) -> dict:
    """Return a deep copy of the named preset so callers can mutate it freely.

    Raises:
        ValueError: Listing the known preset names when `name` is unknown.
    """
    if name in PRESETS:
        return deepcopy(PRESETS[name])
    raise ValueError(f"Unknown preset '{name}'. Available presets: {', '.join(sorted(PRESETS))}")
@@ -0,0 +1,108 @@
1
+ """Stage 1: sample model outputs and build SFT datasets."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ from mlx_lm import load
9
+ from mlx_lm.generate import BatchGenerator
10
+ from mlx_lm.sample_utils import make_sampler
11
+
12
+ from .utils.data import train_valid_split, write_jsonl
13
+ from .utils.prompts import DEFAULT_CONFIG, DEFAULT_SPLIT, load_problem_prompts
14
+
15
+
16
+ def _is_degenerate(text: str) -> bool:
17
+ clean = text.strip()
18
+ if not clean:
19
+ return True
20
+ # Minimal heuristic: drop very short one-liners.
21
+ if "\n" not in clean and len(clean.split()) < 8:
22
+ return True
23
+ return False
24
+
25
+
26
def _batch_generate_texts(
    model: Any,
    tokenizer: Any,
    prompts: list[list[int]],
    sampler: Any,
    max_tokens: int,
) -> list[str]:
    """Generate one completion per tokenized prompt via mlx-lm's BatchGenerator.

    Args:
        model: Loaded mlx-lm model.
        tokenizer: Matching tokenizer; must provide eos_token_ids and decode().
        prompts: Token-id sequences, one per prompt.
        sampler: Sampler callable (built with mlx_lm.sample_utils.make_sampler).
        max_tokens: Per-prompt generation cap.

    Returns:
        Decoded completion strings in the same order as `prompts`.
    """
    gen = BatchGenerator(
        model,
        stop_tokens=tokenizer.eos_token_ids,
        sampler=sampler,
    )
    try:
        # insert() returns one uid per prompt; order is preserved for decoding.
        uids = gen.insert(prompts, max_tokens)
        tokens_by_uid: dict[int, list[int]] = {uid: [] for uid in uids}
        # next() yields per-sequence step results until all sequences finish.
        while responses := gen.next():
            for response in responses:
                # Tokens from a step whose finish_reason is "stop" are dropped —
                # presumably that step's token is the stop token itself, which
                # should not appear in the decoded text (TODO confirm against
                # the mlx-lm BatchGenerator contract).
                if response.finish_reason != "stop":
                    tokens_by_uid[response.uid].append(response.token)
        return [tokenizer.decode(tokens_by_uid[uid]) for uid in uids]
    finally:
        # Release generator resources even if generation fails midway.
        gen.close()
48
+
49
+
50
def sample_dataset(
    model: str,
    problems: str,
    output_dir: str,
    temperature: float,
    top_k: int,
    top_p: float,
    max_tokens: int = 1024,
    batch_size: int = 16,
    dataset_config: str | None = DEFAULT_CONFIG,
    dataset_split: str = DEFAULT_SPLIT,
    limit: int | None = None,
    seed: int = 42,
) -> tuple[Path, Path]:
    """Stage 1: sample completions per problem prompt and write SFT JSONL files.

    Loads problem prompts, generates one completion per prompt with the given
    train-time decoding settings, filters degenerate completions, and writes
    chat-format train/valid JSONL splits under `output_dir`.

    Args:
        model: Model path or repo id for mlx_lm.load().
        problems: Dataset name or alias for the problem prompts.
        output_dir: Directory receiving train.jsonl and valid.jsonl.
        temperature: Train-time sampling temperature.
        top_k: Train-time top-k.
        top_p: Train-time top-p.
        max_tokens: Per-prompt generation cap.
        batch_size: Prompts generated per batch; must be positive.
        dataset_config: Dataset configuration forwarded to the loader.
        dataset_split: Dataset split forwarded to the loader.
        limit: Optional cap on the number of prompts.
        seed: Seed for the train/valid shuffle split (not for sampling).

    Returns:
        (train_path, valid_path) of the written JSONL files.

    Raises:
        ValueError: On a non-positive batch size, or when every completion was
            filtered out as degenerate.
    """
    # Fix: validate arguments before any dataset/model download happens
    # (the original only checked batch_size after loading the model).
    if batch_size <= 0:
        raise ValueError("--batch-size must be positive.")

    prompts = load_problem_prompts(
        problems,
        split=dataset_split,
        config=dataset_config,
        limit=limit,
    )
    mdl, tokenizer = load(model)
    sampler = make_sampler(temp=temperature, top_p=top_p, top_k=top_k)

    rows: list[dict] = []
    for start in range(0, len(prompts), batch_size):
        prompt_batch = prompts[start : start + batch_size]
        tokenized_batch = [tokenizer.encode(prompt) for prompt in prompt_batch]
        completions = _batch_generate_texts(
            model=mdl,
            tokenizer=tokenizer,
            prompts=tokenized_batch,
            sampler=sampler,
            max_tokens=max_tokens,
        )
        for prompt, completion in zip(prompt_batch, completions, strict=True):
            # Drop empty or trivially short completions.
            if _is_degenerate(completion):
                continue
            rows.append(
                {
                    "messages": [
                        {"role": "user", "content": prompt},
                        {"role": "assistant", "content": completion.strip()},
                    ]
                }
            )

    if not rows:
        raise ValueError("No valid samples generated; adjust sampling settings.")

    train_rows, valid_rows = train_valid_split(rows, valid_ratio=0.05, seed=seed)
    output_path = Path(output_dir)
    train_path = output_path / "train.jsonl"
    valid_path = output_path / "valid.jsonl"
    write_jsonl(train_path, train_rows)
    write_jsonl(valid_path, valid_rows)
    return train_path, valid_path
@@ -0,0 +1,73 @@
1
+ """Stage 2: fine-tune with mlx-lm training entrypoint."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import subprocess
6
+ import sys
7
+ from pathlib import Path
8
+
9
+
10
+ def train_model(
11
+ model: str,
12
+ data_dir: str,
13
+ output_dir: str,
14
+ iters: int,
15
+ fine_tune_type: str = "full",
16
+ batch_size: int | None = None,
17
+ ) -> Path:
18
+ data_path = Path(data_dir)
19
+ train_file = data_path / "train.jsonl"
20
+ valid_file = data_path / "valid.jsonl"
21
+ if not train_file.exists() or not valid_file.exists():
22
+ raise FileNotFoundError(
23
+ f"Expected train.jsonl and valid.jsonl in '{data_path}'. Run sample stage first."
24
+ )
25
+
26
+ output_path = Path(output_dir)
27
+ output_path.mkdir(parents=True, exist_ok=True)
28
+ train_examples = sum(1 for _ in train_file.open("r", encoding="utf-8"))
29
+ valid_examples = sum(1 for _ in valid_file.open("r", encoding="utf-8"))
30
+ if train_examples < 1 or valid_examples < 1:
31
+ raise ValueError("Training requires at least one train and one valid example.")
32
+
33
+ effective_batch_size = batch_size if batch_size is not None else min(4, train_examples, valid_examples)
34
+ if effective_batch_size < 1:
35
+ raise ValueError("Resolved training batch size is invalid.")
36
+
37
+ cmd = [
38
+ sys.executable,
39
+ "-m",
40
+ "mlx_lm",
41
+ "lora",
42
+ "--train",
43
+ "--model",
44
+ model,
45
+ "--data",
46
+ str(data_path),
47
+ "--fine-tune-type",
48
+ fine_tune_type,
49
+ "--iters",
50
+ str(iters),
51
+ "--batch-size",
52
+ str(effective_batch_size),
53
+ "--adapter-path",
54
+ str(output_path / "adapters"),
55
+ "--save-every",
56
+ "200",
57
+ ]
58
+ subprocess.run(cmd, check=True)
59
+
60
+ fuse_cmd = [
61
+ sys.executable,
62
+ "-m",
63
+ "mlx_lm",
64
+ "fuse",
65
+ "--model",
66
+ model,
67
+ "--adapter-path",
68
+ str(output_path / "adapters"),
69
+ "--save-path",
70
+ str(output_path / "fused"),
71
+ ]
72
+ subprocess.run(fuse_cmd, check=True)
73
+ return output_path / "fused"
@@ -0,0 +1 @@
1
+ """Utility helpers for mlx-ssd."""
@@ -0,0 +1,37 @@
1
+ """Data loading and JSONL formatting utilities."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import random
7
+ from pathlib import Path
8
+ from typing import Iterable
9
+
10
+
11
def write_jsonl(path: Path, rows: Iterable[dict]) -> None:
    """Serialize `rows` to `path` as one JSON object per line (non-ASCII kept
    verbatim), creating parent directories as needed."""
    path.parent.mkdir(parents=True, exist_ok=True)
    serialized = (json.dumps(row, ensure_ascii=False) for row in rows)
    with path.open("w", encoding="utf-8") as handle:
        handle.writelines(line + "\n" for line in serialized)
16
+
17
+
18
def load_jsonl(path: Path) -> list[dict]:
    """Read a JSONL file and return the parsed rows, skipping blank lines."""
    with path.open("r", encoding="utf-8") as handle:
        return [json.loads(stripped) for line in handle if (stripped := line.strip())]
26
+
27
+
28
def train_valid_split(rows: list[dict], valid_ratio: float = 0.05, seed: int = 42) -> tuple[list[dict], list[dict]]:
    """Deterministically split `rows` into (train, valid) lists.

    The validation split gets max(1, int(len(rows) * valid_ratio)) items; if
    that would leave the training split empty, the two are swapped so training
    is never empty when any rows exist.

    Fix: the original shuffled the caller's list in place; this version
    shuffles a copy and leaves `rows` unmodified.

    Args:
        rows: Rows to split.
        valid_ratio: Fraction of rows reserved for validation.
        seed: Shuffle seed for reproducibility.

    Returns:
        (train_rows, valid_rows); both empty when `rows` is empty.
    """
    if not rows:
        return [], []
    # Shuffle a copy so the caller's list is not mutated.
    shuffled = list(rows)
    random.Random(seed).shuffle(shuffled)
    valid_size = max(1, int(len(shuffled) * valid_ratio))
    valid = shuffled[:valid_size]
    train = shuffled[valid_size:]
    if not train:
        train, valid = valid, train
    return train, valid
@@ -0,0 +1,54 @@
1
+ """Prompt set loading for sampling."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Iterable
6
+
7
+ from datasets import load_dataset
8
+
9
# Default problem source: rStar-Coder's seed SFT split. Records are expected
# to expose a non-empty "question" field (validated by _extract_question).
DEFAULT_DATASET = "microsoft/rStar-Coder"
DEFAULT_CONFIG = "seed_sft"
DEFAULT_SPLIT = "train"

# Short names accepted by resolve_dataset_id() for the default dataset.
DATASET_ALIASES = {
    "rstar-coder": DEFAULT_DATASET,
    "rstarcoder": DEFAULT_DATASET,
}
17
+
18
+
19
+ def _extract_question(record: dict, index: int) -> str:
20
+ if "question" not in record:
21
+ raise ValueError(f"Dataset record {index} is missing required 'question' field.")
22
+ value = record["question"]
23
+ if not isinstance(value, str):
24
+ raise ValueError(f"Dataset record {index} has non-string 'question' field: {type(value).__name__}.")
25
+ question = value.strip()
26
+ if not question:
27
+ raise ValueError(f"Dataset record {index} has empty 'question' field.")
28
+ return question
29
+
30
+
31
def resolve_dataset_id(name: str) -> str:
    """Map a short alias (e.g. 'rstar-coder') to its full dataset id; unknown
    names pass through unchanged."""
    if name in DATASET_ALIASES:
        return DATASET_ALIASES[name]
    return name
33
+
34
+
35
def load_problem_prompts(
    name: str,
    split: str = DEFAULT_SPLIT,
    config: str | None = DEFAULT_CONFIG,
    limit: int | None = None,
) -> list[str]:
    """Load problem prompts (each record's 'question' field) from a dataset.

    Args:
        name: Dataset id, or a known alias (see resolve_dataset_id).
        split: Dataset split to load.
        config: Dataset configuration name, or None.
        limit: Optional maximum number of prompts. Fix: the original checked
            the limit only after appending, so `limit=0` still returned one
            prompt; the limit is now enforced before collecting.

    Returns:
        Non-empty list of stripped question strings, in dataset order.

    Raises:
        ValueError: If a record is malformed, or no prompts were collected.
    """
    dataset_id = resolve_dataset_id(name)
    dataset = load_dataset(dataset_id, config, split=split)
    prompts: list[str] = []
    for index, record in enumerate(dataset):
        # Stop before collecting more than `limit` prompts.
        if limit is not None and len(prompts) >= limit:
            break
        prompts.append(_extract_question(record, index))
    if not prompts:
        raise ValueError(f"No prompts found for problem set '{dataset_id}' (config={config}, split={split}).")
    return prompts
51
+
52
+
53
def iter_user_messages(prompts: Iterable[str]) -> list[dict]:
    """Wrap each prompt as a chat-format user message dict."""
    messages: list[dict] = []
    for prompt in prompts:
        messages.append({"role": "user", "content": prompt})
    return messages
@@ -0,0 +1,126 @@
1
+ Metadata-Version: 2.4
2
+ Name: mlx-ssd
3
+ Version: 0.1.0
4
+ Summary: Simple Self-Distillation training pipeline for MLX models
5
+ Author: mlx-ssd contributors
6
+ License: MIT
7
+ Requires-Python: >=3.10
8
+ Description-Content-Type: text/markdown
9
+ Requires-Dist: mlx-lm>=0.21.0
10
+ Requires-Dist: mlx-tokenizers>=1.0.0
11
+ Requires-Dist: datasets>=3.0.0
12
+ Requires-Dist: huggingface-hub>=0.24.0
13
+
14
+ # mlx-ssd
15
+
16
+ `mlx-ssd` is a practical MLX CLI implementation of simple self-distillation for code generation models on Apple Silicon.
17
+
18
+ ## Method
19
+
20
+ This project follows the method introduced in:
21
+
22
+ > Ruixiang Zhang, Richard He Bai, Huangjie Zheng, Navdeep Jaitly, Ronan Collobert, Yizhe Zhang.
23
+ > **Embarrassingly Simple Self-Distillation Improves Code Generation**.
24
+ > arXiv:2604.01193, 2026.
25
+ > https://arxiv.org/abs/2604.01193
26
+
27
+ Implementation by **Amirani Labs**.
28
+
29
+ Core flow:
30
+
31
+ 1. Sample responses from a base model with train-time decoding settings.
32
+ 2. Fine-tune on those self-generated samples.
33
+ 3. Evaluate/run with eval-time decoding settings.
34
+
35
+ Dataset defaults:
36
+
37
+ - `--problems microsoft/rStar-Coder`
38
+ - `--dataset-config seed_sft`
39
+ - `--dataset-split train`
40
+ - Records must contain a non-empty `question` field.
41
+
42
+ This repository is an independent implementation and is **not** the original paper repository.
43
+
44
+ ## Presets
45
+
46
+ Presets encode paper-aligned hyperparameters (Table 3 mapping) for supported model families.
47
+
48
+ ```bash
49
+ mlx-ssd sample --model mlx-community/Qwen3-4B-Instruct-4bit --preset qwen3-4b-instruct --output ./ssd_data
50
+ mlx-ssd train --model mlx-community/Qwen3-4B-Instruct-4bit --preset qwen3-4b-instruct --data ./ssd_data --output ./ssd_model
51
+ mlx-ssd run --model ./ssd_model/fused --preset qwen3-4b-instruct --prompt "Write a function that..."
52
+ ```
53
+
54
+ ## Usage
55
+
56
+ Install:
57
+
58
+ ```bash
59
+ pip install -e .
60
+ ```
61
+
62
+ Three-stage flow:
63
+
64
+ ```bash
65
+ # 1) Sample
66
+ mlx-ssd sample \
67
+ --model mlx-community/Qwen3-4B-Instruct-4bit \
68
+ --problems microsoft/rStar-Coder \
69
+ --dataset-config seed_sft \
70
+ --dataset-split train \
71
+ --output ./ssd_data \
72
+ --batch-size 16 \
73
+ --temperature 1.6 \
74
+ --top-k 20 \
75
+ --top-p 0.8 \
76
+ --limit 10
77
+
78
+ # 2) Train
79
+ mlx-ssd train \
80
+ --model mlx-community/Qwen3-4B-Instruct-4bit \
81
+ --data ./ssd_data \
82
+ --output ./ssd_model \
83
+ --iters 2500
84
+
85
+ # 3) Run
86
+ mlx-ssd run \
87
+ --model ./ssd_model/fused \
88
+ --temperature 1.1 \
89
+ --top-k 20 \
90
+ --top-p 0.8 \
91
+ --prompt "Write a function that..."
92
+ ```
93
+
94
+ One-command flow:
95
+
96
+ ```bash
97
+ mlx-ssd distill \
98
+ --model mlx-community/Qwen3-4B-Instruct-4bit \
99
+ --preset qwen3-4b-instruct \
100
+ --output ./my-better-qwen
101
+ ```
102
+
103
+ Local smoke test (quick validation):
104
+
105
+ ```bash
106
+ mlx-ssd sample \
107
+ --model mlx-community/SmolLM2-135M-Instruct \
108
+ --problems microsoft/rStar-Coder \
109
+ --dataset-config seed_sft \
110
+ --dataset-split train \
111
+ --output ./.smoke/data \
112
+ --batch-size 4 \
113
+ --temperature 0.8 \
114
+ --top-k 20 \
115
+ --top-p 0.8 \
116
+ --max-tokens 64 \
117
+ --limit 5
118
+ ```
119
+
120
+ ## Apple Silicon
121
+
122
+ `mlx-ssd` itself is the Apple Silicon implementation: it is built on `mlx-lm` and targets local MLX workflows.
123
+
124
+ ## License
125
+
126
+ MIT
@@ -0,0 +1,17 @@
1
+ README.md
2
+ pyproject.toml
3
+ mlx_ssd/__init__.py
4
+ mlx_ssd/cli.py
5
+ mlx_ssd/sampler.py
6
+ mlx_ssd/trainer.py
7
+ mlx_ssd.egg-info/PKG-INFO
8
+ mlx_ssd.egg-info/SOURCES.txt
9
+ mlx_ssd.egg-info/dependency_links.txt
10
+ mlx_ssd.egg-info/entry_points.txt
11
+ mlx_ssd.egg-info/requires.txt
12
+ mlx_ssd.egg-info/top_level.txt
13
+ mlx_ssd/configs/__init__.py
14
+ mlx_ssd/configs/presets.py
15
+ mlx_ssd/utils/__init__.py
16
+ mlx_ssd/utils/data.py
17
+ mlx_ssd/utils/prompts.py
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ mlx-ssd = mlx_ssd.cli:main
@@ -0,0 +1,4 @@
1
+ mlx-lm>=0.21.0
2
+ mlx-tokenizers>=1.0.0
3
+ datasets>=3.0.0
4
+ huggingface-hub>=0.24.0
@@ -0,0 +1 @@
1
+ mlx_ssd
@@ -0,0 +1,27 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "mlx-ssd"
7
+ version = "0.1.0"
8
+ description = "Simple Self-Distillation training pipeline for MLX models"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = { text = "MIT" }
12
+ authors = [{ name = "mlx-ssd contributors" }]
13
+ dependencies = [
14
+ "mlx-lm>=0.21.0",
15
+ "mlx-tokenizers>=1.0.0",
16
+ "datasets>=3.0.0",
17
+ "huggingface-hub>=0.24.0",
18
+ ]
19
+
20
+ [project.scripts]
21
+ mlx-ssd = "mlx_ssd.cli:main"
22
+
23
+ [tool.setuptools]
24
+ include-package-data = true
25
+
26
+ [tool.setuptools.packages.find]
27
+ include = ["mlx_ssd*"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+