npm - titan-synapse - Versions diffs - 0.1.1 - Mend

titan-synapse 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (62) hide show

package/CONTRIBUTING.md +187 -0
package/Cargo.lock +3976 -0
package/Cargo.toml +10 -0
package/LICENSE +190 -0
package/PROGRESS.md +151 -0
package/README.md +514 -0
package/TEST_LOG.md +220 -0
package/config/default.yaml +36 -0
package/crates/synapse/Cargo.toml +70 -0
package/crates/synapse/src/cli/bench.rs +44 -0
package/crates/synapse/src/cli/eval.rs +395 -0
package/crates/synapse/src/cli/export.rs +45 -0
package/crates/synapse/src/cli/hub.rs +179 -0
package/crates/synapse/src/cli/import.rs +35 -0
package/crates/synapse/src/cli/learn.rs +53 -0
package/crates/synapse/src/cli/mod.rs +10 -0
package/crates/synapse/src/cli/models.rs +36 -0
package/crates/synapse/src/cli/pull.rs +60 -0
package/crates/synapse/src/cli/status.rs +52 -0
package/crates/synapse/src/cli/train.rs +99 -0
package/crates/synapse/src/config.rs +220 -0
package/crates/synapse/src/dashboard.rs +281 -0
package/crates/synapse/src/format/manifest.rs +57 -0
package/crates/synapse/src/format/mod.rs +4 -0
package/crates/synapse/src/format/packer.rs +213 -0
package/crates/synapse/src/inference/engine.rs +361 -0
package/crates/synapse/src/inference/kv_cache.rs +97 -0
package/crates/synapse/src/inference/lora.rs +166 -0
package/crates/synapse/src/inference/mod.rs +9 -0
package/crates/synapse/src/inference/model.rs +167 -0
package/crates/synapse/src/inference/sampler.rs +133 -0
package/crates/synapse/src/inference/speculative.rs +153 -0
package/crates/synapse/src/learn/cloud_fallback.rs +186 -0
package/crates/synapse/src/learn/engine.rs +109 -0
package/crates/synapse/src/learn/mod.rs +5 -0
package/crates/synapse/src/main.rs +185 -0
package/crates/synapse/src/memory/extractor.rs +201 -0
package/crates/synapse/src/memory/graph.rs +332 -0
package/crates/synapse/src/memory/hallucination.rs +259 -0
package/crates/synapse/src/memory/mod.rs +7 -0
package/crates/synapse/src/openai.rs +232 -0
package/crates/synapse/src/server.rs +166 -0
package/crates/synapse/src/streaming.rs +80 -0
package/crates/synapse/src/swarm/coordinator.rs +198 -0
package/crates/synapse/src/swarm/mod.rs +8 -0
package/crates/synapse/src/swarm/orchestrator.rs +225 -0
package/crates/synapse/src/swarm/pool.rs +64 -0
package/crates/synapse/src/swarm/spawner.rs +199 -0
package/crates/synapse/src/swarm/synthesizer.rs +26 -0
package/crates/synapse/src/vram/manager.rs +67 -0
package/crates/synapse/src/vram/mod.rs +3 -0
package/docker-compose.yml +19 -0
package/install.sh +311 -0
package/package.json +36 -0
package/python/Dockerfile.learn +18 -0
package/python/requirements.txt +11 -0
package/python/synapse_learn/__init__.py +0 -0
package/python/synapse_learn/datasets.py +233 -0
package/python/synapse_learn/real_eval.py +616 -0
package/python/synapse_learn/server.py +431 -0
package/python/synapse_learn/train_base.py +672 -0
package/python/synapse_learn/train_specialists.py +787 -0

package/install.sh ADDED Viewed

@@ -0,0 +1,311 @@
+#!/usr/bin/env bash
+# ──────────────────────────────────────────────────────────────────────
+# Titan Synapse — Install Script
+# Small models that think together. And learn.
+# https://github.com/Djtony707/titan-synapse
+# ──────────────────────────────────────────────────────────────────────
+# Abort on any failing command, on use of an unset variable, and on a failure
+# anywhere inside a pipeline.
+set -euo pipefail
+# ── Colors ────────────────────────────────────────────────────────────
+# ANSI sequences are stored as literal backslash text and only interpreted
+# at print time (by the helpers below and by the script's `echo -e` calls).
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+CYAN='\033[0;36m'
+MAGENTA='\033[0;35m'
+BOLD='\033[1m'
+DIM='\033[2m'
+NC='\033[0m'
+# ── Helpers ───────────────────────────────────────────────────────────
+# Each helper prints one tagged, colorized line built from all of its
+# arguments; %b expands backslash escapes the same way `echo -e` does.
+info() {
+    printf '%b\n' "${BLUE}[INFO]${NC}  $*"
+}
+success() {
+    printf '%b\n' "${GREEN}[OK]${NC}    $*"
+}
+warn() {
+    printf '%b\n' "${YELLOW}[WARN]${NC}  $*"
+}
+# Print the failure line, then terminate the whole script with status 1.
+fail() {
+    printf '%b\n' "${RED}[FAIL]${NC}  $*"
+    exit 1
+}
+# Section heading, preceded by a blank line.
+step() {
+    printf '%b\n' "\n${CYAN}${BOLD}>>> $*${NC}"
+}
+# ── ASCII Header ──────────────────────────────────────────────────────
+# Switch to magenta, print the banner verbatim (the quoted heredoc delimiter
+# disables expansion inside it), then reset the color.
+echo -e "${MAGENTA}"
+cat << 'BANNER'
+   ███████╗██╗   ██╗███╗   ██╗ █████╗ ██████╗ ███████╗███████╗
+   ██╔════╝╚██╗ ██╔╝████╗  ██║██╔══██╗██╔══██╗██╔════╝██╔════╝
+   ███████╗ ╚████╔╝ ██╔██╗ ██║███████║██████╔╝███████╗█████╗
+   ╚════██║  ╚██╔╝  ██║╚██╗██║██╔══██║██╔═══╝ ╚════██║██╔══╝
+   ███████║   ██║   ██║ ╚████║██║  ██║██║     ███████║███████╗
+   ╚══════╝   ╚═╝   ╚═╝  ╚═══╝╚═╝  ╚═╝╚═╝     ╚══════╝╚══════╝
+BANNER
+echo -e "${NC}"
+echo -e "   ${DIM}Tiny models. Big brain. Your hardware. No excuses.${NC}"
+echo -e "   ${DIM}────────────────────────────────────────────────${NC}"
+echo ""
+# ── Constants ─────────────────────────────────────────────────────────
+# Git remote that is cloned when no local checkout is found.
+REPO_URL="https://github.com/Djtony707/titan-synapse.git"
+# Per-user data root; models/, knowledge/, adapters/ and logs/ are created
+# beneath it and the default config.yaml is written into it.
+SYNAPSE_DIR="${HOME}/.synapse"
+# Download source for the default model and the filename it is stored under
+# inside ${SYNAPSE_DIR}/models.
+MODEL_URL="https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q4_k_m.gguf"
+MODEL_FILE="qwen2.5-0.5b-instruct-q4_k_m.gguf"
+# Name of the release binary looked up under target/release/ after the build
+# and copied into the install directory.
+BINARY_NAME="synapse"
+# ── OS Detection ──────────────────────────────────────────────────────
+# Sets OS, ARCH and PLATFORM ("linux" or "macos"); any other kernel name
+# aborts the installer.
+step "Detecting operating system"
+OS="$(uname -s)"
+ARCH="$(uname -m)"
+if [ "$OS" = "Linux" ]; then
+    PLATFORM="linux"
+    success "Linux detected (${ARCH})"
+elif [ "$OS" = "Darwin" ]; then
+    PLATFORM="macos"
+    success "macOS detected (${ARCH})"
+else
+    fail "Unsupported operating system: ${OS}. Synapse supports Linux and macOS."
+fi
+# ── Dependency Checks ─────────────────────────────────────────────────
+step "Checking dependencies"
+# git is mandatory: bail out immediately when it is missing.
+if ! command -v git >/dev/null 2>&1; then
+    fail "git is not installed. Please install git first."
+fi
+success "git found: $(git --version | head -1)"
+# Pick the first available downloader, preferring curl over wget.
+DOWNLOADER=""
+for candidate in curl wget; do
+    if command -v "$candidate" >/dev/null 2>&1; then
+        DOWNLOADER="$candidate"
+        success "${candidate} found"
+        break
+    fi
+done
+if [ -z "$DOWNLOADER" ]; then
+    fail "Neither curl nor wget found. Please install one of them."
+fi
+# ── Rust Toolchain ────────────────────────────────────────────────────
+# Uses an existing rustc + cargo pair when both are on PATH; otherwise runs
+# the rustup installer non-interactively with whichever downloader was found.
+step "Checking Rust toolchain"
+if ! command -v rustc >/dev/null 2>&1 || ! command -v cargo >/dev/null 2>&1; then
+    warn "Rust not found. Installing via rustup..."
+    case "$DOWNLOADER" in
+        curl)
+            curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+            ;;
+        *)
+            wget -qO- https://sh.rustup.rs | sh -s -- -y
+            ;;
+    esac
+    # Bring the freshly installed toolchain onto PATH for the rest of this
+    # run; a missing env file is tolerated and caught by the check below.
+    # shellcheck source=/dev/null
+    source "${HOME}/.cargo/env" 2>/dev/null || true
+    if ! command -v rustc >/dev/null 2>&1; then
+        fail "Rust installation failed. Try manually: https://rustup.rs"
+    fi
+    success "Rust installed: $(rustc --version)"
+else
+    RUST_VER="$(rustc --version)"
+    success "Rust already installed: ${RUST_VER}"
+fi
+# ── CUDA Detection ────────────────────────────────────────────────────
+# Chooses the cargo feature flags for the build: "--features cuda" when nvcc
+# is on PATH, "--features metal" on arm64 macOS when no CUDA install is found,
+# and nothing (CPU-only) otherwise.
+step "Checking for CUDA toolkit"
+CARGO_FEATURES=""
+if command -v nvcc &>/dev/null; then
+    # Extract the release number with grep -P; if that pipeline fails (for
+    # example on a grep without -P support) fall back to an equivalent sed.
+    CUDA_VER="$(nvcc --version | grep -oP 'release \K[0-9.]+' 2>/dev/null || nvcc --version | sed -n 's/.*release \([0-9.]*\).*/\1/p')"
+    success "CUDA toolkit detected: ${CUDA_VER}"
+    CARGO_FEATURES="--features cuda"
+    info "Build will include CUDA acceleration"
+elif [ -d "/usr/local/cuda" ] || [ -d "/opt/cuda" ]; then
+    # A CUDA directory without nvcc on PATH is reported but not used; note
+    # that this branch also skips the Metal check below.
+    warn "CUDA directory found but nvcc not in PATH. Building without CUDA."
+    info "To enable CUDA: export PATH=/usr/local/cuda/bin:\$PATH and re-run"
+else
+    info "No CUDA toolkit found. Building CPU-only (this is fine for starters)."
+    # Check for Metal on macOS
+    if [ "$PLATFORM" = "macos" ] && [ "$ARCH" = "arm64" ]; then
+        CARGO_FEATURES="--features metal"
+        info "Apple Silicon detected — building with Metal acceleration"
+    fi
+fi
+# ── Clone or Use Current Directory ────────────────────────────────────
+# Resolves BUILD_DIR to, in order of preference: the current directory when
+# its Cargo.toml mentions synapse, an existing ./titan-synapse directory, or
+# a fresh clone into ./titan-synapse. Finishes by changing into it.
+step "Setting up source code"
+BUILD_DIR="$(pwd)/titan-synapse"
+if [ -f "Cargo.toml" ] && grep -q "titan-synapse\|synapse" Cargo.toml 2>/dev/null; then
+    BUILD_DIR="$(pwd)"
+    success "Already in titan-synapse repo: ${BUILD_DIR}"
+elif [ -d "titan-synapse" ]; then
+    success "Found existing clone: ${BUILD_DIR}"
+else
+    info "Cloning titan-synapse..."
+    git clone "$REPO_URL" titan-synapse
+    success "Cloned to ${BUILD_DIR}"
+fi
+cd "$BUILD_DIR"
+# ── Build ─────────────────────────────────────────────────────────────
+step "Building Synapse (release mode)"
+info "This may take a few minutes on first build..."
+if [ -n "$CARGO_FEATURES" ]; then
+    info "Build flags: ${CARGO_FEATURES}"
+fi
+# CARGO_FEATURES is left unquoted on purpose: it word-splits into
+# "--features <name>" and expands to nothing at all when empty.
+cargo build --release ${CARGO_FEATURES}
+# Treat a missing binary as a failed build.
+if [ ! -f "target/release/${BINARY_NAME}" ]; then
+    fail "Build failed — binary not found at target/release/${BINARY_NAME}"
+fi
+success "Build complete!"
+# ── Create ~/.synapse Directory ───────────────────────────────────────
+step "Setting up Synapse home directory"
+# -p creates SYNAPSE_DIR itself along with each subdirectory and is a no-op
+# for directories that already exist.
+mkdir -p "${SYNAPSE_DIR}"/{models,knowledge,adapters,logs}
+success "Created ${SYNAPSE_DIR}/"
+for layout_line in \
+    "  models/     — GGUF model files" \
+    "  knowledge/  — SQLite knowledge graphs" \
+    "  adapters/   — QLoRA adapter weights" \
+    "  logs/       — Runtime logs"; do
+    info "$layout_line"
+done
+# ── Install Binary ────────────────────────────────────────────────────
+# Copies target/release/${BINARY_NAME} into the first usable directory:
+# ~/.local/bin (created if needed), then /usr/local/bin when writable, then
+# /usr/local/bin via sudo as a last resort.
+step "Installing binary"
+INSTALL_DIR=""
+if [ -d "${HOME}/.local/bin" ] || mkdir -p "${HOME}/.local/bin" 2>/dev/null; then
+    INSTALL_DIR="${HOME}/.local/bin"
+elif [ -w "/usr/local/bin" ]; then
+    INSTALL_DIR="/usr/local/bin"
+else
+    warn "Cannot write to ~/.local/bin or /usr/local/bin"
+    info "Attempting /usr/local/bin with sudo..."
+    sudo mkdir -p /usr/local/bin 2>/dev/null || true
+    if [ -w "/usr/local/bin" ] || sudo test -w "/usr/local/bin" 2>/dev/null; then
+        INSTALL_DIR="/usr/local/bin"
+        sudo cp "target/release/${BINARY_NAME}" "${INSTALL_DIR}/${BINARY_NAME}"
+        sudo chmod +x "${INSTALL_DIR}/${BINARY_NAME}"
+        success "Installed to ${INSTALL_DIR}/${BINARY_NAME} (via sudo)"
+        # The sudo path has already copied the binary; clearing INSTALL_DIR
+        # doubles as the flag that disables the unprivileged copy below.
+        INSTALL_DIR="" # skip the normal copy below
+    else
+        fail "No writable install directory. Copy target/release/synapse to your PATH manually."
+    fi
+fi
+# Unprivileged copy; runs only when a directory was chosen without sudo.
+if [ -n "$INSTALL_DIR" ]; then
+    cp "target/release/${BINARY_NAME}" "${INSTALL_DIR}/${BINARY_NAME}"
+    chmod +x "${INSTALL_DIR}/${BINARY_NAME}"
+    success "Installed to ${INSTALL_DIR}/${BINARY_NAME}"
+    # Check if install dir is in PATH
+    # (exact whole-entry match against each colon-separated PATH component)
+    if ! echo "$PATH" | tr ':' '\n' | grep -qx "$INSTALL_DIR"; then
+        warn "${INSTALL_DIR} is not in your PATH"
+        info "Add this to your shell profile:"
+        echo -e "  ${BOLD}export PATH=\"${INSTALL_DIR}:\$PATH\"${NC}"
+    fi
+fi
+# ── Download Default Model ────────────────────────────────────────────
+# Fetches the default GGUF model into ${SYNAPSE_DIR}/models unless a
+# non-empty copy is already there. A failed download is non-fatal: the
+# installer prints a retry hint and carries on.
+step "Downloading default model (Qwen2.5-0.5B Q4_K_M)"
+MODEL_PATH="${SYNAPSE_DIR}/models/${MODEL_FILE}"
+if [ -f "$MODEL_PATH" ] && [ -s "$MODEL_PATH" ]; then
+    success "Model already exists: ${MODEL_PATH}"
+else
+    info "Downloading from HuggingFace (~400MB)..."
+    # Download to a temporary name so an interrupted or failed transfer can
+    # never be mistaken for a complete model on the next run.
+    MODEL_TMP="${MODEL_PATH}.part"
+    DOWNLOAD_OK=1
+    if [ "$DOWNLOADER" = "curl" ]; then
+        # -f makes curl exit non-zero on HTTP errors instead of saving the
+        # server's error page as the model file. The `||` keeps `set -e`
+        # from aborting the script before the retry hint can be shown.
+        curl -fL --progress-bar -o "$MODEL_TMP" "$MODEL_URL" || DOWNLOAD_OK=0
+    else
+        wget --show-progress -O "$MODEL_TMP" "$MODEL_URL" || DOWNLOAD_OK=0
+    fi
+    if [ "$DOWNLOAD_OK" -eq 1 ] && [ -s "$MODEL_TMP" ]; then
+        mv -f "$MODEL_TMP" "$MODEL_PATH"
+        MODEL_SIZE=$(du -h "$MODEL_PATH" | cut -f1)
+        success "Model downloaded: ${MODEL_PATH} (${MODEL_SIZE})"
+    else
+        # Discard whatever was partially written.
+        rm -f "$MODEL_TMP"
+        warn "Model download may have failed. You can retry manually:"
+        info "  synapse pull qwen2.5-0.5b"
+    fi
+fi
+# ── Write Default Config ─────────────────────────────────────────────
+# Writes a starter config only when none exists, so an existing config.yaml
+# is never overwritten. The quoted heredoc delimiter keeps the text literal:
+# "~" in the paths below is written as-is, not expanded by the shell.
+CONFIG_PATH="${SYNAPSE_DIR}/config.yaml"
+if [ ! -f "$CONFIG_PATH" ]; then
+    cat > "$CONFIG_PATH" << 'YAML'
+# Titan Synapse Configuration
+# Docs: https://github.com/Djtony707/titan-synapse
+server:
+  host: "127.0.0.1"
+  port: 6900
+model:
+  path: "~/.synapse/models/qwen2.5-0.5b-instruct-q4_k_m.gguf"
+  context_length: 4096
+learning:
+  enabled: true
+  min_conversations: 5
+  eval_threshold: 0.7
+knowledge:
+  database: "~/.synapse/knowledge/graph.db"
+logging:
+  level: "info"
+YAML
+    success "Default config written to ${CONFIG_PATH}"
+fi
+# ── Done ──────────────────────────────────────────────────────────────
+# Closing banner followed by next-step hints; everything below only prints.
+echo ""
+echo -e "${GREEN}${BOLD}"
+cat << 'DONE'
+   ╔══════════════════════════════════════════════════════╗
+   ║          Installation complete!                     ║
+   ╚══════════════════════════════════════════════════════╝
+DONE
+echo -e "${NC}"
+echo -e "  ${BOLD}Next steps:${NC}"
+echo ""
+echo -e "    ${CYAN}1.${NC} Start the engine:"
+echo -e "       ${BOLD}synapse up${NC}"
+echo ""
+# The sample request targets port 6900, the port written to the default config.
+echo -e "    ${CYAN}2.${NC} Chat with it:"
+echo -e "       ${BOLD}curl http://localhost:6900/v1/chat/completions \\${NC}"
+echo -e "       ${BOLD}  -H 'Content-Type: application/json' \\${NC}"
+echo -e "       ${BOLD}  -d '{\"model\":\"synapse\",\"messages\":[{\"role\":\"user\",\"content\":\"Hello\"}]}'${NC}"
+echo ""
+echo -e "    ${CYAN}3.${NC} Check status:"
+echo -e "       ${BOLD}synapse status${NC}"
+echo ""
+echo -e "    ${CYAN}4.${NC} Pull more models:"
+echo -e "       ${BOLD}synapse pull qwen3-3b${NC}"
+echo ""
+echo -e "  ${DIM}Config:  ${SYNAPSE_DIR}/config.yaml${NC}"
+echo -e "  ${DIM}Models:  ${SYNAPSE_DIR}/models/${NC}"
+echo -e "  ${DIM}Docs:    https://github.com/Djtony707/titan-synapse${NC}"
+echo ""
+echo -e "  ${DIM}────────────────────────────────────────────────${NC}"
+echo -e "  ${DIM}Created by Tony Elliott${NC}"
+echo -e "  ${DIM}https://github.com/Djtony707${NC}"
+echo ""

package/package.json ADDED Viewed

@@ -0,0 +1,36 @@
+{
+  "name": "titan-synapse",
+  "version": "0.1.1",
+  "description": "A Rust inference engine that runs a swarm of tiny specialist models that collaborate and learn continuously — on your GPU.",
+  "main": "install.sh",
+  "scripts": {
+    "postinstall": "echo 'titan-synapse is a Rust binary. Run: curl -sSL https://raw.githubusercontent.com/Djtony707/titan-synapse/main/install.sh | bash'"
+  },
+  "repository": {
+    "type": "git",
+    "url": "https://github.com/Djtony707/titan-synapse.git"
+  },
+  "keywords": [
+    "ai",
+    "inference",
+    "llm",
+    "local",
+    "gpu",
+    "specialist",
+    "swarm",
+    "lora",
+    "qlora",
+    "continuous-learning",
+    "rust",
+    "gguf",
+    "openai-compatible",
+    "self-improving",
+    "titan"
+  ],
+  "author": "Tony Elliott <djtony707@gmail.com>",
+  "license": "Apache-2.0",
+  "bugs": {
+    "url": "https://github.com/Djtony707/titan-synapse/issues"
+  },
+  "homepage": "https://github.com/Djtony707/titan-synapse#readme"
+}

package/python/Dockerfile.learn ADDED Viewed

@@ -0,0 +1,18 @@
+# Image for the Python learning service (synapse_learn), built on the
+# NVIDIA CUDA 12.6.3 runtime image for Ubuntu 24.04.
+FROM nvidia/cuda:12.6.3-runtime-ubuntu24.04
+WORKDIR /app
+# Install Python and drop the apt package lists in the same layer to keep
+# the layer small.
+RUN apt-get update && apt-get install -y python3 python3-pip python3-venv && \
+    rm -rf /var/lib/apt/lists/*
+# Use a dedicated virtualenv and put it first on PATH so the `pip` and
+# `uvicorn` commands below resolve inside it.
+RUN python3 -m venv /opt/venv
+ENV PATH="/opt/venv/bin:$PATH"
+# Copy and install requirements before the source so that source-only
+# changes do not invalidate the dependency layer.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+COPY synapse_learn/ ./synapse_learn/
+EXPOSE 8090
+# Serve the ASGI app `app` from synapse_learn.server on all interfaces,
+# on the exposed port.
+CMD ["uvicorn", "synapse_learn.server:app", "--host", "0.0.0.0", "--port", "8090"]

package/python/requirements.txt ADDED Viewed

@@ -0,0 +1,11 @@
+fastapi==0.115.12
+uvicorn[standard]==0.34.2
+pydantic==2.11.3
+torch>=2.5.0
+transformers>=4.48.0
+safetensors>=0.4.0
+datasets>=3.0.0
+trl>=0.15.0
+peft>=0.15.0
+accelerate>=1.3.0
+bitsandbytes>=0.45.0

package/python/synapse_learn/__init__.py ADDED Viewed

File without changes

package/python/synapse_learn/datasets.py ADDED Viewed

@@ -0,0 +1,233 @@
+"""Public dataset downloader for specialist training.
+Uses clean, factual, publicly available datasets from HuggingFace.
+No proprietary data. No scraping. Just high-quality open datasets.
+Available datasets (keys of DATASET_REGISTRY):
+- code_python: bigcode/starcoderdata, "python" subset (Python code)
+- code_sql: b-mc2/sql-create-context (SQL queries with context)
+- math: open-web-math/open-web-math (mathematical reasoning)
+- general_instruct: yahma/alpaca-cleaned (general instructions)
+- science: camel-ai/physics (physics Q&A)
+- writing: HuggingFaceFW/fineweb-edu, "sample-10BT" subset (educational text)
+"""
+import os
+import json
+import logging
+from pathlib import Path
+from typing import Optional
+logger = logging.getLogger("synapse-datasets")
+# Data root: $SYNAPSE_DATA_DIR when set, otherwise ~/.synapse.
+DATA_DIR = Path(os.environ.get("SYNAPSE_DATA_DIR", os.path.expanduser("~/.synapse")))
+# Each downloaded dataset gets its own subdirectory here, named by registry key.
+DATASETS_DIR = DATA_DIR / "datasets"
+# NOTE: import-time side effect — the directory is created as soon as this
+# module is imported.
+DATASETS_DIR.mkdir(parents=True, exist_ok=True)
+# Registry of curated public datasets for specialist training
+# Each entry maps a local dataset id to:
+#   hf_name     — HuggingFace dataset repository passed to load_dataset()
+#   subset      — optional configuration, forwarded as load_dataset(name=...);
+#                 None means no configuration is passed
+#   description — human-readable summary shown by list_datasets()
+#   format      — record layout understood by format_for_training():
+#                 "instruction", "code", "qa" or "text"
+#   specialist  — specialist this data trains; prepare_specialist_dataset()
+#                 selects entries by this value
+DATASET_REGISTRY = {
+    "code_python": {
+        "hf_name": "bigcode/starcoderdata",
+        "subset": "python",
+        "description": "Python code from The Stack — for python_expert specialist",
+        "format": "code",
+        "specialist": "python_expert",
+    },
+    "code_sql": {
+        "hf_name": "b-mc2/sql-create-context",
+        "subset": None,
+        "description": "SQL queries with context — for sql_expert specialist",
+        "format": "instruction",
+        "specialist": "sql_expert",
+    },
+    "math": {
+        "hf_name": "open-web-math/open-web-math",
+        "subset": None,
+        "description": "OpenWebMath — mathematical reasoning and proofs",
+        "format": "text",
+        "specialist": "math_expert",
+    },
+    "general_instruct": {
+        "hf_name": "yahma/alpaca-cleaned",
+        "subset": None,
+        "description": "Cleaned Alpaca — general instruction following",
+        "format": "instruction",
+        "specialist": "general",
+    },
+    "science": {
+        "hf_name": "camel-ai/physics",
+        "subset": None,
+        "description": "Physics Q&A — for science_expert specialist",
+        "format": "qa",
+        "specialist": "science_expert",
+    },
+    "writing": {
+        "hf_name": "HuggingFaceFW/fineweb-edu",
+        "subset": "sample-10BT",
+        "description": "FineWeb-Edu — high-quality educational text",
+        "format": "text",
+        "specialist": "writing_expert",
+    },
+}
+def list_datasets() -> list:
+    """Return one summary dict per registry entry, in registry order.
+    Each summary carries the entry's id, HuggingFace name, description and
+    target specialist, plus ``downloaded``: whether the dataset's directory
+    already exists under DATASETS_DIR.
+    """
+    summaries = []
+    for dataset_id, spec in DATASET_REGISTRY.items():
+        already_present = (DATASETS_DIR / dataset_id).exists()
+        summaries.append({
+            "id": dataset_id,
+            "hf_name": spec["hf_name"],
+            "description": spec["description"],
+            "specialist": spec["specialist"],
+            "downloaded": already_present,
+        })
+    return summaries
+def download_dataset(dataset_id: str, max_samples: int = 10000) -> dict:
+    """Download a dataset from HuggingFace and prepare it for training.
+    The dataset is streamed and only the first ``max_samples`` records are
+    read, so a capped request does not pull the whole upstream dataset.
+    Usable records are written to ``<DATASETS_DIR>/<dataset_id>/train.jsonl``
+    with a ``metadata.json`` next to it.
+    Args:
+        dataset_id: Key from DATASET_REGISTRY
+        max_samples: Max number of raw records to read (non-negative); records
+            rejected by format_for_training() count against this cap
+    Returns:
+        On success, a dict with ``path``, ``samples`` and ``specialist``.
+        On any download or processing failure, ``{"error": <message>}``.
+    Raises:
+        ValueError: If ``dataset_id`` is not in DATASET_REGISTRY.
+    """
+    if dataset_id not in DATASET_REGISTRY:
+        raise ValueError(f"Unknown dataset: {dataset_id}. Available: {list(DATASET_REGISTRY.keys())}")
+    info = DATASET_REGISTRY[dataset_id]
+    output_dir = DATASETS_DIR / dataset_id
+    output_dir.mkdir(parents=True, exist_ok=True)
+    try:
+        from itertools import islice
+        from datasets import load_dataset
+        logger.info(f"Downloading {info['hf_name']}...")
+        # Stream instead of using a "train[:N]" split slice: slicing a
+        # non-streaming split still downloads and prepares the entire dataset
+        # first, which defeats max_samples for multi-gigabyte corpora.
+        # NOTE(security): trust_remote_code=True lets the dataset repository
+        # run its own loading code on this machine. Kept for compatibility;
+        # review whether any registered dataset actually needs it.
+        kwargs = {"split": "train", "streaming": True, "trust_remote_code": True}
+        if info["subset"]:
+            kwargs["name"] = info["subset"]
+        dataset = load_dataset(info["hf_name"], **kwargs)
+        # Convert to training format, dropping records the formatter rejects
+        training_data = []
+        for item in islice(dataset, max_samples):
+            formatted = format_for_training(item, info["format"])
+            if formatted:
+                training_data.append(formatted)
+        # Save as JSONL
+        output_file = output_dir / "train.jsonl"
+        with open(output_file, "w") as f:
+            for item in training_data:
+                f.write(json.dumps(item) + "\n")
+        # Save metadata
+        meta = {
+            "dataset_id": dataset_id,
+            "hf_name": info["hf_name"],
+            "samples": len(training_data),
+            "format": info["format"],
+            "specialist": info["specialist"],
+            # Modification time of train.jsonl (seconds since the epoch), as a string
+            "downloaded_at": str(Path(output_file).stat().st_mtime),
+        }
+        with open(output_dir / "metadata.json", "w") as f:
+            json.dump(meta, f, indent=2)
+        logger.info(f"Downloaded {len(training_data)} samples for {dataset_id}")
+        return {
+            "path": str(output_file),
+            "samples": len(training_data),
+            "specialist": info["specialist"],
+        }
+    except ImportError:
+        logger.error("datasets library not installed. Run: pip install datasets")
+        return {"error": "datasets library not installed"}
+    except Exception as e:
+        # Deliberate catch-all boundary: callers get an error dict, not an
+        # exception, so one failing dataset does not abort a batch.
+        logger.error(f"Failed to download {dataset_id}: {e}")
+        return {"error": str(e)}
+def format_for_training(item: dict, fmt: str) -> Optional[dict]:
+    """Convert a dataset item to Synapse training format.
+    Instruction, code and Q&A items are stored as chat-template formatted text:
+    <|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n{response}<|im_end|>
+    Items in "text" format are stored as raw (truncated) text with no wrapper.
+    Args:
+        item: Raw dataset record; field names are probed per format.
+        fmt: One of "instruction", "code", "qa" or "text".
+    Returns:
+        ``{"text": ...}``, or ``None`` when required fields are missing, the
+        content is too short, ``fmt`` is unknown, or the record is malformed.
+    """
+    try:
+        if fmt == "instruction":
+            # Alpaca-style: instruction + input → output
+            instruction = item.get("instruction") or item.get("question") or item.get("prompt", "")
+            # Supporting material lives in "input" for Alpaca-style records and
+            # in "context" for question/context/answer records (the SQL set is
+            # registered as "SQL queries with context"); without the fallback
+            # those records would be trained with their context dropped.
+            inp = item.get("input") or item.get("context", "")
+            output = item.get("output") or item.get("answer") or item.get("response", "")
+            if not instruction or not output:
+                return None
+            prompt = f"{instruction}\n{inp}".strip() if inp else instruction
+            return {
+                "text": f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n{output}<|im_end|>",
+            }
+        elif fmt == "code":
+            # Code: use the content directly as training text
+            content = item.get("content") or item.get("code") or item.get("text", "")
+            # Skip snippets under 50 characters
+            if not content or len(content) < 50:
+                return None
+            # Truncate very long code
+            content = content[:4096]
+            return {
+                "text": f"<|im_start|>user\nWrite the following code:<|im_end|>\n<|im_start|>assistant\n{content}<|im_end|>",
+            }
+        elif fmt == "qa":
+            # Q&A format; "message_1"/"message_2" are tried before the generic keys
+            question = item.get("message_1") or item.get("question") or item.get("prompt", "")
+            answer = item.get("message_2") or item.get("answer") or item.get("response", "")
+            if not question or not answer:
+                return None
+            return {
+                "text": f"<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n{answer}<|im_end|>",
+            }
+        elif fmt == "text":
+            # Raw text — used for continued pretraining
+            text = item.get("text", "")
+            # Skip passages under 100 characters
+            if not text or len(text) < 100:
+                return None
+            text = text[:4096]
+            return {"text": text}
+        # Unknown format
+        return None
+    except Exception:
+        # Best-effort by design: a malformed record (e.g. not a mapping, or a
+        # non-sized field) is skipped rather than aborting the whole download.
+        return None
+def prepare_specialist_dataset(specialist: str, max_samples: int = 5000) -> dict:
+    """Download every registry dataset assigned to ``specialist``.
+    Datasets are fetched in registry order, each capped at ``max_samples``.
+    Returns a dict with the specialist name, the per-dataset results from
+    download_dataset(), and the summed sample count (results without a
+    "samples" key, such as error results, contribute 0).
+    """
+    matching_ids = [
+        dataset_id
+        for dataset_id, spec in DATASET_REGISTRY.items()
+        if spec["specialist"] == specialist
+    ]
+    outcomes = [download_dataset(dataset_id, max_samples) for dataset_id in matching_ids]
+    sample_total = 0
+    for outcome in outcomes:
+        sample_total += outcome.get("samples", 0)
+    return {
+        "specialist": specialist,
+        "datasets": outcomes,
+        "total_samples": sample_total,
+    }
+# Script entry point: print the registry, marking datasets already on disk.
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO)
+    print("Available datasets:")
+    for ds in list_datasets():
+        status = "✗"
+        if ds["downloaded"]:
+            status = "✓"
+        print(f"  {status} {ds['id']}: {ds['description']} (for {ds['specialist']})")