titan-synapse 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. package/CONTRIBUTING.md +187 -0
  2. package/Cargo.lock +3976 -0
  3. package/Cargo.toml +10 -0
  4. package/LICENSE +190 -0
  5. package/PROGRESS.md +151 -0
  6. package/README.md +514 -0
  7. package/TEST_LOG.md +220 -0
  8. package/config/default.yaml +36 -0
  9. package/crates/synapse/Cargo.toml +70 -0
  10. package/crates/synapse/src/cli/bench.rs +44 -0
  11. package/crates/synapse/src/cli/eval.rs +395 -0
  12. package/crates/synapse/src/cli/export.rs +45 -0
  13. package/crates/synapse/src/cli/hub.rs +179 -0
  14. package/crates/synapse/src/cli/import.rs +35 -0
  15. package/crates/synapse/src/cli/learn.rs +53 -0
  16. package/crates/synapse/src/cli/mod.rs +10 -0
  17. package/crates/synapse/src/cli/models.rs +36 -0
  18. package/crates/synapse/src/cli/pull.rs +60 -0
  19. package/crates/synapse/src/cli/status.rs +52 -0
  20. package/crates/synapse/src/cli/train.rs +99 -0
  21. package/crates/synapse/src/config.rs +220 -0
  22. package/crates/synapse/src/dashboard.rs +281 -0
  23. package/crates/synapse/src/format/manifest.rs +57 -0
  24. package/crates/synapse/src/format/mod.rs +4 -0
  25. package/crates/synapse/src/format/packer.rs +213 -0
  26. package/crates/synapse/src/inference/engine.rs +361 -0
  27. package/crates/synapse/src/inference/kv_cache.rs +97 -0
  28. package/crates/synapse/src/inference/lora.rs +166 -0
  29. package/crates/synapse/src/inference/mod.rs +9 -0
  30. package/crates/synapse/src/inference/model.rs +167 -0
  31. package/crates/synapse/src/inference/sampler.rs +133 -0
  32. package/crates/synapse/src/inference/speculative.rs +153 -0
  33. package/crates/synapse/src/learn/cloud_fallback.rs +186 -0
  34. package/crates/synapse/src/learn/engine.rs +109 -0
  35. package/crates/synapse/src/learn/mod.rs +5 -0
  36. package/crates/synapse/src/main.rs +185 -0
  37. package/crates/synapse/src/memory/extractor.rs +201 -0
  38. package/crates/synapse/src/memory/graph.rs +332 -0
  39. package/crates/synapse/src/memory/hallucination.rs +259 -0
  40. package/crates/synapse/src/memory/mod.rs +7 -0
  41. package/crates/synapse/src/openai.rs +232 -0
  42. package/crates/synapse/src/server.rs +166 -0
  43. package/crates/synapse/src/streaming.rs +80 -0
  44. package/crates/synapse/src/swarm/coordinator.rs +198 -0
  45. package/crates/synapse/src/swarm/mod.rs +8 -0
  46. package/crates/synapse/src/swarm/orchestrator.rs +225 -0
  47. package/crates/synapse/src/swarm/pool.rs +64 -0
  48. package/crates/synapse/src/swarm/spawner.rs +199 -0
  49. package/crates/synapse/src/swarm/synthesizer.rs +26 -0
  50. package/crates/synapse/src/vram/manager.rs +67 -0
  51. package/crates/synapse/src/vram/mod.rs +3 -0
  52. package/docker-compose.yml +19 -0
  53. package/install.sh +311 -0
  54. package/package.json +36 -0
  55. package/python/Dockerfile.learn +18 -0
  56. package/python/requirements.txt +11 -0
  57. package/python/synapse_learn/__init__.py +0 -0
  58. package/python/synapse_learn/datasets.py +233 -0
  59. package/python/synapse_learn/real_eval.py +616 -0
  60. package/python/synapse_learn/server.py +431 -0
  61. package/python/synapse_learn/train_base.py +672 -0
  62. package/python/synapse_learn/train_specialists.py +787 -0
package/install.sh ADDED
@@ -0,0 +1,311 @@
1
#!/usr/bin/env bash
# ──────────────────────────────────────────────────────────────────────
# Titan Synapse — Install Script
# Small models that think together. And learn.
# https://github.com/Djtony707/titan-synapse
# ──────────────────────────────────────────────────────────────────────
# Abort on the first failing command, on use of an unset variable, and on a
# failure anywhere inside a pipeline.
set -euo pipefail

# ── Colors ────────────────────────────────────────────────────────────
# ANSI escape sequences, stored unexpanded; they are interpreted by
# `echo -e` / printf's %b at print time.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
MAGENTA='\033[0;35m'
BOLD='\033[1m'
DIM='\033[2m'
NC='\033[0m'

# ── Helpers ───────────────────────────────────────────────────────────
# Each helper prints one tagged line. Color codes are expanded with %b while
# the message is printed with %s, so backslashes inside a message (paths,
# command output) are shown verbatim instead of being treated as escapes.
# warn and fail write to stderr so diagnostics stay separate from normal output.
info() { printf '%b[INFO]%b %s\n' "${BLUE}" "${NC}" "$*"; }
success() { printf '%b[OK]%b %s\n' "${GREEN}" "${NC}" "$*"; }
warn() { printf '%b[WARN]%b %s\n' "${YELLOW}" "${NC}" "$*" >&2; }
# fail prints the message and terminates the script with status 1.
fail() { printf '%b[FAIL]%b %s\n' "${RED}" "${NC}" "$*" >&2; exit 1; }
step() { printf '\n%b>>> %s%b\n' "${CYAN}${BOLD}" "$*" "${NC}"; }

# ── ASCII Header ──────────────────────────────────────────────────────
echo -e "${MAGENTA}"
cat << 'BANNER'

███████╗██╗ ██╗███╗ ██╗ █████╗ ██████╗ ███████╗███████╗
██╔════╝╚██╗ ██╔╝████╗ ██║██╔══██╗██╔══██╗██╔════╝██╔════╝
███████╗ ╚████╔╝ ██╔██╗ ██║███████║██████╔╝███████╗█████╗
╚════██║ ╚██╔╝ ██║╚██╗██║██╔══██║██╔═══╝ ╚════██║██╔══╝
███████║ ██║ ██║ ╚████║██║ ██║██║ ███████║███████╗
╚══════╝ ╚═╝ ╚═╝ ╚═══╝╚═╝ ╚═╝╚═╝ ╚══════╝╚══════╝

BANNER
echo -e "${NC}"
echo -e " ${DIM}Tiny models. Big brain. Your hardware. No excuses.${NC}"
echo -e " ${DIM}────────────────────────────────────────────────${NC}"
echo ""
43
+
44
# ── Constants ─────────────────────────────────────────────────────────
# Name of the built executable and the per-user data directory.
BINARY_NAME="synapse"
SYNAPSE_DIR="${HOME}/.synapse"

# Source repository that is cloned when the script runs outside a checkout.
REPO_URL="https://github.com/Djtony707/titan-synapse.git"

# Default model: the download URL ends in the same file name that is used on disk.
MODEL_FILE="qwen2.5-0.5b-instruct-q4_k_m.gguf"
MODEL_URL="https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/${MODEL_FILE}"
50
+
51
# ── OS Detection ──────────────────────────────────────────────────────
step "Detecting operating system"

# Kernel name and machine architecture as reported by uname.
OS="$(uname -s)"
ARCH="$(uname -m)"

# Map the kernel name to the PLATFORM value used by later sections; any other
# kernel stops the installer.
if [ "$OS" = "Linux" ]; then
  PLATFORM="linux"
  success "Linux detected (${ARCH})"
elif [ "$OS" = "Darwin" ]; then
  PLATFORM="macos"
  success "macOS detected (${ARCH})"
else
  fail "Unsupported operating system: ${OS}. Synapse supports Linux and macOS."
fi
70
+
71
# ── Dependency Checks ─────────────────────────────────────────────────
step "Checking dependencies"

# git is mandatory: without it the repository cannot be cloned.
if ! command -v git >/dev/null 2>&1; then
  fail "git is not installed. Please install git first."
fi
success "git found: $(git --version | head -1)"

# Pick the first available downloader, preferring curl over wget. DOWNLOADER
# is consulted later for the rustup and model downloads.
DOWNLOADER=""
for candidate in curl wget; do
  if command -v "$candidate" >/dev/null 2>&1; then
    DOWNLOADER="$candidate"
    success "${candidate} found"
    break
  fi
done

if [ -z "$DOWNLOADER" ]; then
  fail "Neither curl nor wget found. Please install one of them."
fi
92
+
93
# ── Rust Toolchain ────────────────────────────────────────────────────
step "Checking Rust toolchain"

if command -v rustc >/dev/null 2>&1 && command -v cargo >/dev/null 2>&1; then
  # Both the compiler and cargo are already on PATH; nothing to install.
  RUST_VER="$(rustc --version)"
  success "Rust already installed: ${RUST_VER}"
else
  warn "Rust not found. Installing via rustup..."
  # Fetch the rustup bootstrap script with whichever downloader was detected
  # and run it non-interactively (-y).
  case "$DOWNLOADER" in
    curl)
      curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
      ;;
    *)
      wget -qO- https://sh.rustup.rs | sh -s -- -y
      ;;
  esac

  # Load cargo's environment into this session; a missing env file is tolerated
  # here because the rustc lookup below is the real success check.
  # shellcheck source=/dev/null
  . "${HOME}/.cargo/env" 2>/dev/null || true

  if ! command -v rustc >/dev/null 2>&1; then
    fail "Rust installation failed. Try manually: https://rustup.rs"
  fi
  success "Rust installed: $(rustc --version)"
fi
115
+
116
# ── CUDA Detection ────────────────────────────────────────────────────
step "Checking for CUDA toolkit"

# CARGO_FEATURES ends up empty (CPU-only), "--features cuda" or "--features metal".
CARGO_FEATURES=""

if command -v nvcc >/dev/null 2>&1; then
  # Extract the release number from nvcc's version banner. grep -P is tried
  # first; where it is unavailable the sed expression extracts the same field.
  CUDA_VER="$(nvcc --version | grep -oP 'release \K[0-9.]+' 2>/dev/null || nvcc --version | sed -n 's/.*release \([0-9.]*\).*/\1/p')"
  success "CUDA toolkit detected: ${CUDA_VER}"
  CARGO_FEATURES="--features cuda"
  info "Build will include CUDA acceleration"
elif [ -d "/usr/local/cuda" ] || [ -d "/opt/cuda" ]; then
  # A CUDA directory exists but the compiler is not reachable through PATH.
  warn "CUDA directory found but nvcc not in PATH. Building without CUDA."
  info "To enable CUDA: export PATH=/usr/local/cuda/bin:\$PATH and re-run"
else
  info "No CUDA toolkit found. Building CPU-only (this is fine for starters)."
  # Metal is enabled only for macOS on arm64.
  if [ "${PLATFORM}/${ARCH}" = "macos/arm64" ]; then
    CARGO_FEATURES="--features metal"
    info "Apple Silicon detected — building with Metal acceleration"
  fi
fi
136
+
137
# ── Clone or Use Current Directory ────────────────────────────────────
step "Setting up source code"

BUILD_DIR=""
if [ -f "Cargo.toml" ] && grep -q "titan-synapse\|synapse" Cargo.toml 2>/dev/null; then
  # The current directory has a Cargo.toml that mentions synapse: build here.
  BUILD_DIR="$(pwd)"
  success "Already in titan-synapse repo: ${BUILD_DIR}"
else
  # Otherwise the sources live in ./titan-synapse, cloned first if absent.
  BUILD_DIR="$(pwd)/titan-synapse"
  if [ -d "titan-synapse" ]; then
    success "Found existing clone: ${BUILD_DIR}"
  else
    info "Cloning titan-synapse..."
    git clone "$REPO_URL" titan-synapse
    success "Cloned to ${BUILD_DIR}"
  fi
fi

# All following steps use paths relative to the source tree.
cd "$BUILD_DIR"
155
+
156
# ── Build ─────────────────────────────────────────────────────────────
step "Building Synapse (release mode)"

info "This may take a few minutes on first build..."
if [ -z "$CARGO_FEATURES" ]; then
  cargo build --release
else
  info "Build flags: ${CARGO_FEATURES}"
  # Deliberately unquoted: "--features <name>" must split into two arguments.
  # shellcheck disable=SC2086
  cargo build --release ${CARGO_FEATURES}
fi

# The build counts as successful only if the expected binary exists.
if [ ! -f "target/release/${BINARY_NAME}" ]; then
  fail "Build failed — binary not found at target/release/${BINARY_NAME}"
fi
success "Build complete!"
172
+
173
# ── Create ~/.synapse Directory ───────────────────────────────────────
step "Setting up Synapse home directory"

# mkdir -p creates SYNAPSE_DIR itself along with each subdirectory and is a
# no-op for directories that already exist.
mkdir -p "${SYNAPSE_DIR}"
for subdir in models knowledge adapters logs; do
  mkdir -p "${SYNAPSE_DIR}/${subdir}"
done

success "Created ${SYNAPSE_DIR}/"
info " models/ — GGUF model files"
info " knowledge/ — SQLite knowledge graphs"
info " adapters/ — QLoRA adapter weights"
info " logs/ — Runtime logs"
187
+
188
# ── Install Binary ────────────────────────────────────────────────────
step "Installing binary"

BUILT_BINARY="target/release/${BINARY_NAME}"
USER_BIN="${HOME}/.local/bin"
SYSTEM_BIN="/usr/local/bin"

# Destination preference: the per-user bin directory (created if missing),
# then a writable /usr/local/bin, and finally /usr/local/bin through sudo.
# INSTALL_DIR is left empty when the sudo path has already copied the binary,
# which makes the regular copy further down a no-op.
INSTALL_DIR=""
if [ -d "$USER_BIN" ] || mkdir -p "$USER_BIN" 2>/dev/null; then
  INSTALL_DIR="$USER_BIN"
elif [ -w "$SYSTEM_BIN" ]; then
  INSTALL_DIR="$SYSTEM_BIN"
else
  warn "Cannot write to ~/.local/bin or /usr/local/bin"
  info "Attempting /usr/local/bin with sudo..."
  sudo mkdir -p "$SYSTEM_BIN" 2>/dev/null || true
  if ! { [ -w "$SYSTEM_BIN" ] || sudo test -w "$SYSTEM_BIN" 2>/dev/null; }; then
    fail "No writable install directory. Copy target/release/synapse to your PATH manually."
  fi
  sudo cp "$BUILT_BINARY" "${SYSTEM_BIN}/${BINARY_NAME}"
  sudo chmod +x "${SYSTEM_BIN}/${BINARY_NAME}"
  success "Installed to ${SYSTEM_BIN}/${BINARY_NAME} (via sudo)"
fi

if [ -n "$INSTALL_DIR" ]; then
  cp "$BUILT_BINARY" "${INSTALL_DIR}/${BINARY_NAME}"
  chmod +x "${INSTALL_DIR}/${BINARY_NAME}"
  success "Installed to ${INSTALL_DIR}/${BINARY_NAME}"

  # Tell the user how to extend PATH when the install directory is not on it.
  if ! echo "$PATH" | tr ':' '\n' | grep -qx "$INSTALL_DIR"; then
    warn "${INSTALL_DIR} is not in your PATH"
    info "Add this to your shell profile:"
    echo -e " ${BOLD}export PATH=\"${INSTALL_DIR}:\$PATH\"${NC}"
  fi
fi
223
+
224
# ── Download Default Model ────────────────────────────────────────────
step "Downloading default model (Qwen2.5-0.5B Q4_K_M)"

MODEL_PATH="${SYNAPSE_DIR}/models/${MODEL_FILE}"

# Only a non-empty file counts as an existing model, so an empty leftover from
# an earlier failed attempt triggers a fresh download.
if [ -f "$MODEL_PATH" ] && [ -s "$MODEL_PATH" ]; then
  success "Model already exists: ${MODEL_PATH}"
else
  info "Downloading from HuggingFace (~400MB)..."

  # Download to a temporary name and rename on success, so an interrupted or
  # failed transfer never leaves a truncated file under the final name.
  MODEL_TMP="${MODEL_PATH}.part"
  DOWNLOAD_OK=0

  # The downloader runs inside `if` so that a failure does not trip `set -e`;
  # a failed model download is reported as a warning, not a fatal error.
  if [ "$DOWNLOADER" = "curl" ]; then
    # --fail makes curl return an error on HTTP 4xx/5xx instead of saving the
    # server's error page as the model file.
    if curl --fail -L --progress-bar -o "$MODEL_TMP" "$MODEL_URL"; then
      DOWNLOAD_OK=1
    fi
  else
    if wget --show-progress -O "$MODEL_TMP" "$MODEL_URL"; then
      DOWNLOAD_OK=1
    fi
  fi

  if [ "$DOWNLOAD_OK" -eq 1 ] && [ -s "$MODEL_TMP" ]; then
    mv -f "$MODEL_TMP" "$MODEL_PATH"
    MODEL_SIZE=$(du -h "$MODEL_PATH" | cut -f1)
    success "Model downloaded: ${MODEL_PATH} (${MODEL_SIZE})"
  else
    rm -f "$MODEL_TMP"
    warn "Model download may have failed. You can retry manually:"
    info " synapse pull qwen2.5-0.5b"
  fi
fi
247
+
248
# ── Write Default Config ─────────────────────────────────────────────
CONFIG_PATH="${SYNAPSE_DIR}/config.yaml"

# Write the stock configuration to the file given as $1. The heredoc delimiter
# is quoted, so the YAML is emitted literally with no shell expansion.
write_default_config() {
  cat > "$1" << 'YAML'
# Titan Synapse Configuration
# Docs: https://github.com/Djtony707/titan-synapse

server:
  host: "127.0.0.1"
  port: 6900

model:
  path: "~/.synapse/models/qwen2.5-0.5b-instruct-q4_k_m.gguf"
  context_length: 4096

learning:
  enabled: true
  min_conversations: 5
  eval_threshold: 0.7

knowledge:
  database: "~/.synapse/knowledge/graph.db"

logging:
  level: "info"
YAML
}

# An existing config file is never overwritten.
if [ ! -f "$CONFIG_PATH" ]; then
  write_default_config "$CONFIG_PATH"
  success "Default config written to ${CONFIG_PATH}"
fi
277
+
278
# ── Done ──────────────────────────────────────────────────────────────
echo ""
echo -e "${GREEN}${BOLD}"
cat << 'DONE'
╔══════════════════════════════════════════════════════╗
║ Installation complete! ║
╚══════════════════════════════════════════════════════╝
DONE
echo -e "${NC}"

echo -e " ${BOLD}Next steps:${NC}"
echo ""
echo -e " ${CYAN}1.${NC} Start the engine:"
echo -e " ${BOLD}synapse up${NC}"
echo ""
echo -e " ${CYAN}2.${NC} Chat with it:"
# The two continuation lines end in a literal "\" followed by the color reset.
# Inside double quotes "\\\\" becomes two backslashes, which `echo -e` prints
# as one. With only "\\" the single remaining backslash fused with the leading
# backslash of ${NC} ("\033[0m"), so echo printed "\033[0m" as text and never
# reset the bold attribute.
echo -e " ${BOLD}curl http://localhost:6900/v1/chat/completions \\\\${NC}"
echo -e " ${BOLD} -H 'Content-Type: application/json' \\\\${NC}"
echo -e " ${BOLD} -d '{\"model\":\"synapse\",\"messages\":[{\"role\":\"user\",\"content\":\"Hello\"}]}'${NC}"
echo ""
echo -e " ${CYAN}3.${NC} Check status:"
echo -e " ${BOLD}synapse status${NC}"
echo ""
echo -e " ${CYAN}4.${NC} Pull more models:"
echo -e " ${BOLD}synapse pull qwen3-3b${NC}"
echo ""
echo -e " ${DIM}Config: ${SYNAPSE_DIR}/config.yaml${NC}"
echo -e " ${DIM}Models: ${SYNAPSE_DIR}/models/${NC}"
echo -e " ${DIM}Docs: https://github.com/Djtony707/titan-synapse${NC}"
echo ""
echo -e " ${DIM}────────────────────────────────────────────────${NC}"
echo -e " ${DIM}Created by Tony Elliott${NC}"
echo -e " ${DIM}https://github.com/Djtony707${NC}"
echo ""
package/package.json ADDED
@@ -0,0 +1,36 @@
1
+ {
2
+ "name": "titan-synapse",
3
+ "version": "0.1.1",
4
+ "description": "A Rust inference engine that runs a swarm of tiny specialist models that collaborate and learn continuously — on your GPU.",
5
+ "main": "install.sh",
6
+ "scripts": {
7
+ "postinstall": "echo 'titan-synapse is a Rust binary. Run: curl -sSL https://raw.githubusercontent.com/Djtony707/titan-synapse/main/install.sh | bash'"
8
+ },
9
+ "repository": {
10
+ "type": "git",
11
+ "url": "https://github.com/Djtony707/titan-synapse.git"
12
+ },
13
+ "keywords": [
14
+ "ai",
15
+ "inference",
16
+ "llm",
17
+ "local",
18
+ "gpu",
19
+ "specialist",
20
+ "swarm",
21
+ "lora",
22
+ "qlora",
23
+ "continuous-learning",
24
+ "rust",
25
+ "gguf",
26
+ "openai-compatible",
27
+ "self-improving",
28
+ "titan"
29
+ ],
30
+ "author": "Tony Elliott <djtony707@gmail.com>",
31
+ "license": "Apache-2.0",
32
+ "bugs": {
33
+ "url": "https://github.com/Djtony707/titan-synapse/issues"
34
+ },
35
+ "homepage": "https://github.com/Djtony707/titan-synapse#readme"
36
+ }
@@ -0,0 +1,18 @@
1
# Runs synapse_learn.server:app under uvicorn on port 8090, on top of the
# NVIDIA CUDA 12.6.3 runtime image.
FROM nvidia/cuda:12.6.3-runtime-ubuntu24.04

WORKDIR /app

# System Python with pip and venv support. The apt package index is removed in
# the same layer so it is not stored in the image.
RUN apt-get update \
 && apt-get install -y \
      python3 \
      python3-pip \
      python3-venv \
 && rm -rf /var/lib/apt/lists/*

# Create a virtualenv and put it first on PATH so the `pip` and `uvicorn`
# commands below resolve to it.
RUN python3 -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Install dependencies before copying the sources so this layer can be reused
# when only application code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY synapse_learn/ ./synapse_learn/

EXPOSE 8090

CMD ["uvicorn", "synapse_learn.server:app", "--host", "0.0.0.0", "--port", "8090"]
@@ -0,0 +1,11 @@
1
+ fastapi==0.115.12
2
+ uvicorn[standard]==0.34.2
3
+ pydantic==2.11.3
4
+ torch>=2.5.0
5
+ transformers>=4.48.0
6
+ safetensors>=0.4.0
7
+ datasets>=3.0.0
8
+ trl>=0.15.0
9
+ peft>=0.15.0
10
+ accelerate>=1.3.0
11
+ bitsandbytes>=0.45.0
File without changes
@@ -0,0 +1,233 @@
1
+ """Public dataset downloader for specialist training.
2
+
3
+ Uses clean, factual, publicly available datasets from HuggingFace.
4
+ No proprietary data. No scraping. Just high-quality open datasets.
5
+
6
+ Available datasets (see DATASET_REGISTRY):
7
+ - StarCoderData (The Stack), Python subset: Python code
8
+ - sql-create-context: SQL queries with schema context
9
+ - OpenWebMath: Mathematical reasoning
10
+ - Alpaca-Cleaned: General instruction following
11
+ - CAMEL-AI Physics: Physics Q&A
12
+ - FineWeb-Edu: Educational text
13
+ """
14
+
15
+ import os
16
+ import json
17
+ import logging
18
+ from pathlib import Path
19
+ from typing import Optional
20
+
21
logger = logging.getLogger("synapse-datasets")

# Root of the Synapse data tree; SYNAPSE_DATA_DIR overrides the ~/.synapse default.
DATA_DIR = Path(os.environ.get("SYNAPSE_DATA_DIR", os.path.expanduser("~/.synapse")))
# Prepared datasets live in one sub-directory per registry key. The directory
# is created as a side effect of importing this module.
DATASETS_DIR = DATA_DIR / "datasets"
DATASETS_DIR.mkdir(parents=True, exist_ok=True)

# Registry of curated public datasets for specialist training.
# Every entry has the same five fields:
#   hf_name      HuggingFace dataset repository
#   subset       configuration name passed to load_dataset, or None
#   description  human-readable summary
#   format       row layout understood by format_for_training
#   specialist   specialist the data is intended for
DATASET_REGISTRY = {
    "code_python": dict(
        hf_name="bigcode/starcoderdata",
        subset="python",
        description="Python code from The Stack — for python_expert specialist",
        format="code",
        specialist="python_expert",
    ),
    "code_sql": dict(
        hf_name="b-mc2/sql-create-context",
        subset=None,
        description="SQL queries with context — for sql_expert specialist",
        format="instruction",
        specialist="sql_expert",
    ),
    "math": dict(
        hf_name="open-web-math/open-web-math",
        subset=None,
        description="OpenWebMath — mathematical reasoning and proofs",
        format="text",
        specialist="math_expert",
    ),
    "general_instruct": dict(
        hf_name="yahma/alpaca-cleaned",
        subset=None,
        description="Cleaned Alpaca — general instruction following",
        format="instruction",
        specialist="general",
    ),
    "science": dict(
        hf_name="camel-ai/physics",
        subset=None,
        description="Physics Q&A — for science_expert specialist",
        format="qa",
        specialist="science_expert",
    ),
    "writing": dict(
        hf_name="HuggingFaceFW/fineweb-edu",
        subset="sample-10BT",
        description="FineWeb-Edu — high-quality educational text",
        format="text",
        specialist="writing_expert",
    ),
}
72
+
73
+
74
def list_datasets() -> list:
    """Return one summary dict per entry in DATASET_REGISTRY.

    Each summary holds the registry key (``id``), the HuggingFace name, the
    description, the target specialist, and a ``downloaded`` flag.
    """
    summaries = []
    for key, spec in DATASET_REGISTRY.items():
        summary = {
            "id": key,
            "hf_name": spec["hf_name"],
            "description": spec["description"],
            "specialist": spec["specialist"],
            # "downloaded" only means the dataset's directory exists.
            "downloaded": (DATASETS_DIR / key).exists(),
        }
        summaries.append(summary)
    return summaries
86
+
87
+
88
def download_dataset(dataset_id: str, max_samples: int = 10000) -> dict:
    """Download a registry dataset from HuggingFace and store it as JSONL.

    The first ``max_samples`` rows of the ``train`` split are converted with
    ``format_for_training`` and written to
    ``<DATASETS_DIR>/<dataset_id>/train.jsonl``, next to a ``metadata.json``
    file. Rows that ``format_for_training`` rejects are skipped, so the number
    of stored samples can be lower than ``max_samples``.

    Args:
        dataset_id: Key from DATASET_REGISTRY.
        max_samples: Upper bound on the rows taken from the train split. Must
            not be negative.

    Returns:
        On success a dict with ``path``, ``samples`` and ``specialist``.
        On failure a dict with a single ``error`` key.

    Raises:
        ValueError: If ``dataset_id`` is not in the registry or ``max_samples``
            is negative.
    """
    if dataset_id not in DATASET_REGISTRY:
        raise ValueError(f"Unknown dataset: {dataset_id}. Available: {list(DATASET_REGISTRY.keys())}")
    # A negative bound would turn the split slice into "all but the last N
    # rows", the opposite of a sample cap.
    if max_samples < 0:
        raise ValueError(f"max_samples must be non-negative, got {max_samples}")

    info = DATASET_REGISTRY[dataset_id]
    output_dir = DATASETS_DIR / dataset_id

    try:
        from datasets import load_dataset

        logger.info(f"Downloading {info['hf_name']}...")

        # SECURITY NOTE(review): trust_remote_code=True lets the dataset
        # repository run its own loading script on this machine. Confirm that
        # every registry entry needs it; drop the flag if none does.
        # NOTE(review): the row limit is expressed through the split slice;
        # verify whether the `datasets` library still fetches the full split
        # before slicing (streaming mode may be needed for large corpora).
        kwargs = {"split": f"train[:{max_samples}]", "trust_remote_code": True}
        if info["subset"]:
            kwargs["name"] = info["subset"]

        dataset = load_dataset(info["hf_name"], **kwargs)

        # Convert to training format, dropping rows that cannot be formatted.
        formatted_rows = (format_for_training(item, info["format"]) for item in dataset)
        training_data = [row for row in formatted_rows if row]

        # Create the directory only once the data is in hand: list_datasets()
        # reports a dataset as downloaded as soon as its directory exists, so
        # creating it before a failed download would produce a false positive.
        output_dir.mkdir(parents=True, exist_ok=True)

        # Save as JSONL
        output_file = output_dir / "train.jsonl"
        with open(output_file, "w", encoding="utf-8") as f:
            f.writelines(json.dumps(row) + "\n" for row in training_data)

        # Save metadata; "downloaded_at" is the JSONL file's modification time
        # (seconds since the epoch) rendered as a string.
        meta = {
            "dataset_id": dataset_id,
            "hf_name": info["hf_name"],
            "samples": len(training_data),
            "format": info["format"],
            "specialist": info["specialist"],
            "downloaded_at": str(output_file.stat().st_mtime),
        }
        with open(output_dir / "metadata.json", "w", encoding="utf-8") as f:
            json.dump(meta, f, indent=2)

        logger.info(f"Downloaded {len(training_data)} samples for {dataset_id}")

        return {
            "path": str(output_file),
            "samples": len(training_data),
            "specialist": info["specialist"],
        }

    except ImportError:
        logger.error("datasets library not installed. Run: pip install datasets")
        return {"error": "datasets library not installed"}
    except Exception as e:
        # Boundary handler: callers get an error dict instead of an exception;
        # the traceback is kept in the log for diagnosis.
        logger.exception(f"Failed to download {dataset_id}: {e}")
        return {"error": str(e)}
155
+
156
+
157
def _chat_text(prompt, response) -> str:
    """Render one user/assistant exchange in the ChatML-style training template."""
    return f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n{response}<|im_end|>"


def format_for_training(item: dict, fmt: str) -> Optional[dict]:
    """Convert a dataset item to Synapse training format.

    The "instruction", "code" and "qa" formats are rendered as a single
    user/assistant exchange wrapped in <|im_start|> / <|im_end|> markers.
    The "text" format is passed through as raw (truncated) text.

    Args:
        item: One dataset row.
        fmt: Row layout: "instruction", "code", "qa" or "text".

    Returns:
        ``{"text": ...}`` for a usable row, or ``None`` when required fields
        are missing, the content is too short, ``fmt`` is unknown, or the row
        cannot be processed at all.
    """
    try:
        if fmt == "instruction":
            # Alpaca-style: instruction + input → output
            instruction = item.get("instruction") or item.get("question") or item.get("prompt", "")
            # "context" is the extra-input column of question/context/answer
            # datasets (e.g. the CREATE TABLE schema for SQL questions); without
            # it the prompt would lack the information the answer depends on.
            inp = item.get("input") or item.get("context") or ""
            output = item.get("output") or item.get("answer") or item.get("response", "")

            if not instruction or not output:
                return None

            prompt = f"{instruction}\n{inp}".strip() if inp else instruction
            return {"text": _chat_text(prompt, output)}

        if fmt == "code":
            # Code: use the content directly as training text
            content = item.get("content") or item.get("code") or item.get("text", "")
            if not content or len(content) < 50:
                return None
            # Truncate very long code
            return {"text": _chat_text("Write the following code:", content[:4096])}

        if fmt == "qa":
            # Q&A format
            question = item.get("message_1") or item.get("question") or item.get("prompt", "")
            answer = item.get("message_2") or item.get("answer") or item.get("response", "")
            if not question or not answer:
                return None
            return {"text": _chat_text(question, answer)}

        if fmt == "text":
            # Raw text — used for continued pretraining
            text = item.get("text", "")
            if not text or len(text) < 100:
                return None
            return {"text": text[:4096]}

        return None

    except Exception:
        # Deliberate best-effort: a malformed row (not a mapping, non-string
        # field, ...) is skipped rather than aborting the whole dataset.
        return None
211
+
212
+
213
def prepare_specialist_dataset(specialist: str, max_samples: int = 5000) -> dict:
    """Download every registry dataset assigned to ``specialist``.

    Returns a dict with the specialist name, the per-dataset results from
    ``download_dataset`` (in registry order), and the sum of their ``samples``
    counts. Results without a ``samples`` key add nothing to the total.
    """
    matching_ids = [
        ds_id
        for ds_id, spec in DATASET_REGISTRY.items()
        if spec["specialist"] == specialist
    ]
    outcomes = [download_dataset(ds_id, max_samples) for ds_id in matching_ids]

    total = 0
    for outcome in outcomes:
        total += outcome.get("samples", 0)

    return {
        "specialist": specialist,
        "datasets": outcomes,
        "total_samples": total,
    }
226
+
227
+
228
if __name__ == "__main__":
    # Script mode: print the registry with a downloaded / not-downloaded marker.
    logging.basicConfig(level=logging.INFO)
    print("Available datasets:")
    for entry in list_datasets():
        marker = "✗"
        if entry["downloaded"]:
            marker = "✓"
        print(f" {marker} {entry['id']}: {entry['description']} (for {entry['specialist']})")