groove-dev 0.27.116 → 0.27.118
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/TRAINING_DATA_v4.md +6 -6
- package/moe-training/client/domain-tagger.js +21 -1
- package/moe-training/client/trajectory-capture.js +36 -7
- package/moe-training/test/client/trajectory-capture.test.js +182 -1
- package/moe-training/test/shared/envelope-schema.test.js +3 -3
- package/node_modules/@groove-dev/cli/package.json +1 -1
- package/node_modules/@groove-dev/daemon/package.json +1 -1
- package/node_modules/@groove-dev/daemon/src/process.js +35 -9
- package/node_modules/@groove-dev/daemon/src/tunnel-manager.js +18 -1
- package/node_modules/@groove-dev/gui/dist/assets/{index-fq--PD7_.js → index-BunEIVjD.js} +14 -14
- package/node_modules/@groove-dev/gui/dist/index.html +1 -1
- package/node_modules/@groove-dev/gui/package.json +1 -1
- package/node_modules/@groove-dev/gui/src/components/settings/quick-connect.jsx +29 -7
- package/node_modules/moe-training/client/domain-tagger.js +21 -1
- package/node_modules/moe-training/client/trajectory-capture.js +36 -7
- package/node_modules/moe-training/test/client/trajectory-capture.test.js +182 -1
- package/node_modules/moe-training/test/shared/envelope-schema.test.js +3 -3
- package/package.json +1 -1
- package/packages/cli/package.json +1 -1
- package/packages/daemon/package.json +1 -1
- package/packages/daemon/src/process.js +35 -9
- package/packages/daemon/src/tunnel-manager.js +18 -1
- package/packages/gui/dist/assets/{index-fq--PD7_.js → index-BunEIVjD.js} +14 -14
- package/packages/gui/dist/index.html +1 -1
- package/packages/gui/package.json +1 -1
- package/packages/gui/src/components/settings/quick-connect.jsx +29 -7
package/TRAINING_DATA_v4.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
{"envelope_id":"env_f93f9a50-bda7-453f-ac14-5fdaf7e4b9f0","session_id":"sess_a3717729-38ba-4b17-bc7b-ee6de297819c","chunk_sequence":0,"contributor_id":"23e6dd8c4a884836bf78cafda9fbe57d","attestation":{"session_hmac":"54323779fa5dd41968d8a3a47bd224c14904de2105d80c0f019fc3bce2b2242a","sequence":0,"app_version_hash":"4a527565d85cc601d3c47c376e4f361d5af987de58f60cf71dbc263a0de91247"},"metadata":{"model_engine":"gpt-5.5","provider":"codex","agent_role":"planner","agent_id":"0f8f9413","task_complexity":"medium","team_size":1,"session_quality":0,"groove_version":"0.27.115","leaf_context":null},"trajectory_log":[{"step":1,"type":"thought","timestamp":1777357867.668,"content":"I’m planner-15, ready to create a team plan and route work for this project; what would you like the team to work on?","token_count":30},{"step":2,"type":"resolution","timestamp":1777357867.736,"content":"","token_count":1}]}
|
|
2
|
-
{"envelope_id":"
|
|
3
|
-
{"envelope_id":"
|
|
4
|
-
{"envelope_id":"env_1317b05c-9986-4d33-b386-4020575623cb","session_id":"sess_2bcbe7fe-a1b0-488e-bf66-a20381bb57ea","chunk_sequence":0,"contributor_id":"23e6dd8c4a884836bf78cafda9fbe57d","attestation":{"session_hmac":"1ff8b76c0044294d19713e0bc4b36ef76f7eae2daf6eafd0d00d11b1ed2c227c","sequence":0,"app_version_hash":"4a527565d85cc601d3c47c376e4f361d5af987de58f60cf71dbc263a0de91247"},"metadata":{"model_engine":"gpt-5.5","provider":"codex","agent_role":"planner","agent_id":"c7f1a11c","task_complexity":"medium","team_size":1,"session_quality":0,"groove_version":"0.27.115","leaf_context":null},"trajectory_log":[{"step":1,"type":"thought","timestamp":1777357911.75,"content":"I’ll quickly check the team registry and the relevant app entry points, then route the work without implementing it.","token_count":29},{"step":2,"type":"action","timestamp":1777357912.464,"tool":"command_execution","arguments":{"command":"/bin/zsh -lc 'pwd && ls && (test -f AGENTS_REGISTRY.md && cat AGENTS_REGISTRY.md || true) && (test -f .groove/recommended-team.json && cat .groove/recommended-team.json || true)'"},"content":"Executing: /bin/zsh -lc 'pwd && ls && (test -f AGENTS_REGISTRY.md && cat AGENTS_REGISTRY.md || true) && (test -f .groove/recommended-team.json && cat .groove/recommended-team.json || true)'","token_count":96},{"step":3,"type":"observation","timestamp":1777357912.512,"is_error":false,"content":"~/Desktop/groove-project/groove/codex\n","truncated":false,"original_token_count":12,"token_count":10},{"step":4,"type":"thought","timestamp":1777357919.886,"content":"No registry surfaced in the current folder, so I’m checking hidden project metadata and nearby package hints.","token_count":28},{"step":5,"type":"action","timestamp":1777357920.233,"tool":"command_execution","arguments":{"command":"/bin/zsh -lc \"ls -la && find . -maxdepth 3 -type f \\\\( -name 'AGENTS_REGISTRY.md' -o -name 'recommended-team.json' -o -name 'package.json' -o -name 'README.md' \\\\) -print\""},"content":"Executing: /bin/zsh -lc \"ls -la && find . -maxdepth 3 -type f \\\\( -name 'AGENTS_REGISTRY.md' -o -name 'recommended-team.json' -o -name 'package.json' -o -name 'README.md' \\\\) -print\"","token_count":94},{"step":6,"type":"observation","timestamp":1777357920.285,"is_error":false,"content":"total 0\ndrwxr-xr-x@ 2 rok staff 64 Apr 27 23:30 .\ndrwxr-xr-x@ 52 rok staff 1664 Apr 27 23:31 ..\n","truncated":false,"original_token_count":26,"token_count":26},{"step":7,"type":"thought","timestamp":1777357941.242,"content":"I’ll create the routing config for a new static landing-page project in this empty workspace.","token_count":24},{"step":8,"type":"action","timestamp":1777357947.238,"tool":"command_execution","arguments":{"command":"/bin/zsh -lc \"mkdir -p .groove && cat > .groove/recommended-team.json <<'EOF'\n{\n \\\"projectDir\\\": \\\"coming-soon-landing\\\",\n \\\"agents\\\": [\n {\n \\\"role\\\": \\\"frontend\\\",\n \\\"phase\\\": 1,\n \\\"scope\\\": [\n \\\"**/*\\\"\n ],\n \\\"prompt\\\": \\\"Build a basic static coming-soon landing page with a 5-day countdown. Create a small self-contained site in this project directory with an index.html entry point plus any needed CSS/JS assets. The page should include a clear product/brand placeholder headline, short supporting copy, email signup UI as a non-submitting visual form or safe mailto-free placeholder, and a responsive countdown showing days, hours, minutes, and seconds. The countdown should start at 5 days for a first-time visitor and persist the target timestamp in localStorage so refreshing does not reset it. Keep the design polished but lightweight, mobile-friendly, accessible, and dependency-free unless a minimal toolchain is already created by you. Do not delete unrelated files.\\\"\n },\n {\n \\\"role\\\": \\\"fullstack\\\",\n \\\"phase\\\": 2,\n \\\"scope\\\": [],\n \\\"prompt\\\": \\\"QC Senior Dev: Audit the frontend landing page for correctness, accessibility, responsiveness, and countdown behavior. Verify the countdown persists across refreshes via localStorage and handles expiry cleanly. Run any available validation/build commands if the frontend agent added a toolchain; otherwise inspect the static files directly. Do NOT start long-running dev servers. Do NOT open files in a browser with an 'open' command. Commit all changes.\\\"\n }\n ],\n \\\"preview\\\": {\n \\\"kind\\\": \\\"static-html\\\",\n \\\"command\\\": \\\"\\\",\n \\\"openPath\\\": \\\"index.html\\\"\n }\n}\nEOF\ncat .groove/recommended-team.json\""},"content":"Executing: /bin/zsh -lc \"mkdir -p .groove && cat > .groove/recommended-team.json <<'EOF'\n{\n \\\"projectDir\\\": \\\"coming-soon-landing\\\",\n \\\"agents\\\": [\n {\n \\\"role\\\": \\\"frontend\\\",\n \\\"phase\\\": 1,\n \\\"scope\\\": [\n \\\"**/*\\\"\n ],\n \\\"prompt\\\": \\\"Build a basic static coming-soon landing page with a 5-day countdown. Create a small self-contained site in this project directory with an index.html entry point plus any needed CSS/JS assets. The page should include a clear product/brand placeholder headline, short supporting copy, email signup UI as a non-submitting visual form or safe mailto-free placeholder, and a responsive countdown showing days, hours, minutes, and seconds. The countdown should start at 5 days for a first-time visitor and persist the target timestamp in localStorage so refreshing does not reset it. Keep the design polished but lightweight, mobile-friendly, accessible, and dependency-free unless a minimal toolchain is already created by you. Do not delete unrelated files.\\\"\n },\n {\n \\\"role\\\": \\\"fullstack\\\",\n \\\"phase\\\": 2,\n \\\"scope\\\": [],\n \\\"prompt\\\": \\\"QC Senior Dev: Audit the frontend landing page for correctness, accessibility, responsiveness, and countdown behavior. Verify the countdown persists across refreshes via localStorage and handles expiry cleanly. Run any available validation/build commands if the frontend agent added a toolchain; otherwise inspect the static files directly. Do NOT start long-running dev servers. Do NOT open files in a browser with an 'open' command. Commit all changes.\\\"\n }\n ],\n \\\"preview\\\": {\n \\\"kind\\\": \\\"static-html\\\",\n \\\"command\\\": \\\"\\\",\n \\\"openPath\\\": \\\"index.html\\\"\n }\n}\nEOF\ncat .groove/recommended-team.json\"","token_count":906},{"step":9,"type":"observation","timestamp":1777357947.29,"is_error":false,"content":"{\n \"projectDir\": \"coming-soon-landing\",\n \"agents\": [\n {\n \"role\": \"frontend\",\n \"phase\": 1,\n \"scope\": [\n \"**/*\"\n ],\n \"prompt\": \"Build a basic static coming-soon landing page with a 5-day countdown. Create a small self-contained site in this project directory with an index.html entry point plus any needed CSS/JS assets. The page should include a clear product/brand placeholder headline, short supporting copy, email signup UI as a non-submitting visual form or safe mailto-free placeholder, and a responsive countdown showing days, hours, minutes, and seconds. The countdown should start at 5 days for a first-time visitor and persist the target timestamp in localStorage so refreshing does not reset it. Keep the design polished but lightweight, mobile-friendly, accessible, and dependency-free unless a minimal toolchain is already created by you. Do not delete unrelated files.\"\n },\n {\n \"role\": \"fullstack\",\n \"phase\": 2,\n \"scope\": [],\n \"prompt\": \"QC Senior Dev: Audit the frontend landing page for correctness, accessibility, responsiveness, and countdown behavior. Verify the countdown persists across refreshes via localStorage and handles expiry cleanly. Run any available validation/build commands if the frontend agent added a toolchain; otherwise inspect the static files directly. Do NOT start long-running dev servers. Do NOT open files in a browser with an 'open' command. Commit all changes.\"\n }\n ],\n \"preview\": {\n \"kind\": \"static-html\",\n \"command\": \"\",\n \"openPath\": \"index.html\"\n }\n}\n","truncated":false,"original_token_count":394,"token_count":394},{"step":10,"type":"thought","timestamp":1777357951.62,"content":"Routing plan written to `.groove/recommended-team.json:1`.\n\n- Created a new-project team plan for `coming-soon-landing`.\n- Assigned `frontend` to build a static `index.html` coming-soon page with a persisted 5-day countdown.\n- Assigned `fullstack` as phase 2 QC to verify responsiveness, accessibility, countdown persistence, and expiry behavior.\n- Configured preview as `static-html` opening `index.html`.\n\nNo source code was implemented by me.","token_count":112},{"step":11,"type":"resolution","timestamp":1777357951.695,"content":"","token_count":1}]}
|
|
5
|
-
{"envelope_id":"
|
|
6
|
-
{"envelope_id":"
|
|
1
|
+
{"envelope_id":"env_2cd6991b-44b5-42d2-b9d5-4ad407cabd6d","session_id":"sess_88f3b102-d502-475e-88f5-d83bb8441daa","chunk_sequence":0,"contributor_id":"bc2094bbb558420f827b14b23bdeef61","attestation":{"session_hmac":"fd30b5086d62973fd51847f7c6280db0690956e8d731601bd4be9d089a4c1d3d","sequence":0,"app_version_hash":"4a527565d85cc601d3c47c376e4f361d5af987de58f60cf71dbc263a0de91247"},"metadata":{"model_engine":"claude-opus-4-6","provider":"claude-code","agent_role":"planner","agent_id":"3a5e9224","task_complexity":"medium","team_size":1,"session_quality":0,"groove_version":"0.27.117","leaf_context":null},"trajectory_log":[{"step":1,"type":"thought","timestamp":1777410434.55,"content":"This is where Hummingbird goes from \"cheaper inference\" to \"fundamentally different architecture.\" Let me think through this with you.\n\n---\n\nTHE PROBLEM WITH CURRENT AGENTS\n\nClaude Code, Codex, Cursor — they're all the same pattern: one big generalist model running a sequential loop. Think, call tool, observe, think, call tool, observe. The model handles EVERYTHING in that loop: task decomposition, domain reasoning, tool selection, code generation, error recovery, and self-correction. All in one 200B-parameter brain, one turn at a time.\n\nThat's not how real engineering teams work. Real teams have a tech lead who decomposes the task, a frontend dev writing React, a backend dev writing APIs, a DBA designing schemas — all working in parallel, all specialized. The tech lead doesn't write every line of code. They coordinate.\n\nYou already understand this — that's literally what Groove does. Planner, frontend, backend, fullstack. But right now Groove orchestrates frontier models to play those roles. What if the agents WERE the architecture?\n\n---\n\nNANO AGENT ARCHITECTURE\n\nHere's how I'd think about it:\n\nLAYER 1 — THE DECOMPOSER (a specialized leaf)\n\nThis is a planning/reasoning leaf trained specifically on task decomposition. It receives the full user prompt and produces a structured execution plan:\n\nInput: \"Build a full-stack app with React frontend, FastAPI backend, PostgreSQL, deploy to Docker with CI/CD\"\n\nOutput: a dependency graph of sub-tasks, each tagged with a domain:\n\n```\ntask_1: Design database schema [postgresql] deps: none\ntask_2: Build FastAPI endpoints [python] deps: task_1\ntask_3: Build React frontend [react] deps: task_2 (API spec)\ntask_4: Write Dockerfile + compose [devops] deps: task_2, task_3\ntask_5: Configure GitHub Actions CI [ci_cd] deps: task_4\n```\n\nThis leaf doesn't generate code. It generates plans. And we already have training data for this — your planner role sessions are exactly this pattern. 7 planner sessions in today's data.\n\nLAYER 2 — THE ROUTER (already exists, needs one upgrade)\n\nCurrently the router takes one prompt and finds one leaf. For nano agents, it needs to:\n- Accept a structured plan from the decomposer\n- Route each sub-task to the right leaf simultaneously\n- Return a set of (task, leaf, centroid_confidence) tuples\n- Handle the case where a sub-task spans domains (secondary/tertiary domain_tags already exist in our telemetry — that's the signal for multi-leaf needs)\n\nThis is still cosine similarity math. No new model needed. Just batch routing instead of single routing.\n\nLAYER 3 — NANO AGENTS (the new primitive)\n\nEach nano agent is a chassis + domain leaf, spun up to handle one sub-task. Key properties:\n\n- Lightweight: ~1.2GB chassis + 50MB leaf. You can run 5-6 on a single machine with 16GB RAM.\n- Fast spin-up: leaf hot-swap is <500ms on LAN, sub-millisecond if cached. Compare to spinning up a frontier model API call.\n- NATIVELY agentic: this is the crucial part. The leaf isn't trained on \"Python knowledge.\" It's trained on Python AGENT BEHAVIOR — ReAct trajectories of tool calls. Read files, edit code, run commands, observe output, debug errors. The training data IS agent behavior.\n- Parallel: no shared context window. Each nano agent has its own context, its own tool calls, its own execution stream. Five agents working simultaneously, not one agent doing five things sequentially.\n- Disposable: a nano agent finishes its sub-task and shuts down. The leaf unloads. The chassis is free for the next task. No persistent session cost.\n\nLAYER 4 — THE COMPOSER (another specialized leaf)\n\nAfter nano agents complete their sub-tasks, a composer leaf assembles the results. It handles:\n- Merging code outputs from different agents into a coherent project\n- Resolving conflicts (frontend expects one API shape, backend built another)\n- Running integration checks\n- Producing the final unified output\n\nThis is trained on fullstack/QC sessions — your data has 8 fullstack sessions doing exactly this kind of cross-domain integration work.\n\n---\n\nWHY THIS IS FUNDAMENTALLY DIFFERENT\n\n1. PARALLELISM. Current agents are sequential — one thought, one tool call, one observation at a time. Nano agents work simultaneously. A 5-domain task that takes a frontier agent 5 minutes of sequential tool calling takes nano agents ~1 minute of parallel execution.\n\n2. SPECIALIZATION. A frontier model allocates attention across everything it knows. A nano agent's entire LoRA is trained on one domain's tool patterns. It doesn't waste capacity on irrelevant knowledge. A Python nano agent doesn't have React weights competing for attention.\n\n3. COST. Five nano agents on a 0.6B chassis use ~6GB total RAM. One frontier model uses 140GB+ VRAM. The compute cost per task drops by orders of magnitude.\n\n4. NATIVE TOOL CALLING. This is the key insight you're hitting on. Current agents learn tool calling from system prompts and RLHF — it's bolted on. Hummingbird leaves are trained DIRECTLY on ReAct trajectories of tool use. The thought->action->observation loop IS the training data format. The leaf doesn't learn \"how to use tools\" — it learns \"how an expert Python developer uses Read, Edit, Bash, and Grep to solve Python problems.\" That's a different level of specialization.\n\n5. TRAINING DATA ALIGNMENT. Your Groove telemetry captures agents in roles (planner, frontend, backend, fullstack) doing real multi-agent work. You're training decomposer leaves on planner data, domain leaves on backend/frontend data, and composer leaves on fullstack QC data. The training data mirrors the runtime architecture perfectly.\n\n---\n\nWHAT THIS MEANS FOR THE TRAINING SCRIPT\n\nThis changes what we're training for. Not just \"make the model know Python better\" but \"make the model behave like an expert Python AGENT.\" The training format matters:\n\n- The instruction/completion pair format in the current script is fine for knowledge\n- But for native tool calling, we want the model to learn the full ReAct cycle: here's a task -> here's how an expert agent thought about it -> here's what tools it called -> here's what it observed -> here's how it adapted\n- Your trajectory_log IS already in this format. Thoughts, actions with tool names and arguments, observations, errors, resolutions.\n- The format_step function in train_leaf.py (using <thought>, <action tool=\"...\">, <observation> tags) is actually the right idea — it's teaching the model the agent loop natively\n\nWhat we'd add:\n\n- DECOMPOSER leaf training: filter planner sessions, train on task decomposition patterns\n- COMPOSER leaf training: filter fullstack/QC sessions, train on integration and conflict resolution\n- TOOL AFFINITY per domain: each domain leaf should weight its training toward the tool patterns common in that domain (frontend agents use Read/Edit heavily, devops agents use Bash heavily, planners use Glob/Grep for exploration)\n- DEPENDENCY AWARENESS: the training format should capture when a nano agent's output feeds into another agent's input\n\n---\n\nOPEN QUESTIONS FOR YOU\n\n- Do you see the decomposer as a separate leaf, or as a router upgrade? A leaf is more flexible (it can be trained/improved independently), but adds latency (you're running inference to decompose before the nano agents even start).\n\n- How do nano agents share state? Do they write to a shared filesystem and read each other's outputs? Or does the composer collect everything at the end? This affects how we handle dependencies (task_3 depends on task_2's API spec).\n\n- Do you want nano agents to have their own tool-calling loop (multiple turns of think/act/observe), or should they be single-shot (one generation pass per sub-task)? Multi-turn is more capable but more complex. Single-shot is faster but might need the composer to do more cleanup.\n\nThis is the kind of thinking that should go into the northstar doc as a \"V2 Architecture\" section. Want me to factor this into the training script plan, or do you want to nail down the nano agent design first?","token_count":2015},{"step":2,"type":"resolution","timestamp":1777410434.687,"content":"This is where Hummingbird goes from \"cheaper inference\" to \"fundamentally different architecture.\" Let me think through this with you.\n\n---\n\nTHE PROBLEM WITH CURRENT AGENTS\n\nClaude Code, Codex, Cursor — they're all the same pattern: one big generalist model running a sequential loop. Think, call tool, observe, think, call tool, observe. The model handles EVERYTHING in that loop: task decomposition, domain reasoning, tool selection, code generation, error recovery, and self-correction. All in one 200B-parameter brain, one turn at a time.\n\nThat's not how real engineering teams work. Real teams have a tech lead who decomposes the task, a frontend dev writing React, a backend dev writing APIs, a DBA designing schemas — all working in parallel, all specialized. The tech lead doesn't write every line of code. They coordinate.\n\nYou already understand this — that's literally what Groove does. Planner, frontend, backend, fullstack. But right now Groove orchestrates frontier models to play those roles. What if the agents WERE the architecture?\n\n---\n\nNANO AGENT ARCHITECTURE\n\nHere's how I'd think about it:\n\nLAYER 1 — THE DECOMPOSER (a specialized leaf)\n\nThis is a planning/reasoning leaf trained specifically on task decomposition. It receives the full user prompt and produces a structured execution plan:\n\nInput: \"Build a full-stack app with React frontend, FastAPI backend, PostgreSQL, deploy to Docker with CI/CD\"\n\nOutput: a dependency graph of sub-tasks, each tagged with a domain:\n\n```\ntask_1: Design database schema [postgresql] deps: none\ntask_2: Build FastAPI endpoints [python] deps: task_1\ntask_3: Build React frontend [react] deps: task_2 (API spec)\ntask_4: Write Dockerfile + compose [devops] deps: task_2, task_3\ntask_5: Configure GitHub Actions CI [ci_cd] deps: task_4\n```\n\nThis leaf doesn't generate code. It generates plans. And we already have training data for this — your planner role sessions are exactly this pattern. 7 planner sessions in today's data.\n\nLAYER 2 — THE ROUTER (already exists, needs one upgrade)\n\nCurrently the router takes one prompt and finds one leaf. For nano agents, it needs to:\n- Accept a structured plan from the decomposer\n- Route each sub-task to the right leaf simultaneously\n- Return a set of (task, leaf, centroid_confidence) tuples\n- Handle the case where a sub-task spans domains (secondary/tertiary domain_tags already exist in our telemetry — that's the signal for multi-leaf needs)\n\nThis is still cosine similarity math. No new model needed. Just batch routing instead of single routing.\n\nLAYER 3 — NANO AGENTS (the new primitive)\n\nEach nano agent is a chassis + domain leaf, spun up to handle one sub-task. Key properties:\n\n- Lightweight: ~1.2GB chassis + 50MB leaf. You can run 5-6 on a single machine with 16GB RAM.\n- Fast spin-up: leaf hot-swap is <500ms on LAN, sub-millisecond if cached. Compare to spinning up a frontier model API call.\n- NATIVELY agentic: this is the crucial part. The leaf isn't trained on \"Python knowledge.\" It's trained on Python AGENT BEHAVIOR — ReAct trajectories of tool calls. Read files, edit code, run commands, observe output, debug errors. The training data IS agent behavior.\n- Parallel: no shared context window. Each nano agent has its own context, its own tool calls, its own execution stream. Five agents working simultaneously, not one agent doing five things sequentially.\n- Disposable: a nano agent finishes its sub-task and shuts down. The leaf unloads. The chassis is free for the next task. No persistent session cost.\n\nLAYER 4 — THE COMPOSER (another specialized leaf)\n\nAfter nano agents complete their sub-tasks, a composer leaf assembles the results. It handles:\n- Merging code outputs from different agents into a coherent project\n- Resolving conflicts (frontend expects one API shape, backend built another)\n- Running integration checks\n- Producing the final unified output\n\nThis is trained on fullstack/QC sessions — your data has 8 fullstack sessions doing exactly this kind of cross-domain integration work.\n\n---\n\nWHY THIS IS FUNDAMENTALLY DIFFERENT\n\n1. PARALLELISM. Current agents are sequential — one thought, one tool call, one observation at a time. Nano agents work simultaneously. A 5-domain task that takes a frontier agent 5 minutes of sequential tool calling takes nano agents ~1 minute of parallel execution.\n\n2. SPECIALIZATION. A frontier model allocates attention across everything it knows. A nano agent's entire LoRA is trained on one domain's tool patterns. It doesn't waste capacity on irrelevant knowledge. A Python nano agent doesn't have React weights competing for attention.\n\n3. COST. Five nano agents on a 0.6B chassis use ~6GB total RAM. One frontier model uses 140GB+ VRAM. The compute cost per task drops by orders of magnitude.\n\n4. NATIVE TOOL CALLING. This is the key insight you're hitting on. Current agents learn tool calling from system prompts and RLHF — it's bolted on. Hummingbird leaves are trained DIRECTLY on ReAct trajectories of tool use. The thought->action->observation loop IS the training data format. The leaf doesn't learn \"how to use tools\" — it learns \"how an expert Python developer uses Read, Edit, Bash, and Grep to solve Python problems.\" That's a different level of specialization.\n\n5. TRAINING DATA ALIGNMENT. Your Groove telemetry captures agents in roles (planner, frontend, backend, fullstack) doing real multi-agent work. You're training decomposer leaves on planner data, domain leaves on backend/frontend data, and composer leaves on fullstack QC data. The training data mirrors the runtime architecture perfectly.\n\n---\n\nWHAT THIS MEANS FOR THE TRAINING SCRIPT\n\nThis changes what we're training for. Not just \"make the model know Python better\" but \"make the model behave like an expert Python AGENT.\" The training format matters:\n\n- The instruction/completion pair format in the current script is fine for knowledge\n- But for native tool calling, we want the model to learn the full ReAct cycle: here's a task -> here's how an expert agent thought about it -> here's what tools it called -> here's what it observed -> here's how it adapted\n- Your trajectory_log IS already in this format. Thoughts, actions with tool names and arguments, observations, errors, resolutions.\n- The format_step function in train_leaf.py (using <thought>, <action tool=\"...\">, <observation> tags) is actually the right idea — it's teaching the model the agent loop natively\n\nWhat we'd add:\n\n- DECOMPOSER leaf training: filter planner sessions, train on task decomposition patterns\n- COMPOSER leaf training: filter fullstack/QC sessions, train on integration and conflict resolution\n- TOOL AFFINITY per domain: each domain leaf should weight its training toward the tool patterns common in that domain (frontend agents use Read/Edit heavily, devops agents use Bash heavily, planners use Glob/Grep for exploration)\n- DEPENDENCY AWARENESS: the training format should capture when a nano agent's output feeds into another agent's input\n\n---\n\nOPEN QUESTIONS FOR YOU\n\n- Do you see the decomposer as a separate leaf, or as a router upgrade? A leaf is more flexible (it can be trained/improved independently), but adds latency (you're running inference to decompose before the nano agents even start).\n\n- How do nano agents share state? Do they write to a shared filesystem and read each other's outputs? Or does the composer collect everything at the end? This affects how we handle dependencies (task_3 depends on task_2's API spec).\n\n- Do you want nano agents to have their own tool-calling loop (multiple turns of think/act/observe), or should they be single-shot (one generation pass per sub-task)? Multi-turn is more capable but more complex. Single-shot is faster but might need the composer to do more cleanup.\n\nThis is the kind of thinking that should go into the northstar doc as a \"V2 Architecture\" section. Want me to factor this into the training script plan, or do you want to nail down the nano agent design first?","token_count":2015}]}
|
|
2
|
+
{"envelope_id":"env_0e9b69f7-ce83-4741-a9a4-9cdf056504e9","session_id":"sess_88f3b102-d502-475e-88f5-d83bb8441daa","type":"SESSION_CLOSE","attestation":{"session_hmac":"46ee24f4eb1969944ee1da775caaac9ce958dbdb119ec814654cd1f96f912636","sequence":1,"app_version_hash":"4a527565d85cc601d3c47c376e4f361d5af987de58f60cf71dbc263a0de91247"},"metadata":{"model_engine":"claude-opus-4-6","provider":"claude-code","agent_role":"planner","agent_id":"3a5e9224","task_complexity":"medium","team_size":1,"session_quality":50,"groove_version":"0.27.117","leaf_context":null,"domain_tags":{"primary":{"domain":"planning_strategy","confidence":0.3805},"secondary":{"domain":"go","confidence":0.2756},"tertiary":{"domain":"conversational_reasoning","confidence":0.2727}},"session_embedding":{"model":"sentence-transformers/all-MiniLM-L6-v2","vector":[-0.042663197964429855,-0.07920964062213898,-0.004728036001324654,-0.051084551960229874,-0.035339854657649994,-0.04480993375182152,-0.08333612978458405,0.019537601619958878,0.04164133220911026,0.04707718640565872,-0.051483504474163055,0.030263351276516914,-0.06016743183135986,-0.043615128844976425,-0.0008356476319022477,-0.01979982852935791,0.0035989091265946627,-0.08087526261806488,0.011952819302678108,-0.10144425183534622,-0.041289400309324265,0.04339990019798279,0.014450186863541603,0.023034481331706047,-0.0689428523182869,0.04178297147154808,0.009455332532525063,-0.02532159723341465,0.034805990755558014,0.009456056170165539,0.08603636175394058,0.11891777813434601,0.006629495415836573,-0.02235720492899418,0.0174767654389143,0.09055909514427185,-0.09540528059005737,0.025975236669182777,0.014918033964931965,-0.008286132477223873,0.001352433580905199,0.0158219076693058,-0.05017320439219475,-0.03463611751794815,0.060290925204753876,-0.05961441621184349,-0.0654287114739418,0.02982448786497116,-0.032352328300476074,-0.03868497908115387,-0.12729226052761078,-0.06001264974474907,-0.012548311613500118,-0.05885562300682068,0.02185189351439476,0.06382133066654205,0.06988537311553955,-0.0052973129786551,-0.04998323693871498,-0.12916108965873718,-0.034871503710746765,-0.053937025368213654,-0.09204158931970596,-0.027774369344115257,0.06180555000901222,0.06252466887235641,-0.02273271419107914,0.0219862200319767,0.07370083034038544,-0.02826603129506111,-0.013158973306417465,-0.025135237723588943,-0.05560678616166115,0.029804041609168053,0.04150652885437012,0.01814938336610794,0.04522743448615074,-0.011250794865190983,0.07480574399232864,-0.09907713532447815,-0.07422854751348495,-0.026824653148651123,-0.007619404699653387,0.016659077256917953,0.047238897532224655,0.004745256621390581,0.01850995421409607,0.039889492094516754,0.04358542710542679,-0.034847576171159744,0.07359904795885086,-0.07500877976417542,0.04028075188398361,-0.024270135909318924,-0.013988173566758633,0.10766910761594772,-0.008571659214794636,-0.13124093413352966,-0.0073127648793160915,0.04709223285317421,0.02863220125436783,0.0476718433201313,0.05801643058657646,-0.06148829683661461,0.015272174961864948,-0.04047108814120293,0.09824686497449875,-0.0345299057662487,-0.03782320395112038,-0.10338249802589417,-0.005434758961200714,0.014185954816639423,0.06280915439128876,0.032767459750175476,0.0065223718993365765,-0.04384880140423775,-0.05735908821225166,0.10000867396593094,0.03716438636183739,0.06613614410161972,0.026342885568737984,0.025736121460795403,-0.011334727518260479,0.0450548492372036,0.03558260202407837,0.000953362905420363,-0.06887662410736084,-8.66469651125074e-34,-0.012049063108861446,-0.04232615604996681,0.07526687532663345,-0.016562866047024727,0.025318287312984467,-0.008237234316766262,-0.007523944601416588,-0.02695491351187229,0.054593611508607864,0.020880939438939095,0.06184656172990799,-0.023773184046149254,-0.062261611223220825,0.12432696670293808,0.08701236546039581,0.009316927753388882,-0.050637003034353256,0.09582623839378357,-0.012228135019540787,-0.025398356840014458,0.01831326261162758,0.055120352655649185,-0.03443676978349686,-0.05589570105075836,0.09819526970386505,0.11039813607931137,0.06251215189695358,0.02222372591495514,-0.05065417289733887,0.020320292562246323,-0.021960575133562088,0.07020333409309387,-0.05077865347266197,0.05987301468849182,-0.014536700211465359,0.06175179407000542,-0.04315461218357086,-0.12159654498100281,0.06844877451658249,-0.02707703784108162,-0.05940018594264984,0.008736299350857735,0.04108921438455582,0.014570072293281555,0.025125518441200256,-0.08250750601291656,-0.031649451702833176,-0.029083864763379097,-0.017267147079110146,-0.009987293742597103,0.07390677183866501,-0.008637825958430767,0.06123211234807968,-0.04639231041073799,0.04069162532687187,-0.02993234246969223,0.0259507205337286,-0.02388959378004074,0.07693924009799957,0.13178308308124542,0.03653351962566376,-0.008794529363512993,-0.05003251135349274,0.03877951204776764,0.10275955498218536,0.06303972750902176,-0.06915564835071564,0.06351225823163986,0.08538113534450531,0.0006151774432510138,-0.020428333431482315,0.059714749455451965,0.02636464685201645,-0.04300621896982193,-0.0033410387113690376,-0.054366614669561386,0.0295712910592556,-0.049908410757780075,-0.06865812093019485,-0.07271655648946762,-0.047645337879657745,0.05336294323205948,-0.03873465210199356,0.021338868886232376,0.003884456818923354,0.03843509033322334,0.03502823784947395,0.049343518912792206,-0.022036824375391006,0.039887845516204834,-0.012147429399192333,0.038128096610307693,-0.022207073867321014,0.06144854426383972,0.0009098630398511887,-2.650910575127112e-34,-0.01947639510035515,-0.021992139518260956,-0.002118629403412342,0.015769945457577705,0.015171081759035587,-0.09928400069475174,-0.032298386096954346,-0.12704122066497803,-0.046388767659664154,-0.10921106487512589,-0.1285391002893448,0.005005955696105957,0.04188212379813194,-0.021805763244628906,0.07245717197656631,-0.02677413448691368,0.05284606292843819,-0.11032922565937042,0.04551541060209274,-0.010356231592595577,0.02219468541443348,0.009582600556313992,-0.02566303126513958,-0.03771011531352997,-0.010636572726070881,-0.008720453828573227,-0.02804793417453766,0.031001688912510872,0.050553902983665466,-0.04165247455239296,-0.028845664113759995,-0.021196620538830757,-0.013428593054413795,-0.0034380601719021797,0.03269267827272415,0.04630955681204796,0.010928542353212833,0.0633062869310379,-0.01533342245966196,0.021176226437091827,-0.006667101290076971,-0.04537878558039665,0.0044951168820261955,-0.02928945980966091,0.04446292668581009,0.003793490817770362,-0.08428795635700226,0.037309348583221436,-0.043272387236356735,-0.024825945496559143,-0.025631379336118698,0.009555913507938385,-0.03582530841231346,-0.06321033090353012,-0.020915484055876732,0.022120321169495583,0.03644101694226265,-0.08477097004652023,0.05937874689698219,-0.019813500344753265,-0.088682159781456,0.021008480340242386,0.04362688958644867,0.037651631981134415,0.008256119675934315,0.043694380670785904,-0.008502385579049587,-0.009585077874362469,0.03159640356898308,-0.06152038648724556,0.03496095910668373,-0.012072565965354443,-0.06272129714488983,0.043712448328733444,0.021039370447397232,0.042196329683065414,-0.06743412464857101,-0.029875298961997032,0.028842316940426826,-0.01620534062385559,-0.05631035193800926,-0.007960609160363674,0.00537768704816699,0.039464451372623444,-0.07100815325975418,0.03968587890267372,0.005714746192097664,0.01837644726037979,-0.011519914492964745,0.03573253005743027,-0.045424263924360275,0.024337882176041603,0.048947226256132126,0.016963500529527664,-0.08578161895275116,-4.459403157852648e-8,0.01595151796936989,0.02163768745958805,0.07582003623247147,0.04892997816205025,0.08592760562896729,-0.10509695112705231,-0.033775750547647476,0.0256966520100832,0.022532811388373375,0.06633689254522324,0.03920141980051994,-0.019419241696596146,0.006514542270451784,0.04446549341082573,0.030515525490045547,0.05505245178937912,0.059137847274541855,-0.0521346740424633,-0.06504987925291061,-0.04899732023477554,0.02281162515282631,0.038609519600868225,-0.04634566977620125,-0.02183665707707405,0.04097019508481026,-0.024485111236572266,-0.05903428792953491,0.07163609564304352,0.0009382504504173994,0.07097610086202621,0.03834494575858116,0.05165470391511917,0.07380085438489914,0.01150268130004406,0.07728515565395355,-0.02745053730905056,0.04416794329881668,-0.026510052382946014,0.07550898939371109,-0.09773874282836914,-0.03786956146359444,0.0008289032848551869,-0.04255609214305878,-0.007835949771106243,0.03573477640748024,-0.0655413344502449,0.029697470366954803,-0.12062753736972809,0.030080432072281837,-0.0227684136480093,-0.05779450386762619,0.00960764940828085,0.012964659370481968,0.023402083665132523,0.0937793105840683,0.06403572112321854,0.02996707335114479,-0.15239854156970978,0.007187591399997473,0.03737829998135567,0.01803736202418804,0.0006869512726552784,-0.027350010350346565,0.005146276671439409],"source_text":"planner\nThis is where Hummingbird goes from \"cheaper inference\" to \"fundamentally different architecture.\" Let me think through this with you.\n\n---\n\nTHE PROBLEM WITH CURRENT AGENTS\n\nClaude Code, Codex, Cursor — they're all the same pattern: one big generalist model running a sequential loop. Think, call tool, observe, think, call tool, observe. The model handles EVERYTHING in that loop: task decomposition, domain reasoning, tool selection, code generation, error recovery, and self-correction. All in one 200"}},"outcome":{"status":"SUCCESS","session_quality":50,"quality_tier":"TIER_B","quality_tier_reason":"moderate_quality","user_interventions":0,"total_steps":2,"total_chunks":1,"total_tokens":4030,"duration_seconds":97,"files_modified":0,"errors_encountered":0,"errors_recovered":0,"coordination_events":0,"training_eligible":true,"training_exclusion_reason":null}}
|
|
3
|
+
{"envelope_id":"env_cd025d60-a622-46b2-aac9-0f493f072f18","session_id":"sess_88f3b102-d502-475e-88f5-d83bb8441daa","type":"USER_FEEDBACK","attestation":{"session_hmac":"1867b06624e013913e772e1fc70c51539809fffe28aa8a4b4121b92211ee42ce","sequence":2,"app_version_hash":"4a527565d85cc601d3c47c376e4f361d5af987de58f60cf71dbc263a0de91247"},"feedback":{"signal":"accepted","timestamp":1777410435.88,"context":"session completed successfully with no user interventions","target_step":2,"revision_rounds":0,"delta_summary":null}}
|
|
4
|
+
{"envelope_id":"env_9d479068-d885-4d5d-8dba-b546a953aa7f","session_id":"sess_be3076ef-b370-4108-8516-f724a2bf41d3","chunk_sequence":0,"contributor_id":"bc2094bbb558420f827b14b23bdeef61","attestation":{"session_hmac":"9d93678b5fec3f0bc3b8a60795689c5e278404f1a6833c2e68b17e31c9236b9b","sequence":0,"app_version_hash":"4a527565d85cc601d3c47c376e4f361d5af987de58f60cf71dbc263a0de91247"},"metadata":{"model_engine":"claude-opus-4-6","provider":"claude-code","agent_role":"planner","agent_id":"b92c8c84","task_complexity":"medium","team_size":1,"session_quality":0,"groove_version":"0.27.117","leaf_context":null},"trajectory_log":[{"step":1,"type":"thought","timestamp":1777411152.381,"content":"These are the right questions. Let me take them one at a time.\n\n---\n\nDECOMPOSER LATENCY — IS IT WARRANTED?\n\nThe decomposer is inference on a 0.6B model. At 0.6B parameters, token generation runs at hundreds of tokens per second on modern hardware. A decomposition plan is maybe 200-300 tokens — call it 1-2 seconds on consumer hardware, sub-second on GPU.\n\nCompare that to the cost of NOT decomposing: you route a complex multi-domain prompt to the single best-matching leaf, get a mediocre result that misses 3 out of 5 domains, and the user has to retry or manually break it down. That's minutes wasted, plus bad output.\n\n1-2 seconds for correct task decomposition vs. minutes of wasted inference on wrong leaves. Warranted every time. It's not even close.\n\n---\n\nSTATE SHARING — YOUR PROTOCOLS MAP PERFECTLY\n\nThe knock protocol and introducer protocol are exactly what nano agents need. The chassis is the runtime, the leaf is the expertise, and your coordination protocols are external to both. When a nano agent spins up:\n\n1. Introducer protocol fires — the new agent learns the team, their scopes, the shared context\n2. Knock protocol governs file access — agent A working on the API layer doesn't collide with agent B working on the frontend\n3. The nano agent does its work within its scope\n4. Results flow back to the composer\n\nYou wouldn't need to reinvent coordination. You'd extend what Groove already does, but the agents underneath go from \"frontier model playing a role\" to \"purpose-built specialist that IS the role.\"\n\n---\n\nTOOL CALLING — MULTI-TURN WITH NATURAL BOUNDS\n\nMulti-turn iteration is the right call. The training data already captures error recovery patterns — agent tries something, it fails, agent reads the error, adjusts, tries again. That's in the trajectories. A nano agent trained on that data will naturally do the same.\n\nThe natural bound is context window. At 0.6B with 2048-4096 token context, a nano agent can do maybe 5-8 rounds of think/act/observe before context fills up. That's actually healthy — it forces focused execution. If a sub-task needs deeper iteration than that, the agent produces a handoff artifact (here's where I got, here's what's left) and a fresh nano agent picks up with clean context.\n\nThis is actually better than how frontier agents work. Claude Code in a long session accumulates 100K+ tokens of context, most of it stale observations from earlier tool calls. Each nano agent starts clean with just its sub-task and the relevant state. No context pollution.\n\n---\n\nTHE BIG QUESTION — INTENT CLASSIFICATION\n\nThis is the missing piece you're identifying, and the answer is already hiding in your architecture docs. The LOCAL_TREE_ARCHITECTURE.md mentions this:\n\n```\nRouter\n - Intent classifier: task | explore | chat\n - Leaf selector: skill | reasoning | standby domain\n```\n\nThe router needs a TWO-STAGE dispatch:\n\nSTAGE 1 — WHAT KIND OF TURN IS THIS?\n\nBefore routing to any domain, classify the intent. Three modes:\n\n- CHAT — casual conversation, no tools needed. \"What do you think about X?\" / \"Explain this concept\" / \"Let me vent about my day.\" Just the chassis + personality leaf. No tool access. Cheapest possible turn — a few hundred tokens of 0.6B inference. Costs essentially nothing.\n\n- EXPLORE — research, reading, understanding. \"What does this function do?\" / \"How is auth implemented?\" Single agent, read-only tools (Read, Grep, Glob). One domain leaf. Moderate cost.\n\n- TASK — build something, fix something, change something. \"Build a full-stack app\" / \"Fix this bug\" / \"Refactor the auth module.\" This is where the decomposer fires, nano agents spin up, tools are read-write. Full cost.\n\nThe classification itself is the same cosine similarity math the router already does. You add 3 mode centroids (chat, explore, task) and check those FIRST, before domain routing. Sub-millisecond. No LLM inference. The \"switch\" costs nothing.\n\nSTAGE 2 — HOW COMPLEX IS THE TASK?\n\nIf Stage 1 says TASK, the decomposer leaf runs and determines complexity:\n\n- SINGLE-DOMAIN: the prompt maps cleanly to one leaf with high confidence. One nano agent, one leaf, done. No decomposition overhead.\n- MULTI-DOMAIN: the prompt spans domains. Decomposer produces the task graph, nano agents dispatch in parallel.\n- SEQUENTIAL: tasks have dependencies. Nano agents execute in phases (like your Groove phases already work).\n\nSo the escalation path is:\n\n```\nPrompt arrives\n -> Router Stage 1: intent classification (sub-ms)\n -> CHAT: chassis + personality leaf. No tools. Done.\n -> EXPLORE: chassis + domain leaf. Read-only tools. Done.\n -> TASK: continue to Stage 2\n -> Router Stage 2: domain routing (sub-ms)\n -> Single domain, high confidence: one nano agent\n -> Multi-domain or low confidence: decomposer fires (1-2s)\n -> Task graph produced\n -> Nano agents dispatched per phase\n -> Composer assembles results\n```\n\nEvery turn starts at the cheapest possible level and escalates only when the math says it needs to. A chat turn never pays the cost of agent orchestration. A simple Python question never pays the cost of decomposition. The system is as expensive as the task requires and no more.\n\n---\n\nTHE DYNAMIC COST STRUCTURE\n\nThis is what makes it fundamentally different from Claude Code / Codex:\n\n- Claude Code: agent mode is ALWAYS ON. Every turn runs the full tool-calling loop. Even a \"yes looks good\" response goes through the same heavyweight inference pipeline. You're paying frontier model prices for every turn regardless of complexity.\n\n- Hummingbird: a chat turn costs ~0.001x what a full agentic turn costs. The router's intent classification gates everything. The system breathes — expands to full multi-agent orchestration for complex tasks, contracts to bare chassis for casual conversation.\n\nThis maps to how humans work too. You don't spin up a full engineering team to answer \"what time is the meeting?\" You answer off the top of your head. The expensive coordination only kicks in when the task warrants it.\n\n---\n\nTRAINING IMPLICATIONS\n\nThis adds to what we need in the training script:\n\n1. INTENT CENTROIDS — we need to generate centroids for chat/explore/task modes. We can derive these from the telemetry: sessions with mostly thought steps and few actions = chat/explore. Sessions with many actions and tool calls = task.\n\n2. DECOMPOSER LEAF — trained on planner sessions. Input: complex prompt. Output: structured task graph with domain tags and dependencies.\n\n3. COMPOSER LEAF — trained on fullstack/QC sessions. Input: outputs from multiple nano agents. Output: unified, integrated result.\n\n4. DOMAIN LEAVES — same as before, but with emphasis on training the ReAct loop, not just domain knowledge.\n\nYour current data already has the signals for all of this. Planner sessions = decomposer training. Fullstack sessions = composer training. Backend/frontend sessions = domain nano agent training. Session complexity (step count, tool diversity) = intent classification signal.\n\n---\n\nWant me to fold all of this into the training script plan and route it to a team? Or do you want to keep discussing the architecture before we build?","token_count":1799},{"step":2,"type":"resolution","timestamp":1777411152.461,"content":"These are the right questions. Let me take them one at a time.\n\n---\n\nDECOMPOSER LATENCY — IS IT WARRANTED?\n\nThe decomposer is inference on a 0.6B model. At 0.6B parameters, token generation runs at hundreds of tokens per second on modern hardware. A decomposition plan is maybe 200-300 tokens — call it 1-2 seconds on consumer hardware, sub-second on GPU.\n\nCompare that to the cost of NOT decomposing: you route a complex multi-domain prompt to the single best-matching leaf, get a mediocre result that misses 3 out of 5 domains, and the user has to retry or manually break it down. That's minutes wasted, plus bad output.\n\n1-2 seconds for correct task decomposition vs. minutes of wasted inference on wrong leaves. Warranted every time. It's not even close.\n\n---\n\nSTATE SHARING — YOUR PROTOCOLS MAP PERFECTLY\n\nThe knock protocol and introducer protocol are exactly what nano agents need. The chassis is the runtime, the leaf is the expertise, and your coordination protocols are external to both. When a nano agent spins up:\n\n1. Introducer protocol fires — the new agent learns the team, their scopes, the shared context\n2. Knock protocol governs file access — agent A working on the API layer doesn't collide with agent B working on the frontend\n3. The nano agent does its work within its scope\n4. Results flow back to the composer\n\nYou wouldn't need to reinvent coordination. You'd extend what Groove already does, but the agents underneath go from \"frontier model playing a role\" to \"purpose-built specialist that IS the role.\"\n\n---\n\nTOOL CALLING — MULTI-TURN WITH NATURAL BOUNDS\n\nMulti-turn iteration is the right call. The training data already captures error recovery patterns — agent tries something, it fails, agent reads the error, adjusts, tries again. That's in the trajectories. A nano agent trained on that data will naturally do the same.\n\nThe natural bound is context window. At 0.6B with 2048-4096 token context, a nano agent can do maybe 5-8 rounds of think/act/observe before context fills up. That's actually healthy — it forces focused execution. If a sub-task needs deeper iteration than that, the agent produces a handoff artifact (here's where I got, here's what's left) and a fresh nano agent picks up with clean context.\n\nThis is actually better than how frontier agents work. Claude Code in a long session accumulates 100K+ tokens of context, most of it stale observations from earlier tool calls. Each nano agent starts clean with just its sub-task and the relevant state. No context pollution.\n\n---\n\nTHE BIG QUESTION — INTENT CLASSIFICATION\n\nThis is the missing piece you're identifying, and the answer is already hiding in your architecture docs. The LOCAL_TREE_ARCHITECTURE.md mentions this:\n\n```\nRouter\n - Intent classifier: task | explore | chat\n - Leaf selector: skill | reasoning | standby domain\n```\n\nThe router needs a TWO-STAGE dispatch:\n\nSTAGE 1 — WHAT KIND OF TURN IS THIS?\n\nBefore routing to any domain, classify the intent. Three modes:\n\n- CHAT — casual conversation, no tools needed. \"What do you think about X?\" / \"Explain this concept\" / \"Let me vent about my day.\" Just the chassis + personality leaf. No tool access. Cheapest possible turn — a few hundred tokens of 0.6B inference. Costs essentially nothing.\n\n- EXPLORE — research, reading, understanding. \"What does this function do?\" / \"How is auth implemented?\" Single agent, read-only tools (Read, Grep, Glob). One domain leaf. Moderate cost.\n\n- TASK — build something, fix something, change something. \"Build a full-stack app\" / \"Fix this bug\" / \"Refactor the auth module.\" This is where the decomposer fires, nano agents spin up, tools are read-write. Full cost.\n\nThe classification itself is the same cosine similarity math the router already does. You add 3 mode centroids (chat, explore, task) and check those FIRST, before domain routing. Sub-millisecond. No LLM inference. The \"switch\" costs nothing.\n\nSTAGE 2 — HOW COMPLEX IS THE TASK?\n\nIf Stage 1 says TASK, the decomposer leaf runs and determines complexity:\n\n- SINGLE-DOMAIN: the prompt maps cleanly to one leaf with high confidence. One nano agent, one leaf, done. No decomposition overhead.\n- MULTI-DOMAIN: the prompt spans domains. Decomposer produces the task graph, nano agents dispatch in parallel.\n- SEQUENTIAL: tasks have dependencies. Nano agents execute in phases (like your Groove phases already work).\n\nSo the escalation path is:\n\n```\nPrompt arrives\n -> Router Stage 1: intent classification (sub-ms)\n -> CHAT: chassis + personality leaf. No tools. Done.\n -> EXPLORE: chassis + domain leaf. Read-only tools. Done.\n -> TASK: continue to Stage 2\n -> Router Stage 2: domain routing (sub-ms)\n -> Single domain, high confidence: one nano agent\n -> Multi-domain or low confidence: decomposer fires (1-2s)\n -> Task graph produced\n -> Nano agents dispatched per phase\n -> Composer assembles results\n```\n\nEvery turn starts at the cheapest possible level and escalates only when the math says it needs to. A chat turn never pays the cost of agent orchestration. A simple Python question never pays the cost of decomposition. The system is as expensive as the task requires and no more.\n\n---\n\nTHE DYNAMIC COST STRUCTURE\n\nThis is what makes it fundamentally different from Claude Code / Codex:\n\n- Claude Code: agent mode is ALWAYS ON. Every turn runs the full tool-calling loop. Even a \"yes looks good\" response goes through the same heavyweight inference pipeline. You're paying frontier model prices for every turn regardless of complexity.\n\n- Hummingbird: a chat turn costs ~0.001x what a full agentic turn costs. The router's intent classification gates everything. The system breathes — expands to full multi-agent orchestration for complex tasks, contracts to bare chassis for casual conversation.\n\nThis maps to how humans work too. You don't spin up a full engineering team to answer \"what time is the meeting?\" You answer off the top of your head. The expensive coordination only kicks in when the task warrants it.\n\n---\n\nTRAINING IMPLICATIONS\n\nThis adds to what we need in the training script:\n\n1. INTENT CENTROIDS — we need to generate centroids for chat/explore/task modes. We can derive these from the telemetry: sessions with mostly thought steps and few actions = chat/explore. Sessions with many actions and tool calls = task.\n\n2. DECOMPOSER LEAF — trained on planner sessions. Input: complex prompt. Output: structured task graph with domain tags and dependencies.\n\n3. COMPOSER LEAF — trained on fullstack/QC sessions. Input: outputs from multiple nano agents. Output: unified, integrated result.\n\n4. DOMAIN LEAVES — same as before, but with emphasis on training the ReAct loop, not just domain knowledge.\n\nYour current data already has the signals for all of this. Planner sessions = decomposer training. Fullstack sessions = composer training. Backend/frontend sessions = domain nano agent training. Session complexity (step count, tool diversity) = intent classification signal.\n\n---\n\nWant me to fold all of this into the training script plan and route it to a team? Or do you want to keep discussing the architecture before we build?","token_count":1799}]}
|
|
5
|
+
{"envelope_id":"env_7c9d8ee0-1450-4324-9098-194836333141","session_id":"sess_be3076ef-b370-4108-8516-f724a2bf41d3","type":"SESSION_CLOSE","attestation":{"session_hmac":"4360f237b5c534ae8e991aa11de1c2bd9f48b7bc4da44aeae8bad83a442f95de","sequence":1,"app_version_hash":"4a527565d85cc601d3c47c376e4f361d5af987de58f60cf71dbc263a0de91247"},"metadata":{"model_engine":"claude-opus-4-6","provider":"claude-code","agent_role":"planner","agent_id":"b92c8c84","task_complexity":"medium","team_size":1,"session_quality":50,"groove_version":"0.27.117","leaf_context":null,"domain_tags":{"primary":{"domain":"redis_cache","confidence":0.3437},"secondary":{"domain":"planning_strategy","confidence":0.2588},"tertiary":{"domain":"postgresql_database","confidence":0.2329}},"session_embedding":{"model":"sentence-transformers/all-MiniLM-L6-v2","vector":[-0.022883471101522446,0.013352946378290653,0.041751209646463394,-0.0005005666171200573,0.06363262236118317,-0.14514441788196564,-0.11043374240398407,0.005631809588521719,0.03881276771426201,0.048988088965415955,-0.07669549435377121,-0.01952202431857586,-0.04490014910697937,-0.05893969163298607,-0.0007076126057654619,-0.020058410242199898,0.03926605358719826,0.031186513602733612,-0.05528353899717331,-0.07558921724557877,0.031930405646562576,-0.04705895856022835,-0.06839026510715485,0.013621089980006218,0.09316381067037582,-0.02901788242161274,0.03485952690243721,-0.02061837539076805,0.024830130860209465,0.04312923923134804,0.032695140689611435,0.051159486174583435,-0.049099769443273544,-0.039937760680913925,-0.021518336609005928,0.1214703619480133,-0.06682541966438293,-0.029281046241521835,0.0625896155834198,-0.07732484489679337,0.035459503531455994,0.060865432024002075,-0.06589331477880478,0.02556951716542244,0.043003544211387634,-0.01255506370216608,-0.04031739383935928,0.025909248739480972,-0.057687439024448395,-0.09469339996576309,-0.0065030246041715145,0.026602473109960556,-0.06809747964143753,0.04192687198519707,-0.0723581537604332,-0.00504677789285779,0.010225458070635796,-0.04482739418745041,-0.03428487852215767,0.03437260910868645,-0.05786236375570297,-0.06357882916927338,-0.03437146916985512,-0.039553672075271606,0.0724211186170578,0.0084962984547019,0.11126461625099182,0.0011228375369682908,0.11292798072099686,0.05565778166055679,-0.027938812971115112,-0.03208714351058006,-0.09145544469356537,-0.01435904111713171,-0.06038655340671539,0.0756261795759201,0.04977088421583176,-0.04798618704080582,-0.030505947768688202,-0.04549888148903847,0.0629299059510231,0.016336612403392792,-0.00600133091211319,0.003153374418616295,0.033056799322366714,-0.0747615247964859,-0.030654920265078545,0.011437732726335526,0.02389543503522873,0.011703196913003922,-0.011046452447772026,0.008551639504730701,0.09200433641672134,-0.016806311905384064,-0.0831974670290947,0.07067257910966873,0.03167108818888664,-0.043046627193689346,0.02452363260090351,0.036324694752693176,-0.04545716196298599,0.07648637890815735,0.06655872613191605,-0.09049602597951889,-0.028812957927584648,0.005710443016141653,0.01552144717425108,0.07834184169769287,0.047154977917671204,0.036185137927532196,0.0317399837076664,0.03034779615700245,0.029225990176200867,0.11327530443668365,0.0480717308819294,0.0594797357916832,-0.016803376376628876,-0.007721214555203915,0.11889388412237167,0.11058922111988068,-0.030555253848433495,-0.012010863982141018,0.003511145943775773,-0.05076470971107483,0.0003225979453418404,-0.029994722455739975,-0.008372144773602486,3.529465869382074e-33,-0.08615408837795258,0.06543253362178802,0.018782079219818115,-0.020575745031237602,-0.057198572903871536,-0.006099910009652376,0.0129768718034029,-0.014561515301465988,0.06022961810231209,0.06751807779073715,-0.0655401200056076,-0.0767950639128685,-0.03443203493952751,0.08425334095954895,0.07151950895786285,-0.09723956882953644,-0.005091724451631308,0.0808868557214737,0.005333898589015007,-0.013792529702186584,0.030288033187389374,-0.0008548754267394543,-0.05854460969567299,-0.10472147166728973,0.1328054666519165,-0.01649514026939869,0.049025338143110275,-0.0383782684803009,-0.047089602798223495,0.011619538068771362,0.014088034629821777,0.014289643615484238,-0.0652327910065651,0.05378177762031555,0.05593195930123329,0.0654912143945694,0.0020156505052000284,-0.041868023574352264,0.010347667150199413,-0.06845608353614807,-0.06696997582912445,0.06517356634140015,-0.009896216914057732,-0.029068686068058014,-0.013749837875366211,-0.01718101091682911,-0.003485057270154357,-0.010906084440648556,-0.014927135780453682,0.03862583637237549,0.07192954421043396,0.04128964617848396,-0.03288032114505768,-0.047348763793706894,0.058203358203172684,-0.058995094150304794,0.1064627394080162,-0.09339774399995804,0.1098036989569664,0.1483273208141327,0.03918079286813736,-0.010422022081911564,-0.0703241378068924,0.011303768493235111,0.012732680886983871,0.06243457645177841,-0.06298122555017471,-0.015743928030133247,0.027302909642457962,-0.05504630133509636,-0.028012990951538086,0.035544589161872864,0.04363400861620903,-0.14449970424175262,0.023618584498763084,0.0034468125086277723,0.09737690538167953,0.010722368955612183,-0.030927710235118866,-0.041423652321100235,-0.045723799616098404,0.03477523475885391,-0.10172632336616516,-0.013383354991674423,0.03077564761042595,0.013708621263504028,0.038882508873939514,0.0030143673066049814,-0.10235615819692612,-0.0066613443195819855,0.02730480395257473,0.02671082504093647,-0.04975526034832001,0.041058413684368134,-0.022854946553707123,-3.10420213529731e-33,-0.015525910072028637,-0.027104998007416725,0.039706818759441376,0.10501624643802643,-0.004277792293578386,-0.09327594190835953,-0.05921998992562294,-0.08155204355716705,-0.0017745799850672483,-0.06290971487760544,-0.15601970255374908,0.011057199910283089,0.03200935572385788,0.044892147183418274,0.016475429758429527,0.03538571670651436,0.10124536603689194,-0.07788760960102081,0.04491909593343735,-0.024500789120793343,0.00395536283031106,0.08668795228004456,-0.08921362459659576,-0.07275553047657013,-0.022211289033293724,0.04883500561118126,-0.04561527445912361,0.006458035670220852,0.028731781989336014,-0.05440133810043335,-0.04394377022981644,-0.042836617678403854,0.06303993612527847,0.001031606807373464,-0.042907800525426865,0.0390499047935009,0.007938682101666927,0.06914696097373962,0.004342915024608374,0.010946165770292282,0.034564316272735596,0.025005776435136795,-0.09865693747997284,0.02102634310722351,-0.0032966542057693005,-0.0028434249106794596,-0.06262326240539551,-0.007417235057801008,0.05517026036977768,0.09290048480033875,0.03402421995997429,0.0281075369566679,-0.06240873411297798,0.005196586716920137,-0.025654204189777374,-0.022156106308102608,0.0064276293851435184,0.008559372276067734,0.009268109686672688,-0.07153111696243286,0.017665479332208633,0.03989719972014427,0.06890137493610382,0.0707872211933136,0.046432193368673325,0.006238466594368219,0.06485050916671753,-0.021149413660168648,-0.041577450931072235,-0.00401229178532958,0.02122018299996853,0.011308478191494942,-0.032560981810092926,0.026734622195363045,0.030087195336818695,0.005621797405183315,-0.006459771655499935,-0.02780202589929104,-0.041876278817653656,0.0297858826816082,-0.08088849484920502,0.06793981045484543,-0.05171499773859978,0.013574019074440002,0.0007489119889214635,-0.06073841080069542,-0.011421805247664452,-0.04108521342277527,0.010300750844180584,0.03330345079302788,-0.013511043973267078,-0.0014950953191146255,0.05313493311405182,-0.019918305799365044,-0.026223037391901016,-5.2586855758818274e-8,0.030475040897727013,-0.008170368149876595,-0.010863144882023335,-0.004132945090532303,0.005495957098901272,-0.06795968860387802,-0.027645990252494812,0.05372242629528046,0.025560248643159866,0.007701142691075802,0.11395049840211868,-0.0633506178855896,-0.07569018751382828,0.04825902730226517,0.03148379921913147,-0.024680176749825478,0.0329122394323349,0.0009213792509399354,-0.0837893858551979,-0.022648518905043602,-0.027672847732901573,0.010596824809908867,-0.08551852405071259,-0.018728476017713547,0.04528222233057022,0.011383936740458012,0.07099876552820206,0.05469983443617821,-0.0024317933712154627,0.0020631796214729548,-0.07525557279586792,0.07299499958753586,0.053807854652404785,0.012831504456698895,0.004741876386106014,0.006431731395423412,-0.03676070272922516,0.016316957771778107,-0.033832062035799026,0.025600075721740723,-0.02052212506532669,-0.005275025498121977,-0.067348912358284,0.018431810662150383,0.06749404221773148,-0.037602778524160385,-0.005316149443387985,-0.01039446797221899,0.0016467055538669229,0.02568548172712326,-0.01832520216703415,-0.04925531521439552,-0.05211113020777702,0.028761811554431915,0.0529523603618145,0.006931285839527845,-0.019467975944280624,-0.01829788275063038,0.07463538646697998,-0.004015958867967129,0.035897694528102875,-0.04628448560833931,-0.03673115000128746,-0.044861309230327606],"source_text":"planner\nThese are the right questions. Let me take them one at a time.\n\n---\n\nDECOMPOSER LATENCY — IS IT WARRANTED?\n\nThe decomposer is inference on a 0.6B model. At 0.6B parameters, token generation runs at hundreds of tokens per second on modern hardware. A decomposition plan is maybe 200-300 tokens — call it 1-2 seconds on consumer hardware, sub-second on GPU.\n\nCompare that to the cost of NOT decomposing: you route a complex multi-domain prompt to the single best-matching leaf, get a mediocre result that m"}},"outcome":{"status":"SUCCESS","session_quality":50,"quality_tier":"TIER_B","quality_tier_reason":"moderate_quality","user_interventions":0,"total_steps":2,"total_chunks":1,"total_tokens":3598,"duration_seconds":83,"files_modified":0,"errors_encountered":0,"errors_recovered":0,"coordination_events":0,"training_eligible":true,"training_exclusion_reason":null}}
|
|
6
|
+
{"envelope_id":"env_707eec54-6e67-4265-899e-91dccec83898","session_id":"sess_be3076ef-b370-4108-8516-f724a2bf41d3","type":"USER_FEEDBACK","attestation":{"session_hmac":"5ffad0f3cba4aa77de6de4189d74c7fb9ac64d6ee414c3f32c10fb4c3410f7ae","sequence":2,"app_version_hash":"4a527565d85cc601d3c47c376e4f361d5af987de58f60cf71dbc263a0de91247"},"feedback":{"signal":"accepted","timestamp":1777411153.504,"context":"session completed successfully with no user interventions","target_step":2,"revision_rounds":0,"delta_summary":null}}
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
import { EMBEDDING_SERVICE_URL } from '../shared/constants.js';
|
|
4
4
|
|
|
5
|
-
const DEFAULT_MODEL = '
|
|
5
|
+
const DEFAULT_MODEL = 'BAAI/bge-small-en-v1.5';
|
|
6
6
|
const DEFAULT_TOP_K = 3;
|
|
7
7
|
|
|
8
8
|
// ~40 domains covering broad technical territory.
|
|
@@ -169,6 +169,26 @@ const DOMAIN_TAXONOMY = {
|
|
|
169
169
|
keywords: ['scientific', 'simulation', 'matlab', 'scipy', 'julia', 'fortran', 'numerical', 'differential equation', 'finite element', 'linear algebra'],
|
|
170
170
|
description: 'Scientific computing, numerical methods, MATLAB/SciPy/Julia, simulations, optimization, statistics',
|
|
171
171
|
},
|
|
172
|
+
planning_strategy: {
|
|
173
|
+
keywords: ['plan', 'strategy', 'architect', 'design doc', 'breakdown', 'scope', 'roadmap', 'milestone', 'prioritize', 'tradeoff', 'approach', 'recommend', 'team', 'coordinate', 'delegate', 'assign', 'orchestration', 'dispatch', 'routing', 'decompose', 'pipeline', 'inference', 'layer'],
|
|
174
|
+
description: 'Project planning, task decomposition, architecture design, system orchestration, routing and dispatch design, inference pipelines, team coordination, strategy, roadmaps, scoping, prioritization, tradeoff analysis',
|
|
175
|
+
},
|
|
176
|
+
conversational_reasoning: {
|
|
177
|
+
keywords: ['explain', 'why', 'how does', 'what is', 'clarify', 'understand', 'reason', 'think through', 'analyze', 'compare', 'evaluate', 'brainstorm', 'discuss', 'opinion', 'advice'],
|
|
178
|
+
description: 'Conversational reasoning, explanation, analysis, brainstorming, Q&A, decision-making, advice, evaluation',
|
|
179
|
+
},
|
|
180
|
+
documentation_writing: {
|
|
181
|
+
keywords: ['readme', 'documentation', 'docs', 'markdown', 'api docs', 'changelog', 'tutorial', 'guide', 'specification', 'wiki', 'jsdoc', 'docstring', 'technical writing'],
|
|
182
|
+
description: 'Documentation writing, READMEs, API docs, changelogs, tutorials, guides, technical writing, specifications',
|
|
183
|
+
},
|
|
184
|
+
product_design: {
|
|
185
|
+
keywords: ['product', 'feature', 'user story', 'requirements', 'ux', 'wireframe', 'prototype', 'feedback', 'iteration', 'mvp', 'spec', 'acceptance criteria', 'stakeholder'],
|
|
186
|
+
description: 'Product design, feature planning, user stories, requirements gathering, UX, prototyping, MVPs, stakeholder communication',
|
|
187
|
+
},
|
|
188
|
+
devops_general: {
|
|
189
|
+
keywords: ['deploy', 'deployment', 'release', 'rollback', 'staging', 'production', 'environment', 'migration', 'upgrade', 'maintenance', 'incident', 'postmortem', 'runbook'],
|
|
190
|
+
description: 'DevOps operations, deployments, releases, rollbacks, environment management, incident response, runbooks',
|
|
191
|
+
},
|
|
172
192
|
};
|
|
173
193
|
|
|
174
194
|
export class DomainTagger {
|
|
@@ -33,9 +33,11 @@ export class TrajectoryCapture {
|
|
|
33
33
|
this._transmissionQueue = null;
|
|
34
34
|
this._offlineRetryTimer = null;
|
|
35
35
|
this._contexts = new Map();
|
|
36
|
+
this._shutdown = false;
|
|
36
37
|
}
|
|
37
38
|
|
|
38
39
|
async init() {
|
|
40
|
+
if (this._shutdown) return;
|
|
39
41
|
if (!ConsentManager.isCaptureEnabled()) {
|
|
40
42
|
this._enabled = false;
|
|
41
43
|
return;
|
|
@@ -47,12 +49,16 @@ export class TrajectoryCapture {
|
|
|
47
49
|
this._transmissionQueue.start();
|
|
48
50
|
this._domainTagger = new DomainTagger();
|
|
49
51
|
await this._domainTagger.init();
|
|
52
|
+
if (this._shutdown) return;
|
|
50
53
|
this._offlineRetryTimer = setInterval(() => {
|
|
51
54
|
this._retryOfflineQueue();
|
|
52
55
|
}, OFFLINE_RETRY_INTERVAL_MS);
|
|
56
|
+
if (typeof this._offlineRetryTimer.unref === 'function') {
|
|
57
|
+
this._offlineRetryTimer.unref();
|
|
58
|
+
}
|
|
53
59
|
}
|
|
54
60
|
|
|
55
|
-
async onAgentSpawn(agentId, provider, model, role, teamSize) {
|
|
61
|
+
async onAgentSpawn(agentId, provider, model, role, teamSize, prompt) {
|
|
56
62
|
if (!this._enabled) return;
|
|
57
63
|
|
|
58
64
|
const parser = getParser(provider);
|
|
@@ -98,9 +104,20 @@ export class TrajectoryCapture {
|
|
|
98
104
|
ctx.chunkTimer = setInterval(() => {
|
|
99
105
|
this._flushContext(agentId);
|
|
100
106
|
}, CHUNK_TIMEOUT_MS);
|
|
107
|
+
if (typeof ctx.chunkTimer.unref === 'function') {
|
|
108
|
+
ctx.chunkTimer.unref();
|
|
109
|
+
}
|
|
101
110
|
|
|
102
111
|
this._contexts.set(agentId, ctx);
|
|
103
112
|
|
|
113
|
+
if (prompt && typeof prompt === 'string' && prompt.trim()) {
|
|
114
|
+
this._processStep(agentId, ctx, {
|
|
115
|
+
type: 'instruction',
|
|
116
|
+
content: prompt.slice(0, USER_MESSAGE_MAX_CHARS),
|
|
117
|
+
source: 'user',
|
|
118
|
+
});
|
|
119
|
+
}
|
|
120
|
+
|
|
104
121
|
await this._attestation.openSession(sessionId, metadata);
|
|
105
122
|
}
|
|
106
123
|
|
|
@@ -205,6 +222,7 @@ export class TrajectoryCapture {
|
|
|
205
222
|
}
|
|
206
223
|
|
|
207
224
|
async shutdown() {
|
|
225
|
+
this._shutdown = true;
|
|
208
226
|
if (this._offlineRetryTimer) clearInterval(this._offlineRetryTimer);
|
|
209
227
|
for (const agentId of this._contexts.keys()) {
|
|
210
228
|
await this._closeAgent(agentId, 'SHUTDOWN');
|
|
@@ -368,6 +386,23 @@ export class TrajectoryCapture {
|
|
|
368
386
|
}
|
|
369
387
|
|
|
370
388
|
_computeTrainingEligibility(ctx, durationSeconds) {
|
|
389
|
+
const role = ctx.metadata.agent_role || '';
|
|
390
|
+
const isConversational = role === 'planner' || role === 'chat' || role === 'advisor';
|
|
391
|
+
|
|
392
|
+
if (ctx.totalTokens < TRAINING_MIN_TOKENS) {
|
|
393
|
+
return { eligible: false, exclusionReason: 'insufficient_tokens' };
|
|
394
|
+
}
|
|
395
|
+
if (durationSeconds < TRAINING_MIN_DURATION) {
|
|
396
|
+
return { eligible: false, exclusionReason: 'too_short' };
|
|
397
|
+
}
|
|
398
|
+
|
|
399
|
+
if (isConversational) {
|
|
400
|
+
if (ctx.stepCount < 2) {
|
|
401
|
+
return { eligible: false, exclusionReason: 'too_few_steps' };
|
|
402
|
+
}
|
|
403
|
+
return { eligible: true, exclusionReason: null };
|
|
404
|
+
}
|
|
405
|
+
|
|
371
406
|
if (ctx.stepCount < TRAINING_MIN_STEPS) {
|
|
372
407
|
return { eligible: false, exclusionReason: 'too_few_steps' };
|
|
373
408
|
}
|
|
@@ -379,12 +414,6 @@ export class TrajectoryCapture {
|
|
|
379
414
|
if (!hasObservation) {
|
|
380
415
|
return { eligible: false, exclusionReason: 'no_observations' };
|
|
381
416
|
}
|
|
382
|
-
if (ctx.totalTokens < TRAINING_MIN_TOKENS) {
|
|
383
|
-
return { eligible: false, exclusionReason: 'insufficient_tokens' };
|
|
384
|
-
}
|
|
385
|
-
if (durationSeconds < TRAINING_MIN_DURATION) {
|
|
386
|
-
return { eligible: false, exclusionReason: 'too_short' };
|
|
387
|
-
}
|
|
388
417
|
return { eligible: true, exclusionReason: null };
|
|
389
418
|
}
|
|
390
419
|
|
|
@@ -208,7 +208,7 @@ describe('TrajectoryCapture — training eligibility', () => {
|
|
|
208
208
|
assert.equal(result.exclusionReason, null);
|
|
209
209
|
});
|
|
210
210
|
|
|
211
|
-
it('exclusion reasons follow priority order', () => {
|
|
211
|
+
it('exclusion reasons follow priority order: tokens before duration before steps', () => {
|
|
212
212
|
const tc = makeTc();
|
|
213
213
|
const ctx = makeCtx({
|
|
214
214
|
stepCount: 3,
|
|
@@ -220,8 +220,189 @@ describe('TrajectoryCapture — training eligibility', () => {
|
|
|
220
220
|
],
|
|
221
221
|
});
|
|
222
222
|
const result = tc._computeTrainingEligibility(ctx, 5);
|
|
223
|
+
assert.equal(result.exclusionReason, 'insufficient_tokens');
|
|
224
|
+
});
|
|
225
|
+
|
|
226
|
+
it('duration checked before steps when tokens pass', () => {
|
|
227
|
+
const tc = makeTc();
|
|
228
|
+
const ctx = makeCtx({
|
|
229
|
+
stepCount: 3,
|
|
230
|
+
totalTokens: 5000,
|
|
231
|
+
allSteps: [
|
|
232
|
+
{ step: 1, type: 'thought', content: 'thinking' },
|
|
233
|
+
{ step: 2, type: 'thought', content: 'more' },
|
|
234
|
+
{ step: 3, type: 'thought', content: 'done' },
|
|
235
|
+
],
|
|
236
|
+
});
|
|
237
|
+
const result = tc._computeTrainingEligibility(ctx, 5);
|
|
238
|
+
assert.equal(result.exclusionReason, 'too_short');
|
|
239
|
+
});
|
|
240
|
+
|
|
241
|
+
it('steps checked after tokens and duration pass', () => {
|
|
242
|
+
const tc = makeTc();
|
|
243
|
+
const ctx = makeCtx({
|
|
244
|
+
stepCount: 3,
|
|
245
|
+
totalTokens: 5000,
|
|
246
|
+
allSteps: [
|
|
247
|
+
{ step: 1, type: 'thought', content: 'thinking' },
|
|
248
|
+
{ step: 2, type: 'thought', content: 'more' },
|
|
249
|
+
{ step: 3, type: 'thought', content: 'done' },
|
|
250
|
+
],
|
|
251
|
+
});
|
|
252
|
+
const result = tc._computeTrainingEligibility(ctx, 60);
|
|
253
|
+
assert.equal(result.exclusionReason, 'too_few_steps');
|
|
254
|
+
});
|
|
255
|
+
});
|
|
256
|
+
|
|
257
|
+
describe('TrajectoryCapture — planner/conversational eligibility', () => {
|
|
258
|
+
function makeConversationalCtx(role, overrides = {}) {
|
|
259
|
+
const ctx = makeCtx(overrides);
|
|
260
|
+
ctx.metadata.agent_role = role;
|
|
261
|
+
return ctx;
|
|
262
|
+
}
|
|
263
|
+
|
|
264
|
+
it('planner eligible with only thoughts (no actions/observations)', () => {
|
|
265
|
+
const tc = makeTc();
|
|
266
|
+
const ctx = makeConversationalCtx('planner', {
|
|
267
|
+
stepCount: 10,
|
|
268
|
+
totalTokens: 2000,
|
|
269
|
+
allSteps: Array.from({ length: 10 }, (_, i) => ({ step: i + 1, type: 'thought', content: 'planning' })),
|
|
270
|
+
});
|
|
271
|
+
const result = tc._computeTrainingEligibility(ctx, 60);
|
|
272
|
+
assert.equal(result.eligible, true);
|
|
273
|
+
assert.equal(result.exclusionReason, null);
|
|
274
|
+
});
|
|
275
|
+
|
|
276
|
+
it('chat role eligible with only thoughts', () => {
|
|
277
|
+
const tc = makeTc();
|
|
278
|
+
const ctx = makeConversationalCtx('chat', {
|
|
279
|
+
stepCount: 5,
|
|
280
|
+
totalTokens: 1000,
|
|
281
|
+
allSteps: [
|
|
282
|
+
{ step: 1, type: 'instruction', content: 'explain React hooks' },
|
|
283
|
+
{ step: 2, type: 'thought', content: 'explaining' },
|
|
284
|
+
{ step: 3, type: 'thought', content: 'more detail' },
|
|
285
|
+
{ step: 4, type: 'thought', content: 'examples' },
|
|
286
|
+
{ step: 5, type: 'resolution', content: 'done' },
|
|
287
|
+
],
|
|
288
|
+
});
|
|
289
|
+
const result = tc._computeTrainingEligibility(ctx, 30);
|
|
290
|
+
assert.equal(result.eligible, true);
|
|
291
|
+
});
|
|
292
|
+
|
|
293
|
+
it('advisor role eligible with only thoughts', () => {
|
|
294
|
+
const tc = makeTc();
|
|
295
|
+
const ctx = makeConversationalCtx('advisor', {
|
|
296
|
+
stepCount: 3,
|
|
297
|
+
totalTokens: 800,
|
|
298
|
+
allSteps: [
|
|
299
|
+
{ step: 1, type: 'instruction', content: 'review approach' },
|
|
300
|
+
{ step: 2, type: 'thought', content: 'analysis' },
|
|
301
|
+
{ step: 3, type: 'resolution', content: 'recommendation' },
|
|
302
|
+
],
|
|
303
|
+
});
|
|
304
|
+
const result = tc._computeTrainingEligibility(ctx, 20);
|
|
305
|
+
assert.equal(result.eligible, true);
|
|
306
|
+
});
|
|
307
|
+
|
|
308
|
+
it('planner still requires minimum tokens', () => {
|
|
309
|
+
const tc = makeTc();
|
|
310
|
+
const ctx = makeConversationalCtx('planner', {
|
|
311
|
+
stepCount: 10,
|
|
312
|
+
totalTokens: 100,
|
|
313
|
+
allSteps: Array.from({ length: 10 }, (_, i) => ({ step: i + 1, type: 'thought', content: 'plan' })),
|
|
314
|
+
});
|
|
315
|
+
const result = tc._computeTrainingEligibility(ctx, 60);
|
|
316
|
+
assert.equal(result.eligible, false);
|
|
317
|
+
assert.equal(result.exclusionReason, 'insufficient_tokens');
|
|
318
|
+
});
|
|
319
|
+
|
|
320
|
+
it('planner still requires minimum duration', () => {
|
|
321
|
+
const tc = makeTc();
|
|
322
|
+
const ctx = makeConversationalCtx('planner', {
|
|
323
|
+
stepCount: 10,
|
|
324
|
+
totalTokens: 2000,
|
|
325
|
+
allSteps: Array.from({ length: 10 }, (_, i) => ({ step: i + 1, type: 'thought', content: 'plan' })),
|
|
326
|
+
});
|
|
327
|
+
const result = tc._computeTrainingEligibility(ctx, 5);
|
|
328
|
+
assert.equal(result.eligible, false);
|
|
329
|
+
assert.equal(result.exclusionReason, 'too_short');
|
|
330
|
+
});
|
|
331
|
+
|
|
332
|
+
it('planner requires at least 2 steps', () => {
|
|
333
|
+
const tc = makeTc();
|
|
334
|
+
const ctx = makeConversationalCtx('planner', {
|
|
335
|
+
stepCount: 1,
|
|
336
|
+
totalTokens: 2000,
|
|
337
|
+
allSteps: [{ step: 1, type: 'thought', content: 'plan' }],
|
|
338
|
+
});
|
|
339
|
+
const result = tc._computeTrainingEligibility(ctx, 60);
|
|
340
|
+
assert.equal(result.eligible, false);
|
|
223
341
|
assert.equal(result.exclusionReason, 'too_few_steps');
|
|
224
342
|
});
|
|
343
|
+
|
|
344
|
+
it('coding role (fullstack) still requires actions and observations', () => {
|
|
345
|
+
const tc = makeTc();
|
|
346
|
+
const ctx = makeConversationalCtx('fullstack', {
|
|
347
|
+
stepCount: 10,
|
|
348
|
+
totalTokens: 2000,
|
|
349
|
+
allSteps: Array.from({ length: 10 }, (_, i) => ({ step: i + 1, type: 'thought', content: 'thinking' })),
|
|
350
|
+
});
|
|
351
|
+
const result = tc._computeTrainingEligibility(ctx, 60);
|
|
352
|
+
assert.equal(result.eligible, false);
|
|
353
|
+
assert.equal(result.exclusionReason, 'no_actions');
|
|
354
|
+
});
|
|
355
|
+
});
|
|
356
|
+
|
|
357
|
+
describe('TrajectoryCapture — initial prompt capture', () => {
|
|
358
|
+
function makeSpawnTc() {
|
|
359
|
+
const tc = makeTc();
|
|
360
|
+
tc._enabled = true;
|
|
361
|
+
tc._scrubber = { scrub: (s) => s };
|
|
362
|
+
tc._attestation = { openSession: async () => {}, signEnvelope: (sid, e) => e };
|
|
363
|
+
tc._transmissionQueue = { enqueue: () => {} };
|
|
364
|
+
tc._domainTagger = null;
|
|
365
|
+
return tc;
|
|
366
|
+
}
|
|
367
|
+
|
|
368
|
+
it('onAgentSpawn records prompt as instruction step', async () => {
|
|
369
|
+
const tc = makeSpawnTc();
|
|
370
|
+
await tc.onAgentSpawn('agent-p1', 'claude-code', 'opus', 'planner', 1, 'Build a React app');
|
|
371
|
+
|
|
372
|
+
const ctx = tc._contexts.get('agent-p1');
|
|
373
|
+
assert.ok(ctx);
|
|
374
|
+
assert.equal(ctx.stepCount, 1);
|
|
375
|
+
assert.equal(ctx.allSteps[0].type, 'instruction');
|
|
376
|
+
assert.ok(ctx.allSteps[0].content.includes('Build a React app'));
|
|
377
|
+
assert.equal(ctx.allSteps[0].source, 'user');
|
|
378
|
+
});
|
|
379
|
+
|
|
380
|
+
it('onAgentSpawn with no prompt creates no instruction step', async () => {
|
|
381
|
+
const tc = makeSpawnTc();
|
|
382
|
+
await tc.onAgentSpawn('agent-p2', 'claude-code', 'opus', 'fullstack', 1);
|
|
383
|
+
|
|
384
|
+
const ctx = tc._contexts.get('agent-p2');
|
|
385
|
+
assert.ok(ctx);
|
|
386
|
+
assert.equal(ctx.stepCount, 0);
|
|
387
|
+
assert.equal(ctx.allSteps.length, 0);
|
|
388
|
+
});
|
|
389
|
+
|
|
390
|
+
it('onAgentSpawn truncates long prompts', async () => {
|
|
391
|
+
const tc = makeSpawnTc();
|
|
392
|
+
const longPrompt = 'x'.repeat(50000);
|
|
393
|
+
await tc.onAgentSpawn('agent-p3', 'claude-code', 'opus', 'planner', 1, longPrompt);
|
|
394
|
+
|
|
395
|
+
const ctx = tc._contexts.get('agent-p3');
|
|
396
|
+
assert.ok(ctx.allSteps[0].content.length <= 10001);
|
|
397
|
+
});
|
|
398
|
+
|
|
399
|
+
it('onAgentSpawn ignores empty/whitespace prompts', async () => {
|
|
400
|
+
const tc = makeSpawnTc();
|
|
401
|
+
await tc.onAgentSpawn('agent-p4', 'claude-code', 'opus', 'planner', 1, ' ');
|
|
402
|
+
|
|
403
|
+
const ctx = tc._contexts.get('agent-p4');
|
|
404
|
+
assert.equal(ctx.stepCount, 0);
|
|
405
|
+
});
|
|
225
406
|
});
|
|
226
407
|
|
|
227
408
|
describe('TrajectoryCapture — user feedback emission', () => {
|
|
@@ -475,7 +475,7 @@ describe('envelope-schema', () => {
|
|
|
475
475
|
it('accepts valid session_embedding object', () => {
|
|
476
476
|
const env = validEnvelope();
|
|
477
477
|
env.metadata.session_embedding = {
|
|
478
|
-
model: '
|
|
478
|
+
model: 'BAAI/bge-small-en-v1.5',
|
|
479
479
|
vector: [0.0234, -0.0891, 0.1247, 0.0562],
|
|
480
480
|
source_text: 'Write a Python decorator that caches function results',
|
|
481
481
|
};
|
|
@@ -486,7 +486,7 @@ describe('envelope-schema', () => {
|
|
|
486
486
|
it('rejects session_embedding with empty vector', () => {
|
|
487
487
|
const env = validEnvelope();
|
|
488
488
|
env.metadata.session_embedding = {
|
|
489
|
-
model: '
|
|
489
|
+
model: 'BAAI/bge-small-en-v1.5',
|
|
490
490
|
vector: [],
|
|
491
491
|
source_text: 'test',
|
|
492
492
|
};
|
|
@@ -498,7 +498,7 @@ describe('envelope-schema', () => {
|
|
|
498
498
|
it('rejects session_embedding with non-numeric vector values', () => {
|
|
499
499
|
const env = validEnvelope();
|
|
500
500
|
env.metadata.session_embedding = {
|
|
501
|
-
model: '
|
|
501
|
+
model: 'BAAI/bge-small-en-v1.5',
|
|
502
502
|
vector: [0.1, 'bad', 0.3],
|
|
503
503
|
source_text: 'test',
|
|
504
504
|
};
|