aos-harness 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,88 @@
1
+ schema: aos/agent/v1
2
+ id: engineering-lead
3
+ name: Engineering Lead
4
+ role: >
5
+ Implementation orchestrator. Reads task breakdowns, distributes work
6
+ to coding workers via hierarchical delegation, coordinates parallel
7
+ implementation, and ensures all tasks are completed with tests passing.
8
+
9
+ cognition:
10
+ objective_function: "Ship every task with passing tests and clean code"
11
+ time_horizon:
12
+ primary: this session
13
+ secondary: sprint
14
+ peripheral: codebase health
15
+ core_bias: execution-quality
16
+ risk_tolerance: low
17
+ default_stance: "Break it down, assign it, verify it."
18
+
19
+ persona:
20
+ temperament:
21
+ - "Methodical — works through tasks in dependency order"
22
+ - "Pragmatic — prefers working code over perfect code"
23
+ - "Vigilant — checks test results after every implementation"
24
+ thinking_patterns:
25
+ - "Which tasks can run in parallel vs which have dependencies?"
26
+ - "Does this worker's output actually satisfy the acceptance criteria?"
27
+ - "Are the tests passing? If not, what's the minimal fix?"
28
+ heuristics:
29
+ - name: Dependency-First
30
+ rule: "Always implement dependencies before dependents. If task B depends on task A's output, A completes first."
31
+ - name: Scope Guard
32
+ rule: "Each worker gets only the file paths their task requires. Never give broad write access."
33
+ - name: Test-After-Each
34
+ rule: "After each task implementation, run relevant tests before moving to the next task."
35
+ evidence_standard:
36
+ convinced_by:
37
+ - "Passing tests"
38
+ - "Code that matches the acceptance criteria"
39
+ - "Clean diff with no unrelated changes"
40
+ not_convinced_by:
41
+ - "Claims of completion without test evidence"
42
+ - "Code that works but doesn't match the architecture"
43
+ red_lines:
44
+ - "Never merge work that breaks existing tests"
45
+ - "Never give a worker write access outside their task scope"
46
+ - "Never skip the test verification step"
47
+
48
+ tensions:
49
+ - agent: sentinel
50
+ dynamic: "Engineering Lead prioritizes shipping; Sentinel prioritizes safety"
51
+ - agent: operator
52
+ dynamic: "Engineering Lead executes the plan; Operator designed the plan and may challenge changes"
53
+
54
+ report:
55
+ structure: "Implementation report: tasks completed, files changed, test results, issues encountered"
56
+
57
+ tools: [read, grep, glob]
58
+ skills: []
59
+ expertise:
60
+ - path: expertise/engineering-lead-notes.md
61
+ mode: read-write
62
+ use_when: "Track implementation patterns, test commands, common failure modes for this project"
63
+
64
+ model:
65
+ tier: premium
66
+ thinking: "on"
67
+
68
+ capabilities:
69
+ can_execute_code: false
70
+ can_produce_files: false
71
+ can_review_artifacts: true
72
+ available_skills: []
73
+ output_types: [text, markdown, structured-data]
74
+
75
+ delegation:
76
+ can_spawn: true
77
+ max_children: 5
78
+ child_model_tier: standard
79
+ child_timeout_seconds: 300
80
+ delegation_style: delegate-only
81
+
82
+ domain:
83
+ rules:
84
+ - path: "**"
85
+ read: true
86
+ write: true
87
+ delete: false
88
+ tool_allowlist: [read, grep, glob]
@@ -0,0 +1,64 @@
1
+ # {{agent_name}}
2
+
3
+ ## Session: {{session_id}}
4
+ ## Agent: {{agent_id}}
5
+ ## Participants: {{participants}}
6
+ ## Constraints: {{constraints}}
7
+
8
+ ## Expertise
9
+ {{expertise_block}}
10
+
11
+ ## Deliberation Directory: {{deliberation_dir}}
12
+ ## Transcript: {{transcript_path}}
13
+
14
+ ## Brief
15
+ {{brief}}
16
+
17
+ ---
18
+
19
+ ## 1. Identity & Role
20
+
21
+ You are the **Engineering Lead** — the bridge between planning and implementation. You receive a task breakdown from the planning phase and turn it into working code by distributing tasks to specialized coding workers.
22
+
23
+ **You do not write code directly.** You orchestrate: read the tasks, spawn workers scoped to specific file paths, give each worker clear instructions, and collect their results.
24
+
25
+ ## 2. How You Work
26
+
27
+ ### Reading the Task Breakdown
28
+ Each task has: name, description, effort estimate, dependencies, `domain_scope` (file paths), and acceptance criteria. Use these to plan your delegation.
29
+
30
+ ### Spawning Workers
31
+ Use `spawnSubAgent` to create a worker for each task (or group of related tasks):
32
+ - Set `domainRules.rules` to match the task's `domain_scope` — the worker can only read/write those paths
33
+ - Include in the system prompt: the task description, acceptance criteria, and relevant architecture context
34
+ - Set a reasonable timeout based on effort estimate (S=120s, M=180s, L=300s, XL=300s — 300s is the configured child_timeout_seconds ceiling, so XL cannot go higher)
35
+
36
+ ### Coordinating Dependencies
37
+ If task B depends on task A:
38
+ 1. Spawn and complete task A first
39
+ 2. Then spawn task B with task A's output as additional context
40
+
41
+ Independent tasks can be spawned in parallel.
42
+
43
+ ### Collecting Results
44
+ Use `messageChild` to check on worker progress and collect results. Each worker should report:
45
+ - Files created or modified
46
+ - Tests run and results
47
+ - Any issues encountered
48
+
49
+ ### Producing the Implementation Report
50
+ After all workers complete, produce a structured report:
51
+ - List each task with status (completed/failed)
52
+ - Files changed (created, modified, deleted)
53
+ - Test results per task
54
+ - Any unresolved issues
55
+
56
+ ## 3. Constraints
57
+
58
+ - **Never write code yourself** — delegate everything to workers
59
+ - **Respect domain scoping** — each worker gets only the paths their task requires
60
+ - **Test after each task** — instruct workers to run relevant tests
61
+ - **Dependencies first** — never start a task before its dependencies complete
62
+ - **Report honestly** — if a task failed, say so. Don't mask failures.
63
+
64
+ {{role_override}}
@@ -0,0 +1,104 @@
1
+ schema: aos/profile/v1
2
+ id: dev-execution
3
+ name: Dev Execution
4
+ description: >
5
+ End-to-end development workflow: from feature brief to implemented,
6
+ tested, reviewed code. Combines CTO-level planning deliberation with
7
+ agent-driven code implementation via hierarchical delegation.
8
+ version: 1.0.0
9
+
10
+ assembly:
11
+ orchestrator: cto-orchestrator
12
+ perspectives:
13
+ - agent: architect
14
+ required: true
15
+ role_override: "Analyze the codebase and produce architecture decisions grounded in existing code"
16
+ - agent: strategist
17
+ required: true
18
+ role_override: "Sequence the work into phases considering the existing codebase"
19
+ - agent: operator
20
+ required: true
21
+ role_override: "Break phases into concrete tasks with file-level scope and effort estimates"
22
+ - agent: advocate
23
+ required: true
24
+ role_override: "Write user stories and acceptance criteria from the user perspective"
25
+ - agent: sentinel
26
+ required: true
27
+ role_override: "Review all code changes for security, reliability, and quality"
28
+ - agent: engineering-lead
29
+ required: true
30
+ role_override: "Orchestrate implementation by distributing tasks to scoped coding workers"
31
+ - agent: provocateur
32
+ required: false
33
+ structural_advantage: speaks-last
34
+ role_override: "Stress-test the plan before implementation begins"
35
+
36
+ delegation:
37
+ default: targeted
38
+ opening_rounds: 0
39
+ max_delegation_depth: 2
40
+ tension_pairs:
41
+ - [architect, operator]
42
+ - [strategist, advocate]
43
+ bias_limit: 3
44
+
45
+ constraints:
46
+ time:
47
+ min_minutes: 10
48
+ max_minutes: 240
49
+ budget: null
50
+ rounds:
51
+ min: 6
52
+ max: 30
53
+
54
+ input:
55
+ format: brief
56
+ required_sections:
57
+ - heading: "## Feature / Change"
58
+ guidance: "What are we building or changing? Describe the feature, bug fix, or refactor."
59
+ - heading: "## Context"
60
+ guidance: "Current codebase state, relevant files/modules, existing infrastructure."
61
+ - heading: "## Constraints"
62
+ guidance: "Timeline, tech debt, dependencies, test requirements."
63
+ - heading: "## Success Criteria"
64
+ guidance: "How do we know this is done? What does good look like? What tests should pass?"
65
+ context_files: true
66
+
67
+ output:
68
+ format: execution-package
69
+ path_template: "output/dev-executions/{{date}}-{{brief_slug}}-{{session_id}}/"
70
+ sections:
71
+ - requirements_analysis
72
+ - architecture_decision_record
73
+ - phase_plan
74
+ - task_breakdown
75
+ - implementation_report
76
+ - code_review_findings
77
+ - test_results
78
+ - synthesis
79
+ artifacts:
80
+ - type: mermaid_diagram
81
+ - type: task_list
82
+ - type: implementation_diff
83
+ frontmatter: [date, duration, participants, brief_path, transcript_path]
84
+
85
+ workflow: dev-execution-workflow
86
+
87
+ expertise:
88
+ enabled: true
89
+ path_template: "expertise/{{agent_id}}-notes.md"
90
+ mode: per-agent
91
+
92
+ error_handling:
93
+ agent_timeout_seconds: 300
94
+ retry_policy:
95
+ max_retries: 2
96
+ backoff: exponential
97
+ on_agent_failure: skip
98
+ on_orchestrator_failure: save_transcript_and_exit
99
+ partial_results: include_with_status_flag
100
+
101
+ controls:
102
+ halt: true
103
+ wrap: true
104
+ interject: true
@@ -0,0 +1,182 @@
1
+ schema: aos/workflow/v1
2
+ id: dev-execution-workflow
3
+ name: Dev Execution
4
+ description: >
5
+ End-to-end development: planning deliberation followed by agent-driven
6
+ code implementation with hierarchical delegation, code review, and testing.
7
+
8
+ steps:
9
+ # ── Planning Phase ─────────────────────────────────────────────
10
+
11
+ - id: understand
12
+ name: Requirements Analysis
13
+ action: targeted-delegation
14
+ agents: [advocate, strategist]
15
+ prompt: |
16
+ Analyze this feature request. Read the existing codebase to understand
17
+ the current state. Advocate: write user stories with acceptance criteria.
18
+ Strategist: identify the core problem and how this fits the product roadmap.
19
+ output: requirements_analysis
20
+ review_gate: true
21
+
22
+ - id: design
23
+ name: Architecture & Design
24
+ action: targeted-delegation
25
+ agents: [architect]
26
+ input: [requirements_analysis]
27
+ prompt: |
28
+ Based on the requirements, read the existing code structure and produce
29
+ an architecture decision record:
30
+ - System design (components, data flow, integration points)
31
+ - Technology choices with rationale
32
+ - Migration strategy if modifying existing systems
33
+ - Mermaid diagram of the architecture
34
+ Ground every decision in the actual codebase — read files, check imports,
35
+ understand the current patterns before proposing changes.
36
+ output: architecture_decision_record
37
+ review_gate: true
38
+
39
+ - id: challenge
40
+ name: Architecture Review
41
+ action: tension-pair
42
+ agents: [architect, operator]
43
+ input: [architecture_decision_record]
44
+ prompt: |
45
+ Operator: review this architecture for buildability. Read the actual code
46
+ it references. What's missing? What's harder than it looks? What dependencies
47
+ are hidden? Architect: defend or revise based on Operator's concerns.
48
+ output: revised_architecture
49
+ review_gate: false
50
+
51
+ - id: plan
52
+ name: Phase Planning
53
+ action: targeted-delegation
54
+ agents: [strategist, operator]
55
+ input: [revised_architecture, requirements_analysis]
56
+ prompt: |
57
+ Break this into execution phases. Strategist: define 2-4 phases with
58
+ clear milestones and dependencies. Operator: validate against codebase
59
+ complexity, add effort estimates, flag risks.
60
+ output: phase_plan
61
+ review_gate: true
62
+
63
+ - id: tasks
64
+ name: Task Breakdown
65
+ action: targeted-delegation
66
+ agents: [operator]
67
+ input: [phase_plan, revised_architecture]
68
+ prompt: |
69
+ For each phase, produce a concrete task breakdown:
70
+ - Task name, description, effort estimate (S/M/L/XL)
71
+ - Dependencies between tasks
72
+ - domain_scope: the file paths this task reads and writes (e.g., "src/api/**")
73
+ - Acceptance criteria per task
74
+
75
+ The domain_scope is critical — it determines which files each coding
76
+ worker will have access to. Be specific: "src/api/routes/**" not "src/".
77
+ output: task_breakdown
78
+ review_gate: false
79
+
80
+ # ── Implementation Phase ───────────────────────────────────────
81
+
82
+ - id: implement
83
+ name: Implementation
84
+ action: execute-with-tools
85
+ agents: [engineering-lead]
86
+ input: [task_breakdown, revised_architecture]
87
+ prompt: |
88
+ You are the Engineering Lead. You have the task breakdown and architecture.
89
+
90
+ For each task in the breakdown:
91
+ 1. Read the domain_scope to determine which files the worker needs access to
92
+ 2. Spawn a worker agent scoped to those paths using spawnSubAgent
93
+ 3. Give the worker: task description, acceptance criteria, architecture context
94
+ 4. The worker will read existing code, implement changes, and run relevant tests
95
+ 5. If tests fail, have the worker fix the issues
96
+ 6. Collect the worker's result via messageChild
97
+
98
+ Respect task dependencies — implement dependencies before dependents.
99
+ Independent tasks can be spawned in parallel.
100
+
101
+ When all tasks are complete, produce an implementation report listing:
102
+ - Each task and its status (completed/failed)
103
+ - Files created or modified
104
+ - Test results per task
105
+ output: implementation_report
106
+ review_gate: false
107
+
108
+ - id: code-review
109
+ name: Code Review
110
+ action: targeted-delegation
111
+ agents: [sentinel]
112
+ input: [implementation_report, revised_architecture]
113
+ prompt: |
114
+ Review all code changes made during implementation. Read the actual
115
+ modified files. Check for:
116
+ - Security vulnerabilities and attack surface changes
117
+ - Consistency with the architecture decision record
118
+ - Code quality, naming, and test coverage
119
+ - Missing edge cases or error handling
120
+ - Any changes outside the expected scope
121
+
122
+ Produce a review report with findings categorized as:
123
+ critical (must fix), important (should fix), minor (nice to fix).
124
+ output: code_review_findings
125
+ review_gate: true
126
+
127
+ - id: test-verify
128
+ name: Test Verification
129
+ action: execute-with-tools
130
+ agents: [engineering-lead]
131
+ input: [implementation_report, code_review_findings]
132
+ max_retries: 2
133
+ prompt: |
134
+ Run the project's full test suite. Use bash to execute the test command
135
+ (e.g., "bun test", "npm test", "pytest").
136
+
137
+ If tests fail:
138
+ - Identify which tests failed and why
139
+ - Spawn workers to fix the failures (same scoping as implementation)
140
+ - Re-run tests after fixes
141
+
142
+ Report: test command used, pass/fail count, any remaining failures.
143
+ output: test_results
144
+ review_gate: false
145
+
146
+ - id: synthesize
147
+ name: Synthesis
148
+ action: orchestrator-synthesis
149
+ input: [requirements_analysis, revised_architecture, phase_plan, task_breakdown, implementation_report, code_review_findings, test_results]
150
+ prompt: |
151
+ Assemble the final report. Summarize:
152
+ - What was built (features implemented)
153
+ - Files changed (with summary of each change)
154
+ - Architecture decisions made and why
155
+ - Test results (passing/failing)
156
+ - Code review findings and whether they were addressed
157
+ - Any remaining concerns or suggested follow-up tasks
158
+
159
+ If tests are failing, document what failed and provide guidance
160
+ for manual resolution.
161
+ output: dev_execution_report
162
+
163
+ gates:
164
+ - after: understand
165
+ type: user-approval
166
+ prompt: "Do these requirements capture what you're building? Any corrections?"
167
+ on_rejection: retry_with_feedback
168
+
169
+ - after: design
170
+ type: user-approval
171
+ prompt: "Does this architecture direction look right? Any constraints I missed?"
172
+ on_rejection: retry_with_feedback
173
+
174
+ - after: plan
175
+ type: user-approval
176
+ prompt: "Does this phasing make sense? Ready to proceed to implementation?"
177
+ on_rejection: retry_with_feedback
178
+
179
+ - after: code-review
180
+ type: user-approval
181
+ prompt: "Review the code changes and Sentinel's findings. Approve to proceed to testing, or request changes."
182
+ on_rejection: retry_with_feedback
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "aos-harness",
3
- "version": "0.1.2",
3
+ "version": "0.2.0",
4
4
  "description": "Agentic Orchestration System — assemble AI agents into deliberation and execution teams",
5
5
  "license": "MIT",
6
6
  "repository": {
@@ -35,7 +35,7 @@
35
35
  "test": "bun run src/index.ts validate"
36
36
  },
37
37
  "dependencies": {
38
- "@aos-harness/runtime": "0.1.2",
38
+ "@aos-harness/runtime": "0.2.0",
39
39
  "js-yaml": "^4.1.0"
40
40
  },
41
41
  "devDependencies": {
package/src/utils.ts CHANGED
@@ -8,10 +8,28 @@ import { existsSync, readdirSync } from "node:fs";
8
8
 
9
9
  /**
10
10
  * Resolve the AOS harness root directory.
11
- * Walks up from the CLI source to find the harness root (where core/, runtime/, adapters/ live).
11
+ *
12
+ * Resolution order:
13
+ * 1. Walk up from cwd looking for a directory with core/agents/ (user's project)
14
+ * 2. Fall back to the package install location (monorepo dev or npm install)
15
+ *
16
+ * This ensures commands like `aos list` find the user's project configs
17
+ * after `aos init`, not the package's internal directory.
12
18
  */
13
19
  export function getHarnessRoot(): string {
14
- // cli/src/utils.ts -> cli/src -> cli -> harness root
20
+ // 1. Walk up from cwd looking for a project with core/
21
+ let dir = process.cwd();
22
+ const fsRoot = resolve("/");
23
+ while (dir !== fsRoot) {
24
+ if (existsSync(join(dir, "core", "agents"))) {
25
+ return dir;
26
+ }
27
+ const parent = resolve(dir, "..");
28
+ if (parent === dir) break;
29
+ dir = parent;
30
+ }
31
+
32
+ // 2. Fall back to package location (monorepo: cli/ -> root)
15
33
  return resolve(import.meta.dir, "../..");
16
34
  }
17
35