@opencode_weave/weave 0.7.1 → 0.7.4-preview.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -196
- package/dist/agents/tapestry/prompt-composer.d.ts +3 -1
- package/dist/config/schema.d.ts +3 -0
- package/dist/features/analytics/generate-metrics-report.d.ts +4 -4
- package/dist/features/analytics/index.d.ts +4 -3
- package/dist/features/analytics/plan-token-aggregator.d.ts +24 -1
- package/dist/features/analytics/quality-score.d.ts +30 -0
- package/dist/features/analytics/session-tracker.d.ts +5 -0
- package/dist/features/analytics/types.d.ts +51 -14
- package/dist/features/evals/evaluators/trajectory-assertion.d.ts +2 -0
- package/dist/features/evals/executors/github-models-api.d.ts +13 -0
- package/dist/features/evals/executors/model-response.d.ts +6 -1
- package/dist/features/evals/executors/prompt-renderer.d.ts +1 -1
- package/dist/features/evals/executors/trajectory-run.d.ts +3 -0
- package/dist/features/evals/index.d.ts +8 -5
- package/dist/features/evals/loader.d.ts +2 -1
- package/dist/features/evals/reporter.d.ts +1 -0
- package/dist/features/evals/runner.d.ts +1 -1
- package/dist/features/evals/schema.d.ts +65 -16
- package/dist/features/evals/storage.d.ts +2 -0
- package/dist/features/evals/types.d.ts +43 -2
- package/dist/features/skill-loader/loader.d.ts +2 -0
- package/dist/features/workflow/context.d.ts +2 -1
- package/dist/features/workflow/discovery.d.ts +6 -3
- package/dist/features/workflow/hook.d.ts +2 -0
- package/dist/hooks/compaction-todo-preserver.d.ts +20 -0
- package/dist/hooks/create-hooks.d.ts +4 -0
- package/dist/hooks/index.d.ts +6 -0
- package/dist/hooks/todo-continuation-enforcer.d.ts +25 -0
- package/dist/hooks/todo-description-override.d.ts +18 -0
- package/dist/hooks/todo-writer.d.ts +17 -0
- package/dist/index.js +755 -254
- package/dist/plugin/types.d.ts +1 -1
- package/dist/shared/resolve-safe-path.d.ts +14 -0
- package/package.json +10 -8
- package/dist/features/analytics/suggestions.d.ts +0 -10
package/README.md
CHANGED
|
@@ -6,38 +6,6 @@
|
|
|
6
6
|
|
|
7
7
|
Weave is a lean OpenCode plugin with multi-agent orchestration. It provides a cohesive framework for weaving agents, tools, and skills into structured workflows. By delegating complex tasks to specialized agents and monitoring execution state through hooks, Weave ensures reliable and efficient project development.
|
|
8
8
|
|
|
9
|
-
## Table of Contents
|
|
10
|
-
|
|
11
|
-
- [Overview](#overview)
|
|
12
|
-
- [Documentation](#documentation)
|
|
13
|
-
- [Agents](#agents)
|
|
14
|
-
- [Agent Modes](#agent-modes)
|
|
15
|
-
- [Agent Details](#agent-details)
|
|
16
|
-
- [Workflow](#workflow)
|
|
17
|
-
- [When the Full Workflow Is Used](#when-the-full-workflow-is-used)
|
|
18
|
-
- [1. Plan](#1-plan)
|
|
19
|
-
- [2. Review (Optional)](#2-review-optional)
|
|
20
|
-
- [3. Execute](#3-execute)
|
|
21
|
-
- [Resuming Interrupted Work](#resuming-interrupted-work)
|
|
22
|
-
- [Quick Tasks (No Plan Needed)](#quick-tasks-no-plan-needed)
|
|
23
|
-
- [Installation](#installation)
|
|
24
|
-
- [Prerequisites](#prerequisites)
|
|
25
|
-
- [Step 1: Add to opencode.json](#step-1-add-to-opencodejson)
|
|
26
|
-
- [Step 2: Restart OpenCode](#step-2-restart-opencode)
|
|
27
|
-
- [Troubleshooting](#troubleshooting)
|
|
28
|
-
- [Uninstalling](#uninstalling)
|
|
29
|
-
- [Configuration](#configuration)
|
|
30
|
-
- [Example Configuration](#example-configuration)
|
|
31
|
-
- [Configuration Fields](#configuration-fields)
|
|
32
|
-
- [Features](#features)
|
|
33
|
-
- [Hooks](#hooks)
|
|
34
|
-
- [Skills](#skills)
|
|
35
|
-
- [Background Agents](#background-agents)
|
|
36
|
-
- [Tool Permissions](#tool-permissions)
|
|
37
|
-
- [Development](#development)
|
|
38
|
-
- [Acknowledgments](#acknowledgments)
|
|
39
|
-
- [License](#license)
|
|
40
|
-
|
|
41
9
|
## Overview
|
|
42
10
|
|
|
43
11
|
- **8 specialized agents** with weaving-themed names designed for specific roles in the development lifecycle.
|
|
@@ -50,7 +18,9 @@ Weave is a lean OpenCode plugin with multi-agent orchestration. It provides a co
|
|
|
50
18
|
|
|
51
19
|
## Documentation
|
|
52
20
|
|
|
53
|
-
|
|
21
|
+
For detailed guides on configuration, workflows, agents, features, and more, visit the **[Weave documentation](https://tryweave.io/docs/)**.
|
|
22
|
+
|
|
23
|
+
For agent routing eval trends and dashboards, see the **[Eval Dashboard](https://tryweave.io/evals/)**.
|
|
54
24
|
|
|
55
25
|
## Agents
|
|
56
26
|
|
|
@@ -65,87 +35,6 @@ Visit [tryweave.io](https://tryweave.io) for more information, or head straight
|
|
|
65
35
|
| **Weft** | reviewer/auditor | subagent | Reviews completed work and plans with a critical but fair eye, rejecting only for true blocking issues. |
|
|
66
36
|
| **Warp** | security auditor | subagent | Audits code changes for security vulnerabilities and specification compliance with a skeptical bias. |
|
|
67
37
|
|
|
68
|
-
### Agent Modes
|
|
69
|
-
|
|
70
|
-
- `primary`: Respects the user-selected model in the OpenCode UI.
|
|
71
|
-
- `subagent`: Uses its own model or fallback chain, ignoring UI selection for predictable specialization.
|
|
72
|
-
- `all`: Available in both primary and subagent contexts.
|
|
73
|
-
|
|
74
|
-
### Agent Details
|
|
75
|
-
|
|
76
|
-
**Loom** is the central orchestrator and the default entry point for every request. It breaks down complex problems into tasks, decides which agents to delegate to, and tracks progress obsessively via todo lists. Loom never implements code directly — it plans and delegates. For quick fixes it acts immediately; for complex work it kicks off the plan → review → execute workflow.
|
|
77
|
-
|
|
78
|
-
**Pattern** is the strategic planner. When a task requires 5+ steps or involves architectural decisions, Loom delegates to Pattern, which researches the codebase (via Thread) and external docs (via Spindle), then produces a structured implementation plan saved to `.weave/plans/{name}.md`. Plans use `- [ ]` checkboxes for every actionable task. Pattern never writes code — only plans.
|
|
79
|
-
|
|
80
|
-
**Weft** is the reviewer and auditor. It validates plans before execution and reviews completed work after implementation. Weft is approval-biased and only rejects for true blocking issues (max 3 per review). It checks that file references are correct, tasks have sufficient context, implementations match requirements, and no stubs or TODOs are left behind. Weft is read-only.
|
|
81
|
-
|
|
82
|
-
**Warp** is the security and specification compliance auditor. It reviews code changes for security vulnerabilities (injection, auth bypass, token handling, crypto weaknesses) and verifies compliance with standards like OAuth2, OIDC, WebAuthn, and JWT. Warp has a skeptical bias — unlike Weft, it rejects by default when security patterns are detected. It self-triages to fast-exit on non-security changes, and can webfetch RFCs for verification. Warp is read-only.
|
|
83
|
-
|
|
84
|
-
**Tapestry** is the execution engine. Activated by the `/start-work` command, it reads a plan from `.weave/plans/` and works through tasks sequentially — writing code, running commands, verifying output, and marking checkboxes as it goes. Tapestry cannot spawn subagents; it focuses on heads-down implementation. If interrupted, it resumes from the first unchecked task.
|
|
85
|
-
|
|
86
|
-
**Thread** is the fast codebase explorer. Loom delegates to Thread whenever it needs to understand code structure, find files, or answer questions about the repository. Thread uses grep, glob, and read tools with zero creativity (temperature 0.0) to return precise, factual answers with file paths and line numbers. Thread is read-only.
|
|
87
|
-
|
|
88
|
-
**Spindle** is the external researcher. When Loom needs documentation for a library, API reference, or any information outside the codebase, Spindle fetches URLs, reads docs, and synthesizes findings with source citations. Spindle is read-only.
|
|
89
|
-
|
|
90
|
-
**Shuttle** is the domain specialist. When work falls into a specific category (e.g., visual engineering, data processing), Loom dispatches Shuttle with full tool access to execute the task. Shuttle's model and configuration can be overridden per-category for domain-optimized performance.
|
|
91
|
-
|
|
92
|
-
## Workflow
|
|
93
|
-
|
|
94
|
-
Weave uses a structured **Plan → Review → Execute** workflow for complex tasks. Simple requests are handled directly by Loom without the full cycle.
|
|
95
|
-
|
|
96
|
-
### When the Full Workflow Is Used
|
|
97
|
-
|
|
98
|
-
- Tasks requiring 5+ steps or architectural decisions
|
|
99
|
-
- Multi-file refactors or new feature implementations
|
|
100
|
-
- Work that benefits from a reviewable plan before execution
|
|
101
|
-
|
|
102
|
-
### 1. Plan
|
|
103
|
-
|
|
104
|
-
Loom delegates to **Pattern**, which researches the codebase and produces a detailed implementation plan:
|
|
105
|
-
|
|
106
|
-
```
|
|
107
|
-
User Request → Loom (assesses complexity) → Pattern (researches + plans)
|
|
108
|
-
↓
|
|
109
|
-
.weave/plans/{name}.md
|
|
110
|
-
```
|
|
111
|
-
|
|
112
|
-
The plan includes clear objectives, deliverables, and atomic tasks marked with `- [ ]` checkboxes. Pattern never writes code.
|
|
113
|
-
|
|
114
|
-
### 2. Review (Optional)
|
|
115
|
-
|
|
116
|
-
For high-stakes or complex plans, Loom delegates to **Weft** to validate the plan before execution:
|
|
117
|
-
|
|
118
|
-
```
|
|
119
|
-
.weave/plans/{name}.md → Weft (validates) → APPROVE or REJECT
|
|
120
|
-
```
|
|
121
|
-
|
|
122
|
-
Weft checks that referenced files exist, tasks have sufficient context, and there are no contradictions. If rejected, issues are sent back to Pattern for revision.
|
|
123
|
-
|
|
124
|
-
### 3. Execute
|
|
125
|
-
|
|
126
|
-
The user runs `/start-work` to begin execution:
|
|
127
|
-
|
|
128
|
-
```
|
|
129
|
-
/start-work [plan-name] → creates .weave/state.json → switches to Tapestry
|
|
130
|
-
```
|
|
131
|
-
|
|
132
|
-
**Tapestry** reads the plan and executes tasks sequentially:
|
|
133
|
-
|
|
134
|
-
1. Find the first unchecked `- [ ]` task
|
|
135
|
-
2. Implement the task (write code, run commands, create files)
|
|
136
|
-
3. Verify completion (read files, run tests, check acceptance criteria)
|
|
137
|
-
4. Mark the checkbox `- [x]`
|
|
138
|
-
5. Move to the next unchecked task
|
|
139
|
-
6. When all tasks are complete, report a final summary
|
|
140
|
-
|
|
141
|
-
### Resuming Interrupted Work
|
|
142
|
-
|
|
143
|
-
If a session is interrupted, running `/start-work` again resumes from the first unchecked task — no re-planning or restarting. The work state is persisted in `.weave/state.json`, so progress is never lost.
|
|
144
|
-
|
|
145
|
-
### Quick Tasks (No Plan Needed)
|
|
146
|
-
|
|
147
|
-
For simple requests — single-file fixes, quick questions, small edits — Loom handles the work directly or delegates to the appropriate agent without creating a formal plan.
|
|
148
|
-
|
|
149
38
|
## Installation
|
|
150
39
|
|
|
151
40
|
This package is published on [npm](https://www.npmjs.com/package/@opencode_weave/weave).
|
|
@@ -211,88 +100,6 @@ If you no longer use Weave in any project, remove the global configuration:
|
|
|
211
100
|
rm -f ~/.config/opencode/weave-opencode.jsonc ~/.config/opencode/weave-opencode.json
|
|
212
101
|
```
|
|
213
102
|
|
|
214
|
-
## Configuration
|
|
215
|
-
|
|
216
|
-
Weave searches for configuration files in the following locations, merging them in order (user config → project config → defaults):
|
|
217
|
-
|
|
218
|
-
- **Project**: `.opencode/weave-opencode.jsonc` or `.opencode/weave-opencode.json`
|
|
219
|
-
- **User**: `~/.config/opencode/weave-opencode.jsonc` or `~/.config/opencode/weave-opencode.json`
|
|
220
|
-
|
|
221
|
-
The configuration uses JSONC format, allowing for comments and trailing commas.
|
|
222
|
-
|
|
223
|
-
### Example Configuration
|
|
224
|
-
|
|
225
|
-
```jsonc
|
|
226
|
-
{
|
|
227
|
-
// Override agent models and parameters
|
|
228
|
-
"agents": {
|
|
229
|
-
"loom": {
|
|
230
|
-
"model": "anthropic/claude-3-5-sonnet",
|
|
231
|
-
"temperature": 0.1
|
|
232
|
-
},
|
|
233
|
-
"thread": {
|
|
234
|
-
"model": "openai/gpt-4o-mini"
|
|
235
|
-
}
|
|
236
|
-
},
|
|
237
|
-
// Category-based dispatch overrides
|
|
238
|
-
"categories": {
|
|
239
|
-
"visual-engineering": {
|
|
240
|
-
"model": "google/gemini-2-pro"
|
|
241
|
-
}
|
|
242
|
-
},
|
|
243
|
-
// Selective feature toggling
|
|
244
|
-
"disabled_hooks": [],
|
|
245
|
-
"disabled_agents": [],
|
|
246
|
-
"disabled_tools": [],
|
|
247
|
-
"disabled_skills": [],
|
|
248
|
-
// Background agent concurrency limits
|
|
249
|
-
"background": {
|
|
250
|
-
"defaultConcurrency": 5
|
|
251
|
-
}
|
|
252
|
-
}
|
|
253
|
-
```
|
|
254
|
-
|
|
255
|
-
### Configuration Fields
|
|
256
|
-
|
|
257
|
-
- `agents` — Override model, temperature, prompt_append, tools, and skills per agent.
|
|
258
|
-
- `categories` — Custom model and tool configurations for category-based dispatch.
|
|
259
|
-
- `disabled_hooks` / `disabled_agents` / `disabled_tools` / `disabled_skills` — Selective feature disabling.
|
|
260
|
-
- `background` — Concurrency limits and timeouts for parallel background agents.
|
|
261
|
-
- `tmux` — Terminal multiplexer layout settings for TUI integration.
|
|
262
|
-
- `skills` — Custom skill discovery paths and recursion settings.
|
|
263
|
-
- `experimental` — Plugin load timeouts and context window threshold adjustments.
|
|
264
|
-
|
|
265
|
-
## Features
|
|
266
|
-
|
|
267
|
-
### Hooks
|
|
268
|
-
|
|
269
|
-
Weave includes 5 built-in hooks that monitor and modify agent behavior:
|
|
270
|
-
|
|
271
|
-
- `context-window-monitor` — Warns when token usage approaches limits and suggests recovery strategies.
|
|
272
|
-
- `write-existing-file-guard` — Tracks file reads to prevent agents from overwriting files they haven't examined.
|
|
273
|
-
- `rules-injector` — Automatically injects contextual rules when agents enter directories containing AGENTS.md.
|
|
274
|
-
- `first-message-variant` — Applies specific prompt variants on session start for consistent behavior.
|
|
275
|
-
- `keyword-detector` — Detects keywords in messages to trigger behavioral changes or agent switches.
|
|
276
|
-
|
|
277
|
-
All hooks are enabled by default and can be disabled via the `disabled_hooks` configuration.
|
|
278
|
-
|
|
279
|
-
### Skills
|
|
280
|
-
|
|
281
|
-
Skills are injectable prompt expertise loaded from markdown files (SKILL.md). They modify agent behavior by prepending domain-specific instructions to the agent's system prompt.
|
|
282
|
-
|
|
283
|
-
Skills are discovered across three scopes:
|
|
284
|
-
- `builtin` — Provided by the Weave plugin.
|
|
285
|
-
- `user` — Located in the user's global configuration directory.
|
|
286
|
-
- `project` — Located in the current project's `.opencode/skills/` directory.
|
|
287
|
-
|
|
288
|
-
### Background Agents
|
|
289
|
-
|
|
290
|
-
Weave supports parallel asynchronous sub-agent management via the BackgroundManager. This allows Loom to spawn multiple agents simultaneously to handle independent tasks, with configurable concurrency limits to manage API rate limits.
|
|
291
|
-
|
|
292
|
-
### Tool Permissions
|
|
293
|
-
|
|
294
|
-
Tool access is controlled per-agent to ensure safety and specialized focus. For example, **Thread** and **Spindle** are strictly read-only; they are denied access to write, edit, and task management tools. These permissions can be customized globally or per-agent in the configuration.
|
|
295
|
-
|
|
296
103
|
## Development
|
|
297
104
|
|
|
298
105
|
- **Build**: `bun run build`
|
|
package/dist/agents/tapestry/prompt-composer.d.ts
CHANGED
|
@@ -9,13 +9,15 @@ export interface TapestryPromptOptions {
|
|
|
9
9
|
/** Set of disabled agent names (lowercase config keys) */
|
|
10
10
|
disabledAgents?: Set<string>;
|
|
11
11
|
}
|
|
12
|
-
export declare function buildTapestryRoleSection(): string;
|
|
12
|
+
export declare function buildTapestryRoleSection(disabled?: Set<string>): string;
|
|
13
13
|
export declare function buildTapestryDisciplineSection(): string;
|
|
14
14
|
export declare function buildTapestrySidebarTodosSection(): string;
|
|
15
15
|
export declare function buildTapestryPlanExecutionSection(disabled?: Set<string>): string;
|
|
16
16
|
export declare function buildTapestryVerificationSection(): string;
|
|
17
|
+
export declare function buildTapestryVerificationGateSection(): string;
|
|
17
18
|
export declare function buildTapestryPostExecutionReviewSection(disabled: Set<string>): string;
|
|
18
19
|
export declare function buildTapestryExecutionSection(): string;
|
|
20
|
+
export declare function buildTapestryDebuggingSection(): string;
|
|
19
21
|
export declare function buildTapestryStyleSection(): string;
|
|
20
22
|
/**
|
|
21
23
|
* Compose the full Tapestry system prompt from sections.
|
package/dist/config/schema.d.ts
CHANGED
|
@@ -161,6 +161,7 @@ export declare const AnalyticsConfigSchema: z.ZodObject<{
|
|
|
161
161
|
}, z.core.$strip>;
|
|
162
162
|
export declare const WorkflowConfigSchema: z.ZodObject<{
|
|
163
163
|
disabled_workflows: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
164
|
+
directories: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
164
165
|
}, z.core.$strip>;
|
|
165
166
|
export declare const WeaveConfigSchema: z.ZodObject<{
|
|
166
167
|
$schema: z.ZodOptional<z.ZodString>;
|
|
@@ -233,6 +234,7 @@ export declare const WeaveConfigSchema: z.ZodObject<{
|
|
|
233
234
|
disabled_tools: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
234
235
|
disabled_agents: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
235
236
|
disabled_skills: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
237
|
+
skill_directories: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
236
238
|
background: z.ZodOptional<z.ZodObject<{
|
|
237
239
|
defaultConcurrency: z.ZodOptional<z.ZodNumber>;
|
|
238
240
|
providerConcurrency: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodNumber>>;
|
|
@@ -261,6 +263,7 @@ export declare const WeaveConfigSchema: z.ZodObject<{
|
|
|
261
263
|
}, z.core.$strip>>;
|
|
262
264
|
workflows: z.ZodOptional<z.ZodObject<{
|
|
263
265
|
disabled_workflows: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
266
|
+
directories: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
264
267
|
}, z.core.$strip>>;
|
|
265
268
|
}, z.core.$strip>;
|
|
266
269
|
export type AgentOverrideConfig = z.infer<typeof AgentOverrideConfigSchema>;
|
|
package/dist/features/analytics/generate-metrics-report.d.ts
CHANGED
|
@@ -1,17 +1,17 @@
|
|
|
1
1
|
import type { WorkState } from "../work-state/types";
|
|
2
2
|
import type { MetricsReport } from "./types";
|
|
3
3
|
/**
|
|
4
|
-
* Generate a
|
|
4
|
+
* Generate a metrics report for a completed plan.
|
|
5
5
|
*
|
|
6
6
|
* Orchestrates:
|
|
7
7
|
* 1. Extract planned files from the plan markdown
|
|
8
8
|
* 2. Get actual changed files via git diff (startSha..HEAD)
|
|
9
9
|
* 3. Calculate adherence (coverage, precision)
|
|
10
|
-
* 4. Aggregate token usage across all sessions
|
|
10
|
+
* 4. Aggregate token usage (with per-session and model detail) across all sessions
|
|
11
11
|
* 5. Compute total duration from session summaries
|
|
12
|
-
* 6.
|
|
12
|
+
* 6. Calculate quality score (composite of adherence, task completion, efficiency)
|
|
13
|
+
* 7. Write the report to metrics-reports.jsonl
|
|
13
14
|
*
|
|
14
|
-
* In Phase 1, `quality` and `gaps` are undefined.
|
|
15
15
|
* Returns the report if successful, null on error.
|
|
16
16
|
*/
|
|
17
17
|
export declare function generateMetricsReport(directory: string, state: WorkState): MetricsReport | null;
|
|
package/dist/features/analytics/index.d.ts
CHANGED
|
@@ -1,16 +1,17 @@
|
|
|
1
|
-
export type { ToolUsageEntry, DelegationEntry, SessionSummary, TokenUsage, MetricsTokenUsage, AdherenceReport, MetricsReport, DetectedStack, ProjectFingerprint,
|
|
1
|
+
export type { ToolUsageEntry, DelegationEntry, SessionSummary, TokenUsage, MetricsTokenUsage, AdherenceReport, MetricsReport, QualityReport, SessionTokenBreakdown, DetectedStack, ProjectFingerprint, InFlightToolCall, TrackedSession, } from "./types";
|
|
2
2
|
export { ANALYTICS_DIR, SESSION_SUMMARIES_FILE, FINGERPRINT_FILE, METRICS_REPORTS_FILE, MAX_METRICS_ENTRIES, zeroTokenUsage, } from "./types";
|
|
3
3
|
export { ensureAnalyticsDir, appendSessionSummary, readSessionSummaries, writeFingerprint, readFingerprint, writeMetricsReport, readMetricsReports, } from "./storage";
|
|
4
4
|
export { detectStack, detectPackageManager, detectMonorepo, detectPrimaryLanguage, generateFingerprint, fingerprintProject, getOrCreateFingerprint, } from "./fingerprint";
|
|
5
5
|
export { SessionTracker, createSessionTracker } from "./session-tracker";
|
|
6
|
-
export { generateSuggestions, getSuggestionsForProject } from "./suggestions";
|
|
7
6
|
export { generateTokenReport, getTokenReport } from "./token-report";
|
|
8
7
|
export { formatMetricsMarkdown } from "./format-metrics";
|
|
9
8
|
export { generateMetricsReport } from "./generate-metrics-report";
|
|
10
9
|
export { extractPlannedFiles } from "./plan-parser";
|
|
11
10
|
export { getChangedFiles } from "./git-diff";
|
|
12
11
|
export { calculateAdherence } from "./adherence";
|
|
13
|
-
export { aggregateTokensForPlan } from "./plan-token-aggregator";
|
|
12
|
+
export { aggregateTokensForPlan, aggregateTokensDetailed } from "./plan-token-aggregator";
|
|
13
|
+
export type { DetailedTokenAggregation } from "./plan-token-aggregator";
|
|
14
|
+
export { calculateQualityScore, BASELINE_TOKENS_PER_TASK } from "./quality-score";
|
|
14
15
|
import type { SessionTracker } from "./session-tracker";
|
|
15
16
|
import type { ProjectFingerprint } from "./types";
|
|
16
17
|
/** Return value of createAnalytics — bundles tracker + fingerprint */
|
|
package/dist/features/analytics/plan-token-aggregator.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import type { MetricsTokenUsage } from "./types";
|
|
1
|
+
import type { MetricsTokenUsage, SessionTokenBreakdown } from "./types";
|
|
2
2
|
/**
|
|
3
3
|
* Aggregate token usage for a plan by summing across matching session summaries.
|
|
4
4
|
*
|
|
@@ -9,3 +9,26 @@ import type { MetricsTokenUsage } from "./types";
|
|
|
9
9
|
* Maps from session TokenUsage (inputTokens/outputTokens) to MetricsTokenUsage (input/output).
|
|
10
10
|
*/
|
|
11
11
|
export declare function aggregateTokensForPlan(directory: string, sessionIds: string[]): MetricsTokenUsage;
|
|
12
|
+
/** Result of detailed token aggregation across sessions for a plan */
|
|
13
|
+
export interface DetailedTokenAggregation {
|
|
14
|
+
/** Total token usage across all sessions */
|
|
15
|
+
total: MetricsTokenUsage;
|
|
16
|
+
/** Total dollar cost across all sessions */
|
|
17
|
+
totalCost: number;
|
|
18
|
+
/** Per-session breakdowns */
|
|
19
|
+
sessions: SessionTokenBreakdown[];
|
|
20
|
+
/** Per-model aggregation (grouped by model ID, "(unknown)" for sessions without model) */
|
|
21
|
+
modelBreakdown: Array<{
|
|
22
|
+
model: string;
|
|
23
|
+
tokens: MetricsTokenUsage;
|
|
24
|
+
cost: number;
|
|
25
|
+
sessionCount: number;
|
|
26
|
+
}>;
|
|
27
|
+
}
|
|
28
|
+
/**
|
|
29
|
+
* Aggregate token usage for a plan with per-session and per-model detail.
|
|
30
|
+
*
|
|
31
|
+
* The existing `aggregateTokensForPlan()` is unchanged for backward compatibility.
|
|
32
|
+
* This function adds per-session breakdowns and model attribution.
|
|
33
|
+
*/
|
|
34
|
+
export declare function aggregateTokensDetailed(directory: string, sessionIds: string[]): DetailedTokenAggregation;
|
|
package/dist/features/analytics/quality-score.d.ts
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import type { AdherenceReport, QualityReport } from "./types";
|
|
2
|
+
/**
|
|
3
|
+
* Baseline tokens-per-task for efficiency scoring.
|
|
4
|
+
* A plan consuming this many tokens per task gets an efficiency score of 0.5.
|
|
5
|
+
* Plans below baseline score above 0.5; plans above baseline score below 0.5.
|
|
6
|
+
* Exported for future configurability and test reference.
|
|
7
|
+
*/
|
|
8
|
+
export declare const BASELINE_TOKENS_PER_TASK = 50000;
|
|
9
|
+
/**
|
|
10
|
+
* Calculate a composite quality score for a completed plan.
|
|
11
|
+
*
|
|
12
|
+
* Inputs:
|
|
13
|
+
* - adherence: coverage and precision from the adherence report
|
|
14
|
+
* - totalTasks / completedTasks: from getPlanProgress()
|
|
15
|
+
* - totalTokens: sum of input + output + reasoning tokens across all sessions
|
|
16
|
+
*
|
|
17
|
+
* Component weights:
|
|
18
|
+
* - adherenceCoverage (30%): fraction of planned files actually changed
|
|
19
|
+
* - adherencePrecision (25%): fraction of actual changes that were planned
|
|
20
|
+
* - taskCompletion (30%): fraction of tasks marked [x]
|
|
21
|
+
* - efficiency (15%): inverse of normalized tokens-per-task (sigmoid-like)
|
|
22
|
+
*
|
|
23
|
+
* Pure function — no I/O.
|
|
24
|
+
*/
|
|
25
|
+
export declare function calculateQualityScore(params: {
|
|
26
|
+
adherence: AdherenceReport;
|
|
27
|
+
totalTasks: number;
|
|
28
|
+
completedTasks: number;
|
|
29
|
+
totalTokens: number;
|
|
30
|
+
}): QualityReport;
|
|
package/dist/features/analytics/session-tracker.d.ts
CHANGED
|
@@ -28,6 +28,11 @@ export declare class SessionTracker {
|
|
|
28
28
|
* Set the agent name for a session. Only sets on first call (captures primary agent).
|
|
29
29
|
*/
|
|
30
30
|
setAgentName(sessionId: string, agentName: string): void;
|
|
31
|
+
/**
|
|
32
|
+
* Set the model ID for a session. Only sets on first call (captures primary model).
|
|
33
|
+
* Safe to call for untracked sessions (no-op, no throw).
|
|
34
|
+
*/
|
|
35
|
+
trackModel(sessionId: string, modelId: string): void;
|
|
31
36
|
/**
|
|
32
37
|
* Accumulate dollar cost from a message into the session total.
|
|
33
38
|
*/
|
|
package/dist/features/analytics/types.d.ts
CHANGED
|
@@ -59,6 +59,8 @@ export interface SessionSummary {
|
|
|
59
59
|
totalDelegations: number;
|
|
60
60
|
/** Display name of the agent that ran this session (e.g., "Loom (Main Orchestrator)") */
|
|
61
61
|
agentName?: string;
|
|
62
|
+
/** Model ID used in this session (e.g., "claude-sonnet-4-20250514") */
|
|
63
|
+
model?: string;
|
|
62
64
|
/** Total dollar cost accumulated across all messages */
|
|
63
65
|
totalCost?: number;
|
|
64
66
|
/** Aggregated token usage across all messages (absent for old entries or sessions with no messages) */
|
|
@@ -92,16 +94,45 @@ export interface ProjectFingerprint {
|
|
|
92
94
|
/** Weave version that generated this fingerprint (e.g., "0.6.3") */
|
|
93
95
|
weaveVersion?: string;
|
|
94
96
|
}
|
|
95
|
-
/**
|
|
96
|
-
export interface
|
|
97
|
-
/**
|
|
98
|
-
|
|
99
|
-
/**
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
97
|
+
/** Composite quality score for a completed plan */
|
|
98
|
+
export interface QualityReport {
|
|
99
|
+
/** Composite quality score (0-1) — weighted average of components */
|
|
100
|
+
composite: number;
|
|
101
|
+
/** Component scores (each 0-1) */
|
|
102
|
+
components: {
|
|
103
|
+
/** Fraction of planned files that were actually changed */
|
|
104
|
+
adherenceCoverage: number;
|
|
105
|
+
/** Fraction of actual changes that were planned */
|
|
106
|
+
adherencePrecision: number;
|
|
107
|
+
/** Fraction of plan tasks marked as complete ([x]) */
|
|
108
|
+
taskCompletion: number;
|
|
109
|
+
/** Efficiency score — inverse of normalized tokens-per-task */
|
|
110
|
+
efficiency: number;
|
|
111
|
+
};
|
|
112
|
+
/** Raw data used to compute efficiency (for transparency) */
|
|
113
|
+
efficiencyData: {
|
|
114
|
+
/** Total tokens consumed */
|
|
115
|
+
totalTokens: number;
|
|
116
|
+
/** Number of tasks in the plan */
|
|
117
|
+
totalTasks: number;
|
|
118
|
+
/** Tokens per task */
|
|
119
|
+
tokensPerTask: number;
|
|
120
|
+
};
|
|
121
|
+
}
|
|
122
|
+
/** Per-session token breakdown within a plan's metrics report */
|
|
123
|
+
export interface SessionTokenBreakdown {
|
|
124
|
+
/** Session ID */
|
|
125
|
+
sessionId: string;
|
|
126
|
+
/** Model ID used in this session */
|
|
127
|
+
model?: string;
|
|
128
|
+
/** Display name of the agent */
|
|
129
|
+
agentName?: string;
|
|
130
|
+
/** Token usage for this session */
|
|
131
|
+
tokens: MetricsTokenUsage;
|
|
132
|
+
/** Dollar cost for this session */
|
|
133
|
+
cost?: number;
|
|
134
|
+
/** Duration in milliseconds */
|
|
135
|
+
durationMs: number;
|
|
105
136
|
}
|
|
106
137
|
/** File name for metrics reports (JSONL format) */
|
|
107
138
|
export declare const METRICS_REPORTS_FILE = "metrics-reports.jsonl";
|
|
@@ -147,10 +178,8 @@ export interface MetricsReport {
|
|
|
147
178
|
generatedAt: string;
|
|
148
179
|
/** Adherence metrics */
|
|
149
180
|
adherence: AdherenceReport;
|
|
150
|
-
/**
|
|
151
|
-
quality?:
|
|
152
|
-
/** Quality gaps (Phase 2 — undefined in Phase 1) */
|
|
153
|
-
gaps?: unknown;
|
|
181
|
+
/** Composite quality score for the plan */
|
|
182
|
+
quality?: QualityReport;
|
|
154
183
|
/** Token usage across all sessions */
|
|
155
184
|
tokenUsage: MetricsTokenUsage;
|
|
156
185
|
/** Total duration of all sessions in milliseconds */
|
|
@@ -163,6 +192,12 @@ export interface MetricsReport {
|
|
|
163
192
|
endSha?: string;
|
|
164
193
|
/** Session IDs that contributed to this report */
|
|
165
194
|
sessionIds: string[];
|
|
195
|
+
/** Deduplicated list of model IDs used across all sessions */
|
|
196
|
+
modelsUsed?: string[];
|
|
197
|
+
/** Total dollar cost across all sessions */
|
|
198
|
+
totalCost?: number;
|
|
199
|
+
/** Per-session token breakdown */
|
|
200
|
+
sessionBreakdown?: SessionTokenBreakdown[];
|
|
166
201
|
}
|
|
167
202
|
/** Tracks in-flight tool calls for duration measurement */
|
|
168
203
|
export interface InFlightToolCall {
|
|
@@ -187,6 +222,8 @@ export interface TrackedSession {
|
|
|
187
222
|
inFlight: Record<string, InFlightToolCall>;
|
|
188
223
|
/** Display name of the agent running this session */
|
|
189
224
|
agentName?: string;
|
|
225
|
+
/** Model ID used in this session (e.g., "claude-sonnet-4-20250514") */
|
|
226
|
+
model?: string;
|
|
190
227
|
/** Accumulated dollar cost across all messages */
|
|
191
228
|
totalCost: number;
|
|
192
229
|
/** Cumulative token usage across all messages */
|
|
package/dist/features/evals/executors/github-models-api.d.ts
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* GitHub Models API caller for live eval execution.
|
|
3
|
+
*
|
|
4
|
+
* Provides a fetch-based approach for calling GitHub Models API
|
|
5
|
+
* in Phase 2 live eval harness. Uses only built-in fetch() — no new dependencies.
|
|
6
|
+
*/
|
|
7
|
+
export declare const GITHUB_MODELS_API_URL = "https://models.inference.ai.azure.com/chat/completions";
|
|
8
|
+
export declare const DELAY_BETWEEN_CALLS_MS = 1000;
|
|
9
|
+
export interface GitHubModelsResponse {
|
|
10
|
+
content: string;
|
|
11
|
+
durationMs: number;
|
|
12
|
+
}
|
|
13
|
+
export declare function callGitHubModels(systemPrompt: string, userMessage: string, model: string, token: string): Promise<GitHubModelsResponse>;
|
|
package/dist/features/evals/executors/model-response.d.ts
CHANGED
|
@@ -1,2 +1,7 @@
|
|
|
1
1
|
import type { EvalArtifacts, ExecutionContext, ModelResponseExecutor, ResolvedTarget } from "../types";
|
|
2
|
-
|
|
2
|
+
/**
|
|
3
|
+
* Executes a model-response eval case by calling the GitHub Models API.
|
|
4
|
+
*
|
|
5
|
+
* Phase 2 is live-only — requires GITHUB_TOKEN env var.
|
|
6
|
+
*/
|
|
7
|
+
export declare function executeModelResponse(resolvedTarget: ResolvedTarget, executor: ModelResponseExecutor, context: ExecutionContext): Promise<EvalArtifacts>;
|
|
package/dist/features/evals/executors/prompt-renderer.d.ts
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
import type { EvalArtifacts, ExecutionContext, ExecutorSpec, ResolvedTarget } from "../types";
|
|
2
|
-
export declare function executePromptRender(resolvedTarget: ResolvedTarget, executor: ExecutorSpec, _context: ExecutionContext): EvalArtifacts;
|
|
2
|
+
export declare function executePromptRender(resolvedTarget: ResolvedTarget, executor: ExecutorSpec, _context: ExecutionContext): Promise<EvalArtifacts>;
|
|
package/dist/features/evals/executors/trajectory-run.d.ts
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
1
|
+
import type { EvalArtifacts, ExecutionContext, ResolvedTarget, TrajectoryRunExecutor } from "../types";
|
|
2
|
+
export declare function detectDelegation(response: string): string | null;
|
|
3
|
+
export declare function executeTrajectoryRun(resolvedTarget: ResolvedTarget, executor: TrajectoryRunExecutor, context: ExecutionContext): Promise<EvalArtifacts>;
|
|
package/dist/features/evals/index.d.ts
CHANGED
|
@@ -9,16 +9,19 @@
|
|
|
9
9
|
*
|
|
10
10
|
* Promptfoo, if adopted later, should plug in behind executor/judge adapters.
|
|
11
11
|
*/
|
|
12
|
-
export type { EvalPhase, EvalTarget, ExecutorSpec, EvaluatorSpec, EvalSuiteManifest, EvalCase, LoadedEvalCase, LoadedEvalSuiteManifest, EvalArtifacts, AssertionResult, EvalCaseResult, EvalRunResult, EvalRunSummary, RunEvalSuiteOptions, RunnerFilters, } from "./types";
|
|
13
|
-
export {
|
|
14
|
-
export {
|
|
12
|
+
export type { EvalPhase, EvalTarget, ExecutorSpec, EvaluatorSpec, EvalSuiteManifest, EvalCase, LoadedEvalCase, LoadedEvalSuiteManifest, EvalArtifacts, AssertionResult, EvalCaseResult, EvalRunResult, EvalRunSummary, RunEvalSuiteOptions, RunnerFilters, TrajectoryScenario, TrajectoryTurn, TrajectoryTrace, TrajectoryTurnResult, TrajectoryAssertionEvaluator, } from "./types";
|
|
13
|
+
export { isTrajectoryTrace } from "./types";
|
|
14
|
+
export { EvalCaseSchema, EvalSuiteManifestSchema, EvalRunResultSchema, TrajectoryScenarioSchema, TrajectoryTurnSchema, TrajectoryAssertionEvaluatorSchema, } from "./schema";
|
|
15
|
+
export { EvalConfigError, loadEvalSuiteManifest, loadEvalCasesForSuite, resolveSuitePath, loadTrajectoryScenario, } from "./loader";
|
|
15
16
|
export { resolveBuiltinAgentTarget } from "./targets/builtin-agent-target";
|
|
16
17
|
export { executePromptRender } from "./executors/prompt-renderer";
|
|
17
18
|
export { executeModelResponse } from "./executors/model-response";
|
|
19
|
+
export { executeTrajectoryRun, detectDelegation } from "./executors/trajectory-run";
|
|
18
20
|
export { runDeterministicEvaluator } from "./evaluators/deterministic";
|
|
19
21
|
export { runLlmJudgeEvaluator } from "./evaluators/llm-judge";
|
|
22
|
+
export { runTrajectoryAssertionEvaluator } from "./evaluators/trajectory-assertion";
|
|
20
23
|
export { deriveDeterministicBaseline, readDeterministicBaseline, compareDeterministicBaseline } from "./baseline";
|
|
21
|
-
export { ensureEvalStorageDir, getDefaultEvalRunPath, writeEvalRunResult } from "./storage";
|
|
22
|
-
export { formatEvalSummary } from "./reporter";
|
|
24
|
+
export { ensureEvalStorageDir, getDefaultEvalRunPath, writeEvalRunResult, getDefaultJsonlPath, appendEvalRunJsonl } from "./storage";
|
|
25
|
+
export { formatEvalSummary, formatJobSummaryMarkdown } from "./reporter";
|
|
23
26
|
export type { RunEvalSuiteOutput } from "./runner";
|
|
24
27
|
export { runEvalSuite } from "./runner";
|
|
package/dist/features/evals/loader.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import type { LoadedEvalCase, LoadedEvalSuiteManifest } from "./types";
|
|
1
|
+
import type { LoadedEvalCase, LoadedEvalSuiteManifest, TrajectoryScenario } from "./types";
|
|
2
2
|
export declare class EvalConfigError extends Error {
|
|
3
3
|
constructor(message: string);
|
|
4
4
|
}
|
|
@@ -6,3 +6,4 @@ export declare function resolveSuitePath(directory: string, suite: string): stri
|
|
|
6
6
|
export declare function loadEvalSuiteManifest(directory: string, suite: string): LoadedEvalSuiteManifest;
|
|
7
7
|
export declare function loadEvalCaseFile(directory: string, filePath: string): LoadedEvalCase;
|
|
8
8
|
export declare function loadEvalCasesForSuite(directory: string, suite: LoadedEvalSuiteManifest): LoadedEvalCase[];
|
|
9
|
+
export declare function loadTrajectoryScenario(directory: string, scenarioRef: string): TrajectoryScenario;
|
|
package/dist/features/evals/runner.d.ts
CHANGED
|
@@ -4,4 +4,4 @@ export interface RunEvalSuiteOutput {
|
|
|
4
4
|
artifactPath: string;
|
|
5
5
|
consoleSummary: string;
|
|
6
6
|
}
|
|
7
|
-
export declare function runEvalSuite(options: RunEvalSuiteOptions): RunEvalSuiteOutput;
|
|
7
|
+
export declare function runEvalSuite(options: RunEvalSuiteOptions): Promise<RunEvalSuiteOutput>;
|