@kweaver-ai/kweaver-sdk 0.8.1 → 0.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +19 -5
- package/README.zh.md +19 -5
- package/dist/agent-providers/index.d.ts +7 -0
- package/dist/agent-providers/index.js +5 -0
- package/dist/agent-providers/prompt-template.d.ts +62 -0
- package/dist/agent-providers/prompt-template.js +105 -0
- package/dist/agent-providers/prompts/rubric-judge-v1.prompt.md +51 -0
- package/dist/agent-providers/prompts/within-trace-synthesizer-v1.prompt.md +60 -0
- package/dist/agent-providers/providers/claude-code-subprocess.d.ts +74 -0
- package/dist/agent-providers/providers/claude-code-subprocess.js +259 -0
- package/dist/agent-providers/providers/stub.d.ts +47 -0
- package/dist/agent-providers/providers/stub.js +77 -0
- package/dist/agent-providers/registry.d.ts +45 -0
- package/dist/agent-providers/registry.js +77 -0
- package/dist/agent-providers/types.d.ts +91 -0
- package/dist/agent-providers/types.js +25 -0
- package/dist/api/agent-chat.js +8 -6
- package/dist/api/context-loader.d.ts +1 -0
- package/dist/api/semantic-search.d.ts +5 -0
- package/dist/api/semantic-search.js +5 -0
- package/dist/api/skills.d.ts +75 -2
- package/dist/api/skills.js +108 -12
- package/dist/api/trace.d.ts +5 -0
- package/dist/api/trace.js +4 -0
- package/dist/cli.js +7 -5
- package/dist/commands/agent/mode.d.ts +6 -0
- package/dist/commands/agent/mode.js +75 -0
- package/dist/commands/agent.js +101 -29
- package/dist/commands/context-loader.js +608 -38
- package/dist/commands/skill.d.ts +21 -1
- package/dist/commands/skill.js +389 -1
- package/dist/commands/trace.d.ts +26 -1
- package/dist/commands/trace.js +515 -15
- package/dist/index.d.ts +2 -2
- package/dist/index.js +1 -1
- package/dist/resources/bkn.d.ts +5 -0
- package/dist/resources/bkn.js +5 -0
- package/dist/resources/skills.d.ts +17 -1
- package/dist/resources/skills.js +32 -1
- package/dist/trace-ai/diagnose/agent-binding.d.ts +67 -0
- package/dist/trace-ai/diagnose/agent-binding.js +257 -0
- package/dist/trace-ai/diagnose/builtin-rules/tool-retry-intent-mismatch.yaml +68 -0
- package/dist/trace-ai/diagnose/index.d.ts +32 -0
- package/dist/trace-ai/diagnose/index.js +246 -0
- package/dist/trace-ai/diagnose/output-schema-converter.d.ts +24 -0
- package/dist/trace-ai/diagnose/output-schema-converter.js +81 -0
- package/dist/trace-ai/diagnose/query-extractor.d.ts +14 -0
- package/dist/trace-ai/diagnose/query-extractor.js +45 -0
- package/dist/trace-ai/diagnose/report-assembler.d.ts +31 -0
- package/dist/{trace-core → trace-ai}/diagnose/report-assembler.js +19 -9
- package/dist/trace-ai/diagnose/report-markdown.d.ts +18 -0
- package/dist/trace-ai/diagnose/report-markdown.js +192 -0
- package/dist/{trace-core → trace-ai}/diagnose/rule-loader.js +42 -8
- package/dist/{trace-core → trace-ai}/diagnose/schemas.d.ts +77 -2
- package/dist/trace-ai/diagnose/schemas.js +154 -0
- package/dist/trace-ai/diagnose/signal-probe.d.ts +17 -0
- package/dist/trace-ai/diagnose/signal-probe.js +39 -0
- package/dist/trace-ai/diagnose/synthesizer-agent.d.ts +40 -0
- package/dist/trace-ai/diagnose/synthesizer-agent.js +158 -0
- package/dist/{trace-core → trace-ai}/diagnose/trace-shaper.js +1 -0
- package/dist/{trace-core → trace-ai}/diagnose/types.d.ts +55 -6
- package/dist/trace-ai/eval-set/assertion-evaluator.d.ts +29 -0
- package/dist/trace-ai/eval-set/assertion-evaluator.js +100 -0
- package/dist/trace-ai/eval-set/builder.d.ts +36 -0
- package/dist/trace-ai/eval-set/builder.js +126 -0
- package/dist/trace-ai/eval-set/index.d.ts +15 -0
- package/dist/trace-ai/eval-set/index.js +10 -0
- package/dist/trace-ai/eval-set/output-writer.d.ts +27 -0
- package/dist/trace-ai/eval-set/output-writer.js +126 -0
- package/dist/trace-ai/eval-set/query-picker.d.ts +37 -0
- package/dist/trace-ai/eval-set/query-picker.js +147 -0
- package/dist/trace-ai/eval-set/redactor.d.ts +42 -0
- package/dist/trace-ai/eval-set/redactor.js +133 -0
- package/dist/trace-ai/eval-set/rubric-templates/answer-match-reference.prompt.md +19 -0
- package/dist/trace-ai/eval-set/schemas.d.ts +136 -0
- package/dist/trace-ai/eval-set/schemas.js +130 -0
- package/dist/trace-ai/eval-set/semantic-match-provider.d.ts +33 -0
- package/dist/trace-ai/eval-set/semantic-match-provider.js +51 -0
- package/dist/trace-ai/eval-set/test-runner.d.ts +34 -0
- package/dist/trace-ai/eval-set/test-runner.js +153 -0
- package/dist/trace-ai/eval-set/types.d.ts +46 -0
- package/dist/trace-ai/eval-set/types.js +8 -0
- package/dist/trace-ai/exp/bundle-writer.d.ts +10 -0
- package/dist/trace-ai/exp/bundle-writer.js +54 -0
- package/dist/trace-ai/exp/claude-binary.d.ts +5 -0
- package/dist/trace-ai/exp/claude-binary.js +30 -0
- package/dist/trace-ai/exp/coordinator.d.ts +45 -0
- package/dist/trace-ai/exp/coordinator.js +203 -0
- package/dist/trace-ai/exp/eval-runner.d.ts +14 -0
- package/dist/trace-ai/exp/eval-runner.js +47 -0
- package/dist/trace-ai/exp/exp-store/abort-signal.d.ts +3 -0
- package/dist/trace-ai/exp/exp-store/abort-signal.js +27 -0
- package/dist/trace-ai/exp/exp-store/candidate-lineage-yaml.d.ts +4 -0
- package/dist/trace-ai/exp/exp-store/candidate-lineage-yaml.js +37 -0
- package/dist/trace-ai/exp/exp-store/events-jsonl.d.ts +17 -0
- package/dist/trace-ai/exp/exp-store/events-jsonl.js +60 -0
- package/dist/trace-ai/exp/exp-store/exp-registry.d.ts +6 -0
- package/dist/trace-ai/exp/exp-store/exp-registry.js +41 -0
- package/dist/trace-ai/exp/exp-store/index.d.ts +46 -0
- package/dist/trace-ai/exp/exp-store/index.js +59 -0
- package/dist/trace-ai/exp/exp-store/lock.d.ts +3 -0
- package/dist/trace-ai/exp/exp-store/lock.js +73 -0
- package/dist/trace-ai/exp/exp-store/mission-md.d.ts +3 -0
- package/dist/trace-ai/exp/exp-store/mission-md.js +37 -0
- package/dist/trace-ai/exp/exp-store/readme-template.d.ts +5 -0
- package/dist/trace-ai/exp/exp-store/readme-template.js +25 -0
- package/dist/trace-ai/exp/exp-store/round-yaml.d.ts +3 -0
- package/dist/trace-ai/exp/exp-store/round-yaml.js +33 -0
- package/dist/trace-ai/exp/index.d.ts +8 -0
- package/dist/trace-ai/exp/index.js +238 -0
- package/dist/trace-ai/exp/info.d.ts +35 -0
- package/dist/trace-ai/exp/info.js +120 -0
- package/dist/trace-ai/exp/patch/agent-config.d.ts +1 -0
- package/dist/trace-ai/exp/patch/agent-config.js +26 -0
- package/dist/trace-ai/exp/patch/index.d.ts +2 -0
- package/dist/trace-ai/exp/patch/index.js +13 -0
- package/dist/trace-ai/exp/patch/skill.d.ts +1 -0
- package/dist/trace-ai/exp/patch/skill.js +24 -0
- package/dist/trace-ai/exp/providers/synthesizer-client.d.ts +14 -0
- package/dist/trace-ai/exp/providers/synthesizer-client.js +39 -0
- package/dist/trace-ai/exp/providers/triage-client.d.ts +19 -0
- package/dist/trace-ai/exp/providers/triage-client.js +51 -0
- package/dist/trace-ai/exp/schemas.d.ts +147 -0
- package/dist/trace-ai/exp/schemas.js +50 -0
- package/dist/trace-ai/exp/scoring.d.ts +2 -0
- package/dist/trace-ai/exp/scoring.js +46 -0
- package/dist/trace-ai/scan/aggregator.d.ts +20 -0
- package/dist/trace-ai/scan/aggregator.js +26 -0
- package/dist/trace-ai/scan/artifacts/paths.d.ts +12 -0
- package/dist/trace-ai/scan/artifacts/paths.js +18 -0
- package/dist/trace-ai/scan/artifacts/writer.d.ts +67 -0
- package/dist/trace-ai/scan/artifacts/writer.js +96 -0
- package/dist/trace-ai/scan/batched-rubric.d.ts +55 -0
- package/dist/trace-ai/scan/batched-rubric.js +159 -0
- package/dist/trace-ai/scan/cross-trace-synthesizer.d.ts +24 -0
- package/dist/trace-ai/scan/cross-trace-synthesizer.js +93 -0
- package/dist/trace-ai/scan/index.d.ts +31 -0
- package/dist/trace-ai/scan/index.js +390 -0
- package/dist/trace-ai/scan/prompts/builtin/cross-trace-synthesizer-v1.prompt.md +44 -0
- package/dist/trace-ai/scan/prompts/builtin/rubric-judge-batch-v1.prompt.md +44 -0
- package/dist/trace-ai/scan/runner.d.ts +25 -0
- package/dist/trace-ai/scan/runner.js +42 -0
- package/dist/trace-ai/scan/sampler.d.ts +18 -0
- package/dist/trace-ai/scan/sampler.js +81 -0
- package/dist/trace-ai/scan/scan-summary-markdown.d.ts +2 -0
- package/dist/trace-ai/scan/scan-summary-markdown.js +71 -0
- package/dist/trace-ai/scan/scan-summary-schema.d.ts +73 -0
- package/dist/trace-ai/scan/scan-summary-schema.js +61 -0
- package/dist/trace-ai/scan/single-agent-validator.d.ts +23 -0
- package/dist/trace-ai/scan/single-agent-validator.js +42 -0
- package/dist/trace-ai/scan/traces-list-parser.d.ts +15 -0
- package/dist/trace-ai/scan/traces-list-parser.js +46 -0
- package/package.json +2 -2
- package/dist/trace-core/diagnose/index.d.ts +0 -9
- package/dist/trace-core/diagnose/index.js +0 -104
- package/dist/trace-core/diagnose/report-assembler.d.ts +0 -12
- package/dist/trace-core/diagnose/schemas.js +0 -94
- package/dist/trace-core/diagnose/signal-probe.d.ts +0 -5
- package/dist/trace-core/diagnose/signal-probe.js +0 -21
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/excessive-tool-calls-per-turn.d.ts +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/excessive-tool-calls-per-turn.js +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/excessive-tool-calls-per-turn.yaml +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/llm-response-truncated-no-continue.d.ts +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/llm-response-truncated-no-continue.js +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/llm-response-truncated-no-continue.yaml +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/register.d.ts +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/register.js +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/retrieval-empty-no-fallback.d.ts +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/retrieval-empty-no-fallback.js +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/retrieval-empty-no-fallback.yaml +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/tool-error-swallowed.d.ts +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/tool-error-swallowed.js +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/tool-error-swallowed.yaml +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/tool-loop-no-state-change.d.ts +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/tool-loop-no-state-change.js +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/tool-loop-no-state-change.yaml +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/predicate-registry.d.ts +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/predicate-registry.js +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/rule-loader.d.ts +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/synthesizer-template.d.ts +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/synthesizer-template.js +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/trace-shaper.d.ts +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/types.js +0 -0
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import type { ClientContext } from "../client.js";
|
|
2
|
-
import { type SkillListResult, type SkillStatus } from "../api/skills.js";
|
|
2
|
+
import { type SkillCategory, type SkillListResult, type SkillStatus } from "../api/skills.js";
|
|
3
3
|
export declare class SkillsResource {
|
|
4
4
|
private readonly ctx;
|
|
5
5
|
constructor(ctx: ClientContext);
|
|
@@ -24,6 +24,7 @@ export declare class SkillsResource {
|
|
|
24
24
|
source?: string;
|
|
25
25
|
}): Promise<SkillListResult>;
|
|
26
26
|
get(skillId: string): Promise<import("../api/skills.js").SkillInfo>;
|
|
27
|
+
getMarket(skillId: string): Promise<import("../api/skills.js").SkillInfo>;
|
|
27
28
|
registerContent(content: string, opts?: {
|
|
28
29
|
source?: string;
|
|
29
30
|
extendInfo?: Record<string, unknown>;
|
|
@@ -34,6 +35,18 @@ export declare class SkillsResource {
|
|
|
34
35
|
}): Promise<import("../api/skills.js").RegisterSkillResult>;
|
|
35
36
|
delete(skillId: string): Promise<import("../api/skills.js").DeleteSkillResult>;
|
|
36
37
|
updateStatus(skillId: string, status: SkillStatus): Promise<import("../api/skills.js").UpdateSkillStatusResult>;
|
|
38
|
+
updateMetadata(skillId: string, metadata: {
|
|
39
|
+
name: string;
|
|
40
|
+
description: string;
|
|
41
|
+
category: SkillCategory;
|
|
42
|
+
source?: string;
|
|
43
|
+
extendInfo?: Record<string, unknown>;
|
|
44
|
+
}): Promise<import("../api/skills.js").UpdateSkillMetadataResult>;
|
|
45
|
+
updatePackageContent(skillId: string, content: string): Promise<import("../api/skills.js").UpdateSkillPackageResult>;
|
|
46
|
+
updatePackageZip(skillId: string, filename: string, bytes: Uint8Array): Promise<import("../api/skills.js").UpdateSkillPackageResult>;
|
|
47
|
+
history(skillId: string): Promise<import("../api/skills.js").SkillReleaseHistoryInfo[]>;
|
|
48
|
+
republishHistory(skillId: string, version: string): Promise<import("../api/skills.js").UpdateSkillPackageResult>;
|
|
49
|
+
publishHistory(skillId: string, version: string): Promise<import("../api/skills.js").UpdateSkillPackageResult>;
|
|
37
50
|
content(skillId: string): Promise<import("../api/skills.js").SkillContentIndex>;
|
|
38
51
|
fetchContent(skillId: string): Promise<string>;
|
|
39
52
|
readFile(skillId: string, relPath: string): Promise<import("../api/skills.js").SkillFileReadResult>;
|
|
@@ -44,4 +57,7 @@ export declare class SkillsResource {
|
|
|
44
57
|
}): Promise<{
|
|
45
58
|
directory: string;
|
|
46
59
|
}>;
|
|
60
|
+
getManagementContent(skillId: string, responseMode?: "url" | "content"): Promise<import("../api/skills.js").SkillManagementContentData>;
|
|
61
|
+
readManagementFile(skillId: string, relPath: string): Promise<import("../api/skills.js").SkillFileReadResult>;
|
|
62
|
+
downloadManagementArchive(skillId: string, responseMode?: "url" | "content"): Promise<import("../api/skills.js").DownloadedSkillArchive>;
|
|
47
63
|
}
|
package/dist/resources/skills.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { deleteSkill, downloadSkill, fetchSkillContent, fetchSkillFile, getSkill, getSkillContentIndex, installSkillArchive, listSkillMarket, listSkills, readSkillFile, registerSkillContent, registerSkillZip, updateSkillStatus, } from "../api/skills.js";
|
|
1
|
+
import { deleteSkill, downloadSkill, downloadSkillManagementArchive, fetchSkillContent, fetchSkillFile, getSkill, getSkillMarketDetail, getSkillContentIndex, getSkillManagementContent, installSkillArchive, listSkillMarket, listSkillHistory, listSkills, publishSkillHistory, readSkillFile, readSkillManagementFile, republishSkillHistory, registerSkillContent, registerSkillZip, updateSkillMetadata, updateSkillPackageContent, updateSkillPackageZip, updateSkillStatus, } from "../api/skills.js";
|
|
2
2
|
export class SkillsResource {
|
|
3
3
|
ctx;
|
|
4
4
|
constructor(ctx) {
|
|
@@ -13,6 +13,9 @@ export class SkillsResource {
|
|
|
13
13
|
async get(skillId) {
|
|
14
14
|
return getSkill({ ...this.ctx.base(), skillId });
|
|
15
15
|
}
|
|
16
|
+
async getMarket(skillId) {
|
|
17
|
+
return getSkillMarketDetail({ ...this.ctx.base(), skillId });
|
|
18
|
+
}
|
|
16
19
|
async registerContent(content, opts = {}) {
|
|
17
20
|
return registerSkillContent({ ...this.ctx.base(), content, ...opts });
|
|
18
21
|
}
|
|
@@ -25,6 +28,24 @@ export class SkillsResource {
|
|
|
25
28
|
async updateStatus(skillId, status) {
|
|
26
29
|
return updateSkillStatus({ ...this.ctx.base(), skillId, status });
|
|
27
30
|
}
|
|
31
|
+
async updateMetadata(skillId, metadata) {
|
|
32
|
+
return updateSkillMetadata({ ...this.ctx.base(), skillId, ...metadata });
|
|
33
|
+
}
|
|
34
|
+
async updatePackageContent(skillId, content) {
|
|
35
|
+
return updateSkillPackageContent({ ...this.ctx.base(), skillId, content });
|
|
36
|
+
}
|
|
37
|
+
async updatePackageZip(skillId, filename, bytes) {
|
|
38
|
+
return updateSkillPackageZip({ ...this.ctx.base(), skillId, filename, bytes });
|
|
39
|
+
}
|
|
40
|
+
async history(skillId) {
|
|
41
|
+
return listSkillHistory({ ...this.ctx.base(), skillId });
|
|
42
|
+
}
|
|
43
|
+
async republishHistory(skillId, version) {
|
|
44
|
+
return republishSkillHistory({ ...this.ctx.base(), skillId, version });
|
|
45
|
+
}
|
|
46
|
+
async publishHistory(skillId, version) {
|
|
47
|
+
return publishSkillHistory({ ...this.ctx.base(), skillId, version });
|
|
48
|
+
}
|
|
28
49
|
async content(skillId) {
|
|
29
50
|
return getSkillContentIndex({ ...this.ctx.base(), skillId });
|
|
30
51
|
}
|
|
@@ -44,4 +65,14 @@ export class SkillsResource {
|
|
|
44
65
|
const archive = await this.download(skillId);
|
|
45
66
|
return installSkillArchive({ bytes: archive.bytes, directory, force: opts.force });
|
|
46
67
|
}
|
|
68
|
+
// ── Management Content ───────────────────────────────────────────────────────
|
|
69
|
+
async getManagementContent(skillId, responseMode) {
|
|
70
|
+
return getSkillManagementContent({ ...this.ctx.base(), skillId, responseMode });
|
|
71
|
+
}
|
|
72
|
+
async readManagementFile(skillId, relPath) {
|
|
73
|
+
return readSkillManagementFile({ ...this.ctx.base(), skillId, relPath });
|
|
74
|
+
}
|
|
75
|
+
async downloadManagementArchive(skillId, responseMode) {
|
|
76
|
+
return downloadSkillManagementArchive({ ...this.ctx.base(), skillId, responseMode });
|
|
77
|
+
}
|
|
47
78
|
}
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Stage-2 — rubric judgments: resolve rubric.inputs against a TraceTree,
|
|
3
|
+
* render the prompt template, invoke the agent provider, and map the
|
|
4
|
+
* schema-validated output to a `Finding`.
|
|
5
|
+
*
|
|
6
|
+
* Why this layer exists separate from `signal-probe`:
|
|
7
|
+
* - Async / I/O-bound (subprocess or HTTP) vs. sync predicates
|
|
8
|
+
* - Per-rule provider lookup + skip-on-unavailable
|
|
9
|
+
* - Error semantics differ (skip + record vs. throw RuleProbeError)
|
|
10
|
+
*
|
|
11
|
+
* Convergence invariant (enforced upstream in schemas.ts): every rubric
|
|
12
|
+
* `output_schema.required` includes `first_violating_step_id`, so each
|
|
13
|
+
* rubric finding always points at a concrete span and can be correlated
|
|
14
|
+
* with symbolic findings on the same span by the synthesizer.
|
|
15
|
+
*/
|
|
16
|
+
import type { Finding, Rule, RubricInputSpec, TraceTree } from "./types.js";
|
|
17
|
+
import type { AgentRegistry } from "../../agent-providers/registry.js";
|
|
18
|
+
import { PromptTemplateRegistry, type AgentOutputLang } from "../../agent-providers/prompt-template.js";
|
|
19
|
+
import type { ArtifactWriter } from "../scan/artifacts/writer.js";
|
|
20
|
+
/**
 * Options accepted by `evaluateRubricRules` for a single trace.
 */
export interface RubricEvaluateOpts {
    /** Rules to consider; entries without a `rubric` are ignored by the evaluator. */
    rules: Rule[];
    /** The trace the rubric inputs are resolved against. */
    tree: TraceTree;
    /** Registry used to resolve the agent provider named by each rule. */
    registry: AgentRegistry;
    /** Registry holding the prompt templates referenced by each rule. */
    promptRegistry: PromptTemplateRegistry;
    /**
     * When true, every rubric rule is skipped and recorded with
     * reason `'no-llm-flag-set'`.
     */
    noLlm?: boolean;
    /**
     * Timeout, in milliseconds, passed to each provider invocation.
     * The provider may apply its own ceiling.
     */
    timeoutMs?: number;
    /**
     * Output locale for the natural-language fields of the agent reply.
     * Defaults to `'en'`.
     */
    lang?: AgentOutputLang;
    /**
     * Optional artifact sink; when provided, the Stage-2 prompt and response
     * are written for every rule invocation.
     */
    artifacts?: ArtifactWriter;
    /** User query extracted from the trace's `input.messages`. */
    userQuery?: string | null;
    /** Conversation/query ID used to correlate the suggested eval case. */
    queryId?: string | null;
}
|
|
38
|
+
/**
 * Result of a rubric evaluation pass.
 */
export interface RubricEvaluateResult {
    /** One finding per rubric rule that was evaluated successfully. */
    findings: Finding[];
    /** Rules that were not evaluated, each with a machine-readable reason. */
    skipped: Array<{
        ruleId: string;
        reason: string;
    }>;
}
|
|
45
|
+
/**
 * Resolve a single rubric input spec against a trace and return a value that
 * can be interpolated into a prompt. Performs no I/O, which keeps it easy to
 * test.
 *
 * `spec.source` is `<scheme>:<payload>` with one of the schemes
 * `extract_from_root_attr`, `filter_by_kind` or `literal`.
 *
 * @throws {AgentBindingError} when the source has no scheme prefix, names an
 *   unknown scheme, or carries a `literal` payload that is not valid JSON.
 */
export declare function resolveRubricInput(spec: RubricInputSpec, tree: TraceTree): unknown;
|
|
48
|
+
/**
 * Error thrown by the rubric agent-binding layer, for example when a rubric
 * input source cannot be resolved. Instances report
 * `name === "AgentBindingError"`.
 */
export declare class AgentBindingError extends Error {
    /** @param message Human-readable description of what went wrong. */
    constructor(message: string);
}
|
|
51
|
+
/**
 * Substitute `{{name}}` placeholders in `tpl` with the matching entries of
 * `bindings`. String values are inserted verbatim, other values are inserted
 * as JSON, and placeholders whose binding is missing or null are left in the
 * output as `{{name}}`.
 */
export declare function renderChangeTemplate(tpl: string, bindings: Record<string, unknown>): string;
|
|
52
|
+
/**
 * Evaluate every rubric-bearing rule in `opts.rules` against `opts.tree`,
 * one rule at a time, and return the findings together with skip records.
 * Rules without a `rubric` are ignored.
 *
 * A rule is *skipped* (recorded in `skipped`, not failed) when:
 *   - `opts.noLlm` is set
 *       → reason: 'no-llm-flag-set'
 *   - its provider cannot be resolved (`not_available`), resolves to nothing,
 *     or reports `isAvailable()` as false
 *       → reason: `provider-not-available:<name>`
 *   - its prompt template is not registered
 *       → reason: `prompt-template-missing:<ref>`
 *   - the evaluation itself throws an `AgentProviderError`
 *       → reason: `agent-error:<kind>`
 *
 * Every other error, including an `AgentBindingError` raised while resolving
 * rubric inputs, propagates to the caller.
 */
export declare function evaluateRubricRules(opts: RubricEvaluateOpts): Promise<RubricEvaluateResult>;
|
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Stage-2 — rubric judgments: resolve rubric.inputs against a TraceTree,
|
|
3
|
+
* render the prompt template, invoke the agent provider, and map the
|
|
4
|
+
* schema-validated output to a `Finding`.
|
|
5
|
+
*
|
|
6
|
+
* Why this layer exists separate from `signal-probe`:
|
|
7
|
+
* - Async / I/O-bound (subprocess or HTTP) vs. sync predicates
|
|
8
|
+
* - Per-rule provider lookup + skip-on-unavailable
|
|
9
|
+
* - Error semantics differ (skip + record vs. throw RuleProbeError)
|
|
10
|
+
*
|
|
11
|
+
* Convergence invariant (enforced upstream in schemas.ts): every rubric
|
|
12
|
+
* `output_schema.required` includes `first_violating_step_id`, so each
|
|
13
|
+
* rubric finding always points at a concrete span and can be correlated
|
|
14
|
+
* with symbolic findings on the same span by the synthesizer.
|
|
15
|
+
*/
|
|
16
|
+
import { AgentProviderError } from "../../agent-providers/types.js";
|
|
17
|
+
import { render as renderPrompt, languageInstructionFor, } from "../../agent-providers/prompt-template.js";
|
|
18
|
+
/** Resolve one rubric input spec against the trace and return a value
|
|
19
|
+
* suitable for prompt interpolation. Pure for testability. */
|
|
20
|
+
export function resolveRubricInput(spec, tree) {
|
|
21
|
+
const colon = spec.source.indexOf(":");
|
|
22
|
+
if (colon === -1) {
|
|
23
|
+
throw new AgentBindingError(`rubric input source missing prefix (expected '<scheme>:...'): '${spec.source}'`);
|
|
24
|
+
}
|
|
25
|
+
const scheme = spec.source.slice(0, colon);
|
|
26
|
+
const payload = spec.source.slice(colon + 1);
|
|
27
|
+
switch (scheme) {
|
|
28
|
+
case "extract_from_root_attr": {
|
|
29
|
+
// dotted path against root.attributes (nested attr lookups are common).
|
|
30
|
+
const root = tree.root;
|
|
31
|
+
if (!root)
|
|
32
|
+
return null;
|
|
33
|
+
return getDottedPath(root.attributes, payload);
|
|
34
|
+
}
|
|
35
|
+
case "filter_by_kind": {
|
|
36
|
+
// payload form: '[kind1,kind2,...]' or 'kind1,kind2,...'
|
|
37
|
+
const inner = payload.replace(/^\[|\]$/g, "");
|
|
38
|
+
const kinds = inner.split(",").map((s) => s.trim()).filter(Boolean);
|
|
39
|
+
const acc = [];
|
|
40
|
+
for (const k of kinds) {
|
|
41
|
+
const spans = tree.byKind.get(k) ?? [];
|
|
42
|
+
for (const s of spans) {
|
|
43
|
+
acc.push({
|
|
44
|
+
spanId: s.spanId,
|
|
45
|
+
name: s.name,
|
|
46
|
+
kind: s.kind,
|
|
47
|
+
attributes: s.attributes,
|
|
48
|
+
durationMs: s.durationMs,
|
|
49
|
+
status: s.status,
|
|
50
|
+
});
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
// Order chronologically so the agent reads a coherent timeline.
|
|
54
|
+
acc.sort((a, b) => {
|
|
55
|
+
const sa = tree.byId.get(a.spanId)?.startTimeUnixNano ?? "0";
|
|
56
|
+
const sb = tree.byId.get(b.spanId)?.startTimeUnixNano ?? "0";
|
|
57
|
+
return Number(BigInt(sa) - BigInt(sb));
|
|
58
|
+
});
|
|
59
|
+
return acc;
|
|
60
|
+
}
|
|
61
|
+
case "literal":
|
|
62
|
+
try {
|
|
63
|
+
return JSON.parse(payload);
|
|
64
|
+
}
|
|
65
|
+
catch (e) {
|
|
66
|
+
throw new AgentBindingError(`literal source has invalid JSON: ${e.message}`);
|
|
67
|
+
}
|
|
68
|
+
default:
|
|
69
|
+
throw new AgentBindingError(`unknown rubric input source scheme: '${scheme}'`);
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
/**
 * Error type raised by the rubric agent-binding layer (for example when a
 * rubric input source cannot be resolved). Every instance carries
 * `name === "AgentBindingError"`.
 */
export class AgentBindingError extends Error {
    name = "AgentBindingError";
    /** @param message Human-readable description of the failure. */
    constructor(message) {
        super(message);
    }
}
|
|
78
|
+
/**
 * Look `path` up in an attribute bag.
 *
 * OTel attribute keys are typically flat strings with literal dots
 * (e.g. `gen_ai.user.message`), but some traces nest objects under a parent
 * attribute. The whole `path` is therefore tried as a single key first; only
 * when that yields nothing is the path split on "." and walked segment by
 * segment.
 *
 * Only own properties are consulted, so names inherited from
 * `Object.prototype` (`constructor`, `toString`, ...) are treated as absent
 * rather than leaking built-ins into a prompt.
 *
 * @param obj  Attribute bag; anything that is not a non-null object yields `undefined`.
 * @param path Flat key or dotted path.
 * @returns The value found, or `undefined` when the path does not resolve.
 */
function getDottedPath(obj, path) {
    if (obj === null || obj === undefined || typeof obj !== "object")
        return undefined;
    const owns = (target, key) => Object.prototype.hasOwnProperty.call(target, key);
    // Flat key wins over a nested walk.
    if (owns(obj, path) && obj[path] !== undefined)
        return obj[path];
    let cur = obj;
    for (const part of path.split(".")) {
        if (cur === null || cur === undefined || typeof cur !== "object")
            return undefined;
        if (!owns(cur, part))
            return undefined;
        cur = cur[part];
    }
    return cur;
}
|
|
97
|
+
/**
 * Assemble the variable bag handed to the prompt renderer for one rule.
 *
 * The bag carries enough context (judge question, resolved inputs, rule
 * metadata) for a generic template such as builtin:rubric-judge-v1 to work
 * without per-rule knowledge. `language_instruction` localizes prose fields
 * only; schema-fixed values (enums, span IDs) stay English regardless.
 *
 * @param rule           Rule being evaluated; `rubric` may be absent, in which
 *                       case the question is "" and the schema is `{}`.
 * @param tree           Trace tree; only `traceId` is read.
 * @param resolvedInputs Already-resolved rubric inputs, passed through as-is.
 * @param lang           Output language forwarded to `languageInstructionFor`.
 * @returns The prompt variables object.
 */
function buildPromptVars(rule, tree, resolvedInputs, lang) {
    const rubric = rule.rubric;
    const judgeQuestion = rubric?.judgeQuestion ?? "";
    const outputSchema = rubric?.outputSchemaRaw ?? {};
    const vars = {
        rule_id: rule.id,
        judge_question: judgeQuestion,
        output_schema: outputSchema,
        inputs: resolvedInputs,
        trace_id: tree.traceId,
        language_instruction: languageInstructionFor(lang),
    };
    return vars;
}
|
|
111
|
+
/**
 * Evaluate one rubric rule against one trace and convert the agent's
 * structured reply into a finding.
 *
 * Steps: resolve the rubric inputs, render the prompt template, invoke the
 * provider, then map the reply. When `artifacts` is supplied the prompt and
 * the response are written out around the invocation.
 *
 * @param rule           Rule to evaluate; the caller guarantees `rule.rubric` is set.
 * @param tree           Trace the inputs are resolved against.
 * @param provider       Agent provider used for the invocation.
 * @param promptRegistry Registry the rule's prompt template is fetched from.
 * @param timeoutMs      Timeout forwarded to the provider.
 * @param lang           Output language for prose fields (default "en").
 * @param artifacts      Optional artifact writer.
 * @param userQuery      Query recorded on the suggested eval case (default null).
 * @param queryId        Query ID recorded on the suggested eval case (default null).
 * @returns The finding built from the provider output.
 */
async function evaluateOne(rule, tree, provider, promptRegistry, timeoutMs, lang = "en", artifacts, userQuery = null, queryId = null) {
    // Single-trace mode has exactly one chunk (K=1), i.e. chunk-000.
    const chunkIndex = 0;
    const { rubric } = rule; // caller guarantees
    // 1. Resolve each declared input, keyed by its `kind`.
    const resolvedInputs = {};
    for (const inputSpec of rubric.inputs) {
        resolvedInputs[inputSpec.kind] = resolveRubricInput(inputSpec, tree);
    }
    // 2. Render the prompt.
    const template = promptRegistry.get(rubric.agentBinding.promptTemplateRef);
    const promptVars = buildPromptVars(rule, tree, resolvedInputs, lang);
    const prompt = renderPrompt(template, promptVars);
    if (artifacts) {
        await artifacts.writeStageTwoPrompt(rule.id, chunkIndex, prompt);
    }
    // 3. Invoke the provider.
    const request = {
        prompt,
        outputSchema: rubric.outputZodSchema,
        timeoutMs,
        correlationId: `${tree.traceId}/${rule.id}`,
    };
    const resp = await provider.invoke(request);
    if (artifacts) {
        await artifacts.writeStageTwoResponse(rule.id, chunkIndex, resp.output);
    }
    // 4. Map the reply to a finding.
    const out = resp.output;
    const firstSpan = out.first_violating_step_id;
    const evidenceIds = Array.isArray(out.evidence_span_ids) ? out.evidence_span_ids : [];
    // Convergence: ensure first_violating_step_id is in evidence.spans.
    let spans = evidenceIds;
    if (!evidenceIds.includes(firstSpan)) {
        spans = [firstSpan, ...evidenceIds];
    }
    // Templates are rendered with the rubric output as bindings (best-effort:
    // string-keyed values; complex shapes pass through unchanged).
    const fill = (tpl) => renderChangeTemplate(tpl, out);
    return {
        ruleId: rule.id,
        judgmentKind: "rubric",
        severity: out.severity ?? rule.severity, // agent may upgrade/downgrade
        symptom: rule.symptom,
        likelyCause: out.category ?? out.reasoning ?? rule.symptom,
        evidence: {
            spans,
            excerpt: out.reasoning ?? "",
        },
        suggestedFix: {
            target: rule.suggestedFix.target,
            change: fill(rule.suggestedFix.changeTemplate),
        },
        confidence: out.confidence ?? "medium", // rubric default > symbolic
        verifyWith: {
            suggestedEvalCase: {
                queryId,
                query: userQuery,
                assertions: rule.verifyWith.assertionTemplates.map((t) => fill(t)),
            },
        },
    };
}
|
|
165
|
+
/**
 * Substitute `{{name}}` placeholders in `tpl` with values from `bindings`.
 *
 * A placeholder is `{{`, optional whitespace, an identifier
 * (`[a-zA-Z_][a-zA-Z0-9_]*`), optional whitespace, `}}`. For each one:
 *   - a string binding is inserted verbatim;
 *   - a bigint binding is inserted in decimal form;
 *   - any other non-null binding is inserted as JSON;
 *   - a binding that is missing, null/undefined, not an own property of
 *     `bindings`, or not representable as JSON (e.g. a function) leaves the
 *     placeholder in the output, normalized to `{{name}}`.
 *
 * Only own properties are read so that names such as `constructor` are not
 * resolved through the prototype chain.
 *
 * @param tpl      Template text.
 * @param bindings Values keyed by placeholder name.
 * @returns The rendered text.
 */
export function renderChangeTemplate(tpl, bindings) {
    return tpl.replace(/\{\{\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*\}\}/g, (_match, key) => {
        const placeholder = `{{${key}}}`;
        if (!Object.prototype.hasOwnProperty.call(bindings, key))
            return placeholder;
        const v = bindings[key];
        if (v === undefined || v === null)
            return placeholder;
        if (typeof v === "string")
            return v;
        // JSON.stringify throws on bigint; render it as plain digits instead.
        if (typeof v === "bigint")
            return v.toString();
        // JSON.stringify yields undefined for functions/symbols; keep the
        // placeholder rather than emitting the text "undefined".
        const json = JSON.stringify(v);
        return json === undefined ? placeholder : json;
    });
}
|
|
173
|
+
/**
|
|
174
|
+
* Evaluate every rubric rule in `rules` and return findings + skip records.
|
|
175
|
+
*
|
|
176
|
+
* A rule is *skipped* (not failed) when:
|
|
177
|
+
* - `--no-llm` is set → reason: 'no-llm-flag-set'
|
|
178
|
+
* - rule's named provider isn't registered or `isAvailable()` is false
|
|
179
|
+
* → reason: `provider-not-available:<name>`
|
|
180
|
+
* - rule's prompt template isn't registered
|
|
181
|
+
* → reason: `prompt-template-missing:<ref>`
|
|
182
|
+
*
|
|
183
|
+
* Schema violations / transport errors from the provider are surfaced
|
|
184
|
+
* as `AgentBindingError` (let the CLI decide whether to fail the whole
|
|
185
|
+
* run or skip the single rule). Default callsite (`index.ts`) catches
|
|
186
|
+
* these and records them in `rules_skipped` rather than aborting.
|
|
187
|
+
*/
|
|
188
|
+
export async function evaluateRubricRules(opts) {
    const findings = [];
    const skipped = [];
    // Single place that records why a rule produced no finding.
    const skip = (ruleId, reason) => {
        skipped.push({ ruleId, reason });
    };
    for (const rule of opts.rules) {
        // Rules without a rubric section are ignored here (neither a finding nor a skip).
        if (!rule.rubric) {
            continue;
        }
        // The no-LLM flag turns every rubric rule into an explicit skip.
        if (opts.noLlm) {
            skip(rule.id, "no-llm-flag-set");
            continue;
        }
        const binding = rule.rubric.agentBinding;
        // Look up a provider for this rule. A "not_available" provider error is
        // folded into the same skip path as a missing / unavailable provider;
        // any other error propagates to the caller.
        let provider = null;
        try {
            provider = opts.registry.resolve({
                preferred: binding.provider,
                requiredCapabilities: ["structured_output"],
            });
        }
        catch (err) {
            const notAvailable = err instanceof AgentProviderError && err.kind === "not_available";
            if (!notAvailable) {
                throw err;
            }
            provider = null;
        }
        // Short-circuit keeps isAvailable() from being called on a missing provider.
        if (!provider || !(await provider.isAvailable())) {
            skip(rule.id, `provider-not-available:${binding.provider}`);
            continue;
        }
        // The referenced prompt template must be registered before invoking.
        const templateRef = binding.promptTemplateRef;
        if (!opts.promptRegistry.has(templateRef)) {
            skip(rule.id, `prompt-template-missing:${templateRef}`);
            continue;
        }
        try {
            // One work-queue write per rule ahead of the invocation (single trace: 1 entry).
            await opts.artifacts?.writeStageTwoWorkQueue(rule.id, [opts.tree.traceId]);
            findings.push(await evaluateOne(rule, opts.tree, provider, opts.promptRegistry, opts.timeoutMs, opts.lang ?? "en", opts.artifacts, opts.userQuery ?? null, opts.queryId ?? null));
        }
        catch (err) {
            // Only provider-level errors are downgraded to a skip so the rest of
            // the run still yields a report; anything else is rethrown.
            if (!(err instanceof AgentProviderError)) {
                throw err;
            }
            skip(rule.id, `agent-error:${err.kind}`);
        }
    }
    return { findings, skipped };
}
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
schema_version: diagnosis-rule/v1
|
|
2
|
+
id: tool_retry_intent_mismatch
|
|
3
|
+
|
|
4
|
+
# Paired with the symbolic rule `tool_loop_no_state_change`:
|
|
5
|
+
# - symbolic rule: "the same tool ran N times with identical args"
|
|
6
|
+
# - this rubric: "given the user's intent and the retry context,
|
|
7
|
+
# WHY did the agent keep retrying?"
|
|
8
|
+
#
|
|
9
|
+
# The two findings will share span sequences (Stage-1↔Stage-2 convergence
|
|
10
|
+
# is enforced because output_schema.required includes
|
|
11
|
+
# first_violating_step_id), so the within-trace synthesizer can collapse
|
|
12
|
+
# them into one cross_finding_link with relation="same span sequence;
|
|
13
|
+
# symbolic detects mechanical pattern, rubric judges semantic intent".
|
|
14
|
+
|
|
15
|
+
severity: high
|
|
16
|
+
symptom: repeated_tool_call_without_state_change
|
|
17
|
+
|
|
18
|
+
taxonomy:
|
|
19
|
+
signals_axis: execution
|
|
20
|
+
ms_class: retry_loop
|
|
21
|
+
|
|
22
|
+
suggested_fix:
|
|
23
|
+
target: decision_agent.prompt
|
|
24
|
+
change_template: "agent retried because of '{{category}}'; address that intent (e.g. add staleness detection, broaden query, escalate to human)"
|
|
25
|
+
|
|
26
|
+
verify_with:
|
|
27
|
+
assertion_templates:
|
|
28
|
+
- "for the same conversation, the agent reaches a non-retry next step"
|
|
29
|
+
|
|
30
|
+
rubric:
|
|
31
|
+
gates_on:
|
|
32
|
+
- tool_loop_no_state_change
|
|
33
|
+
judge_question: >-
|
|
34
|
+
Given the user's intent and the tool retry pattern in this trace,
|
|
35
|
+
classify why the agent kept calling the same tool: a legitimate
|
|
36
|
+
retry strategy (expecting changed state), a stale-results handling
|
|
37
|
+
failure (results were identical and the agent didn't recognize that),
|
|
38
|
+
prompt confusion (the agent misinterpreted its own instructions),
|
|
39
|
+
or other.
|
|
40
|
+
inputs:
|
|
41
|
+
- kind: user_intent
|
|
42
|
+
source: extract_from_root_attr:gen_ai.user.message
|
|
43
|
+
- kind: span_sequence
|
|
44
|
+
source: filter_by_kind:[tool,llm]
|
|
45
|
+
output_schema:
|
|
46
|
+
type: object
|
|
47
|
+
required: [category, reasoning, severity, first_violating_step_id]
|
|
48
|
+
properties:
|
|
49
|
+
category:
|
|
50
|
+
type: string
|
|
51
|
+
enum: [legitimate_retry, stale_results, prompt_confusion, other]
|
|
52
|
+
reasoning:
|
|
53
|
+
type: string
|
|
54
|
+
severity:
|
|
55
|
+
type: string
|
|
56
|
+
enum: [low, medium, high]
|
|
57
|
+
confidence:
|
|
58
|
+
type: string
|
|
59
|
+
enum: [low, medium, high]
|
|
60
|
+
first_violating_step_id:
|
|
61
|
+
type: string
|
|
62
|
+
evidence_span_ids:
|
|
63
|
+
type: array
|
|
64
|
+
items:
|
|
65
|
+
type: string
|
|
66
|
+
agent_binding:
|
|
67
|
+
provider: claude-code
|
|
68
|
+
prompt_template_ref: builtin:rubric-judge-v1
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import { RuleLoadError } from "./rule-loader.js";
|
|
2
|
+
import { RuleProbeError } from "./signal-probe.js";
|
|
3
|
+
import type { DiagnoseOpts, Report } from "./types.js";
|
|
4
|
+
import type { AgentRegistry } from "../../agent-providers/registry.js";
|
|
5
|
+
import { PromptTemplateRegistry } from "../../agent-providers/prompt-template.js";
|
|
6
|
+
import "./builtin-rules/register.js";
|
|
7
|
+
/**
 * Error subclass constructed from a conversation id.
 * NOTE(review): presumably raised when no trace can be found for that
 * conversation — confirm against the implementation.
 * Also re-exported from this module under the alias `DiagnoseTraceNotFound`.
 */
export declare class TraceNotFoundError extends Error {
|
|
8
|
+
constructor(conversationId: string);
|
|
9
|
+
}
|
|
10
|
+
/**
|
|
11
|
+
* Allow callers (CLI, tests, future scan-mode) to inject a custom registry
|
|
12
|
+
* + prompt registry without globals. The CLI in `commands/trace.ts` calls
|
|
13
|
+
* `diagnose()` and registers the default ClaudeCodeSubprocessProvider into
|
|
14
|
+
* `defaultRegistry` ahead of time; tests pass their own registry containing
|
|
15
|
+
* a StubAgentProvider.
|
|
16
|
+
*/
|
|
17
|
+
export interface DiagnoseInternalOpts {
|
|
18
|
+
/** Override the AgentRegistry used for rubric rules + synthesizer. Optional; the fallback used when omitted is not visible in this declaration — presumably the `defaultRegistry` mentioned above, confirm in the implementation. */
|
|
19
|
+
registry?: AgentRegistry;
|
|
20
|
+
/** Override the PromptTemplateRegistry. Optional; the fallback used when omitted is not visible in this declaration — confirm in the implementation. */
|
|
21
|
+
promptRegistry?: PromptTemplateRegistry;
|
|
22
|
+
}
|
|
23
|
+
/**
 * Diagnose one conversation and resolve to the resulting `Report`.
 *
 * @param conversationId - Identifier of the conversation to diagnose.
 * @param opts - Diagnosis options (`DiagnoseOpts`, declared in `./types.js`).
 * @param internal - Optional registry / prompt-registry overrides (`DiagnoseInternalOpts`).
 * @returns A promise that resolves to the `Report`.
 * NOTE(review): rejection behaviour (e.g. `TraceNotFoundError`) is not visible
 * in this declaration — confirm against the implementation.
 */
export declare function diagnose(conversationId: string, opts: DiagnoseOpts, internal?: DiagnoseInternalOpts): Promise<Report>;
|
|
24
|
+
/** Resolve which file paths to write given the user-supplied --out and format.
|
|
25
|
+
* Both: derive the missing extension from the given one; if --out had no
|
|
26
|
+
* recognized extension, append .yaml / .md. Single-format: write to --out
|
|
27
|
+
* verbatim (caller's extension is honored as-is).
*
* @param out - The user-supplied --out path.
* @param format - Which report format(s) to write: 'yaml', 'markdown', or 'both'.
* @returns An object with `yamlPath` and `mdPath`, each a string or `null`.
* NOTE(review): presumably `null` marks a format that was not requested —
* confirm against the implementation. */
|
|
28
|
+
export declare function derivePaths(out: string, format: 'yaml' | 'markdown' | 'both'): {
|
|
29
|
+
yamlPath: string | null;
|
|
30
|
+
mdPath: string | null;
|
|
31
|
+
};
|
|
32
|
+
export { TraceNotFoundError as DiagnoseTraceNotFound, RuleLoadError, RuleProbeError };
|