@sanity/ailf 2.0.2 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/dist/cli.js +0 -0
- package/package.json +24 -24
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +0 -10
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +0 -185
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +0 -6
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +0 -42
- package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
- package/dist/_vendor/ailf-tasks/cli.js +0 -61
- package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
- package/dist/_vendor/ailf-tasks/index.js +0 -16
- package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
- package/dist/_vendor/ailf-tasks/parser.js +0 -73
- package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
- package/dist/_vendor/ailf-tasks/schemas.js +0 -180
- package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
- package/dist/_vendor/ailf-tasks/validation.js +0 -162
- package/dist/adapters/task-sources/yaml-task-source.d.ts +0 -18
- package/dist/adapters/task-sources/yaml-task-source.js +0 -139
- package/dist/agent-observer/test-imports.d.ts +0 -7
- package/dist/agent-observer/test-imports.js +0 -185
- package/dist/commands/update-quality-scores.d.ts +0 -5
- package/dist/commands/update-quality-scores.js +0 -20
- package/dist/lib/agent-behavior-report.d.ts +0 -8
- package/dist/lib/agent-behavior-report.js +0 -185
- package/dist/lib/baseline.d.ts +0 -19
- package/dist/lib/baseline.js +0 -153
- package/dist/lib/calculate-scores.d.ts +0 -23
- package/dist/lib/calculate-scores.js +0 -42
- package/dist/lib/compare.d.ts +0 -18
- package/dist/lib/compare.js +0 -170
- package/dist/lib/coverage-audit.d.ts +0 -4
- package/dist/lib/coverage-audit.js +0 -42
- package/dist/lib/discovery-report.d.ts +0 -13
- package/dist/lib/discovery-report.js +0 -57
- package/dist/lib/fetch-docs.d.ts +0 -30
- package/dist/lib/fetch-docs.js +0 -171
- package/dist/lib/generate-configs.d.ts +0 -25
- package/dist/lib/generate-configs.js +0 -42
- package/dist/lib/grader-api.d.ts +0 -21
- package/dist/lib/grader-api.js +0 -34
- package/dist/lib/grader-compare.d.ts +0 -19
- package/dist/lib/grader-compare.js +0 -91
- package/dist/lib/grader-consistency.d.ts +0 -27
- package/dist/lib/grader-consistency.js +0 -79
- package/dist/lib/grader-sensitivity.d.ts +0 -19
- package/dist/lib/grader-sensitivity.js +0 -75
- package/dist/lib/grader-validate.d.ts +0 -19
- package/dist/lib/grader-validate.js +0 -78
- package/dist/lib/measure-retrieval.d.ts +0 -14
- package/dist/lib/measure-retrieval.js +0 -71
- package/dist/lib/pr-comment.d.ts +0 -16
- package/dist/lib/pr-comment.js +0 -28
- package/dist/lib/readiness-report.d.ts +0 -13
- package/dist/lib/readiness-report.js +0 -108
- package/dist/lib/webhook-server.d.ts +0 -11
- package/dist/lib/webhook-server.js +0 -24
- package/dist/lib/weekly-digest.d.ts +0 -24
- package/dist/lib/weekly-digest.js +0 -148
- package/dist/orchestration/env-bridge.d.ts +0 -21
- package/dist/orchestration/env-bridge.js +0 -66
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +0 -17
- package/dist/orchestration/steps/fetch-docs-shell.js +0 -30
- package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +0 -9
- package/dist/pipeline/compiler/__tests__/task-bridge.test.js +0 -339
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +0 -70
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +0 -50
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +0 -334
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -69
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -307
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +0 -65
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +0 -368
- package/dist/pipeline/compiler/task-bridge.d.ts +0 -41
- package/dist/pipeline/compiler/task-bridge.js +0 -92
- package/dist/pipeline/expand-tasks.d.ts +0 -232
- package/dist/pipeline/expand-tasks.js +0 -467
- package/dist/pipeline/generate-configs.d.ts +0 -92
- package/dist/pipeline/generate-configs.js +0 -445
- package/dist/pipeline/steps/calculate-scores-step.d.ts +0 -11
- package/dist/pipeline/steps/calculate-scores-step.js +0 -89
- package/dist/pipeline/steps/compare-step.d.ts +0 -18
- package/dist/pipeline/steps/compare-step.js +0 -90
- package/dist/pipeline/steps/eval-step.d.ts +0 -53
- package/dist/pipeline/steps/eval-step.js +0 -347
- package/dist/pipeline/steps/fetch-docs-step.d.ts +0 -11
- package/dist/pipeline/steps/fetch-docs-step.js +0 -84
- package/dist/pipeline/steps/generate-configs-step.d.ts +0 -11
- package/dist/pipeline/steps/generate-configs-step.js +0 -98
- package/dist/pipeline/steps/grader-consistency-step.d.ts +0 -21
- package/dist/pipeline/steps/grader-consistency-step.js +0 -74
- package/dist/pipeline/steps/publish-report-step.d.ts +0 -57
- package/dist/pipeline/steps/publish-report-step.js +0 -243
- package/dist/pipeline/steps/report-step.d.ts +0 -13
- package/dist/pipeline/steps/report-step.js +0 -56
- package/dist/pipeline/steps/update-scores-step.d.ts +0 -11
- package/dist/pipeline/steps/update-scores-step.js +0 -42
- package/dist/scripts/agent-behavior-report.d.ts +0 -19
- package/dist/scripts/agent-behavior-report.js +0 -315
- package/dist/scripts/baseline.d.ts +0 -43
- package/dist/scripts/baseline.js +0 -267
- package/dist/scripts/calculate-scores.d.ts +0 -166
- package/dist/scripts/calculate-scores.js +0 -1296
- package/dist/scripts/compare.d.ts +0 -22
- package/dist/scripts/compare.js +0 -334
- package/dist/scripts/coverage-audit.d.ts +0 -44
- package/dist/scripts/coverage-audit.js +0 -209
- package/dist/scripts/debug-eval.d.ts +0 -19
- package/dist/scripts/debug-eval.js +0 -73
- package/dist/scripts/discovery-report.d.ts +0 -58
- package/dist/scripts/discovery-report.js +0 -250
- package/dist/scripts/fetch-docs.d.ts +0 -35
- package/dist/scripts/fetch-docs.js +0 -472
- package/dist/scripts/generate-configs.d.ts +0 -66
- package/dist/scripts/generate-configs.js +0 -459
- package/dist/scripts/grader-api.d.ts +0 -27
- package/dist/scripts/grader-api.js +0 -206
- package/dist/scripts/grader-compare.d.ts +0 -22
- package/dist/scripts/grader-compare.js +0 -368
- package/dist/scripts/grader-consistency.d.ts +0 -20
- package/dist/scripts/grader-consistency.js +0 -313
- package/dist/scripts/grader-sensitivity.d.ts +0 -22
- package/dist/scripts/grader-sensitivity.js +0 -354
- package/dist/scripts/grader-validate.d.ts +0 -19
- package/dist/scripts/grader-validate.js +0 -267
- package/dist/scripts/measure-retrieval.d.ts +0 -10
- package/dist/scripts/measure-retrieval.js +0 -145
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +0 -24
- package/dist/scripts/migrate-tasks-to-content-lake.js +0 -328
- package/dist/scripts/pipeline.d.ts +0 -76
- package/dist/scripts/pipeline.js +0 -1031
- package/dist/scripts/pr-comment.d.ts +0 -10
- package/dist/scripts/pr-comment.js +0 -510
- package/dist/scripts/readiness-report.d.ts +0 -88
- package/dist/scripts/readiness-report.js +0 -342
- package/dist/scripts/update-quality-scores.d.ts +0 -15
- package/dist/scripts/update-quality-scores.js +0 -184
- package/dist/scripts/validate-task-sources.d.ts +0 -21
- package/dist/scripts/validate-task-sources.js +0 -210
- package/dist/scripts/validate.d.ts +0 -13
- package/dist/scripts/validate.js +0 -79
- package/dist/scripts/webhook-server.d.ts +0 -26
- package/dist/scripts/webhook-server.js +0 -147
- package/dist/scripts/weekly-digest.d.ts +0 -24
- package/dist/scripts/weekly-digest.js +0 -144
- package/dist/sinks/format-slack.d.ts +0 -64
- package/dist/sinks/format-slack.js +0 -306
- package/dist/sinks/slack-sink.d.ts +0 -27
- package/dist/sinks/slack-sink.js +0 -78
- package/dist/sinks/webhook-sink.d.ts +0 -19
- package/dist/sinks/webhook-sink.js +0 -50
- package/tasks/.expanded.agentic.yaml +0 -280
- package/tasks/.expanded.yaml +0 -565
|
@@ -1,139 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Adapter: Load task definitions from tasks/*.yaml files.
|
|
3
|
-
*
|
|
4
|
-
* This adapter reads the raw YAML task definitions (before Promptfoo
|
|
5
|
-
* expansion) and maps them to GeneralizedTaskDefinition
|
|
6
|
-
* (LiteracyTaskDefinition variant) from @sanity/ailf-core. It handles
|
|
7
|
-
* area filtering (filename stem) and task ID filtering.
|
|
8
|
-
*
|
|
9
|
-
* Unlike loadAndExpandTasks() — which produces Promptfoo-specific
|
|
10
|
-
* ExpandedTestEntry objects — this adapter produces domain-level
|
|
11
|
-
* GeneralizedTaskDefinition objects suitable for the pipeline orchestrator.
|
|
12
|
-
*/
|
|
13
|
-
import { existsSync, readdirSync, readFileSync } from "fs";
|
|
14
|
-
import { resolve } from "path";
|
|
15
|
-
import { load } from "js-yaml";
|
|
16
|
-
// ---------------------------------------------------------------------------
|
|
17
|
-
// YamlTaskSource adapter
|
|
18
|
-
// ---------------------------------------------------------------------------
|
|
19
|
-
/**
 * Task source backed by the YAML files in `<rootDir>/tasks`.
 *
 * Every non-hidden `*.yaml` / `*.yml` file is one feature area; the area
 * name is the file name without its extension.
 */
export class YamlTaskSource {
    rootDir;
    /**
     * @param rootDir Directory that contains the `tasks/` folder.
     */
    constructor(rootDir) {
        this.rootDir = rootDir;
    }
    /**
     * Load the task definitions, optionally narrowed by area and/or task id.
     *
     * @param filter Optional `{ areas, taskIds }`. An absent or empty list
     *   means "no restriction". Areas are matched case-insensitively against
     *   the file stem; task ids are matched exactly.
     * @returns Promise of the mapped definitions, ordered by sorted file name
     *   and then by position within each file.
     * @throws Error when the `tasks/` directory is missing or when a file
     *   does not parse to an array.
     */
    async loadTasks(filter) {
        const tasksDir = resolve(this.rootDir, "tasks");
        if (!existsSync(tasksDir)) {
            throw new Error(`tasks/ directory not found at ${tasksDir}`);
        }
        const stemOf = (fileName) => fileName.replace(/\.ya?ml$/, "");
        const isVisibleYaml = (fileName) => !fileName.startsWith(".") &&
            (fileName.endsWith(".yaml") || fileName.endsWith(".yml"));
        let yamlFiles = readdirSync(tasksDir).filter(isVisibleYaml).sort();
        // Area filter: the area name is the file stem ("groq" matches "groq.yaml").
        if (filter?.areas && filter.areas.length > 0) {
            const wantedAreas = new Set(filter.areas.map((area) => area.toLowerCase()));
            yamlFiles = yamlFiles.filter((fileName) => wantedAreas.has(stemOf(fileName).toLowerCase()));
        }
        const hasIdFilter = Boolean(filter?.taskIds && filter.taskIds.length > 0);
        const definitions = [];
        for (const file of yamlFiles) {
            const featureArea = stemOf(file);
            const parsed = load(readFileSync(resolve(tasksDir, file), "utf-8"));
            if (!Array.isArray(parsed)) {
                throw new Error(`${file} did not parse to an array of tasks`);
            }
            for (const entry of parsed) {
                // Entries that do not look like a task are skipped silently.
                if (!isRawYamlTask(entry)) {
                    continue;
                }
                if (hasIdFilter && !filter.taskIds.includes(entry.id)) {
                    continue;
                }
                definitions.push(mapToLiteracyTask(entry, featureArea));
            }
        }
        return definitions;
    }
}
|
|
64
|
-
// ---------------------------------------------------------------------------
|
|
65
|
-
// Mapping helpers
|
|
66
|
-
// ---------------------------------------------------------------------------
|
|
67
|
-
/**
 * Convert one raw YAML task entry into a literacy task definition.
 *
 * The snake_case YAML keys are renamed to the generalized field names. The
 * prompt text comes from `vars.task` (empty string when it is not a string);
 * `vars.docs` is dropped, and every other var is forwarded as `prompt.vars`
 * when at least one exists. Canonical docs that cannot be mapped are omitted.
 *
 * @param raw Raw task entry as parsed from YAML.
 * @param featureArea Area name, stored as `area` on the result.
 * @returns The definition object with `mode: "literacy"`.
 */
function mapToLiteracyTask(raw, featureArea) {
    const { task, docs: _unusedDocs, ...otherVars } = (raw.vars ?? {});
    const docs = (raw.canonical_docs ?? []).flatMap((docEntry) => {
        const ref = mapCanonicalDoc(docEntry);
        return ref === null ? [] : [ref];
    });
    // Optional pieces are spread in so the keys are absent, not undefined.
    const varsPart = Object.keys(otherVars).length > 0 ? { vars: otherVars } : {};
    const baselinePart = raw.baseline ? { baseline: raw.baseline } : {};
    const promptText = typeof task === "string" ? task : "";
    return {
        mode: "literacy",
        id: raw.id,
        title: raw.description,
        area: featureArea,
        prompt: { text: promptText, ...varsPart },
        context: { docs },
        referenceSolution: raw.reference_solution ?? "",
        docCoverage: raw.doc_coverage ?? false,
        assertions: (raw.assert ?? []),
        ...baselinePart,
    };
}
|
|
96
|
-
// ---------------------------------------------------------------------------
|
|
97
|
-
// Canonical doc mapping
|
|
98
|
-
// ---------------------------------------------------------------------------
|
|
99
|
-
/**
 * Map a raw YAML canonical doc entry to the polymorphic CanonicalDocRef.
 *
 * Resolution priority is id > slug > path > perspective, decided by key
 * truthiness (an empty-string key counts as absent). When `id` is present
 * the result is an id reference; `slug` and `path` are then carried along
 * only as optional annotations for human readability.
 *
 * @param raw One item of a task's `canonical_docs` list. `null`/`undefined`
 *   (what a bare `-` list item parses to) is treated like an entry without
 *   any resolution key instead of throwing.
 * @returns The mapped reference (always with a `reason` string, defaulting
 *   to ""), or `null` after logging a warning when no resolution key exists.
 */
function mapCanonicalDoc(raw) {
    // Normalize nullish items so the property reads below cannot throw.
    const entry = raw ?? {};
    const reason = entry.reason ?? "";
    if (entry.id) {
        return {
            id: entry.id,
            reason,
            ...(entry.slug ? { slug: entry.slug } : {}),
            ...(entry.path ? { path: entry.path } : {}),
        };
    }
    if (entry.slug) {
        return { slug: entry.slug, reason };
    }
    if (entry.path) {
        return { path: entry.path, reason };
    }
    if (entry.perspective) {
        return { perspective: entry.perspective, reason };
    }
    console.warn(" [warn] Skipping canonical doc entry with no resolution key (id, slug, path, or perspective)");
    return null;
}
|
|
128
|
-
// ---------------------------------------------------------------------------
|
|
129
|
-
// Type guard
|
|
130
|
-
// ---------------------------------------------------------------------------
|
|
131
|
-
/**
 * Type guard for a single parsed YAML task: a non-null object whose `id`
 * and `description` properties are both strings.
 */
function isRawYamlTask(entry) {
    if (typeof entry !== "object" || entry === null) {
        return false;
    }
    const hasStringProp = (key) => key in entry && typeof entry[key] === "string";
    return hasStringProp("id") && hasStringProp("description");
}
|
|
@@ -1,185 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Final validation — ensures all agent-observer modules work together
|
|
3
|
-
* and the full data pipeline (record → classify → summarize) is correct.
|
|
4
|
-
*
|
|
5
|
-
* Run: tsx src/agent-observer/test-imports.ts
|
|
6
|
-
*/
|
|
7
|
-
import { classifyRequests, extractDocSlug, extractSearchQuery, extractApiEndpoint, extractDomain, extractPageTitle, isDocPageRequest, isSearchRequest, isSanityApiRequest, } from "./classifier.js";
|
|
8
|
-
import { RequestRecorder } from "./proxy.js";
|
|
9
|
-
import { default as InstrumentedProvider } from "./provider.js";
|
|
10
|
-
// ─── Test data ───────────────────────────────────────────────────────────────
|
|
11
|
-
// Shared base instant; every mock timestamp is an offset from it.
const now = Date.now();
/** ISO-8601 timestamp `offsetMs` milliseconds after `now`. */
const isoAfter = (offsetMs) => new Date(now + offsetMs).toISOString();
// Recorded-request fixtures, one per traffic category the classifier handles.
const mockRequests = [
    // seq 0: documentation page visit
    {
        headers: {},
        latencyMs: 234,
        method: "GET",
        responsePreview: "<html><head><title>Create a Schema - Sanity</title></head>...",
        responseSize: 45000,
        seq: 0,
        statusCode: 200,
        timestamp: isoAfter(0),
        url: "https://www.sanity.io/docs/create-a-schema-and-configure-sanity-studio",
    },
    // seq 1: second (nested) documentation page
    {
        headers: {},
        latencyMs: 180,
        method: "GET",
        responsePreview: "<html><head><title>Object Type - Sanity Docs</title></head>...",
        responseSize: 32000,
        seq: 1,
        statusCode: 200,
        timestamp: isoAfter(100),
        url: "https://www.sanity.io/docs/schema-types/object-type",
    },
    // seq 2: site search query
    {
        headers: {},
        latencyMs: 450,
        method: "GET",
        responseSize: 12000,
        seq: 2,
        statusCode: 200,
        timestamp: isoAfter(200),
        url: "https://www.sanity.io/search?q=visual+editing+preview",
    },
    // seq 3: Sanity API call (GROQ query; must NOT be classified as search)
    {
        headers: {},
        latencyMs: 320,
        method: "GET",
        responseSize: 8500,
        seq: 3,
        statusCode: 200,
        timestamp: isoAfter(300),
        url: 'https://api.sanity.io/v2021-03-25/data/query/production?query=*[_type=="article"]',
    },
    // seq 4: CDN API call
    {
        headers: {},
        latencyMs: 85,
        method: "GET",
        responseSize: 150000,
        seq: 4,
        statusCode: 200,
        timestamp: isoAfter(350),
        url: "https://cdn.sanity.io/images/abc123/production/image-xyz.jpg",
    },
    // seq 5: external request (npm docs)
    {
        headers: {},
        latencyMs: 300,
        method: "GET",
        responseSize: 20000,
        seq: 5,
        statusCode: 200,
        timestamp: isoAfter(400),
        url: "https://docs.npmjs.com/cli/install",
    },
    // seq 6: Algolia search (query carried in the POST body)
    {
        body: JSON.stringify({ query: "presentation tool setup" }),
        headers: {},
        latencyMs: 150,
        method: "POST",
        responseSize: 5000,
        seq: 6,
        statusCode: 200,
        timestamp: isoAfter(500),
        url: "https://abc123.algolia.net/1/indexes/sanity_docs/query",
    },
    // seq 7: Google search
    {
        headers: {},
        latencyMs: 200,
        method: "GET",
        responseSize: 80000,
        seq: 7,
        statusCode: 200,
        timestamp: isoAfter(600),
        url: "https://www.google.com/search?q=sanity+studio+custom+tool",
    },
    // seq 8: failed request (status 0; should be skipped by classification)
    {
        headers: {},
        latencyMs: 0,
        method: "GET",
        responseSize: 0,
        seq: 8,
        statusCode: 0,
        timestamp: isoAfter(700),
        url: "https://www.sanity.io/docs/nonexistent-page",
    },
];
|
|
116
|
-
// ─── Run tests ───────────────────────────────────────────────────────────────
|
|
117
|
-
// Running tallies, reported in the summary at the end of the script.
let passed = 0;
let failed = 0;
/**
 * Record one check: log a pass/fail line for `msg` and bump the matching
 * counter. A failure is only counted; it never throws.
 */
function assert(condition, msg) {
    if (!condition) {
        console.log(` ❌ FAIL: ${msg}`);
        failed++;
        return;
    }
    console.log(` ✅ ${msg}`);
    passed++;
}
|
|
129
|
-
console.log("\n═══ Agent Observer — Final Validation ═══\n");
// --- 1. Individual detection predicates ---
console.log("1. Individual detection functions:");
const docPageRequest = mockRequests[0];
const siteSearchRequest = mockRequests[2];
const groqApiRequest = mockRequests[3];
const detectionCases = [
    { detect: isDocPageRequest, request: docPageRequest, expected: true, label: "Doc page detected" },
    { detect: isDocPageRequest, request: groqApiRequest, expected: false, label: "API call NOT detected as doc page" },
    { detect: isSearchRequest, request: siteSearchRequest, expected: true, label: "Search detected" },
    { detect: isSearchRequest, request: groqApiRequest, expected: false, label: "API call NOT detected as search" },
    { detect: isSanityApiRequest, request: groqApiRequest, expected: true, label: "API call detected" },
    { detect: isSanityApiRequest, request: docPageRequest, expected: false, label: "Doc page NOT detected as API call" },
];
for (const { detect, request, expected, label } of detectionCases) {
    // Each predicate runs immediately before its own result is reported.
    assert(detect(request) === expected, label);
}
// --- 2. Metadata extraction helpers ---
console.log("\n2. Metadata extraction:");
const schemaDocUrl = "https://www.sanity.io/docs/create-a-schema-and-configure-sanity-studio";
const nestedDocUrl = "https://www.sanity.io/docs/schema-types/object-type";
const wildcardQueryUrl = "https://api.sanity.io/v2021-03-25/data/query/production?query=*";
assert(extractDocSlug(schemaDocUrl) === "create-a-schema-and-configure-sanity-studio", "Doc slug extracted correctly");
assert(extractDocSlug(nestedDocUrl) === "schema-types/object-type", "Nested doc slug extracted correctly");
assert(extractSearchQuery(siteSearchRequest) === "visual editing preview", `Search query extracted: "${extractSearchQuery(siteSearchRequest)}"`);
assert(extractApiEndpoint(wildcardQueryUrl) === "/data/query/production?query=*", `API endpoint extracted: "${extractApiEndpoint(wildcardQueryUrl)}"`);
assert(extractDomain("https://docs.npmjs.com/cli/install") === "docs.npmjs.com", "Domain extracted correctly");
assert(extractPageTitle("<html><head><title>Create a Schema - Sanity</title></head>") === "Create a Schema - Sanity", "Page title extracted correctly");
// --- 3. Full classification pipeline ---
console.log("\n3. Full classification pipeline:");
const classified = classifyRequests(mockRequests);
const expectedCounts = [
    { bucket: "docPageVisits", label: "Doc pages", expected: 2, note: "" },
    { bucket: "searchQueries", label: "Searches", expected: 3, note: " — site search + algolia + google" },
    { bucket: "apiCalls", label: "API calls", expected: 2, note: " — GROQ query + CDN" },
    { bucket: "externalRequests", label: "External", expected: 1, note: " — npmjs" },
];
for (const { bucket, label, expected, note } of expectedCounts) {
    const count = classified[bucket].length;
    assert(count === expected, `${label}: ${count} (expected ${expected}${note})`);
}
// The API's ?query= parameter must not be mistaken for a search.
const searchUrls = classified.searchQueries.map((search) => search.url);
assert(!searchUrls.includes('https://api.sanity.io/v2021-03-25/data/query/production?query=*[_type=="article"]'), "API ?query= param NOT misclassified as search");
// The failed request must not appear in any bucket.
const allUrls = ["docPageVisits", "searchQueries", "apiCalls", "externalRequests"]
    .flatMap((bucket) => classified[bucket].map((item) => item.url));
assert(!allUrls.includes("https://www.sanity.io/docs/nonexistent-page"), "Failed request (status 0) was skipped");
// --- 4. RequestRecorder surface ---
console.log("\n4. RequestRecorder:");
const recorder = new RequestRecorder({ includePatterns: [/sanity\.io/] });
for (const method of ["start", "stop"]) {
    assert(typeof recorder[method] === "function", `RequestRecorder.${method}() exists`);
}
// --- 5. InstrumentedProvider surface ---
console.log("\n5. InstrumentedProvider:");
const provider = new InstrumentedProvider({ config: {}, id: "test-validation" });
assert(provider.id() === "instrumented:test-validation", `Provider ID: "${provider.id()}"`);
assert(typeof provider.callApi === "function", "Provider.callApi() exists");
// --- Summary: the exit code mirrors the overall outcome ---
console.log(`\n${"═".repeat(50)}`);
console.log(`Results: ${passed} passed, ${failed} failed, ${passed + failed} total`);
const allPassed = failed === 0;
console.log(allPassed
    ? "\n✅ All tests passed! Agent observer system is ready."
    : "\n⚠️ Some tests failed!");
process.exit(allPassed ? 0 : 1);
|
|
@@ -1,20 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* update-quality-scores command — update QUALITY_SCORE.md from scores.
|
|
3
|
-
*/
|
|
4
|
-
import { Command } from "commander";
|
|
5
|
-
/**
 * Build the `update-quality-scores` CLI command.
 *
 * The action loads the implementation script lazily (on invocation, not on
 * command construction), runs it, and prints the returned message. When the
 * result is not successful the message goes to stderr and the process exits
 * with code 1.
 *
 * @returns The configured commander `Command`.
 */
export function createUpdateQualityScoresCommand() {
    const runUpdate = async () => {
        const scriptModule = await import("../scripts/update-quality-scores.js");
        console.log("=== Updating QUALITY_SCORE.md from score-summary.json ===\n");
        const outcome = scriptModule.updateQualityScores();
        if (!outcome.success) {
            console.error(` ❌ ${outcome.message}`);
            process.exit(1);
        }
        else {
            console.log(` ✅ ${outcome.message}`);
        }
    };
    return new Command("update-quality-scores")
        .description("Update docs/QUALITY_SCORE.md from score-summary.json")
        .action(runUpdate);
}
|
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* lib/agent-behavior-report.ts — DEPRECATED re-export shim.
|
|
3
|
-
* @deprecated Import from ../pipeline/agent-behavior-report.js instead.
|
|
4
|
-
*/
|
|
5
|
-
import "dotenv/config";
|
|
6
|
-
export { analyzeResults, CANONICAL_DOC_MAP, detectFeatureArea, } from "../pipeline/agent-behavior-report.js";
|
|
7
|
-
export type { AnalysisResult, FeatureAnalysis, TaskBehavior, TestResult, } from "../pipeline/agent-behavior-report.js";
|
|
8
|
-
/**
 * CLI entry point of the deprecated shim: reads an eval results JSON file,
 * prints the agent behavior report, and writes a JSON copy of it.
 *
 * @param resultsPathArg Optional path to the results file; when omitted the
 *   implementation falls back to the first CLI argument and then to a
 *   default location under `results/latest`.
 */
export declare function main(resultsPathArg?: string): void;
|
|
@@ -1,185 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* lib/agent-behavior-report.ts — DEPRECATED re-export shim.
|
|
3
|
-
* @deprecated Import from ../pipeline/agent-behavior-report.js instead.
|
|
4
|
-
*/
|
|
5
|
-
// oxlint-disable-next-line import/no-unassigned-import -- side-effect: loads .env into process.env
|
|
6
|
-
import "dotenv/config";
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
import { dirname, join } from "path";
import { fileURLToPath } from "url";
|
|
9
|
-
export { analyzeResults, CANONICAL_DOC_MAP, detectFeatureArea, } from "../pipeline/agent-behavior-report.js";
|
|
10
|
-
import { analyzeResults, } from "../pipeline/agent-behavior-report.js";
|
|
11
|
-
/**
 * Read an eval results file, print the agent behavior report to stdout and
 * persist it as `results/latest/agent-behavior-report.json` under the
 * package root (two directories above this module).
 *
 * Exits the process with code 1 when the results file is missing or holds
 * no results array, and with code 0 when the analysis reports no data.
 *
 * @param resultsPathArg Optional results file path. Falls back to
 *   `process.argv[2]`, then to `<root>/results/latest/eval-results.json`.
 */
export function main(resultsPathArg) {
    // fileURLToPath decodes percent-escapes and handles Windows drive
    // letters; URL#pathname does neither.
    const ROOT = join(dirname(fileURLToPath(import.meta.url)), "..", "..");
    const resultsPath = resultsPathArg ??
        process.argv[2] ??
        join(ROOT, "results", "latest", "eval-results.json");
    if (!existsSync(resultsPath)) {
        console.error(`Results file not found: ${resultsPath}`);
        console.error("Run an evaluation first: pnpm eval:observed");
        process.exit(1);
    }
    console.log(`Reading results from: ${resultsPath}`);
    console.log();
    const json = JSON.parse(readFileSync(resultsPath, "utf-8"));
    // Two layouts are accepted: `results` as the array itself, or nested one
    // level down as `results.results`.
    const rawResults = Array.isArray(json?.results)
        ? json.results
        : json?.results?.results;
    if (!Array.isArray(rawResults)) {
        // Previously this surfaced as an opaque TypeError on property access.
        console.error(`Results file has no results array: ${resultsPath}`);
        process.exit(1);
    }
    const analysis = analyzeResults(rawResults);
    if (!analysis.hasData) {
        console.log("No agent behavior data found in the results.");
        console.log("Make sure you ran the evaluation with the observed config:");
        console.log(" pnpm eval:observed");
        process.exit(0);
    }
    printReport(analysis);
    // Persist detailed report as JSON
    const outDir = join(ROOT, "results", "latest");
    mkdirSync(outDir, { recursive: true });
    const reportData = {
        features: analysis.features.map((f) => ({
            avgDocPages: f.avgDocPages,
            avgNetworkMs: f.avgNetworkMs,
            avgSearches: f.avgSearches,
            canonicalCoverage: f.canonicalCoverage,
            canonicalSlugs: f.canonicalSlugs,
            docSlugsVisited: f.allDocSlugs,
            externalDomains: f.allExternalDomains,
            feature: f.feature,
            searchQueries: f.allSearchQueries,
            taskCount: f.tasks.length,
        })),
        tasks: analysis.tasks.map((t) => ({
            behavior: t.behavior,
            description: t.description,
            feature: t.feature,
            hasDocs: t.hasDocs,
        })),
        timestamp: new Date().toISOString(),
        totalTasks: analysis.tasks.length,
    };
    writeFileSync(join(outDir, "agent-behavior-report.json"), JSON.stringify(reportData, null, 2));
    console.log("Agent behavior report written to results/latest/agent-behavior-report.json");
}
|
|
63
|
-
// ---------------------------------------------------------------------------
|
|
64
|
-
// Report output (kept in shim for backward compat)
|
|
65
|
-
// ---------------------------------------------------------------------------
|
|
66
|
-
/**
 * Print the human-readable agent behavior report to stdout.
 *
 * Sections, in order: per-feature overview table, canonical documentation
 * coverage, search strategy (only when any search was recorded), per-task
 * detail, external domains (only when any were contacted), and overall
 * statistics.
 *
 * @param analysis Analysis result providing `features` and `tasks` arrays.
 *   Both may be empty; percentages then print as 0 rather than NaN.
 */
function printReport(analysis) {
    console.log("=".repeat(80));
    console.log(" AGENT BEHAVIOR OBSERVATION REPORT");
    console.log("=".repeat(80));
    console.log();
    // ---- Overview table ----
    console.log("OVERVIEW BY FEATURE AREA");
    console.log("-".repeat(80));
    // The first header cell is padded to the same 19-character width the
    // data rows use, so the columns line up with the separator.
    const h = `| ${"Feature Area".padEnd(19)} | Tasks | Avg Docs | Avg Search | Avg Net(ms) | Canon% |`;
    const sep = "|---------------------|-------|----------|------------|-------------|--------|";
    console.log(h);
    console.log(sep);
    for (const f of analysis.features) {
        console.log(`| ${f.feature.padEnd(19)} | ` +
            `${f.tasks.length.toString().padStart(5)} | ` +
            `${f.avgDocPages.toFixed(1).padStart(8)} | ` +
            `${f.avgSearches.toFixed(1).padStart(10)} | ` +
            `${Math.round(f.avgNetworkMs).toString().padStart(11)} | ` +
            `${(f.canonicalCoverage * 100).toFixed(0).padStart(5)}% |`);
    }
    console.log();
    // ---- Canonical coverage breakdown ----
    console.log("CANONICAL DOCUMENTATION COVERAGE");
    console.log("-".repeat(80));
    console.log();
    for (const f of analysis.features) {
        console.log(` ${f.feature} (${(f.canonicalCoverage * 100).toFixed(0)}% canonical coverage):`);
        if (f.canonicalSlugs.length === 0) {
            console.log(" (no canonical docs defined)");
        }
        else {
            for (const slug of f.canonicalSlugs) {
                // A canonical slug counts as visited when it is a substring
                // of any visited slug.
                const found = f.allDocSlugs.some((visited) => visited.includes(slug));
                const marker = found ? "[x]" : "[ ]";
                console.log(` ${marker} ${slug}`);
            }
        }
        if (f.allDocSlugs.length > 0) {
            const nonCanonical = f.allDocSlugs.filter((slug) => !f.canonicalSlugs.some((c) => slug.includes(c)));
            if (nonCanonical.length > 0) {
                console.log(" Additional docs visited:");
                for (const slug of nonCanonical) {
                    console.log(` + ${slug}`);
                }
            }
        }
        console.log();
    }
    // ---- Search strategy ----
    const allSearches = analysis.features.flatMap((f) => f.allSearchQueries);
    if (allSearches.length > 0) {
        console.log("SEARCH STRATEGY");
        console.log("-".repeat(80));
        console.log();
        for (const f of analysis.features) {
            if (f.allSearchQueries.length === 0)
                continue;
            console.log(` ${f.feature}:`);
            for (const q of f.allSearchQueries) {
                console.log(` -> "${q}"`);
            }
        }
        console.log();
    }
    // ---- Per-task detail ----
    console.log("PER-TASK DETAIL");
    console.log("-".repeat(80));
    console.log();
    for (const f of analysis.features) {
        console.log(` ## ${f.feature}`);
        console.log();
        for (const t of f.tasks) {
            const variant = t.hasDocs ? "[gold]" : "[baseline]";
            console.log(` ${variant} ${t.description}`);
            console.log(` Requests: ${t.behavior.totalRequests} | ` +
                `Doc pages: ${t.behavior.docPagesVisited} | ` +
                `Searches: ${t.behavior.searchesPerformed} | ` +
                `External: ${t.behavior.externalRequestCount}`);
            if (t.behavior.docSlugsVisited.length > 0) {
                console.log(` Docs: ${t.behavior.docSlugsVisited.join(", ")}`);
            }
            if (t.behavior.uniqueSearchQueries.length > 0) {
                console.log(` Queries: ${t.behavior.uniqueSearchQueries.map((q) => `"${q}"`).join(", ")}`);
            }
            console.log();
        }
    }
    // ---- External domains ----
    const allDomains = [
        ...new Set(analysis.features.flatMap((f) => f.allExternalDomains)),
    ];
    if (allDomains.length > 0) {
        console.log("EXTERNAL DOMAINS");
        console.log("-".repeat(80));
        console.log();
        for (const d of allDomains) {
            console.log(` - ${d}`);
        }
        console.log();
    }
    // ---- Summary stats ----
    console.log("OVERALL STATISTICS");
    console.log("-".repeat(80));
    console.log();
    const totalTasks = analysis.tasks.length;
    const tasksUsingDocs = analysis.tasks.filter((t) => t.behavior.usedDocs).length;
    const tasksUsingSearch = analysis.tasks.filter((t) => t.behavior.usedSearch).length;
    const avgCanonical = analysis.features.reduce((s, f) => s + f.canonicalCoverage, 0) /
        (analysis.features.length || 1);
    // Guard the division so an empty task list prints "0%" instead of "NaN%".
    const shareOfTasks = (count) => totalTasks > 0 ? ((count / totalTasks) * 100).toFixed(0) : "0";
    console.log(` Total tasks observed: ${totalTasks}`);
    console.log(` Tasks that used docs: ${tasksUsingDocs}/${totalTasks} (${shareOfTasks(tasksUsingDocs)}%)`);
    console.log(` Tasks that used search: ${tasksUsingSearch}/${totalTasks} (${shareOfTasks(tasksUsingSearch)}%)`);
    console.log(` Avg canonical coverage: ${(avgCanonical * 100).toFixed(1)}%`);
    console.log();
}
|
|
181
|
-
// Run main() only when Node was started with this script (under either its
// .ts source name or its compiled .js name), not when it is merely imported.
const launchedScript = process.argv[1];
const isDirectRun = ["agent-behavior-report.ts", "agent-behavior-report.js"]
    .some((scriptName) => launchedScript?.endsWith(scriptName));
if (isDirectRun) {
    main();
}
|
package/dist/lib/baseline.d.ts
DELETED
|
@@ -1,19 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* lib/baseline.ts — DEPRECATED re-export shim.
|
|
3
|
-
*
|
|
4
|
-
* The real implementation has moved to pipeline/baseline.ts.
|
|
5
|
-
* This shim preserves backward compatibility for:
|
|
6
|
-
* - Direct CLI invocation: `tsx src/lib/baseline.ts`
|
|
7
|
-
* - Test imports that haven't been updated yet
|
|
8
|
-
*
|
|
9
|
-
* TODO: Update all importers to use pipeline/baseline.ts, then delete this file.
|
|
10
|
-
*
|
|
11
|
-
* @deprecated Import from ../pipeline/baseline.js instead.
|
|
12
|
-
*/
|
|
13
|
-
export type { BaselineMetadata, CompareResult, ScoreComparison, } from "../pipeline/baseline.js";
|
|
14
|
-
/**
 * Save a baseline, optionally labelled with `tag`.
 * NOTE(review): what is snapshotted and where it is stored is not visible
 * from this declaration — confirm against ../pipeline/baseline.js.
 *
 * @param tag Optional label for the saved baseline.
 * @returns A `success` flag and a human-readable `message`.
 * @deprecated Import from ../pipeline/baseline.js instead.
 */
export declare function saveBaseline(tag?: string): {
    success: boolean;
    message: string;
};
|
|
18
|
-
/**
 * Compare against a baseline and return the comparison result.
 * NOTE(review): presumably a default baseline is chosen when `baselineFile`
 * is omitted — confirm against ../pipeline/baseline.js.
 *
 * @param baselineFile Optional baseline file to compare against.
 * @deprecated Import from ../pipeline/baseline.js instead.
 */
export declare function compareBaseline(baselineFile?: string): import("./baseline.js").CompareResult;
|
|
19
|
-
/**
 * Return metadata for the available baselines.
 *
 * @deprecated Import from ../pipeline/baseline.js instead.
 */
export declare function listBaselines(): import("./baseline.js").BaselineMetadata[];
|