@hevmind/ask 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +116 -0
- package/bin/ask-launcher.mjs +110 -0
- package/bin/ask.mjs +4 -0
- package/openapi.yaml +363 -0
- package/package.json +61 -0
- package/skills/build-digest/SKILL.md +164 -0
- package/src/components/SearchOverlay.astro +1375 -0
- package/src/components/markdown.ts +107 -0
- package/src/digest/build.ts +432 -0
- package/src/digest/cli.ts +148 -0
- package/src/digest/expand.ts +24 -0
- package/src/digest/facts.ts +77 -0
- package/src/digest/frontmatter.ts +41 -0
- package/src/digest/read.ts +63 -0
- package/src/digest/schema.ts +185 -0
- package/src/digest/verify.ts +116 -0
- package/src/endpoint.ts +247 -0
- package/src/index.ts +2 -0
- package/src/integration.ts +146 -0
- package/src/llm.ts +239 -0
- package/src/observability.ts +213 -0
- package/src/search/chunk.ts +137 -0
- package/src/search/index.ts +44 -0
- package/src/search/loop.ts +525 -0
- package/src/search/prefilter.ts +93 -0
- package/src/types.ts +99 -0
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import { tokenize } from '../search/chunk.ts';
|
|
2
|
+
import type { GlossaryEntry } from './schema';
|
|
3
|
+
|
|
4
|
+
/**
 * Widens a query's token set with matching glossary vocabulary.
 *
 * Glossary entries are visited in order. An entry contributes all of its
 * term/alias tokens when at least one of them is already in the working set,
 * which includes tokens added by earlier entries. Growth stops as soon as the
 * set holds `cap` tokens; the query's own tokens are never trimmed, so the
 * result can exceed `cap` when the query alone is longer.
 *
 * @param query Raw query text.
 * @param glossary Entries supplying terms and aliases.
 * @param cap Set size at which expansion stops.
 * @returns Tokens in insertion order, or an empty array when the query has no tokens.
 */
export function expandQueryTerms(query: string, glossary: GlossaryEntry[], cap = 24): string[] {
  const expanded = new Set(tokenize(query));
  if (expanded.size === 0) return [];

  for (const { term, aliases } of glossary) {
    if (expanded.size >= cap) break;

    // Term tokens first, then alias tokens, preserving first-seen order.
    const vocabulary = new Set(tokenize(term));
    for (const alias of aliases) {
      for (const token of tokenize(alias)) vocabulary.add(token);
    }
    if (!intersects(expanded, vocabulary)) continue;

    for (const token of vocabulary) {
      expanded.add(token);
      if (expanded.size >= cap) break;
    }
  }

  return Array.from(expanded);
}
|
|
20
|
+
|
|
21
|
+
/** Reports whether at least one member of `b` is also a member of `a`. */
function intersects(a: Set<string>, b: Set<string>): boolean {
  return Array.from(b).some((candidate) => a.has(candidate));
}
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
import { tokenize } from '../search/chunk.ts';
|
|
2
|
+
import type { Fact } from './schema.ts';
|
|
3
|
+
|
|
4
|
+
// Fenced code block. The opening fence may carry any info string
// (```ts title="x.ts", ```c++, ```js {1,3}) and may end in LF or CRLF;
// capture group 1 is the block's content.
const FENCE_RE = /```[^\n`]*\r?\n([\s\S]*?)```/g;
// Single-line inline code span; group 1 is the text between the backticks.
const INLINE_CODE_RE = /`([^`\n]+)`/g;
// A "-x" / "--long-flag" token that is not glued to a preceding word character or hyphen.
const FLAG_RE = /(?<![\w-])(--?[a-zA-Z][\w-]*)/g;
// Dotted numeric version with an optional leading "v" (1.2, v1.2.3).
const VERSION_RE = /\bv?\d+(?:\.\d+)+\b/g;
// Hyphenated identifier containing a segment that starts with a digit (foo-bar-4x).
const MODEL_ID_RE = /\b[a-z][a-z0-9]*(?:-[a-z0-9]+)*-\d[a-z0-9-]*\b/gi;
// Dot-separated identifier (config.key.name, example.com).
const DOTTED_RE = /\b[a-z0-9-]+(?:\.[a-z0-9-]+)+\b/gi;

const MAX_FACTS = 24;
const MAX_LITERAL = 400;

/**
 * Extracts verbatim literals (code, flags, identifiers, versions) from a
 * section's raw markdown. These are the answer-critical strings the prose
 * tokenizer destroys; they are carried into the digest node so the agent can
 * quote them exactly without re-reading the source.
 *
 * Fully deterministic — the model never authors or edits a fact.
 *
 * Each literal is trimmed, de-duplicated, and dropped when shorter than 2 or
 * longer than `MAX_LITERAL` characters. Scan order is: fenced blocks, inline
 * code, bare flags, hyphenated ids, dotted ids, versions.
 *
 * @param chunkId Id recorded on every returned fact.
 * @param raw Raw markdown of one section.
 * @returns At most `MAX_FACTS` facts, in scan order.
 */
export function extractFacts(chunkId: string, raw: string): Fact[] {
  const seen = new Set<string>();
  const facts: Fact[] = [];
  const push = (kind: Fact['kind'], literal: string) => {
    const value = literal.trim();
    if (value.length < 2 || value.length > MAX_LITERAL || seen.has(value)) return;
    seen.add(value);
    facts.push({ kind, literal: value, chunkId });
  };

  // Fenced code blocks first, then strip them so inline scanners don't re-read them.
  for (const match of raw.matchAll(FENCE_RE)) push('code', match[1]);
  const rest = raw.replace(FENCE_RE, ' ');

  for (const match of rest.matchAll(INLINE_CODE_RE)) push('code', match[1]);
  // Inline code is the richest source of exact tokens; scan the remainder for
  // bare flags/identifiers/versions that weren't wrapped in backticks.
  const bare = rest.replace(INLINE_CODE_RE, ' ');
  for (const match of bare.matchAll(FLAG_RE)) push('flag', match[1]);
  for (const match of bare.matchAll(MODEL_ID_RE)) push('value', match[0]);
  for (const match of bare.matchAll(DOTTED_RE)) push('value', match[0]);
  for (const match of bare.matchAll(VERSION_RE)) push('value', match[0]);

  return facts.slice(0, MAX_FACTS);
}
|
|
48
|
+
|
|
49
|
+
/**
 * Sections in matching groups carry dense literals the agent should read
 * verbatim. This is a case-insensitive substring match on "reference" or "api".
 */
const SOURCE_PRIMARY_GROUP_RE = /reference|api/i;

/**
 * Chooses a section's reading mode from its group name.
 *
 * @param group Group label; missing or empty labels are agent-primary.
 * @returns 'source-primary' when the label matches the group pattern, otherwise 'agent-primary'.
 */
export function classifyMode(group: string | undefined | null): 'agent-primary' | 'source-primary' {
  if (!group) return 'agent-primary';
  if (SOURCE_PRIMARY_GROUP_RE.test(group)) return 'source-primary';
  return 'agent-primary';
}
|
|
55
|
+
|
|
56
|
+
/** Common words that are never treated as distinctive. */
const STOPWORDS = new Set([
  'the', 'and', 'for', 'with', 'that', 'this', 'from', 'into', 'are', 'was', 'has', 'have', 'its',
  'use', 'used', 'using', 'can', 'will', 'when', 'where', 'how', 'what', 'which', 'each', 'all',
  'one', 'two', 'per', 'via', 'not', 'but', 'you', 'your', 'they', 'them', 'then', 'than', 'over',
]);

/**
 * Collects the distinctive tokens used by the render-time link-support check:
 * a cited link survives only if the answer block containing it shares one of
 * these tokens with the cited node.
 *
 * A token is kept when it has at least four characters, is not a stopword,
 * and has not been kept already.
 *
 * @param text Text to tokenize.
 * @param cap Collection stops once this many tokens have been kept.
 * @returns Kept tokens in first-seen order.
 */
export function distinctiveTokens(text: string, cap = 40): string[] {
  const kept = new Set<string>();
  for (const token of tokenize(text)) {
    const tooShort = token.length < 4;
    if (tooShort || STOPWORDS.has(token) || kept.has(token)) continue;
    kept.add(token);
    // The limit is checked after adding, mirroring a push-then-test loop.
    if (kept.size >= cap) break;
  }
  return [...kept];
}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
/** A source document split into its frontmatter fields and remaining body. */
export interface FrontmatterDocument {
  /** Parsed top-level frontmatter keys; empty when the document has no frontmatter. */
  data: Record<string, unknown>;
  /** Document text after the frontmatter block, or the whole source when none is found. */
  body: string;
}
|
|
5
|
+
|
|
6
|
+
/**
 * Splits a document into flat frontmatter data and body.
 *
 * Frontmatter is recognised only when the first line is exactly `---`
 * (trailing spaces/tabs allowed) and a later line is exactly `---`. Lines that
 * merely start with three dashes, such as `----` or `---text`, are not
 * delimiters. When either fence is missing the source is returned unchanged as
 * the body with empty data.
 *
 * @param src Full document source.
 * @returns Parsed data and the body with its leading whitespace removed.
 */
export function parseFrontmatter(src: string): FrontmatterDocument {
  const opening = /^---[ \t]*\r?\n/.exec(src);
  if (!opening) return { data: {}, body: src };

  // Search from the newline that ends the opening fence so an empty block
  // ("---\n---\n") still finds its closing fence.
  const closing = /\n---[ \t]*(?:\r?\n|$)/g;
  closing.lastIndex = opening[0].length - 1;
  const fence = closing.exec(src);
  if (!fence) return { data: {}, body: src };

  const raw = src.slice(opening[0].length, fence.index).trim();
  const body = src.slice(fence.index + fence[0].length).replace(/^\s+/, '');
  return { data: parseFlatYaml(raw), body };
}
|
|
15
|
+
|
|
16
|
+
/**
 * Parses the top-level `key: value` pairs of a YAML block.
 *
 * Only flat scalars are understood. Indented lines (nested mappings,
 * sequences, multi-line scalars) and block-sequence items are skipped, so a
 * nested key cannot overwrite a top-level key of the same name. A key that
 * introduces a nested structure is kept with an empty-string value. Inline
 * comments after a value are not stripped.
 *
 * @param src Frontmatter text without its `---` fences.
 * @returns Map of top-level keys to parsed scalars.
 */
function parseFlatYaml(src: string): Record<string, unknown> {
  const data: Record<string, unknown> = {};
  for (const line of src.split(/\r?\n/)) {
    // Leading whitespace means the line belongs to the structure above it.
    if (/^[ \t]/.test(line)) continue;
    const trimmed = line.trim();
    if (!trimmed || trimmed.startsWith('#')) continue;
    // Unindented sequence items ("- a") belong to the preceding key.
    if (trimmed === '-' || trimmed.startsWith('- ')) continue;
    const colon = trimmed.indexOf(':');
    if (colon === -1) continue;
    const key = trimmed.slice(0, colon).trim();
    if (!key) continue;
    data[key] = parseScalar(trimmed.slice(colon + 1).trim());
  }
  return data;
}

/**
 * Converts a trimmed scalar to a string, boolean or number.
 *
 * Matching single or double quotes are removed; `true`/`false` become
 * booleans; plain decimal numbers become numbers; anything else is returned
 * as-is. A lone quote character is not treated as a quoted string.
 */
function parseScalar(value: string): unknown {
  if (!value) return '';
  const quote = value[0];
  if (value.length >= 2 && (quote === '"' || quote === "'") && value.endsWith(quote)) {
    return value.slice(1, -1);
  }
  if (value === 'true') return true;
  if (value === 'false') return false;
  const numberValue = Number(value);
  if (Number.isFinite(numberValue) && /^-?\d+(\.\d+)?$/.test(value)) return numberValue;
  return value;
}
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import type { GlossaryEntry, Digest, DigestNode } from './schema.ts';
|
|
2
|
+
|
|
3
|
+
/** Lightweight listing view of a digest node: identity and location fields only. */
export interface SectionSummary {
  /** Node id. */
  id: string;
  /** Node title. */
  title: string;
  /** Section heading, or null when the node has none. */
  heading: null | string;
  /** Group label, or null when the node is ungrouped. */
  group: null | string;
  /** Link to the section. */
  url: string;
}
|
|
10
|
+
|
|
11
|
+
/** Returns the digest's glossary array itself (not a copy). */
export function listGlossary(digest: Digest): GlossaryEntry[] {
  const { glossary } = digest;
  return glossary;
}
|
|
14
|
+
|
|
15
|
+
/**
 * Finds the first glossary entry whose term or any alias equals `term` after
 * lookup normalization is applied to both sides.
 *
 * @param digest Digest whose glossary is searched.
 * @param term Term to look up.
 * @returns The matching entry, or null when `term` normalizes to an empty string or nothing matches.
 */
export function getGlossaryEntry(digest: Digest, term: string): GlossaryEntry | null {
  const needle = normalizeLookup(term);
  if (needle === '') return null;

  for (const entry of digest.glossary) {
    const names = [entry.term, ...entry.aliases];
    if (names.some((name) => normalizeLookup(name) === needle)) return entry;
  }
  return null;
}
|
|
25
|
+
|
|
26
|
+
/**
 * Lists summaries for the digest's nodes, optionally restricted to one group.
 *
 * @param digest Digest whose nodes are listed.
 * @param group Group to keep, compared after lookup normalization; when absent, or
 *   when it normalizes to an empty string, every node is returned.
 * @returns Summaries in node order.
 */
export function listSectionSummaries(digest: Digest, group?: string | null): SectionSummary[] {
  const wanted = group ? normalizeLookup(group) : '';
  const summaries: SectionSummary[] = [];
  for (const node of digest.nodes) {
    if (wanted && normalizeLookup(node.group ?? '') !== wanted) continue;
    summaries.push(sectionSummary(node));
  }
  return summaries;
}
|
|
32
|
+
|
|
33
|
+
/**
 * Looks up a node by id. The id is URI-decoded and trimmed first, then
 * compared exactly (case-sensitively) against node ids.
 *
 * @returns The first node with that id, or null when the id is blank or unknown.
 */
export function getSection(digest: Digest, id: string): DigestNode | null {
  const wantedId = decodePathValue(id).trim();
  if (wantedId === '') return null;

  for (const node of digest.nodes) {
    if (node.id === wantedId) return node;
  }
  return null;
}
|
|
38
|
+
|
|
39
|
+
/** Returns the digest's `overview` and `context` strings as a new object. */
export function getOverview(digest: Digest): { overview: string; context: string } {
  const { overview, context } = digest;
  return { overview, context };
}
|
|
42
|
+
|
|
43
|
+
/** Projects a digest node onto its summary fields: id, title, heading, group and url. */
export function sectionSummary(node: DigestNode): SectionSummary {
  const { id, title, heading, group, url } = node;
  return { id, title, heading, group, url };
}
|
|
52
|
+
|
|
53
|
+
/**
 * URI-decodes a path or query value.
 *
 * @returns The decoded text, or the input unchanged when it contains a malformed escape sequence.
 */
export function decodePathValue(value: string): string {
  let decoded = value;
  try {
    decoded = decodeURIComponent(value);
  } catch {
    // Malformed percent-escape: keep the raw value.
  }
  return decoded;
}
|
|
60
|
+
|
|
61
|
+
/** Canonical comparison form of a lookup value: URI-decoded, trimmed, lower-cased. */
function normalizeLookup(value: string): string {
  const decoded = decodePathValue(value);
  return decoded.trim().toLowerCase();
}
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
/** One glossary term with its alternative names and definition. */
export interface GlossaryEntry {
  /** Primary name of the term. */
  term: string;
  /** Alternative names for the same term. */
  aliases: string[];
  /** Definition text. */
  definition: string;
}
|
|
6
|
+
|
|
7
|
+
/** A verbatim literal lifted from a source section, such as a flag, code span or value. */
export interface Fact {
  /** Category of the literal. */
  kind: 'flag' | 'code' | 'value' | 'default' | 'key';
  /** The text exactly as it appears in the source; the model never paraphrases it. */
  literal: string;
  /** Id of the chunk the literal was extracted from. */
  chunkId: string;
}
|
|
15
|
+
|
|
16
|
+
/** Provenance of a node's knowledge, together with its human-facing deep link. */
export interface SourceRef {
  /** Matches a Chunk.id. */
  chunkId: string;
  /** Rendered deep link, for example "/docs/concepts#kubernetes-autoscaling". */
  url: string;
  /** github-slugger anchor; null for a page-level (intro) section. */
  anchor: null | string;
}
|
|
25
|
+
|
|
26
|
+
/**
 * A distilled, source-grounded view of a single heading section of the docs:
 * the agent's "shadow site" entry for that section. `id` is the id of the
 * chunk the node was built from.
 */
export interface DigestNode {
  id: string;
  kind: 'section';
  title: string;
  heading: null | string;
  group: null | string;
  url: string;
  /** Prose distilled by the model. It may paraphrase; exact strings are kept in `facts`. */
  summary: string;
  /** Verbatim literals extracted deterministically. */
  facts: Fact[];
  /** Provenance. For a section node this is its own chunk. */
  sources: SourceRef[];
  /**
   * Reading mode. 'source-primary' sections (reference/API) hold dense
   * literals that the agent should read verbatim instead of trusting a
   * paraphrase.
   */
  mode: 'agent-primary' | 'source-primary';
  /** Distinctive tokens consulted by the render-time link-support check. */
  terms: string[];
}
|
|
51
|
+
|
|
52
|
+
/** Placeholder for the deferred edge layer; this version ships with no edges. */
export interface DigestEdge {
  /** Relation label. */
  rel: string;
  /** Source endpoint of the edge. */
  from: string;
  /** Target endpoint of the edge. */
  to: string;
}
|
|
58
|
+
|
|
59
|
+
/** The complete digest artifact (schema version 2). */
export interface Digest {
  version: 2;
  generatedAt: string;
  contentHash: string;
  /** Compact prose orientation; the degradation fallback when `nodes` is empty. */
  context: string;
  glossary: GlossaryEntry[];
  /** Deterministic high-level map that is injected into the agent's system prompt. */
  overview: string;
  /** Example questions written by the model, offered by the overlay when it opens. */
  suggestions: string[];
  nodes: DigestNode[];
  edges: DigestEdge[];
}
|
|
73
|
+
|
|
74
|
+
/**
 * A version-2 digest with every text field blank and every list empty.
 * This is a single shared, mutable object; callers should not modify it.
 */
export const EMPTY_DIGEST: Digest = {
  version: 2,
  generatedAt: '',
  contentHash: '',
  context: '',
  glossary: [],
  overview: '',
  suggestions: [],
  nodes: [],
  edges: [],
};
|
|
85
|
+
|
|
86
|
+
const FACT_KINDS = new Set<Fact['kind']>(['flag', 'code', 'value', 'default', 'key']);
const NODE_MODES = new Set<DigestNode['mode']>(['agent-primary', 'source-primary']);

/**
 * Coerces unknown JSON into a Digest. A v1 artifact (`{context, glossary}`
 * with no `nodes`) degrades cleanly to an empty-node v2 digest, so the runtime
 * falls back to keyword/legacy behavior rather than hard-failing.
 *
 * Every call returns a freshly built object, including for null or non-object
 * input, so callers may mutate the result without affecting later calls or
 * the shared `EMPTY_DIGEST` constant.
 *
 * @param value Parsed JSON of unknown shape.
 * @returns A well-formed digest; malformed fields are replaced by empty defaults
 *   and malformed list items are dropped.
 */
export function normalizeDigest(value: unknown): Digest {
  // Non-object input is read as an empty record so it takes the same
  // fresh-object path as a sparse artifact.
  const maybe: Partial<Digest> =
    value !== null && typeof value === 'object' ? (value as Partial<Digest>) : {};
  const glossary = Array.isArray(maybe.glossary)
    ? maybe.glossary
        .map((entry) => normalizeGlossaryEntry(entry))
        .filter((entry): entry is GlossaryEntry => entry !== null)
    : [];
  const nodes = Array.isArray(maybe.nodes)
    ? maybe.nodes.map((node) => normalizeNode(node)).filter((node): node is DigestNode => node !== null)
    : [];
  const edges = Array.isArray(maybe.edges)
    ? maybe.edges.map((edge) => normalizeEdge(edge)).filter((edge): edge is DigestEdge => edge !== null)
    : [];

  return {
    version: 2,
    generatedAt: typeof maybe.generatedAt === 'string' ? maybe.generatedAt : '',
    contentHash: typeof maybe.contentHash === 'string' ? maybe.contentHash : '',
    context: typeof maybe.context === 'string' ? maybe.context : '',
    glossary,
    overview: typeof maybe.overview === 'string' ? maybe.overview : '',
    suggestions: Array.isArray(maybe.suggestions)
      ? maybe.suggestions.filter((s): s is string => typeof s === 'string' && s.trim().length > 0)
      : [],
    nodes,
    edges,
  };
}
|
|
123
|
+
|
|
124
|
+
/**
 * Validates one glossary item from untrusted JSON.
 *
 * @returns A clean entry, or null unless `term` and `definition` are both strings.
 *   Non-string aliases are dropped; a missing alias list becomes empty.
 */
function normalizeGlossaryEntry(value: unknown): GlossaryEntry | null {
  if (typeof value !== 'object' || value === null) return null;

  const { term, aliases, definition } = value as Partial<GlossaryEntry>;
  if (typeof term !== 'string' || typeof definition !== 'string') return null;

  const cleanAliases = Array.isArray(aliases)
    ? aliases.filter((alias): alias is string => typeof alias === 'string')
    : [];
  return { term, aliases: cleanAliases, definition };
}
|
|
134
|
+
|
|
135
|
+
/**
 * Validates one digest node from untrusted JSON.
 *
 * `id` and `url` must be strings or the node is rejected. Every other field
 * falls back to a default: the title to the id, heading and group to null,
 * the summary to an empty string, lists to empty, and the mode to
 * 'agent-primary'.
 *
 * @returns A fully populated node, or null when the item is unusable.
 */
function normalizeNode(value: unknown): DigestNode | null {
  if (typeof value !== 'object' || value === null) return null;

  const raw = value as Partial<DigestNode>;
  const { id, url } = raw;
  if (typeof id !== 'string' || typeof url !== 'string') return null;

  const stringOrNull = (field: unknown): string | null => (typeof field === 'string' ? field : null);

  const facts = Array.isArray(raw.facts)
    ? raw.facts.map((fact) => normalizeFact(fact)).filter((fact): fact is Fact => fact !== null)
    : [];
  const sources = Array.isArray(raw.sources)
    ? raw.sources.map((src) => normalizeSource(src)).filter((src): src is SourceRef => src !== null)
    : [];
  const mode: DigestNode['mode'] = raw.mode && NODE_MODES.has(raw.mode) ? raw.mode : 'agent-primary';
  const terms = Array.isArray(raw.terms)
    ? raw.terms.filter((term): term is string => typeof term === 'string')
    : [];

  return {
    id,
    kind: 'section',
    title: stringOrNull(raw.title) ?? id,
    heading: stringOrNull(raw.heading),
    group: stringOrNull(raw.group),
    url,
    summary: stringOrNull(raw.summary) ?? '',
    facts,
    sources,
    mode,
    terms,
  };
}
|
|
157
|
+
|
|
158
|
+
/**
 * Validates one fact from untrusted JSON.
 *
 * @returns A clean fact, or null unless `literal` is a non-empty string. An
 *   unrecognised kind becomes 'value'; a missing chunk id becomes an empty string.
 */
function normalizeFact(value: unknown): Fact | null {
  if (typeof value !== 'object' || value === null) return null;

  const { kind, literal, chunkId } = value as Partial<Fact>;
  if (typeof literal !== 'string' || literal === '') return null;

  const safeKind: Fact['kind'] = kind && FACT_KINDS.has(kind) ? kind : 'value';
  const safeChunkId = typeof chunkId === 'string' ? chunkId : '';
  return { kind: safeKind, literal, chunkId: safeChunkId };
}
|
|
168
|
+
|
|
169
|
+
/**
 * Validates one source reference from untrusted JSON.
 *
 * @returns A clean reference, or null unless `chunkId` and `url` are both strings.
 *   A non-string anchor becomes null.
 */
function normalizeSource(value: unknown): SourceRef | null {
  if (typeof value !== 'object' || value === null) return null;

  const { chunkId, url, anchor } = value as Partial<SourceRef>;
  if (typeof chunkId !== 'string' || typeof url !== 'string') return null;

  const safeAnchor = typeof anchor === 'string' ? anchor : null;
  return { chunkId, url, anchor: safeAnchor };
}
|
|
179
|
+
|
|
180
|
+
/**
 * Validates one edge from untrusted JSON.
 *
 * @returns A copy holding only `rel`, `from` and `to`, or null unless all three are strings.
 */
function normalizeEdge(value: unknown): DigestEdge | null {
  if (typeof value !== 'object' || value === null) return null;

  const { rel, from, to } = value as Partial<DigestEdge>;
  const allStrings = typeof rel === 'string' && typeof from === 'string' && typeof to === 'string';
  return allStrings ? { rel, from, to } : null;
}
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
import { execFile } from 'node:child_process';
|
|
2
|
+
import { readFile } from 'node:fs/promises';
|
|
3
|
+
import path from 'node:path';
|
|
4
|
+
import { promisify } from 'node:util';
|
|
5
|
+
import { buildCorpus } from './build.ts';
|
|
6
|
+
import { extractFacts } from './facts.ts';
|
|
7
|
+
import { normalizeDigest } from './schema.ts';
|
|
8
|
+
|
|
9
|
+
const execFileAsync = promisify(execFile);
|
|
10
|
+
|
|
11
|
+
/**
 * Settings for `verifyAnchors`. The whole object is also passed to
 * `buildCorpus`, which presumably consumes the corpus-related fields.
 */
export interface VerifyAnchorsOptions {
  /** Working directory for the build command and base for resolving `distDir` and `digestPath`. */
  siteRoot: string;
  collections: null | string[];
  basePath: string;
  digestContentGlobs?: string[];
  chunkHeadingDepth: number;
  /** Digest file location relative to `siteRoot`. */
  digestPath?: string;
  /** Shell command that builds the site. */
  buildCommand?: string;
  /** Build output directory relative to `siteRoot`. */
  distDir?: string;
  /** When true, the build command is not run. */
  skipBuild?: boolean;
}
|
|
22
|
+
|
|
23
|
+
/** Outcome of an anchor and fidelity verification run. */
export interface VerifyAnchorsResult {
  /** How many anchored chunks were examined. */
  checked: number;
  /** Anchors whose id was not found in any candidate HTML file for their URL. */
  missing: { url: string; file: string; anchorId: string }[];
  /** Literals present in the source that the committed digest dropped from an agent-primary node. */
  dropped: { id: string; literal: string }[];
  /** Ids of corpus sections that have no node in the committed digest. */
  uncovered: string[];
}
|
|
31
|
+
|
|
32
|
+
/**
 * Verifies that every anchored chunk's id exists in the built HTML, then runs
 * the literal-fidelity check against the committed digest.
 *
 * Unless `skipBuild` is set, the site is first built by running
 * `buildCommand` (default `pnpm build`) through `sh -c` in `siteRoot`; a
 * failing build rejects the returned promise.
 *
 * @param options Site location, corpus settings and build/digest overrides.
 * @returns Count of anchors checked plus the missing, dropped and uncovered findings.
 */
export async function verifyAnchors(options: VerifyAnchorsOptions): Promise<VerifyAnchorsResult> {
  if (!options.skipBuild) {
    await execFileAsync('sh', ['-c', options.buildCommand ?? 'pnpm build'], {
      cwd: options.siteRoot,
      maxBuffer: 8 * 1024 * 1024,
    });
  }

  const corpus = await buildCorpus(options);
  const distDir = path.resolve(options.siteRoot, options.distDir ?? 'dist');

  const missing: VerifyAnchorsResult['missing'] = [];
  let checked = 0;
  for (const chunk of corpus.chunks) {
    const anchorId = chunk.anchorId;
    if (!anchorId) continue; // page-level chunk: nothing to look for
    checked += 1;

    const candidates = htmlFilesForUrl(distDir, chunk.url);
    if (await findHtmlWithId(candidates, anchorId)) continue;
    // Report the first candidate path as the expected location.
    missing.push({ url: chunk.url, file: candidates[0], anchorId });
  }

  const { dropped, uncovered } = await verifyFidelity(options, corpus.chunks);
  return { checked, missing, dropped, uncovered };
}
|
|
55
|
+
|
|
56
|
+
/**
 * Literal-fidelity gate. Every verbatim literal the builder would extract
 * from a section has to be present in that section's committed node, except
 * for source-primary nodes, whose raw text the agent reads directly. This
 * catches lossy distillation that would otherwise yield confident but wrong
 * exact answers.
 *
 * Only the committed artifact is read; the model is never called. A digest
 * that is missing, unreadable or has no nodes yields empty results.
 *
 * @param options Supplies `siteRoot` and the optional `digestPath` (default `.hev-ask/digest.json`).
 * @param chunks Corpus chunks to compare against the digest.
 * @returns Literals absent from their node, and chunk ids that have no node.
 */
async function verifyFidelity(
  options: VerifyAnchorsOptions,
  chunks: Awaited<ReturnType<typeof buildCorpus>>['chunks'],
): Promise<{ dropped: VerifyAnchorsResult['dropped']; uncovered: string[] }> {
  const digestFile = path.resolve(options.siteRoot, options.digestPath ?? '.hev-ask/digest.json');
  const { nodes } = normalizeDigest(await readJson(digestFile));

  const dropped: VerifyAnchorsResult['dropped'] = [];
  const uncovered: string[] = [];
  if (nodes.length === 0) return { dropped, uncovered }; // v1 / degraded digest — nothing to check

  const nodeById = new Map<string, (typeof nodes)[number]>();
  for (const node of nodes) nodeById.set(node.id, node);

  for (const { id, raw } of chunks) {
    const node = nodeById.get(id);
    if (node === undefined) {
      uncovered.push(id);
      continue;
    }
    if (node.mode === 'source-primary') continue;

    const carried = new Set<string>();
    for (const fact of node.facts) carried.add(fact.literal);

    for (const { literal } of extractFacts(id, raw)) {
      if (!carried.has(literal)) dropped.push({ id, literal });
    }
  }

  return { dropped, uncovered };
}
|
|
91
|
+
|
|
92
|
+
/**
 * Reads a UTF-8 file and parses it as JSON.
 *
 * @returns The parsed value, or null when the file cannot be read or is not valid JSON.
 */
async function readJson(file: string): Promise<unknown> {
  let text: string;
  try {
    text = await readFile(file, 'utf8');
  } catch {
    return null; // unreadable or missing file
  }

  try {
    return JSON.parse(text);
  } catch {
    return null; // malformed JSON
  }
}
|
|
99
|
+
|
|
100
|
+
/**
 * Checks candidate HTML files in order for an element with the given id.
 * Files that cannot be read, or are empty, are skipped.
 *
 * @returns True as soon as one file contains the id; false when none does.
 */
async function findHtmlWithId(files: string[], id: string): Promise<boolean> {
  for (const candidate of files) {
    let html = '';
    try {
      html = await readFile(candidate, 'utf8');
    } catch {
      continue; // candidate does not exist in this build layout
    }
    if (html !== '' && hasId(html, id)) return true;
  }
  return false;
}
|
|
107
|
+
|
|
108
|
+
/**
 * Lists the HTML files a site URL may have been rendered to.
 *
 * The fragment is dropped and the path is tried under both `distDir` and
 * `distDir/client`. The directory-style outputs (`<path>/index.html`) come
 * first, so index 0 is always the original primary location. File-style
 * outputs (`<path>.html`, as produced by Astro's `build.format: 'file'`)
 * follow. A path that already ends in `.html` is used as-is for the
 * file-style candidates.
 *
 * @param distDir Absolute or relative build output directory.
 * @param url Site URL, optionally with a `#fragment`.
 * @returns Candidate file paths in probe order.
 */
function htmlFilesForUrl(distDir: string, url: string): string[] {
  const pathname = url.split('#')[0].replace(/^\//, '').replace(/\/$/, '');
  const roots = [distDir, path.join(distDir, 'client')];

  const candidates = roots.map((root) => path.join(root, pathname, 'index.html'));
  if (pathname) {
    // The site root has no file-style form; every other page may.
    const fileName = pathname.endsWith('.html') ? pathname : `${pathname}.html`;
    for (const root of roots) candidates.push(path.join(root, fileName));
  }
  return candidates;
}
|
|
112
|
+
|
|
113
|
+
/**
 * Tests whether the markup contains an `id` attribute with exactly this value.
 * The attribute must be preceded by whitespace and wrapped in matching single
 * or double quotes, so `data-id="x"` and unquoted `id=x` do not count.
 */
function hasId(html: string, id: string): boolean {
  // Escape regex metacharacters so the id is matched literally.
  const literal = id.replace(/[.*+?^${}()|[\]\\]/g, (ch) => '\\' + ch);
  const pattern = new RegExp('\\sid=(["\'])' + literal + '\\1');
  return pattern.test(html);
}
|