@hevmind/ask 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +116 -0
- package/bin/ask-launcher.mjs +110 -0
- package/bin/ask.mjs +4 -0
- package/openapi.yaml +363 -0
- package/package.json +61 -0
- package/skills/build-digest/SKILL.md +164 -0
- package/src/components/SearchOverlay.astro +1375 -0
- package/src/components/markdown.ts +107 -0
- package/src/digest/build.ts +432 -0
- package/src/digest/cli.ts +148 -0
- package/src/digest/expand.ts +24 -0
- package/src/digest/facts.ts +77 -0
- package/src/digest/frontmatter.ts +41 -0
- package/src/digest/read.ts +63 -0
- package/src/digest/schema.ts +185 -0
- package/src/digest/verify.ts +116 -0
- package/src/endpoint.ts +247 -0
- package/src/index.ts +2 -0
- package/src/integration.ts +146 -0
- package/src/llm.ts +239 -0
- package/src/observability.ts +213 -0
- package/src/search/chunk.ts +137 -0
- package/src/search/index.ts +44 -0
- package/src/search/loop.ts +525 -0
- package/src/search/prefilter.ts +93 -0
- package/src/types.ts +99 -0
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
// Minimal, dependency-free, streaming-safe Markdown renderer for the AI answer.
//
// The renderer is fed the *whole* accumulated answer on every token, so partial
// syntax (an unterminated `[label](`) simply fails to match and renders as
// literal text until the closing token arrives — never a broken DOM. Model
// output is untrusted: the source text is HTML-escaped first, then our own
// trusted tags are injected. Links are validated against the grounding source
// set so a hallucinated URL degrades to plain text — and, when the source
// carries distinctive `terms`, a link whose surrounding text shares none of
// them (a misattributed citation) also degrades to plain text.
|
|
11
|
+
|
|
12
|
+
/** A grounding source that links in the rendered answer are validated against. */
export interface Source {
  /** Source title; used in the breadcrumb when `heading` is absent. */
  title: string;
  /** Section heading; preferred over `title` in the breadcrumb. */
  heading?: string;
  /** Link target; a rendered link must match this string exactly. */
  url: string;
  /** Optional group label, placed first in the breadcrumb. */
  group?: string;
  /** Distinctive tokens of the cited section, for the link-support check. */
  terms?: string[];
}
|
|
20
|
+
|
|
21
|
+
// Inline link `[label](url)`: the label has no `]`, the URL has no `)` or whitespace.
const LINK_RE = /\[([^\]]+)\]\(([^)\s]+)\)/g;
// Inline code span delimited by single backticks.
const CODE_RE = /`([^`]+)`/g;
// Bold span `**text**` whose content contains no `*`.
const BOLD_RE = /\*\*([^*]+)\*\*/g;
// Leading list marker (`-`, `*`, or `N.`) followed by whitespace.
const LIST_ITEM_RE = /^\s*(?:[-*]|\d+\.)\s+/;
|
|
25
|
+
|
|
26
|
+
/**
 * Escapes the characters that are significant in HTML text and in
 * double-quoted attribute values.
 *
 * `&` is replaced first so the entities produced by the later replacements are
 * not escaped a second time.
 *
 * @param text Untrusted text.
 * @returns The text with `&`, `<`, `>` and `"` replaced by entities.
 */
export function escapeHtml(text: string): string {
  return text
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;');
}
|
|
33
|
+
|
|
34
|
+
/**
 * Breadcrumb for a source, e.g. "Concepts › Kubernetes autoscaling".
 *
 * Joins the group (when present) with the heading, falling back to the title
 * when the source has no heading. Empty parts are dropped.
 */
export function sourceBreadcrumb(source: Source): string {
  const leaf = source.heading ?? source.title;
  return [source.group, leaf].filter(Boolean).join(' › ');
}
|
|
38
|
+
|
|
39
|
+
/**
 * Lower-cased `[a-z0-9]+` tokens of a text unit, used by the link-support check.
 */
function tokenSet(text: string): Set<string> {
  // Strip link *targets* (keep labels) so a URL/anchor slug can't leak its own
  // terms into the support check — otherwise a link to #autoscaling would always
  // "support" itself via the word "autoscaling" in its href.
  const withoutUrls = text.replace(LINK_RE, (_match, label: string) => label);
  return new Set(withoutUrls.toLowerCase().match(/[a-z0-9]+/g) ?? []);
}
|
|
46
|
+
|
|
47
|
+
/**
 * A cited link survives only if the text around it shares a distinctive term
 * with the cited section. Lenient by design: a source with no `terms` (e.g. a
 * legacy/degraded digest) is never degraded on this basis.
 *
 * @param unitTokens Tokens of the text unit that contains the link.
 * @param source The source the link points at.
 * @returns `true` when the link may be rendered.
 */
function supportsClaim(unitTokens: Set<string>, source: Source): boolean {
  if (!source.terms || source.terms.length === 0) return true;
  return source.terms.some((term) => unitTokens.has(term));
}
|
|
57
|
+
|
|
58
|
+
/**
 * Renders the model's Markdown answer to an HTML string.
 *
 * @param md The accumulated Markdown text.
 * @param sources Grounding sources; only links whose URL matches one of them
 *   can be rendered as anchors.
 * @returns Concatenated HTML for all non-empty blocks.
 */
export function renderMarkdown(md: string, sources: Source[] = []): string {
  const urlMap = new Map<string, Source>(
    sources.map((source): [string, Source] => [source.url, source]),
  );
  // Split into blocks on blank lines; render each as a list or a paragraph.
  return md
    .split(/\n{2,}/)
    .map((block) => renderBlock(block.trim(), urlMap))
    .filter(Boolean)
    .join('');
}
|
|
67
|
+
|
|
68
|
+
/**
 * Renders one blank-line-delimited block as a list, a bolded heading line, or a
 * paragraph. Empty blocks and horizontal rules render as the empty string.
 *
 * @param block Trimmed block text.
 * @param urlMap Sources keyed by URL, forwarded to the inline renderer.
 */
function renderBlock(block: string, urlMap: Map<string, Source>): string {
  if (!block) return '';
  // Horizontal rules and headings don't belong in a compact popover; the model
  // is told not to use them, but degrade them gracefully if it does.
  if (/^(-{3,}|\*{3,}|_{3,})$/.test(block)) return '';
  const heading = block.match(/^#{1,6}\s+(.+)$/);
  if (heading && !block.includes('\n')) {
    return `<p><strong>${renderInline(escapeHtml(heading[1]), urlMap, tokenSet(heading[1]))}</strong></p>`;
  }

  const lines = block.split('\n');
  // A block is a list only when every non-blank line starts with a list marker.
  if (lines.every((line) => !line.trim() || LIST_ITEM_RE.test(line))) {
    const items = lines.filter((line) => line.trim());
    // The first item decides whether the whole list is ordered.
    const ordered = /^\s*\d+\./.test(items[0] ?? '');
    const tag = ordered ? 'ol' : 'ul';
    const body = items
      .map((line) => {
        const text = line.replace(LIST_ITEM_RE, '');
        // Each item is its own unit for the link-support check.
        return `<li>${renderInline(escapeHtml(text), urlMap, tokenSet(text))}</li>`;
      })
      .join('');
    return `<${tag}>${body}</${tag}>`;
  }

  // Anything else is a paragraph; single newlines collapse to spaces.
  const text = block.replace(/\n/g, ' ');
  return `<p>${renderInline(escapeHtml(text), urlMap, tokenSet(text))}</p>`;
}
|
|
93
|
+
|
|
94
|
+
/**
 * Injects inline markup (`<code>`, `<strong>`, `<a>`) into already-escaped text.
 *
 * @param escaped HTML-escaped text of one unit (paragraph, heading or list item).
 * @param urlMap Sources keyed by URL; a link whose URL is not a key renders as
 *   its label only.
 * @param unitTokens Tokens of the same unit, for the link-support check.
 */
function renderInline(escaped: string, urlMap: Map<string, Source>, unitTokens: Set<string>): string {
  // Order: code, then bold, then links. The source text is already escaped, so
  // injected tags are the only markup in the string.
  let out = escaped.replace(CODE_RE, (_match, code: string) => `<code>${code}</code>`);
  out = out.replace(BOLD_RE, (_match, inner: string) => `<strong>${inner}</strong>`);
  out = out.replace(LINK_RE, (_whole, label: string, url: string) => {
    // NOTE(review): `url` is taken from the escaped text, so a URL containing
    // `&`, `<`, `>` or `"` would be looked up in its escaped form — confirm
    // that source URLs never contain those characters.
    const source = urlMap.get(url);
    if (!source) return label; // hallucinated / off-corpus link → plain text
    if (!supportsClaim(unitTokens, source)) return label; // misattributed citation → plain text
    const title = escapeHtml(sourceBreadcrumb(source));
    return `<a class="as-answer-link" href="${escapeHtml(url)}" title="${title}">${label}</a>`;
  });
  return out;
}
|
|
@@ -0,0 +1,432 @@
|
|
|
1
|
+
import { createHash } from 'node:crypto';
|
|
2
|
+
import { mkdir, readFile, readdir, writeFile } from 'node:fs/promises';
|
|
3
|
+
import path from 'node:path';
|
|
4
|
+
import { callClaude, type AnthropicTool } from '../llm.ts';
|
|
5
|
+
import { chunkDocument, hashableChunkText, type Chunk, type SourceDocument } from '../search/chunk.ts';
|
|
6
|
+
import { classifyMode, distinctiveTokens, extractFacts } from './facts.ts';
|
|
7
|
+
import { parseFrontmatter } from './frontmatter.ts';
|
|
8
|
+
import { normalizeDigest, type Digest, type DigestNode } from './schema.ts';
|
|
9
|
+
|
|
10
|
+
/** Options for `buildDigest`. */
export interface DigestBuildOptions {
  /** Directory that `digestPath` and the content globs are resolved against. */
  siteRoot: string;
  /** Content collection names; `null` or empty falls back to `['docs']`. */
  collections: string[] | null;
  /** Base path handed to the chunker. */
  basePath: string;
  /** Digest file location, resolved against `siteRoot`. */
  digestPath: string;
  /** When non-empty, replaces the default per-collection content globs. */
  digestContentGlobs?: string[];
  /** Heading depth handed to the chunker. */
  chunkHeadingDepth: number;
  /** Model name passed to the Claude call. */
  digestModel: string;
  /** API key; falls back to the `ANTHROPIC_API_KEY` environment variable. */
  apiKey?: string;
}
|
|
20
|
+
|
|
21
|
+
/** Result of reading and chunking the content files. */
export interface CorpusBuild {
  /** Parsed source documents, one per content file. */
  documents: SourceDocument[];
  /** All chunks, sorted by id. */
  chunks: Chunk[];
  /** SHA-256 hex digest of the hashable chunk text. */
  contentHash: string;
}
|
|
26
|
+
|
|
27
|
+
/** Outcome reported by the digest-writing commands. */
export interface DigestBuildResult {
  /** `'skipped'` when the existing digest already matches the corpus. */
  status: 'built' | 'skipped';
  /** Absolute path of the digest file. */
  path: string;
  /** Content hash of the corpus the result refers to. */
  contentHash: string;
  /** Number of chunks in the corpus. */
  chunks: number;
}
|
|
33
|
+
|
|
34
|
+
/**
 * Tool definition the model is forced to call in `buildDigest`. Its input
 * schema requires `context`, `glossary`, `summaries` and `suggestions`.
 */
const DIGEST_TOOL: AnthropicTool = {
  name: 'emit_digest',
  description:
    'Emit a documentation digest: a compact orientation, a glossary, and one distilled summary per section.',
  input_schema: {
    type: 'object',
    properties: {
      context: {
        type: 'string',
        description:
          'Compact markdown orientation explaining the product, core concepts, feature areas, and how users talk about them.',
      },
      glossary: {
        type: 'array',
        items: {
          type: 'object',
          properties: {
            term: { type: 'string' },
            aliases: { type: 'array', items: { type: 'string' } },
            definition: { type: 'string' },
          },
          required: ['term', 'aliases', 'definition'],
        },
      },
      summaries: {
        type: 'array',
        description:
          'One entry per section id in the corpus. The summary is the agent-facing distillation of that section.',
        items: {
          type: 'object',
          properties: {
            id: { type: 'string', description: 'The exact section id from the corpus.' },
            summary: {
              type: 'string',
              description:
                'A tight 1-3 sentence distillation of the section a search agent can answer from. Paraphrase prose; do NOT restate code, flags, or exact identifiers (those are preserved separately).',
            },
          },
          required: ['id', 'summary'],
        },
      },
      suggestions: {
        type: 'array',
        description:
          '3-5 natural questions a real reader might ask that these docs genuinely answer. Phrase them the way a user would type them, not as headings.',
        items: { type: 'string' },
      },
    },
    required: ['context', 'glossary', 'summaries', 'suggestions'],
  },
};
|
|
85
|
+
|
|
86
|
+
/** The model-distilled fields of a digest, as parsed by `parseEmittedDigest`. */
export interface EmittedDistillation {
  /** Orientation text. */
  context: string;
  /** Glossary entries, in the digest's own glossary shape. */
  glossary: Digest['glossary'];
  /** One distilled summary per section id. */
  summaries: Array<{ id: string; summary: string }>;
  /** Suggested reader questions. */
  suggestions: string[];
}
|
|
92
|
+
|
|
93
|
+
/** The exact model-input payload the skill (or `build`) distils from. */
export interface DigestInput {
  /** Content hash of the corpus the sections were taken from. */
  contentHash: string;
  /** Digest path as given in the options (not resolved). */
  digestPath: string;
  /** True when the committed digest.json already matches this corpus — no rebuild needed. */
  upToDate: boolean;
  /** One entry per chunk: id, URL, display title and text. */
  sections: Array<{ id: string; url: string; title: string; text: string }>;
}
|
|
101
|
+
|
|
102
|
+
/**
 * Sections handed to the model: one entry per heading chunk.
 *
 * The title is `"<docTitle> > <heading>"` when the chunk has a heading and the
 * bare document title otherwise.
 */
export function corpusSections(corpus: CorpusBuild): DigestInput['sections'] {
  return corpus.chunks.map((chunk) => {
    const title = chunk.heading ? `${chunk.docTitle} > ${chunk.heading}` : chunk.docTitle;
    return { id: chunk.id, url: chunk.url, title, text: chunk.text };
  });
}
|
|
111
|
+
|
|
112
|
+
/**
 * Assembles the committed digest from a model distillation. Everything but the
 * distilled fields (`summary`, `glossary`, `context`, `suggestions`) is derived
 * here deterministically, so it is identical whether the distillation came from
 * the API or a Claude Code skill.
 *
 * @param emitted The parsed distillation.
 * @param corpus The chunked corpus the digest describes.
 * @returns A version-2 digest stamped with the current time and an empty edge list.
 */
export function assembleDigest(emitted: EmittedDistillation, corpus: CorpusBuild): Digest {
  // When an id appears more than once, the last summary wins (Map semantics).
  const summaryById = new Map<string, string>(
    emitted.summaries.map((entry): [string, string] => [entry.id, entry.summary]),
  );
  return {
    version: 2,
    generatedAt: new Date().toISOString(),
    contentHash: corpus.contentHash,
    context: emitted.context,
    glossary: emitted.glossary,
    overview: buildOverview(corpus.chunks),
    suggestions: emitted.suggestions,
    nodes: buildNodes(corpus.chunks, summaryById),
    edges: [],
  };
}
|
|
132
|
+
|
|
133
|
+
/**
 * `build` command: chunk the content, ask the model for a distillation through
 * the forced `emit_digest` tool call, and write the assembled digest.
 *
 * @param options Build options; see `DigestBuildOptions`.
 * @returns `'skipped'` when the existing digest is current, otherwise `'built'`.
 * @throws Error when a rebuild is needed and no API key is available.
 */
export async function buildDigest(options: DigestBuildOptions): Promise<DigestBuildResult> {
  const corpus = await buildCorpus(options);
  const outPath = path.resolve(options.siteRoot, options.digestPath);
  const existing = await readExistingDigest(outPath);
  // Skip only when the committed artifact is already a current-version digest with
  // nodes built from this exact corpus. A v1 (node-less) artifact always rebuilds.
  if (existing && existing.version === 2 && existing.contentHash === corpus.contentHash && existing.nodes.length > 0) {
    return { status: 'skipped', path: outPath, contentHash: corpus.contentHash, chunks: corpus.chunks.length };
  }

  const apiKey = options.apiKey ?? process.env.ANTHROPIC_API_KEY;
  if (!apiKey) throw new Error('ANTHROPIC_API_KEY is required to build a fresh digest.');

  // One header-plus-text record per section, separated by `---` rules.
  const corpusText = corpusSections(corpus)
    .map((section) => `id: ${section.id}\nurl: ${section.url}\ntitle: ${section.title}\n\n${section.text}`)
    .join('\n\n---\n\n');

  const response = await callClaude({
    apiKey,
    model: options.digestModel,
    maxTokens: 8192,
    system: [
      {
        type: 'text',
        text: DIGEST_SYSTEM_PROMPT,
      },
      {
        type: 'text',
        text: `<corpus>\n${corpusText}\n</corpus>`,
        cache_control: { type: 'ephemeral' },
      },
    ],
    messages: [
      {
        role: 'user',
        content:
          'Emit the context, glossary, one summary per section id, and 3-5 suggested questions. Every id in the corpus must get a summary.',
      },
    ],
    tools: [DIGEST_TOOL],
    toolChoice: { type: 'tool', name: 'emit_digest' },
  });

  // When the response carries no `emit_digest` call, `null` is parsed instead;
  // sections without a summary then fall back to an excerpt in `buildNodes`.
  const toolUse = response.content.find((block) => block.type === 'tool_use' && block.name === 'emit_digest');
  const emitted = parseEmittedDigest(toolUse?.type === 'tool_use' ? toolUse.input : null);
  const digest = assembleDigest(emitted, corpus);

  await writeGraph(outPath, digest);
  return { status: 'built', path: outPath, contentHash: corpus.contentHash, chunks: corpus.chunks.length };
}
|
|
183
|
+
|
|
184
|
+
/** Shared instruction for the model step, whether it runs via API or a skill. */
export const DIGEST_SYSTEM_PROMPT =
  'You build documentation digests for an AI search agent. Return only the forced tool call. Write a compact orientation, a glossary with aliases real users would type, one tight summary for every section id in the corpus, and 3-5 natural questions a reader might ask that these docs answer. Summaries are what the agent reasons from, so make them faithful and self-contained; paraphrase prose but never restate code, flags, or exact identifiers.';
|
|
187
|
+
|
|
188
|
+
/**
 * Reads the forced tool call (or a skill's distillation file) into the emit shape.
 *
 * `context`, `glossary` and `suggestions` come from `normalizeDigest`. The
 * summaries are validated here: an entry is kept only when it is an object with
 * a string `id` and a string `summary` (which is trimmed).
 *
 * @param input Untrusted JSON value; a missing or non-array `summaries` yields `[]`.
 */
export function parseEmittedDigest(input: unknown): EmittedDistillation {
  const base = normalizeDigest(input);
  const raw = (input ?? {}) as { summaries?: unknown };
  const summaries: Array<{ id: string; summary: string }> = [];
  if (Array.isArray(raw.summaries)) {
    for (const entry of raw.summaries) {
      if (!entry || typeof entry !== 'object') continue;
      const maybe = entry as { id?: unknown; summary?: unknown };
      if (typeof maybe.id !== 'string' || typeof maybe.summary !== 'string') continue;
      summaries.push({ id: maybe.id, summary: maybe.summary.trim() });
    }
  }
  return { context: base.context, glossary: base.glossary, summaries, suggestions: base.suggestions };
}
|
|
204
|
+
|
|
205
|
+
/**
 * Writes the digest as 2-space-indented JSON with a trailing newline, creating
 * the parent directory when needed.
 */
async function writeGraph(outPath: string, digest: Digest): Promise<void> {
  await mkdir(path.dirname(outPath), { recursive: true });
  const json = JSON.stringify(digest, null, 2) + '\n';
  await writeFile(outPath, json, 'utf8');
}
|
|
209
|
+
|
|
210
|
+
/**
 * `corpus` command: chunk the content and write the model-input payload (plus a
 * freshness flag) for a Claude Code skill to distil. Fully deterministic, keyless.
 *
 * @param options Corpus options plus `digestPath` (the committed digest to
 *   compare against) and `outPath` (payload destination); both are resolved
 *   against `siteRoot`.
 * @returns The absolute payload path, the freshness flag and the section count.
 */
export async function writeCorpusInput(options: {
  siteRoot: string;
  collections: string[] | null;
  basePath: string;
  digestPath: string;
  outPath: string;
  digestContentGlobs?: string[];
  chunkHeadingDepth: number;
}): Promise<{ path: string; upToDate: boolean; sections: number }> {
  const corpus = await buildCorpus(options);
  const committed = await readExistingDigest(path.resolve(options.siteRoot, options.digestPath));
  // Same freshness rule as `buildDigest`: version 2, matching hash, at least one node.
  const upToDate = Boolean(
    committed && committed.version === 2 && committed.contentHash === corpus.contentHash && committed.nodes.length > 0,
  );
  const payload: DigestInput = {
    contentHash: corpus.contentHash,
    digestPath: options.digestPath,
    upToDate,
    sections: corpusSections(corpus),
  };
  const outPath = path.resolve(options.siteRoot, options.outPath);
  await mkdir(path.dirname(outPath), { recursive: true });
  await writeFile(outPath, JSON.stringify(payload, null, 2) + '\n', 'utf8');
  return { path: outPath, upToDate, sections: payload.sections.length };
}
|
|
239
|
+
|
|
240
|
+
/**
 * `assemble` command: read a skill-produced distillation, re-chunk the content
 * from disk, and write the committed digest with the deterministic parts computed
 * in code. Keyless — the model never runs here.
 *
 * @param options Corpus options plus `inputPath` (the distillation JSON) and
 *   `digestPath` (output); both are resolved against `siteRoot`.
 * @returns A `'built'` result for the written digest.
 * @throws Error when the distillation file cannot be read or parsed.
 */
export async function assembleFromDistillation(options: {
  siteRoot: string;
  collections: string[] | null;
  basePath: string;
  digestPath: string;
  inputPath: string;
  digestContentGlobs?: string[];
  chunkHeadingDepth: number;
}): Promise<DigestBuildResult> {
  const inputPath = path.resolve(options.siteRoot, options.inputPath);
  let raw: unknown;
  try {
    raw = JSON.parse(await readFile(inputPath, 'utf8'));
  } catch {
    // Read failures and JSON syntax errors are reported with the same message.
    throw new Error(`Could not read distillation JSON at ${options.inputPath}. Run \`ask digest corpus\` first.`);
  }
  const corpus = await buildCorpus(options);
  const digest = assembleDigest(parseEmittedDigest(raw), corpus);
  const outPath = path.resolve(options.siteRoot, options.digestPath);
  await writeGraph(outPath, digest);
  return { status: 'built', path: outPath, contentHash: corpus.contentHash, chunks: corpus.chunks.length };
}
|
|
267
|
+
|
|
268
|
+
/**
 * Assembles section nodes. Everything but `summary` is derived deterministically.
 *
 * @param chunks Corpus chunks, one node each.
 * @param summaryById Model summaries keyed by chunk id; a missing or blank
 *   summary falls back to an excerpt of the chunk text.
 * @returns Nodes sorted by id.
 */
export function buildNodes(chunks: Chunk[], summaryById: Map<string, string>): DigestNode[] {
  const nodes = chunks.map((chunk): DigestNode => {
    const facts = extractFacts(chunk.id, chunk.raw);
    const summary = summaryById.get(chunk.id)?.trim() || excerpt(chunk.text);
    // Terms are drawn from the heading, the summary, the fact literals and the text.
    const factLiterals = facts.map((fact) => fact.literal).join(' ');
    const terms = distinctiveTokens([chunk.heading ?? '', summary, factLiterals, chunk.text].join(' '));
    return {
      id: chunk.id,
      kind: 'section',
      title: chunk.docTitle,
      heading: chunk.heading ?? null,
      group: chunk.group ?? null,
      url: chunk.url,
      summary,
      facts,
      sources: [{ chunkId: chunk.id, url: chunk.url, anchor: chunk.anchorId ?? null }],
      mode: classifyMode(chunk.group),
      terms,
    };
  });
  return nodes.sort((a, b) => a.id.localeCompare(b.id));
}
|
|
293
|
+
|
|
294
|
+
/**
 * Deterministic grouped table of contents — the agent's cheap high-level map.
 *
 * Chunks without a group are listed under `Docs`. Groups are sorted by name;
 * within a group, chunks keep their input order.
 *
 * @returns Markdown with one `## <group>` line per group followed by one
 *   bullet per chunk, or the empty string when there are no chunks.
 */
export function buildOverview(chunks: Chunk[]): string {
  const byGroup = new Map<string, Chunk[]>();
  for (const chunk of chunks) {
    const group = chunk.group ?? 'Docs';
    const bucket = byGroup.get(group);
    if (bucket) bucket.push(chunk);
    else byGroup.set(group, [chunk]);
  }

  const lines: string[] = [];
  const sortedGroups = [...byGroup.entries()].sort((a, b) => a[0].localeCompare(b[0]));
  for (const [group, items] of sortedGroups) {
    lines.push(`## ${group}`);
    for (const chunk of items) {
      lines.push(`- ${chunk.heading ?? chunk.docTitle} — \`${chunk.id}\``);
    }
  }
  return lines.join('\n');
}
|
|
310
|
+
|
|
311
|
+
/**
 * Collapses whitespace runs to single spaces and truncates to `max` characters,
 * appending `…` when text was cut.
 */
function excerpt(text: string, max = 220): string {
  const trimmed = text.replace(/\s+/g, ' ').trim();
  if (trimmed.length <= max) return trimmed;
  // Drop trailing whitespace left by the cut before adding the ellipsis.
  return trimmed.slice(0, max).trimEnd() + '…';
}
|
|
315
|
+
|
|
316
|
+
/**
 * Reads every content file, parses its frontmatter and chunks the body.
 *
 * `title` falls back to the slug, and `group`/`description` to `undefined`,
 * when the frontmatter value is not a string.
 *
 * @returns The documents (sorted by slug), the chunks (sorted by id) and the
 *   SHA-256 of the hashable chunk text.
 */
export async function buildCorpus(options: {
  siteRoot: string;
  collections: string[] | null;
  basePath: string;
  digestContentGlobs?: string[];
  chunkHeadingDepth: number;
}): Promise<CorpusBuild> {
  const files = await resolveContentFiles(options.siteRoot, options.collections, options.digestContentGlobs);
  const documents = await Promise.all(
    files.map(async (file) => {
      const raw = await readFile(file, 'utf8');
      const { data, body } = parseFrontmatter(raw);
      const slug = slugFromFile(options.siteRoot, file, options.collections);
      return {
        slug,
        title: typeof data.title === 'string' ? data.title : slug,
        group: typeof data.group === 'string' ? data.group : undefined,
        description: typeof data.description === 'string' ? data.description : undefined,
        body,
      } satisfies SourceDocument;
    }),
  );

  // Sorted in place: the returned `documents` are in slug order too.
  documents.sort((a, b) => a.slug.localeCompare(b.slug));
  const chunks = documents
    .flatMap((doc) => chunkDocument(doc, options.basePath, options.chunkHeadingDepth))
    .sort((a, b) => a.id.localeCompare(b.id));
  const contentHash = sha256(hashableChunkText(chunks));
  return { documents, chunks, contentHash };
}
|
|
346
|
+
|
|
347
|
+
/** Returns the SHA-256 digest of `text` as a lowercase hex string. */
export function sha256(text: string): string {
  const hash = createHash('sha256');
  hash.update(text);
  return hash.digest('hex');
}
|
|
350
|
+
|
|
351
|
+
/**
 * Reads and normalizes the digest at `file`.
 *
 * Best-effort: any failure (unreadable file, invalid JSON, or an error thrown
 * during normalization) yields `null`, which callers treat as "no digest".
 */
async function readExistingDigest(file: string): Promise<Digest | null> {
  try {
    const parsed: unknown = JSON.parse(await readFile(file, 'utf8'));
    return normalizeDigest(parsed);
  } catch {
    return null;
  }
}
|
|
358
|
+
|
|
359
|
+
/**
 * Resolves the content files to digest.
 *
 * Uses `digestContentGlobs` when non-empty; otherwise one
 * `src/content/<collection>/**` glob for `.md`/`.mdx` files per collection
 * (defaulting to `docs`).
 *
 * @returns De-duplicated file paths, sorted with `localeCompare`.
 */
async function resolveContentFiles(
  siteRoot: string,
  collections: string[] | null,
  digestContentGlobs: string[] | undefined,
): Promise<string[]> {
  const collectionsForDefault = collections?.length ? collections : ['docs'];
  const globs = digestContentGlobs?.length
    ? digestContentGlobs
    : collectionsForDefault.map((collection) => `src/content/${collection}/**/*.{md,mdx}`);

  // A Set, because overlapping globs can match the same file.
  const files = new Set<string>();
  for (const glob of globs) {
    for (const file of await filesForGlob(siteRoot, glob)) files.add(file);
  }

  return [...files].sort((a, b) => a.localeCompare(b));
}
|
|
377
|
+
|
|
378
|
+
/**
 * Lists the files under `siteRoot` that match one glob.
 *
 * Walks the glob's static root directory, then filters on the
 * forward-slash-normalized path relative to `siteRoot`. A root that cannot be
 * read yields no files.
 */
async function filesForGlob(siteRoot: string, glob: string): Promise<string[]> {
  const normalized = glob.replace(/\\/g, '/');
  const root = path.resolve(siteRoot, globRoot(normalized));
  const all = await walk(root).catch(() => []);
  const re = globToRegex(normalized);
  return all.filter((file) => re.test(path.relative(siteRoot, file).replace(/\\/g, '/')));
}
|
|
386
|
+
|
|
387
|
+
/**
 * Returns the static directory prefix of a glob — the deepest directory that is
 * certain to contain every match.
 *
 * A glob with no `*` or `{` is treated as a file path and yields its directory.
 * Otherwise the prefix is everything before the last separator that precedes
 * the first wildcard. When no separator precedes it (e.g. `docs*` followed by
 * a file pattern), the partial segment is not a directory, so `.` is returned.
 */
function globRoot(glob: string): string {
  const wildcard = glob.search(/[*{]/);
  if (wildcard === -1) return path.dirname(glob);
  const lastSeparator = Math.max(glob.lastIndexOf('/', wildcard), glob.lastIndexOf('\\', wildcard));
  // `<= 0` covers both "no separator" and a lone leading separator.
  return lastSeparator > 0 ? glob.slice(0, lastSeparator) : '.';
}
|
|
393
|
+
|
|
394
|
+
/**
 * Compiles a forward-slash glob into an anchored regular expression.
 *
 * Supported syntax:
 * - `**` followed by `/` matches zero or more whole directories;
 * - any other `**` matches any characters, including `/`;
 * - `*` matches any run of characters except `/`;
 * - `{a,b}` matches one of the comma-separated literal alternatives.
 *
 * Every other character, including `?`, is matched literally. The glob is
 * scanned once, so each character is escaped exactly once.
 */
function globToRegex(glob: string): RegExp {
  // Escapes a literal fragment so it matches itself inside a RegExp.
  const literal = (value: string): string => value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

  let pattern = '';
  let i = 0;
  while (i < glob.length) {
    const ch = glob.charAt(i);
    const close = ch === '{' ? glob.indexOf('}', i + 1) : -1;
    if (ch === '*' && glob.charAt(i + 1) === '*') {
      if (glob.charAt(i + 2) === '/') {
        pattern += '(?:.*/)?';
        i += 3;
      } else {
        pattern += '.*';
        i += 2;
      }
    } else if (ch === '*') {
      pattern += '[^/]*';
      i += 1;
    } else if (close > i + 1) {
      // Non-empty `{...}` group; an unterminated or empty one stays literal.
      const alternatives = glob.slice(i + 1, close).split(',').map(literal);
      pattern += `(${alternatives.join('|')})`;
      i = close + 1;
    } else {
      pattern += literal(ch);
      i += 1;
    }
  }
  return new RegExp(`^${pattern}$`);
}
|
|
402
|
+
|
|
403
|
+
/** Backslash-escapes every regular-expression metacharacter in `value`. */
function escapeRegex(value: string): string {
  return value.replace(/[.*+?^${}()|[\]\\]/g, (match) => `\\${match}`);
}
|
|
406
|
+
|
|
407
|
+
/**
 * Recursively lists every non-directory entry under `dir`.
 *
 * Sub-directories are walked concurrently. Rejects when a directory cannot be
 * read.
 */
async function walk(dir: string): Promise<string[]> {
  const entries = await readdir(dir, { withFileTypes: true });
  const nested = await Promise.all(
    entries.map(async (entry) => {
      const file = path.join(dir, entry.name);
      return entry.isDirectory() ? walk(file) : [file];
    }),
  );
  return nested.flat();
}
|
|
417
|
+
|
|
418
|
+
/**
 * Derives a document slug from a content file path.
 *
 * The slug is the path relative to the first collection directory
 * (`<siteRoot>/src/content/<collection>`) that contains the file. A file
 * outside every collection is made relative to `<siteRoot>/src/content`.
 */
function slugFromFile(siteRoot: string, file: string, collections: string[] | null): string {
  const normalizedFile = path.resolve(file);
  const collectionNames = collections?.length ? collections : ['docs'];
  for (const collection of collectionNames) {
    const root = path.resolve(siteRoot, 'src/content', collection);
    const rel = path.relative(root, normalizedFile);
    // A relative path that neither climbs out (`..`) nor is absolute is inside `root`.
    const inside = !rel.startsWith('..') && !path.isAbsolute(rel);
    if (inside) return cleanSlug(rel);
  }

  return cleanSlug(path.relative(path.resolve(siteRoot, 'src/content'), normalizedFile));
}
|
|
429
|
+
|
|
430
|
+
/**
 * Normalizes a relative file path into a slug: backslashes become `/`, a
 * trailing `.md`/`.mdx` extension (any case) is dropped, then a trailing
 * `/index` segment is dropped. A bare `index` is kept as is.
 */
function cleanSlug(rel: string): string {
  const forwardSlashed = rel.replace(/\\/g, '/');
  const withoutExtension = forwardSlashed.replace(/\.(md|mdx)$/i, '');
  return withoutExtension.replace(/\/index$/, '');
}
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { assembleFromDistillation, buildDigest, writeCorpusInput } from './build.ts';
|
|
3
|
+
import { verifyAnchors } from './verify.ts';
|
|
4
|
+
|
|
5
|
+
/** Parsed command-line flags; optional fields stay unset when the flag is absent. */
interface Flags {
  /** Every `--collection` value, in order. */
  collections: string[];
  /** `--base-path` */
  basePath?: string;
  /** `--digest-path` */
  digestPath?: string;
  /** Every `--content-glob` value, in order. */
  digestContentGlobs: string[];
  /** `--chunk-heading-depth`, converted with `Number()`. */
  chunkHeadingDepth?: number;
  /** `--digest-model` */
  digestModel?: string;
  /** `--build-command` */
  buildCommand?: string;
  /** `--skip-build` */
  skipBuild?: boolean;
  /** `--strict` */
  strict?: boolean;
  /** `--out` */
  out?: string;
  /** `--input` */
  input?: string;
}
|
|
18
|
+
|
|
19
|
+
// Entry point: the first argument selects the command (default `build`); the
// rest are flags.
const [command = 'build', ...args] = process.argv.slice(2);
const flags = parseFlags(args);

try {
  // Options shared by every command, with the CLI defaults applied once.
  const shared = {
    siteRoot: process.cwd(),
    collections: flags.collections.length ? flags.collections : ['docs'],
    basePath: flags.basePath ?? '/docs/',
    digestPath: flags.digestPath ?? '.hev-ask/digest.json',
    digestContentGlobs: flags.digestContentGlobs.length ? flags.digestContentGlobs : undefined,
    chunkHeadingDepth: flags.chunkHeadingDepth ?? 3,
  };

  if (command === 'build') {
    const result = await buildDigest({
      ...shared,
      digestModel: flags.digestModel ?? 'claude-opus-4-8',
    });
    console.log(`[hev-ask] digest:${result.status} ${result.path} (${result.chunks} chunks)`);
  } else if (command === 'corpus') {
    const result = await writeCorpusInput({
      ...shared,
      outPath: flags.out ?? '.hev-ask/digest-input.json',
    });
    const state = result.upToDate ? 'up-to-date' : 'needs-rebuild';
    console.log(`[hev-ask] digest:corpus ${result.path} (${result.sections} sections, ${state})`);
  } else if (command === 'assemble') {
    const result = await assembleFromDistillation({
      ...shared,
      inputPath: flags.input ?? '.hev-ask/digest-distill.json',
    });
    console.log(`[hev-ask] digest:${result.status} ${result.path} (${result.chunks} chunks)`);
  } else if (command === 'verify') {
    const result = await verifyAnchors({
      ...shared,
      buildCommand: flags.buildCommand,
      skipBuild: flags.skipBuild,
    });

    let failed = false;

    // Anchor drift is always fatal — it is fully deterministic and keyless.
    if (result.missing.length) {
      for (const miss of result.missing) {
        console.error(`[hev-ask] missing anchor ${miss.anchorId} for ${miss.url} in ${miss.file}`);
      }
      failed = true;
    }

    // Coverage + literal-fidelity warn by default; --strict makes them fatal.
    if (result.uncovered.length) {
      const sample = result.uncovered.slice(0, 5).join(', ');
      const more = result.uncovered.length > 5 ? `, …(+${result.uncovered.length - 5})` : '';
      console.warn(`[hev-ask] ${result.uncovered.length} section(s) missing from the digest: ${sample}${more} — run \`ask digest build\`.`);
      if (flags.strict) failed = true;
    }
    if (result.dropped.length) {
      console.warn(`[hev-ask] ${result.dropped.length} source literal(s) dropped from agent-primary nodes — run \`ask digest build\`:`);
      for (const drop of result.dropped.slice(0, 8)) console.warn(`  - ${drop.id}: ${drop.literal}`);
      if (flags.strict) failed = true;
    }

    if (failed) {
      process.exitCode = 1;
    } else {
      const warnings = result.dropped.length || result.uncovered.length ? ' (with warnings)' : '';
      console.log(`[hev-ask] verified ${result.checked} anchors${warnings}`);
    }
  } else {
    console.error(
      'Usage: ask digest build|corpus|assemble|verify [--collection docs] [--base-path /docs/] [--out path] [--input path] [--strict]',
    );
    process.exitCode = 1;
  }
} catch (err) {
  // A thrown value is not guaranteed to be an Error; never print "undefined".
  const message = err instanceof Error ? err.message : String(err);
  console.error(`[hev-ask] ${message}`);
  process.exitCode = 1;
}
|
|
108
|
+
|
|
109
|
+
/**
 * Parses the command's flag arguments.
 *
 * Value flags consume the following argument. A value flag whose following
 * argument is missing or empty is ignored and consumes nothing. Unknown
 * arguments are skipped. `--collection` and `--content-glob` may repeat.
 *
 * NOTE(review): `--chunk-heading-depth` is converted with `Number()`, so a
 * non-numeric value is stored as `NaN`.
 */
function parseFlags(args: string[]): Flags {
  const flags: Flags = { collections: [], digestContentGlobs: [] };
  for (let i = 0; i < args.length; i += 1) {
    const arg = args[i];
    const next = args[i + 1];

    // Boolean switches take no value.
    if (arg === '--skip-build') {
      flags.skipBuild = true;
      continue;
    }
    if (arg === '--strict') {
      flags.strict = true;
      continue;
    }

    // Everything below needs a value.
    if (!next) continue;
    switch (arg) {
      case '--collection':
        flags.collections.push(next);
        break;
      case '--base-path':
        flags.basePath = next;
        break;
      case '--digest-path':
        flags.digestPath = next;
        break;
      case '--content-glob':
        flags.digestContentGlobs.push(next);
        break;
      case '--chunk-heading-depth':
        flags.chunkHeadingDepth = Number(next);
        break;
      case '--digest-model':
        flags.digestModel = next;
        break;
      case '--build-command':
        flags.buildCommand = next;
        break;
      case '--out':
        flags.out = next;
        break;
      case '--input':
        flags.input = next;
        break;
      default:
        // Unknown argument: skip it without consuming the next one.
        continue;
    }
    i += 1; // the value has been consumed
  }
  return flags;
}
|