fullstackgtm 0.15.0 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/market.js ADDED
@@ -0,0 +1,395 @@
1
+ import { createHash } from "node:crypto";
2
+ import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync } from "node:fs";
3
+ import { join } from "node:path";
4
+ import { credentialsDir } from "./credentials.js";
5
// Ordinal strength of each intensity reading. When two observations land on
// the same vendor × claim cell, the one with the higher rank is kept.
const INTENSITY_RANK = { loud: 3, quiet: 2, absent: 1, unobservable: 0 };
11
// 32-bit FNV-1a over the string's UTF-16 code units, rendered as eight
// lowercase hex digits. Mirrors stableHash in rules.ts; kept as a local copy
// so market.ts stays importable without pulling the audit engine.
function fnv1a(value) {
    const OFFSET_BASIS = 0x811c9dc5;
    const PRIME = 0x01000193;
    let digest = OFFSET_BASIS;
    let position = 0;
    while (position < value.length) {
        // XOR the code unit in first, then multiply — the "1a" ordering.
        digest = Math.imul(digest ^ value.charCodeAt(position), PRIME);
        position += 1;
    }
    // `>>> 0` reinterprets the signed 32-bit result as unsigned before formatting.
    return (digest >>> 0).toString(16).padStart(8, "0");
}
21
/**
 * Deterministic id for one observation: `obs_` followed by the FNV-1a hash of
 * the pipe-joined category, run label, vendor id and claim id.
 */
export function observationId(category, runLabel, vendorId, claimId) {
    const cellKey = `${category}|${runLabel}|${vendorId}|${claimId}`;
    return "obs_" + fnv1a(cellKey);
}
24
+ // ---------------------------------------------------------------------------
25
+ // Config
26
/**
 * Parse a market config from JSON text and check its structural invariants.
 *
 * Checks performed: the document is a JSON object, `category` is present,
 * `vendors` and `claims` are non-empty arrays whose items each carry a unique
 * `id`, and `anchorVendor` (when set) names one of the vendors.
 *
 * @param raw JSON text of the config.
 * @returns The parsed config object, unchanged.
 * @throws SyntaxError when `raw` is not valid JSON.
 * @throws Error with a `market config: …` message when a check fails.
 */
export function parseMarketConfig(raw) {
    const config = JSON.parse(raw);
    // JSON.parse also returns null, primitives and arrays for valid JSON text;
    // reject those here so the caller gets a config error, not a TypeError.
    if (config === null || typeof config !== "object" || Array.isArray(config)) {
        throw new Error("market config: expected a JSON object");
    }
    if (!config.category)
        throw new Error("market config: missing category");
    if (!Array.isArray(config.vendors) || config.vendors.length === 0) {
        throw new Error("market config: at least one vendor is required");
    }
    if (!Array.isArray(config.claims) || config.claims.length === 0) {
        throw new Error("market config: at least one claim is required");
    }
    for (const [label, items] of [
        ["vendor", config.vendors],
        ["claim", config.claims],
    ]) {
        const seen = new Set();
        for (const item of items) {
            // `?.` so a null entry is reported as "missing id" rather than crashing.
            if (!item?.id)
                throw new Error(`market config: ${label} missing id`);
            if (seen.has(item.id))
                throw new Error(`market config: duplicate ${label} id "${item.id}"`);
            seen.add(item.id);
        }
    }
    if (config.anchorVendor && !config.vendors.some((v) => v.id === config.anchorVendor)) {
        throw new Error(`market config: anchorVendor "${config.anchorVendor}" is not in vendors`);
    }
    return config;
}
54
/** Read the file at `path` as UTF-8 and run it through `parseMarketConfig`. */
export function loadMarketConfig(path) {
    const raw = readFileSync(path, "utf8");
    return parseMarketConfig(raw);
}
57
/**
 * Build a placeholder config for `category`: one example vendor (also set as
 * the anchor vendor), one example claim, and a default surface rule. Every
 * value except `category` is template text meant to be replaced.
 */
export function starterMarketConfig(category) {
    const placeholderVendor = {
        id: "your-company",
        name: "Your Company",
        urls: { home: "https://example.com/", pricing: null, product: [] },
        notes: "Replace with the real vendor set (≤10 works well). pricing: null records 'no public pricing page'.",
    };
    const placeholderClaim = {
        id: "example-claim",
        capability: "Example capability: what is being claimed, stated precisely",
        icp: "who-buys-it",
        pricingStructure: "how-it-is-priced",
        definition: "LOUD if the claim is hero copy or a top-nav named product with a dedicated page; QUIET if it appears only on pages below that; ABSENT if nowhere. Write the definition so a human could judge any vendor's page against it.",
    };
    const surfaceRule = "LOUD = hero copy OR top-level-nav named product with dedicated page; QUIET = present on any indexed page below that; ABSENT = nowhere observed (explicit disavowals score ABSENT with the disavowal quoted in reason); UNOBSERVABLE = capture empty/failed — never score ABSENT from a failed capture.";
    return {
        category,
        anchorVendor: placeholderVendor.id,
        vendors: [placeholderVendor],
        claims: [placeholderClaim],
        surfaceRule,
    };
}
81
+ // ---------------------------------------------------------------------------
82
+ // Profile-scoped market home: captures and observations live with credentials
83
+ // so --profile isolation covers category intel too.
84
/**
 * Directory holding one category's market data: `<base>/market/<category>`.
 * `baseDir` wins when given; otherwise the credentials directory is the base.
 */
export function marketHome(category, baseDir) {
    const root = baseDir ?? credentialsDir();
    return join(root, "market", category);
}
87
+ // ---------------------------------------------------------------------------
88
+ // Capture: fetch vendor pages, strip to readable text, store content-addressed.
89
+ // The hash cache is the change detector (unchanged page = same hash = no new
90
+ // classification needed), the replay buffer (re-judge a revised taxonomy
91
+ // without re-scraping), and the evidence chain (quoted spans stay resolvable).
92
// Elements removed together with their content before any text is extracted.
const STRIP_BLOCKS = /<(script|style|noscript|svg|head)\b[\s\S]*?<\/\1\s*>/gi;
// Named entities decoded to a literal character. Numeric character references
// are decoded separately in decodeEntity.
const ENTITIES = {
    "&amp;": "&",
    "&lt;": "<",
    "&gt;": ">",
    "&quot;": '"',
    "&#39;": "'",
    "&apos;": "'",
    "&nbsp;": " ",
    "&mdash;": "—",
    "&ndash;": "–",
};
/**
 * Decode one `&…;` reference. Known named entities come from ENTITIES;
 * decimal (`&#8217;`) and hexadecimal (`&#x27;`) references become the
 * character they name; anything else becomes a single space.
 */
function decodeEntity(entity) {
    const named = ENTITIES[entity.toLowerCase()];
    if (named !== undefined)
        return named;
    const numeric = /^&#(?:x([0-9a-f]+)|([0-9]+));$/i.exec(entity);
    if (numeric) {
        const codePoint = numeric[1] !== undefined ? parseInt(numeric[1], 16) : parseInt(numeric[2], 10);
        // String.fromCodePoint throws above U+10FFFF; NUL and lone surrogates
        // are not text, so they fall through to the space fallback as well.
        const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
        if (codePoint > 0 && codePoint <= 0x10ffff && !isSurrogate) {
            // A numeric non-breaking space reads the same as `&nbsp;` above.
            return codePoint === 0xa0 ? " " : String.fromCodePoint(codePoint);
        }
    }
    return " ";
}
/**
 * Reduce an HTML document to readable plain text.
 *
 * Steps, in order: drop script/style/noscript/svg/head blocks, turn closing
 * block tags and `<br>` into line breaks, replace every remaining tag with a
 * space, decode entities, collapse runs of spaces and tabs, then trim each
 * line and drop the empty ones. Entities are decoded after tag stripping so
 * an escaped `&lt;b&gt;` survives as literal text.
 *
 * @param html Raw HTML source.
 * @returns Newline-separated non-empty text lines.
 */
export function extractReadableText(html) {
    const withoutBlocks = html.replace(STRIP_BLOCKS, " ");
    const withBreaks = withoutBlocks.replace(/<(\/p|\/div|\/li|\/h[1-6]|br\s*\/?)>/gi, "\n");
    const withoutTags = withBreaks.replace(/<[^>]+>/g, " ");
    const decoded = withoutTags
        .replace(/&[a-z#0-9]+;/gi, (entity) => decodeEntity(entity))
        .replace(/[ \t]+/g, " ");
    return decoded
        .split("\n")
        .map((line) => line.trim())
        .filter(Boolean)
        .join("\n");
}
117
// Upper bound for one page fetch, body download included. Without it a
// server that accepts the connection and then stalls would block the whole
// capture run indefinitely.
const FETCH_TIMEOUT_MS = 30_000;
/**
 * Default page fetcher: GET `url` with the tool's User-Agent, following
 * redirects, and resolve to the HTTP status plus the response body as text.
 * Rejects when the request fails or exceeds FETCH_TIMEOUT_MS.
 */
const defaultFetchPage = async (url) => {
    const response = await fetch(url, {
        headers: {
            "User-Agent": "fullstackgtm-market/0 (+https://github.com/fullstackgtm/core)",
            "Accept-Language": "en-US",
        },
        redirect: "follow",
        signal: AbortSignal.timeout(FETCH_TIMEOUT_MS),
    });
    return { status: response.status, body: await response.text() };
};
127
/** Pages to fetch for one vendor: home, then pricing (if set), then each product URL. */
function captureTargets(vendor) {
    const targets = [{ kind: "home", url: vendor.urls.home }];
    if (vendor.urls.pricing)
        targets.push({ kind: "pricing", url: vendor.urls.pricing });
    // `product` is not checked by config validation, so a config that leaves it
    // out must not abort the run with a TypeError.
    for (const url of vendor.urls.product ?? [])
        targets.push({ kind: "product", url });
    return targets;
}
/**
 * Fetch one page and reduce it to readable text. Only a 200 response yields
 * text; any thrown error (fetch or extraction) is recorded as status null
 * with empty text rather than propagated.
 */
async function fetchReadableText(fetchPage, url) {
    try {
        const page = await fetchPage(url);
        const text = page.status === 200 ? extractReadableText(page.body) : "";
        return { status: page.status, text };
    }
    catch {
        return { status: null, text: "" };
    }
}
/**
 * Fetch every configured vendor page, store non-empty text content-addressed
 * by SHA-256, and append one manifest entry per page.
 *
 * Pages are fetched one at a time in config order. The manifest file is
 * rewritten once, after all pages have been processed.
 *
 * @param config Market config; `category` and `vendors[].urls` are read.
 * @param options Optional overrides: `dir` (captures directory), `runLabel`
 *   (defaults to "run-1"), `fetchPage` (page fetcher) and `now` (clock).
 * @returns The entries produced by this run and the manifest path.
 */
export async function captureMarket(config, options = {}) {
    const dir = options.dir ?? join(marketHome(config.category), "captures");
    const runLabel = options.runLabel ?? "run-1";
    const fetchPage = options.fetchPage ?? defaultFetchPage;
    // One timestamp for the whole run, taken before any page is fetched.
    const fetchedAt = (options.now ?? (() => new Date()))().toISOString();
    mkdirSync(dir, { recursive: true });
    const manifestPath = join(dir, "manifest.json");
    const manifest = existsSync(manifestPath)
        ? JSON.parse(readFileSync(manifestPath, "utf8"))
        : [];
    const entries = [];
    for (const vendor of config.vendors) {
        for (const target of captureTargets(vendor)) {
            const { status, text } = await fetchReadableText(fetchPage, target.url);
            let captureHash = null;
            if (text) {
                captureHash = createHash("sha256").update(text).digest("hex");
                // Content-addressed: an unchanged page dedupes to the same file.
                writeFileSync(join(dir, `${captureHash}.txt`), text);
            }
            const entry = {
                runLabel,
                vendorId: vendor.id,
                kind: target.kind,
                url: target.url,
                fetchedAt,
                httpStatus: status,
                captureHash,
                textChars: text.length,
            };
            manifest.push(entry);
            entries.push(entry);
        }
    }
    writeFileSync(manifestPath, `${JSON.stringify(manifest, null, 2)}\n`);
    return { entries, manifestPath };
}
181
/**
 * File-backed, append-only store of observation sets for one category. Each
 * run is one `<runLabel>.json` file in the store directory.
 *
 * @param category Category the store belongs to; `append` rejects other sets.
 * @param directory Store directory; defaults to `observations` under the
 *   category's market home.
 * @returns An object with async `append`, `get`, `list` and `latest` methods.
 */
export function createFileObservationStore(category, directory) {
    const dir = directory ?? join(marketHome(category), "observations");
    // Run labels become file names, so only word characters, dots and dashes
    // are accepted.
    function fileFor(runLabel) {
        if (!/^[\w.-]+$/.test(runLabel))
            throw new Error(`Invalid run label: ${runLabel}`);
        return join(dir, `${runLabel}.json`);
    }
    // Best-effort read: a missing file, an invalid label or unparsable JSON
    // all yield null.
    function read(runLabel) {
        try {
            return JSON.parse(readFileSync(fileFor(runLabel), "utf8"));
        }
        catch {
            return null;
        }
    }
    // All readable sets, oldest first by `runAt`.
    function listSets() {
        let names = [];
        try {
            names = readdirSync(dir).filter((name) => name.endsWith(".json"));
        }
        catch {
            return [];
        }
        return names
            .map((name) => read(name.replace(/\.json$/, "")))
            .filter((set) => set !== null)
            // A set without `runAt` sorts first instead of crashing the listing.
            .sort((a, b) => String(a.runAt ?? "").localeCompare(String(b.runAt ?? "")));
    }
    return {
        async append(set) {
            if (set.category !== category) {
                throw new Error(`Observation set category "${set.category}" does not match store "${category}"`);
            }
            // Validate the label before touching the disk.
            const target = fileFor(set.runLabel);
            mkdirSync(dir, { recursive: true });
            try {
                // "wx" creates the file exclusively, so the append-only rule is
                // enforced by the filesystem: an existing file (even one that
                // no longer parses) is never overwritten.
                writeFileSync(target, `${JSON.stringify(set, null, 2)}\n`, { flag: "wx" });
            }
            catch (error) {
                if (error && error.code === "EEXIST") {
                    throw new Error(`Run "${set.runLabel}" already exists — observations are append-only; use a new run label`);
                }
                throw error;
            }
            return set;
        },
        async get(runLabel) {
            return read(runLabel);
        },
        async list() {
            return listSets().map((set) => ({
                runLabel: set.runLabel,
                runAt: set.runAt,
                observations: set.observations.length,
            }));
        },
        async latest() {
            const sets = listSets();
            return sets.length ? sets[sets.length - 1] : null;
        },
    };
}
237
/**
 * Validate a proposed observation set against the config before it enters
 * the store: known vendors/claims, no duplicate cells, legal intensities,
 * the verbatim-evidence rule (loud and quiet readings must quote something),
 * and full vendor × claim coverage.
 *
 * Malformed input is reported as a problem rather than thrown: a set without
 * an `observations` array, or a reading without an `evidence` array.
 *
 * @param config Market config supplying the vendor and claim ids.
 * @param set Observation set to check.
 * @returns Problem descriptions; an empty array means accept.
 */
export function validateObservationSet(config, set) {
    const problems = [];
    const vendorIds = new Set(config.vendors.map((v) => v.id));
    const claimIds = new Set(config.claims.map((c) => c.id));
    const seen = new Set();
    const hasObservations = Array.isArray(set?.observations);
    if (!hasObservations)
        problems.push("observation set has no observations array");
    for (const obs of hasObservations ? set.observations : []) {
        const cell = `${obs.vendorId} × ${obs.claimId}`;
        if (!vendorIds.has(obs.vendorId))
            problems.push(`unknown vendor "${obs.vendorId}"`);
        if (!claimIds.has(obs.claimId))
            problems.push(`unknown claim "${obs.claimId}"`);
        if (seen.has(cell))
            problems.push(`duplicate observation for ${cell}`);
        seen.add(cell);
        // Own-property test: a plain lookup would accept inherited names such
        // as "constructor" or "toString" as valid intensities.
        const knownIntensity = typeof obs.intensity === "string" &&
            Object.prototype.hasOwnProperty.call(INTENSITY_RANK, obs.intensity);
        if (!knownIntensity) {
            problems.push(`${cell}: invalid intensity "${obs.intensity}"`);
        }
        const evidenceCount = Array.isArray(obs.evidence) ? obs.evidence.length : 0;
        if ((obs.intensity === "loud" || obs.intensity === "quiet") && evidenceCount === 0) {
            problems.push(`${cell}: ${obs.intensity} reading with no quoted evidence`);
        }
    }
    for (const vendor of config.vendors) {
        for (const claim of config.claims) {
            if (!seen.has(`${vendor.id} × ${claim.id}`)) {
                problems.push(`missing observation for ${vendor.id} × ${claim.id}`);
            }
        }
    }
    return problems;
}
273
+ // ---------------------------------------------------------------------------
274
+ // Evidence span verification — the deterministic gate that makes the
275
+ // verbatim-quote rule mechanical instead of a prompt instruction. Because the
276
+ // source documents are *stored* (unlike call transcripts, which pass through),
277
+ // every quoted span can be checked against the capture it cites before the
278
+ // observation is accepted. Comparison is whitespace-normalized only: case and
279
+ // wording must match the page exactly.
280
/**
 * Load a category's capture manifest together with the stored text of every
 * capture it references.
 *
 * @param category Category whose captures to load.
 * @param directory Captures directory; defaults to `captures` under the
 *   category's market home.
 * @returns `entries` (the manifest, or an empty array when there is none) and
 *   `textByHash`, which maps each capture hash to its text. Hashes whose file
 *   cannot be read are left out of the map.
 */
export function loadCaptureTexts(category, directory) {
    const dir = directory ?? join(marketHome(category), "captures");
    const manifestPath = join(dir, "manifest.json");
    let entries = [];
    if (existsSync(manifestPath)) {
        entries = JSON.parse(readFileSync(manifestPath, "utf8"));
    }
    // Distinct hashes in first-seen order, so each capture file is read once.
    const referenced = new Set();
    for (const { captureHash } of entries) {
        if (captureHash)
            referenced.add(captureHash);
    }
    const textByHash = new Map();
    referenced.forEach((hash) => {
        try {
            textByHash.set(hash, readFileSync(join(dir, `${hash}.txt`), "utf8"));
        }
        catch {
            // Missing capture file: verification of anything citing it will fail loudly.
        }
    });
    return { entries, textByHash };
}
299
/**
 * Canonical form used when checking a quote against a capture. Whitespace
 * that directly precedes `. , ; : ! ?` is removed (text extraction can leave
 * a line break there, as in "placements\n. Districts"), every remaining
 * whitespace run becomes one space, and the ends are trimmed. Words, casing
 * and all other characters are left untouched.
 */
export function normalizeForMatch(value) {
    const punctuationTightened = value.replace(/\s+([.,;:!?])/g, (_gap, mark) => mark);
    const singleSpaced = punctuationTightened.replace(/\s+/g, " ");
    return singleSpaced.trim();
}
312
/**
 * Check every evidence quote against the stored capture it cites.
 *
 * A quote fails when it carries no `metadata.captureHash`, when that hash is
 * not in `textByHash`, when the quote is empty after normalization, or when
 * the normalized quote is not a substring of the normalized capture text.
 * Observations without an `evidence` array contribute no failures.
 *
 * @param observations Observations whose `evidence` items are checked.
 * @param textByHash Map from capture hash to capture text.
 * @returns One failure record (vendorId, claimId, quote, problem) per bad quote.
 */
export function verifyEvidenceSpans(observations, textByHash) {
    const failures = [];
    // Each capture is normalized once, however many quotes cite it.
    const normalizedByHash = new Map();
    for (const obs of observations) {
        const fail = (quote, problem) => {
            failures.push({ vendorId: obs.vendorId, claimId: obs.claimId, quote, problem });
        };
        for (const evidence of obs.evidence ?? []) {
            const quote = evidence.text ?? "";
            const hash = String(evidence.metadata?.captureHash ?? "");
            if (!hash) {
                fail(quote, "evidence has no captureHash — spans must cite a stored capture");
                continue;
            }
            const captureText = textByHash.get(hash);
            if (captureText === undefined) {
                fail(quote, `capture ${hash.slice(0, 12)} not found — evidence must stay resolvable`);
                continue;
            }
            const needle = normalizeForMatch(quote);
            if (!needle) {
                // "".includes-style matching is always true, so an empty quote
                // would otherwise pass as verified evidence.
                fail(quote, "quote is empty — evidence must quote a verbatim span");
                continue;
            }
            let haystack = normalizedByHash.get(hash);
            if (haystack === undefined) {
                haystack = normalizeForMatch(captureText);
                normalizedByHash.set(hash, haystack);
            }
            if (!haystack.includes(needle)) {
                fail(quote, `quote not found verbatim in capture ${hash.slice(0, 12)}`);
            }
        }
    }
    return failures;
}
349
/**
 * Front rule v1, evaluated per claim from the number of vendors reading loud:
 * none → "open" when at least one vendor is quiet, otherwise "vacant";
 * one → "owned"; two or three → "contested"; four or more → "saturated".
 * Only loud and quiet readings are counted, so an unobservable cell (a failed
 * capture) never reads as absence. When a cell has several observations the
 * highest-ranked intensity is used.
 */
export function computeFrontStates(config, set) {
    const strongestByCell = new Map();
    for (const observation of set.observations) {
        const cellKey = `${observation.vendorId}|${observation.claimId}`;
        const current = strongestByCell.get(cellKey);
        const outranks = !current || INTENSITY_RANK[observation.intensity] > INTENSITY_RANK[current.intensity];
        if (outranks)
            strongestByCell.set(cellKey, observation);
    }
    // Vendor ids, in config order, whose reading for the claim has this intensity.
    const vendorsReading = (claimId, intensity) => config.vendors
        .filter((vendor) => strongestByCell.get(`${vendor.id}|${claimId}`)?.intensity === intensity)
        .map((vendor) => vendor.id);
    return config.claims.map((claim) => {
        const loudVendorIds = vendorsReading(claim.id, "loud");
        const quietVendorIds = vendorsReading(claim.id, "quiet");
        const loudCount = loudVendorIds.length;
        let state;
        if (loudCount >= 4) {
            state = "saturated";
        }
        else if (loudCount >= 2) {
            state = "contested";
        }
        else if (loudCount === 1) {
            state = "owned";
        }
        else {
            state = quietVendorIds.length > 0 ? "open" : "vacant";
        }
        return { claimId: claim.id, state, loudVendorIds, quietVendorIds };
    });
}
385
/**
 * What changed in the category between two runs — the refresh's whole point.
 * Returns one `{ claimId, before, after }` record for each claim in `after`
 * whose state differs from its state in `before`; claims with no earlier
 * state are not reported.
 */
export function diffFrontStates(before, after) {
    const earlierState = new Map();
    for (const { claimId, state } of before) {
        earlierState.set(claimId, state);
    }
    return after
        .filter(({ claimId, state }) => {
        const was = earlierState.get(claimId);
        return Boolean(was) && was !== state;
    })
        .map(({ claimId, state }) => ({ claimId, before: earlierState.get(claimId), after: state }));
}
@@ -0,0 +1,49 @@
1
+ import { type LlmCallOptions } from "./llm.ts";
2
+ import { type CaptureEntry, type MarketClaim, type MarketConfig, type ObservationSet } from "./market.ts";
3
/** Options accepted by `classifyMarket`. */
export type ClassifyMarketOptions = {
    /** LLM call settings. `classifyMarket` uses `llm.model` when set, otherwise the default model for `llm.provider`. */
    llm: LlmCallOptions;
    /** Observation run label to produce; must be new (the store is append-only). */
    runLabel: string;
    /** Capture run to classify; defaults to the run label of the last entry in the capture manifest. */
    captureRun?: string;
    /** Restrict to these vendor ids (e.g. one new vendor); defaults to all. An id not in the config makes `classifyMarket` throw. */
    vendors?: string[];
    /** Captures directory override (tests); defaults to the profile market home. */
    capturesDir?: string;
    /** Clock override; its ISO timestamp is stamped on every observation and used as the set's `runAt`. */
    now?: () => Date;
};
15
/** What `classifyMarket` resolves to. */
export type ClassifyMarketResult = {
    /** The observation set produced for the requested run label. */
    set: ObservationSet;
    /** Model identifier that was used for the LLM calls. */
    model: string;
    /**
     * Ids of vendors whose first answer failed the mechanical checks and was
     * retried. Because a failed retry makes `classifyMarket` throw, every
     * vendor listed here is one the retry fixed.
     */
    retriedVendorIds: string[];
};
21
/**
 * Classify messaging intensity for each selected vendor × claim using an LLM
 * and return the resulting observation set.
 *
 * Vendors with no usable captures in the chosen capture run are scored
 * `unobservable` on every claim without an LLM call.
 *
 * @throws Error when there are no captures, when the chosen capture run has
 *   no entries, when a requested vendor id is unknown, or when a vendor's
 *   answer still fails mechanical verification after one retry.
 */
export declare function classifyMarket(config: MarketConfig, options: ClassifyMarketOptions): Promise<ClassifyMarketResult>;
22
/**
 * The agent-driven alternative to LLM classification: a worksheet carrying
 * everything needed to classify one vendor by hand or by an agent driving
 * the CLI/MCP — claims with judging definitions, the surface rule, and the
 * captured page texts. Submissions come back through `market observe`,
 * which runs the same validation and span verification as `classify`.
 */
export type MarketWorksheet = {
    /** Category taken from the market config. */
    category: string;
    /** Label of the capture run the pages were taken from. */
    captureRun: string;
    /** The config's surface rule, when it defines one. */
    surfaceRule?: string;
    /** The vendor this worksheet is for. */
    vendor: {
        id: string;
        name: string;
    };
    /** All claims from the config. */
    claims: MarketClaim[];
    /** This vendor's pages from the capture run that have a capture hash. */
    pages: Array<{
        kind: CaptureEntry["kind"];
        url: string;
        /** Hash to cite in evidence metadata when quoting this page. */
        captureHash: string;
        /** Stored capture text; an empty string when the text could not be loaded. */
        text: string;
    }>;
    /** Fixed instructions telling the classifier how to fill in and submit the worksheet. */
    instructions: string;
};
46
/**
 * Build the classification worksheet for one vendor.
 *
 * @param config Market config supplying the category, vendors, claims and surface rule.
 * @param vendorId Id of the vendor to build the worksheet for.
 * @param options `captureRun` selects the capture run (defaults to the run of
 *   the last manifest entry); `capturesDir` overrides the captures directory.
 * @throws Error when `vendorId` is not in the config or when no capture run
 *   can be determined.
 */
export declare function buildWorksheet(config: MarketConfig, vendorId: string, options?: {
    captureRun?: string;
    capturesDir?: string;
}): MarketWorksheet;
@@ -0,0 +1,201 @@
1
+ import { DEFAULT_MODELS, forcedToolCall } from "./llm.js";
2
+ import { loadCaptureTexts, observationId, verifyEvidenceSpans, } from "./market.js";
3
+ /**
4
+ * LLM intensity classification for the market map — the same
5
+ * semi-deterministic posture as call extraction, with one upgrade calls
6
+ * can't have: because the source pages are stored captures, every quoted
7
+ * span is verified mechanically against the capture it cites before the
8
+ * observation is accepted. A reading whose quote isn't verbatim on the page
9
+ * bounces back to the model once with the failures named; if it still can't
10
+ * quote the page, classification fails rather than storing unverifiable
11
+ * evidence.
12
+ *
13
+ * Deterministic parts stay deterministic: vendors with no usable captures
14
+ * score UNOBSERVABLE on every claim without an LLM call, and front states
15
+ * downstream are computed from the store, never from model output.
16
+ */
17
// Bound cost and context: a vendor's pages are classified in one call.
// buildDossier splits this character budget evenly across the vendor's
// usable pages and truncates the middle of any page that exceeds its share.
const MAX_DOSSIER_CHARS = 48_000;
// Fixed instruction preamble placed at the start of every classification
// prompt. This is runtime text sent to the model: changing a word changes
// model behavior.
const CLASSIFY_INSTRUCTIONS = `Classify this vendor's messaging intensity for EVERY claim listed.
Rules:
- Judge ONLY from the captured pages below. Do not use outside knowledge of the vendor.
- intensity per the surface rule: "loud" = hero copy or a top-level-nav named product/program with a dedicated page; "quiet" = present on any page below that; "absent" = nowhere in the captures.
- evidence quotes MUST be verbatim spans copied exactly from the captured text (≤300 chars). Every loud or quiet reading needs at least one quote. If you cannot quote it, the reading is absent.
- An explicit disavowal ("we do not offer X", "call 988") is absent — put the disavowal quote in reason, it is informative signal.
- url must be the page the quote came from, exactly as given in the page headers below.
- reason: one reviewer-facing sentence.
- Return a reading for every claim id. Never invent claim ids.`;
28
// JSON schema for the forced tool call: an object with a `readings` array
// holding one reading per claim. `claimId` is restricted to the given ids.
const classifySchema = (claimIds) => {
    const evidenceItem = {
        type: "object",
        required: ["quote", "url"],
        properties: {
            quote: { type: "string", description: "VERBATIM span copied exactly from the captured page text. Never paraphrase." },
            url: { type: "string", description: "The page URL the quote came from, exactly as shown in the page header." },
        },
    };
    const reading = {
        type: "object",
        required: ["claimId", "intensity", "confidence", "reason", "evidence"],
        properties: {
            claimId: { type: "string", enum: claimIds },
            intensity: { type: "string", enum: ["loud", "quiet", "absent"] },
            confidence: { type: "string", enum: ["high", "medium", "low"] },
            reason: { type: "string", description: "One reviewer-facing sentence." },
            evidence: { type: "array", items: evidenceItem },
        },
    };
    return {
        type: "object",
        required: ["readings"],
        properties: {
            readings: { type: "array", items: reading },
        },
    };
};
58
// Render a vendor's usable captures as one prompt section per page. Entries
// without a capture hash, or whose text is not in `textByHash`, are skipped.
// MAX_DOSSIER_CHARS is shared evenly between the pages; a page longer than
// its share keeps its first and last halves around a truncation marker.
// Returns "" when no page is usable.
function buildDossier(entries, textByHash) {
    const usable = [];
    for (const entry of entries) {
        if (entry.captureHash && textByHash.has(entry.captureHash)) {
            usable.push({ entry, text: textByHash.get(entry.captureHash) });
        }
    }
    if (usable.length === 0) {
        return "";
    }
    const budget = Math.floor(MAX_DOSSIER_CHARS / usable.length);
    const half = budget / 2;
    const sections = usable.map(({ entry, text }) => {
        let body = text;
        if (text.length > budget) {
            body = `${text.slice(0, half)}\n[... middle of page truncated ...]\n${text.slice(-half)}`;
        }
        return `=== PAGE (${entry.kind}) ${entry.url} ===\n${body}`;
    });
    return sections.join("\n\n");
}
74
// Render the claims as a bullet list for the prompt: one "- id: capability"
// line per claim, followed by its judging definition on the next line.
function claimsBlock(claims) {
    const bullets = [];
    for (const claim of claims) {
        bullets.push(`- ${claim.id}: ${claim.capability}\n How to judge: ${claim.definition}`);
    }
    return bullets.join("\n");
}
79
/**
 * Classify messaging intensity for every selected vendor × claim with an LLM.
 *
 * Per vendor: build a dossier from the capture run's pages, ask the model for
 * one reading per claim, and check the answer mechanically — every claim has
 * exactly one reading, every loud/quiet reading carries at least one quote,
 * and every quote is found in the capture it cites. A failing answer is sent
 * back once with the problems listed; if the retry also fails, the function
 * throws and returns nothing. Vendors with no usable captures are scored
 * "unobservable" on every claim without calling the model.
 *
 * @param config Market config (category, vendors, claims, optional surfaceRule).
 * @param options See ClassifyMarketOptions: `llm`, `runLabel`, and optional
 *   `captureRun`, `vendors`, `capturesDir`, `now`.
 * @returns The observation set, the model used, and the ids of vendors that
 *   needed the retry.
 * @throws Error when there are no captures, the capture run has no entries,
 *   a vendor id is unknown, or verification fails after the retry.
 */
export async function classifyMarket(config, options) {
    const model = options.llm.model ?? DEFAULT_MODELS[options.llm.provider];
    const { entries, textByHash } = loadCaptureTexts(config.category, options.capturesDir);
    if (entries.length === 0) {
        throw new Error(`No captures for ${config.category} — run \`market capture\` first`);
    }
    // Default to the run of the most recently appended manifest entry.
    const captureRun = options.captureRun ?? entries[entries.length - 1].runLabel;
    const runEntries = entries.filter((entry) => entry.runLabel === captureRun);
    if (runEntries.length === 0) {
        throw new Error(`No captures for run "${captureRun}" — available: ${[...new Set(entries.map((e) => e.runLabel))].join(", ")}`);
    }
    const observedAt = (options.now ?? (() => new Date()))().toISOString();
    const vendorIds = options.vendors ?? config.vendors.map((vendor) => vendor.id);
    const claimIds = config.claims.map((claim) => claim.id);
    const observations = [];
    const retriedVendorIds = [];
    for (const vendorId of vendorIds) {
        const vendor = config.vendors.find((candidate) => candidate.id === vendorId);
        if (!vendor)
            throw new Error(`Unknown vendor "${vendorId}"`);
        const vendorEntries = runEntries.filter((entry) => entry.vendorId === vendorId);
        // The model cites pages by URL; this resolves a URL back to its capture.
        const hashByUrl = new Map(vendorEntries.filter((entry) => entry.captureHash).map((entry) => [entry.url, entry.captureHash]));
        const dossier = buildDossier(vendorEntries, textByHash);
        if (!dossier) {
            // Deterministic: no usable captures means UNOBSERVABLE everywhere — never
            // ask a model to judge pages that were never read.
            for (const claim of config.claims) {
                observations.push({
                    id: observationId(config.category, options.runLabel, vendorId, claim.id),
                    vendorId,
                    claimId: claim.id,
                    observedAt,
                    intensity: "unobservable",
                    confidence: "high",
                    reason: `No usable captures for ${vendor.name} in run ${captureRun} — cannot judge.`,
                    evidence: [],
                });
            }
            continue;
        }
        const prompt = (feedback) => `${CLASSIFY_INSTRUCTIONS}\n\nSurface rule for this category:\n${config.surfaceRule ?? "(default rule above)"}\n\nClaims to classify (all of them):\n${claimsBlock(config.claims)}\n${feedback}\nVendor: ${vendor.name}\nCaptured pages:\n${dossier}`;
        // Convert one model reading into a stored observation. A URL the model
        // cites that matches no capture yields an empty captureHash, which span
        // verification then rejects.
        const toObservation = (reading, vendor) => ({
            id: observationId(config.category, options.runLabel, vendor, reading.claimId),
            vendorId: vendor,
            claimId: reading.claimId,
            observedAt,
            intensity: reading.intensity,
            confidence: reading.confidence,
            reason: reading.reason,
            evidence: (reading.evidence ?? []).map((item, index) => ({
                id: `${observationId(config.category, options.runLabel, vendor, reading.claimId)}_ev${index}`,
                sourceSystem: "web",
                sourceObjectType: "page",
                sourceObjectId: item.url,
                text: item.quote,
                observedAt,
                metadata: { url: item.url, captureHash: hashByUrl.get(item.url) ?? "" },
            })),
        });
        // One model call plus its mechanical checks.
        const attempt = async (feedback) => {
            const result = (await forcedToolCall(prompt(feedback), "classify_market_claims", classifySchema(claimIds), model, options.llm));
            const readings = (result?.readings ?? []).filter((reading) => claimIds.includes(reading.claimId));
            const seen = new Set();
            const problems = [];
            for (const reading of readings) {
                // Two readings for one claim would be stored as a duplicate cell.
                if (seen.has(reading.claimId))
                    problems.push(`duplicate reading for ${reading.claimId}`);
                seen.add(reading.claimId);
                // Span verification only inspects quotes that exist, so a loud or
                // quiet reading with no quote at all has to be caught here.
                const quoted = (reading.evidence ?? []).length > 0;
                if ((reading.intensity === "loud" || reading.intensity === "quiet") && !quoted) {
                    problems.push(`${reading.claimId}: ${reading.intensity} reading with no quoted evidence`);
                }
            }
            for (const claimId of claimIds) {
                if (!seen.has(claimId))
                    problems.push(`missing reading for ${claimId}`);
            }
            const candidate = readings.map((reading) => toObservation(reading, vendorId));
            const failures = verifyEvidenceSpans(candidate, textByHash);
            return { readings, problems, failures };
        };
        let outcome = await attempt("");
        if (outcome.problems.length > 0 || outcome.failures.length > 0) {
            retriedVendorIds.push(vendorId);
            const failureLines = [
                ...outcome.problems,
                ...outcome.failures.map((failure) => `${failure.claimId}: ${failure.problem} (your quote: "${failure.quote.slice(0, 80)}")`),
            ].join("\n- ");
            outcome = await attempt(`\nYour previous answer had problems. Fix exactly these and answer again in full:\n- ${failureLines}\nQuotes must be copied character-for-character from the captured text.\n`);
        }
        if (outcome.problems.length > 0 || outcome.failures.length > 0) {
            const detail = [...outcome.problems, ...outcome.failures.map((failure) => `${failure.claimId}: ${failure.problem}`)].slice(0, 10);
            throw new Error(`Classification for ${vendor.name} failed mechanical verification after a retry:\n ${detail.join("\n ")}\nNothing was stored. Re-run, try another --model, or classify this vendor by hand via the worksheet.`);
        }
        for (const reading of outcome.readings)
            observations.push(toObservation(reading, vendorId));
    }
    return {
        set: {
            id: `set_${config.category}_${options.runLabel}`,
            category: config.category,
            runLabel: options.runLabel,
            runAt: observedAt,
            extractor: `llm:${options.llm.provider}:${model}`,
            observations,
        },
        model,
        retriedVendorIds,
    };
}
176
/**
 * Assemble the hand-classification worksheet for one vendor: the config's
 * claims and surface rule, the vendor's captured pages from the chosen
 * capture run, and fixed submission instructions.
 *
 * Only entries with a capture hash become pages; a page whose text is not
 * available gets an empty `text`. The capture run defaults to the run label
 * of the last manifest entry.
 *
 * @throws Error when the vendor is unknown or no capture run can be determined.
 */
export function buildWorksheet(config, vendorId, options = {}) {
    const vendor = config.vendors.find((candidate) => candidate.id === vendorId);
    if (!vendor) {
        throw new Error(`Unknown vendor "${vendorId}"`);
    }
    const { entries, textByHash } = loadCaptureTexts(config.category, options.capturesDir);
    const captureRun = options.captureRun ?? entries[entries.length - 1]?.runLabel;
    if (!captureRun) {
        throw new Error(`No captures for ${config.category} — run \`market capture\` first`);
    }
    const pages = [];
    for (const entry of entries) {
        const belongsHere = entry.runLabel === captureRun && entry.vendorId === vendorId;
        if (!belongsHere || !entry.captureHash) {
            continue;
        }
        pages.push({
            kind: entry.kind,
            url: entry.url,
            captureHash: entry.captureHash,
            text: textByHash.get(entry.captureHash) ?? "",
        });
    }
    const instructions = "Produce one observation per claim (intensity loud|quiet|absent from these pages only; unobservable only if a page you need failed to capture). Every loud/quiet reading must quote a verbatim span (≤300 chars) from a page's text, with that page's url and captureHash in evidence metadata. Submit as an ObservationSet via `market observe --from <file>` — quotes are mechanically verified against the captures.";
    return {
        category: config.category,
        captureRun,
        surfaceRule: config.surfaceRule,
        vendor: { id: vendor.id, name: vendor.name },
        claims: config.claims,
        pages,
        instructions,
    };
}
@@ -0,0 +1,3 @@
1
+ import type { MarketConfig, ObservationSet } from "./market.ts";
2
/**
 * Render a market config and one observation set to a string.
 * NOTE(review): the name indicates Markdown output; the implementation is not
 * part of this declaration file, so confirm the exact format there.
 */
export declare function marketMapToMarkdown(config: MarketConfig, set: ObservationSet): string;
3
/**
 * Render a market config and one observation set to a string.
 * NOTE(review): the name indicates HTML output; the implementation is not
 * part of this declaration file, so confirm the exact format and any
 * escaping guarantees there.
 */
export declare function marketMapToHtml(config: MarketConfig, set: ObservationSet): string;