npm - r2mcp - Versions diffs - 0.2.0 - Mend

r2mcp 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (138) hide show

package/CHANGELOG.md +66 -0
package/LICENSE +21 -0
package/README.md +532 -0
package/dist/breadcrumbs.d.ts +123 -0
package/dist/breadcrumbs.js +135 -0
package/dist/cli/classify-edges.d.ts +2 -0
package/dist/cli/classify-edges.js +130 -0
package/dist/cli/compile-wiki.d.ts +2 -0
package/dist/cli/compile-wiki.js +173 -0
package/dist/cli/dump-edges-json.d.ts +2 -0
package/dist/cli/dump-edges-json.js +21 -0
package/dist/cli/extract-entities.d.ts +17 -0
package/dist/cli/extract-entities.js +166 -0
package/dist/cli/lint-memory.d.ts +16 -0
package/dist/cli/lint-memory.js +94 -0
package/dist/cli/migrate.d.ts +17 -0
package/dist/cli/migrate.js +146 -0
package/dist/cli/setup-helpers.d.ts +7 -0
package/dist/cli/setup-helpers.js +72 -0
package/dist/cli/setup.d.ts +15 -0
package/dist/cli/setup.js +95 -0
package/dist/compiler/clustering.d.ts +29 -0
package/dist/compiler/clustering.js +66 -0
package/dist/compiler/frontmatter.d.ts +35 -0
package/dist/compiler/frontmatter.js +168 -0
package/dist/compiler/manifest.d.ts +32 -0
package/dist/compiler/manifest.js +82 -0
package/dist/compiler/prompts.d.ts +17 -0
package/dist/compiler/prompts.js +82 -0
package/dist/compiler/run.d.ts +52 -0
package/dist/compiler/run.js +186 -0
package/dist/compiler/tier.d.ts +10 -0
package/dist/compiler/tier.js +85 -0
package/dist/compiler/topic.d.ts +16 -0
package/dist/compiler/topic.js +105 -0
package/dist/compiler/types.d.ts +101 -0
package/dist/compiler/types.js +4 -0
package/dist/db.d.ts +10 -0
package/dist/db.js +46 -0
package/dist/edges/candidate-pairs.d.ts +24 -0
package/dist/edges/candidate-pairs.js +35 -0
package/dist/edges/classifier.d.ts +45 -0
package/dist/edges/classifier.js +172 -0
package/dist/edges/signals.d.ts +13 -0
package/dist/edges/signals.js +45 -0
package/dist/edges/stage1-haiku.d.ts +21 -0
package/dist/edges/stage1-haiku.js +33 -0
package/dist/edges/stage2-opus.d.ts +41 -0
package/dist/edges/stage2-opus.js +101 -0
package/dist/edges/state.d.ts +44 -0
package/dist/edges/state.js +79 -0
package/dist/edges/types.d.ts +20 -0
package/dist/edges/types.js +1 -0
package/dist/embeddings.d.ts +13 -0
package/dist/embeddings.js +54 -0
package/dist/entities/db.d.ts +49 -0
package/dist/entities/db.js +109 -0
package/dist/entities/extractor.d.ts +14 -0
package/dist/entities/extractor.js +154 -0
package/dist/entities/normalize.d.ts +5 -0
package/dist/entities/normalize.js +7 -0
package/dist/entities/prompt.d.ts +19 -0
package/dist/entities/prompt.js +100 -0
package/dist/entities/state.d.ts +44 -0
package/dist/entities/state.js +99 -0
package/dist/entities/types.d.ts +62 -0
package/dist/entities/types.js +6 -0
package/dist/env.d.ts +13 -0
package/dist/env.js +32 -0
package/dist/fingerprint.d.ts +2 -0
package/dist/fingerprint.js +12 -0
package/dist/graph-rebuild.d.ts +6 -0
package/dist/graph-rebuild.js +20 -0
package/dist/index.d.ts +4 -0
package/dist/index.js +403 -0
package/dist/instrumentation.d.ts +10 -0
package/dist/instrumentation.js +37 -0
package/dist/lint/checks/contradictions.d.ts +30 -0
package/dist/lint/checks/contradictions.js +52 -0
package/dist/lint/checks/drift.d.ts +5 -0
package/dist/lint/checks/drift.js +34 -0
package/dist/lint/checks/orphans.d.ts +5 -0
package/dist/lint/checks/orphans.js +25 -0
package/dist/lint/checks/stale.d.ts +6 -0
package/dist/lint/checks/stale.js +29 -0
package/dist/lint/checks/superseded-unflagged.d.ts +5 -0
package/dist/lint/checks/superseded-unflagged.js +47 -0
package/dist/lint/run.d.ts +11 -0
package/dist/lint/run.js +95 -0
package/dist/lint/types.d.ts +60 -0
package/dist/lint/types.js +13 -0
package/dist/mcp-response.d.ts +7 -0
package/dist/mcp-response.js +13 -0
package/dist/providers/anthropic.d.ts +13 -0
package/dist/providers/anthropic.js +56 -0
package/dist/providers/claude-code.d.ts +35 -0
package/dist/providers/claude-code.js +175 -0
package/dist/providers/errors.d.ts +12 -0
package/dist/providers/errors.js +19 -0
package/dist/providers/index.d.ts +30 -0
package/dist/providers/index.js +71 -0
package/dist/providers/openrouter.d.ts +19 -0
package/dist/providers/openrouter.js +76 -0
package/dist/providers/semaphore.d.ts +19 -0
package/dist/providers/semaphore.js +51 -0
package/dist/providers/types.d.ts +27 -0
package/dist/providers/types.js +7 -0
package/dist/schema.sql +116 -0
package/dist/server-instructions.d.ts +9 -0
package/dist/server-instructions.js +20 -0
package/dist/telemetry.d.ts +39 -0
package/dist/telemetry.js +130 -0
package/dist/tools/classify.d.ts +44 -0
package/dist/tools/classify.js +121 -0
package/dist/tools/compile.d.ts +31 -0
package/dist/tools/compile.js +132 -0
package/dist/tools/dump-edges-sidecar.d.ts +37 -0
package/dist/tools/dump-edges-sidecar.js +80 -0
package/dist/tools/extract-entities.d.ts +53 -0
package/dist/tools/extract-entities.js +169 -0
package/dist/tools/lint.d.ts +10 -0
package/dist/tools/lint.js +13 -0
package/dist/tools/meditate.d.ts +25 -0
package/dist/tools/meditate.js +128 -0
package/dist/tools/recall.d.ts +66 -0
package/dist/tools/recall.js +409 -0
package/dist/tools/reject.d.ts +10 -0
package/dist/tools/reject.js +24 -0
package/dist/tools/remember.d.ts +26 -0
package/dist/tools/remember.js +140 -0
package/dist/tools/search.d.ts +30 -0
package/dist/tools/search.js +69 -0
package/dist/tools/spawn-cli.d.ts +14 -0
package/dist/tools/spawn-cli.js +41 -0
package/dist/tools/stats.d.ts +31 -0
package/dist/tools/stats.js +88 -0
package/package.json +86 -0
package/skills/remember/SKILL.md +357 -0

package/dist/entities/db.d.ts ADDED Viewed

@@ -0,0 +1,49 @@
+import type pg from 'pg';
+import type { EntityRow, EntityType } from './types.js';
+type DbClient = pg.Pool | pg.PoolClient;
+export interface UpsertEntityInput {
+    type: EntityType;
+    canonical_name: string;
+    aliases?: string[];
+}
+export interface UpsertEntityResult {
+    id: string;
+    created: boolean;
+}
+export declare function upsertEntity(client: DbClient, input: UpsertEntityInput): Promise<UpsertEntityResult>;
+export declare function findEntityByInput(client: DbClient, input: string): Promise<EntityRow | null>;
+export declare function mergeAliases(client: DbClient, entityId: string, newAliases: string[]): Promise<string[]>;
+export interface LinkResult {
+    inserted: boolean;
+}
+export declare function linkMemoryToEntity(client: DbClient, memoryId: string, entityId: string, confidence: number, source: string): Promise<LinkResult>;
+export declare function getTopEntitiesByFrequency(client: DbClient, n: number): Promise<Array<{
+    id: string;
+    type: EntityType;
+    canonical_name: string;
+    normalized_name: string;
+    aliases: string[];
+    link_count: number;
+}>>;
+export interface CandidateFilter {
+    sinceDays?: number;
+    /**
+     * When true, bypass the "no existing entity rows OR updated since most-recent link"
+     * pre-filter and return ALL memories in the corpus. Spec R6: full-corpus
+     * re-extraction is opt-in via this flag, not the default. The `sinceDays`
+     * filter still applies if both are set (though the CLI/MCP guards against
+     * passing them together).
+     */
+    full?: boolean;
+}
+export declare function findCandidateMemories(client: DbClient, filter: CandidateFilter): Promise<Array<{
+    id: string;
+    content: string;
+    updated_at: Date;
+}>>;
+export declare function getEntityLinksForMemories(client: DbClient, memoryIds: string[]): Promise<Map<string, Array<{
+    type: EntityType;
+    canonical_name: string;
+    confidence: number;
+}>>>;
+export {};

package/dist/entities/db.js ADDED Viewed

@@ -0,0 +1,109 @@
+// SPEC-046 Task 4 — entities DB layer.
+//
+// Pure DB access for the entity extraction pipeline. All functions accept
+// either a pg.Pool or pg.PoolClient (both expose a compatible .query()).
+// Callers in the extractor driver and recall path use the shared pool from
+// src/db.ts; tests pass the pool directly.
+import { normalizeEntityName } from './normalize.js';
+export async function upsertEntity(client, input) {
+    const normalized = normalizeEntityName(input.canonical_name);
+    const aliases = (input.aliases ?? [])
+        .map((a) => normalizeEntityName(a))
+        .filter((a) => a.length > 0);
+    // ON CONFLICT also merges aliases (claw-2jbo, PR #1 finding 3). Prior shape
+    // dropped EXCLUDED.aliases silently, requiring callers to invoke
+    // mergeAliases() separately. Direct consumers of upsertEntity now get the
+    // union-merge for free. ARRAY(SELECT DISTINCT UNNEST(...)) is the same
+    // union shape used by mergeAliases below.
+    const { rows } = await client.query(`INSERT INTO entities (type, canonical_name, normalized_name, aliases)
+     VALUES ($1, $2, $3, $4::text[])
+     ON CONFLICT (type, normalized_name) DO UPDATE SET
+       aliases = ARRAY(SELECT DISTINCT UNNEST(entities.aliases || EXCLUDED.aliases)),
+       last_seen_at = NOW()
+     RETURNING id, (xmax = 0) AS created`, [input.type, input.canonical_name, normalized, aliases]);
+    return { id: rows[0].id, created: rows[0].created };
+}
+export async function findEntityByInput(client, input) {
+    const normalized = normalizeEntityName(input);
+    if (!normalized)
+        return null;
+    const { rows } = await client.query(`SELECT id, type, canonical_name, normalized_name, aliases, metadata, first_seen_at, last_seen_at
+     FROM entities
+     WHERE normalized_name = $1 OR $1 = ANY(aliases)
+     LIMIT 1`, [normalized]);
+    return rows[0] ?? null;
+}
+export async function mergeAliases(client, entityId, newAliases) {
+    const normalized = newAliases.map((a) => normalizeEntityName(a)).filter((a) => a.length > 0);
+    const { rows } = await client.query(`UPDATE entities
+     SET aliases = ARRAY(SELECT DISTINCT UNNEST(aliases || $2::text[])),
+         last_seen_at = NOW()
+     WHERE id = $1
+     RETURNING aliases`, [entityId, normalized]);
+    return rows[0]?.aliases ?? [];
+}
+export async function linkMemoryToEntity(client, memoryId, entityId, confidence, source) {
+    const { rowCount } = await client.query(`INSERT INTO memory_entities (memory_id, entity_id, confidence, source)
+     VALUES ($1, $2, $3, $4)
+     ON CONFLICT (memory_id, entity_id) DO NOTHING`, [memoryId, entityId, confidence, source]);
+    return { inserted: (rowCount ?? 0) > 0 };
+}
+export async function getTopEntitiesByFrequency(client, n) {
+    // Returns id + normalized_name (additive vs. prior shape) so callers can
+    // build an in-memory lookup keyed by normalized canonical AND alias, used
+    // by the extractor to resolve LLM-matched canonical_names without a DB
+    // round-trip per match (claw-2jbo finding 1).
+    const { rows } = await client.query(`SELECT e.id, e.type, e.canonical_name, e.normalized_name, e.aliases,
+            COUNT(me.memory_id)::int AS link_count
+     FROM entities e
+     LEFT JOIN memory_entities me ON me.entity_id = e.id
+     GROUP BY e.id
+     ORDER BY link_count DESC, e.canonical_name ASC
+     LIMIT $1`, [n]);
+    return rows;
+}
+export async function findCandidateMemories(client, filter) {
+    const params = [];
+    const clauses = [];
+    if (!filter.full) {
+        // Default pre-filter: no existing entity rows OR memory updated since most recent link
+        clauses.push(`(
+      NOT EXISTS (SELECT 1 FROM memory_entities me WHERE me.memory_id = m.id)
+      OR m.updated_at > (SELECT MAX(me.created_at) FROM memory_entities me WHERE me.memory_id = m.id)
+    )`);
+    }
+    if (filter.sinceDays !== undefined) {
+        if (filter.sinceDays === 0)
+            return [];
+        params.push(filter.sinceDays);
+        clauses.push(`m.updated_at >= NOW() - ($${params.length}::int * INTERVAL '1 day')`);
+    }
+    const where = clauses.length > 0 ? `WHERE ${clauses.join(' AND ')}` : '';
+    const { rows } = await client.query(`SELECT m.id, m.content, m.updated_at
+     FROM memories m
+     ${where}
+     ORDER BY m.updated_at DESC`, params);
+    return rows;
+}
+export async function getEntityLinksForMemories(client, memoryIds) {
+    if (memoryIds.length === 0)
+        return new Map();
+    // Cast confidence::float — schema is NUMERIC(3,2) to match memory_edges
+    // (SPEC-043), but pg returns NUMERIC as a JS string by default. Callers
+    // (recall's EntityLink) type this as number, so cast at the query.
+    const { rows } = await client.query(`SELECT me.memory_id, e.type, e.canonical_name, me.confidence::float AS confidence
+     FROM memory_entities me
+     JOIN entities e ON e.id = me.entity_id
+     WHERE me.memory_id = ANY($1::uuid[])`, [memoryIds]);
+    const out = new Map();
+    for (const r of rows) {
+        const arr = out.get(r.memory_id) ?? [];
+        arr.push({
+            type: r.type,
+            canonical_name: r.canonical_name,
+            confidence: r.confidence,
+        });
+        out.set(r.memory_id, arr);
+    }
+    return out;
+}

package/dist/entities/extractor.d.ts ADDED Viewed

@@ -0,0 +1,14 @@
+import type pg from 'pg';
+import type { LLMProvider } from '../providers/types.js';
+import type { RunSummary } from './types.js';
+export interface RunExtractorOptions {
+    client: pg.Pool | pg.PoolClient;
+    provider: LLMProvider;
+    dataDir: string;
+    maxCostUsd: number;
+    contextTopN: number;
+    sinceDays?: number;
+    full?: boolean;
+    resumeFrom?: string;
+}
+export declare function runExtractor(opts: RunExtractorOptions): Promise<RunSummary>;

package/dist/entities/extractor.js ADDED Viewed

@@ -0,0 +1,154 @@
+// SPEC-046 Task 6 — extractor driver.
+//
+// Orchestrates pre-filter → LLM call → strict parse → DB writes → state
+// recording → run summary, with cost cap enforcement and resume support.
+// All DB writes go through the entity DB layer (Task 4), which accepts both
+// pg.Pool and pg.PoolClient. State machinery is delegated to EntityState
+// (Task 5).
+import { randomUUID } from 'node:crypto';
+import { withLLMCallSpan } from '../telemetry.js';
+import { buildExtractionPrompt, parseExtractionResponse } from './prompt.js';
+import { findCandidateMemories, getTopEntitiesByFrequency, upsertEntity, linkMemoryToEntity, } from './db.js';
+import { normalizeEntityName } from './normalize.js';
+import { EntityState } from './state.js';
+export async function runExtractor(opts) {
+    const startedAt = new Date();
+    const runId = randomUUID();
+    const state = new EntityState({ runId, dataDir: opts.dataDir, resumeFrom: opts.resumeFrom });
+    let memories_seen = 0;
+    let memories_extracted = 0;
+    let entities_created = 0;
+    let entities_updated = 0;
+    let links_created = 0;
+    let total_cost_usd = 0;
+    let parse_failures = 0;
+    let hit_cost_cap = false;
+    let hallucinated_matched = 0;
+    const candidates = await findCandidateMemories(opts.client, {
+        sinceDays: opts.sinceDays,
+        full: opts.full,
+    });
+    const known = await getTopEntitiesByFrequency(opts.client, opts.contextTopN);
+    // claw-2jbo finding 1: build a normalized-name → entity-id map once per run
+    // so LLM-matched canonical_names resolve synchronously instead of issuing
+    // findEntityByInput() per match (N+1). The spec requires the LLM to echo a
+    // canonical_name verbatim from the known set, so the map should always hit
+    // for well-behaved LLM output. Map misses (counted via hallucinated_matched)
+    // are the hallucination signal — see finding 2.
+    //
+    // Key by normalized canonical_name AND each alias so a match-by-alias still
+    // resolves (aliases stored normalized; see db.ts upsertEntity/mergeAliases).
+    const knownById = new Map(); // normalized lookup → entity id
+    for (const e of known) {
+        knownById.set(e.normalized_name, e.id);
+        for (const alias of e.aliases)
+            knownById.set(alias, e.id);
+    }
+    for (const mem of candidates) {
+        memories_seen++;
+        if (state.isMemoryTerminal(mem.id))
+            continue;
+        if (total_cost_usd >= opts.maxCostUsd) {
+            state.recordTerminal(mem.id, 'cap_reached');
+            hit_cost_cap = true;
+            continue;
+        }
+        const prompt = buildExtractionPrompt({
+            memory_content: mem.content,
+            known_entities: known.map((e) => ({
+                type: e.type,
+                canonical_name: e.canonical_name,
+                aliases: e.aliases,
+            })),
+        });
+        let rawResponse;
+        try {
+            // claw-1ejd: wrap the LLM call in a child span so the parent context
+            // restored from OTEL_TRACEPARENT (set by the MCP wrapper) has a
+            // concrete operation to inherit. No-op when SDK is not initialized.
+            const result = await withLLMCallSpan('memory.extract_entities.call', { provider: opts.provider.name, model: 'haiku' }, () => opts.provider.complete({ prompt, model: 'haiku' }));
+            total_cost_usd += result.cost_usd;
+            rawResponse = result.response;
+        }
+        catch (e) {
+            // Surface provider failures via run summary error; do not mark terminal.
+            return finalize(state, startedAt, runId, {
+                memories_seen,
+                memories_extracted,
+                entities_created,
+                entities_updated,
+                links_created,
+                total_cost_usd,
+                hit_cost_cap,
+                parse_failures,
+                hallucinated_matched,
+                error: `provider error: ${e.message}`,
+            });
+        }
+        if (total_cost_usd >= opts.maxCostUsd)
+            hit_cost_cap = true;
+        const parsed = parseExtractionResponse(rawResponse);
+        if (!parsed.ok) {
+            parse_failures++;
+            state.recordParseFailed(mem.id, rawResponse);
+            continue;
+        }
+        for (const m of parsed.value.matched) {
+            // Synchronous lookup against the in-memory map built from the known set
+            // above (claw-2jbo finding 1). A miss means the LLM returned a
+            // canonical_name not in the context we provided — i.e. a hallucination.
+            // Increment hallucinated_matched and skip the link (no DB write for a
+            // canonical we never told the model about).
+            const entityId = knownById.get(normalizeEntityName(m.canonical_name));
+            if (!entityId) {
+                hallucinated_matched++;
+                continue;
+            }
+            const link = await linkMemoryToEntity(opts.client, mem.id, entityId, m.confidence, 'classifier');
+            if (link.inserted)
+                links_created++;
+        }
+        for (const n of parsed.value.new_entities) {
+            const up = await upsertEntity(opts.client, {
+                type: n.type,
+                canonical_name: n.canonical_name,
+                aliases: n.aliases,
+            });
+            if (up.created)
+                entities_created++;
+            else
+                entities_updated++;
+            // Alias merge happens inside upsertEntity's ON CONFLICT clause (see db.ts).
+            const link = await linkMemoryToEntity(opts.client, mem.id, up.id, n.confidence, 'classifier');
+            if (link.inserted)
+                links_created++;
+        }
+        memories_extracted++;
+        state.recordTerminal(mem.id, 'extracted');
+    }
+    return finalize(state, startedAt, runId, {
+        memories_seen,
+        memories_extracted,
+        entities_created,
+        entities_updated,
+        links_created,
+        total_cost_usd,
+        hit_cost_cap,
+        parse_failures,
+        hallucinated_matched,
+    });
+}
+function finalize(state, startedAt, runId, counts) {
+    const summary = {
+        run_id: runId,
+        started_at: startedAt.toISOString(),
+        ended_at: new Date().toISOString(),
+        ...counts,
+    };
+    if (counts.parse_failures > 0 && !summary.error) {
+        summary.error = `parse failures: ${counts.parse_failures}`;
+    }
+    state.writeRunSummary(summary);
+    state.close();
+    return summary;
+}

package/dist/entities/normalize.d.ts ADDED Viewed

@@ -0,0 +1,5 @@
+/**
+ * SPEC-046 R3 normalization rule: NFKC → lowercase → trim → collapse internal whitespace.
+ * Applied identically to user input (recall), entity normalized_name writes, and alias compares.
+ */
+export declare function normalizeEntityName(input: string): string;

package/dist/entities/normalize.js ADDED Viewed

@@ -0,0 +1,7 @@
+/**
+ * SPEC-046 R3 normalization rule: NFKC → lowercase → trim → collapse internal whitespace.
+ * Applied identically to user input (recall), entity normalized_name writes, and alias compares.
+ */
+export function normalizeEntityName(input) {
+    return input.normalize('NFKC').toLowerCase().trim().replace(/\s+/g, ' ');
+}

package/dist/entities/prompt.d.ts ADDED Viewed

@@ -0,0 +1,19 @@
+import { EntityType, ExtractionResponse } from './types.js';
+export type ParseResult = {
+    ok: true;
+    value: ExtractionResponse;
+    warnings: string[];
+} | {
+    ok: false;
+    error: string;
+};
+export declare function parseExtractionResponse(raw: string): ParseResult;
+export interface PromptInput {
+    memory_content: string;
+    known_entities: Array<{
+        type: EntityType;
+        canonical_name: string;
+        aliases: string[];
+    }>;
+}
+export declare function buildExtractionPrompt({ memory_content, known_entities }: PromptInput): string;

package/dist/entities/prompt.js ADDED Viewed

@@ -0,0 +1,100 @@
+import { ENTITY_TYPES, } from './types.js';
+function isMatched(x) {
+    return (typeof x === 'object' &&
+        x !== null &&
+        typeof x.canonical_name === 'string' &&
+        typeof x.confidence === 'number');
+}
+function clamp01(n) {
+    return Math.max(0, Math.min(1, n));
+}
+export function parseExtractionResponse(raw) {
+    let json;
+    try {
+        json = JSON.parse(raw);
+    }
+    catch (e) {
+        return { ok: false, error: `invalid JSON: ${e.message}` };
+    }
+    if (typeof json !== 'object' || json === null)
+        return { ok: false, error: 'response is not an object' };
+    if (!Array.isArray(json.matched))
+        return { ok: false, error: 'missing or non-array `matched`' };
+    if (!Array.isArray(json.new_entities))
+        return { ok: false, error: 'missing or non-array `new_entities`' };
+    const warnings = [];
+    const matched = [];
+    for (const m of json.matched) {
+        if (!isMatched(m)) {
+            warnings.push(`dropped malformed matched entry: ${JSON.stringify(m).slice(0, 100)}`);
+            continue;
+        }
+        const c = clamp01(m.confidence);
+        if (c !== m.confidence)
+            warnings.push(`clamped matched confidence ${m.confidence} → ${c} for ${m.canonical_name}`);
+        matched.push({ canonical_name: m.canonical_name, confidence: c });
+    }
+    const new_entities = [];
+    for (const n of json.new_entities) {
+        if (typeof n !== 'object' || n === null) {
+            warnings.push(`dropped non-object new_entity`);
+            continue;
+        }
+        if (!ENTITY_TYPES.includes(n.type)) {
+            warnings.push(`dropped new_entity with invalid type: ${n.type}`);
+            continue;
+        }
+        if (typeof n.canonical_name !== 'string' || n.canonical_name.length === 0) {
+            warnings.push(`dropped new_entity with empty canonical_name`);
+            continue;
+        }
+        if (typeof n.confidence !== 'number') {
+            warnings.push(`dropped new_entity ${n.canonical_name}: confidence not a number`);
+            continue;
+        }
+        const aliases = Array.isArray(n.aliases)
+            ? n.aliases.filter((a) => typeof a === 'string')
+            : [];
+        const c = clamp01(n.confidence);
+        if (c !== n.confidence)
+            warnings.push(`clamped new_entity confidence ${n.confidence} → ${c} for ${n.canonical_name}`);
+        new_entities.push({
+            type: n.type,
+            canonical_name: n.canonical_name,
+            aliases,
+            confidence: c,
+        });
+    }
+    return { ok: true, value: { matched, new_entities }, warnings };
+}
+export function buildExtractionPrompt({ memory_content, known_entities }) {
+    const knownBlock = known_entities.length
+        ? known_entities
+            .map((e) => `- ${e.type}: ${e.canonical_name}${e.aliases.length ? ` (aliases: ${e.aliases.join(', ')})` : ''}`)
+            .join('\n')
+        : '(none yet)';
+    return `You are an entity extractor. Identify entities in the memory below.
+KNOWN ENTITIES (use these canonical names exactly when matched):
+${knownBlock}
+ENTITY TYPES: project, person, tool, decision (no other types allowed)
+MEMORY:
+"""
+${memory_content}
+"""
+Return a single JSON object — no prose, no markdown — with this exact shape:
+{
+  "matched": [{"canonical_name": "<must-match-known-entity-exactly>", "confidence": 0.0-1.0}],
+  "new_entities": [{"type": "project|person|tool|decision", "canonical_name": "<name>", "aliases": ["alt1"], "confidence": 0.0-1.0}]
+}
+Rules:
+- "matched" canonical_names must EXACTLY match a known entity's canonical_name (case-sensitive).
+- "new_entities" type MUST be one of project, person, tool, decision. Other types will be rejected.
+- "aliases" is optional; omit or use [] if no aliases observed.
+- If the memory references no entities, return {"matched": [], "new_entities": []}.
+- Output JSON only. No code fences. No commentary.`;
+}

package/dist/entities/state.d.ts ADDED Viewed

@@ -0,0 +1,44 @@
+import type { RunSummary } from './types.js';
+export interface EntityStateInit {
+    runId: string;
+    dataDir: string;
+    resumeFrom?: string;
+}
+export declare class EntityState {
+    readonly runId: string;
+    private readonly dataDir;
+    private readonly stateFile;
+    private readonly terminalMemoryIds;
+    constructor({ runId, dataDir, resumeFrom }: EntityStateInit);
+    private loadTerminalSet;
+    isMemoryTerminal(memoryId: string): boolean;
+    /**
+     * Append a terminal-status row for one memory. Flushes synchronously.
+     *
+     * Per-record sync flush (see appendRecord) is intentional: it guarantees
+     * a crash mid-run can be resumed exactly. At current corpus scale (~100
+     * memories per run) the syscall overhead is negligible — the LLM call
+     * dominates the loop.
+     *
+     * TODO(perf, claw-2jbo finding 5): batch flushes (every N records or
+     * every M seconds) when the corpus exceeds ~1000 memories. Until then,
+     * durability beats batching.
+     */
+    recordTerminal(memoryId: string, status: 'extracted' | 'cap_reached' | 'skipped'): void;
+    /**
+     * Append a parse_failed row (non-terminal — resume will retry).
+     *
+     * Same per-record sync flush as recordTerminal; same durability rationale.
+     * See TODO(perf, claw-2jbo finding 5) on recordTerminal.
+     */
+    recordParseFailed(memoryId: string, raw: string): void;
+    /**
+     * Sync-append a state record to the JSONL state file. Per-record flush
+     * is durability-first by design — see recordTerminal doc-comment for the
+     * crash-recovery rationale and the batching TODO. Do not refactor to
+     * async/batched writes without a covering benchmark on real backfill load.
+     */
+    private appendRecord;
+    close(): void;
+    writeRunSummary(summary: RunSummary): void;
+}

package/dist/entities/state.js ADDED Viewed

@@ -0,0 +1,99 @@
+import { appendFileSync, existsSync, mkdirSync, readFileSync, writeFileSync, closeSync, openSync, } from 'node:fs';
+import { join } from 'node:path';
+const RAW_TRUNC = 2048;
+const STATE_FILE = 'entity-state.jsonl';
+const RUNS_DIR = 'entity-state.runs';
+export class EntityState {
+    runId;
+    dataDir;
+    stateFile;
+    terminalMemoryIds = new Set();
+    constructor({ runId, dataDir, resumeFrom }) {
+        this.runId = runId;
+        this.dataDir = dataDir;
+        if (!existsSync(dataDir))
+            mkdirSync(dataDir, { recursive: true });
+        this.stateFile = join(dataDir, STATE_FILE);
+        if (!existsSync(this.stateFile))
+            closeSync(openSync(this.stateFile, 'a'));
+        if (resumeFrom)
+            this.loadTerminalSet(resumeFrom);
+    }
+    loadTerminalSet(resumeRunId) {
+        const content = readFileSync(this.stateFile, 'utf8');
+        for (const line of content.split('\n')) {
+            if (!line.trim())
+                continue;
+            let rec;
+            try {
+                rec = JSON.parse(line);
+            }
+            catch {
+                continue;
+            }
+            if (rec.run_id !== resumeRunId)
+                continue;
+            if (rec.status === 'extracted' || rec.status === 'cap_reached' || rec.status === 'skipped') {
+                this.terminalMemoryIds.add(rec.memory_id);
+            }
+            // parse_failed is intentionally non-terminal — re-run will retry
+        }
+    }
+    isMemoryTerminal(memoryId) {
+        return this.terminalMemoryIds.has(memoryId);
+    }
+    /**
+     * Append a terminal-status row for one memory. Flushes synchronously.
+     *
+     * Per-record sync flush (see appendRecord) is intentional: it guarantees
+     * a crash mid-run can be resumed exactly. At current corpus scale (~100
+     * memories per run) the syscall overhead is negligible — the LLM call
+     * dominates the loop.
+     *
+     * TODO(perf, claw-2jbo finding 5): batch flushes (every N records or
+     * every M seconds) when the corpus exceeds ~1000 memories. Until then,
+     * durability beats batching.
+     */
+    recordTerminal(memoryId, status) {
+        this.terminalMemoryIds.add(memoryId);
+        this.appendRecord({
+            run_id: this.runId,
+            memory_id: memoryId,
+            status,
+            timestamp: new Date().toISOString(),
+        });
+    }
+    /**
+     * Append a parse_failed row (non-terminal — resume will retry).
+     *
+     * Same per-record sync flush as recordTerminal; same durability rationale.
+     * See TODO(perf, claw-2jbo finding 5) on recordTerminal.
+     */
+    recordParseFailed(memoryId, raw) {
+        this.appendRecord({
+            run_id: this.runId,
+            memory_id: memoryId,
+            status: 'parse_failed',
+            timestamp: new Date().toISOString(),
+            raw: raw.length > RAW_TRUNC ? raw.slice(0, RAW_TRUNC) : raw,
+        });
+    }
+    /**
+     * Sync-append a state record to the JSONL state file. Per-record flush
+     * is durability-first by design — see recordTerminal doc-comment for the
+     * crash-recovery rationale and the batching TODO. Do not refactor to
+     * async/batched writes without a covering benchmark on real backfill load.
+     */
+    appendRecord(rec) {
+        appendFileSync(this.stateFile, JSON.stringify(rec) + '\n');
+    }
+    close() {
+        /* explicit no-op; appendFileSync flushes per call */
+    }
+    writeRunSummary(summary) {
+        const runsDir = join(this.dataDir, RUNS_DIR);
+        if (!existsSync(runsDir))
+            mkdirSync(runsDir, { recursive: true });
+        writeFileSync(join(runsDir, `${summary.run_id}.json`), JSON.stringify(summary, null, 2));
+    }
+}

package/dist/entities/types.d.ts ADDED Viewed

@@ -0,0 +1,62 @@
+export type EntityType = 'project' | 'person' | 'tool' | 'decision';
+export declare const ENTITY_TYPES: readonly EntityType[];
+export interface EntityRow {
+    id: string;
+    type: EntityType;
+    canonical_name: string;
+    normalized_name: string;
+    aliases: string[];
+    metadata: Record<string, unknown>;
+    first_seen_at: Date;
+    last_seen_at: Date;
+}
+export interface MemoryEntityLink {
+    memory_id: string;
+    entity_id: string;
+    confidence: number;
+    source: string;
+}
+export interface ExtractionMatched {
+    canonical_name: string;
+    confidence: number;
+}
+export interface ExtractionNewEntity {
+    type: EntityType;
+    canonical_name: string;
+    aliases?: string[];
+    confidence: number;
+}
+export interface ExtractionResponse {
+    matched: ExtractionMatched[];
+    new_entities: ExtractionNewEntity[];
+}
+export interface RunSummary {
+    run_id: string;
+    started_at: string;
+    ended_at: string;
+    memories_seen: number;
+    memories_extracted: number;
+    entities_created: number;
+    entities_updated: number;
+    links_created: number;
+    total_cost_usd: number;
+    hit_cost_cap: boolean;
+    error?: string;
+    parse_failures?: number;
+    /**
+     * Count of `matched` entries returned by the LLM whose canonical_name did
+     * not appear in the known-entities context. The spec requires the LLM to
+     * echo a canonical_name verbatim from the known set; a miss here is an
+     * LLM hallucination. These are silently dropped (no DB write), but the
+     * count is surfaced so observability can alarm if it climbs. Added in
+     * claw-2jbo (PR #1 finding 2).
+     */
+    hallucinated_matched: number;
+}
+export interface StateRecord {
+    run_id: string;
+    memory_id: string;
+    status: 'extracted' | 'parse_failed' | 'cap_reached' | 'skipped';
+    timestamp: string;
+    raw?: string;
+}

package/dist/entities/types.js ADDED Viewed

@@ -0,0 +1,6 @@
+export const ENTITY_TYPES = [
+    'project',
+    'person',
+    'tool',
+    'decision',
+];