r2mcp 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. package/CHANGELOG.md +66 -0
  2. package/LICENSE +21 -0
  3. package/README.md +532 -0
  4. package/dist/breadcrumbs.d.ts +123 -0
  5. package/dist/breadcrumbs.js +135 -0
  6. package/dist/cli/classify-edges.d.ts +2 -0
  7. package/dist/cli/classify-edges.js +130 -0
  8. package/dist/cli/compile-wiki.d.ts +2 -0
  9. package/dist/cli/compile-wiki.js +173 -0
  10. package/dist/cli/dump-edges-json.d.ts +2 -0
  11. package/dist/cli/dump-edges-json.js +21 -0
  12. package/dist/cli/extract-entities.d.ts +17 -0
  13. package/dist/cli/extract-entities.js +166 -0
  14. package/dist/cli/lint-memory.d.ts +16 -0
  15. package/dist/cli/lint-memory.js +94 -0
  16. package/dist/cli/migrate.d.ts +17 -0
  17. package/dist/cli/migrate.js +146 -0
  18. package/dist/cli/setup-helpers.d.ts +7 -0
  19. package/dist/cli/setup-helpers.js +72 -0
  20. package/dist/cli/setup.d.ts +15 -0
  21. package/dist/cli/setup.js +95 -0
  22. package/dist/compiler/clustering.d.ts +29 -0
  23. package/dist/compiler/clustering.js +66 -0
  24. package/dist/compiler/frontmatter.d.ts +35 -0
  25. package/dist/compiler/frontmatter.js +168 -0
  26. package/dist/compiler/manifest.d.ts +32 -0
  27. package/dist/compiler/manifest.js +82 -0
  28. package/dist/compiler/prompts.d.ts +17 -0
  29. package/dist/compiler/prompts.js +82 -0
  30. package/dist/compiler/run.d.ts +52 -0
  31. package/dist/compiler/run.js +186 -0
  32. package/dist/compiler/tier.d.ts +10 -0
  33. package/dist/compiler/tier.js +85 -0
  34. package/dist/compiler/topic.d.ts +16 -0
  35. package/dist/compiler/topic.js +105 -0
  36. package/dist/compiler/types.d.ts +101 -0
  37. package/dist/compiler/types.js +4 -0
  38. package/dist/db.d.ts +10 -0
  39. package/dist/db.js +46 -0
  40. package/dist/edges/candidate-pairs.d.ts +24 -0
  41. package/dist/edges/candidate-pairs.js +35 -0
  42. package/dist/edges/classifier.d.ts +45 -0
  43. package/dist/edges/classifier.js +172 -0
  44. package/dist/edges/signals.d.ts +13 -0
  45. package/dist/edges/signals.js +45 -0
  46. package/dist/edges/stage1-haiku.d.ts +21 -0
  47. package/dist/edges/stage1-haiku.js +33 -0
  48. package/dist/edges/stage2-opus.d.ts +41 -0
  49. package/dist/edges/stage2-opus.js +101 -0
  50. package/dist/edges/state.d.ts +44 -0
  51. package/dist/edges/state.js +79 -0
  52. package/dist/edges/types.d.ts +20 -0
  53. package/dist/edges/types.js +1 -0
  54. package/dist/embeddings.d.ts +13 -0
  55. package/dist/embeddings.js +54 -0
  56. package/dist/entities/db.d.ts +49 -0
  57. package/dist/entities/db.js +109 -0
  58. package/dist/entities/extractor.d.ts +14 -0
  59. package/dist/entities/extractor.js +154 -0
  60. package/dist/entities/normalize.d.ts +5 -0
  61. package/dist/entities/normalize.js +7 -0
  62. package/dist/entities/prompt.d.ts +19 -0
  63. package/dist/entities/prompt.js +100 -0
  64. package/dist/entities/state.d.ts +44 -0
  65. package/dist/entities/state.js +99 -0
  66. package/dist/entities/types.d.ts +62 -0
  67. package/dist/entities/types.js +6 -0
  68. package/dist/env.d.ts +13 -0
  69. package/dist/env.js +32 -0
  70. package/dist/fingerprint.d.ts +2 -0
  71. package/dist/fingerprint.js +12 -0
  72. package/dist/graph-rebuild.d.ts +6 -0
  73. package/dist/graph-rebuild.js +20 -0
  74. package/dist/index.d.ts +4 -0
  75. package/dist/index.js +403 -0
  76. package/dist/instrumentation.d.ts +10 -0
  77. package/dist/instrumentation.js +37 -0
  78. package/dist/lint/checks/contradictions.d.ts +30 -0
  79. package/dist/lint/checks/contradictions.js +52 -0
  80. package/dist/lint/checks/drift.d.ts +5 -0
  81. package/dist/lint/checks/drift.js +34 -0
  82. package/dist/lint/checks/orphans.d.ts +5 -0
  83. package/dist/lint/checks/orphans.js +25 -0
  84. package/dist/lint/checks/stale.d.ts +6 -0
  85. package/dist/lint/checks/stale.js +29 -0
  86. package/dist/lint/checks/superseded-unflagged.d.ts +5 -0
  87. package/dist/lint/checks/superseded-unflagged.js +47 -0
  88. package/dist/lint/run.d.ts +11 -0
  89. package/dist/lint/run.js +95 -0
  90. package/dist/lint/types.d.ts +60 -0
  91. package/dist/lint/types.js +13 -0
  92. package/dist/mcp-response.d.ts +7 -0
  93. package/dist/mcp-response.js +13 -0
  94. package/dist/providers/anthropic.d.ts +13 -0
  95. package/dist/providers/anthropic.js +56 -0
  96. package/dist/providers/claude-code.d.ts +35 -0
  97. package/dist/providers/claude-code.js +175 -0
  98. package/dist/providers/errors.d.ts +12 -0
  99. package/dist/providers/errors.js +19 -0
  100. package/dist/providers/index.d.ts +30 -0
  101. package/dist/providers/index.js +71 -0
  102. package/dist/providers/openrouter.d.ts +19 -0
  103. package/dist/providers/openrouter.js +76 -0
  104. package/dist/providers/semaphore.d.ts +19 -0
  105. package/dist/providers/semaphore.js +51 -0
  106. package/dist/providers/types.d.ts +27 -0
  107. package/dist/providers/types.js +7 -0
  108. package/dist/schema.sql +116 -0
  109. package/dist/server-instructions.d.ts +9 -0
  110. package/dist/server-instructions.js +20 -0
  111. package/dist/telemetry.d.ts +39 -0
  112. package/dist/telemetry.js +130 -0
  113. package/dist/tools/classify.d.ts +44 -0
  114. package/dist/tools/classify.js +121 -0
  115. package/dist/tools/compile.d.ts +31 -0
  116. package/dist/tools/compile.js +132 -0
  117. package/dist/tools/dump-edges-sidecar.d.ts +37 -0
  118. package/dist/tools/dump-edges-sidecar.js +80 -0
  119. package/dist/tools/extract-entities.d.ts +53 -0
  120. package/dist/tools/extract-entities.js +169 -0
  121. package/dist/tools/lint.d.ts +10 -0
  122. package/dist/tools/lint.js +13 -0
  123. package/dist/tools/meditate.d.ts +25 -0
  124. package/dist/tools/meditate.js +128 -0
  125. package/dist/tools/recall.d.ts +66 -0
  126. package/dist/tools/recall.js +409 -0
  127. package/dist/tools/reject.d.ts +10 -0
  128. package/dist/tools/reject.js +24 -0
  129. package/dist/tools/remember.d.ts +26 -0
  130. package/dist/tools/remember.js +140 -0
  131. package/dist/tools/search.d.ts +30 -0
  132. package/dist/tools/search.js +69 -0
  133. package/dist/tools/spawn-cli.d.ts +14 -0
  134. package/dist/tools/spawn-cli.js +41 -0
  135. package/dist/tools/stats.d.ts +31 -0
  136. package/dist/tools/stats.js +88 -0
  137. package/package.json +86 -0
  138. package/skills/remember/SKILL.md +357 -0
@@ -0,0 +1,49 @@
1
+ import type pg from 'pg';
2
+ import type { EntityRow, EntityType } from './types.js';
3
+ type DbClient = pg.Pool | pg.PoolClient;
4
+ export interface UpsertEntityInput {
5
+ type: EntityType;
6
+ canonical_name: string;
7
+ aliases?: string[];
8
+ }
9
+ export interface UpsertEntityResult {
10
+ id: string;
11
+ created: boolean;
12
+ }
13
+ export declare function upsertEntity(client: DbClient, input: UpsertEntityInput): Promise<UpsertEntityResult>;
14
+ export declare function findEntityByInput(client: DbClient, input: string): Promise<EntityRow | null>;
15
+ export declare function mergeAliases(client: DbClient, entityId: string, newAliases: string[]): Promise<string[]>;
16
+ export interface LinkResult {
17
+ inserted: boolean;
18
+ }
19
+ export declare function linkMemoryToEntity(client: DbClient, memoryId: string, entityId: string, confidence: number, source: string): Promise<LinkResult>;
20
+ export declare function getTopEntitiesByFrequency(client: DbClient, n: number): Promise<Array<{
21
+ id: string;
22
+ type: EntityType;
23
+ canonical_name: string;
24
+ normalized_name: string;
25
+ aliases: string[];
26
+ link_count: number;
27
+ }>>;
28
+ export interface CandidateFilter {
29
+ sinceDays?: number;
30
+ /**
31
+ * When true, bypass the "no existing entity rows OR updated since most-recent link"
32
+ * pre-filter and return ALL memories in the corpus. Spec R6: full-corpus
33
+ * re-extraction is opt-in via this flag, not the default. The `sinceDays`
34
+ * filter still applies if both are set (though the CLI/MCP guards against
35
+ * passing them together).
36
+ */
37
+ full?: boolean;
38
+ }
39
+ export declare function findCandidateMemories(client: DbClient, filter: CandidateFilter): Promise<Array<{
40
+ id: string;
41
+ content: string;
42
+ updated_at: Date;
43
+ }>>;
44
+ export declare function getEntityLinksForMemories(client: DbClient, memoryIds: string[]): Promise<Map<string, Array<{
45
+ type: EntityType;
46
+ canonical_name: string;
47
+ confidence: number;
48
+ }>>>;
49
+ export {};
@@ -0,0 +1,109 @@
1
+ // SPEC-046 Task 4 — entities DB layer.
2
+ //
3
+ // Pure DB access for the entity extraction pipeline. All functions accept
4
+ // either a pg.Pool or pg.PoolClient (both expose a compatible .query()).
5
+ // Callers in the extractor driver and recall path use the shared pool from
6
+ // src/db.ts; tests pass the pool directly.
7
+ import { normalizeEntityName } from './normalize.js';
8
+ export async function upsertEntity(client, input) {
9
+ const normalized = normalizeEntityName(input.canonical_name);
10
+ const aliases = (input.aliases ?? [])
11
+ .map((a) => normalizeEntityName(a))
12
+ .filter((a) => a.length > 0);
13
+ // ON CONFLICT also merges aliases (claw-2jbo, PR #1 finding 3). Prior shape
14
+ // dropped EXCLUDED.aliases silently, requiring callers to invoke
15
+ // mergeAliases() separately. Direct consumers of upsertEntity now get the
16
+ // union-merge for free. ARRAY(SELECT DISTINCT UNNEST(...)) is the same
17
+ // union shape used by mergeAliases below.
18
+ const { rows } = await client.query(`INSERT INTO entities (type, canonical_name, normalized_name, aliases)
19
+ VALUES ($1, $2, $3, $4::text[])
20
+ ON CONFLICT (type, normalized_name) DO UPDATE SET
21
+ aliases = ARRAY(SELECT DISTINCT UNNEST(entities.aliases || EXCLUDED.aliases)),
22
+ last_seen_at = NOW()
23
+ RETURNING id, (xmax = 0) AS created`, [input.type, input.canonical_name, normalized, aliases]);
24
+ return { id: rows[0].id, created: rows[0].created };
25
+ }
26
+ export async function findEntityByInput(client, input) {
27
+ const normalized = normalizeEntityName(input);
28
+ if (!normalized)
29
+ return null;
30
+ const { rows } = await client.query(`SELECT id, type, canonical_name, normalized_name, aliases, metadata, first_seen_at, last_seen_at
31
+ FROM entities
32
+ WHERE normalized_name = $1 OR $1 = ANY(aliases)
33
+ LIMIT 1`, [normalized]);
34
+ return rows[0] ?? null;
35
+ }
36
+ export async function mergeAliases(client, entityId, newAliases) {
37
+ const normalized = newAliases.map((a) => normalizeEntityName(a)).filter((a) => a.length > 0);
38
+ const { rows } = await client.query(`UPDATE entities
39
+ SET aliases = ARRAY(SELECT DISTINCT UNNEST(aliases || $2::text[])),
40
+ last_seen_at = NOW()
41
+ WHERE id = $1
42
+ RETURNING aliases`, [entityId, normalized]);
43
+ return rows[0]?.aliases ?? [];
44
+ }
45
+ export async function linkMemoryToEntity(client, memoryId, entityId, confidence, source) {
46
+ const { rowCount } = await client.query(`INSERT INTO memory_entities (memory_id, entity_id, confidence, source)
47
+ VALUES ($1, $2, $3, $4)
48
+ ON CONFLICT (memory_id, entity_id) DO NOTHING`, [memoryId, entityId, confidence, source]);
49
+ return { inserted: (rowCount ?? 0) > 0 };
50
+ }
51
+ export async function getTopEntitiesByFrequency(client, n) {
52
+ // Returns id + normalized_name (additive vs. prior shape) so callers can
53
+ // build an in-memory lookup keyed by normalized canonical AND alias, used
54
+ // by the extractor to resolve LLM-matched canonical_names without a DB
55
+ // round-trip per match (claw-2jbo finding 1).
56
+ const { rows } = await client.query(`SELECT e.id, e.type, e.canonical_name, e.normalized_name, e.aliases,
57
+ COUNT(me.memory_id)::int AS link_count
58
+ FROM entities e
59
+ LEFT JOIN memory_entities me ON me.entity_id = e.id
60
+ GROUP BY e.id
61
+ ORDER BY link_count DESC, e.canonical_name ASC
62
+ LIMIT $1`, [n]);
63
+ return rows;
64
+ }
65
+ export async function findCandidateMemories(client, filter) {
66
+ const params = [];
67
+ const clauses = [];
68
+ if (!filter.full) {
69
+ // Default pre-filter: no existing entity rows OR memory updated since most recent link
70
+ clauses.push(`(
71
+ NOT EXISTS (SELECT 1 FROM memory_entities me WHERE me.memory_id = m.id)
72
+ OR m.updated_at > (SELECT MAX(me.created_at) FROM memory_entities me WHERE me.memory_id = m.id)
73
+ )`);
74
+ }
75
+ if (filter.sinceDays !== undefined) {
76
+ if (filter.sinceDays === 0)
77
+ return [];
78
+ params.push(filter.sinceDays);
79
+ clauses.push(`m.updated_at >= NOW() - ($${params.length}::int * INTERVAL '1 day')`);
80
+ }
81
+ const where = clauses.length > 0 ? `WHERE ${clauses.join(' AND ')}` : '';
82
+ const { rows } = await client.query(`SELECT m.id, m.content, m.updated_at
83
+ FROM memories m
84
+ ${where}
85
+ ORDER BY m.updated_at DESC`, params);
86
+ return rows;
87
+ }
88
+ export async function getEntityLinksForMemories(client, memoryIds) {
89
+ if (memoryIds.length === 0)
90
+ return new Map();
91
+ // Cast confidence::float — schema is NUMERIC(3,2) to match memory_edges
92
+ // (SPEC-043), but pg returns NUMERIC as a JS string by default. Callers
93
+ // (recall's EntityLink) type this as number, so cast at the query.
94
+ const { rows } = await client.query(`SELECT me.memory_id, e.type, e.canonical_name, me.confidence::float AS confidence
95
+ FROM memory_entities me
96
+ JOIN entities e ON e.id = me.entity_id
97
+ WHERE me.memory_id = ANY($1::uuid[])`, [memoryIds]);
98
+ const out = new Map();
99
+ for (const r of rows) {
100
+ const arr = out.get(r.memory_id) ?? [];
101
+ arr.push({
102
+ type: r.type,
103
+ canonical_name: r.canonical_name,
104
+ confidence: r.confidence,
105
+ });
106
+ out.set(r.memory_id, arr);
107
+ }
108
+ return out;
109
+ }
@@ -0,0 +1,14 @@
1
+ import type pg from 'pg';
2
+ import type { LLMProvider } from '../providers/types.js';
3
+ import type { RunSummary } from './types.js';
4
+ export interface RunExtractorOptions {
5
+ client: pg.Pool | pg.PoolClient;
6
+ provider: LLMProvider;
7
+ dataDir: string;
8
+ maxCostUsd: number;
9
+ contextTopN: number;
10
+ sinceDays?: number;
11
+ full?: boolean;
12
+ resumeFrom?: string;
13
+ }
14
+ export declare function runExtractor(opts: RunExtractorOptions): Promise<RunSummary>;
@@ -0,0 +1,154 @@
1
+ // SPEC-046 Task 6 — extractor driver.
2
+ //
3
+ // Orchestrates pre-filter → LLM call → strict parse → DB writes → state
4
+ // recording → run summary, with cost cap enforcement and resume support.
5
+ // All DB writes go through the entity DB layer (Task 4), which accepts both
6
+ // pg.Pool and pg.PoolClient. State machinery is delegated to EntityState
7
+ // (Task 5).
8
+ import { randomUUID } from 'node:crypto';
9
+ import { withLLMCallSpan } from '../telemetry.js';
10
+ import { buildExtractionPrompt, parseExtractionResponse } from './prompt.js';
11
+ import { findCandidateMemories, getTopEntitiesByFrequency, upsertEntity, linkMemoryToEntity, } from './db.js';
12
+ import { normalizeEntityName } from './normalize.js';
13
+ import { EntityState } from './state.js';
14
+ export async function runExtractor(opts) {
15
+ const startedAt = new Date();
16
+ const runId = randomUUID();
17
+ const state = new EntityState({ runId, dataDir: opts.dataDir, resumeFrom: opts.resumeFrom });
18
+ let memories_seen = 0;
19
+ let memories_extracted = 0;
20
+ let entities_created = 0;
21
+ let entities_updated = 0;
22
+ let links_created = 0;
23
+ let total_cost_usd = 0;
24
+ let parse_failures = 0;
25
+ let hit_cost_cap = false;
26
+ let hallucinated_matched = 0;
27
+ const candidates = await findCandidateMemories(opts.client, {
28
+ sinceDays: opts.sinceDays,
29
+ full: opts.full,
30
+ });
31
+ const known = await getTopEntitiesByFrequency(opts.client, opts.contextTopN);
32
+ // claw-2jbo finding 1: build a normalized-name → entity-id map once per run
33
+ // so LLM-matched canonical_names resolve synchronously instead of issuing
34
+ // findEntityByInput() per match (N+1). The spec requires the LLM to echo a
35
+ // canonical_name verbatim from the known set, so the map should always hit
36
+ // for well-behaved LLM output. Map misses (counted via hallucinated_matched)
37
+ // are the hallucination signal — see finding 2.
38
+ //
39
+ // Key by normalized canonical_name AND each alias so a match-by-alias still
40
+ // resolves (aliases stored normalized; see db.ts upsertEntity/mergeAliases).
41
+ const knownById = new Map(); // normalized lookup → entity id
42
+ for (const e of known) {
43
+ knownById.set(e.normalized_name, e.id);
44
+ for (const alias of e.aliases)
45
+ knownById.set(alias, e.id);
46
+ }
47
+ for (const mem of candidates) {
48
+ memories_seen++;
49
+ if (state.isMemoryTerminal(mem.id))
50
+ continue;
51
+ if (total_cost_usd >= opts.maxCostUsd) {
52
+ state.recordTerminal(mem.id, 'cap_reached');
53
+ hit_cost_cap = true;
54
+ continue;
55
+ }
56
+ const prompt = buildExtractionPrompt({
57
+ memory_content: mem.content,
58
+ known_entities: known.map((e) => ({
59
+ type: e.type,
60
+ canonical_name: e.canonical_name,
61
+ aliases: e.aliases,
62
+ })),
63
+ });
64
+ let rawResponse;
65
+ try {
66
+ // claw-1ejd: wrap the LLM call in a child span so the parent context
67
+ // restored from OTEL_TRACEPARENT (set by the MCP wrapper) has a
68
+ // concrete operation to inherit. No-op when SDK is not initialized.
69
+ const result = await withLLMCallSpan('memory.extract_entities.call', { provider: opts.provider.name, model: 'haiku' }, () => opts.provider.complete({ prompt, model: 'haiku' }));
70
+ total_cost_usd += result.cost_usd;
71
+ rawResponse = result.response;
72
+ }
73
+ catch (e) {
74
+ // Surface provider failures via run summary error; do not mark terminal.
75
+ return finalize(state, startedAt, runId, {
76
+ memories_seen,
77
+ memories_extracted,
78
+ entities_created,
79
+ entities_updated,
80
+ links_created,
81
+ total_cost_usd,
82
+ hit_cost_cap,
83
+ parse_failures,
84
+ hallucinated_matched,
85
+ error: `provider error: ${e.message}`,
86
+ });
87
+ }
88
+ if (total_cost_usd >= opts.maxCostUsd)
89
+ hit_cost_cap = true;
90
+ const parsed = parseExtractionResponse(rawResponse);
91
+ if (!parsed.ok) {
92
+ parse_failures++;
93
+ state.recordParseFailed(mem.id, rawResponse);
94
+ continue;
95
+ }
96
+ for (const m of parsed.value.matched) {
97
+ // Synchronous lookup against the in-memory map built from the known set
98
+ // above (claw-2jbo finding 1). A miss means the LLM returned a
99
+ // canonical_name not in the context we provided — i.e. a hallucination.
100
+ // Increment hallucinated_matched and skip the link (no DB write for a
101
+ // canonical we never told the model about).
102
+ const entityId = knownById.get(normalizeEntityName(m.canonical_name));
103
+ if (!entityId) {
104
+ hallucinated_matched++;
105
+ continue;
106
+ }
107
+ const link = await linkMemoryToEntity(opts.client, mem.id, entityId, m.confidence, 'classifier');
108
+ if (link.inserted)
109
+ links_created++;
110
+ }
111
+ for (const n of parsed.value.new_entities) {
112
+ const up = await upsertEntity(opts.client, {
113
+ type: n.type,
114
+ canonical_name: n.canonical_name,
115
+ aliases: n.aliases,
116
+ });
117
+ if (up.created)
118
+ entities_created++;
119
+ else
120
+ entities_updated++;
121
+ // Alias merge happens inside upsertEntity's ON CONFLICT clause (see db.ts).
122
+ const link = await linkMemoryToEntity(opts.client, mem.id, up.id, n.confidence, 'classifier');
123
+ if (link.inserted)
124
+ links_created++;
125
+ }
126
+ memories_extracted++;
127
+ state.recordTerminal(mem.id, 'extracted');
128
+ }
129
+ return finalize(state, startedAt, runId, {
130
+ memories_seen,
131
+ memories_extracted,
132
+ entities_created,
133
+ entities_updated,
134
+ links_created,
135
+ total_cost_usd,
136
+ hit_cost_cap,
137
+ parse_failures,
138
+ hallucinated_matched,
139
+ });
140
+ }
141
+ function finalize(state, startedAt, runId, counts) {
142
+ const summary = {
143
+ run_id: runId,
144
+ started_at: startedAt.toISOString(),
145
+ ended_at: new Date().toISOString(),
146
+ ...counts,
147
+ };
148
+ if (counts.parse_failures > 0 && !summary.error) {
149
+ summary.error = `parse failures: ${counts.parse_failures}`;
150
+ }
151
+ state.writeRunSummary(summary);
152
+ state.close();
153
+ return summary;
154
+ }
@@ -0,0 +1,5 @@
1
+ /**
2
+ * SPEC-046 R3 normalization rule: NFKC → lowercase → trim → collapse internal whitespace.
3
+ * Applied identically to user input (recall), entity normalized_name writes, and alias compares.
4
+ */
5
+ export declare function normalizeEntityName(input: string): string;
@@ -0,0 +1,7 @@
1
+ /**
2
+ * SPEC-046 R3 normalization rule: NFKC → lowercase → trim → collapse internal whitespace.
3
+ * Applied identically to user input (recall), entity normalized_name writes, and alias compares.
4
+ */
5
export function normalizeEntityName(input) {
    // Step 1: compatibility-compose, then fold case.
    const folded = input.normalize('NFKC').toLowerCase();
    // Step 2: strip the edges, then squeeze every interior whitespace run
    // down to a single space.
    const trimmed = folded.trim();
    return trimmed.replace(/\s+/g, ' ');
}
@@ -0,0 +1,19 @@
1
+ import { EntityType, ExtractionResponse } from './types.js';
2
+ export type ParseResult = {
3
+ ok: true;
4
+ value: ExtractionResponse;
5
+ warnings: string[];
6
+ } | {
7
+ ok: false;
8
+ error: string;
9
+ };
10
+ export declare function parseExtractionResponse(raw: string): ParseResult;
11
+ export interface PromptInput {
12
+ memory_content: string;
13
+ known_entities: Array<{
14
+ type: EntityType;
15
+ canonical_name: string;
16
+ aliases: string[];
17
+ }>;
18
+ }
19
+ export declare function buildExtractionPrompt({ memory_content, known_entities }: PromptInput): string;
@@ -0,0 +1,100 @@
1
+ import { ENTITY_TYPES, } from './types.js';
2
/**
 * Shape check for one `matched` entry: a non-null object whose
 * `canonical_name` is a string and whose `confidence` is a number.
 */
function isMatched(x) {
    if (typeof x !== 'object' || x === null)
        return false;
    if (typeof x.canonical_name !== 'string')
        return false;
    return typeof x.confidence === 'number';
}
8
/** Clamp a number into the closed interval [0, 1]. */
function clamp01(n) {
    const cappedAtOne = Math.min(1, n);
    return Math.max(0, cappedAtOne);
}
11
+ export function parseExtractionResponse(raw) {
12
+ let json;
13
+ try {
14
+ json = JSON.parse(raw);
15
+ }
16
+ catch (e) {
17
+ return { ok: false, error: `invalid JSON: ${e.message}` };
18
+ }
19
+ if (typeof json !== 'object' || json === null)
20
+ return { ok: false, error: 'response is not an object' };
21
+ if (!Array.isArray(json.matched))
22
+ return { ok: false, error: 'missing or non-array `matched`' };
23
+ if (!Array.isArray(json.new_entities))
24
+ return { ok: false, error: 'missing or non-array `new_entities`' };
25
+ const warnings = [];
26
+ const matched = [];
27
+ for (const m of json.matched) {
28
+ if (!isMatched(m)) {
29
+ warnings.push(`dropped malformed matched entry: ${JSON.stringify(m).slice(0, 100)}`);
30
+ continue;
31
+ }
32
+ const c = clamp01(m.confidence);
33
+ if (c !== m.confidence)
34
+ warnings.push(`clamped matched confidence ${m.confidence} → ${c} for ${m.canonical_name}`);
35
+ matched.push({ canonical_name: m.canonical_name, confidence: c });
36
+ }
37
+ const new_entities = [];
38
+ for (const n of json.new_entities) {
39
+ if (typeof n !== 'object' || n === null) {
40
+ warnings.push(`dropped non-object new_entity`);
41
+ continue;
42
+ }
43
+ if (!ENTITY_TYPES.includes(n.type)) {
44
+ warnings.push(`dropped new_entity with invalid type: ${n.type}`);
45
+ continue;
46
+ }
47
+ if (typeof n.canonical_name !== 'string' || n.canonical_name.length === 0) {
48
+ warnings.push(`dropped new_entity with empty canonical_name`);
49
+ continue;
50
+ }
51
+ if (typeof n.confidence !== 'number') {
52
+ warnings.push(`dropped new_entity ${n.canonical_name}: confidence not a number`);
53
+ continue;
54
+ }
55
+ const aliases = Array.isArray(n.aliases)
56
+ ? n.aliases.filter((a) => typeof a === 'string')
57
+ : [];
58
+ const c = clamp01(n.confidence);
59
+ if (c !== n.confidence)
60
+ warnings.push(`clamped new_entity confidence ${n.confidence} → ${c} for ${n.canonical_name}`);
61
+ new_entities.push({
62
+ type: n.type,
63
+ canonical_name: n.canonical_name,
64
+ aliases,
65
+ confidence: c,
66
+ });
67
+ }
68
+ return { ok: true, value: { matched, new_entities }, warnings };
69
+ }
70
/**
 * Render the extraction prompt for one memory.
 *
 * Known entities are listed one per line as `- <type>: <canonical_name>`,
 * with an `(aliases: ...)` suffix when the entity has any; an empty list is
 * rendered as `(none yet)`. The memory text is embedded between triple-quote
 * fences and followed by the required JSON shape and the output rules.
 *
 * @param input - `{ memory_content, known_entities }`.
 * @returns The complete prompt string.
 */
export function buildExtractionPrompt({ memory_content, known_entities }) {
    const describe = (e) => {
        const aliasSuffix = e.aliases.length ? ` (aliases: ${e.aliases.join(', ')})` : '';
        return `- ${e.type}: ${e.canonical_name}${aliasSuffix}`;
    };
    const knownBlock = known_entities.length
        ? known_entities.map(describe).join('\n')
        : '(none yet)';
    const lines = [
        'You are an entity extractor. Identify entities in the memory below.',
        '',
        'KNOWN ENTITIES (use these canonical names exactly when matched):',
        knownBlock,
        '',
        'ENTITY TYPES: project, person, tool, decision (no other types allowed)',
        '',
        'MEMORY:',
        '"""',
        `${memory_content}`,
        '"""',
        '',
        'Return a single JSON object — no prose, no markdown — with this exact shape:',
        '{',
        '"matched": [{"canonical_name": "<must-match-known-entity-exactly>", "confidence": 0.0-1.0}],',
        '"new_entities": [{"type": "project|person|tool|decision", "canonical_name": "<name>", "aliases": ["alt1"], "confidence": 0.0-1.0}]',
        '}',
        '',
        'Rules:',
        '- "matched" canonical_names must EXACTLY match a known entity\'s canonical_name (case-sensitive).',
        '- "new_entities" type MUST be one of project, person, tool, decision. Other types will be rejected.',
        '- "aliases" is optional; omit or use [] if no aliases observed.',
        '- If the memory references no entities, return {"matched": [], "new_entities": []}.',
        '- Output JSON only. No code fences. No commentary.',
    ];
    return lines.join('\n');
}
@@ -0,0 +1,44 @@
1
+ import type { RunSummary } from './types.js';
2
+ export interface EntityStateInit {
3
+ runId: string;
4
+ dataDir: string;
5
+ resumeFrom?: string;
6
+ }
7
+ export declare class EntityState {
8
+ readonly runId: string;
9
+ private readonly dataDir;
10
+ private readonly stateFile;
11
+ private readonly terminalMemoryIds;
12
+ constructor({ runId, dataDir, resumeFrom }: EntityStateInit);
13
+ private loadTerminalSet;
14
+ isMemoryTerminal(memoryId: string): boolean;
15
+ /**
16
+ * Append a terminal-status row for one memory. Flushes synchronously.
17
+ *
18
+ * Per-record sync flush (see appendRecord) is intentional: it guarantees
19
+ * a crash mid-run can be resumed exactly. At current corpus scale (~100
20
+ * memories per run) the syscall overhead is negligible — the LLM call
21
+ * dominates the loop.
22
+ *
23
+ * TODO(perf, claw-2jbo finding 5): batch flushes (every N records or
24
+ * every M seconds) when the corpus exceeds ~1000 memories. Until then,
25
+ * durability beats batching.
26
+ */
27
+ recordTerminal(memoryId: string, status: 'extracted' | 'cap_reached' | 'skipped'): void;
28
+ /**
29
+ * Append a parse_failed row (non-terminal — resume will retry).
30
+ *
31
+ * Same per-record sync flush as recordTerminal; same durability rationale.
32
+ * See TODO(perf, claw-2jbo finding 5) on recordTerminal.
33
+ */
34
+ recordParseFailed(memoryId: string, raw: string): void;
35
+ /**
36
+ * Sync-append a state record to the JSONL state file. Per-record flush
37
+ * is durability-first by design — see recordTerminal doc-comment for the
38
+ * crash-recovery rationale and the batching TODO. Do not refactor to
39
+ * async/batched writes without a covering benchmark on real backfill load.
40
+ */
41
+ private appendRecord;
42
+ close(): void;
43
+ writeRunSummary(summary: RunSummary): void;
44
+ }
@@ -0,0 +1,99 @@
1
+ import { appendFileSync, existsSync, mkdirSync, readFileSync, writeFileSync, closeSync, openSync, } from 'node:fs';
2
+ import { join } from 'node:path';
3
+ const RAW_TRUNC = 2048;
4
+ const STATE_FILE = 'entity-state.jsonl';
5
+ const RUNS_DIR = 'entity-state.runs';
6
/**
 * Append-only JSONL bookkeeping for extractor runs.
 *
 * Every processed memory gets one record in the shared state file under the
 * data directory. When constructed with `resumeFrom`, the records written by
 * that earlier run are read back and the memories it finished are reported
 * as terminal so the caller can skip them.
 */
export class EntityState {
    runId;
    dataDir;
    stateFile;
    terminalMemoryIds = new Set();
    /**
     * @param init - `{ runId, dataDir, resumeFrom? }`. The data directory and
     *   the state file are created if missing.
     */
    constructor({ runId, dataDir, resumeFrom }) {
        this.runId = runId;
        this.dataDir = dataDir;
        // recursive:true is a no-op for an existing directory, so no
        // check-then-create step is needed.
        mkdirSync(dataDir, { recursive: true });
        this.stateFile = join(dataDir, STATE_FILE);
        // Opening in append mode creates the file when absent and leaves
        // existing content untouched.
        closeSync(openSync(this.stateFile, 'a'));
        if (resumeFrom)
            this.loadTerminalSet(resumeFrom);
    }
    /**
     * Populate the terminal set from the records of run `resumeRunId`.
     * Blank lines, lines that are not valid JSON, and JSON values that are
     * not objects are skipped.
     */
    loadTerminalSet(resumeRunId) {
        const content = readFileSync(this.stateFile, 'utf8');
        for (const line of content.split('\n')) {
            if (!line.trim())
                continue;
            let rec;
            try {
                rec = JSON.parse(line);
            }
            catch {
                continue;
            }
            // A line holding bare `null` (or any non-object) parses fine but has
            // no fields; skip it instead of throwing on the property reads below.
            if (typeof rec !== 'object' || rec === null)
                continue;
            if (rec.run_id !== resumeRunId)
                continue;
            if (rec.status === 'extracted' || rec.status === 'cap_reached' || rec.status === 'skipped') {
                this.terminalMemoryIds.add(rec.memory_id);
            }
            // parse_failed is intentionally non-terminal — re-run will retry
        }
    }
    /** True when the memory was finished by the resumed run or by this one. */
    isMemoryTerminal(memoryId) {
        return this.terminalMemoryIds.has(memoryId);
    }
    /**
     * Append a terminal-status row for one memory. Flushes synchronously.
     *
     * Per-record sync flush (see appendRecord) is intentional: it guarantees
     * a crash mid-run can be resumed exactly. At current corpus scale (~100
     * memories per run) the syscall overhead is negligible — the LLM call
     * dominates the loop.
     *
     * TODO(perf, claw-2jbo finding 5): batch flushes (every N records or
     * every M seconds) when the corpus exceeds ~1000 memories. Until then,
     * durability beats batching.
     */
    recordTerminal(memoryId, status) {
        this.terminalMemoryIds.add(memoryId);
        this.appendRecord({
            run_id: this.runId,
            memory_id: memoryId,
            status,
            timestamp: new Date().toISOString(),
        });
    }
    /**
     * Append a parse_failed row (non-terminal — resume will retry). The raw
     * response is stored truncated to RAW_TRUNC characters.
     *
     * Same per-record sync flush as recordTerminal; same durability rationale.
     * See TODO(perf, claw-2jbo finding 5) on recordTerminal.
     */
    recordParseFailed(memoryId, raw) {
        this.appendRecord({
            run_id: this.runId,
            memory_id: memoryId,
            status: 'parse_failed',
            timestamp: new Date().toISOString(),
            raw: raw.length > RAW_TRUNC ? raw.slice(0, RAW_TRUNC) : raw,
        });
    }
    /**
     * Sync-append a state record to the JSONL state file. Per-record flush
     * is durability-first by design — see recordTerminal doc-comment for the
     * crash-recovery rationale and the batching TODO. Do not refactor to
     * async/batched writes without a covering benchmark on real backfill load.
     */
    appendRecord(rec) {
        appendFileSync(this.stateFile, JSON.stringify(rec) + '\n');
    }
    close() {
        /* explicit no-op; appendFileSync flushes per call */
    }
    /** Write the run summary as pretty-printed JSON to `<dataDir>/<RUNS_DIR>/<run_id>.json`. */
    writeRunSummary(summary) {
        const runsDir = join(this.dataDir, RUNS_DIR);
        mkdirSync(runsDir, { recursive: true });
        writeFileSync(join(runsDir, `${summary.run_id}.json`), JSON.stringify(summary, null, 2));
    }
}
@@ -0,0 +1,62 @@
1
+ export type EntityType = 'project' | 'person' | 'tool' | 'decision';
2
+ export declare const ENTITY_TYPES: readonly EntityType[];
3
+ export interface EntityRow {
4
+ id: string;
5
+ type: EntityType;
6
+ canonical_name: string;
7
+ normalized_name: string;
8
+ aliases: string[];
9
+ metadata: Record<string, unknown>;
10
+ first_seen_at: Date;
11
+ last_seen_at: Date;
12
+ }
13
+ export interface MemoryEntityLink {
14
+ memory_id: string;
15
+ entity_id: string;
16
+ confidence: number;
17
+ source: string;
18
+ }
19
+ export interface ExtractionMatched {
20
+ canonical_name: string;
21
+ confidence: number;
22
+ }
23
+ export interface ExtractionNewEntity {
24
+ type: EntityType;
25
+ canonical_name: string;
26
+ aliases?: string[];
27
+ confidence: number;
28
+ }
29
+ export interface ExtractionResponse {
30
+ matched: ExtractionMatched[];
31
+ new_entities: ExtractionNewEntity[];
32
+ }
33
+ export interface RunSummary {
34
+ run_id: string;
35
+ started_at: string;
36
+ ended_at: string;
37
+ memories_seen: number;
38
+ memories_extracted: number;
39
+ entities_created: number;
40
+ entities_updated: number;
41
+ links_created: number;
42
+ total_cost_usd: number;
43
+ hit_cost_cap: boolean;
44
+ error?: string;
45
+ parse_failures?: number;
46
+ /**
47
+ * Count of `matched` entries returned by the LLM whose canonical_name did
48
+ * not appear in the known-entities context. The spec requires the LLM to
49
+ * echo a canonical_name verbatim from the known set; a miss here is an
50
+ * LLM hallucination. These are silently dropped (no DB write), but the
51
+ * count is surfaced so observability can alarm if it climbs. Added in
52
+ * claw-2jbo (PR #1 finding 2).
53
+ */
54
+ hallucinated_matched: number;
55
+ }
56
+ export interface StateRecord {
57
+ run_id: string;
58
+ memory_id: string;
59
+ status: 'extracted' | 'parse_failed' | 'cap_reached' | 'skipped';
60
+ timestamp: string;
61
+ raw?: string;
62
+ }
@@ -0,0 +1,6 @@
1
+ export const ENTITY_TYPES = [
2
+ 'project',
3
+ 'person',
4
+ 'tool',
5
+ 'decision',
6
+ ];