nlm-memory 0.5.0 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +72 -34
- package/dist/cli/nlm.js +2 -1
- package/dist/cli/nlm.js.map +1 -1
- package/dist/http/app.js +2 -1
- package/dist/http/app.js.map +1 -1
- package/dist/mcp/server.js +20 -1
- package/dist/mcp/server.js.map +1 -1
- package/dist/ui/assets/{index-C8cpwbYJ.css → index-Beo8psd-.css} +1 -1
- package/dist/ui/assets/{index-CB50QnL-.js → index-CSPTTeeM.js} +8 -8
- package/dist/ui/index.html +2 -2
- package/package.json +26 -1
- package/.agents/plugins/marketplace.json +0 -20
- package/.github/workflows/ci.yml +0 -30
- package/docs/methodology/re-derivation-rate.md +0 -112
- package/docs/methodology/useful-hit-rate.md +0 -79
- package/docs/plans/2026-05-20-fts5-lexical-recall.md +0 -1088
- package/docs/plans/2026-05-20-recall-daemon-wedge-fix.md +0 -662
- package/docs/plans/2026-05-20-recall-hook-design.md +0 -131
- package/docs/plans/2026-05-20-recall-hook-implementation.md +0 -1222
- package/docs/plans/desktop-product.md +0 -69
- package/docs/plans/factstore-design.md +0 -236
- package/logs/CHANGELOG/CHANGELOG-2026.md +0 -1575
- package/logs/CHANGELOG/CHANGELOG.md +0 -209
- package/migrations/000_initial_schema.sql +0 -174
- package/migrations/001_entity_type_rename.sql +0 -17
- package/migrations/002_adapter_state_extend.sql +0 -12
- package/migrations/003_session_embeddings.sql +0 -11
- package/migrations/004_facts.sql +0 -46
- package/migrations/005_sources.sql +0 -31
- package/migrations/006_providers.sql +0 -33
- package/migrations/007_source_tokens.sql +0 -17
- package/migrations/008_fts_rebuild.sql +0 -9
- package/migrations/009_session_embedding_chunks.sql +0 -46
- package/migrations/010_sources_opencode.sql +0 -30
- package/migrations/011_sources_hermes_agent.sql +0 -30
- package/migrations/012_sources_aider.sql +0 -30
- package/migrations/013_adapter_state_failure_count.sql +0 -12
- package/migrations/014_sources_cursor.sql +0 -30
- package/migrations/015_sources_windsurf.sql +0 -30
- package/plugin-hermes-agent/README.md +0 -49
- package/plugin-hermes-agent/__init__.py +0 -75
- package/plugin-hermes-agent/plugin.yaml +0 -15
- package/scripts/backfill-citations.mjs +0 -0
- package/scripts/build-codex-plugin.mjs +0 -61
- package/scripts/deepseek-probe.mjs +0 -67
- package/scripts/extract-triples.mjs +0 -207
- package/scripts/longmemeval/embedding-cache.ts +0 -77
- package/scripts/longmemeval/fetch-dataset.sh +0 -25
- package/scripts/longmemeval/run-harness.ts +0 -315
- package/scripts/longmemeval/scorer.ts +0 -99
- package/scripts/longmemeval/tsconfig.json +0 -9
- package/scripts/longmemeval/types.ts +0 -35
- package/scripts/nlm-daily-digest.py +0 -239
- package/scripts/nlm-daily-digest.sh +0 -28
- package/src/cli/classify-parity.ts +0 -257
- package/src/cli/launchctl-helpers.ts +0 -49
- package/src/cli/nlm.ts +0 -1078
- package/src/core/actions/actions-log.ts +0 -118
- package/src/core/actions/overlay.ts +0 -117
- package/src/core/adapters/aider.ts +0 -205
- package/src/core/adapters/claude-code.ts +0 -293
- package/src/core/adapters/common.ts +0 -54
- package/src/core/adapters/cursor.ts +0 -486
- package/src/core/adapters/from-source.ts +0 -67
- package/src/core/adapters/hermes-agent.ts +0 -240
- package/src/core/adapters/hermes.ts +0 -277
- package/src/core/adapters/jsonl-generic.ts +0 -208
- package/src/core/adapters/opencode.ts +0 -281
- package/src/core/adapters/pi.ts +0 -264
- package/src/core/adapters/windsurf.ts +0 -386
- package/src/core/classifier/prompt.ts +0 -200
- package/src/core/dataset/build-dataset.ts +0 -463
- package/src/core/embedding/chunk-body.ts +0 -76
- package/src/core/embedding/embed-backfill.ts +0 -210
- package/src/core/embedding/embed-normalize.ts +0 -135
- package/src/core/facts/backfill-facts.ts +0 -254
- package/src/core/facts/extract-facts.ts +0 -50
- package/src/core/hook/citation-detect.ts +0 -124
- package/src/core/hook/cite-memo.ts +0 -68
- package/src/core/hook/claude-settings.ts +0 -187
- package/src/core/hook/gate.ts +0 -25
- package/src/core/hook/hook-log.ts +0 -41
- package/src/core/hook/memo-sweep.ts +0 -164
- package/src/core/hook/memo.ts +0 -67
- package/src/core/hook/pointer-block.ts +0 -26
- package/src/core/hook/select.ts +0 -32
- package/src/core/hook/transcript.ts +0 -121
- package/src/core/ingest/ingest-session.ts +0 -111
- package/src/core/providers/provider-models.ts +0 -100
- package/src/core/providers/provider-registry.ts +0 -196
- package/src/core/recall/citation-log.ts +0 -108
- package/src/core/recall/filter.ts +0 -27
- package/src/core/recall/index.ts +0 -6
- package/src/core/recall/match-fields.ts +0 -40
- package/src/core/recall/query-log.ts +0 -149
- package/src/core/recall/query-shape.ts +0 -66
- package/src/core/recall/recall-service.ts +0 -320
- package/src/core/recall/recent-log.ts +0 -59
- package/src/core/recall/tokenize.ts +0 -18
- package/src/core/recall/useful-scan.ts +0 -336
- package/src/core/recall-facts/fact-query-log.ts +0 -150
- package/src/core/recall-facts/fact-recall-service.ts +0 -327
- package/src/core/scheduler/scan-once.ts +0 -142
- package/src/core/scheduler/scheduler.ts +0 -225
- package/src/core/sources/source-registry.ts +0 -278
- package/src/core/storage/db-restore.ts +0 -133
- package/src/core/storage/live-status.ts +0 -45
- package/src/core/storage/migrate.ts +0 -72
- package/src/core/storage/sqlite-fact-store.ts +0 -304
- package/src/core/storage/sqlite-session-store.ts +0 -810
- package/src/hook/hook-auth.ts +0 -18
- package/src/hook/prompt-recall-hook.ts +0 -180
- package/src/hook/session-end-hook.ts +0 -81
- package/src/hook/session-start-hook.ts +0 -168
- package/src/hook/stop-hook.ts +0 -239
- package/src/http/app.ts +0 -1215
- package/src/install/claude-code.ts +0 -128
- package/src/install/codex.ts +0 -367
- package/src/install/cursor.ts +0 -68
- package/src/install/hermes-agent.ts +0 -76
- package/src/install/hermes.ts +0 -78
- package/src/install/nlm-dir-perms.ts +0 -55
- package/src/install/ollama.ts +0 -284
- package/src/install/setup.ts +0 -489
- package/src/install/windsurf.ts +0 -68
- package/src/llm/classifier-box.ts +0 -64
- package/src/llm/deepseek-client.ts +0 -150
- package/src/llm/env-autoload.ts +0 -55
- package/src/llm/ollama-client.ts +0 -189
- package/src/mcp/server.ts +0 -534
- package/src/ports/fact-store.ts +0 -102
- package/src/ports/llm-client.ts +0 -52
- package/src/ports/logger.ts +0 -16
- package/src/ports/session-store.ts +0 -45
- package/src/ports/transcript-adapter.ts +0 -55
- package/src/shared/types.ts +0 -149
- package/src/ui/App.tsx +0 -58
- package/src/ui/components/PromoteOpenButton.tsx +0 -65
- package/src/ui/components/SessionDrawer.tsx +0 -199
- package/src/ui/components/SideNav.tsx +0 -162
- package/src/ui/components/Skeleton.tsx +0 -107
- package/src/ui/index.html +0 -13
- package/src/ui/lib/actions.ts +0 -30
- package/src/ui/lib/api.ts +0 -92
- package/src/ui/lib/dataset.ts +0 -141
- package/src/ui/lib/registries.ts +0 -155
- package/src/ui/lib/view-settings.ts +0 -41
- package/src/ui/main.tsx +0 -15
- package/src/ui/pages/Live.tsx +0 -229
- package/src/ui/pages/Pulse.tsx +0 -415
- package/src/ui/pages/Recall.tsx +0 -190
- package/src/ui/pages/River.tsx +0 -354
- package/src/ui/pages/Search.tsx +0 -386
- package/src/ui/pages/Stub.tsx +0 -9
- package/src/ui/pages/Thread.tsx +0 -473
- package/src/ui/pages/settings/Classifier.tsx +0 -227
- package/src/ui/pages/settings/Data.tsx +0 -190
- package/src/ui/pages/settings/Index.tsx +0 -65
- package/src/ui/pages/settings/Labels.tsx +0 -224
- package/src/ui/pages/settings/Providers.tsx +0 -305
- package/src/ui/pages/settings/SettingsSubnav.tsx +0 -28
- package/src/ui/pages/settings/Sources.tsx +0 -326
- package/src/ui/pages/settings/Views.tsx +0 -96
- package/src/ui/styles.css +0 -1890
- package/src/ui/tsconfig.json +0 -21
- package/src/ui/vite.config.ts +0 -19
- package/tests/fixtures/claude_code/short_session.jsonl +0 -2
- package/tests/fixtures/claude_code/standard_iso.jsonl +0 -4
- package/tests/fixtures/claude_code/tool_heavy.jsonl +0 -8
- package/tests/fixtures/claude_code/with_subagent.jsonl +0 -7
- package/tests/fixtures/facts.ts +0 -17
- package/tests/fixtures/golden-corpus.ts +0 -85
- package/tests/fixtures/hermes/paired_request_dump.json +0 -24
- package/tests/fixtures/hermes/paired_session.json +0 -23
- package/tests/fixtures/hermes/request_dump.json +0 -28
- package/tests/fixtures/hermes/session_iso.json +0 -38
- package/tests/fixtures/hermes/session_unix.json +0 -38
- package/tests/fixtures/hermes/system_only.json +0 -18
- package/tests/fixtures/pi/error-connection-abort.jsonl +0 -8
- package/tests/fixtures/pi/short-successful.jsonl +0 -5
- package/tests/fixtures/pi/with-custom-message.jsonl +0 -6
- package/tests/fixtures/sessions.ts +0 -22
- package/tests/integration/backfill-facts.test.ts +0 -362
- package/tests/integration/citation-explicit.test.ts +0 -111
- package/tests/integration/cite-event.test.ts +0 -169
- package/tests/integration/cite-memo.test.ts +0 -87
- package/tests/integration/db-restore.test.ts +0 -153
- package/tests/integration/embed-backfill.test.ts +0 -176
- package/tests/integration/fact-supersedence.test.ts +0 -313
- package/tests/integration/fts-index.test.ts +0 -60
- package/tests/integration/getbyids-sqlite.test.ts +0 -100
- package/tests/integration/hermes-agent-hooks.test.ts +0 -248
- package/tests/integration/hook-claude-settings.test.ts +0 -218
- package/tests/integration/hook-log.test.ts +0 -54
- package/tests/integration/hook-memo.test.ts +0 -68
- package/tests/integration/hook-pre-compact.test.ts +0 -105
- package/tests/integration/hook-subagent-start.test.ts +0 -102
- package/tests/integration/http.test.ts +0 -401
- package/tests/integration/keyword-search-fts.test.ts +0 -66
- package/tests/integration/mcp-recall-logging.test.ts +0 -88
- package/tests/integration/mcp.test.ts +0 -260
- package/tests/integration/memo-sweep.test.ts +0 -91
- package/tests/integration/prompt-recall-hook.test.ts +0 -88
- package/tests/integration/provider-registry.test.ts +0 -107
- package/tests/integration/recall-golden.test.ts +0 -59
- package/tests/integration/recall-sqlite.test.ts +0 -169
- package/tests/integration/scheduler.test.ts +0 -391
- package/tests/integration/session-end-hook.test.ts +0 -48
- package/tests/integration/session-start-hook.test.ts +0 -126
- package/tests/integration/source-registry.test.ts +0 -122
- package/tests/integration/sqlite-fact-store.test.ts +0 -346
- package/tests/integration/stop-hook.test.ts +0 -560
- package/tests/integration/wal-checkpoint.test.ts +0 -49
- package/tests/unit/cli/launchctl-helpers.test.ts +0 -60
- package/tests/unit/core/adapters/aider.test.ts +0 -230
- package/tests/unit/core/adapters/claude-code.test.ts +0 -118
- package/tests/unit/core/adapters/cursor.test.ts +0 -485
- package/tests/unit/core/adapters/hermes-agent.test.ts +0 -329
- package/tests/unit/core/adapters/hermes.test.ts +0 -81
- package/tests/unit/core/adapters/jsonl-generic.test.ts +0 -142
- package/tests/unit/core/adapters/opencode.test.ts +0 -354
- package/tests/unit/core/adapters/pi.test.ts +0 -110
- package/tests/unit/core/adapters/windsurf.test.ts +0 -416
- package/tests/unit/core/classifier/prompt.test.ts +0 -126
- package/tests/unit/core/embedding/chunk-body.test.ts +0 -100
- package/tests/unit/core/facts/extract-facts.test.ts +0 -117
- package/tests/unit/core/filter.test.ts +0 -40
- package/tests/unit/core/hook/citation-detect-cite-session.test.ts +0 -96
- package/tests/unit/core/hook/citation-detect.test.ts +0 -124
- package/tests/unit/core/hook/gate.test.ts +0 -29
- package/tests/unit/core/hook/pointer-block.test.ts +0 -22
- package/tests/unit/core/hook/select.test.ts +0 -66
- package/tests/unit/core/match-fields.test.ts +0 -39
- package/tests/unit/core/mcp-cite-session.test.ts +0 -51
- package/tests/unit/core/providers/provider-models.test.ts +0 -101
- package/tests/unit/core/query-shape.test.ts +0 -92
- package/tests/unit/core/recall-facts/fact-recall-service.test.ts +0 -258
- package/tests/unit/core/recall-service.test.ts +0 -200
- package/tests/unit/core/storage/live-status.test.ts +0 -54
- package/tests/unit/core/tokenize.test.ts +0 -32
- package/tests/unit/core/useful-scan.test.ts +0 -537
- package/tests/unit/llm/embed.test.ts +0 -93
- package/tests/unit/llm/ollama-client.test.ts +0 -124
- package/tests/unit/scripts/longmemeval-scorer.test.ts +0 -114
- package/tsconfig.json +0 -31
- package/tsconfig.test.json +0 -11
- package/vitest.config.ts +0 -22
|
@@ -1,76 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* chunk-body — split a session body into ≤MAX_CHUNK_CHARS-char windows
|
|
3
|
-
* for the chunk + max-pool semantic index. Header (label + summary) is
|
|
4
|
-
* prepended to the first chunk so it's always part of the index without
|
|
5
|
-
* inflating later chunk sizes.
|
|
6
|
-
*
|
|
7
|
-
* MAX_CHUNK_CHARS sized for nomic-embed-text's 2048-token context. Char
|
|
8
|
-
* density varies by content: prose ~4 chars/token, code/tool-output ~3
|
|
9
|
-
* chars/token. The 2026-05-26 backfill bisect found the cliff at ~6,388
|
|
10
|
-
* chars for token-dense Claude Code session bodies — 5,500 holds a safe
|
|
11
|
-
* margin and eliminates the "input exceeds context length" 500s that
|
|
12
|
-
* drove ~76% per-chunk rejection at 7,500. See 2026-05-26 CHANGELOG.
|
|
13
|
-
*
|
|
14
|
-
* OVERLAP_CHARS preserves context across boundaries so a phrase split
|
|
15
|
-
* mid-chunk still appears intact in one neighboring chunk.
|
|
16
|
-
*
|
|
17
|
-
* Pure function: no I/O and no side effects; the result depends only on the arguments and the module defaults.
|
|
18
|
-
*/
|
|
19
|
-
|
|
20
|
-
/**
 * Default ceiling on chunk length, measured in JS string length units.
 * chunkSessionText falls back to this when `opts.maxChars` is not given.
 */
export const MAX_CHUNK_CHARS = 5500;

/**
 * Default number of characters shared by consecutive body windows.
 * chunkSessionText falls back to this when `opts.overlap` is not given.
 */
export const OVERLAP_CHARS = 500;
|
|
22
|
-
|
|
23
|
-
/**
 * Text fields of one session handed to chunkSessionText.
 * Every field may be absent, null, or blank; blank fields are ignored.
 */
export interface ChunkInput {
  /** Session label; trimmed and placed first in the header. */
  readonly label?: string | null;
  /** Session summary; trimmed and placed after the label in the header. */
  readonly summary?: string | null;
  /** Session body; trimmed, then split into windows. */
  readonly body?: string | null;
}
|
|
28
|
-
|
|
29
|
-
/** Optional sizing overrides for chunkSessionText. */
export interface ChunkOptions {
  /** Maximum chunk length; must be > 0. Defaults to MAX_CHUNK_CHARS. */
  readonly maxChars?: number;
  /**
   * Characters shared by consecutive body windows; must satisfy
   * 0 <= overlap < maxChars. Defaults to OVERLAP_CHARS.
   */
  readonly overlap?: number;
}
|
|
33
|
-
|
|
34
|
-
/**
 * Split a session into text chunks no longer than `maxChars`.
 *
 * The header is the trimmed label and summary joined by a space. When the
 * header leaves room for at least one body character, the first chunk is
 * `header + " " + <start of body>` and the remaining chunks are overlapping
 * body windows. When the header alone fills (or exceeds) `maxChars`, the
 * header is windowed on its own and the body windows follow, so no chunk
 * ever exceeds `maxChars`.
 *
 * @param input label / summary / body of the session; blank fields are ignored.
 * @param opts  optional `maxChars` and `overlap` overrides.
 * @returns chunks in index order; `[]` when there is no text at all.
 * @throws Error when `maxChars <= 0` or `overlap` is outside `[0, maxChars)`.
 */
export function chunkSessionText(
  input: ChunkInput,
  opts: ChunkOptions = {},
): string[] {
  const maxChars = opts.maxChars ?? MAX_CHUNK_CHARS;
  const overlap = opts.overlap ?? OVERLAP_CHARS;
  if (maxChars <= 0) throw new Error("chunkSessionText: maxChars must be > 0");
  if (overlap < 0 || overlap >= maxChars) {
    throw new Error("chunkSessionText: overlap must satisfy 0 <= overlap < maxChars");
  }

  const header = [input.label ?? "", input.summary ?? ""]
    .map((s) => s.trim())
    .filter((s) => s.length > 0)
    .join(" ");
  const body = (input.body ?? "").trim();

  if (!header && !body) return [];

  // Sliding windows of at most maxChars over `text`, starting at `start` and
  // advancing by (maxChars - overlap). Windows that trim to "" are dropped.
  const step = maxChars - overlap;
  const slideWindows = (text: string, start: number): string[] => {
    const out: string[] = [];
    let pos = start;
    while (pos < text.length) {
      const end = Math.min(pos + maxChars, text.length);
      const slice = text.slice(pos, end).trim();
      if (slice.length > 0) out.push(slice);
      if (end >= text.length) break;
      pos += step;
    }
    return out;
  };

  // Header only: a single chunk when it fits, windows when it does not.
  if (!body) return slideWindows(header, 0);

  const headerPrefix = header ? header + " " : "";
  const firstBodyBudget = maxChars - headerPrefix.length;

  // The header leaves no room for body text. Gluing body onto it would
  // overflow maxChars (and the embedder's context), so window both parts
  // separately, header first.
  if (firstBodyBudget < 1) {
    return [...slideWindows(header, 0), ...slideWindows(body, 0)];
  }

  if (body.length <= firstBodyBudget) {
    return [(headerPrefix + body).trim()];
  }

  // First chunk: header + as much body as fits. Later chunks: body windows
  // that start `overlap` characters before the point the first chunk reached.
  return [
    (headerPrefix + body.slice(0, firstBodyBudget)).trim(),
    ...slideWindows(body, Math.max(0, firstBodyBudget - overlap)),
  ];
}
|
|
@@ -1,210 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* embed-backfill — re-embed every session in canonical.sqlite into the
|
|
3
|
-
* chunk + max-pool index (session_embedding_chunks). Replaces the prior
|
|
4
|
-
* one-vector-per-session backfill that wrote to session_embeddings.
|
|
5
|
-
*
|
|
6
|
-
* For each session: chunk (label + summary + body) via chunkSessionText,
|
|
7
|
-
* embed each chunk with kind="document", and write to the chunk table +
|
|
8
|
-
* session_chunk_map via the same INSERT pair used by the live ingest path.
|
|
9
|
-
*
|
|
10
|
-
* Resumable via a JSON state file at $NLM_EMBED_STATE (default
|
|
11
|
-
* ~/.nlm/embed_reembed.state). Interrupting + rerunning skips already-done
|
|
12
|
-
* session ids. A session is considered "done" as soon as ANY of its chunks
|
|
13
|
-
* embeds and is stored — only sessions with no stored chunk are retried.
|
|
14
|
-
*
|
|
15
|
-
* Layering: depends on the LLMClient port. SQLite touched directly via
|
|
16
|
-
* better-sqlite3 because this is a one-shot operational tool, not a hot
|
|
17
|
-
* path. Lives under core/ but is invoked from the CLI composition root.
|
|
18
|
-
*/
|
|
19
|
-
|
|
20
|
-
import { existsSync, mkdirSync, readFileSync, renameSync, rmSync, writeFileSync } from "node:fs";
import { dirname, join } from "node:path";
import { homedir } from "node:os";
import Database from "better-sqlite3";
import * as sqliteVec from "sqlite-vec";
import type { LLMClient } from "@ports/llm-client.js";
import { LLMUnreachableError } from "@ports/llm-client.js";
import { chunkSessionText } from "@core/embedding/chunk-body.js";
|
|
28
|
-
|
|
29
|
-
/** Resume-state file used when the caller does not pass `statePath`. */
const DEFAULT_STATE_PATH = join(homedir(), ".nlm", "embed_reembed.state");

/** The state file is flushed every time this many more sessions have succeeded. */
const SAVE_EVERY = 25;
|
|
31
|
-
|
|
32
|
-
/** Inputs for reembedCorpus. */
export interface BackfillOptions {
  /** SQLite database file. If it does not exist the run returns `dbMissing: true`. */
  readonly dbPath: string;
  /** Client whose `embed(chunk, "document")` is called once per chunk. */
  readonly embedder: LLMClient;
  /** Resume-state file; defaults to DEFAULT_STATE_PATH. */
  readonly statePath?: string;
  /** When truthy, added (truncated to an integer) as a LIMIT on the session query. */
  readonly limit?: number;
  /**
   * Called once per attempted session with its 1-based position, the row
   * count, the session id, and a human-readable status. Not called for
   * sessions skipped because the state file already lists them.
   */
  readonly onProgress?: (i: number, total: number, sid: string, status: string) => void;
}
|
|
39
|
-
|
|
40
|
-
/** Counters returned by reembedCorpus. */
export interface BackfillReport {
  /** Number of session rows selected by the query. */
  readonly total: number;
  /** succeeded + failed + skippedAlreadyDone. */
  readonly processed: number;
  /** Sessions for which at least one chunk vector was stored. */
  readonly succeeded: number;
  /** Sessions with no text, no embeddable chunk, or a failed database write. */
  readonly failed: number;
  /** Sessions skipped because the state file already lists them. */
  readonly skippedAlreadyDone: number;
  /** True when `dbPath` did not exist and nothing was attempted. */
  readonly dbMissing: boolean;
}
|
|
48
|
-
|
|
49
|
-
/** Row shape returned by the `sessions` query in reembedCorpus. */
interface SessionRow {
  /** Session id; also the key stored in the resume-state file. */
  id: string;
  /** Nullable text columns, passed straight to chunkSessionText. */
  label: string | null;
  summary: string | null;
  body: string | null;
}
|
|
55
|
-
|
|
56
|
-
/**
 * Read the set of already-processed session ids from the resume-state file.
 *
 * Best-effort by design: a missing, unreadable, or malformed file yields an
 * empty set so the run simply starts over. Only string entries of a `done`
 * array are accepted. Without that check a string-valued `done` would be
 * spread into single characters and non-string entries would pollute the set.
 *
 * @param path location of the JSON state file (`{ "done": string[] }`).
 * @returns ids recorded as done; empty when nothing usable is found.
 */
function loadState(path: string): Set<string> {
  if (!existsSync(path)) return new Set();
  try {
    const parsed: unknown = JSON.parse(readFileSync(path, "utf8"));
    const done = (parsed as { done?: unknown } | null)?.done;
    if (!Array.isArray(done)) return new Set();
    return new Set(done.filter((id): id is string => typeof id === "string"));
  } catch {
    // Unreadable or corrupt state is treated as "no progress recorded".
    return new Set();
  }
}
|
|
65
|
-
|
|
66
|
-
/**
 * Persist the set of processed session ids as `{ "done": [...] }` (sorted).
 *
 * The JSON is written to a sibling temp file and then renamed over the
 * target, so an interrupt mid-write cannot leave a truncated state file.
 * loadState would read a truncated file as "nothing done".
 *
 * @param path destination file; missing parent directories are created.
 * @param done ids to record.
 */
function saveState(path: string, done: Set<string>): void {
  mkdirSync(dirname(path), { recursive: true });
  const tmpPath = `${path}.${process.pid}.tmp`;
  writeFileSync(tmpPath, JSON.stringify({ done: [...done].sort() }));
  renameSync(tmpPath, path);
}
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
/**
 * Embed one chunk as a "document", retrying once after 200 ms when the
 * embedder throws LLMUnreachableError.
 *
 * @returns the vector, or null when both attempts fail with LLMUnreachableError.
 * @throws any other error raised by the embedder.
 */
async function embedChunkWithRetry(
  embedder: LLMClient,
  chunk: string,
): Promise<Float32Array | null> {
  for (let attempt = 0; attempt < 2; attempt++) {
    try {
      const out = await embedder.embed(chunk, "document");
      return out.vector;
    } catch (e) {
      if (!(e instanceof LLMUnreachableError)) throw e;
      if (attempt === 0) await new Promise((r) => setTimeout(r, 200));
    }
  }
  return null;
}

/**
 * Re-embed sessions into session_embedding_chunks / session_chunk_map.
 *
 * For every selected session not already listed in the state file: chunk its
 * label + summary + body, embed each chunk, then replace the session's stored
 * chunks with the new vectors. A session counts as succeeded (and is recorded
 * as done) when at least one chunk was embedded and stored.
 *
 * The delete + inserts for one session run inside a single transaction, so a
 * failed write leaves that session's previous chunks untouched. If the run
 * aborts on an unexpected error, progress made since the last periodic save
 * is flushed to the state file before the error propagates.
 *
 * Note: `limit` caps the rows selected, not the rows processed, so sessions
 * already marked done still count against it.
 *
 * @param opts database path, embedder, and optional state path / limit / progress callback.
 * @returns run counters; `dbMissing: true` (all counters 0) when `dbPath` does not exist.
 * @throws any embedder error other than LLMUnreachableError, and any database
 *         error raised outside the per-session write.
 */
export async function reembedCorpus(opts: BackfillOptions): Promise<BackfillReport> {
  const statePath = opts.statePath ?? DEFAULT_STATE_PATH;

  if (!existsSync(opts.dbPath)) {
    return { total: 0, processed: 0, succeeded: 0, failed: 0, skippedAlreadyDone: 0, dbMissing: true };
  }

  const db = new Database(opts.dbPath);
  const done = loadState(statePath);

  let total = 0;
  let succeeded = 0;
  let failed = 0;
  let skipped = 0;
  // Sessions marked done since the state file was last written.
  let unsaved = 0;

  try {
    sqliteVec.load(db);

    // Backfill every session with content; live ingest covers ongoing writes.
    // The state file dedupes across runs so partial completion resumes cleanly.
    const sql =
      "SELECT s.id, s.label, s.summary, s.body FROM sessions s " +
      "WHERE s.body IS NOT NULL OR s.summary IS NOT NULL OR s.label IS NOT NULL " +
      "ORDER BY s.started_at" +
      (opts.limit ? ` LIMIT ${Math.trunc(opts.limit)}` : "");
    const rows = db.prepare<[], SessionRow>(sql).all();
    total = rows.length;

    const selectChunks = db.prepare<[string], { chunk_id: number }>(
      "SELECT chunk_id FROM session_chunk_map WHERE session_id = ?",
    );
    const delMap = db.prepare("DELETE FROM session_chunk_map WHERE session_id = ?");
    const insChunk = db.prepare(
      "INSERT INTO session_embedding_chunks (embedding, session_id, chunk_idx) VALUES (?, ?, ?)",
    );
    const insMap = db.prepare(
      "INSERT INTO session_chunk_map (chunk_id, session_id, chunk_idx) VALUES (?, ?, ?)",
    );

    // Remove every chunk row (and its map entry) currently stored for a session.
    const delChunks = (sessionId: string): void => {
      const existing = selectChunks.all(sessionId);
      if (existing.length === 0) return;
      const placeholders = existing.map(() => "?").join(",");
      const ids = existing.map((r) => r.chunk_id);
      db.prepare(
        `DELETE FROM session_embedding_chunks WHERE chunk_id IN (${placeholders})`,
      ).run(...ids);
      delMap.run(sessionId);
    };

    // Replace a session's chunks atomically: all-or-nothing per session.
    const replaceChunks = db.transaction(
      (sessionId: string, vectors: readonly { idx: number; vec: Float32Array }[]): void => {
        delChunks(sessionId);
        for (const { idx: cidx, vec } of vectors) {
          const blob = Buffer.from(vec.buffer, vec.byteOffset, vec.byteLength);
          // BigInt cast so vec0's aux chunk_idx column receives an INTEGER.
          const info = insChunk.run(blob, sessionId, BigInt(cidx));
          insMap.run(Number(info.lastInsertRowid), sessionId, cidx);
        }
      },
    );

    for (const [i, row] of rows.entries()) {
      const idx = i + 1;
      if (done.has(row.id)) {
        skipped += 1;
        continue;
      }

      const chunks = chunkSessionText({
        label: row.label,
        summary: row.summary,
        body: row.body,
      });
      if (chunks.length === 0) {
        failed += 1;
        opts.onProgress?.(idx, total, row.id, "SKIP (no text)");
        continue;
      }

      // Per-chunk failure tolerance: a chunk the embedder cannot handle is
      // dropped rather than failing the whole session. Partial max-pool
      // coverage beats none.
      const vectors: { idx: number; vec: Float32Array }[] = [];
      let chunkSkipped = 0;
      for (const [c, chunk] of chunks.entries()) {
        const vec = await embedChunkWithRetry(opts.embedder, chunk);
        if (vec === null) chunkSkipped += 1;
        else vectors.push({ idx: c, vec });
      }
      if (vectors.length === 0) {
        failed += 1;
        opts.onProgress?.(idx, total, row.id, `FAIL (embedder, ${chunkSkipped}/${chunks.length} chunks)`);
        continue;
      }

      try {
        replaceChunks(row.id, vectors);
      } catch (e) {
        failed += 1;
        opts.onProgress?.(idx, total, row.id, `FAIL (db): ${(e as Error).message}`);
        continue;
      }

      done.add(row.id);
      succeeded += 1;
      unsaved += 1;
      const status =
        chunkSkipped === 0
          ? `OK (${vectors.length} chunks)`
          : `PARTIAL (${vectors.length}/${chunks.length} chunks, ${chunkSkipped} skipped)`;
      opts.onProgress?.(idx, total, row.id, status);
      if (succeeded % SAVE_EVERY === 0) {
        saveState(statePath, done);
        unsaved = 0;
      }
    }

    saveState(statePath, done);
    unsaved = 0;
  } catch (e) {
    // Abort path: keep what was already stored from being redone next run.
    if (unsaved > 0) {
      try {
        saveState(statePath, done);
      } catch {
        // Keep the original error. Losing this flush only means some
        // sessions are re-embedded, and the per-session replace is idempotent.
      }
    }
    throw e;
  } finally {
    db.close();
  }

  return {
    total,
    processed: succeeded + failed + skipped,
    succeeded,
    failed,
    skippedAlreadyDone: skipped,
    dbMissing: false,
  };
}
|
|
204
|
-
|
|
205
|
-
/**
 * Delete the resume-state file so the next run starts from scratch.
 * A missing file is not an error.
 *
 * Uses the statically imported `rmSync` with `force: true` instead of a
 * runtime `require()` (not defined in ES modules) and a separate existence
 * check (which could race with another process removing the file).
 *
 * @param statePath state file to remove; defaults to DEFAULT_STATE_PATH.
 */
export function clearBackfillState(statePath: string = DEFAULT_STATE_PATH): void {
  rmSync(statePath, { force: true });
}
|
|
@@ -1,135 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* embed-normalize — one-shot migration: L2-normalize every row in
|
|
3
|
-
* session_embeddings. Ports `embed_normalize.py`.
|
|
4
|
-
*
|
|
5
|
-
* vec0 with implicit L2 distance ranks correctly by cosine similarity
|
|
6
|
-
* only when stored vectors are unit-length. New writes (post-this-fix)
|
|
7
|
-
* are normalized at source by OllamaClient.embed; this module brings
|
|
8
|
-
* existing rows to the same invariant.
|
|
9
|
-
*
|
|
10
|
-
* Idempotent: re-running on already-normalized vectors is a no-op
|
|
11
|
-
* within float tolerance (EPS = 1e-3). Each row is rewritten in its
|
|
12
|
-
* own transaction so interrupts are safe.
|
|
13
|
-
*/
|
|
14
|
-
|
|
15
|
-
import { existsSync } from "node:fs";
|
|
16
|
-
import Database from "better-sqlite3";
|
|
17
|
-
import * as sqliteVec from "sqlite-vec";
|
|
18
|
-
|
|
19
|
-
/** A vector whose L2 norm is within this distance of 1 counts as already normalized. */
const EPS = 0.001;

/** Vector dimension assumed when `opts.dim` is not given. */
const DEFAULT_DIM = 768;

/** Number of ids walked per outer loop pass when `opts.batchSize` is not given. */
const DEFAULT_BATCH = 100;
|
|
22
|
-
|
|
23
|
-
/** Inputs for normalizeEmbeddings. */
export interface NormalizeOptions {
  /** SQLite database file. If it does not exist the run returns `dbMissing: true`. */
  readonly dbPath: string;
  /** Expected number of float32 components per stored vector; defaults to DEFAULT_DIM. */
  readonly dim?: number;
  /** Ids handled per outer loop pass; defaults to DEFAULT_BATCH. */
  readonly batchSize?: number;
  /** When true, rows that need rewriting are only counted, never written. */
  readonly dryRun?: boolean;
}
|
|
29
|
-
|
|
30
|
-
/** Counters returned by normalizeEmbeddings. */
export interface NormalizeReport {
  /** Number of ids read from session_embeddings. */
  readonly total: number;
  /** Rows whose norm was already within EPS of 1. */
  readonly alreadyNormalized: number;
  /** Rows rewritten, or, in a dry run, rows that would have been. */
  readonly rewritten: number;
  /** Rows left untouched because every component was zero. */
  readonly zeroVector: number;
  /** True when `dbPath` did not exist and nothing was attempted. */
  readonly dbMissing: boolean;
  /** Echo of the effective `dryRun` setting. */
  readonly dryRun: boolean;
}
|
|
38
|
-
|
|
39
|
-
/** Row shape of the per-id lookup in session_embeddings. */
interface EmbeddingRow {
  session_id: string;
  /** Raw vector bytes; decoded with bytesToFloats. */
  embedding: Buffer;
}
|
|
43
|
-
|
|
44
|
-
/** Row shape of the id-only scan over session_embeddings. */
interface IdRow {
  session_id: string;
}
|
|
47
|
-
|
|
48
|
-
/**
 * Interpret a Buffer as `dim` float32 values in platform byte order.
 *
 * When the Buffer starts on a 4-byte boundary the result is a view over the
 * same memory. Otherwise the bytes are copied first, because a Float32Array
 * view on an unaligned offset throws RangeError (this can happen for a
 * Buffer that is a slice of a larger allocation).
 *
 * @param buf raw vector bytes.
 * @param dim expected number of float32 components.
 * @returns the decoded vector.
 * @throws Error when `buf` is not exactly `dim * 4` bytes long.
 */
function bytesToFloats(buf: Buffer, dim: number): Float32Array {
  if (buf.byteLength !== dim * 4) {
    throw new Error(`expected ${dim * 4} bytes, got ${buf.byteLength}`);
  }
  if (buf.byteOffset % Float32Array.BYTES_PER_ELEMENT !== 0) {
    const aligned = new Uint8Array(buf.byteLength);
    aligned.set(buf);
    return new Float32Array(aligned.buffer, 0, dim);
  }
  return new Float32Array(buf.buffer, buf.byteOffset, dim);
}
|
|
54
|
-
|
|
55
|
-
/**
 * Expose the bytes behind a Float32Array as a Buffer.
 * The Buffer is a view over the same memory, not a copy.
 */
function floatsToBytes(vec: Float32Array): Buffer {
  const { buffer, byteOffset, byteLength } = vec;
  return Buffer.from(buffer, byteOffset, byteLength);
}
|
|
58
|
-
|
|
59
|
-
/**
 * L2-normalize every vector stored in session_embeddings.
 *
 * Each row is read, its norm computed, and then:
 *  - all-zero vectors are counted and left alone;
 *  - vectors whose norm is within EPS of 1 are counted as already normalized;
 *  - everything else is replaced by its unit-length version (or only counted
 *    when `dryRun` is set).
 *
 * The delete + insert that replaces a row runs inside one transaction, so an
 * interrupt cannot drop a row between the two statements.
 *
 * @param opts database path plus optional `dim`, `batchSize`, `dryRun`.
 * @returns run counters; `dbMissing: true` (all counters 0) when `dbPath` does not exist.
 * @throws Error when `batchSize` is not a positive number (0 or a negative
 *         value would never advance the batch loop; NaN would skip every row),
 *         or when a stored vector is not `dim * 4` bytes long.
 */
export function normalizeEmbeddings(opts: NormalizeOptions): NormalizeReport {
  const dim = opts.dim ?? DEFAULT_DIM;
  const batchSize = opts.batchSize ?? DEFAULT_BATCH;
  const dryRun = opts.dryRun ?? false;

  // `!(x > 0)` rejects NaN as well as zero and negatives.
  if (!(batchSize > 0)) {
    throw new Error("normalizeEmbeddings: batchSize must be > 0");
  }

  if (!existsSync(opts.dbPath)) {
    return { total: 0, alreadyNormalized: 0, rewritten: 0, zeroVector: 0, dbMissing: true, dryRun };
  }

  const db = new Database(opts.dbPath);

  let total = 0;
  let alreadyNormalized = 0;
  let rewritten = 0;
  let zeroVector = 0;

  try {
    sqliteVec.load(db);

    const ids = db
      .prepare<[], IdRow>("SELECT session_id FROM session_embeddings")
      .all()
      .map((r) => r.session_id);
    total = ids.length;

    const sel = db.prepare<[string], EmbeddingRow>(
      "SELECT session_id, embedding FROM session_embeddings WHERE session_id = ?",
    );
    const del = db.prepare("DELETE FROM session_embeddings WHERE session_id = ?");
    const ins = db.prepare("INSERT INTO session_embeddings (session_id, embedding) VALUES (?, ?)");

    // Replace one row atomically.
    const rewriteRow = db.transaction((sid: string, blob: Buffer): void => {
      del.run(sid);
      ins.run(sid, blob);
    });

    for (let start = 0; start < total; start += batchSize) {
      const batch = ids.slice(start, start + batchSize);
      for (const sid of batch) {
        const row = sel.get(sid);
        if (!row) continue;

        const vec = bytesToFloats(row.embedding, dim);
        let sumSq = 0;
        for (let i = 0; i < dim; i++) {
          const v = vec[i] ?? 0;
          sumSq += v * v;
        }
        if (sumSq === 0) {
          // Cannot be scaled to unit length; leave as stored.
          zeroVector += 1;
          continue;
        }
        const norm = Math.sqrt(sumSq);
        if (Math.abs(norm - 1) <= EPS) {
          alreadyNormalized += 1;
          continue;
        }
        if (dryRun) {
          rewritten += 1;
          continue;
        }
        const normalized = new Float32Array(dim);
        for (let i = 0; i < dim; i++) {
          normalized[i] = (vec[i] ?? 0) / norm;
        }
        rewriteRow(sid, floatsToBytes(normalized));
        rewritten += 1;
      }
    }
  } finally {
    db.close();
  }

  return {
    total,
    alreadyNormalized,
    rewritten,
    zeroVector,
    dbMissing: false,
    dryRun,
  };
}
|
|
@@ -1,254 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* backfill-facts — one-shot population of the FactStore from the existing
|
|
3
|
-
* session corpus. Phase B.5, see docs/plans/factstore-design.md Section 7.
|
|
4
|
-
*
|
|
5
|
-
* For each session in `sessions` that has no facts yet (and was started
|
|
6
|
-
* before the script's start timestamp, to avoid racing with live ingest),
|
|
7
|
-
* runs the classifier over its body, extracts facts, and writes them via
|
|
8
|
-
* SqliteSessionStore.insertFactsForSession.
|
|
9
|
-
*
|
|
10
|
-
* Resumable via a JSON state file (mirrors core/embedding/embed-backfill).
|
|
11
|
-
* Interrupting and rerunning skips already-processed sessions. State path
|
|
12
|
-
* defaults to ~/.nlm/backfill_facts.state.
|
|
13
|
-
*
|
|
14
|
-
* Layering: depends on the LLMClient + FactStore ports through the
|
|
15
|
-
* SqliteSessionStore + SqliteFactStore composition. Lives under core/ but
|
|
16
|
-
* is invoked from the CLI composition root, like embed-backfill.
|
|
17
|
-
*/
|
|
18
|
-
|
|
19
|
-
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
|
|
20
|
-
import { dirname, join } from "node:path";
|
|
21
|
-
import { homedir } from "node:os";
|
|
22
|
-
import { extractFacts } from "@core/facts/extract-facts.js";
|
|
23
|
-
import type { SqliteFactStore } from "@core/storage/sqlite-fact-store.js";
|
|
24
|
-
import type { SqliteSessionStore } from "@core/storage/sqlite-session-store.js";
|
|
25
|
-
import type { LLMClient } from "@ports/llm-client.js";
|
|
26
|
-
import { LLMUnreachableError } from "@ports/llm-client.js";
|
|
27
|
-
|
|
28
|
-
/** Resume-state file used when the caller does not pass `statePath`. */
const DEFAULT_STATE_PATH = join(homedir(), ".nlm", "backfill_facts.state");

// NOTE(review): presumably the state-flush interval in sessions, mirroring
// embed-backfill — its use is outside this view; confirm.
const SAVE_EVERY = 25;
|
|
30
|
-
|
|
31
|
-
/** Inputs accepted by `backfillFacts`. */
export interface BackfillFactsOptions {
  /** Session store: supplies the raw database handle and performs the fact writes. */
  readonly store: SqliteSessionStore;
  /** Fact store passed through to `store.insertFactsForSession`. */
  readonly factStore: SqliteFactStore;
  /** Client whose `classify` is invoked with each session body. */
  readonly classifier: LLMClient;
  /** Embedding client. Leave unset (or null) to write facts without semantic vectors. */
  readonly embedder?: LLMClient | null;
  /** Location of the resume-state file. Falls back to ~/.nlm/backfill_facts.state. */
  readonly statePath?: string;
  /** Upper bound on sessions handled in this run. Unset means every eligible session. */
  readonly limit?: number;
  /**
   * Manual resume point. Only sessions whose id sorts lexicographically
   * after this value are considered, in addition to the state file's done
   * set. Intended for the case where the state file was lost but the last
   * successful id is known.
   */
  readonly from?: string;
  /** Report what would happen without writing facts or the state file. */
  readonly dryRun?: boolean;
  /**
   * Include sessions that already have facts. Defaults to false, which
   * skips them. Handy for refreshing the corpus after a classifier prompt
   * change.
   */
  readonly reprocess?: boolean;
  /**
   * Progress hook, called once per handled session with its 1-based
   * position, the batch size, the session id, the outcome, and an optional
   * human-readable detail string.
   */
  readonly onProgress?: (
    i: number,
    total: number,
    sessionId: string,
    status: BackfillStatus,
    details?: string,
  ) => void;
}
|
|
62
|
-
|
|
63
|
-
/** Per-session outcome passed to `BackfillFactsOptions.onProgress`. */
export type BackfillStatus =
  /** Facts were written (or, in dry-run, would have been). */
  | "ok"
  /** Session is listed as done in the state file. */
  | "skipped_done"
  /** Session already has facts attributed to it. */
  | "skipped_existing_facts"
  /** Session has no transcript body to classify. */
  | "skipped_no_body"
  /** Classification yielded no extractable facts. */
  | "skipped_low_confidence"
  /** The classifier call threw. */
  | "classify_failed"
  /** Writing the extracted facts threw. */
  | "storage_failed";
|
|
71
|
-
|
|
72
|
-
/** Counters returned by `backfillFacts` once a run ends. */
export interface BackfillFactsReport {
  /** Sessions selected for this run, after state-file filtering and `limit`. */
  readonly total: number;
  /** Sessions whose facts were written (or would have been, in dry-run). */
  readonly processed: number;
  /** Total facts written (or that would have been, in dry-run). */
  readonly factsWritten: number;
  /** Eligible sessions dropped up front because the state file lists them as done. */
  readonly skippedAlreadyDone: number;
  /** Sessions skipped because they already have facts. */
  readonly skippedExistingFacts: number;
  /** Sessions skipped for lack of a transcript body. */
  readonly skippedNoBody: number;
  /** Sessions for which no facts could be extracted from the classification. */
  readonly skippedLowConfidence: number;
  /** Sessions whose classifier call threw. */
  readonly classifyFailures: number;
  /** Sessions whose fact write threw. */
  readonly storageFailures: number;
}
|
|
83
|
-
|
|
84
|
-
/** Row shape returned by the candidate-session query in `backfillFacts`. */
interface CandidateRow {
  /** Session id. */
  id: string;
  /** Session start timestamp as stored in `sessions.started_at`. */
  started_at: string;
  /** Transcript text, or null when the session has none. */
  body: string | null;
}
|
|
89
|
-
|
|
90
|
-
/**
 * Read the set of already-processed session ids from the JSON state file.
 *
 * Deliberately best-effort: a missing, unreadable, or malformed state file
 * yields an empty set so the run starts from scratch instead of aborting.
 *
 * @param path - State file location; the expected shape is `{ "done": string[] }`.
 * @returns The string ids listed under `done`, or an empty set when none can be read.
 */
function loadState(path: string): Set<string> {
  if (!existsSync(path)) return new Set();
  try {
    const parsed: unknown = JSON.parse(readFileSync(path, "utf8"));
    const done = (parsed as { done?: unknown } | null)?.done;
    // Accept arrays only: `new Set("abc")` would iterate a string into
    // single-character "ids" that never match a real session.
    if (!Array.isArray(done)) return new Set();
    // Drop non-string entries so callers can rely on Set<string>.
    return new Set(done.filter((id): id is string => typeof id === "string"));
  } catch {
    // Corrupt or unreadable state is treated the same as no state.
    return new Set();
  }
}
|
|
99
|
-
|
|
100
|
-
/**
 * Persist the done set to the state file as compact JSON (`{"done":[...]}`),
 * creating the parent directory first when it does not exist yet.
 *
 * @param path - Destination state file.
 * @param done - Session ids to record as processed.
 */
function saveState(path: string, done: Set<string>): void {
  const parentDir = dirname(path);
  if (!existsSync(parentDir)) {
    mkdirSync(parentDir, { recursive: true });
  }
  const payload = JSON.stringify({ done: Array.from(done) }, null, 0);
  writeFileSync(path, payload);
}
|
|
105
|
-
|
|
106
|
-
/**
 * One-shot population of the fact store from existing sessions.
 *
 * Selects sessions started before this call began that have a non-empty
 * body, classifies each body, extracts facts from the classification, and
 * writes them through `store.insertFactsForSession`. Progress is recorded in
 * a JSON state file so an interrupted run can be resumed.
 *
 * @param opts - Stores, classifier, and run controls; see `BackfillFactsOptions`.
 * @returns Counters describing what happened to the selected sessions.
 */
export async function backfillFacts(
  opts: BackfillFactsOptions,
): Promise<BackfillFactsReport> {
  const startedAtCutoff = new Date().toISOString();
  const statePath = opts.statePath ?? DEFAULT_STATE_PATH;
  const dryRun = Boolean(opts.dryRun);
  // Dry runs neither read nor write the state file.
  const done = dryRun ? new Set<string>() : loadState(statePath);

  const db = opts.store.rawDb();

  // Eligible sessions: started strictly before this run's cutoff (don't
  // race with live ingest), with a non-empty body (the classifier needs
  // transcript text). When reprocess=false, exclude sessions that already
  // have facts attributed to them.
  const sql = opts.reprocess
    ? `
      SELECT id, started_at, body
      FROM sessions
      WHERE started_at < ?
        AND body IS NOT NULL AND length(body) > 0
        ${opts.from ? "AND id > ?" : ""}
      ORDER BY started_at ASC, id ASC
    `
    : `
      SELECT s.id, s.started_at, s.body
      FROM sessions s
      WHERE s.started_at < ?
        AND s.body IS NOT NULL AND length(s.body) > 0
        AND NOT EXISTS (
          SELECT 1 FROM facts f WHERE f.source_session_id = s.id
        )
        ${opts.from ? "AND s.id > ?" : ""}
      ORDER BY s.started_at ASC, s.id ASC
    `;
  const rows: CandidateRow[] = opts.from
    ? db.prepare<[string, string], CandidateRow>(sql).all(startedAtCutoff, opts.from)
    : db.prepare<[string], CandidateRow>(sql).all(startedAtCutoff);

  // Drop state-file-known done ids BEFORE applying the limit, so `limit: N`
  // means "N actually-processable sessions" rather than burning the batch
  // on no-op skips. The size of the skipped region is still reported as
  // `skippedAlreadyDone`.
  const candidates: CandidateRow[] = [];
  let skippedAlreadyDone = 0;
  for (const row of rows) {
    if (done.has(row.id)) skippedAlreadyDone += 1;
    else candidates.push(row);
  }

  // Clamp at zero: a negative end index would make slice() drop rows from
  // the tail of the list instead of capping the batch.
  const limit = Math.max(0, opts.limit ?? candidates.length);
  const work = candidates.slice(0, limit);
  const total = work.length;

  let processed = 0;
  let factsWritten = 0;
  // Sessions with existing facts are excluded by the query itself (unless
  // reprocessing), so nothing in this function ever counts one.
  const skippedExistingFacts = 0;
  let skippedNoBody = 0;
  let skippedLowConfidence = 0;
  let classifyFailures = 0;
  let storageFailures = 0;

  // Checkpoint on the number of ids marked done since the last save. Keying
  // this on `processed` would misfire on low-confidence skips, which mark a
  // session done without incrementing `processed`.
  let unsavedDone = 0;
  const markDone = (sessionId: string): void => {
    done.add(sessionId);
    if (dryRun) return;
    unsavedDone += 1;
    if (unsavedDone >= SAVE_EVERY) {
      saveState(statePath, done);
      unsavedDone = 0;
    }
  };

  for (const [index, row] of work.entries()) {
    const position = index + 1;
    const sid = row.id;

    // Defensive: the query already filters out empty bodies.
    if (!row.body || row.body.length === 0) {
      skippedNoBody += 1;
      opts.onProgress?.(position, total, sid, "skipped_no_body");
      continue;
    }

    let classification;
    try {
      classification = await opts.classifier.classify(row.body);
    } catch (err) {
      classifyFailures += 1;
      const unreachable = err instanceof LLMUnreachableError;
      const detail = unreachable
        ? "ollama unreachable — stopping run"
        : err instanceof Error
          ? err.message
          : String(err);
      opts.onProgress?.(position, total, sid, "classify_failed", detail);
      // An unreachable LLM is fatal: every subsequent classify would fail
      // too. Stop here so the operator can fix it and resume.
      if (unreachable) break;
      continue;
    }

    const facts = extractFacts(classification, sid, row.started_at);
    if (facts.length === 0) {
      skippedLowConfidence += 1;
      opts.onProgress?.(
        position,
        total,
        sid,
        "skipped_low_confidence",
        `confidence=${classification.confidence}`,
      );
      // Mark done so a re-run doesn't keep paying the classifier cost on
      // sessions the model can't extract anything from.
      markDone(sid);
      continue;
    }

    if (dryRun) {
      factsWritten += facts.length;
      processed += 1;
      opts.onProgress?.(position, total, sid, "ok", `would-write=${facts.length}`);
      continue;
    }

    try {
      await opts.store.insertFactsForSession(
        sid,
        opts.factStore,
        facts,
        opts.embedder ?? null,
      );
    } catch (err) {
      // Not marked done, so a later run retries this session.
      storageFailures += 1;
      const detail = err instanceof Error ? err.message : String(err);
      opts.onProgress?.(position, total, sid, "storage_failed", detail);
      continue;
    }

    factsWritten += facts.length;
    processed += 1;
    opts.onProgress?.(position, total, sid, "ok", `wrote=${facts.length}`);
    markDone(sid);
  }

  // Final flush covers whatever accumulated since the last checkpoint,
  // including the early-exit path above.
  if (!dryRun) saveState(statePath, done);

  return {
    total,
    processed,
    factsWritten,
    skippedAlreadyDone,
    skippedExistingFacts,
    skippedNoBody,
    skippedLowConfidence,
    classifyFailures,
    storageFailures,
  };
}
|