membot 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +81 -24
- package/patches/@huggingface%2Ftransformers@4.2.0.patch +137 -0
- package/scripts/apply-transformers-patch.sh +35 -0
- package/src/cli.ts +70 -0
- package/src/commands/check-update.ts +69 -0
- package/src/commands/mcpx.ts +112 -0
- package/src/commands/reindex.ts +53 -0
- package/src/commands/serve.ts +58 -0
- package/src/commands/upgrade.ts +220 -0
- package/src/config/loader.ts +100 -0
- package/src/config/schemas.ts +39 -0
- package/src/constants.ts +42 -0
- package/src/context.ts +80 -0
- package/src/db/blobs.ts +53 -0
- package/src/db/chunks.ts +176 -0
- package/src/db/connection.ts +173 -0
- package/src/db/files.ts +325 -0
- package/src/db/migrations/001-init.ts +63 -0
- package/src/db/migrations/002-fts.ts +12 -0
- package/src/db/migrations.ts +45 -0
- package/src/errors.ts +87 -0
- package/src/ingest/chunker.ts +117 -0
- package/src/ingest/converter/docx.ts +15 -0
- package/src/ingest/converter/html.ts +20 -0
- package/src/ingest/converter/image.ts +71 -0
- package/src/ingest/converter/index.ts +119 -0
- package/src/ingest/converter/llm.ts +66 -0
- package/src/ingest/converter/ocr.ts +51 -0
- package/src/ingest/converter/pdf.ts +38 -0
- package/src/ingest/converter/text.ts +8 -0
- package/src/ingest/describer.ts +72 -0
- package/src/ingest/embedder.ts +83 -0
- package/src/ingest/fetcher.ts +280 -0
- package/src/ingest/ingest.ts +444 -0
- package/src/ingest/local-reader.ts +64 -0
- package/src/ingest/search-text.ts +18 -0
- package/src/ingest/source-resolver.ts +186 -0
- package/src/mcp/instructions.ts +34 -0
- package/src/mcp/server.ts +101 -0
- package/src/mount/commander.ts +174 -0
- package/src/mount/mcp.ts +111 -0
- package/src/mount/zod-to-cli.ts +158 -0
- package/src/operations/add.ts +69 -0
- package/src/operations/diff.ts +105 -0
- package/src/operations/index.ts +38 -0
- package/src/operations/info.ts +95 -0
- package/src/operations/list.ts +87 -0
- package/src/operations/move.ts +83 -0
- package/src/operations/prune.ts +80 -0
- package/src/operations/read.ts +102 -0
- package/src/operations/refresh.ts +72 -0
- package/src/operations/remove.ts +35 -0
- package/src/operations/search.ts +72 -0
- package/src/operations/tree.ts +103 -0
- package/src/operations/types.ts +81 -0
- package/src/operations/versions.ts +78 -0
- package/src/operations/write.ts +77 -0
- package/src/output/formatter.ts +68 -0
- package/src/output/logger.ts +114 -0
- package/src/output/progress.ts +78 -0
- package/src/output/tty.ts +91 -0
- package/src/refresh/runner.ts +296 -0
- package/src/refresh/scheduler.ts +54 -0
- package/src/sdk.ts +27 -0
- package/src/search/hybrid.ts +100 -0
- package/src/search/keyword.ts +62 -0
- package/src/search/semantic.ts +56 -0
- package/src/update/background.ts +73 -0
- package/src/update/cache.ts +40 -0
- package/src/update/checker.ts +117 -0
- package/.claude/settings.local.json +0 -7
- package/CLAUDE.md +0 -139
- package/docs/plan.md +0 -905
package/src/db/chunks.ts
ADDED
@@ -0,0 +1,176 @@
+import { EMBEDDING_DIMENSION } from "../constants.ts";
+import { HelpfulError } from "../errors.ts";
+import type { DbConnection } from "./connection.ts";
+
+export interface ChunkInput {
+  chunk_index: number;
+  chunk_content: string;
+  search_text: string;
+  embedding: number[];
+}
+
+export interface ChunkRow extends ChunkInput {
+  logical_path: string;
+  version_id: string;
+}
+
+/**
+ * Insert all chunks for a given version. Throws `HelpfulError` if any
+ * embedding's dimensionality doesn't match `EMBEDDING_DIMENSION` — DuckDB's
+ * `FLOAT[N]` column would reject the bind, so we surface a clearer error
+ * before reaching the driver.
+ */
+export async function insertChunksForVersion(
+  db: DbConnection,
+  logical_path: string,
+  version_id: string,
+  chunks: ChunkInput[],
+): Promise<void> {
+  for (const c of chunks) {
+    if (c.embedding.length !== EMBEDDING_DIMENSION) {
+      throw new HelpfulError({
+        kind: "internal_error",
+        message: `Chunk embedding dimension ${c.embedding.length} does not match expected ${EMBEDDING_DIMENSION}`,
+        hint: `The embedding model must produce ${EMBEDDING_DIMENSION}-dim vectors. Check config.embedding_model.`,
+      });
+    }
+    await db.queryRun(
+      `INSERT INTO chunks (logical_path, version_id, chunk_index, chunk_content, search_text, embedding)
+       VALUES (?1, CAST(?2 AS TIMESTAMP), ?3, ?4, ?5, ?6::FLOAT[${EMBEDDING_DIMENSION}])`,
+      logical_path,
+      version_id,
+      c.chunk_index,
+      c.chunk_content,
+      c.search_text,
+      c.embedding,
+    );
+  }
+}
+
+/** Drop every chunk for a single version. Called by `deleteVersionAndChunks` during prune. */
+export async function deleteChunksForVersion(
+  db: DbConnection,
+  logical_path: string,
+  version_id: string,
+): Promise<void> {
+  await db.queryRun(
+    `DELETE FROM chunks WHERE logical_path = ?1 AND version_id = CAST(?2 AS TIMESTAMP)`,
+    logical_path,
+    version_id,
+  );
+}
+
+interface RawChunkRow {
+  logical_path: string;
+  version_id: string;
+  chunk_index: number;
+  chunk_content: string;
+  search_text: string;
+  embedding: number[];
+  [key: string]: unknown;
+}
+
+/** All chunks for a single version, ordered by `chunk_index`. */
+export async function listChunksForVersion(
+  db: DbConnection,
+  logical_path: string,
+  version_id: string,
+): Promise<ChunkRow[]> {
+  const rows = await db.queryAll<RawChunkRow>(
+    `SELECT logical_path, CAST(version_id AS VARCHAR) AS version_id,
+            chunk_index, chunk_content, search_text, embedding
+     FROM chunks
+     WHERE logical_path = ?1 AND version_id = CAST(?2 AS TIMESTAMP)
+     ORDER BY chunk_index`,
+    logical_path,
+    version_id,
+  );
+  return rows.map((r) => ({
+    ...r,
+    version_id: String(r.version_id),
+    chunk_index: Number(r.chunk_index),
+  }));
+}
+
+/**
+ * Outcome of a `rebuildFts` call. Distinct kinds let callers (and the
+ * `reindex` CLI) distinguish "empty DB — that's fine" from "extension truly
+ * couldn't load — search will degrade".
+ */
+export type RebuildFtsResult =
+  | { kind: "rebuilt"; chunk_count: number }
+  | { kind: "extension_unavailable"; cause?: string }
+  | { kind: "no_chunks" }
+  | { kind: "rebuild_failed"; cause?: string };
+
+let ftsAttempted = false;
+let ftsAvailable = false;
+let ftsLoadError: string | undefined;
+
+/**
+ * Build/refresh the FTS index over `current_chunks(search_text)`. DuckDB's FTS
+ * is a snapshot — call this after batch inserts/deletes that change the
+ * current_chunks set. The first call attempts to LOAD fts; on failure the
+ * underlying error is captured and returned via `extension_unavailable.cause`
+ * so callers can render it diagnostically.
+ */
+export async function rebuildFts(db: DbConnection): Promise<RebuildFtsResult> {
+  if (!ftsAttempted) {
+    ftsAttempted = true;
+    try {
+      await db.exec(`INSTALL fts`);
+      await db.exec(`LOAD fts`);
+      ftsAvailable = true;
+    } catch (e) {
+      ftsAvailable = false;
+      ftsLoadError = errorMessage(e);
+      return { kind: "extension_unavailable", cause: ftsLoadError };
+    }
+  }
+  if (!ftsAvailable) return { kind: "extension_unavailable", cause: ftsLoadError };
+
+  const sample = await db.queryGet<{ n: number }>(`SELECT COUNT(*) AS n FROM current_chunks`);
+  const chunkCount = sample ? Number(sample.n) : 0;
+  if (chunkCount === 0) return { kind: "no_chunks" };
+
+  try {
+    // FTS over current_chunks (a view) requires materializing into a table.
+    // Drop & recreate the materialized projection on each rebuild.
+    await db.exec(`DROP TABLE IF EXISTS _current_chunks_fts`);
+    await db.exec(
+      `CREATE TABLE _current_chunks_fts AS
+       SELECT (logical_path || '::' || CAST(version_id AS VARCHAR) || '::' || chunk_index) AS row_key,
+              logical_path, CAST(version_id AS VARCHAR) AS version_id, chunk_index, search_text
+       FROM current_chunks`,
+    );
+    await db.exec(
+      `PRAGMA create_fts_index('_current_chunks_fts', 'row_key', 'search_text', stemmer='porter', overwrite=1)`,
+    );
+    await db.exec(`CHECKPOINT`);
+    return { kind: "rebuilt", chunk_count: chunkCount };
+  } catch (e) {
+    return { kind: "rebuild_failed", cause: errorMessage(e) };
+  }
+}
+
+function errorMessage(e: unknown): string {
+  if (e instanceof Error) return e.message;
+  return String(e);
+}
+
+/**
+ * True once the DuckDB `fts` extension has loaded successfully in this
+ * process (the load is first attempted by `rebuildFts`). False until then,
+ * or permanently false on platforms where the extension cannot load — in
+ * which case search degrades to semantic-only without erroring.
+ */
+export function isFtsAvailable(): boolean {
+  return ftsAvailable;
+}
+
+/** Test-only: reset the cached extension-load state so per-test ephemeral DBs start clean. */
+export function _resetFtsState(): void {
+  ftsAttempted = false;
+  ftsAvailable = false;
+  ftsLoadError = undefined;
+}
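To make the intended write path concrete: insert a version's chunks, then rebuild the FTS snapshot and branch on the result kind. The sketch below is editorial, not part of the package. The logical path and chunk text are invented, the embedding is a placeholder zero vector sized to EMBEDDING_DIMENSION, and whether the new rows surface in the `current_chunks` view depends on the 001-init migration, which this diff excerpt doesn't show. `openDb` is defined in connection.ts, next in this diff.

import { EMBEDDING_DIMENSION } from "../constants.ts";
import { insertChunksForVersion, rebuildFts } from "./chunks.ts";
import { openDb } from "./connection.ts";

const db = await openDb(":memory:");
const versionId = new Date().toISOString(); // version ids are timestamps
await insertChunksForVersion(db, "docs/example.md", versionId, [
  {
    chunk_index: 0,
    chunk_content: "First chunk of the document.",
    search_text: "first chunk of the document",
    // Must be exactly EMBEDDING_DIMENSION long or insertChunksForVersion throws.
    embedding: new Array<number>(EMBEDDING_DIMENSION).fill(0),
  },
]);
const result = await rebuildFts(db); // snapshot rebuild, not incremental
if (result.kind === "extension_unavailable") {
  console.warn(`keyword search degraded to semantic-only: ${result.cause}`);
}
await db.close();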
package/src/db/connection.ts
ADDED
@@ -0,0 +1,173 @@
+import {
+  blobValue,
+  DuckDBInstance,
+  type DuckDBConnection as DuckDBNativeConnection,
+  type DuckDBValue,
+  listValue,
+} from "@duckdb/node-api";
+
+import { EMBEDDING_DIMENSION } from "../constants.ts";
+import { asHelpful } from "../errors.ts";
+import { applyMigrations } from "./migrations.ts";
+
+/** Subset of @duckdb/node-api types we feed into / get out of queries. */
+export type SqlScalar = string | number | boolean | bigint | null | Uint8Array;
+export type SqlParam = SqlScalar | number[] | SqlScalar[];
+
+export interface RunResult {
+  changes: number;
+}
+
+/**
+ * Thin async wrapper around a DuckDB connection. Uses ?N placeholders
+ * (translated to $N internally) and returns plain JS objects.
+ */
+export class DbConnection {
+  private readonly conn: DuckDBNativeConnection;
+  private readonly instance: DuckDBInstance | null;
+  readonly path: string;
+  private closed = false;
+
+  constructor(conn: DuckDBNativeConnection, instance: DuckDBInstance | null, path: string) {
+    this.conn = conn;
+    this.instance = instance;
+    this.path = path;
+  }
+
+  /** Run a parameter-less SQL statement (DDL, PRAGMA, batch SQL). */
+  async exec(sql: string): Promise<void> {
+    await this.conn.run(sql);
+  }
+
+  /** Run a query and return the first row, or null. SQL uses `?N` placeholders. */
+  async queryGet<T extends Record<string, unknown> = Record<string, unknown>>(
+    sql: string,
+    ...params: SqlParam[]
+  ): Promise<T | null> {
+    const result = await this.conn.runAndReadAll(translateParams(sql), flattenParams(params) as DuckDBValue[]);
+    const rows = (await result.getRowObjectsJS()) as Record<string, unknown>[];
+    if (!rows[0]) return null;
+    return convertRow(rows[0]) as T;
+  }
+
+  /** Run a query and return all rows. SQL uses `?N` placeholders. */
+  async queryAll<T extends Record<string, unknown> = Record<string, unknown>>(
+    sql: string,
+    ...params: SqlParam[]
+  ): Promise<T[]> {
+    const result = await this.conn.runAndReadAll(translateParams(sql), flattenParams(params) as DuckDBValue[]);
+    const rows = (await result.getRowObjectsJS()) as Record<string, unknown>[];
+    return rows.map(convertRow) as T[];
+  }
+
+  /** Run a mutation (INSERT/UPDATE/DELETE) and report rows changed. SQL uses `?N` placeholders. */
+  async queryRun(sql: string, ...params: SqlParam[]): Promise<RunResult> {
+    const result = await this.conn.run(translateParams(sql), flattenParams(params) as DuckDBValue[]);
+    return { changes: Number(result.rowsChanged) };
+  }
+
+  /** Disconnect and close the owning DuckDB instance. Idempotent; subsequent calls are no-ops. */
+  async close(): Promise<void> {
+    if (this.closed) return;
+    this.closed = true;
+    this.conn.disconnectSync();
+    if (this.instance) {
+      try {
+        this.instance.closeSync();
+      } catch {
+        // best effort
+      }
+    }
+  }
+}
+
+/** Type guard for the JS values DuckDB returns directly without further coercion. */
+function isDuckDBPrimitive(v: unknown): v is string | number | boolean | bigint | null | Uint8Array | Date {
+  if (v === null) return true;
+  const t = typeof v;
+  return (
+    t === "string" ||
+    t === "number" ||
+    t === "boolean" ||
+    t === "bigint" ||
+    v instanceof Uint8Array ||
+    v instanceof Date
+  );
+}
+
+/**
+ * Normalize a value coming out of DuckDB into something the rest of the
+ * codebase expects: `bigint` → `number` (we never have row counts that
+ * exceed Number.MAX_SAFE_INTEGER), `Date` → ISO string (so JSON
+ * serialization is stable), and recurse into arrays/objects.
+ */
+function convertValue(v: unknown): unknown {
+  if (typeof v === "bigint") {
+    // Bigints from row counts and TIMESTAMP fit in Number safely for our use.
+    return Number(v);
+  }
+  if (v instanceof Date) return v.toISOString();
+  if (Array.isArray(v)) return v.map(convertValue);
+  if (v && typeof v === "object" && !(v instanceof Uint8Array)) {
+    const out: Record<string, unknown> = {};
+    for (const [k, val] of Object.entries(v as Record<string, unknown>)) {
+      out[k] = convertValue(val);
+    }
+    return out;
+  }
+  if (isDuckDBPrimitive(v)) return v;
+  return v;
+}
+
+/** Apply `convertValue` to every column of a row. */
+function convertRow(row: Record<string, unknown>): Record<string, unknown> {
+  const out: Record<string, unknown> = {};
+  for (const [k, v] of Object.entries(row)) {
+    out[k] = convertValue(v);
+  }
+  return out;
+}
+
+/** Rewrite our `?N` placeholder convention to DuckDB's native `$N` form. */
+function translateParams(sql: string): string {
+  return sql.replace(/\?(\d+)/g, "$$$1");
+}
+
+/**
+ * Coerce JS values into types that `@duckdb/node-api` knows how to bind.
+ * Plain JS arrays and Uint8Arrays both fall through to ANY in DuckDB's
+ * type-inference path, so we wrap them with the proper value classes here.
+ * Use `?N::FLOAT[384]` / `?N::BLOB` SQL casts at the binding site to land
+ * the value in the right column type.
+ */
+function flattenParams(params: SqlParam[]): unknown[] {
+  return params.map((p) => {
+    if (p instanceof Uint8Array) return blobValue(p);
+    if (Array.isArray(p)) return listValue(p as readonly (string | number | boolean | bigint | null)[]);
+    return p;
+  });
+}
+
+/**
+ * Open a DuckDB-backed connection for the given file path. Runs all migrations
+ * against the connection before returning. Pass `:memory:` for in-process tests.
+ */
+export async function openDb(path: string): Promise<DbConnection> {
+  let instance: DuckDBInstance;
+  try {
+    instance = await DuckDBInstance.create(path);
+  } catch (err) {
+    throw asHelpful(
+      err,
+      `while opening DuckDB at ${path}`,
+      `Check that ${path} is writable and not held open by another process. Delete the file to start fresh.`,
+      "internal_error",
+    );
+  }
+  const conn = await instance.connect();
+  const wrapper = new DbConnection(conn, instance, path);
+  await applyMigrations(wrapper);
+  return wrapper;
+}
+
+export { EMBEDDING_DIMENSION };
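A small usage sketch (editorial, with an invented `scratch` table) showing the wrapper's conventions: `?N` placeholders are rewritten to DuckDB's `$N`, BIGINT results come back as plain numbers, and TIMESTAMP columns come back as ISO strings. Note that `openDb` applies membot's migrations before returning, even for `:memory:` databases.

import { openDb } from "./connection.ts";

const db = await openDb(":memory:");
await db.exec(`CREATE TABLE scratch (id INTEGER, created TIMESTAMP)`);
// ?1/?2 are translated to $1/$2 before binding.
await db.queryRun(
  `INSERT INTO scratch VALUES (?1, CAST(?2 AS TIMESTAMP))`,
  1,
  "2024-01-01T00:00:00.000Z",
);
// COUNT(*) is a BIGINT in DuckDB; convertValue narrows it to number,
// and the TIMESTAMP column is serialized back to an ISO string.
const row = await db.queryGet<{ n: number; created: string }>(
  `SELECT COUNT(*) AS n, MAX(created) AS created FROM scratch`,
);
console.log(row?.n, row?.created); // e.g. 1 "2024-01-01T00:00:00.000Z"
await db.close();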
package/src/db/files.ts
ADDED
@@ -0,0 +1,325 @@
+import type { DbConnection, SqlParam } from "./connection.ts";
+
+export type SourceType = "local" | "remote" | "inline";
+export type FetcherKind = "http" | "mcpx" | "local" | "inline";
+
+export interface FileRow {
+  logical_path: string;
+  version_id: string;
+  tombstone: boolean;
+  source_type: SourceType;
+  source_path: string | null;
+  source_mtime_ms: number | null;
+  source_sha256: string | null;
+  blob_sha256: string | null;
+  content_sha256: string | null;
+  content: string | null;
+  description: string | null;
+  mime_type: string | null;
+  size_bytes: number | null;
+  fetcher: FetcherKind | null;
+  fetcher_server: string | null;
+  fetcher_tool: string | null;
+  fetcher_args: Record<string, unknown> | null;
+  refresh_frequency_sec: number | null;
+  refreshed_at: string | null;
+  last_refresh_status: string | null;
+  change_note: string | null;
+  created_at: string;
+}
+
+export interface NewFileVersion {
+  logical_path: string;
+  version_id?: string;
+  tombstone?: boolean;
+  source_type: SourceType;
+  source_path?: string | null;
+  source_mtime_ms?: number | null;
+  source_sha256?: string | null;
+  blob_sha256?: string | null;
+  content_sha256?: string | null;
+  content?: string | null;
+  description?: string | null;
+  mime_type?: string | null;
+  size_bytes?: number | null;
+  fetcher?: FetcherKind | null;
+  fetcher_server?: string | null;
+  fetcher_tool?: string | null;
+  fetcher_args?: Record<string, unknown> | null;
+  refresh_frequency_sec?: number | null;
+  refreshed_at?: string | null;
+  last_refresh_status?: string | null;
+  change_note?: string | null;
+}
+
+const ROW_COLUMNS = [
+  "logical_path",
+  "version_id",
+  "tombstone",
+  "source_type",
+  "source_path",
+  "source_mtime_ms",
+  "source_sha256",
+  "blob_sha256",
+  "content_sha256",
+  "content",
+  "description",
+  "mime_type",
+  "size_bytes",
+  "fetcher",
+  "fetcher_server",
+  "fetcher_tool",
+  "fetcher_args",
+  "refresh_frequency_sec",
+  "refreshed_at",
+  "last_refresh_status",
+  "change_note",
+  "created_at",
+] as const;
+
+const COLUMN_LIST = ROW_COLUMNS.join(", ");
+
+/**
+ * Insert a new (logical_path, version_id) row. Returns the assigned version_id.
+ * If `version_id` is omitted, uses `now()` at millisecond precision; the caller
+ * should retry with a bumped timestamp on the rare collision case.
+ */
+export async function insertVersion(db: DbConnection, file: NewFileVersion): Promise<string> {
+  const versionId = file.version_id ?? millisIso(Date.now());
+  const fetcherArgsJson = file.fetcher_args ? JSON.stringify(file.fetcher_args) : null;
+
+  await db.queryRun(
+    `INSERT INTO files (
+       logical_path, version_id, tombstone, source_type,
+       source_path, source_mtime_ms, source_sha256, blob_sha256,
+       content_sha256, content, description, mime_type, size_bytes,
+       fetcher, fetcher_server, fetcher_tool, fetcher_args,
+       refresh_frequency_sec, refreshed_at, last_refresh_status, change_note
+     ) VALUES (
+       ?1, CAST(?2 AS TIMESTAMP), ?3, ?4,
+       ?5, ?6, ?7, ?8,
+       ?9, ?10, ?11, ?12, ?13,
+       ?14, ?15, ?16, ?17,
+       ?18, ?19, ?20, ?21
+     )`,
+    file.logical_path,
+    versionId,
+    !!file.tombstone,
+    file.source_type,
+    file.source_path ?? null,
+    file.source_mtime_ms ?? null,
+    file.source_sha256 ?? null,
+    file.blob_sha256 ?? null,
+    file.content_sha256 ?? null,
+    file.content ?? null,
+    file.description ?? null,
+    file.mime_type ?? null,
+    file.size_bytes ?? null,
+    file.fetcher ?? null,
+    file.fetcher_server ?? null,
+    file.fetcher_tool ?? null,
+    fetcherArgsJson,
+    file.refresh_frequency_sec ?? null,
+    file.refreshed_at ?? null,
+    file.last_refresh_status ?? null,
+    file.change_note ?? null,
+  );
+  return versionId;
+}
+
+/** Convert a unix-millis number to an ISO string at ms precision. */
+export function millisIso(ms: number): string {
+  return new Date(ms).toISOString();
+}
+
+interface RawFileRow extends Omit<FileRow, "fetcher_args" | "tombstone"> {
+  fetcher_args: string | null | Record<string, unknown>;
+  tombstone: boolean | number;
+  [key: string]: unknown;
+}
+
+/**
+ * Coerce a raw DuckDB row into a typed `FileRow`. JSON-parses the
+ * `fetcher_args` column (DuckDB returns it as text or a parsed object
+ * depending on driver version) and normalizes `tombstone` to a boolean
+ * (some drivers return 0/1).
+ */
+function toFileRow(row: RawFileRow | null): FileRow | null {
+  if (!row) return null;
+  let parsed: Record<string, unknown> | null = null;
+  if (row.fetcher_args && typeof row.fetcher_args === "string") {
+    try {
+      parsed = JSON.parse(row.fetcher_args);
+    } catch {
+      parsed = null;
+    }
+  } else if (row.fetcher_args && typeof row.fetcher_args === "object") {
+    parsed = row.fetcher_args;
+  }
+  return {
+    ...row,
+    fetcher_args: parsed,
+    tombstone: !!row.tombstone,
+  };
+}
+
+/** Fetch the current (latest non-tombstoned) row for a logical_path, or null. */
+export async function getCurrent(db: DbConnection, logicalPath: string): Promise<FileRow | null> {
+  const row = await db.queryGet<RawFileRow>(
+    `SELECT ${COLUMN_LIST} FROM current_files WHERE logical_path = ?1`,
+    logicalPath,
+  );
+  return toFileRow(row);
+}
+
+/** Fetch the exact (logical_path, version_id) row, or null if it doesn't exist. */
+export async function getVersion(db: DbConnection, logicalPath: string, versionId: string): Promise<FileRow | null> {
+  const row = await db.queryGet<RawFileRow>(
+    `SELECT ${COLUMN_LIST} FROM files
+     WHERE logical_path = ?1 AND version_id = CAST(?2 AS TIMESTAMP)`,
+    logicalPath,
+    versionId,
+  );
+  return toFileRow(row);
+}
+
+/** All versions for a logical_path (including tombstones), newest first. */
+export async function listVersions(db: DbConnection, logicalPath: string): Promise<FileRow[]> {
+  const rows = await db.queryAll<RawFileRow>(
+    `SELECT ${COLUMN_LIST} FROM files WHERE logical_path = ?1 ORDER BY version_id DESC`,
+    logicalPath,
+  );
+  return rows.map((r) => toFileRow(r) as FileRow);
+}
+
+export interface ListCurrentOptions {
+  prefix?: string;
+  limit?: number;
+  offset?: number;
+}
+
+/**
+ * List current (latest, non-tombstoned) rows ordered by logical_path.
+ * `prefix` filters to paths starting with the given string. `limit` defaults
+ * to 1000 and `offset` to 0; together they support cursor-style pagination.
+ */
+export async function listCurrent(db: DbConnection, options: ListCurrentOptions = {}): Promise<FileRow[]> {
+  const where: string[] = [];
+  const params: SqlParam[] = [];
+  if (options.prefix) {
+    where.push(`logical_path LIKE ?${params.length + 1}`);
+    params.push(`${options.prefix}%`);
+  }
+  const limit = options.limit ?? 1000;
+  const offset = options.offset ?? 0;
+  const sql = `SELECT ${COLUMN_LIST} FROM current_files
+    ${where.length ? `WHERE ${where.join(" AND ")}` : ""}
+    ORDER BY logical_path
+    LIMIT ${Number(limit)} OFFSET ${Number(offset)}`;
+  const rows = await db.queryAll<RawFileRow>(sql, ...params);
+  return rows.map((r) => toFileRow(r) as FileRow);
+}
+
+/** Just the logical_paths of every current row, alphabetized. Used by `tree` and discovery flows. */
+export async function listAllCurrentPaths(db: DbConnection): Promise<string[]> {
+  const rows = await db.queryAll<{ logical_path: string }>(
+    `SELECT logical_path FROM current_files ORDER BY logical_path`,
+  );
+  return rows.map((r) => r.logical_path);
+}
+
+/** Insert a tombstone version for the given path. */
+export async function tombstone(db: DbConnection, logicalPath: string, changeNote?: string): Promise<string> {
+  return insertVersion(db, {
+    logical_path: logicalPath,
+    source_type: "inline",
+    tombstone: true,
+    content: "",
+    change_note: changeNote ?? null,
+  });
+}
+
+/** Update only the mutable status fields on the latest row for a logical_path. */
+export async function updateRefreshStatus(
+  db: DbConnection,
+  logicalPath: string,
+  versionId: string,
+  status: { refreshed_at: string; last_refresh_status: string },
+): Promise<void> {
+  await db.queryRun(
+    `UPDATE files
+     SET refreshed_at = CAST(?1 AS TIMESTAMP),
+         last_refresh_status = ?2
+     WHERE logical_path = ?3 AND version_id = CAST(?4 AS TIMESTAMP)`,
+    status.refreshed_at,
+    status.last_refresh_status,
+    logicalPath,
+    versionId,
+  );
+}
+
+export interface DueRefreshRow {
+  logical_path: string;
+  version_id: string;
+  refresh_frequency_sec: number;
+  refreshed_at: string | null;
+  [key: string]: unknown;
+}
+
+/** Rows whose refresh frequency has elapsed (current versions only). */
+export async function listDueRefreshes(db: DbConnection): Promise<DueRefreshRow[]> {
+  const rows = await db.queryAll<DueRefreshRow>(
+    `SELECT logical_path, CAST(version_id AS VARCHAR) AS version_id,
+            refresh_frequency_sec,
+            CAST(refreshed_at AS VARCHAR) AS refreshed_at
+     FROM current_files
+     WHERE refresh_frequency_sec IS NOT NULL
+       AND (refreshed_at IS NULL
+         OR CURRENT_TIMESTAMP > refreshed_at + (refresh_frequency_sec * INTERVAL '1 second'))`,
+  );
+  return rows.map((r) => ({
+    logical_path: r.logical_path,
+    version_id: String(r.version_id),
+    refresh_frequency_sec: Number(r.refresh_frequency_sec),
+    refreshed_at: r.refreshed_at ? String(r.refreshed_at) : null,
+  }));
+}
+
+/**
+ * Delete non-current versions whose version_id is older than `beforeIso`.
+ * Returns the count of removed file rows. Tombstones for paths with no
+ * newer version are preserved (they are themselves the current row).
+ */
+export async function pruneOldVersions(db: DbConnection, beforeIso: string): Promise<{ removed: number }> {
+  // Versions older than cutoff that are NOT the current version for their path.
+  const result = await db.queryRun(
+    `DELETE FROM files
+     WHERE version_id < CAST(?1 AS TIMESTAMP)
+       AND (logical_path, version_id) NOT IN (
+         SELECT logical_path, MAX(version_id) FROM files GROUP BY logical_path
+       )`,
+    beforeIso,
+  );
+  return { removed: result.changes };
+}
+
+/**
+ * Hard-delete a single (logical_path, version_id) row and its chunks.
+ * Bypasses the append-only versioning model — used by `prune` to reclaim
+ * space. Prefer `tombstone()` for user-driven deletes so history is
+ * preserved.
+ */
+export async function deleteVersionAndChunks(db: DbConnection, logicalPath: string, versionId: string): Promise<void> {
+  await db.queryRun(
+    `DELETE FROM chunks WHERE logical_path = ?1 AND version_id = CAST(?2 AS TIMESTAMP)`,
+    logicalPath,
+    versionId,
+  );
+  await db.queryRun(
+    `DELETE FROM files WHERE logical_path = ?1 AND version_id = CAST(?2 AS TIMESTAMP)`,
+    logicalPath,
+    versionId,
+  );
+}
+
+export { COLUMN_LIST as FILE_COLUMNS };
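Read together, these helpers define an append-only versioning model: every write is a new (logical_path, version_id) row, deletes are tombstone rows, and prune hard-deletes superseded history. Below is a hedged, editorial sketch of that lifecycle, with an invented path and explicit version ids to sidestep the same-millisecond collision the `insertVersion` comment mentions; it assumes the `current_files` view from the migrations (not shown in this excerpt) exposes the latest non-tombstoned row per path.

import { openDb } from "./connection.ts";
import { getCurrent, insertVersion, listVersions, millisIso, pruneOldVersions, tombstone } from "./files.ts";

const db = await openDb(":memory:");
// Two versions of the same logical path; the newer one becomes "current".
await insertVersion(db, {
  logical_path: "notes/todo.md",
  version_id: millisIso(Date.now() - 120_000),
  source_type: "inline",
  content: "first draft",
});
await insertVersion(db, {
  logical_path: "notes/todo.md",
  version_id: millisIso(Date.now() - 60_000),
  source_type: "inline",
  content: "second draft",
});
const current = await getCurrent(db, "notes/todo.md");
console.log(current?.content); // "second draft"
// Deleting is just another version; listVersions still shows the drafts.
await tombstone(db, "notes/todo.md", "user removed");
console.log((await listVersions(db, "notes/todo.md")).length); // 3
// Reclaim space: removes the two superseded drafts but keeps the current
// row (here the tombstone), per the NOT IN (MAX(version_id)) guard.
await pruneOldVersions(db, millisIso(Date.now()));
await db.close();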