@orellbuehler/paperless-mcp 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,184 @@
1
+ import { z } from "zod";
2
+ import { buildQS, ok, err } from "../paperless/format.js";
3
/** Default cap (100 MB) on the size of a file fetched by the upload_from_url tool. */
const DEFAULT_MAX_DOWNLOAD_SIZE = 100 * 1024 * 1024;

/**
 * Percent-decode a string, returning it unchanged when it is not valid
 * percent-encoded UTF-8 (decodeURIComponent throws a URIError in that case).
 */
function decodeOrKeep(value) {
    try {
        return decodeURIComponent(value);
    }
    catch {
        return value;
    }
}

/**
 * Reduce a candidate filename to a bare name: keep only the last path segment
 * (either slash style) and strip control characters.
 * Returns "" when nothing usable is left.
 */
function toBareFilename(candidate) {
    const lastSegment = candidate.split(/[\\/]/).pop() ?? "";
    const cleaned = lastSegment.replace(/[\u0000-\u001f\u007f]/g, "").trim();
    return cleaned === "." || cleaned === ".." ? "" : cleaned;
}

/**
 * Choose the filename for an upload.
 * Preference order: Content-Disposition `filename*` (percent-decoded),
 * Content-Disposition `filename`, last segment of the URL path, "document".
 */
function pickUploadFilename(parsedUrl, contentDisposition) {
    if (contentDisposition) {
        // Extended form: filename*=charset'lang'percent-encoded-value
        const extended = /(?:^|[;\s])filename\*\s*=\s*[^';\s]*'[^']*'([^;\n]*)/i.exec(contentDisposition);
        if (extended) {
            const name = toBareFilename(decodeOrKeep(extended[1].trim()));
            if (name)
                return name;
        }
        const plain = /(?:^|[;\s])filename\s*=\s*(?:"([^"]*)"|'([^']*)'|([^;\n]*))/i.exec(contentDisposition);
        if (plain) {
            const name = toBareFilename(plain[1] ?? plain[2] ?? plain[3] ?? "");
            if (name)
                return name;
        }
    }
    // Use the parsed pathname so query strings and fragments never leak into the name.
    const fromPath = toBareFilename(decodeOrKeep(parsedUrl.pathname.split("/").pop() ?? ""));
    return fromPath || "document";
}

/** Best-effort release of an unread response body; failures here are not actionable. */
async function discardBody(response) {
    try {
        await response.body?.cancel();
    }
    catch {
        // Nothing useful can be done if cancelling fails.
    }
}

/**
 * Read a fetch Response into a Blob while enforcing a byte limit.
 * The Content-Length header is checked first as a cheap early exit, but the
 * limit is also enforced on the bytes actually received, because the header
 * may be absent (chunked transfer) or wrong.
 */
async function readBodyWithLimit(response, maxBytes) {
    const declared = Number.parseInt(response.headers.get("content-length") || "0", 10);
    if (declared > maxBytes) {
        await discardBody(response);
        throw new Error(`File too large: ${declared} bytes (max ${maxBytes})`);
    }
    if (!response.body) {
        const whole = await response.blob();
        if (whole.size > maxBytes) {
            throw new Error(`File too large: ${whole.size} bytes (max ${maxBytes})`);
        }
        return whole;
    }
    const reader = response.body.getReader();
    const chunks = [];
    let received = 0;
    for (;;) {
        const { done, value } = await reader.read();
        if (done)
            break;
        received += value.byteLength;
        if (received > maxBytes) {
            try {
                await reader.cancel();
            }
            catch {
                // Best effort: the size error below is the one worth reporting.
            }
            throw new Error(`File too large: more than ${maxBytes} bytes received (max ${maxBytes})`);
        }
        chunks.push(value);
    }
    return new Blob(chunks, { type: response.headers.get("content-type") || "" });
}

/**
 * Register the convenience tools: get_document_content, get_documents,
 * get_documents_by_correspondent, monthly_summary and upload_from_url.
 *
 * @param server  Object exposing `tool(name, description, schema, handler)`.
 * @param client  Paperless client; this block calls `fetch`, `fetchAllPages`,
 *                `getDocumentContent` and `upload` on it.
 * @param options Optional settings. `maxDownloadSize` is the byte limit for
 *                upload_from_url downloads (default 100 MB).
 */
export function registerHelperTools(server, client, { maxDownloadSize = DEFAULT_MAX_DOWNLOAD_SIZE } = {}) {
    server.tool("get_document_content", "Get the text content of a document (OCR'd text for PDFs, raw text for text files)", {
        id: z.number().describe("Document ID"),
        max_length: z.number().optional().describe("Truncate content to this many characters"),
    }, async ({ id, max_length }) => {
        try {
            let content = await client.getDocumentContent(id);
            if (!content) {
                // Fall back to the `content` field of the document detail record.
                const doc = await client.fetch(`/api/documents/${id}/`);
                content = doc.content || "";
            }
            if (!content)
                return ok({ id, content: "", note: "No text content available for this document" });
            if (max_length && content.length > max_length) {
                // The right-hand side is evaluated before assignment, so `total` is the untruncated length.
                content =
                    content.slice(0, max_length) +
                        `\n\n[Truncated at ${max_length} characters, total: ${content.length}]`;
            }
            return ok({ id, length: content.length, content });
        }
        catch (e) {
            return err(e);
        }
    });
    server.tool("get_documents", "Get full details (including OCR text content) for one or more documents by ID. Use this after list_documents/search_documents, which return metadata only.", {
        ids: z.array(z.number()).describe("Document IDs to fetch in full"),
        max_content_length: z
            .number()
            .optional()
            .describe("Truncate each document's content to this many characters"),
    }, async ({ ids, max_content_length }) => {
        try {
            const docs = await Promise.all(ids.map((id) => client.fetch(`/api/documents/${id}/`)));
            const result = docs.map((doc) => {
                const needsTruncation = max_content_length &&
                    typeof doc.content === "string" &&
                    doc.content.length > max_content_length;
                if (!needsTruncation)
                    return doc;
                return {
                    ...doc,
                    content: doc.content.slice(0, max_content_length),
                    content_length: doc.content.length,
                    content_truncated: true,
                };
            });
            return ok(result);
        }
        catch (e) {
            return err(e);
        }
    });
    server.tool("get_documents_by_correspondent", "Find a correspondent by name and list their documents", {
        name: z.string().describe("Correspondent name (partial match)"),
        page: z.number().optional(),
        page_size: z.number().optional(),
    }, async ({ name, page, page_size }) => {
        try {
            const corrs = await client.fetch(`/api/correspondents/${buildQS({ name__icontains: name })}`);
            if (corrs.results.length === 0) {
                return ok({ query: name, message: "No correspondents found matching that name" });
            }
            // The first match is used; the remaining matches are reported so the caller can disambiguate.
            const [correspondent, ...others] = corrs.results;
            const docs = await client.fetch(`/api/documents/${buildQS({
                correspondent__id: correspondent.id,
                page: page || 1,
                page_size: page_size || 25,
                ordering: "-created",
            })}`);
            return ok({
                correspondent: {
                    id: correspondent.id,
                    name: correspondent.name,
                    document_count: correspondent.document_count,
                },
                other_matches: others.length > 0
                    ? others.map((c) => ({ id: c.id, name: c.name }))
                    : undefined,
                documents: docs,
            });
        }
        catch (e) {
            return err(e);
        }
    });
    // Only the `added` date is filtered, so the description says "added" rather than "added or created".
    server.tool("monthly_summary", "Get a summary of documents added in a given month", {
        year: z.number().describe("Year (e.g. 2024)"),
        month: z.number().describe("Month (1-12)"),
    }, async ({ year, month }) => {
        try {
            if (!Number.isInteger(year) || !Number.isInteger(month) || month < 1 || month > 12) {
                throw new Error(`Invalid period: year must be an integer and month an integer between 1 and 12 (got year=${year}, month=${month})`);
            }
            const pad2 = (value) => String(value).padStart(2, "0");
            const period = `${year}-${pad2(month)}`;
            // Half-open range [first day of month, first day of next month).
            const [endYear, endMonth] = month === 12 ? [year + 1, 1] : [year, month + 1];
            const allDocs = await client.fetchAllPages(`/api/documents/${buildQS({
                added__date__gte: `${period}-01`,
                added__date__lt: `${endYear}-${pad2(endMonth)}-01`,
                ordering: "-added",
            })}`);
            const byType = {};
            const byCorrespondent = {};
            for (const doc of allDocs) {
                const typeKey = doc.document_type ? String(doc.document_type) : "unclassified";
                byType[typeKey] = (byType[typeKey] || 0) + 1;
                const corrKey = doc.correspondent ? String(doc.correspondent) : "unknown";
                byCorrespondent[corrKey] = (byCorrespondent[corrKey] || 0) + 1;
            }
            return ok({
                period,
                total_added: allDocs.length,
                by_document_type_id: byType,
                by_correspondent_id: byCorrespondent,
                documents: allDocs.map((d) => ({
                    id: d.id,
                    title: d.title,
                    created: d.created,
                    added: d.added,
                    correspondent: d.correspondent,
                    document_type: d.document_type,
                })),
            });
        }
        catch (e) {
            return err(e);
        }
    });
    server.tool("upload_from_url", "Download a file from a URL and upload it to Paperless-ngx", {
        url: z.string().describe("URL to download the file from"),
        title: z.string().optional(),
        correspondent: z.number().optional(),
        document_type: z.number().optional(),
        storage_path: z.number().optional(),
        tags: z.array(z.number()).optional(),
    }, async ({ url, title, correspondent, document_type, storage_path, tags }) => {
        try {
            const parsed = new URL(url);
            if (!["http:", "https:"].includes(parsed.protocol)) {
                throw new Error(`Unsupported URL scheme: ${parsed.protocol}. Only http and https are allowed.`);
            }
            // NOTE(security): the request is made from this process's network position.
            // Redirects are refused, but loopback/private hosts are not filtered here.
            const fileRes = await fetch(url, { redirect: "error" });
            if (!fileRes.ok) {
                await discardBody(fileRes);
                throw new Error(`Failed to download: ${fileRes.status} ${fileRes.statusText}`);
            }
            const blob = await readBodyWithLimit(fileRes, maxDownloadSize);
            const filename = pickUploadFilename(parsed, fileRes.headers.get("content-disposition"));
            const form = new FormData();
            form.append("document", blob, filename);
            if (title !== undefined)
                form.append("title", title);
            if (correspondent !== undefined)
                form.append("correspondent", String(correspondent));
            if (document_type !== undefined)
                form.append("document_type", String(document_type));
            if (storage_path !== undefined)
                form.append("storage_path", String(storage_path));
            for (const tag of tags ?? []) {
                form.append("tags", String(tag));
            }
            const res = await client.upload("/api/documents/post_document/", form);
            if (!res.ok)
                throw new Error(`Upload failed: ${res.status}: ${await res.text()}`);
            // If the response body is not JSON, report what is known about the accepted upload.
            return ok(await res.json().catch(() => ({
                status: "accepted",
                task: res.headers.get("location"),
                filename,
                source_url: url,
            })));
        }
        catch (e) {
            return err(e);
        }
    });
}
@@ -0,0 +1,122 @@
1
+ import { z } from "zod";
2
+ import { createHash } from "node:crypto";
3
+ import { ok, err, buildQS } from "../paperless/format.js";
4
+ import { config, adminClient } from "../config.js";
5
+ import { embed, embedSingle, getProviderInfo } from "../embeddings.js";
6
+ import { upsertDocument, searchSimilar, getIndexedDocIds, getDocumentHash, getStats, removeDocument, } from "../vectordb.js";
7
/**
 * Short fingerprint of a document's text: the first 16 hex characters of its
 * SHA-256 digest. Used to tell whether content changed since the last sync.
 */
function contentHash(text) {
    const hasher = createHash("sha256");
    hasher.update(text);
    const hexDigest = hasher.digest("hex");
    return hexDigest.slice(0, 16);
}
10
/**
 * Register the vector-search tools: semantic_search, sync_embeddings (admin
 * clients only) and embedding_status.
 *
 * @param server Object exposing `tool(name, description, schema, handler)`.
 * @param client Paperless client for the current user; `client.token` is
 *               compared with `config.adminToken` to decide admin status.
 */
export function registerSearchTools(server, client) {
    // A missing admin token must never grant admin rights: without the
    // Boolean() guard, `undefined === undefined` would treat a token-less
    // client as admin when no admin token is configured.
    const isAdminClient = () => Boolean(config.adminToken) && client.token === config.adminToken;
    server.tool("semantic_search", "Search documents by meaning using vector embeddings. Requires sync_embeddings to be run first.", {
        query: z.string().describe("Natural language search query"),
        limit: z.number().optional().describe("Max results (default 10)"),
    }, async ({ query, limit }) => {
        try {
            const limitN = limit || 10;
            const queryEmbedding = await embedSingle(query);
            // Over-fetch from the global index, then drop docs the user's token
            // cannot see, so permission filtering doesn't starve restricted users.
            const hits = searchSimilar(queryEmbedding, limitN * 5);
            if (hits.length === 0)
                return ok({ count: 0, results: [] });
            const ids = hits.map((h) => h.id);
            const resp = await client.fetch(`/api/documents/${buildQS({ id__in: ids, page_size: ids.length })}`);
            const allowed = new Set((resp.results || []).map((d) => d.id));
            const results = hits
                .filter((h) => allowed.has(h.id))
                .slice(0, limitN)
                .map((h) => ({ id: h.id, title: h.title, distance: h.distance }));
            return ok({ count: results.length, results });
        }
        catch (e) {
            return err(e);
        }
    });
    if (isAdminClient()) {
        server.tool("sync_embeddings", "Sync document embeddings to the local vector database. Indexes all documents for semantic search. Only re-embeds documents whose content has changed. Admin only.", {
            force: z.boolean().optional().describe("Force re-embedding of all documents"),
        }, async ({ force }) => {
            try {
                const provider = getProviderInfo();
                const docs = await adminClient.fetchAllPages("/api/documents/");
                const indexedIds = new Set(getIndexedDocIds());
                let indexed = 0;
                let skipped = 0;
                let removed = 0;
                const errors = [];
                const BATCH_SIZE = 20;
                // Drop index entries for documents that are no longer in the listing.
                const currentIds = new Set(docs.map((d) => d.id));
                for (const id of indexedIds) {
                    if (!currentIds.has(id)) {
                        removeDocument(id);
                        removed++;
                    }
                }
                for (let i = 0; i < docs.length; i += BATCH_SIZE) {
                    const batch = docs.slice(i, i + BATCH_SIZE);
                    const toEmbed = [];
                    for (const doc of batch) {
                        try {
                            let content = doc.content;
                            if (!content) {
                                // The listing entry had no content; try the detail record.
                                const detail = await adminClient.fetch(`/api/documents/${doc.id}/`);
                                content = detail.content;
                            }
                            if (!content) {
                                skipped++;
                                continue;
                            }
                            const hash = contentHash(content);
                            if (!force && getDocumentHash(doc.id) === hash) {
                                skipped++;
                                continue;
                            }
                            toEmbed.push({ doc, content, hash });
                        }
                        catch (e) {
                            errors.push(`Doc ${doc.id}: ${e}`);
                        }
                    }
                    if (toEmbed.length === 0)
                        continue;
                    let embeddings;
                    try {
                        // Embedding input is capped at 8000 characters; the hash covers the full content.
                        const texts = toEmbed.map((t) => `${t.doc.title}\n\n${t.content}`.slice(0, 8000));
                        embeddings = await embed(texts);
                        // Refuse a short or malformed response rather than storing
                        // `undefined` vectors against real document IDs.
                        if (!Array.isArray(embeddings) || embeddings.length !== toEmbed.length) {
                            throw new Error(`expected ${toEmbed.length} embeddings, received ${Array.isArray(embeddings) ? embeddings.length : typeof embeddings}`);
                        }
                    }
                    catch (e) {
                        errors.push(`Batch embed error: ${e}`);
                        continue;
                    }
                    // Store each vector independently so one failing write does
                    // not discard the rest of the batch.
                    toEmbed.forEach(({ doc, hash }, j) => {
                        try {
                            upsertDocument(doc.id, doc.title, hash, embeddings[j]);
                            indexed++;
                        }
                        catch (e) {
                            errors.push(`Doc ${doc.id}: ${e}`);
                        }
                    });
                }
                const stats = getStats();
                return ok({
                    provider,
                    indexed,
                    skipped,
                    removed,
                    total_in_db: stats.indexed_documents,
                    db_path: stats.db_path,
                    errors: errors.length > 0 ? errors : undefined,
                });
            }
            catch (e) {
                return err(e);
            }
        });
    }
    server.tool("embedding_status", "Get the status of the local vector embedding database", {}, async () => {
        try {
            const { db_path, ...stats } = getStats();
            const provider = getProviderInfo();
            // The on-disk path is only disclosed to admin clients.
            return ok({ ...stats, ...(isAdminClient() ? { db_path } : {}), ...provider });
        }
        catch (e) {
            return err(e);
        }
    });
}
@@ -0,0 +1,126 @@
1
+ import { z } from "zod";
2
+ import { buildQS, ok, err } from "../paperless/format.js";
3
/**
 * Register the user and group management tools. Each tool forwards to the
 * matching Paperless REST endpoint via `client.fetch`; the response is
 * wrapped with ok(), and any thrown error is wrapped with err().
 *
 * @param server Object exposing `tool(name, description, schema, handler)`.
 * @param client Paperless client exposing `fetch(path, init)`.
 */
export function registerUserTools(server, client) {
    // Run a request thunk and wrap its outcome in the standard ok/err envelope.
    const respond = async (request) => {
        try {
            return ok(await request());
        }
        catch (failure) {
            return err(failure);
        }
    };
    // Send a JSON body with the given HTTP method.
    const sendJson = (path, method, payload) => client.fetch(path, {
        method,
        body: JSON.stringify(payload),
    });
    // Schema shared by the two list tools.
    const listSchema = () => ({
        page: z.number().optional(),
        page_size: z.number().optional(),
        ordering: z.string().optional(),
    });
    // Optional account fields shared by create_user and update_user.
    const accountFields = () => ({
        password: z.string().optional(),
        email: z.string().optional(),
        first_name: z.string().optional(),
        last_name: z.string().optional(),
        is_active: z.boolean().optional(),
        is_staff: z.boolean().optional(),
        is_superuser: z.boolean().optional(),
        groups: z.array(z.number()).optional().describe("Group IDs"),
    });

    // --- Users ---
    server.tool("list_users", "List all users", listSchema(), (params) => respond(() => client.fetch(`/api/users/${buildQS(params)}`)));
    server.tool("get_user", "Get a single user by ID. Reveals the permission-string format used by user_permissions.", { id: z.number() }, ({ id }) => respond(() => client.fetch(`/api/users/${id}/`)));
    server.tool("create_user", "Create a new user (requires Paperless admin privileges)", {
        username: z.string(),
        ...accountFields(),
        user_permissions: z
            .array(z.string())
            .optional()
            .describe("Permission codenames like 'documents.view_document'. Call get_user/get_group to see valid strings."),
    }, (body) => respond(() => sendJson("/api/users/", "POST", body)));
    server.tool("update_user", "Update an existing user (partial update; requires Paperless admin privileges)", {
        id: z.number(),
        username: z.string().optional(),
        ...accountFields(),
        user_permissions: z.array(z.string()).optional(),
    }, ({ id, ...changes }) => respond(() => sendJson(`/api/users/${id}/`, "PATCH", changes)));

    // --- Groups ---
    server.tool("list_groups", "List all groups", listSchema(), (params) => respond(() => client.fetch(`/api/groups/${buildQS(params)}`)));
    server.tool("get_group", "Get a single group by ID. Reveals the permission-string format used by permissions.", { id: z.number() }, ({ id }) => respond(() => client.fetch(`/api/groups/${id}/`)));
    server.tool("create_group", "Create a new group (requires Paperless admin privileges)", {
        name: z.string(),
        permissions: z
            .array(z.string())
            .optional()
            .describe("Permission codenames like 'documents.view_document'"),
    }, (body) => respond(() => sendJson("/api/groups/", "POST", body)));
    server.tool("update_group", "Update an existing group (partial update; requires Paperless admin privileges)", {
        id: z.number(),
        name: z.string().optional(),
        permissions: z.array(z.string()).optional(),
    }, ({ id, ...changes }) => respond(() => sendJson(`/api/groups/${id}/`, "PATCH", changes)));
}
@@ -0,0 +1,141 @@
1
+ import { z } from "zod";
2
+ import { buildQS, ok, err } from "../paperless/format.js";
3
/**
 * Register the multi-step workflow tools: auto_classify_document,
 * process_inbox and bulk_tag_by_content.
 *
 * @param server Object exposing `tool(name, description, schema, handler)`.
 * @param client Paperless client exposing `fetch(path, init)`.
 */
export function registerWorkflowTools(server, client) {
    // Number of search results handled by one bulk_tag_by_content call.
    const BULK_TAG_PAGE_SIZE = 100;
    server.tool("auto_classify_document", "Get AI suggestions for a document and apply them in one step. Returns what was changed.", {
        id: z.number().describe("Document ID"),
        apply_correspondent: z
            .boolean()
            .optional()
            .describe("Apply suggested correspondent (default true)"),
        apply_document_type: z
            .boolean()
            .optional()
            .describe("Apply suggested document type (default true)"),
        apply_tags: z.boolean().optional().describe("Apply suggested tags (default true)"),
        apply_storage_path: z
            .boolean()
            .optional()
            .describe("Apply suggested storage path (default true)"),
    }, async ({ id, apply_correspondent, apply_document_type, apply_tags, apply_storage_path }) => {
        try {
            const [suggestions, doc] = await Promise.all([
                client.fetch(`/api/documents/${id}/suggestions/`),
                client.fetch(`/api/documents/${id}/`),
            ]);
            const updates = {};
            if ((apply_correspondent ?? true) && suggestions.correspondents?.length > 0) {
                updates.correspondent = suggestions.correspondents[0];
            }
            if ((apply_document_type ?? true) && suggestions.document_types?.length > 0) {
                updates.document_type = suggestions.document_types[0];
            }
            if ((apply_tags ?? true) && suggestions.tags?.length > 0) {
                // Suggested tags are added to the existing ones, never replacing them.
                // A document record without a `tags` array is treated as untagged.
                updates.tags = [...new Set([...(doc.tags ?? []), ...suggestions.tags])];
            }
            if ((apply_storage_path ?? true) && suggestions.storage_paths?.length > 0) {
                updates.storage_path = suggestions.storage_paths[0];
            }
            if (Object.keys(updates).length === 0) {
                return ok({ id, message: "No suggestions available", suggestions });
            }
            const updated = await client.fetch(`/api/documents/${id}/`, {
                method: "PATCH",
                body: JSON.stringify(updates),
            });
            return ok({ id, applied: updates, suggestions, updated_document: updated });
        }
        catch (e) {
            return err(e);
        }
    });
    server.tool("process_inbox", "Review all inbox documents and return proposed classifications. Does NOT apply changes — returns a plan for review.", {
        limit: z.number().optional().describe("Max documents to process (default 20)"),
    }, async ({ limit }) => {
        try {
            const maxDocs = limit || 20;
            const data = await client.fetch(`/api/documents/${buildQS({ is_in_inbox: true, page_size: maxDocs })}`);
            const proposals = [];
            // Suggestions are fetched one document at a time; a failure for one
            // document is recorded in its proposal and does not stop the others.
            for (const doc of data.results) {
                try {
                    const suggestions = await client.fetch(`/api/documents/${doc.id}/suggestions/`);
                    proposals.push({
                        id: doc.id,
                        title: doc.title,
                        current: {
                            correspondent: doc.correspondent,
                            document_type: doc.document_type,
                            tags: doc.tags,
                            storage_path: doc.storage_path,
                        },
                        suggested: {
                            correspondent: suggestions.correspondents?.[0] ?? null,
                            document_type: suggestions.document_types?.[0] ?? null,
                            tags: suggestions.tags ?? [],
                            storage_path: suggestions.storage_paths?.[0] ?? null,
                        },
                    });
                }
                catch (e) {
                    proposals.push({ id: doc.id, title: doc.title, error: String(e) });
                }
            }
            return ok({
                total_inbox: data.count,
                processed: proposals.length,
                proposals,
                note: "Use auto_classify_document or update_document to apply changes.",
            });
        }
        catch (e) {
            return err(e);
        }
    });
    server.tool("bulk_tag_by_content", "Search for documents matching a query and add a tag to all results", {
        query: z.string().describe("Search query to find matching documents"),
        tag_id: z.number().describe("Tag ID to add to matching documents"),
        dry_run: z
            .boolean()
            .optional()
            .describe("If true, only return matching documents without tagging"),
        page: z
            .number()
            .optional()
            .describe("1-based page of search results to process, 100 documents per page (default 1)"),
    }, async ({ query, tag_id, dry_run, page }) => {
        try {
            // Each call handles one page of results. Re-running without a page
            // would fetch the same first page again, so the page is an explicit
            // parameter and the note names the next one to request.
            const currentPage = page ?? 1;
            const data = await client.fetch(`/api/documents/${buildQS({ query, page: currentPage, page_size: BULK_TAG_PAGE_SIZE })}`);
            const docIds = data.results.map((d) => d.id);
            if (dry_run) {
                return ok({
                    query,
                    tag_id,
                    page: currentPage,
                    matching_documents: data.results.map((d) => ({ id: d.id, title: d.title })),
                    count: docIds.length,
                    total_matches: data.count,
                    note: "Dry run — no changes made. Set dry_run to false to apply.",
                });
            }
            if (docIds.length === 0) {
                return ok({ query, tag_id, message: "No documents matched the query" });
            }
            const result = await client.fetch("/api/documents/bulk_edit/", {
                method: "POST",
                body: JSON.stringify({
                    documents: docIds,
                    method: "add_tag",
                    parameters: { tag: tag_id },
                }),
            });
            const hasMore = currentPage * BULK_TAG_PAGE_SIZE < data.count;
            return ok({
                query,
                tag_id,
                page: currentPage,
                tagged_count: docIds.length,
                total_matches: data.count,
                result,
                note: hasMore
                    ? `Tagged ${docIds.length} of ${data.count} matches (page ${currentPage}). Call again with page ${currentPage + 1} to tag more.`
                    : undefined,
            });
        }
        catch (e) {
            return err(e);
        }
    });
}
@@ -0,0 +1,109 @@
1
+ import Database from "better-sqlite3";
2
+ import * as sqliteVec from "sqlite-vec";
3
+ import { join } from "node:path";
4
+ import { mkdirSync } from "node:fs";
5
+ import { homedir } from "node:os";
6
+ import { getEmbeddingDimensions } from "./embeddings.js";
7
+ const DB_DIR = process.env.PAPERLESS_MCP_DATA || join(homedir(), ".paperless-mcp"); // Data directory: PAPERLESS_MCP_DATA when set and non-empty, otherwise ".paperless-mcp" under the user's home directory.
8
+ const DB_PATH = join(DB_DIR, "vectors.db"); // SQLite file opened by getDb(); also reported as db_path by getStats().
9
+ let db = null; // Cached connection; stays null until getDb() opens the database, then reused by every later call.
10
/**
 * Return the shared SQLite connection, opening and initialising it on first use.
 *
 * Initialisation creates the data directory, loads the sqlite-vec extension,
 * creates the `documents`, `sync_state` and `vec_documents` tables if needed,
 * and resets the index when the configured embedding dimension differs from
 * the one recorded in `sync_state`.
 *
 * The connection is cached only after initialisation has fully succeeded. If
 * any step throws, the connection is closed and the error is rethrown, so the
 * next call starts again instead of handing out a half-initialised handle.
 *
 * @returns The open database connection.
 * @throws Whatever the directory creation, extension load or SQL statements throw.
 */
export function getDb() {
    if (db)
        return db;
    mkdirSync(DB_DIR, { recursive: true });
    const conn = new Database(DB_PATH);
    try {
        sqliteVec.load(conn);
        const dims = getEmbeddingDimensions();
        conn.exec(`
            CREATE TABLE IF NOT EXISTS documents (
                id INTEGER PRIMARY KEY,
                title TEXT,
                content_hash TEXT,
                updated_at TEXT
            )
        `);
        conn.exec(`
            CREATE TABLE IF NOT EXISTS sync_state (
                key TEXT PRIMARY KEY,
                value TEXT
            )
        `);
        const storedDims = conn
            .prepare("SELECT value FROM sync_state WHERE key = 'embedding_dimensions'")
            .get();
        const dimsChanged = Boolean(storedDims) && Number.parseInt(storedDims.value, 10) !== dims;
        if (dimsChanged) {
            // Vectors of the old width cannot live in a table of the new width:
            // drop the vector table and forget every indexed document.
            conn.exec("DROP TABLE IF EXISTS vec_documents");
            conn.exec("DELETE FROM documents");
            conn.prepare("DELETE FROM sync_state WHERE key = 'embedding_dimensions'").run();
            console.error(`Embedding dimensions changed from ${storedDims.value} to ${dims}. Vector index has been reset. Run sync_embeddings to re-index.`);
        }
        conn.exec(`
            CREATE VIRTUAL TABLE IF NOT EXISTS vec_documents USING vec0(
                embedding float[${dims}]
            )
        `);
        if (!storedDims || dimsChanged) {
            conn.prepare("INSERT OR REPLACE INTO sync_state (key, value) VALUES ('embedding_dimensions', ?)").run(String(dims));
        }
    }
    catch (e) {
        try {
            conn.close();
        }
        catch {
            // The initialisation error is the one worth reporting.
        }
        throw e;
    }
    db = conn;
    return db;
}
50
/**
 * Insert or update one document's metadata row and its embedding vector.
 * Both writes run inside a single transaction; the vector row uses the
 * document ID (as a BigInt) for its rowid.
 *
 * @param id          Document ID.
 * @param title       Document title stored alongside the hash.
 * @param contentHash Fingerprint of the content the embedding was built from.
 * @param embedding   Numeric vector, stored as 32-bit floats.
 */
export function upsertDocument(id, title, contentHash, embedding) {
    const database = getDb();
    const vectorBytes = Buffer.from(new Float32Array(embedding).buffer);
    const write = database.transaction(() => {
        const known = database.prepare("SELECT content_hash FROM documents WHERE id = ?").get(id);
        if (!known) {
            database
                .prepare("INSERT INTO documents (id, title, content_hash, updated_at) VALUES (?, ?, ?, ?)")
                .run(id, title, contentHash, new Date().toISOString());
            database
                .prepare("INSERT INTO vec_documents (rowid, embedding) VALUES (?, ?)")
                .run(BigInt(id), vectorBytes);
            return;
        }
        database
            .prepare("UPDATE documents SET title = ?, content_hash = ?, updated_at = ? WHERE id = ?")
            .run(title, contentHash, new Date().toISOString(), id);
        database
            .prepare("UPDATE vec_documents SET embedding = ? WHERE rowid = ?")
            .run(vectorBytes, BigInt(id));
    });
    write();
}
66
/**
 * Delete a document's metadata row and its vector row in one transaction.
 *
 * @param id Document ID; the vector row is addressed by the same value as a BigInt rowid.
 */
export function removeDocument(id) {
    const database = getDb();
    const purge = database.transaction(() => {
        const dropMetadata = database.prepare("DELETE FROM documents WHERE id = ?");
        dropMetadata.run(id);
        const dropVector = database.prepare("DELETE FROM vec_documents WHERE rowid = ?");
        dropVector.run(BigInt(id));
    });
    purge();
}
74
/**
 * Find the indexed documents closest to a query vector.
 *
 * @param embedding Query vector; converted to 32-bit floats before matching.
 * @param limit     Value bound to the `k` constraint of the vector query (default 10).
 * @returns Rows of `{ id, title, distance }`, ordered by ascending distance.
 */
export function searchSimilar(embedding, limit = 10) {
    const statement = getDb().prepare(`
        SELECT v.rowid as id, d.title, v.distance
        FROM vec_documents v
        JOIN documents d ON d.id = v.rowid
        WHERE v.embedding MATCH ? AND k = ?
        ORDER BY v.distance
    `);
    const queryVector = Buffer.from(new Float32Array(embedding).buffer);
    return statement.all(queryVector, limit);
}
87
/**
 * List the IDs of every document that has a row in the `documents` table.
 *
 * @returns Array of document IDs.
 */
export function getIndexedDocIds() {
    const rows = getDb().prepare("SELECT id FROM documents").all();
    const ids = [];
    for (const row of rows) {
        ids.push(row.id);
    }
    return ids;
}
91
/**
 * Look up the stored content hash for a document.
 *
 * @param id Document ID.
 * @returns The stored hash, or undefined when the document has no row.
 */
export function getDocumentHash(id) {
    const lookup = getDb().prepare("SELECT content_hash FROM documents WHERE id = ?");
    const match = lookup.get(id);
    if (match === undefined || match === null) {
        return undefined;
    }
    return match.content_hash;
}
96
/**
 * Read one value from the `sync_state` key/value table.
 *
 * @param key State key to look up.
 * @returns The stored value, or undefined when the key has no row.
 */
export function getSyncState(key) {
    const lookup = getDb().prepare("SELECT value FROM sync_state WHERE key = ?");
    const entry = lookup.get(key);
    if (entry === undefined || entry === null) {
        return undefined;
    }
    return entry.value;
}
101
/**
 * Store a value in the `sync_state` key/value table, replacing any existing
 * row for the same key.
 *
 * @param key   State key.
 * @param value Value to store.
 */
export function setSyncState(key, value) {
    const writeEntry = getDb().prepare("INSERT OR REPLACE INTO sync_state (key, value) VALUES (?, ?)");
    writeEntry.run(key, value);
}
105
/**
 * Summarise the local index.
 *
 * @returns `{ indexed_documents, db_path }`: the row count of the `documents`
 *          table and the path of the database file.
 */
export function getStats() {
    const { c: indexedDocuments } = getDb()
        .prepare("SELECT COUNT(*) as c FROM documents")
        .get();
    return {
        indexed_documents: indexedDocuments,
        db_path: DB_PATH,
    };
}