@cerefox/memory 0.7.2 → 0.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. package/dist/bin/cerefox.js +1357 -361
  2. package/dist/frontend/assets/{index-BzAPcCXA.js → index-CAp2_lFX.js} +2 -2
  3. package/dist/frontend/assets/index-CAp2_lFX.js.map +1 -0
  4. package/dist/frontend/index.html +1 -1
  5. package/dist/server-assets/_shared/ef-meta/index.ts +97 -0
  6. package/dist/server-assets/_shared/embeddings/index.ts +175 -0
  7. package/dist/server-assets/_shared/mcp-tools/_chunker.ts +187 -0
  8. package/dist/server-assets/_shared/mcp-tools/_projects.ts +121 -0
  9. package/dist/server-assets/_shared/mcp-tools/_utils.ts +73 -0
  10. package/dist/server-assets/_shared/mcp-tools/audit-log.ts +95 -0
  11. package/dist/server-assets/_shared/mcp-tools/get-document.ts +73 -0
  12. package/dist/server-assets/_shared/mcp-tools/get-help-content.ts +26 -0
  13. package/dist/server-assets/_shared/mcp-tools/get-help.ts +90 -0
  14. package/dist/server-assets/_shared/mcp-tools/index.ts +67 -0
  15. package/dist/server-assets/_shared/mcp-tools/ingest.ts +315 -0
  16. package/dist/server-assets/_shared/mcp-tools/list-metadata-keys.ts +55 -0
  17. package/dist/server-assets/_shared/mcp-tools/list-projects.ts +59 -0
  18. package/dist/server-assets/_shared/mcp-tools/list-versions.ts +72 -0
  19. package/dist/server-assets/_shared/mcp-tools/metadata-search.ts +154 -0
  20. package/dist/server-assets/_shared/mcp-tools/search.ts +193 -0
  21. package/dist/server-assets/_shared/mcp-tools/set-document-projects.ts +163 -0
  22. package/dist/server-assets/_shared/mcp-tools/types.ts +92 -0
  23. package/dist/server-assets/db/migrations/0003_add_document_versions.sql +91 -0
  24. package/dist/server-assets/db/migrations/0004_add_audit_log_review_status_archived.sql +71 -0
  25. package/dist/server-assets/db/migrations/0005_metadata_search.sql +628 -0
  26. package/dist/server-assets/db/migrations/0006_usage_log.sql +255 -0
  27. package/dist/server-assets/db/migrations/0007_usage_log_requestor.sql +178 -0
  28. package/dist/server-assets/db/migrations/0008_soft_delete.sql +130 -0
  29. package/dist/server-assets/db/migrations/0009_audit_log_restore_operation.sql +20 -0
  30. package/dist/server-assets/db/migrations/0010_requestor_enforcement_config.sql +12 -0
  31. package/dist/server-assets/db/migrations/0011_title_boosting.sql +48 -0
  32. package/dist/server-assets/db/rpcs.sql +1723 -0
  33. package/dist/server-assets/db/schema.sql +380 -0
  34. package/dist/server-assets/supabase/functions/cerefox-get-audit-log/index.ts +117 -0
  35. package/dist/server-assets/supabase/functions/cerefox-get-document/index.ts +138 -0
  36. package/dist/server-assets/supabase/functions/cerefox-ingest/index.ts +819 -0
  37. package/dist/server-assets/supabase/functions/cerefox-list-projects/index.ts +96 -0
  38. package/dist/server-assets/supabase/functions/cerefox-list-versions/index.ts +113 -0
  39. package/dist/server-assets/supabase/functions/cerefox-mcp/index.ts +294 -0
  40. package/dist/server-assets/supabase/functions/cerefox-mcp/shared.ts +42 -0
  41. package/dist/server-assets/supabase/functions/cerefox-metadata/index.ts +99 -0
  42. package/dist/server-assets/supabase/functions/cerefox-metadata-search/index.ts +146 -0
  43. package/dist/server-assets/supabase/functions/cerefox-search/index.ts +382 -0
  44. package/docs/guides/connect-agents.md +78 -3
  45. package/docs/guides/migration-v0.5.md +50 -0
  46. package/docs/guides/quickstart.md +6 -2
  47. package/package.json +3 -2
  48. package/dist/frontend/assets/index-BzAPcCXA.js.map +0 -1
@@ -0,0 +1,1723 @@
1
+ -- Cerefox Search & Retrieval RPCs
2
+ -- These functions are exposed as MCP tools via Supabase.
3
+ -- Run via: python scripts/db_deploy.py (after schema.sql)
4
+ --
5
+ -- All RPCs are SECURITY DEFINER so they can be called safely via the
6
+ -- Supabase anon/service key without exposing the underlying tables directly.
7
+
8
+ -- ── Return-type change drops ──────────────────────────────────────────────────
9
+ -- When CREATE OR REPLACE cannot be used because the return type changes,
10
+ -- we drop the old function first. These drops are safe to re-run.
11
+
12
-- Superseded signatures: one DROP per distinct overload, grouped by function.
-- DROP FUNCTION IF EXISTS is idempotent, so each signature only needs to be
-- listed once even though several iterations required it to be dropped:
--   * p_min_score added
--   * doc_project_id UUID (singular) replaced by doc_project_ids UUID[] (M2M)
--   * doc_updated_at / is_partial added to cerefox_search_docs
--   * Iteration 12B: version_count added to every search result shape
--   * Iteration 13:  p_metadata_filter JSONB DEFAULT NULL added (existing
--                    callers are unaffected because the parameter is defaulted)
--   * Iteration 16B: doc_project_names TEXT[] added to every RETURNS TABLE shape

-- cerefox_semantic_search
-- 4-param: pre p_min_score.
DROP FUNCTION IF EXISTS cerefox_semantic_search(VECTOR(768), INT, BOOLEAN, UUID);
-- 5-param: pre version_count (12B) and pre p_metadata_filter (13).
DROP FUNCTION IF EXISTS cerefox_semantic_search(VECTOR(768), INT, BOOLEAN, UUID, FLOAT);
-- 6-param: pre doc_project_names (16B).
DROP FUNCTION IF EXISTS cerefox_semantic_search(VECTOR(768), INT, BOOLEAN, UUID, FLOAT, JSONB);

-- cerefox_hybrid_search
-- 6-param: pre p_min_score, pre M2M join (used d.project_id column).
DROP FUNCTION IF EXISTS cerefox_hybrid_search(TEXT, VECTOR(768), INT, FLOAT, BOOLEAN, UUID);
-- 7-param: returned doc_project_id UUID (singular); also pre version_count (12B)
-- and pre p_metadata_filter (13).
DROP FUNCTION IF EXISTS cerefox_hybrid_search(TEXT, VECTOR(768), INT, FLOAT, BOOLEAN, UUID, FLOAT);
-- 8-param: pre doc_project_names (16B).
DROP FUNCTION IF EXISTS cerefox_hybrid_search(TEXT, VECTOR(768), INT, FLOAT, BOOLEAN, UUID, FLOAT, JSONB);

-- cerefox_search_docs
-- 5-param: pre p_min_score.
DROP FUNCTION IF EXISTS cerefox_search_docs(TEXT, VECTOR(768), INT, FLOAT, UUID);
-- 6-param: returned doc_project_id UUID (singular) or lacked doc_updated_at;
-- also pre version_count (12B).
DROP FUNCTION IF EXISTS cerefox_search_docs(TEXT, VECTOR(768), INT, FLOAT, UUID, FLOAT);
-- 8-param: pre is_partial and pre p_metadata_filter (13).
DROP FUNCTION IF EXISTS cerefox_search_docs(TEXT, VECTOR(768), INT, FLOAT, UUID, FLOAT, INT, INT);
-- 9-param: pre doc_project_names (16B).
DROP FUNCTION IF EXISTS cerefox_search_docs(TEXT, VECTOR(768), INT, FLOAT, UUID, FLOAT, INT, INT, JSONB);

-- cerefox_fts_search
-- 3-param: pre version_count (12B) and pre p_metadata_filter (13).
DROP FUNCTION IF EXISTS cerefox_fts_search(TEXT, INT, UUID);
-- 4-param: pre doc_project_names (16B).
DROP FUNCTION IF EXISTS cerefox_fts_search(TEXT, INT, UUID, JSONB);

-- cerefox_reconstruct_doc / cerefox_get_document
-- Argument lists are unchanged, but the return types gained version_count (12B)
-- and doc_project_names (16B), which CREATE OR REPLACE cannot apply in place.
DROP FUNCTION IF EXISTS cerefox_reconstruct_doc(UUID);
DROP FUNCTION IF EXISTS cerefox_get_document(UUID, UUID);
58
+
59
+ -- ── Shared return type note ────────────────────────────────────────────────────
60
+ -- All chunk-level search RPCs return the same shape for consistency:
61
+ -- chunk_id, document_id, chunk_index, title, content, heading_path,
62
+ -- heading_level, score, doc_title, doc_source, doc_project_ids,
63
+ -- doc_project_names, doc_metadata, version_count
64
+ -- Note: doc_project_ids is UUID[] (array) — a document can belong to many projects.
65
+ -- Note: doc_project_names is TEXT[] (array) — human-readable project names.
66
+ -- Note: version_count is INT — number of archived versions for the parent document.
67
+ -- Agents and the web UI use this to know when previous versions are available
68
+ -- for retrieval. 0 means the current content has never been overwritten.
69
+
70
+ -- ── Hybrid Search ─────────────────────────────────────────────────────────────
71
+ -- Combines full-text search (FTS) and vector similarity with a configurable
72
+ -- alpha weight. alpha=1.0 means pure semantic; alpha=0.0 means pure FTS.
73
+ --
74
+ -- V1 approach: run both searches (top N*5 candidates each), FULL OUTER JOIN on
75
+ -- chunk ID, then combine scores with weighted average. Simple and fast for
76
+ -- typical knowledge base sizes.
77
+
78
-- Chunk-level hybrid search: merges an FTS candidate list and a vector
-- candidate list and ranks by  p_alpha * vec_score + (1 - p_alpha) * fts_score.
--
-- Parameters:
--   p_query_text      : query string, parsed with plainto_tsquery('english', ...)
--   p_query_embedding : 768-dim query embedding
--   p_match_count     : max rows returned (each branch fetches 5x this many)
--   p_alpha           : weight of the vector score (1.0 = pure semantic)
--   p_use_upgrade     : prefer embedding_upgrade when a chunk has one,
--                       otherwise fall back to embedding_primary
--   p_project_id      : optional project filter via cerefox_document_projects
--   p_min_score       : minimum cosine similarity for vector-only hits
--   p_metadata_filter : optional JSONB containment filter on document metadata
--
-- Declared STABLE: the function only reads, and it is called from the STABLE
-- cerefox_search_docs, so it should share that caller's snapshot.
CREATE OR REPLACE FUNCTION cerefox_hybrid_search(
    p_query_text      TEXT,
    p_query_embedding VECTOR(768),
    p_match_count     INT     DEFAULT 10,
    p_alpha           FLOAT   DEFAULT 0.7,
    p_use_upgrade     BOOLEAN DEFAULT FALSE,
    p_project_id      UUID    DEFAULT NULL,
    p_min_score       FLOAT   DEFAULT 0.0,
    p_metadata_filter JSONB   DEFAULT NULL
)
RETURNS TABLE (
    chunk_id          UUID,
    document_id       UUID,
    chunk_index       INT,
    title             TEXT,
    content           TEXT,
    heading_path      TEXT[],
    heading_level     INT,
    score             FLOAT,
    doc_title         TEXT,
    doc_source        TEXT,
    doc_project_ids   UUID[],
    doc_project_names TEXT[],
    doc_metadata      JSONB,
    version_count     INT
)
LANGUAGE plpgsql
SECURITY DEFINER
STABLE
SET search_path = public, pg_catalog
AS $$
DECLARE
    -- plainto_tsquery: ANDs all terms, treats every token as a literal word.
    -- We deliberately avoid websearch_to_tsquery here because it interprets `-` as
    -- a negation operator, which traps natural queries against dashed titles
    -- (e.g. `Job Hunting - Opportunity Index`). Agent queries don't use the
    -- websearch operators (phrase, OR, NOT); semantic ranking is the soft-match
    -- layer for "broadly related". If operator support is ever needed, gate it
    -- behind an opt-in flag rather than changing the default.
    query_fts       tsquery := plainto_tsquery('english', p_query_text);
    -- Per-branch candidate pool size (5x the requested result count).
    candidate_count INT := p_match_count * 5;
BEGIN
    RETURN QUERY
    WITH
    -- Keyword candidates: current (version_id IS NULL) chunks of non-deleted
    -- documents that match the tsquery.
    fts_results AS (
        SELECT
            c.id,
            ts_rank_cd(c.fts, query_fts)::FLOAT AS fts_score
        FROM cerefox_chunks c
        JOIN cerefox_documents d ON c.document_id = d.id
        WHERE c.version_id IS NULL
          AND d.deleted_at IS NULL
          AND c.fts @@ query_fts
          AND (p_project_id IS NULL OR EXISTS (
                SELECT 1 FROM cerefox_document_projects dp
                WHERE dp.document_id = d.id AND dp.project_id = p_project_id
          ))
          AND (p_metadata_filter IS NULL OR d.metadata @> p_metadata_filter)
        -- c.id breaks rank ties so the candidate cut-off is deterministic.
        ORDER BY fts_score DESC, c.id
        LIMIT candidate_count
    ),
    -- Vector candidates: nearest chunks by cosine distance.
    vec_results AS (
        SELECT
            c.id,
            CASE
                WHEN p_use_upgrade AND c.embedding_upgrade IS NOT NULL
                    THEN (1.0 - (c.embedding_upgrade <=> p_query_embedding))::FLOAT
                ELSE
                    (1.0 - (c.embedding_primary <=> p_query_embedding))::FLOAT
            END AS vec_score
        FROM cerefox_chunks c
        JOIN cerefox_documents d ON c.document_id = d.id
        WHERE c.version_id IS NULL
          AND d.deleted_at IS NULL
          AND (p_project_id IS NULL OR EXISTS (
                SELECT 1 FROM cerefox_document_projects dp
                WHERE dp.document_id = d.id AND dp.project_id = p_project_id
          ))
          AND (p_metadata_filter IS NULL OR d.metadata @> p_metadata_filter)
        -- Deliberately ordered by the bare distance expression only: adding a
        -- second sort key here could stop the planner from using a vector index.
        ORDER BY
            CASE
                WHEN p_use_upgrade AND c.embedding_upgrade IS NOT NULL
                    THEN c.embedding_upgrade <=> p_query_embedding
                ELSE c.embedding_primary <=> p_query_embedding
            END
        LIMIT candidate_count
    ),
    combined AS (
        SELECT
            COALESCE(f.id, v.id) AS id,
            (  p_alpha         * COALESCE(v.vec_score, 0.0) +
              (1.0 - p_alpha)  * COALESCE(f.fts_score, 0.0)
            ) AS score,
            COALESCE(v.vec_score, 0.0) AS vec_score,
            -- TRUE when the chunk matched the @@ FTS operator.
            -- We use this flag rather than vec_score to decide whether a chunk
            -- passes the threshold, because in small corpora every chunk appears
            -- in vec_results (LIMIT candidate_count covers all rows), so
            -- vec_score is never NULL even for FTS-only matches.
            f.id IS NOT NULL AS has_fts_match
        FROM fts_results f
        FULL OUTER JOIN vec_results v ON f.id = v.id
    )
    SELECT
        c.id            AS chunk_id,
        c.document_id,
        c.chunk_index,
        c.title,
        c.content,
        c.heading_path,
        c.heading_level,
        cm.score,
        d.title         AS doc_title,
        d.source        AS doc_source,
        ARRAY(SELECT dp.project_id FROM cerefox_document_projects dp
              WHERE dp.document_id = d.id) AS doc_project_ids,
        ARRAY(SELECT p.name FROM cerefox_projects p
              JOIN cerefox_document_projects dp ON p.id = dp.project_id
              WHERE dp.document_id = d.id) AS doc_project_names,
        d.metadata      AS doc_metadata,
        (SELECT COUNT(*)::INT FROM cerefox_document_versions dv
         WHERE dv.document_id = d.id) AS version_count
    FROM combined cm
    JOIN cerefox_chunks c ON c.id = cm.id
    JOIN cerefox_documents d ON c.document_id = d.id
    -- FTS matches pass through unconditionally: the @@ operator is a hard gate
    -- and guarantees the query terms appear in the chunk.
    -- Vector-only results (no FTS match) are filtered by the cosine threshold.
    WHERE cm.has_fts_match OR cm.vec_score >= p_min_score
    -- c.id makes the final LIMIT deterministic when combined scores tie.
    ORDER BY cm.score DESC, c.id
    LIMIT p_match_count;
END;
$$;
210
+
211
+ -- ── FTS-Only Search ───────────────────────────────────────────────────────────
212
+ -- Pure keyword / exact-match search. Best for names, dates, tags.
213
+
214
-- Keyword-only chunk search ranked by ts_rank_cd.
--
-- Parameters:
--   p_query_text      : query string, parsed with plainto_tsquery('english', ...)
--   p_match_count     : max rows returned
--   p_project_id      : optional project filter via cerefox_document_projects
--   p_metadata_filter : optional JSONB containment filter on document metadata
--
-- Declared STABLE because the function only reads.
CREATE OR REPLACE FUNCTION cerefox_fts_search(
    p_query_text      TEXT,
    p_match_count     INT   DEFAULT 10,
    p_project_id      UUID  DEFAULT NULL,
    p_metadata_filter JSONB DEFAULT NULL
)
RETURNS TABLE (
    chunk_id          UUID,
    document_id       UUID,
    chunk_index       INT,
    title             TEXT,
    content           TEXT,
    heading_path      TEXT[],
    heading_level     INT,
    score             FLOAT,
    doc_title         TEXT,
    doc_source        TEXT,
    doc_project_ids   UUID[],
    doc_project_names TEXT[],
    doc_metadata      JSONB,
    version_count     INT
)
LANGUAGE plpgsql
SECURITY DEFINER
STABLE
SET search_path = public, pg_catalog
AS $$
DECLARE
    -- plainto_tsquery: see rationale comment in cerefox_hybrid_search above.
    query_fts tsquery := plainto_tsquery('english', p_query_text);
BEGIN
    RETURN QUERY
    SELECT
        c.id            AS chunk_id,
        c.document_id,
        c.chunk_index,
        c.title,
        c.content,
        c.heading_path,
        c.heading_level,
        ts_rank_cd(c.fts, query_fts)::FLOAT AS score,
        d.title         AS doc_title,
        d.source        AS doc_source,
        ARRAY(SELECT dp.project_id FROM cerefox_document_projects dp
              WHERE dp.document_id = d.id) AS doc_project_ids,
        ARRAY(SELECT p.name FROM cerefox_projects p
              JOIN cerefox_document_projects dp ON p.id = dp.project_id
              WHERE dp.document_id = d.id) AS doc_project_names,
        d.metadata      AS doc_metadata,
        (SELECT COUNT(*)::INT FROM cerefox_document_versions dv
         WHERE dv.document_id = d.id) AS version_count
    FROM cerefox_chunks c
    JOIN cerefox_documents d ON c.document_id = d.id
    -- Only current chunks (not archived into a version) of non-deleted documents.
    WHERE c.version_id IS NULL
      AND d.deleted_at IS NULL
      AND c.fts @@ query_fts
      AND (p_project_id IS NULL OR EXISTS (
            SELECT 1 FROM cerefox_document_projects dp
            WHERE dp.document_id = d.id AND dp.project_id = p_project_id
      ))
      AND (p_metadata_filter IS NULL OR d.metadata @> p_metadata_filter)
    -- `score` resolves to the output column above. Rank ties are common for
    -- FTS, so c.id is added to make the LIMIT cut-off deterministic.
    ORDER BY score DESC, c.id
    LIMIT p_match_count;
END;
$$;
278
+
279
+ -- ── Semantic-Only Search ──────────────────────────────────────────────────────
280
+ -- Pure vector similarity. Best for conceptual / paraphrase queries.
281
+
282
-- Vector-only chunk search ranked by cosine similarity (1 - cosine distance).
--
-- Parameters:
--   p_query_embedding : 768-dim query embedding
--   p_match_count     : max rows returned
--   p_use_upgrade     : when TRUE, score against embedding_upgrade and skip
--                       chunks that have no upgrade embedding
--   p_project_id      : optional project filter via cerefox_document_projects
--   p_min_score       : minimum cosine similarity (0.0 = no filtering)
--   p_metadata_filter : optional JSONB containment filter on document metadata
--
-- Declared STABLE because the function only reads.
CREATE OR REPLACE FUNCTION cerefox_semantic_search(
    p_query_embedding VECTOR(768),
    p_match_count     INT     DEFAULT 10,
    p_use_upgrade     BOOLEAN DEFAULT FALSE,
    p_project_id      UUID    DEFAULT NULL,
    p_min_score       FLOAT   DEFAULT 0.0,
    p_metadata_filter JSONB   DEFAULT NULL
)
RETURNS TABLE (
    chunk_id          UUID,
    document_id       UUID,
    chunk_index       INT,
    title             TEXT,
    content           TEXT,
    heading_path      TEXT[],
    heading_level     INT,
    score             FLOAT,
    doc_title         TEXT,
    doc_source        TEXT,
    doc_project_ids   UUID[],
    doc_project_names TEXT[],
    doc_metadata      JSONB,
    version_count     INT
)
LANGUAGE plpgsql
SECURITY DEFINER
STABLE
SET search_path = public, pg_catalog
AS $$
BEGIN
    RETURN QUERY
    SELECT
        c.id            AS chunk_id,
        c.document_id,
        c.chunk_index,
        c.title,
        c.content,
        c.heading_path,
        c.heading_level,
        CASE
            WHEN p_use_upgrade AND c.embedding_upgrade IS NOT NULL
                THEN (1.0 - (c.embedding_upgrade <=> p_query_embedding))::FLOAT
            ELSE
                (1.0 - (c.embedding_primary <=> p_query_embedding))::FLOAT
        END             AS score,
        d.title         AS doc_title,
        d.source        AS doc_source,
        ARRAY(SELECT dp.project_id FROM cerefox_document_projects dp
              WHERE dp.document_id = d.id) AS doc_project_ids,
        ARRAY(SELECT p.name FROM cerefox_projects p
              JOIN cerefox_document_projects dp ON p.id = dp.project_id
              WHERE dp.document_id = d.id) AS doc_project_names,
        d.metadata      AS doc_metadata,
        (SELECT COUNT(*)::INT FROM cerefox_document_versions dv
         WHERE dv.document_id = d.id) AS version_count
    FROM cerefox_chunks c
    JOIN cerefox_documents d ON c.document_id = d.id
    -- Only current chunks (not archived into a version) of non-deleted documents.
    WHERE c.version_id IS NULL
      AND d.deleted_at IS NULL
      AND (p_project_id IS NULL OR EXISTS (
            SELECT 1 FROM cerefox_document_projects dp
            WHERE dp.document_id = d.id AND dp.project_id = p_project_id
      ))
      AND (p_metadata_filter IS NULL OR d.metadata @> p_metadata_filter)
      -- In upgrade mode, chunks without an upgrade embedding are excluded.
      AND (p_use_upgrade = FALSE OR c.embedding_upgrade IS NOT NULL)
      -- Optional minimum cosine similarity threshold.
      -- Default 0.0 means no filtering (returns all top-N results).
      -- When called via the Python layer, CEREFOX_MIN_SEARCH_SCORE (default 0.65)
      -- is applied client-side; agents calling this RPC directly can pass p_min_score.
      AND CASE
            WHEN p_use_upgrade AND c.embedding_upgrade IS NOT NULL
                THEN (1.0 - (c.embedding_upgrade <=> p_query_embedding))::FLOAT
            ELSE (1.0 - (c.embedding_primary <=> p_query_embedding))::FLOAT
          END >= p_min_score
    -- Ordered by the bare distance expression only (ascending = most similar
    -- first). No extra sort key is added, because that could stop the planner
    -- from using a vector index for this ORDER BY ... LIMIT.
    ORDER BY
        CASE
            WHEN p_use_upgrade AND c.embedding_upgrade IS NOT NULL
                THEN c.embedding_upgrade <=> p_query_embedding
            ELSE c.embedding_primary <=> p_query_embedding
        END
    LIMIT p_match_count;
END;
$$;
364
+
365
+ -- ── Document Reconstruction ───────────────────────────────────────────────────
366
+ -- Reassemble a full document from its chunks (ordered by chunk_index).
367
+ -- Agents use this after a chunk-level search to get broader context.
368
+
369
CREATE OR REPLACE FUNCTION cerefox_reconstruct_doc(
    p_document_id UUID
)
RETURNS TABLE (
    document_id       UUID,
    doc_title         TEXT,
    doc_source        TEXT,
    doc_metadata      JSONB,
    doc_project_ids   UUID[],
    doc_project_names TEXT[],
    full_content      TEXT,
    chunk_count       INT,
    total_chars       INT,
    version_count     INT
)
LANGUAGE sql
SECURITY DEFINER
STABLE
SET search_path = public, pg_catalog
AS $$
    -- Returns at most one row: the document's descriptive columns plus its
    -- current chunks concatenated in chunk_index order (blank line between
    -- chunks). A document with no current chunks yields no row at all.
    --
    -- NOTE(review): unlike the search RPCs, no deleted_at filter is applied
    -- here, so soft-deleted documents can still be reconstructed — confirm
    -- that this is intentional.
    WITH current_chunks AS (
        SELECT
            STRING_AGG(ch.content, E'\n\n' ORDER BY ch.chunk_index) AS full_content,
            COUNT(*)::INT                                           AS chunk_count,
            SUM(ch.char_count)::INT                                 AS total_chars
        FROM cerefox_chunks ch
        WHERE ch.document_id = p_document_id
          AND ch.version_id IS NULL      -- skip chunks archived into a version
        HAVING COUNT(*) > 0              -- no chunks => no result row
    )
    SELECT
        doc.id       AS document_id,
        doc.title    AS doc_title,
        doc.source   AS doc_source,
        doc.metadata AS doc_metadata,
        ARRAY(
            SELECT link.project_id
            FROM cerefox_document_projects link
            WHERE link.document_id = doc.id
        ) AS doc_project_ids,
        ARRAY(
            SELECT proj.name
            FROM cerefox_projects proj
            JOIN cerefox_document_projects link ON proj.id = link.project_id
            WHERE link.document_id = doc.id
        ) AS doc_project_names,
        cc.full_content,
        cc.chunk_count,
        cc.total_chars,
        (
            SELECT COUNT(*)::INT
            FROM cerefox_document_versions ver
            WHERE ver.document_id = doc.id
        ) AS version_count
    FROM cerefox_documents doc
    CROSS JOIN current_chunks cc
    WHERE doc.id = p_document_id;
$$;
410
+
411
+ -- ── cerefox_save_note ─────────────────────────────────────────────────────────
412
+ -- Agent write tool: create a minimal document record for a short text note.
413
+ -- Embedding and chunking are NOT done server-side in V1 — the Python ingestion
414
+ -- pipeline should be used for full ingest. This RPC is intended for quick
415
+ -- one-shot note capture from AI agents that want to store something immediately.
416
+ --
417
+ -- Parameters:
418
+ -- p_title : Note title (required)
419
+ -- p_content : Markdown content (required)
420
+ -- p_source : Origin label, e.g. 'agent' (default: 'agent')
421
+ -- p_project_id : Optional project UUID (assigns to a single project)
422
+ -- p_metadata : Optional JSONB metadata (e.g. agent name, session id)
423
+ --
424
+ -- Returns: the created document row (id, title, created_at)
425
+
426
CREATE OR REPLACE FUNCTION cerefox_save_note(
    p_title      TEXT,
    p_content    TEXT,
    p_source     TEXT  DEFAULT 'agent',
    p_project_id UUID  DEFAULT NULL,
    p_metadata   JSONB DEFAULT '{}'::JSONB
)
RETURNS TABLE (
    id         UUID,
    title      TEXT,
    created_at TIMESTAMPTZ
)
LANGUAGE plpgsql
SECURITY DEFINER
SET search_path = public, pg_catalog
AS $$
DECLARE
    v_hash       TEXT;
    v_doc_id     UUID;
    v_created_at TIMESTAMPTZ;
BEGIN
    -- Compute content hash to support deduplication on the caller side.
    --
    -- convert_to() yields the raw UTF-8 bytes of the text. A plain
    -- p_content::BYTEA cast must NOT be used: it runs the bytea input parser,
    -- which treats backslashes as escape sequences, so notes containing `\`
    -- (Markdown escapes, Windows paths, LaTeX) either fail with
    -- "invalid input syntax for type bytea" or are hashed as different bytes.
    v_hash := encode(sha256(convert_to(p_content, 'UTF8')), 'hex');

    -- Insert the document record only: chunk_count is 0 because no chunks or
    -- embeddings are created by this RPC; total_chars is the character length.
    INSERT INTO cerefox_documents (
        title, source, content_hash, metadata, chunk_count, total_chars
    ) VALUES (
        p_title, p_source, v_hash, p_metadata, 0, length(p_content)
    )
    -- Table-qualified because id / created_at are also OUT column names.
    RETURNING cerefox_documents.id, cerefox_documents.created_at
    INTO v_doc_id, v_created_at;

    -- Assign to project if provided (many-to-many junction).
    IF p_project_id IS NOT NULL THEN
        INSERT INTO cerefox_document_projects (document_id, project_id)
        VALUES (v_doc_id, p_project_id)
        ON CONFLICT DO NOTHING;
    END IF;

    RETURN QUERY SELECT v_doc_id, p_title, v_created_at;
END;
$$;
468
+
469
+ -- ── cerefox_context_expand ────────────────────────────────────────────────────
470
+ -- Small-to-big retrieval: given a set of chunk IDs from a search result,
471
+ -- return those chunks plus their immediate neighbours (±window_size by
472
+ -- chunk_index within the same document). Use this after a chunk-level search
473
+ -- to recover more surrounding context without fetching the full document.
474
+ --
475
+ -- Parameters:
476
+ -- p_chunk_ids : Array of chunk UUIDs from the search results
477
+ -- p_window_size : Number of chunks to expand in each direction (default: 1)
478
+ --
479
+ -- Returns each expanded chunk with is_seed=TRUE for original results.
480
+
481
CREATE OR REPLACE FUNCTION cerefox_context_expand(
    p_chunk_ids   UUID[],
    p_window_size INT DEFAULT 1
)
RETURNS TABLE (
    chunk_id      UUID,
    document_id   UUID,
    chunk_index   INT,
    title         TEXT,
    content       TEXT,
    heading_path  TEXT[],
    heading_level INT,
    doc_title     TEXT,
    is_seed       BOOL
)
LANGUAGE sql
SECURITY DEFINER
STABLE
SET search_path = public, pg_catalog
AS $$
    -- A current chunk is returned when at least one seed chunk (a current
    -- chunk whose id is in p_chunk_ids) lives in the same document and sits
    -- within p_window_size positions of it. Seeds match themselves, so they
    -- are always part of the result. The semi-join returns each chunk once
    -- even when the windows of several seeds overlap.
    SELECT
        nb.id                    AS chunk_id,
        nb.document_id,
        nb.chunk_index,
        nb.title,
        nb.content,
        nb.heading_path,
        nb.heading_level,
        doc.title                AS doc_title,
        nb.id = ANY(p_chunk_ids) AS is_seed
    FROM cerefox_chunks nb
    JOIN cerefox_documents doc ON doc.id = nb.document_id
    WHERE nb.version_id IS NULL
      AND EXISTS (
            SELECT 1
            FROM cerefox_chunks seed
            WHERE seed.id = ANY(p_chunk_ids)
              AND seed.version_id IS NULL
              AND seed.document_id = nb.document_id
              AND nb.chunk_index >= seed.chunk_index - p_window_size
              AND nb.chunk_index <= seed.chunk_index + p_window_size
      )
    ORDER BY nb.document_id, nb.chunk_index;
$$;
530
+
531
+ -- ── cerefox_search_docs ───────────────────────────────────────────────────────
532
+ -- Document-level hybrid search: runs hybrid search internally, deduplicates
533
+ -- results by document (keeping the best-scoring chunk per document), and
534
+ -- returns up to p_match_count *distinct documents* with their content.
535
+ --
536
+ -- ── RPC-level configuration (not exposed via .env) ────────────────────────────
537
+ -- Two params below are intentionally NOT surfaced in Python config or .env.
538
+ -- They are system-level tuning knobs with the same role as OPENAI_MODEL and
539
+ -- EMBEDDING_DIMENSIONS in the Edge Functions — change them here and redeploy
540
+ -- rpcs.sql (python scripts/db_deploy.py) if you need different values.
541
+ --
542
+ -- p_small_to_big_threshold (default: 20000 chars)
543
+ -- Documents larger than this return matched chunks + neighbours instead of
544
+ -- the full document. Set to 0 to always return full document content.
545
+ -- Rationale: at the default match_count=5 and 200 KB response ceiling,
546
+ -- 5 × 20 000 chars ≈ 100 KB — comfortably under the limit even before
547
+ -- accounting for small-to-big compression of large docs.
548
+ --
549
+ -- p_context_window (default: 1)
550
+ -- Neighbour chunks on each side of each matched chunk.
551
+ -- N=1 → up to 3 contiguous chunks per hit (prev, match, next).
552
+ -- N=0 → matched chunks only (no expansion).
553
+ -- N=2 → up to 5 contiguous chunks per hit.
554
+ -- ─────────────────────────────────────────────────────────────────────────────
555
+ --
556
+ -- Parameters:
557
+ -- p_query_text : Query string (used for FTS)
558
+ -- p_query_embedding : 768-dim query embedding (used for vector search)
559
+ -- p_match_count : Max documents to return (default: 5)
560
+ -- p_alpha : Semantic weight 0.0–1.0 (default: 0.7)
561
+ -- p_project_id : Optional project filter (M2M)
562
+ -- p_min_score : Minimum cosine similarity for vector results
563
+ -- p_small_to_big_threshold : See above (default: 20000)
564
+ -- p_context_window : See above (default: 1)
565
+ --
566
+ -- Returns one row per document. total_chars is always the full document size.
567
+ -- chunk_count reflects how many chunks are in full_content (may be partial).
568
+ -- is_partial = TRUE when the small-to-big path was taken for that document.
569
+
570
CREATE OR REPLACE FUNCTION cerefox_search_docs(
    p_query_text             TEXT,
    p_query_embedding        VECTOR(768),
    p_match_count            INT   DEFAULT 5,
    p_alpha                  FLOAT DEFAULT 0.7,
    p_project_id             UUID  DEFAULT NULL,
    p_min_score              FLOAT DEFAULT 0.0,
    p_small_to_big_threshold INT   DEFAULT 20000,
    p_context_window         INT   DEFAULT 1,
    p_metadata_filter        JSONB DEFAULT NULL
)
RETURNS TABLE (
    document_id             UUID,
    doc_title               TEXT,
    doc_source              TEXT,
    doc_metadata            JSONB,
    doc_project_ids         UUID[],
    doc_project_names       TEXT[],
    best_score              FLOAT,
    best_chunk_heading_path TEXT[],
    full_content            TEXT,
    chunk_count             INT,
    total_chars             INT,
    doc_updated_at          TIMESTAMPTZ,
    version_count           INT,
    is_partial              BOOL
)
LANGUAGE sql
SECURITY DEFINER
STABLE
SET search_path = public, pg_catalog
AS $$
    -- Pipeline: hybrid chunk search -> best chunk per document -> top N
    -- documents -> per-document content (full text for small documents,
    -- matched chunks plus neighbours for documents above the threshold).
    -- Every ORDER BY that feeds a LIMIT or DISTINCT ON carries a tie-breaker
    -- so equal scores produce the same result on every call.
    WITH chunk_results AS (
        -- Run hybrid search with a 10x candidate pool so deduplication has
        -- enough candidates to fill p_match_count unique documents.
        SELECT
            hs.chunk_id,
            hs.document_id,
            hs.chunk_index,
            hs.heading_path,
            hs.score,
            hs.doc_title,
            hs.doc_source,
            hs.doc_metadata,
            hs.doc_project_ids,
            hs.doc_project_names,
            hs.version_count
        FROM cerefox_hybrid_search(
            p_query_text      := p_query_text,
            p_query_embedding := p_query_embedding,
            p_match_count     := p_match_count * 10,
            p_alpha           := p_alpha,
            p_use_upgrade     := FALSE,
            p_project_id      := p_project_id,
            p_min_score       := p_min_score,
            p_metadata_filter := p_metadata_filter
        ) AS hs
    ),
    best_per_doc AS (
        -- One row per document: keep the highest-scoring chunk as representative
        -- (earliest chunk_index wins when scores tie).
        SELECT DISTINCT ON (cr.document_id)
            cr.document_id,
            cr.heading_path AS best_chunk_heading_path,
            cr.score        AS best_score,
            cr.doc_title,
            cr.doc_source,
            cr.doc_metadata,
            cr.doc_project_ids,
            cr.doc_project_names,
            cr.version_count,
            d.updated_at    AS doc_updated_at
        FROM chunk_results cr
        JOIN cerefox_documents d ON d.id = cr.document_id
        ORDER BY cr.document_id, cr.score DESC, cr.chunk_index
    ),
    top_docs AS (
        SELECT
            bpd.document_id,
            bpd.best_chunk_heading_path,
            bpd.best_score,
            bpd.doc_title,
            bpd.doc_source,
            bpd.doc_metadata,
            bpd.doc_project_ids,
            bpd.doc_project_names,
            bpd.version_count,
            bpd.doc_updated_at
        FROM best_per_doc bpd
        ORDER BY bpd.best_score DESC, bpd.document_id
        LIMIT p_match_count
    ),
    -- Compute actual total_chars per top document (needed for threshold check).
    doc_sizes AS (
        SELECT c.document_id, SUM(c.char_count)::INT AS total_chars
        FROM cerefox_chunks c
        WHERE c.document_id IN (SELECT td.document_id FROM top_docs td)
          AND c.version_id IS NULL
        GROUP BY c.document_id
    ),
    -- Matched chunk IDs from documents that exceed the threshold.
    -- A threshold of 0 disables the small-to-big path entirely.
    large_doc_seeds AS (
        SELECT cr.chunk_id
        FROM chunk_results cr
        JOIN doc_sizes ds ON cr.document_id = ds.document_id
        WHERE p_small_to_big_threshold > 0
          AND ds.total_chars > p_small_to_big_threshold
          AND cr.document_id IN (SELECT td.document_id FROM top_docs td)
    ),
    -- Expand context for all large-doc seeds in a single call.
    -- cerefox_context_expand respects document boundaries and deduplicates.
    -- When large_doc_seeds is empty (threshold=0 or all docs are small),
    -- ARRAY_AGG returns NULL; COALESCE converts that to an empty array so the
    -- function returns 0 rows safely.
    expanded AS (
        SELECT ec.chunk_id, ec.document_id, ec.chunk_index, ec.content
        FROM cerefox_context_expand(
            COALESCE((SELECT ARRAY_AGG(lds.chunk_id) FROM large_doc_seeds lds), ARRAY[]::UUID[]),
            p_context_window
        ) ec
    ),
    -- Aggregate expanded chunks per large document (is_partial = TRUE).
    large_doc_content AS (
        SELECT
            e.document_id,
            STRING_AGG(e.content, E'\n\n' ORDER BY e.chunk_index) AS full_content,
            COUNT(*)::INT AS chunk_count,
            TRUE          AS is_partial
        FROM expanded e
        GROUP BY e.document_id
    ),
    -- Full content for small documents (is_partial = FALSE).
    small_doc_content AS (
        SELECT
            c.document_id,
            STRING_AGG(c.content, E'\n\n' ORDER BY c.chunk_index) AS full_content,
            COUNT(*)::INT AS chunk_count,
            FALSE         AS is_partial
        FROM cerefox_chunks c
        WHERE c.document_id IN (SELECT td.document_id FROM top_docs td)
          -- NOT EXISTS rather than NOT IN: same result here, but it cannot be
          -- emptied by a NULL in the subquery.
          AND NOT EXISTS (
                SELECT 1
                FROM large_doc_content ldc
                WHERE ldc.document_id = c.document_id
          )
          AND c.version_id IS NULL
        GROUP BY c.document_id
    ),
    all_content AS (
        SELECT ldc.document_id, ldc.full_content, ldc.chunk_count, ldc.is_partial
        FROM large_doc_content ldc
        UNION ALL
        SELECT sdc.document_id, sdc.full_content, sdc.chunk_count, sdc.is_partial
        FROM small_doc_content sdc
    )
    SELECT
        td.document_id,
        td.doc_title,
        td.doc_source,
        td.doc_metadata,
        td.doc_project_ids,
        td.doc_project_names,
        td.best_score,
        td.best_chunk_heading_path,
        ac.full_content,
        ac.chunk_count,
        ds.total_chars,   -- always full document size, even for partial results
        td.doc_updated_at,
        td.version_count,
        ac.is_partial
    FROM top_docs td
    JOIN doc_sizes ds   ON ds.document_id = td.document_id
    JOIN all_content ac ON ac.document_id = td.document_id
    ORDER BY td.best_score DESC, td.document_id;
$$;
716
+
717
+ -- ── Metadata key discovery RPC ───────────────────────────────────────────────
718
+ -- Derives metadata keys from actual document data (metadata JSONB column).
719
+ -- No registry table needed — always accurate, zero maintenance.
720
+ -- Used by CLI, MCP tools, web UI autocomplete.
721
+
722
+ -- ── cerefox_snapshot_version ──────────────────────────────────────────────────
723
+ -- Archives all current chunks for a document (sets version_id to the new version
724
+ -- row's UUID) and runs lazy retention cleanup.
725
+ --
726
+ -- Called by the Python pipeline's update_document() and by the TypeScript Edge
727
+ -- Functions before inserting new chunks. This single RPC is the canonical way to
728
+ -- create a version — do not split the chunk-archiving step into separate code.
729
+ --
730
+ -- Retention policy (p_retention_hours):
731
+ -- - Always keeps the most recently created version (accidental-deletion protection)
732
+ -- - Also keeps all versions created within the retention window
733
+ -- - Deletes older versions beyond the window (cascade removes their chunks)
734
+ --
735
+ -- Parameters:
736
+ -- p_document_id : Document to snapshot
737
+ -- p_source : How the update was triggered ('file','paste','agent','manual')
738
+ -- p_retention_hours : Retention window in hours (default: 48)
739
+ --
740
+ -- Returns: (version_id, version_number, chunk_count, total_chars) of the new version
741
+
742
DROP FUNCTION IF EXISTS cerefox_snapshot_version(UUID, TEXT, INT);
DROP FUNCTION IF EXISTS cerefox_snapshot_version(UUID, TEXT, INT, BOOLEAN);
CREATE FUNCTION cerefox_snapshot_version(
    p_document_id     UUID,
    p_source          TEXT    DEFAULT 'manual',
    p_retention_hours INT     DEFAULT 48,
    p_cleanup_enabled BOOLEAN DEFAULT TRUE
)
RETURNS TABLE (
    version_id     UUID,
    version_number INT,
    chunk_count    INT,
    total_chars    INT
)
LANGUAGE plpgsql
SECURITY DEFINER
SET search_path = public, pg_catalog
AS $$
-- Archives the current chunks of p_document_id under a new version row and
-- optionally prunes old versions.
--
-- Returns one row: (version_id, version_number, chunk_count, total_chars)
-- describing the version that was just created.
DECLARE
    v_version_id     UUID;
    v_version_number INT;
    v_chunk_count    INT;
    v_total_chars    INT;
BEGIN
    -- Size of the content being archived (current chunks have version_id NULL).
    SELECT COUNT(*), COALESCE(SUM(c.char_count), 0)
      INTO v_chunk_count, v_total_chars
      FROM cerefox_chunks c
     WHERE c.document_id = p_document_id
       AND c.version_id IS NULL;

    -- Next sequential version number for this document (1 when none exist).
    SELECT COALESCE(MAX(dv.version_number), 0) + 1
      INTO v_version_number
      FROM cerefox_document_versions dv
     WHERE dv.document_id = p_document_id;

    -- Create the version row.
    INSERT INTO cerefox_document_versions (
        document_id, version_number, source, chunk_count, total_chars
    ) VALUES (
        p_document_id, v_version_number, p_source, v_chunk_count, v_total_chars
    )
    RETURNING id INTO v_version_id;

    -- Archive all current chunks by pointing them at the new version.
    UPDATE cerefox_chunks c
       SET version_id = v_version_id
     WHERE c.document_id = p_document_id
       AND c.version_id IS NULL;

    -- Lazy retention: delete non-archived versions older than the retention
    -- window. The version created above is excluded by id rather than by
    -- "newest created_at": NOW() is constant within a transaction, so several
    -- snapshots taken in one transaction share a timestamp and an
    -- ORDER BY created_at DESC LIMIT 1 pick would be arbitrary.
    -- Skipped entirely when p_cleanup_enabled is not TRUE (immutable mode).
    IF p_cleanup_enabled THEN
        DELETE FROM cerefox_document_versions dv
         WHERE dv.document_id = p_document_id
           AND dv.archived IS NOT TRUE
           AND dv.created_at < NOW() - (p_retention_hours || ' hours')::INTERVAL
           AND dv.id <> v_version_id;
    END IF;

    RETURN QUERY SELECT v_version_id, v_version_number, v_chunk_count, v_total_chars;
END;
$$;
813
+
814
+ -- ── cerefox_get_document ──────────────────────────────────────────────────────
815
+ -- Returns the full content of a document by reconstructing it from chunks.
816
+ -- Pass p_version_id = NULL (or omit it) for the current version.
817
+ -- Pass a specific version UUID to retrieve an archived version.
818
+ -- Version UUIDs are returned by cerefox_list_document_versions.
819
+
820
-- Dropped first so the script can be re-applied; a bare CREATE FUNCTION fails
-- with "function already exists" on a second run.
DROP FUNCTION IF EXISTS cerefox_get_document(UUID, UUID);
CREATE FUNCTION cerefox_get_document(
    p_document_id UUID,
    p_version_id  UUID DEFAULT NULL
)
RETURNS TABLE (
    document_id       UUID,
    doc_title         TEXT,
    doc_source        TEXT,
    doc_metadata      JSONB,
    doc_project_ids   UUID[],
    doc_project_names TEXT[],
    version_id        UUID,
    full_content      TEXT,
    chunk_count       INT,
    total_chars       INT,
    created_at        TIMESTAMPTZ
)
LANGUAGE sql
SECURITY DEFINER
STABLE
SET search_path = public, pg_catalog
AS $$
    -- Rebuilds the document text by concatenating its chunks in chunk_index
    -- order, separated by a blank line. The inner join means a document with
    -- no matching chunks yields zero rows rather than a row with NULL content.
    SELECT
        d.id        AS document_id,
        d.title     AS doc_title,
        d.source    AS doc_source,
        d.metadata  AS doc_metadata,
        ARRAY(SELECT dp.project_id FROM cerefox_document_projects dp
              WHERE dp.document_id = d.id)  AS doc_project_ids,
        ARRAY(SELECT p.name FROM cerefox_projects p
              JOIN cerefox_document_projects dp ON p.id = dp.project_id
              WHERE dp.document_id = d.id)  AS doc_project_names,
        p_version_id                        AS version_id,
        STRING_AGG(c.content, E'\n\n' ORDER BY c.chunk_index) AS full_content,
        COUNT(*)::INT                       AS chunk_count,
        SUM(c.char_count)::INT              AS total_chars,
        d.created_at
    FROM cerefox_documents d
    INNER JOIN cerefox_chunks c ON c.document_id = d.id
    WHERE d.id = p_document_id
      AND (
          -- NULL selects the current chunks; a UUID selects that archived version.
          (p_version_id IS NULL     AND c.version_id IS NULL) OR
          (p_version_id IS NOT NULL AND c.version_id = p_version_id)
      )
    GROUP BY d.id, d.title, d.source, d.metadata, d.created_at;
$$;
866
+
867
+ -- ── cerefox_list_document_versions ────────────────────────────────────────────
868
+ -- Returns all archived versions for a document, newest first.
869
+ -- version_id is the UUID to pass to cerefox_get_document for retrieval.
870
+ -- version_number is the sequential human-readable number (unique per document).
871
+
872
DROP FUNCTION IF EXISTS cerefox_list_document_versions(UUID);
CREATE FUNCTION cerefox_list_document_versions(
    p_document_id UUID
)
RETURNS TABLE (
    version_id     UUID,
    version_number INT,
    source         TEXT,
    chunk_count    INT,
    total_chars    INT,
    archived       BOOLEAN,
    created_at     TIMESTAMPTZ
)
LANGUAGE sql
SECURITY DEFINER
STABLE
SET search_path = public, pg_catalog
AS $$
    -- One row per stored version of the document, newest first.
    SELECT
        dv.id,
        dv.version_number,
        dv.source,
        dv.chunk_count,
        dv.total_chars,
        dv.archived,
        dv.created_at
    FROM cerefox_document_versions AS dv
    WHERE dv.document_id = p_document_id
    ORDER BY dv.created_at DESC;
$$;
895
+
896
+ -- ── cerefox_delete_document (soft delete) ────────────────────────────────────
897
+ -- Soft-deletes a document by setting deleted_at = NOW(). The document, its
898
+ -- chunks, and versions remain in the database but are excluded from search.
899
+ -- Use cerefox_purge_document for permanent deletion.
900
+ -- Use cerefox_restore_document to undo a soft delete.
901
+
902
DROP FUNCTION IF EXISTS cerefox_delete_document(UUID, TEXT, TEXT);
DROP FUNCTION IF EXISTS cerefox_delete_document(UUID);
CREATE FUNCTION cerefox_delete_document(
    p_document_id UUID,
    p_author      TEXT DEFAULT 'unknown',
    p_author_type TEXT DEFAULT 'user'
)
RETURNS VOID
LANGUAGE plpgsql
SECURITY DEFINER
SET search_path = public, pg_catalog
AS $$
-- Marks the document as deleted (deleted_at = NOW()) and records a 'delete'
-- audit entry. Raises when no document has the given id.
DECLARE
    v_doc_title TEXT;
    v_doc_chars INT;
BEGIN
    SELECT d.title, d.total_chars
      INTO v_doc_title, v_doc_chars
      FROM cerefox_documents AS d
     WHERE d.id = p_document_id;

    IF NOT FOUND THEN
        RAISE EXCEPTION 'Document % not found', p_document_id;
    END IF;

    -- Soft delete only: the row, its chunks and its versions stay in place.
    UPDATE cerefox_documents
       SET deleted_at = NOW()
     WHERE id = p_document_id;

    PERFORM cerefox_create_audit_entry(
        p_document_id := p_document_id,
        p_operation   := 'delete',
        p_author      := p_author,
        p_author_type := p_author_type,
        p_size_before := v_doc_chars,
        p_size_after  := 0,
        p_description := 'Soft-deleted document: ' || COALESCE(v_doc_title, '(untitled)') ||
                         ' (' || COALESCE(v_doc_chars, 0) || ' chars)'
    );
END;
$$;
940
+
941
+ -- ── cerefox_restore_document ─────────────────────────────────────────────────
942
+ -- Restores a soft-deleted document by clearing deleted_at.
943
+
944
CREATE OR REPLACE FUNCTION cerefox_restore_document(
    p_document_id UUID,
    p_author      TEXT DEFAULT 'unknown',
    p_author_type TEXT DEFAULT 'user'
)
RETURNS VOID
LANGUAGE plpgsql
SECURITY DEFINER
SET search_path = public, pg_catalog
AS $$
-- Clears deleted_at on a soft-deleted document and records a 'restore' audit
-- entry. Silently does nothing when the id is unknown or the document is not
-- currently soft-deleted.
DECLARE
    v_title       TEXT;
    v_total_chars INT;
BEGIN
    SELECT title, total_chars INTO v_title, v_total_chars
    FROM cerefox_documents WHERE id = p_document_id AND deleted_at IS NOT NULL;

    -- Test FOUND rather than v_title: a soft-deleted row with a NULL title is
    -- still a real row and must be restorable.
    IF NOT FOUND THEN
        RETURN;  -- Not found or not deleted
    END IF;

    UPDATE cerefox_documents SET deleted_at = NULL WHERE id = p_document_id;

    PERFORM cerefox_create_audit_entry(
        p_document_id := p_document_id,
        p_operation   := 'restore',
        p_author      := p_author,
        p_author_type := p_author_type,
        p_size_before := 0,
        p_size_after  := v_total_chars,
        p_description := 'Restored document: ' || COALESCE(v_title, '(untitled)')
    );
END;
$$;
978
+
979
+ -- ── cerefox_purge_document ───────────────────────────────────────────────────
980
+ -- Permanently deletes a soft-deleted document (CASCADE). Only works on
981
+ -- documents that are already soft-deleted (deleted_at IS NOT NULL).
982
+
983
CREATE OR REPLACE FUNCTION cerefox_purge_document(
    p_document_id UUID,
    p_author      TEXT DEFAULT 'unknown',
    p_author_type TEXT DEFAULT 'user'
)
RETURNS VOID
LANGUAGE plpgsql
SECURITY DEFINER
SET search_path = public, pg_catalog
AS $$
-- Permanently removes a document that is already soft-deleted, after recording
-- a 'delete' audit entry. Silently does nothing when the id is unknown or the
-- document is not soft-deleted.
DECLARE
    v_title       TEXT;
    v_total_chars INT;
BEGIN
    SELECT title, total_chars INTO v_title, v_total_chars
    FROM cerefox_documents WHERE id = p_document_id AND deleted_at IS NOT NULL;

    -- Test FOUND rather than v_title: a soft-deleted row with a NULL title is
    -- still a real row and must be purgeable.
    IF NOT FOUND THEN
        RETURN;  -- Not found or not soft-deleted
    END IF;

    -- The audit entry is written before the row is removed.
    PERFORM cerefox_create_audit_entry(
        p_document_id := p_document_id,
        p_operation   := 'delete',
        p_author      := p_author,
        p_author_type := p_author_type,
        p_size_before := v_total_chars,
        p_size_after  := 0,
        p_description := 'Permanently deleted document: ' || COALESCE(v_title, '(untitled)') ||
                         ' (' || COALESCE(v_total_chars, 0) || ' chars)'
    );

    DELETE FROM cerefox_documents WHERE id = p_document_id;
END;
$$;
1018
+
1019
+
1020
+ -- ── cerefox_ingest_document ──────────────────────────────────────────────────
1021
+ -- Single RPC for ingesting a document (create or update). Handles:
1022
+ -- - Create: insert document row, insert chunks, set review_status, create audit entry
1023
+ -- - Update: snapshot old version, delete old chunks, update document row,
1024
+ -- insert new chunks, set review_status, create audit entry
1025
+ --
1026
+ -- Both the Python pipeline and the Edge Function call this after chunking and
1027
+ -- embedding. This is the single implementation of the ingestion write path.
1028
+ --
1029
+ -- Parameters:
1030
+ -- p_document_id : NULL for create, UUID for update
1031
+ -- p_title, p_source, p_source_path, p_content_hash, p_metadata : document fields
1032
+ -- p_review_status : 'approved' or 'pending_review' (based on author_type)
1033
+ -- p_chunks : JSONB array of chunk objects, each with:
1034
+ -- chunk_index, heading_path, heading_level, title,
1035
+ -- content, char_count, embedding (float[]), embedder (text)
1036
+ -- p_author, p_author_type : for audit entry
1037
+ -- p_source_label : version source label for snapshot ('file','paste','agent','manual')
1038
+ -- p_retention_hours : for version cleanup (default 48)
1039
+ -- p_cleanup_enabled : whether version cleanup runs (default true)
1040
+ --
1041
+ -- Returns: document_id, chunk_count, total_chars, operation ('create' or 'update-content'),
1042
+ -- version_id (UUID of snapshot, null on create)
1043
+
1044
DROP FUNCTION IF EXISTS cerefox_ingest_document(UUID, TEXT, TEXT, TEXT, TEXT, JSONB, TEXT, JSONB, TEXT, TEXT, TEXT, INT, BOOLEAN);
CREATE FUNCTION cerefox_ingest_document(
    p_document_id     UUID    DEFAULT NULL,
    p_title           TEXT    DEFAULT 'Untitled',
    p_source          TEXT    DEFAULT 'agent',
    p_source_path     TEXT    DEFAULT NULL,
    p_content_hash    TEXT    DEFAULT '',
    p_metadata        JSONB   DEFAULT '{}',
    p_review_status   TEXT    DEFAULT 'approved',
    p_chunks          JSONB   DEFAULT '[]',
    p_author          TEXT    DEFAULT 'unknown',
    p_author_type     TEXT    DEFAULT 'user',
    p_source_label    TEXT    DEFAULT 'manual',
    p_retention_hours INT     DEFAULT 48,
    p_cleanup_enabled BOOLEAN DEFAULT TRUE
)
RETURNS TABLE (
    document_id UUID,
    chunk_count INT,
    total_chars INT,
    operation   TEXT,
    version_id  UUID
)
LANGUAGE plpgsql
SECURITY DEFINER
SET search_path = public, pg_catalog
AS $$
-- Creates a document (p_document_id NULL) or replaces the content of an
-- existing one, inserts the supplied chunks, and writes an audit entry.
-- Returns one row: (document_id, chunk_count, total_chars, operation,
-- version_id); version_id is NULL on create.
-- Raises 22023 when p_chunks is NULL or empty, and raises when p_document_id
-- is supplied but no such document exists.
DECLARE
    v_doc_id      UUID;
    v_chunk_count INT;
    v_total_chars INT;
    v_operation   TEXT;
    v_version_id  UUID := NULL;
    v_old_chars   INT  := 0;
    v_status      TEXT;
BEGIN
    -- ── Zero-chunk guard (v0.3.1) ────────────────────────────────────────
    -- Refuse to create or update a document with no chunks. Every parameter
    -- has a DEFAULT, so a bare call would otherwise create an orphan
    -- 'Untitled' row with no body and no embeddings.
    -- If you actually need to clear a doc's content, soft-delete it.
    IF p_chunks IS NULL OR jsonb_array_length(p_chunks) = 0 THEN
        RAISE EXCEPTION
            'cerefox_ingest_document: refusing to write a document with zero chunks (title=%, source=%). Supply at least one chunk, or use cerefox_delete_document to clear content.',
            p_title, p_source
            USING ERRCODE = '22023';  -- invalid_parameter_value
    END IF;

    -- Unknown review_status values fall back to 'approved'.
    v_status := CASE WHEN p_review_status IN ('approved', 'pending_review')
                     THEN p_review_status ELSE 'approved' END;

    -- Chunk count and total size come from the input; a chunk without
    -- char_count contributes 0.
    v_chunk_count := jsonb_array_length(p_chunks);
    SELECT COALESCE(SUM(COALESCE((elem->>'char_count')::INT, 0)), 0)
      INTO v_total_chars
      FROM jsonb_array_elements(p_chunks) AS elem;

    IF p_document_id IS NOT NULL THEN
        -- ── UPDATE PATH ──────────────────────────────────────────────
        v_doc_id    := p_document_id;
        v_operation := 'update-content';

        -- Old size for the audit entry; this lookup also proves the row exists.
        SELECT COALESCE(d.total_chars, 0) INTO v_old_chars
        FROM cerefox_documents d WHERE d.id = v_doc_id;

        -- Fail with a clear message before snapshotting or inserting chunks
        -- for a document id that does not exist.
        IF NOT FOUND THEN
            RAISE EXCEPTION 'Document % not found', v_doc_id;
        END IF;

        -- Snapshot old version (archives current chunks, runs retention cleanup).
        SELECT sv.version_id INTO v_version_id
        FROM cerefox_snapshot_version(v_doc_id, p_source_label, p_retention_hours, p_cleanup_enabled) sv;

        -- Update document record; source_path is kept when the caller passes NULL.
        UPDATE cerefox_documents SET
            title         = p_title,
            source        = p_source,
            source_path   = COALESCE(p_source_path, source_path),
            content_hash  = p_content_hash,
            metadata      = p_metadata,
            chunk_count   = v_chunk_count,
            total_chars   = v_total_chars,
            review_status = v_status,
            updated_at    = NOW()
        WHERE id = v_doc_id;

    ELSE
        -- ── CREATE PATH ──────────────────────────────────────────────
        v_operation := 'create';

        INSERT INTO cerefox_documents (
            title, source, source_path, content_hash, metadata,
            chunk_count, total_chars, review_status
        ) VALUES (
            p_title, p_source, p_source_path, p_content_hash, p_metadata,
            v_chunk_count, v_total_chars, v_status
        )
        RETURNING id INTO v_doc_id;
    END IF;

    -- ── Insert chunks ────────────────────────────────────────────────
    -- fts is computed here so callers do not pre-compute tsvectors.
    -- Formula: doc_title (A) || chunk_heading (A) || body_content (B)
    INSERT INTO cerefox_chunks (
        document_id, chunk_index, heading_path, heading_level,
        title, content, char_count, embedding_primary, embedder_primary, fts
    )
    SELECT
        v_doc_id,
        (c->>'chunk_index')::INT,
        ARRAY(SELECT jsonb_array_elements_text(c->'heading_path')),
        (c->>'heading_level')::INT,
        c->>'title',
        c->>'content',
        (c->>'char_count')::INT,
        -- An aggregate without ORDER BY has no guaranteed input order, and a
        -- reordered embedding is a different vector; pin it to the JSON array
        -- position.
        (SELECT array_agg(e.val::FLOAT ORDER BY e.ord)::VECTOR(768)
           FROM jsonb_array_elements_text(c->'embedding') WITH ORDINALITY AS e(val, ord)),
        c->>'embedder',
        setweight(to_tsvector('english', COALESCE(p_title, '')), 'A') ||
        setweight(to_tsvector('english', COALESCE(c->>'title', '')), 'A') ||
        setweight(to_tsvector('english', COALESCE(c->>'content', '')), 'B')
    FROM jsonb_array_elements(p_chunks) AS c;

    -- ── Audit entry ──────────────────────────────────────────────────
    -- COALESCE the title: '||' with a NULL operand would turn the whole
    -- description into NULL.
    PERFORM cerefox_create_audit_entry(
        p_document_id := v_doc_id,
        p_version_id  := v_version_id,
        p_operation   := v_operation,
        p_author      := p_author,
        p_author_type := p_author_type,
        p_size_before := CASE WHEN v_operation = 'create' THEN NULL ELSE v_old_chars END,
        p_size_after  := v_total_chars,
        p_description := v_operation || ': ' || COALESCE(p_title, '(untitled)') ||
                         ' (' || v_chunk_count || ' chunks, ' || v_total_chars || ' chars)'
    );

    RETURN QUERY SELECT v_doc_id, v_chunk_count, v_total_chars, v_operation, v_version_id;
END;
$$;
1191
+
1192
+
1193
+ -- ── cerefox_update_chunk_fts ──────────────────────────────────────────────────
1194
+ -- Updates the FTS tsvector for all current chunks of a document using a new
1195
+ -- document title. Called when a document's title changes without a content change
1196
+ -- (the content-unchanged path in the ingestion pipeline skips cerefox_ingest_document).
1197
+ --
1198
+ -- Formula: doc_title (A) || chunk_heading (A) || body_content (B)
1199
+ -- Reads chunk title and content directly from the DB -- caller only needs to
1200
+ -- supply the new document title.
1201
+ --
1202
+ -- Only affects current chunks (version_id IS NULL). Archived chunks retain their
1203
+ -- original tsvectors (they are excluded from all search indexes and require
1204
+ -- re-ingestion to restore anyway).
1205
+
1206
DROP FUNCTION IF EXISTS cerefox_update_chunk_fts(UUID, TEXT);
CREATE FUNCTION cerefox_update_chunk_fts(
    p_document_id UUID,
    p_new_title   TEXT
)
RETURNS VOID
LANGUAGE sql
SECURITY DEFINER
SET search_path = public, pg_catalog
AS $$
    -- Recomputes fts for the document's current chunks (version_id IS NULL)
    -- from the new document title plus each chunk's stored title and content.
    UPDATE cerefox_chunks AS ch
       SET fts = setweight(to_tsvector('english', COALESCE(p_new_title, '')), 'A')
              || setweight(to_tsvector('english', COALESCE(ch.title, '')), 'A')
              || setweight(to_tsvector('english', COALESCE(ch.content, '')), 'B')
     WHERE ch.version_id IS NULL
       AND ch.document_id = p_document_id;
$$;
1224
+
1225
+
1226
+ -- ── cerefox_create_audit_entry ────────────────────────────────────────────────
1227
+ -- Inserts an immutable audit log entry. Called by all access paths (Python
1228
+ -- pipeline, Edge Functions, MCP) to maintain the single implementation principle.
1229
+ -- Returns the created entry's id and created_at.
1230
+
1231
DROP FUNCTION IF EXISTS cerefox_create_audit_entry(UUID, UUID, TEXT, TEXT, TEXT, INT, INT, TEXT);
CREATE FUNCTION cerefox_create_audit_entry(
    p_document_id UUID DEFAULT NULL,
    p_version_id  UUID DEFAULT NULL,
    p_operation   TEXT DEFAULT 'create',
    p_author      TEXT DEFAULT 'unknown',
    p_author_type TEXT DEFAULT 'user',
    p_size_before INT  DEFAULT NULL,
    p_size_after  INT  DEFAULT NULL,
    p_description TEXT DEFAULT ''
)
RETURNS TABLE (
    audit_id   UUID,
    created_at TIMESTAMPTZ
)
LANGUAGE sql
SECURITY DEFINER
SET search_path = public, pg_catalog
AS $$
    -- Appends one row to cerefox_audit_log and returns its id and created_at.
    -- Any author_type other than 'agent' (including NULL) is stored as 'user'.
    INSERT INTO cerefox_audit_log AS al (
        document_id,
        version_id,
        operation,
        author,
        author_type,
        size_before,
        size_after,
        description
    )
    VALUES (
        p_document_id,
        p_version_id,
        p_operation,
        p_author,
        CASE p_author_type WHEN 'agent' THEN 'agent' ELSE 'user' END,
        p_size_before,
        p_size_after,
        p_description
    )
    RETURNING al.id AS audit_id, al.created_at;
$$;
1261
+
1262
+ -- ── cerefox_list_audit_entries ────────────────────────────────────────────────
1263
+ -- Returns audit log entries with optional filters. Joins cerefox_documents to
1264
+ -- include doc_title. Used by the web UI, Edge Function, and MCP tool.
1265
+ --
1266
+ -- Parameters:
1267
+ -- p_document_id : Filter by document (NULL = all)
1268
+ -- p_author : Filter by author (NULL = all)
1269
+ -- p_operation : Filter by operation type (NULL = all)
1270
+ -- p_since : Return entries created at or after this timestamp (NULL = no lower bound)
1271
+ -- p_until : Return entries created at or before this timestamp (NULL = no upper bound)
1272
+ -- p_limit : Max entries to return (default: 50)
1273
+
1274
DROP FUNCTION IF EXISTS cerefox_list_audit_entries(UUID, TEXT, TEXT, TIMESTAMPTZ, TIMESTAMPTZ, INT);
CREATE FUNCTION cerefox_list_audit_entries(
    p_document_id UUID        DEFAULT NULL,
    p_author      TEXT        DEFAULT NULL,
    p_operation   TEXT        DEFAULT NULL,
    p_since       TIMESTAMPTZ DEFAULT NULL,
    p_until       TIMESTAMPTZ DEFAULT NULL,
    p_limit       INT         DEFAULT 50
)
RETURNS TABLE (
    id          UUID,
    document_id UUID,
    doc_title   TEXT,
    version_id  UUID,
    operation   TEXT,
    author      TEXT,
    author_type TEXT,
    size_before INT,
    size_after  INT,
    description TEXT,
    created_at  TIMESTAMPTZ
)
LANGUAGE sql
SECURITY DEFINER
STABLE
SET search_path = public, pg_catalog
AS $$
    -- Newest-first audit entries. Each filter is skipped when its parameter is
    -- NULL. The LEFT JOIN keeps entries whose document row is gone; doc_title
    -- is NULL for those.
    SELECT
        al.id,
        al.document_id,
        doc.title AS doc_title,
        al.version_id,
        al.operation,
        al.author,
        al.author_type,
        al.size_before,
        al.size_after,
        al.description,
        al.created_at
    FROM cerefox_audit_log AS al
    LEFT JOIN cerefox_documents AS doc
        ON doc.id = al.document_id
    WHERE (p_document_id IS NULL OR al.document_id = p_document_id)
      AND (p_author      IS NULL OR al.author      = p_author)
      AND (p_operation   IS NULL OR al.operation   = p_operation)
      AND (p_since       IS NULL OR al.created_at >= p_since)
      AND (p_until       IS NULL OR al.created_at <= p_until)
    ORDER BY al.created_at DESC
    LIMIT p_limit;
$$;
1323
+
1324
+ -- ── Metadata key discovery RPC ────────────────────────────────────────────────
1325
+ -- Derives metadata keys from actual document data (metadata JSONB column).
1326
+ -- No registry table needed; always accurate, zero maintenance.
1327
+ -- Used by CLI, MCP tools, web UI autocomplete.
1328
+
1329
DROP FUNCTION IF EXISTS cerefox_list_metadata_keys();
CREATE FUNCTION cerefox_list_metadata_keys()
RETURNS TABLE (
    key            TEXT,
    doc_count      BIGINT,
    example_values TEXT[]
)
LANGUAGE sql
SECURITY DEFINER
STABLE
SET search_path = public, pg_catalog
AS $$
    -- One row per top-level metadata key: how many documents carry it and up
    -- to five distinct non-NULL values (as text), most widely used keys first.
    -- NOTE(review): soft-deleted documents (deleted_at IS NOT NULL) are
    -- included here, unlike cerefox_metadata_search -- confirm this is intended.
    SELECT
        k.key,
        COUNT(DISTINCT d.id) AS doc_count,
        (ARRAY_AGG(DISTINCT d.metadata ->> k.key) FILTER
            (WHERE d.metadata ->> k.key IS NOT NULL))[1:5] AS example_values
    FROM cerefox_documents d
    -- jsonb_object_keys raises on arrays and scalars, so a single document
    -- with non-object metadata would break the whole RPC. The CASE makes the
    -- call safe whatever order the planner evaluates the predicates in.
    CROSS JOIN LATERAL jsonb_object_keys(
        CASE WHEN jsonb_typeof(d.metadata) = 'object'
             THEN d.metadata
             ELSE '{}'::jsonb
        END
    ) AS k(key)
    WHERE d.metadata IS NOT NULL
      AND jsonb_typeof(d.metadata) = 'object'
      AND d.metadata != '{}'::jsonb
    GROUP BY k.key
    ORDER BY doc_count DESC, k.key;
$$;
1353
+
1354
+ -- ── cerefox_list_projects ────────────────────────────────────────────────────
1355
+ -- Lists all projects. Used by MCP tools for project discovery and by the
1356
+ -- web UI for project name dropdowns.
1357
+
1358
CREATE OR REPLACE FUNCTION cerefox_list_projects()
RETURNS TABLE (
    id          UUID,
    name        TEXT,
    description TEXT
)
LANGUAGE sql
SECURITY DEFINER
STABLE
SET search_path = public, pg_catalog
AS $$
    -- Every project row, alphabetically by name.
    SELECT
        proj.id,
        proj.name,
        proj.description
    FROM cerefox_projects AS proj
    ORDER BY proj.name;
$$;
1373
+
1374
+ -- ── cerefox_metadata_search ──────────────────────────────────────────────────
1375
+ -- Query documents by metadata key-value criteria without a text search term.
1376
+ -- Uses JSONB containment (@>) which leverages the existing GIN index on
1377
+ -- cerefox_documents.metadata.
1378
+ --
1379
+ -- Parameters:
1380
+ -- p_metadata_filter : JSONB containment filter (AND semantics for all keys)
1381
+ -- p_project_id : Optional project UUID filter
1382
+ -- p_updated_since : Only docs updated on or after this timestamp
1383
+ -- p_created_since : Only docs created on or after this timestamp
1384
+ -- p_limit : Max results (default 10)
1385
+ -- p_include_content : When TRUE, reconstruct full text from current chunks
1386
+ -- p_max_bytes : Byte budget for accumulated content (NULL = no limit)
1387
+
1388
CREATE OR REPLACE FUNCTION cerefox_metadata_search(
    p_metadata_filter JSONB,
    p_project_id      UUID        DEFAULT NULL,
    p_updated_since   TIMESTAMPTZ DEFAULT NULL,
    p_created_since   TIMESTAMPTZ DEFAULT NULL,
    p_limit           INT         DEFAULT 10,
    p_include_content BOOLEAN     DEFAULT FALSE,
    p_max_bytes       INT         DEFAULT NULL
)
RETURNS TABLE (
    document_id   UUID,
    title         TEXT,
    doc_metadata  JSONB,
    review_status TEXT,
    source        TEXT,
    created_at    TIMESTAMPTZ,
    updated_at    TIMESTAMPTZ,
    total_chars   INT,
    chunk_count   INT,
    project_ids   UUID[],
    project_names TEXT[],
    version_count INT,
    content       TEXT
)
LANGUAGE plpgsql
SECURITY DEFINER
SET search_path = public, pg_catalog
AS $$
-- Returns non-deleted documents whose metadata contains p_metadata_filter,
-- most recently updated first, at most p_limit rows. With p_include_content
-- the text is rebuilt from the current chunks, and emission stops at the first
-- document that would push the running content size past p_max_bytes.
DECLARE
    rec      RECORD;
    v_spent  INT := 0;   -- content bytes emitted so far
    v_doclen INT;        -- content bytes of the row being considered
BEGIN
    FOR rec IN
        SELECT
            d.id            AS document_id,
            d.title,
            d.metadata      AS doc_metadata,
            d.review_status,
            d.source,
            d.created_at,
            d.updated_at,
            d.total_chars,
            d.chunk_count,
            ARRAY(SELECT dp.project_id FROM cerefox_document_projects dp
                  WHERE dp.document_id = d.id) AS project_ids,
            ARRAY(SELECT p.name FROM cerefox_projects p
                  JOIN cerefox_document_projects dp ON p.id = dp.project_id
                  WHERE dp.document_id = d.id) AS project_names,
            (SELECT COUNT(*)::INT FROM cerefox_document_versions dv
             WHERE dv.document_id = d.id) AS version_count,
            CASE WHEN p_include_content THEN
                (SELECT STRING_AGG(c.content, E'\n\n' ORDER BY c.chunk_index)
                 FROM cerefox_chunks c
                 WHERE c.document_id = d.id AND c.version_id IS NULL)
            ELSE NULL END AS content
        FROM cerefox_documents d
        WHERE d.metadata @> p_metadata_filter
          AND d.deleted_at IS NULL
          AND (p_project_id IS NULL OR EXISTS (
              SELECT 1 FROM cerefox_document_projects dp
              WHERE dp.document_id = d.id AND dp.project_id = p_project_id
          ))
          AND (p_updated_since IS NULL OR d.updated_at >= p_updated_since)
          AND (p_created_since IS NULL OR d.created_at >= p_created_since)
        ORDER BY d.updated_at DESC
        LIMIT p_limit
    LOOP
        -- The byte budget applies only when a cap is set and this row carries content.
        IF p_max_bytes IS NOT NULL AND p_include_content AND rec.content IS NOT NULL THEN
            v_doclen := octet_length(rec.content);
            EXIT WHEN v_spent + v_doclen > p_max_bytes;  -- stop emitting rows
            v_spent := v_spent + v_doclen;
        END IF;

        -- Copy the row into the OUT columns and emit it.
        document_id   := rec.document_id;
        title         := rec.title;
        doc_metadata  := rec.doc_metadata;
        review_status := rec.review_status;
        source        := rec.source;
        created_at    := rec.created_at;
        updated_at    := rec.updated_at;
        total_chars   := rec.total_chars;
        chunk_count   := rec.chunk_count;
        project_ids   := rec.project_ids;
        project_names := rec.project_names;
        version_count := rec.version_count;
        content       := rec.content;
        RETURN NEXT;
    END LOOP;
END;
$$;
1482
+
1483
+ -- ── cerefox_get_config / cerefox_set_config ──────────────────────────────────
1484
+ -- Read/write key-value config from cerefox_config table.
1485
+
1486
CREATE OR REPLACE FUNCTION cerefox_get_config(p_key TEXT)
RETURNS TEXT
LANGUAGE sql
SECURITY DEFINER
STABLE
SET search_path = public, pg_catalog
AS $$
    -- Value stored for p_key in cerefox_config; NULL when the key has no row.
    SELECT cfg.value
    FROM cerefox_config AS cfg
    WHERE cfg.key = p_key;
$$;
1495
+
1496
CREATE OR REPLACE FUNCTION cerefox_set_config(p_key TEXT, p_value TEXT)
RETURNS VOID
LANGUAGE plpgsql
SECURITY DEFINER
SET search_path = public, pg_catalog
AS $$
-- Upserts p_key = p_value into cerefox_config. Only keys on the allow-list
-- below are accepted; anything else, including NULL, raises an exception.
DECLARE
    v_allowed TEXT[] := ARRAY['usage_tracking_enabled', 'require_requestor_identity', 'requestor_identity_format'];
BEGIN
    -- "NULL = ANY(...)" evaluates to NULL, and IF treats NULL as false, so a
    -- NULL key must be rejected explicitly or it reaches the INSERT below.
    IF p_key IS NULL OR NOT (p_key = ANY(v_allowed)) THEN
        RAISE EXCEPTION 'Unknown config key: %. Allowed keys: %', p_key, v_allowed;
    END IF;

    INSERT INTO cerefox_config (key, value)
    VALUES (p_key, p_value)
    ON CONFLICT (key) DO UPDATE SET value = EXCLUDED.value;
END;
$$;
1514
+
1515
+ -- ── cerefox_log_usage ────────────────────────────────────────────────────────
1516
+ -- Insert a usage log entry. Checks config first; no-op if tracking is disabled.
1517
+
1518
CREATE OR REPLACE FUNCTION cerefox_log_usage(
    p_operation    TEXT,
    p_access_path  TEXT,
    p_requestor    TEXT  DEFAULT NULL,
    p_document_id  UUID  DEFAULT NULL,
    p_project_id   UUID  DEFAULT NULL,
    p_query_text   TEXT  DEFAULT NULL,
    p_result_count INT   DEFAULT NULL,
    p_extra        JSONB DEFAULT '{}'::JSONB
)
RETURNS VOID
LANGUAGE plpgsql
SECURITY DEFINER
SET search_path = public, pg_catalog
AS $$
-- Writes one cerefox_usage_log row, but only when the config key
-- 'usage_tracking_enabled' is exactly 'true'; a missing key or any other
-- value makes the call a no-op.
DECLARE
    v_flag TEXT;
BEGIN
    SELECT cfg.value
      INTO v_flag
      FROM cerefox_config AS cfg
     WHERE cfg.key = 'usage_tracking_enabled';

    -- A NULL flag (no row, or NULL value) makes the comparison NULL, which IF
    -- treats as false, so nothing is logged.
    IF v_flag = 'true' THEN
        INSERT INTO cerefox_usage_log (
            operation,
            access_path,
            requestor,
            document_id,
            project_id,
            query_text,
            result_count,
            extra
        ) VALUES (
            p_operation,
            p_access_path,
            p_requestor,
            p_document_id,
            p_project_id,
            p_query_text,
            p_result_count,
            p_extra
        );
    END IF;
END;
$$;
1550
+
1551
-- ── cerefox_list_usage_log ───────────────────────────────────────────────────
-- List usage-log rows, newest first, with optional filters.
--   p_start / p_end   inclusive bounds on logged_at; NULL leaves that side open
--   p_operation, p_access_path, p_requestor, p_project_id
--                     exact-match filters; NULL means "do not filter"
--   p_limit           maximum number of rows (default 100). PostgreSQL treats
--                     LIMIT NULL as "no limit" and rejects negative values.
--   returns           the log columns plus doc_title, which is NULL when the
--                     row has no document_id or no matching cerefox_documents row
CREATE OR REPLACE FUNCTION cerefox_list_usage_log(
    p_start       TIMESTAMPTZ DEFAULT NULL,
    p_end         TIMESTAMPTZ DEFAULT NULL,
    p_operation   TEXT        DEFAULT NULL,
    p_access_path TEXT        DEFAULT NULL,
    p_requestor   TEXT        DEFAULT NULL,
    p_project_id  UUID        DEFAULT NULL,
    p_limit       INT         DEFAULT 100
)
RETURNS TABLE (
    id           UUID,
    logged_at    TIMESTAMPTZ,
    operation    TEXT,
    access_path  TEXT,
    requestor    TEXT,
    document_id  UUID,
    doc_title    TEXT,
    project_id   UUID,
    query_text   TEXT,
    result_count INT,
    extra        JSONB
)
LANGUAGE sql
SECURITY DEFINER
STABLE
SET search_path = public, pg_catalog
AS $$
    SELECT
        ul.id,
        ul.logged_at,
        ul.operation,
        ul.access_path,
        ul.requestor,
        ul.document_id,
        d.title AS doc_title,
        ul.project_id,
        ul.query_text,
        ul.result_count,
        ul.extra
    FROM cerefox_usage_log AS ul
    LEFT JOIN cerefox_documents AS d
        ON ul.document_id = d.id
    WHERE (p_start IS NULL OR ul.logged_at >= p_start)
      AND (p_end IS NULL OR ul.logged_at <= p_end)
      AND (p_operation IS NULL OR ul.operation = p_operation)
      AND (p_access_path IS NULL OR ul.access_path = p_access_path)
      AND (p_requestor IS NULL OR ul.requestor = p_requestor)
      AND (p_project_id IS NULL OR ul.project_id = p_project_id)
    -- ul.id breaks ties between rows sharing a logged_at value, so the rows
    -- that survive the LIMIT (and their order) are the same on every call.
    ORDER BY ul.logged_at DESC, ul.id DESC
    LIMIT p_limit;
$$;
1604
+
1605
-- ── cerefox_usage_summary ────────────────────────────────────────────────────
-- Aggregate the usage log into a single JSON object for the analytics page.
--   p_start / p_end   inclusive bounds on logged_at; NULL leaves that side open
--   p_project_id      exact-match filter; NULL means "do not filter"
--   p_access_path     exact-match filter; NULL means "do not filter"
--   returns           JSON object with these keys:
--     total_count         number of log rows matching the filters
--     ops_by_day          [{day, count}]         ascending by day
--     ops_by_operation    [{operation, count}]   descending by count
--     ops_by_access_path  [{access_path, count}] descending by count
--     top_documents       [{document_id, doc_title, count}] top 10 by count
--     top_requestors      [{requestor, count}]   top 10 by count
--   Every array is '[]' (never NULL) when there is nothing to report.
CREATE OR REPLACE FUNCTION cerefox_usage_summary(
    p_start       TIMESTAMPTZ DEFAULT NULL,
    p_end         TIMESTAMPTZ DEFAULT NULL,
    p_project_id  UUID        DEFAULT NULL,
    p_access_path TEXT        DEFAULT NULL
)
RETURNS JSON
LANGUAGE plpgsql
SECURITY DEFINER
STABLE
SET search_path = public, pg_catalog
AS $$
DECLARE
    v_result JSON;
BEGIN
    WITH filtered AS (
        -- Log rows inside the requested window; only the columns the
        -- aggregates below actually read.
        SELECT
            ul.logged_at,
            ul.operation,
            ul.access_path,
            ul.requestor,
            ul.document_id
        FROM cerefox_usage_log AS ul
        WHERE (p_start IS NULL OR ul.logged_at >= p_start)
          AND (p_end IS NULL OR ul.logged_at <= p_end)
          AND (p_project_id IS NULL OR ul.project_id = p_project_id)
          AND (p_access_path IS NULL OR ul.access_path = p_access_path)
    ),
    ops_by_day AS (
        SELECT DATE(f.logged_at) AS log_day, COUNT(*) AS event_count
        FROM filtered AS f
        GROUP BY DATE(f.logged_at)
    ),
    ops_by_operation AS (
        SELECT f.operation, COUNT(*) AS event_count
        FROM filtered AS f
        GROUP BY f.operation
    ),
    ops_by_access_path AS (
        SELECT f.access_path, COUNT(*) AS event_count
        FROM filtered AS f
        GROUP BY f.access_path
    ),
    top_documents AS (
        -- The inner join drops rows with a NULL document_id as well as rows
        -- whose document has no match in cerefox_documents.
        SELECT f.document_id, d.title AS doc_title, COUNT(*) AS event_count
        FROM filtered AS f
        INNER JOIN cerefox_documents AS d
            ON f.document_id = d.id
        GROUP BY f.document_id, d.title
        -- document_id makes the top-10 cut deterministic when counts tie
        ORDER BY event_count DESC, f.document_id
        LIMIT 10
    ),
    top_requestors AS (
        SELECT f.requestor, COUNT(*) AS event_count
        FROM filtered AS f
        WHERE f.requestor IS NOT NULL
        GROUP BY f.requestor
        -- requestor makes the top-10 cut deterministic when counts tie
        ORDER BY event_count DESC, f.requestor
        LIMIT 10
    )
    -- Ordering is stated inside each json_agg call: the order of rows coming
    -- out of a CTE is not guaranteed to carry over into an aggregate.
    SELECT json_build_object(
        'total_count',
            (SELECT COUNT(*) FROM filtered),
        'ops_by_day',
            COALESCE(
                (SELECT json_agg(
                            json_build_object('day', log_day, 'count', event_count)
                            ORDER BY log_day)
                 FROM ops_by_day),
                '[]'::JSON),
        'ops_by_operation',
            COALESCE(
                (SELECT json_agg(
                            json_build_object('operation', operation, 'count', event_count)
                            ORDER BY event_count DESC, operation)
                 FROM ops_by_operation),
                '[]'::JSON),
        'ops_by_access_path',
            COALESCE(
                (SELECT json_agg(
                            json_build_object('access_path', access_path, 'count', event_count)
                            ORDER BY event_count DESC, access_path)
                 FROM ops_by_access_path),
                '[]'::JSON),
        'top_documents',
            COALESCE(
                (SELECT json_agg(
                            json_build_object('document_id', document_id, 'doc_title', doc_title, 'count', event_count)
                            ORDER BY event_count DESC, document_id)
                 FROM top_documents),
                '[]'::JSON),
        'top_requestors',
            COALESCE(
                (SELECT json_agg(
                            json_build_object('requestor', requestor, 'count', event_count)
                            ORDER BY event_count DESC, requestor)
                 FROM top_requestors),
                '[]'::JSON)
    ) INTO v_result;

    RETURN v_result;
END;
$$;
1678
+
1679
-- ─────────────────────────────────────────────────────────────────────────
-- Schema version reporter
-- ─────────────────────────────────────────────────────────────────────────
-- Reports the schema version deployed in this database. The literal below
-- has to stay equal to the `@version` marker at the top of schema.sql; bump
-- both together whenever schema.sql or rpcs.sql changes in a way that needs
-- a redeploy. The web UI's /api/v1/schema-version endpoint compares the
-- bundled value with the deployed one and shows a 'redeploy needed' banner
-- when they differ.
--   returns  the version string
CREATE OR REPLACE FUNCTION cerefox_schema_version()
RETURNS TEXT
LANGUAGE sql
SECURITY DEFINER
STABLE
SET search_path = public, pg_catalog
AS $$
    SELECT CAST('0.3.1' AS TEXT);
$$;
1697
+
1698
+
1699
+
1700
-- ─────────────────────────────────────────────────────────────────────────
-- Function-existence probe (introspection helper)
-- ─────────────────────────────────────────────────────────────────────────
-- Reports whether any routine named p_name exists in the public schema,
-- whatever its signature. Used by `db_status.ts` and `cerefox doctor` (v0.5)
-- to verify schema health without knowing the parameter list of every RPC;
-- cheaper and more reliable than calling each RPC and parsing the error.
--   p_name   routine name to look for (exact match on pg_proc.proname)
--   returns  TRUE when at least one pg_proc entry in schema public has that
--            name, FALSE otherwise (including when p_name is NULL)
CREATE OR REPLACE FUNCTION cerefox_pg_function_exists(p_name TEXT)
RETURNS BOOLEAN
LANGUAGE sql
STABLE
SECURITY DEFINER
SET search_path = public, pg_catalog
AS $$
    -- The catalogs are schema-qualified on purpose: `public` comes first in
    -- this function's search_path, so an unqualified pg_proc / pg_namespace
    -- could be shadowed by a same-named relation created in `public`.
    SELECT EXISTS (
        SELECT 1
        FROM pg_catalog.pg_proc AS p
        INNER JOIN pg_catalog.pg_namespace AS n
            ON p.pronamespace = n.oid
        WHERE n.nspname = 'public'
          AND p.proname = p_name
    );
$$;