@cerefox/memory 0.7.1 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +62 -25
- package/dist/bin/cerefox.js +1163 -344
- package/dist/frontend/assets/{index-HNlMcvli.js → index-CAp2_lFX.js} +2 -2
- package/dist/frontend/assets/index-CAp2_lFX.js.map +1 -0
- package/dist/frontend/index.html +1 -1
- package/dist/server-assets/_shared/ef-meta/index.ts +97 -0
- package/dist/server-assets/_shared/embeddings/index.ts +175 -0
- package/dist/server-assets/_shared/mcp-tools/_chunker.ts +187 -0
- package/dist/server-assets/_shared/mcp-tools/_projects.ts +121 -0
- package/dist/server-assets/_shared/mcp-tools/_utils.ts +73 -0
- package/dist/server-assets/_shared/mcp-tools/audit-log.ts +95 -0
- package/dist/server-assets/_shared/mcp-tools/get-document.ts +73 -0
- package/dist/server-assets/_shared/mcp-tools/get-help-content.ts +26 -0
- package/dist/server-assets/_shared/mcp-tools/get-help.ts +90 -0
- package/dist/server-assets/_shared/mcp-tools/index.ts +67 -0
- package/dist/server-assets/_shared/mcp-tools/ingest.ts +315 -0
- package/dist/server-assets/_shared/mcp-tools/list-metadata-keys.ts +55 -0
- package/dist/server-assets/_shared/mcp-tools/list-projects.ts +59 -0
- package/dist/server-assets/_shared/mcp-tools/list-versions.ts +72 -0
- package/dist/server-assets/_shared/mcp-tools/metadata-search.ts +154 -0
- package/dist/server-assets/_shared/mcp-tools/search.ts +193 -0
- package/dist/server-assets/_shared/mcp-tools/set-document-projects.ts +163 -0
- package/dist/server-assets/_shared/mcp-tools/types.ts +92 -0
- package/dist/server-assets/db/migrations/0003_add_document_versions.sql +91 -0
- package/dist/server-assets/db/migrations/0004_add_audit_log_review_status_archived.sql +71 -0
- package/dist/server-assets/db/migrations/0005_metadata_search.sql +628 -0
- package/dist/server-assets/db/migrations/0006_usage_log.sql +255 -0
- package/dist/server-assets/db/migrations/0007_usage_log_requestor.sql +178 -0
- package/dist/server-assets/db/migrations/0008_soft_delete.sql +130 -0
- package/dist/server-assets/db/migrations/0009_audit_log_restore_operation.sql +20 -0
- package/dist/server-assets/db/migrations/0010_requestor_enforcement_config.sql +12 -0
- package/dist/server-assets/db/migrations/0011_title_boosting.sql +48 -0
- package/dist/server-assets/db/rpcs.sql +1723 -0
- package/dist/server-assets/db/schema.sql +380 -0
- package/dist/server-assets/supabase/functions/cerefox-get-audit-log/index.ts +117 -0
- package/dist/server-assets/supabase/functions/cerefox-get-document/index.ts +138 -0
- package/dist/server-assets/supabase/functions/cerefox-ingest/index.ts +819 -0
- package/dist/server-assets/supabase/functions/cerefox-list-projects/index.ts +96 -0
- package/dist/server-assets/supabase/functions/cerefox-list-versions/index.ts +113 -0
- package/dist/server-assets/supabase/functions/cerefox-mcp/index.ts +294 -0
- package/dist/server-assets/supabase/functions/cerefox-mcp/shared.ts +42 -0
- package/dist/server-assets/supabase/functions/cerefox-metadata/index.ts +99 -0
- package/dist/server-assets/supabase/functions/cerefox-metadata-search/index.ts +146 -0
- package/dist/server-assets/supabase/functions/cerefox-search/index.ts +382 -0
- package/docs/guides/connect-agents.md +58 -3
- package/docs/guides/migration-v0.5.md +50 -0
- package/package.json +3 -2
- package/dist/frontend/assets/index-HNlMcvli.js.map +0 -1
|
@@ -0,0 +1,1723 @@
|
|
|
1
|
+
-- Cerefox Search & Retrieval RPCs
|
|
2
|
+
-- These functions are exposed as MCP tools via Supabase.
|
|
3
|
+
-- Run via: python scripts/db_deploy.py (after schema.sql)
|
|
4
|
+
--
|
|
5
|
+
-- All RPCs are SECURITY DEFINER so they can be called safely via the
|
|
6
|
+
-- Supabase anon/service key without exposing the underlying tables directly.
|
|
7
|
+
|
|
8
|
+
-- ── Return-type change drops ──────────────────────────────────────────────────
|
|
9
|
+
-- When CREATE OR REPLACE cannot be used because the return type changes,
|
|
10
|
+
-- we drop the old function first. These drops are safe to re-run.
|
|
11
|
+
|
|
12
|
+
-- Drop old 4-param overload (pre p_min_score) and current 5-param semantic search
|
|
13
|
+
DROP FUNCTION IF EXISTS cerefox_semantic_search(VECTOR(768), INT, BOOLEAN, UUID);
DROP FUNCTION IF EXISTS cerefox_semantic_search(VECTOR(768), INT, BOOLEAN, UUID, FLOAT);

-- hybrid_search: the 6-param form (pre p_min_score, pre M2M join, used the
-- d.project_id column) and the 7-param form (returned a singular
-- doc_project_id UUID, pre-M2M).
DROP FUNCTION IF EXISTS cerefox_hybrid_search(TEXT, VECTOR(768), INT, FLOAT, BOOLEAN, UUID);
DROP FUNCTION IF EXISTS cerefox_hybrid_search(TEXT, VECTOR(768), INT, FLOAT, BOOLEAN, UUID, FLOAT);

-- search_docs: the 5-param form (pre p_min_score), the 6-param form (returned
-- a singular doc_project_id UUID or lacked doc_updated_at) and the 8-param
-- form (pre is_partial).
DROP FUNCTION IF EXISTS cerefox_search_docs(TEXT, VECTOR(768), INT, FLOAT, UUID);
DROP FUNCTION IF EXISTS cerefox_search_docs(TEXT, VECTOR(768), INT, FLOAT, UUID, FLOAT);
DROP FUNCTION IF EXISTS cerefox_search_docs(TEXT, VECTOR(768), INT, FLOAT, UUID, FLOAT, INT, INT);

-- fts_search and reconstruct_doc: original signatures.
DROP FUNCTION IF EXISTS cerefox_fts_search(TEXT, INT, UUID);
DROP FUNCTION IF EXISTS cerefox_reconstruct_doc(UUID);

-- Iteration 12B added version_count to every chunk-level and document-level
-- result shape (so agents and the web UI know when previous versions are
-- available), and iteration 13 added p_metadata_filter JSONB DEFAULT NULL
-- (backwards-compatible: existing callers are unaffected). Both iterations
-- needed exactly the signatures already dropped above; DROP ... IF EXISTS is
-- idempotent, so each signature is dropped once rather than once per iteration.

-- Iteration 16B: drop the pre-project_names signatures so doc_project_names
-- TEXT[] can be added to all RETURNS TABLE shapes. reconstruct_doc(UUID) is
-- covered by the drop above; get_document is dropped for the same reason.
DROP FUNCTION IF EXISTS cerefox_hybrid_search(TEXT, VECTOR(768), INT, FLOAT, BOOLEAN, UUID, FLOAT, JSONB);
DROP FUNCTION IF EXISTS cerefox_fts_search(TEXT, INT, UUID, JSONB);
DROP FUNCTION IF EXISTS cerefox_semantic_search(VECTOR(768), INT, BOOLEAN, UUID, FLOAT, JSONB);
DROP FUNCTION IF EXISTS cerefox_search_docs(TEXT, VECTOR(768), INT, FLOAT, UUID, FLOAT, INT, INT, JSONB);
DROP FUNCTION IF EXISTS cerefox_get_document(UUID, UUID);
|
|
58
|
+
|
|
59
|
+
-- ── Shared return type note ────────────────────────────────────────────────────
|
|
60
|
+
-- All chunk-level search RPCs return the same shape for consistency:
|
|
61
|
+
-- chunk_id, document_id, chunk_index, title, content, heading_path,
|
|
62
|
+
-- heading_level, score, doc_title, doc_source, doc_project_ids,
|
|
63
|
+
-- doc_project_names, doc_metadata, version_count
|
|
64
|
+
-- Note: doc_project_ids is UUID[] (array) — a document can belong to many projects.
|
|
65
|
+
-- Note: doc_project_names is TEXT[] (array) — human-readable project names.
|
|
66
|
+
-- Note: version_count is INT — number of archived versions for the parent document.
|
|
67
|
+
-- Agents and the web UI use this to know when previous versions are available
|
|
68
|
+
-- for retrieval. 0 means the current content has never been overwritten.
|
|
69
|
+
|
|
70
|
+
-- ── Hybrid Search ─────────────────────────────────────────────────────────────
|
|
71
|
+
-- Combines full-text search (FTS) and vector similarity with a configurable
|
|
72
|
+
-- alpha weight. alpha=1.0 means pure semantic; alpha=0.0 means pure FTS.
|
|
73
|
+
--
|
|
74
|
+
-- V1 approach: run both searches (top N*5 candidates each), FULL OUTER JOIN on
|
|
75
|
+
-- chunk ID, then combine scores with weighted average. Simple and fast for
|
|
76
|
+
-- typical knowledge base sizes.
|
|
77
|
+
|
|
78
|
+
-- cerefox_hybrid_search: chunk-level search blending FTS rank and vector
-- similarity.
--
-- Parameters:
--   p_query_text      : Query string, parsed with plainto_tsquery('english', ...)
--   p_query_embedding : 768-dim query embedding
--   p_match_count     : Max rows returned; each side gathers p_match_count * 5
--                       candidates before the scores are combined
--   p_alpha           : Weight of the vector score; (1 - p_alpha) weights FTS
--   p_use_upgrade     : When TRUE, use embedding_upgrade where a chunk has one,
--                       falling back to embedding_primary otherwise
--   p_project_id      : Optional filter via cerefox_document_projects
--   p_min_score       : Minimum vector score for chunks WITHOUT an FTS match
--   p_metadata_filter : Optional filter; document metadata must contain it (@>)
--
-- Only current chunks (version_id IS NULL) of non-deleted documents are searched.
-- The function only reads, so it is declared STABLE: this matches the other
-- read-only RPCs in this file and keeps the STABLE cerefox_search_docs from
-- calling a VOLATILE function.
CREATE OR REPLACE FUNCTION cerefox_hybrid_search(
    p_query_text        TEXT,
    p_query_embedding   VECTOR(768),
    p_match_count       INT     DEFAULT 10,
    p_alpha             FLOAT   DEFAULT 0.7,
    p_use_upgrade       BOOLEAN DEFAULT FALSE,
    p_project_id        UUID    DEFAULT NULL,
    p_min_score         FLOAT   DEFAULT 0.0,
    p_metadata_filter   JSONB   DEFAULT NULL
)
RETURNS TABLE (
    chunk_id            UUID,
    document_id         UUID,
    chunk_index         INT,
    title               TEXT,
    content             TEXT,
    heading_path        TEXT[],
    heading_level       INT,
    score               FLOAT,
    doc_title           TEXT,
    doc_source          TEXT,
    doc_project_ids     UUID[],
    doc_project_names   TEXT[],
    doc_metadata        JSONB,
    version_count       INT
)
LANGUAGE plpgsql
SECURITY DEFINER
STABLE
SET search_path = public, pg_catalog
AS $$
DECLARE
    -- plainto_tsquery: ANDs all terms, treats every token as a literal word.
    -- We deliberately avoid websearch_to_tsquery here because it interprets `-` as
    -- a negation operator, which traps natural queries against dashed titles
    -- (e.g. `Job Hunting - Opportunity Index`). Agent queries don't use the
    -- websearch operators (phrase, OR, NOT); semantic ranking is the soft-match
    -- layer for "broadly related". If operator support is ever needed, gate it
    -- behind an opt-in flag rather than changing the default.
    query_fts       tsquery := plainto_tsquery('english', p_query_text);
    -- Per-side candidate pool: 5x the requested rows, so the merged set still
    -- has enough rows after the final threshold and LIMIT.
    candidate_count INT     := p_match_count * 5;
BEGIN
    RETURN QUERY
    WITH
    -- Keyword side: chunks that match the tsquery, ranked by ts_rank_cd.
    fts_results AS (
        SELECT
            c.id,
            ts_rank_cd(c.fts, query_fts)::FLOAT AS fts_score
        FROM cerefox_chunks c
        JOIN cerefox_documents d ON c.document_id = d.id
        WHERE c.version_id IS NULL
          AND d.deleted_at IS NULL
          AND c.fts @@ query_fts
          AND (p_project_id IS NULL OR EXISTS (
              SELECT 1 FROM cerefox_document_projects dp
              WHERE dp.document_id = d.id AND dp.project_id = p_project_id
          ))
          AND (p_metadata_filter IS NULL OR d.metadata @> p_metadata_filter)
        -- c.id breaks rank ties so the LIMIT cut-off is deterministic.
        ORDER BY fts_score DESC, c.id
        LIMIT candidate_count
    ),
    -- Vector side: nearest chunks by cosine distance; score = 1 - distance.
    vec_results AS (
        SELECT
            c.id,
            CASE
                WHEN p_use_upgrade AND c.embedding_upgrade IS NOT NULL
                    THEN (1.0 - (c.embedding_upgrade <=> p_query_embedding))::FLOAT
                ELSE
                    (1.0 - (c.embedding_primary <=> p_query_embedding))::FLOAT
            END AS vec_score
        FROM cerefox_chunks c
        JOIN cerefox_documents d ON c.document_id = d.id
        WHERE c.version_id IS NULL
          AND d.deleted_at IS NULL
          AND (p_project_id IS NULL OR EXISTS (
              SELECT 1 FROM cerefox_document_projects dp
              WHERE dp.document_id = d.id AND dp.project_id = p_project_id
          ))
          AND (p_metadata_filter IS NULL OR d.metadata @> p_metadata_filter)
        ORDER BY
            CASE
                WHEN p_use_upgrade AND c.embedding_upgrade IS NOT NULL
                    THEN c.embedding_upgrade <=> p_query_embedding
                ELSE c.embedding_primary <=> p_query_embedding
            END
        LIMIT candidate_count
    ),
    -- Merge both sides on chunk id; a side that did not return the chunk
    -- contributes a score of 0.
    combined AS (
        SELECT
            COALESCE(f.id, v.id) AS id,
            (  p_alpha          * COALESCE(v.vec_score, 0.0) +
              (1.0 - p_alpha)   * COALESCE(f.fts_score, 0.0)
            ) AS score,
            COALESCE(v.vec_score, 0.0) AS vec_score,
            -- TRUE when the chunk matched the @@ FTS operator.
            -- We use this flag rather than vec_score to decide whether a chunk
            -- passes the threshold, because in small corpora every chunk appears
            -- in vec_results (LIMIT candidate_count covers all rows), so
            -- vec_score is never NULL even for FTS-only matches.
            f.id IS NOT NULL AS has_fts_match
        FROM fts_results f
        FULL OUTER JOIN vec_results v ON f.id = v.id
    )
    SELECT
        c.id            AS chunk_id,
        c.document_id,
        c.chunk_index,
        c.title,
        c.content,
        c.heading_path,
        c.heading_level,
        cm.score,
        d.title         AS doc_title,
        d.source        AS doc_source,
        ARRAY(SELECT dp.project_id FROM cerefox_document_projects dp
              WHERE dp.document_id = d.id) AS doc_project_ids,
        ARRAY(SELECT p.name FROM cerefox_projects p
              JOIN cerefox_document_projects dp ON p.id = dp.project_id
              WHERE dp.document_id = d.id) AS doc_project_names,
        d.metadata      AS doc_metadata,
        (SELECT COUNT(*)::INT FROM cerefox_document_versions dv
         WHERE dv.document_id = d.id) AS version_count
    FROM combined cm
    JOIN cerefox_chunks c ON c.id = cm.id
    JOIN cerefox_documents d ON c.document_id = d.id
    -- FTS matches pass through unconditionally: the @@ operator is a hard gate
    -- and guarantees the query terms appear in the chunk.
    -- Vector-only results (no FTS match) are filtered by the cosine threshold.
    WHERE cm.has_fts_match OR cm.vec_score >= p_min_score
    -- c.id makes the order (and the LIMIT boundary) stable when scores tie.
    ORDER BY cm.score DESC, c.id
    LIMIT p_match_count;
END;
$$;
|
|
210
|
+
|
|
211
|
+
-- ── FTS-Only Search ───────────────────────────────────────────────────────────
|
|
212
|
+
-- Pure keyword / exact-match search. Best for names, dates, tags.
|
|
213
|
+
|
|
214
|
+
-- cerefox_fts_search: chunk-level keyword search ranked by ts_rank_cd.
--
-- Parameters:
--   p_query_text      : Query string, parsed with plainto_tsquery('english', ...)
--   p_match_count     : Max rows returned (default: 10)
--   p_project_id      : Optional filter via cerefox_document_projects
--   p_metadata_filter : Optional filter; document metadata must contain it (@>)
--
-- Only current chunks (version_id IS NULL) of non-deleted documents are searched.
-- The function only reads, so it is declared STABLE like the other read-only
-- RPCs in this file.
CREATE OR REPLACE FUNCTION cerefox_fts_search(
    p_query_text        TEXT,
    p_match_count       INT   DEFAULT 10,
    p_project_id        UUID  DEFAULT NULL,
    p_metadata_filter   JSONB DEFAULT NULL
)
RETURNS TABLE (
    chunk_id            UUID,
    document_id         UUID,
    chunk_index         INT,
    title               TEXT,
    content             TEXT,
    heading_path        TEXT[],
    heading_level       INT,
    score               FLOAT,
    doc_title           TEXT,
    doc_source          TEXT,
    doc_project_ids     UUID[],
    doc_project_names   TEXT[],
    doc_metadata        JSONB,
    version_count       INT
)
LANGUAGE plpgsql
SECURITY DEFINER
STABLE
SET search_path = public, pg_catalog
AS $$
DECLARE
    -- plainto_tsquery ANDs all terms and treats every token as a literal word.
    -- websearch_to_tsquery is deliberately avoided because it interprets `-`
    -- as a negation operator, which breaks natural queries containing dashes.
    query_fts tsquery := plainto_tsquery('english', p_query_text);
BEGIN
    RETURN QUERY
    SELECT
        c.id            AS chunk_id,
        c.document_id,
        c.chunk_index,
        c.title,
        c.content,
        c.heading_path,
        c.heading_level,
        ts_rank_cd(c.fts, query_fts)::FLOAT AS score,
        d.title         AS doc_title,
        d.source        AS doc_source,
        ARRAY(SELECT dp.project_id FROM cerefox_document_projects dp
              WHERE dp.document_id = d.id) AS doc_project_ids,
        ARRAY(SELECT p.name FROM cerefox_projects p
              JOIN cerefox_document_projects dp ON p.id = dp.project_id
              WHERE dp.document_id = d.id) AS doc_project_names,
        d.metadata      AS doc_metadata,
        (SELECT COUNT(*)::INT FROM cerefox_document_versions dv
         WHERE dv.document_id = d.id) AS version_count
    FROM cerefox_chunks c
    JOIN cerefox_documents d ON c.document_id = d.id
    WHERE c.version_id IS NULL
      AND d.deleted_at IS NULL
      AND c.fts @@ query_fts
      AND (p_project_id IS NULL OR EXISTS (
          SELECT 1 FROM cerefox_document_projects dp
          WHERE dp.document_id = d.id AND dp.project_id = p_project_id
      ))
      AND (p_metadata_filter IS NULL OR d.metadata @> p_metadata_filter)
    -- Rank ties are common with ts_rank_cd; c.id keeps the order and the
    -- LIMIT cut-off deterministic.
    ORDER BY score DESC, c.id
    LIMIT p_match_count;
END;
$$;
|
|
278
|
+
|
|
279
|
+
-- ── Semantic-Only Search ──────────────────────────────────────────────────────
|
|
280
|
+
-- Pure vector similarity. Best for conceptual / paraphrase queries.
|
|
281
|
+
|
|
282
|
+
-- cerefox_semantic_search: chunk-level vector search; score = 1 - cosine
-- distance between the chunk embedding and p_query_embedding.
--
-- Parameters:
--   p_query_embedding : 768-dim query embedding
--   p_match_count     : Max rows returned (default: 10)
--   p_use_upgrade     : When TRUE, search embedding_upgrade and consider ONLY
--                       chunks that have one; when FALSE, use embedding_primary
--   p_project_id      : Optional filter via cerefox_document_projects
--   p_min_score       : Minimum score a chunk must reach to be returned
--   p_metadata_filter : Optional filter; document metadata must contain it (@>)
--
-- Only current chunks (version_id IS NULL) of non-deleted documents are searched.
-- The function only reads, so it is declared STABLE like the other read-only
-- RPCs in this file.
CREATE OR REPLACE FUNCTION cerefox_semantic_search(
    p_query_embedding   VECTOR(768),
    p_match_count       INT     DEFAULT 10,
    p_use_upgrade       BOOLEAN DEFAULT FALSE,
    p_project_id        UUID    DEFAULT NULL,
    p_min_score         FLOAT   DEFAULT 0.0,
    p_metadata_filter   JSONB   DEFAULT NULL
)
RETURNS TABLE (
    chunk_id            UUID,
    document_id         UUID,
    chunk_index         INT,
    title               TEXT,
    content             TEXT,
    heading_path        TEXT[],
    heading_level       INT,
    score               FLOAT,
    doc_title           TEXT,
    doc_source          TEXT,
    doc_project_ids     UUID[],
    doc_project_names   TEXT[],
    doc_metadata        JSONB,
    version_count       INT
)
LANGUAGE plpgsql
SECURITY DEFINER
STABLE
SET search_path = public, pg_catalog
AS $$
BEGIN
    RETURN QUERY
    SELECT
        c.id            AS chunk_id,
        c.document_id,
        c.chunk_index,
        c.title,
        c.content,
        c.heading_path,
        c.heading_level,
        CASE
            WHEN p_use_upgrade AND c.embedding_upgrade IS NOT NULL
                THEN (1.0 - (c.embedding_upgrade <=> p_query_embedding))::FLOAT
            ELSE
                (1.0 - (c.embedding_primary <=> p_query_embedding))::FLOAT
        END AS score,
        d.title         AS doc_title,
        d.source        AS doc_source,
        ARRAY(SELECT dp.project_id FROM cerefox_document_projects dp
              WHERE dp.document_id = d.id) AS doc_project_ids,
        ARRAY(SELECT p.name FROM cerefox_projects p
              JOIN cerefox_document_projects dp ON p.id = dp.project_id
              WHERE dp.document_id = d.id) AS doc_project_names,
        d.metadata      AS doc_metadata,
        (SELECT COUNT(*)::INT FROM cerefox_document_versions dv
         WHERE dv.document_id = d.id) AS version_count
    FROM cerefox_chunks c
    JOIN cerefox_documents d ON c.document_id = d.id
    WHERE c.version_id IS NULL
      AND d.deleted_at IS NULL
      AND (p_project_id IS NULL OR EXISTS (
          SELECT 1 FROM cerefox_document_projects dp
          WHERE dp.document_id = d.id AND dp.project_id = p_project_id
      ))
      AND (p_metadata_filter IS NULL OR d.metadata @> p_metadata_filter)
      -- In upgrade mode, chunks without an upgrade embedding are excluded
      -- rather than scored against embedding_primary.
      AND (p_use_upgrade = FALSE OR c.embedding_upgrade IS NOT NULL)
      -- Minimum cosine similarity threshold.
      -- The default of 0.0 only drops chunks whose similarity is negative;
      -- it is not a meaningful relevance cut-off.
      -- When called via the Python layer, CEREFOX_MIN_SEARCH_SCORE (default 0.65)
      -- is applied client-side; agents calling this RPC directly can pass p_min_score.
      AND CASE
              WHEN p_use_upgrade AND c.embedding_upgrade IS NOT NULL
                  THEN (1.0 - (c.embedding_upgrade <=> p_query_embedding))::FLOAT
              ELSE (1.0 - (c.embedding_primary <=> p_query_embedding))::FLOAT
          END >= p_min_score
    -- Ascending distance == descending score.
    ORDER BY
        CASE
            WHEN p_use_upgrade AND c.embedding_upgrade IS NOT NULL
                THEN c.embedding_upgrade <=> p_query_embedding
            ELSE c.embedding_primary <=> p_query_embedding
        END
    LIMIT p_match_count;
END;
$$;
|
|
364
|
+
|
|
365
|
+
-- ── Document Reconstruction ───────────────────────────────────────────────────
|
|
366
|
+
-- Reassemble a full document from its chunks (ordered by chunk_index).
|
|
367
|
+
-- Agents use this after a chunk-level search to get broader context.
|
|
368
|
+
|
|
369
|
+
-- cerefox_reconstruct_doc: return one row holding the document's metadata and
-- the text of all its current chunks (version_id IS NULL) joined with a blank
-- line, in chunk_index order. Returns no row when the document does not exist
-- or has no current chunks.
-- NOTE(review): unlike the search RPCs, this does not filter on
-- cerefox_documents.deleted_at -- confirm that reconstructing soft-deleted
-- documents is intended.
CREATE OR REPLACE FUNCTION cerefox_reconstruct_doc(
    p_document_id UUID
)
RETURNS TABLE (
    document_id         UUID,
    doc_title           TEXT,
    doc_source          TEXT,
    doc_metadata        JSONB,
    doc_project_ids     UUID[],
    doc_project_names   TEXT[],
    full_content        TEXT,
    chunk_count         INT,
    total_chars         INT,
    version_count       INT
)
LANGUAGE sql
SECURITY DEFINER
STABLE
SET search_path = public, pg_catalog
AS $$
    -- Aggregate the current chunks first; grouping by document means a
    -- document with no current chunks yields no row at all.
    WITH live_chunks AS (
        SELECT
            ch.document_id                                          AS doc_id,
            STRING_AGG(ch.content, E'\n\n' ORDER BY ch.chunk_index) AS body,
            COUNT(*)::INT                                           AS n_chunks,
            SUM(ch.char_count)::INT                                 AS n_chars
        FROM cerefox_chunks ch
        WHERE ch.document_id = p_document_id
          AND ch.version_id IS NULL
        GROUP BY ch.document_id
    )
    SELECT
        doc.id,
        doc.title,
        doc.source,
        doc.metadata,
        ARRAY(SELECT link.project_id
              FROM cerefox_document_projects link
              WHERE link.document_id = doc.id),
        ARRAY(SELECT proj.name
              FROM cerefox_projects proj
              JOIN cerefox_document_projects link ON proj.id = link.project_id
              WHERE link.document_id = doc.id),
        lc.body,
        lc.n_chunks,
        lc.n_chars,
        (SELECT COUNT(*)::INT
         FROM cerefox_document_versions ver
         WHERE ver.document_id = doc.id)
    FROM live_chunks lc
    JOIN cerefox_documents doc ON doc.id = lc.doc_id;
$$;
|
|
410
|
+
|
|
411
|
+
-- ── cerefox_save_note ─────────────────────────────────────────────────────────
|
|
412
|
+
-- Agent write tool: create a minimal document record for a short text note.
|
|
413
|
+
-- Embedding and chunking are NOT done server-side in V1 — the Python ingestion
|
|
414
|
+
-- pipeline should be used for full ingest. This RPC is intended for quick
|
|
415
|
+
-- one-shot note capture from AI agents that want to store something immediately.
|
|
416
|
+
--
|
|
417
|
+
-- Parameters:
|
|
418
|
+
-- p_title : Note title (required)
|
|
419
|
+
-- p_content : Markdown content (required)
|
|
420
|
+
-- p_source : Origin label, e.g. 'agent' (default: 'agent')
|
|
421
|
+
-- p_project_id : Optional project UUID (assigns to a single project)
|
|
422
|
+
-- p_metadata : Optional JSONB metadata (e.g. agent name, session id)
|
|
423
|
+
--
|
|
424
|
+
-- Returns: the created document row (id, title, created_at)
|
|
425
|
+
|
|
426
|
+
-- cerefox_save_note: insert a document row for a short note and optionally
-- link it to one project. No chunks are created (chunk_count is stored as 0);
-- total_chars is length(p_content).
--
-- Returns one row: the new document id, the title as passed in, and created_at.
CREATE OR REPLACE FUNCTION cerefox_save_note(
    p_title         TEXT,
    p_content       TEXT,
    p_source        TEXT    DEFAULT 'agent',
    p_project_id    UUID    DEFAULT NULL,
    p_metadata      JSONB   DEFAULT '{}'::JSONB
)
RETURNS TABLE (
    id          UUID,
    title       TEXT,
    created_at  TIMESTAMPTZ
)
LANGUAGE plpgsql
SECURITY DEFINER
SET search_path = public, pg_catalog
AS $$
DECLARE
    v_hash          TEXT;
    v_doc_id        UUID;
    v_created_at    TIMESTAMPTZ;
BEGIN
    -- Compute content hash to support deduplication on the caller side.
    -- convert_to() yields the raw UTF-8 bytes of the text. A plain
    -- p_content::BYTEA cast must not be used: it runs the text through bytea
    -- input parsing, so any backslash in the note (Windows paths, LaTeX,
    -- Markdown escapes) is read as an escape sequence and either raises
    -- "invalid input syntax for type bytea" or hashes different bytes.
    v_hash := encode(sha256(convert_to(p_content, 'UTF8')), 'hex');

    -- RETURNING columns are table-qualified because id and created_at are also
    -- output-column variables of this function.
    INSERT INTO cerefox_documents (
        title, source, content_hash, metadata, chunk_count, total_chars
    ) VALUES (
        p_title, p_source, v_hash, p_metadata, 0, length(p_content)
    )
    RETURNING cerefox_documents.id, cerefox_documents.created_at
    INTO v_doc_id, v_created_at;

    -- Assign to project if provided (many-to-many junction).
    IF p_project_id IS NOT NULL THEN
        INSERT INTO cerefox_document_projects (document_id, project_id)
        VALUES (v_doc_id, p_project_id)
        ON CONFLICT DO NOTHING;
    END IF;

    RETURN QUERY SELECT v_doc_id, p_title, v_created_at;
END;
$$;
|
|
468
|
+
|
|
469
|
+
-- ── cerefox_context_expand ────────────────────────────────────────────────────
|
|
470
|
+
-- Small-to-big retrieval: given a set of chunk IDs from a search result,
|
|
471
|
+
-- return those chunks plus their immediate neighbours (±window_size by
|
|
472
|
+
-- chunk_index within the same document). Use this after a chunk-level search
|
|
473
|
+
-- to recover more surrounding context without fetching the full document.
|
|
474
|
+
--
|
|
475
|
+
-- Parameters:
|
|
476
|
+
-- p_chunk_ids : Array of chunk UUIDs from the search results
|
|
477
|
+
-- p_window_size : Number of chunks to expand in each direction (default: 1)
|
|
478
|
+
--
|
|
479
|
+
-- Returns each expanded chunk with is_seed=TRUE for original results.
|
|
480
|
+
|
|
481
|
+
-- cerefox_context_expand: return every current chunk (version_id IS NULL) that
-- lies within p_window_size positions, by chunk_index, of any seed chunk in
-- the same document. The seeds are the current chunks whose ids are in
-- p_chunk_ids; is_seed marks the rows whose id is in p_chunk_ids.
-- Rows are ordered by document, then chunk_index.
CREATE OR REPLACE FUNCTION cerefox_context_expand(
    p_chunk_ids     UUID[],
    p_window_size   INT DEFAULT 1
)
RETURNS TABLE (
    chunk_id        UUID,
    document_id     UUID,
    chunk_index     INT,
    title           TEXT,
    content         TEXT,
    heading_path    TEXT[],
    heading_level   INT,
    doc_title       TEXT,
    is_seed         BOOL
)
LANGUAGE sql
SECURITY DEFINER
STABLE
SET search_path = public, pg_catalog
AS $$
    SELECT
        nb.id                       AS chunk_id,
        nb.document_id,
        nb.chunk_index,
        nb.title,
        nb.content,
        nb.heading_path,
        nb.heading_level,
        doc.title                   AS doc_title,
        nb.id = ANY(p_chunk_ids)    AS is_seed
    FROM cerefox_chunks nb
    JOIN cerefox_documents doc ON doc.id = nb.document_id
    WHERE nb.version_id IS NULL
      -- Keep the chunk when at least one seed of the same document sits
      -- within the window; EXISTS yields each chunk once even when the
      -- windows of several seeds overlap.
      AND EXISTS (
          SELECT 1
          FROM cerefox_chunks seed
          WHERE seed.id = ANY(p_chunk_ids)
            AND seed.version_id IS NULL
            AND seed.document_id = nb.document_id
            AND nb.chunk_index BETWEEN seed.chunk_index - p_window_size
                                   AND seed.chunk_index + p_window_size
      )
    ORDER BY nb.document_id, nb.chunk_index;
$$;
|
|
530
|
+
|
|
531
|
+
-- ── cerefox_search_docs ───────────────────────────────────────────────────────
|
|
532
|
+
-- Document-level hybrid search: runs hybrid search internally, deduplicates
|
|
533
|
+
-- results by document (keeping the best-scoring chunk per document), and
|
|
534
|
+
-- returns up to p_match_count *distinct documents* with their content.
|
|
535
|
+
--
|
|
536
|
+
-- ── RPC-level configuration (not exposed via .env) ────────────────────────────
|
|
537
|
+
-- Two params below are intentionally NOT surfaced in Python config or .env.
|
|
538
|
+
-- They are system-level tuning knobs with the same role as OPENAI_MODEL and
|
|
539
|
+
-- EMBEDDING_DIMENSIONS in the Edge Functions — change them here and redeploy
|
|
540
|
+
-- rpcs.sql (python scripts/db_deploy.py) if you need different values.
|
|
541
|
+
--
|
|
542
|
+
-- p_small_to_big_threshold (default: 20000 chars)
|
|
543
|
+
-- Documents larger than this return matched chunks + neighbours instead of
|
|
544
|
+
-- the full document. Set to 0 to always return full document content.
|
|
545
|
+
-- Rationale: at the default match_count=5 and 200 KB response ceiling,
|
|
546
|
+
-- 5 × 20 000 chars ≈ 100 KB — comfortably under the limit even before
|
|
547
|
+
-- accounting for small-to-big compression of large docs.
|
|
548
|
+
--
|
|
549
|
+
-- p_context_window (default: 1)
|
|
550
|
+
-- Neighbour chunks on each side of each matched chunk.
|
|
551
|
+
-- N=1 → up to 3 contiguous chunks per hit (prev, match, next).
|
|
552
|
+
-- N=0 → matched chunks only (no expansion).
|
|
553
|
+
-- N=2 → up to 5 contiguous chunks per hit.
|
|
554
|
+
-- ─────────────────────────────────────────────────────────────────────────────
|
|
555
|
+
--
|
|
556
|
+
-- Parameters:
|
|
557
|
+
-- p_query_text : Query string (used for FTS)
|
|
558
|
+
-- p_query_embedding : 768-dim query embedding (used for vector search)
|
|
559
|
+
-- p_match_count : Max documents to return (default: 5)
|
|
560
|
+
-- p_alpha : Semantic weight 0.0–1.0 (default: 0.7)
|
|
561
|
+
-- p_project_id : Optional project filter (M2M)
|
|
562
|
+
-- p_min_score : Minimum cosine similarity for vector results
|
|
563
|
+
-- p_small_to_big_threshold : See above (default: 20000)
|
|
564
|
+
-- p_context_window : See above (default: 1)
|
|
565
|
+
--
|
|
566
|
+
-- Returns one row per document. total_chars is always the full document size.
|
|
567
|
+
-- chunk_count reflects how many chunks are in full_content (may be partial).
|
|
568
|
+
-- is_partial = TRUE when the small-to-big path was taken for that document.
|
|
569
|
+
|
|
570
|
+
CREATE OR REPLACE FUNCTION cerefox_search_docs(
    p_query_text             TEXT,
    p_query_embedding        VECTOR(768),
    p_match_count            INT   DEFAULT 5,
    p_alpha                  FLOAT DEFAULT 0.7,
    p_project_id             UUID  DEFAULT NULL,
    p_min_score              FLOAT DEFAULT 0.0,
    p_small_to_big_threshold INT   DEFAULT 20000,
    p_context_window         INT   DEFAULT 1,
    p_metadata_filter        JSONB DEFAULT NULL
)
RETURNS TABLE (
    document_id             UUID,
    doc_title               TEXT,
    doc_source              TEXT,
    doc_metadata            JSONB,
    doc_project_ids         UUID[],
    doc_project_names       TEXT[],
    best_score              FLOAT,
    best_chunk_heading_path TEXT[],
    full_content            TEXT,
    chunk_count             INT,
    total_chars             INT,
    doc_updated_at          TIMESTAMPTZ,
    version_count           INT,
    is_partial              BOOL
)
LANGUAGE sql
SECURITY DEFINER
STABLE
SET search_path = public, pg_catalog
AS $$
    -- Document-level search: runs the chunk-level hybrid search, collapses the
    -- hits to one row per document, then attaches either the whole document
    -- (small docs) or only the matched chunks plus neighbours (large docs).
    WITH chunk_results AS (
        -- Ask for a 10x candidate pool so that, after collapsing chunks to
        -- documents, there are enough candidates to fill p_match_count rows.
        SELECT
            hs.chunk_id,
            hs.document_id,
            hs.heading_path,
            hs.score,
            hs.doc_title,
            hs.doc_source,
            hs.doc_metadata,
            hs.doc_project_ids,
            hs.doc_project_names,
            hs.version_count
        FROM cerefox_hybrid_search(
            p_query_text      := p_query_text,
            p_query_embedding := p_query_embedding,
            p_match_count     := p_match_count * 10,
            p_alpha           := p_alpha,
            p_use_upgrade     := FALSE,
            p_project_id      := p_project_id,
            p_min_score       := p_min_score,
            p_metadata_filter := p_metadata_filter
        ) AS hs
    ),
    best_per_doc AS (
        -- One row per document, represented by its highest-scoring chunk.
        -- chunk_id breaks score ties so the chosen chunk is deterministic.
        SELECT DISTINCT ON (cr.document_id)
            cr.document_id,
            cr.heading_path AS best_chunk_heading_path,
            cr.score        AS best_score,
            cr.doc_title,
            cr.doc_source,
            cr.doc_metadata,
            cr.doc_project_ids,
            cr.doc_project_names,
            cr.version_count,
            d.updated_at    AS doc_updated_at
        FROM chunk_results cr
        INNER JOIN cerefox_documents d
            ON d.id = cr.document_id
        ORDER BY cr.document_id, cr.score DESC, cr.chunk_id
    ),
    top_docs AS (
        -- Keep the p_match_count best documents; document_id breaks ties so
        -- the cut-off at LIMIT is stable across calls.
        SELECT
            bpd.document_id,
            bpd.best_chunk_heading_path,
            bpd.best_score,
            bpd.doc_title,
            bpd.doc_source,
            bpd.doc_metadata,
            bpd.doc_project_ids,
            bpd.doc_project_names,
            bpd.version_count,
            bpd.doc_updated_at
        FROM best_per_doc bpd
        ORDER BY bpd.best_score DESC, bpd.document_id
        LIMIT p_match_count
    ),
    doc_sizes AS (
        -- Full size of each top document, counting current chunks only
        -- (version_id IS NULL). Needed for the small-to-big threshold check.
        SELECT
            c.document_id,
            SUM(c.char_count)::INT AS total_chars
        FROM cerefox_chunks c
        WHERE c.document_id IN (SELECT td.document_id FROM top_docs td)
          AND c.version_id IS NULL
        GROUP BY c.document_id
    ),
    large_doc_seeds AS (
        -- Matched chunk ids belonging to top documents that exceed the
        -- threshold. A threshold <= 0 disables the small-to-big path.
        SELECT cr.chunk_id
        FROM chunk_results cr
        INNER JOIN doc_sizes ds
            ON ds.document_id = cr.document_id
        WHERE p_small_to_big_threshold > 0
          AND ds.total_chars > p_small_to_big_threshold
          AND cr.document_id IN (SELECT td.document_id FROM top_docs td)
    ),
    expanded AS (
        -- Expand context around every large-doc seed in a single call.
        -- With no seeds ARRAY_AGG yields NULL; COALESCE turns that into an
        -- empty array so the expansion is called with zero seeds.
        SELECT
            ec.chunk_id,
            ec.document_id,
            ec.chunk_index,
            ec.content
        FROM cerefox_context_expand(
            COALESCE(
                (SELECT ARRAY_AGG(lds.chunk_id) FROM large_doc_seeds lds),
                ARRAY[]::UUID[]
            ),
            p_context_window
        ) AS ec
    ),
    large_doc_content AS (
        -- Partial content for large documents (is_partial = TRUE).
        SELECT
            e.document_id,
            STRING_AGG(e.content, E'\n\n' ORDER BY e.chunk_index) AS full_content,
            COUNT(*)::INT                                         AS chunk_count,
            TRUE                                                  AS is_partial
        FROM expanded e
        GROUP BY e.document_id
    ),
    small_doc_content AS (
        -- Full content for every top document that did not take the
        -- small-to-big path (is_partial = FALSE). NOT EXISTS is used instead
        -- of NOT IN so a NULL document_id can never empty the result.
        SELECT
            c.document_id,
            STRING_AGG(c.content, E'\n\n' ORDER BY c.chunk_index) AS full_content,
            COUNT(*)::INT                                         AS chunk_count,
            FALSE                                                 AS is_partial
        FROM cerefox_chunks c
        WHERE c.document_id IN (SELECT td.document_id FROM top_docs td)
          AND NOT EXISTS (
              SELECT 1
              FROM large_doc_content ldc
              WHERE ldc.document_id = c.document_id
          )
          AND c.version_id IS NULL
        GROUP BY c.document_id
    ),
    all_content AS (
        -- The two branches are disjoint by construction, so UNION ALL is safe.
        SELECT ldc.document_id, ldc.full_content, ldc.chunk_count, ldc.is_partial
        FROM large_doc_content ldc
        UNION ALL
        SELECT sdc.document_id, sdc.full_content, sdc.chunk_count, sdc.is_partial
        FROM small_doc_content sdc
    )
    SELECT
        td.document_id,
        td.doc_title,
        td.doc_source,
        td.doc_metadata,
        td.doc_project_ids,
        td.doc_project_names,
        td.best_score,
        td.best_chunk_heading_path,
        ac.full_content,
        ac.chunk_count,
        ds.total_chars,          -- always the full document size, even when partial
        td.doc_updated_at,
        td.version_count,
        ac.is_partial
    FROM top_docs td
    INNER JOIN doc_sizes ds
        ON ds.document_id = td.document_id
    INNER JOIN all_content ac
        ON ac.document_id = td.document_id
    ORDER BY td.best_score DESC, td.document_id;
$$;
|
|
716
|
+
|
|
717
|
+
-- ── Metadata key discovery RPC ───────────────────────────────────────────────
|
|
718
|
+
-- Derives metadata keys from actual document data (metadata JSONB column).
|
|
719
|
+
-- No registry table needed — always accurate, zero maintenance.
|
|
720
|
+
-- Used by CLI, MCP tools, web UI autocomplete. (The function itself, cerefox_list_metadata_keys, is defined further below.)
|
|
721
|
+
|
|
722
|
+
-- ── cerefox_snapshot_version ──────────────────────────────────────────────────
|
|
723
|
+
-- Archives all current chunks for a document (sets version_id to the new version
|
|
724
|
+
-- row's UUID) and runs lazy retention cleanup.
|
|
725
|
+
--
|
|
726
|
+
-- Called by the Python pipeline's update_document() and by the TypeScript Edge
|
|
727
|
+
-- Functions before inserting new chunks. This single RPC is the canonical way to
|
|
728
|
+
-- create a version — do not split the chunk-archiving step into separate code.
|
|
729
|
+
--
|
|
730
|
+
-- Retention policy (p_retention_hours):
|
|
731
|
+
-- - Always keeps the most recently created version (accidental-deletion protection)
|
|
732
|
+
-- - Also keeps all versions created within the retention window
|
|
733
|
+
-- - Deletes older versions beyond the window (cascade removes their chunks)
|
|
734
|
+
--
|
|
735
|
+
-- Parameters:
|
|
736
|
+
-- p_document_id : Document to snapshot
|
|
737
|
+
-- p_source : How the update was triggered ('file','paste','agent','manual')
|
|
738
|
+
-- p_retention_hours : Retention window in hours (default: 48)
|
|
739
|
+
--
|
|
740
|
+
-- Returns: (version_id, version_number, chunk_count, total_chars) of the new version
|
|
741
|
+
|
|
742
|
+
DROP FUNCTION IF EXISTS cerefox_snapshot_version(UUID, TEXT, INT);
DROP FUNCTION IF EXISTS cerefox_snapshot_version(UUID, TEXT, INT, BOOLEAN);
CREATE FUNCTION cerefox_snapshot_version(
    p_document_id     UUID,
    p_source          TEXT    DEFAULT 'manual',
    p_retention_hours INT     DEFAULT 48,
    p_cleanup_enabled BOOLEAN DEFAULT TRUE
)
RETURNS TABLE (
    version_id     UUID,
    version_number INT,
    chunk_count    INT,
    total_chars    INT
)
LANGUAGE plpgsql
SECURITY DEFINER
SET search_path = public, pg_catalog
AS $$
DECLARE
    v_version_id     UUID;
    v_version_number INT;
    v_chunk_count    INT;
    v_total_chars    INT;
BEGIN
    -- Measure the current (un-archived) chunks; the totals are stored on the
    -- version row and returned to the caller.
    SELECT COUNT(*), COALESCE(SUM(c.char_count), 0)
      INTO v_chunk_count, v_total_chars
      FROM cerefox_chunks c
     WHERE c.document_id = p_document_id
       AND c.version_id IS NULL;

    -- Next sequential version number for this document (1 for the first).
    SELECT COALESCE(MAX(dv.version_number), 0) + 1
      INTO v_version_number
      FROM cerefox_document_versions dv
     WHERE dv.document_id = p_document_id;

    -- Create the version row and remember its id.
    INSERT INTO cerefox_document_versions (
        document_id, version_number, source, chunk_count, total_chars
    ) VALUES (
        p_document_id, v_version_number, p_source, v_chunk_count, v_total_chars
    )
    RETURNING id INTO v_version_id;

    -- Archive every current chunk by pointing it at the new version.
    UPDATE cerefox_chunks c
       SET version_id = v_version_id
     WHERE c.document_id = p_document_id
       AND c.version_id IS NULL;

    -- Lazy retention: drop versions older than the retention window.
    --   * archived versions (archived = TRUE) are never removed;
    --   * the version created by this call is never removed. It is excluded
    --     by id rather than by "latest created_at", which is ambiguous when
    --     two versions share a timestamp;
    --   * nothing is removed when p_cleanup_enabled is FALSE or NULL.
    IF p_cleanup_enabled THEN
        DELETE FROM cerefox_document_versions dv
         WHERE dv.document_id = p_document_id
           AND dv.archived IS NOT TRUE
           AND dv.created_at < NOW() - make_interval(hours => p_retention_hours)
           AND dv.id <> v_version_id;
    END IF;

    RETURN QUERY
    SELECT v_version_id, v_version_number, v_chunk_count, v_total_chars;
END;
$$;
|
|
813
|
+
|
|
814
|
+
-- ── cerefox_get_document ──────────────────────────────────────────────────────
|
|
815
|
+
-- Returns the full content of a document by reconstructing it from chunks.
|
|
816
|
+
-- Pass p_version_id = NULL (or omit it) for the current version.
|
|
817
|
+
-- Pass a specific version UUID to retrieve an archived version.
|
|
818
|
+
-- Version UUIDs are returned by cerefox_list_document_versions.
|
|
819
|
+
|
|
820
|
+
-- Drop first so this statement can be re-run, matching the other functions
-- in this file (harmless if the function was already dropped earlier).
DROP FUNCTION IF EXISTS cerefox_get_document(UUID, UUID);
CREATE FUNCTION cerefox_get_document(
    p_document_id UUID,
    p_version_id  UUID DEFAULT NULL
)
RETURNS TABLE (
    document_id       UUID,
    doc_title         TEXT,
    doc_source        TEXT,
    doc_metadata      JSONB,
    doc_project_ids   UUID[],
    doc_project_names TEXT[],
    version_id        UUID,
    full_content      TEXT,
    chunk_count       INT,
    total_chars       INT,
    created_at        TIMESTAMPTZ
)
LANGUAGE sql
SECURITY DEFINER
STABLE
SET search_path = public, pg_catalog
AS $$
    -- Rebuilds a document's text by concatenating its chunks in chunk_index
    -- order. p_version_id NULL selects the current chunks (version_id IS
    -- NULL); otherwise the chunks archived under that version are used.
    -- Because of the inner join, a document with no matching chunks yields
    -- zero rows.
    SELECT
        d.id       AS document_id,
        d.title    AS doc_title,
        d.source   AS doc_source,
        d.metadata AS doc_metadata,
        -- Both project arrays are ordered by project id so that element i of
        -- doc_project_names names element i of doc_project_ids.
        ARRAY(
            SELECT dp.project_id
            FROM cerefox_document_projects dp
            WHERE dp.document_id = d.id
            ORDER BY dp.project_id
        ) AS doc_project_ids,
        ARRAY(
            SELECT p.name
            FROM cerefox_projects p
            INNER JOIN cerefox_document_projects dp
                ON p.id = dp.project_id
            WHERE dp.document_id = d.id
            ORDER BY dp.project_id
        ) AS doc_project_names,
        p_version_id AS version_id,
        STRING_AGG(c.content, E'\n\n' ORDER BY c.chunk_index) AS full_content,
        COUNT(*)::INT          AS chunk_count,
        SUM(c.char_count)::INT AS total_chars,
        d.created_at
    FROM cerefox_documents d
    INNER JOIN cerefox_chunks c
        ON c.document_id = d.id
    WHERE d.id = p_document_id
      AND (
          (p_version_id IS NULL AND c.version_id IS NULL)
          OR (p_version_id IS NOT NULL AND c.version_id = p_version_id)
      )
    GROUP BY d.id, d.title, d.source, d.metadata, d.created_at;
$$;
|
|
866
|
+
|
|
867
|
+
-- ── cerefox_list_document_versions ────────────────────────────────────────────
|
|
868
|
+
-- Returns all archived versions for a document, newest first.
|
|
869
|
+
-- version_id is the UUID to pass to cerefox_get_document for retrieval.
|
|
870
|
+
-- version_number is the sequential human-readable number (unique per document).
|
|
871
|
+
|
|
872
|
+
DROP FUNCTION IF EXISTS cerefox_list_document_versions(UUID);
CREATE FUNCTION cerefox_list_document_versions(
    p_document_id UUID
)
RETURNS TABLE (
    version_id     UUID,
    version_number INT,
    source         TEXT,
    chunk_count    INT,
    total_chars    INT,
    archived       BOOLEAN,
    created_at     TIMESTAMPTZ
)
LANGUAGE sql
SECURITY DEFINER
STABLE
SET search_path = public, pg_catalog
AS $$
    -- Every stored version of the document, newest first. version_number is
    -- the secondary sort key so versions that share a created_at timestamp
    -- still come back in a stable order.
    SELECT
        dv.id AS version_id,
        dv.version_number,
        dv.source,
        dv.chunk_count,
        dv.total_chars,
        dv.archived,
        dv.created_at
    FROM cerefox_document_versions dv
    WHERE dv.document_id = p_document_id
    ORDER BY dv.created_at DESC, dv.version_number DESC;
$$;
|
|
895
|
+
|
|
896
|
+
-- ── cerefox_delete_document (soft delete) ────────────────────────────────────
|
|
897
|
+
-- Soft-deletes a document by setting deleted_at = NOW(). The document, its
|
|
898
|
+
-- chunks, and versions remain in the database but are excluded from search.
|
|
899
|
+
-- Use cerefox_purge_document for permanent deletion.
|
|
900
|
+
-- Use cerefox_restore_document to undo a soft delete.
|
|
901
|
+
|
|
902
|
+
DROP FUNCTION IF EXISTS cerefox_delete_document(UUID, TEXT, TEXT);
DROP FUNCTION IF EXISTS cerefox_delete_document(UUID);
CREATE FUNCTION cerefox_delete_document(
    p_document_id UUID,
    p_author      TEXT DEFAULT 'unknown',
    p_author_type TEXT DEFAULT 'user'
)
RETURNS VOID
LANGUAGE plpgsql
SECURITY DEFINER
SET search_path = public, pg_catalog
AS $$
DECLARE
    v_doc_title TEXT;
    v_doc_chars INT;
BEGIN
    -- Look the document up first: an unknown id is an error, and the title
    -- and size are needed for the audit entry.
    SELECT d.title, d.total_chars
      INTO v_doc_title, v_doc_chars
      FROM cerefox_documents d
     WHERE d.id = p_document_id;

    IF NOT FOUND THEN
        RAISE EXCEPTION 'Document % not found', p_document_id;
    END IF;

    -- Soft delete: only the deleted_at timestamp is set; the row is kept.
    UPDATE cerefox_documents d
       SET deleted_at = NOW()
     WHERE d.id = p_document_id;

    -- Record the operation in the audit log.
    PERFORM cerefox_create_audit_entry(
        p_document_id := p_document_id,
        p_operation   := 'delete',
        p_author      := p_author,
        p_author_type := p_author_type,
        p_size_before := v_doc_chars,
        p_size_after  := 0,
        p_description := format(
            'Soft-deleted document: %s (%s chars)',
            COALESCE(v_doc_title, '(untitled)'),
            COALESCE(v_doc_chars, 0)
        )
    );
END;
$$;
|
|
940
|
+
|
|
941
|
+
-- ── cerefox_restore_document ─────────────────────────────────────────────────
|
|
942
|
+
-- Restores a soft-deleted document by clearing deleted_at.
|
|
943
|
+
|
|
944
|
+
CREATE OR REPLACE FUNCTION cerefox_restore_document(
    p_document_id UUID,
    p_author      TEXT DEFAULT 'unknown',
    p_author_type TEXT DEFAULT 'user'
)
RETURNS VOID
LANGUAGE plpgsql
SECURITY DEFINER
SET search_path = public, pg_catalog
AS $$
DECLARE
    v_title       TEXT;
    v_total_chars INT;
BEGIN
    -- Clear deleted_at, but only on a row that is currently soft-deleted,
    -- and capture the title/size for the audit entry in the same statement.
    UPDATE cerefox_documents d
       SET deleted_at = NULL
     WHERE d.id = p_document_id
       AND d.deleted_at IS NOT NULL
    RETURNING d.title, d.total_chars
         INTO v_title, v_total_chars;

    -- No row matched: the document does not exist or is not soft-deleted.
    -- This is a silent no-op. FOUND is tested rather than "v_title IS NULL"
    -- so that a document whose title is NULL can still be restored.
    IF NOT FOUND THEN
        RETURN;
    END IF;

    PERFORM cerefox_create_audit_entry(
        p_document_id := p_document_id,
        p_operation   := 'restore',
        p_author      := p_author,
        p_author_type := p_author_type,
        p_size_before := 0,
        p_size_after  := v_total_chars,
        p_description := 'Restored document: ' || COALESCE(v_title, '(untitled)')
    );
END;
$$;
|
|
978
|
+
|
|
979
|
+
-- ── cerefox_purge_document ───────────────────────────────────────────────────
|
|
980
|
+
-- Permanently deletes a soft-deleted document (CASCADE). Only works on
|
|
981
|
+
-- documents that are already soft-deleted (deleted_at IS NOT NULL).
|
|
982
|
+
|
|
983
|
+
CREATE OR REPLACE FUNCTION cerefox_purge_document(
    p_document_id UUID,
    p_author      TEXT DEFAULT 'unknown',
    p_author_type TEXT DEFAULT 'user'
)
RETURNS VOID
LANGUAGE plpgsql
SECURITY DEFINER
SET search_path = public, pg_catalog
AS $$
DECLARE
    v_title       TEXT;
    v_total_chars INT;
BEGIN
    -- Only documents that are already soft-deleted may be purged.
    SELECT d.title, d.total_chars
      INTO v_title, v_total_chars
      FROM cerefox_documents d
     WHERE d.id = p_document_id
       AND d.deleted_at IS NOT NULL;

    -- No row matched: the document does not exist or is not soft-deleted.
    -- This is a silent no-op. FOUND is tested rather than "v_title IS NULL"
    -- so that a document whose title is NULL can still be purged.
    IF NOT FOUND THEN
        RETURN;
    END IF;

    -- The audit entry is written before the row is removed, as in the
    -- original statement order. The operation is logged as 'delete'.
    PERFORM cerefox_create_audit_entry(
        p_document_id := p_document_id,
        p_operation   := 'delete',
        p_author      := p_author,
        p_author_type := p_author_type,
        p_size_before := v_total_chars,
        p_size_after  := 0,
        p_description := 'Permanently deleted document: ' || COALESCE(v_title, '(untitled)') ||
                         ' (' || COALESCE(v_total_chars, 0) || ' chars)'
    );

    -- Permanent removal of the document row.
    DELETE FROM cerefox_documents d
     WHERE d.id = p_document_id;
END;
$$;
|
|
1018
|
+
|
|
1019
|
+
|
|
1020
|
+
-- ── cerefox_ingest_document ──────────────────────────────────────────────────
|
|
1021
|
+
-- Single RPC for ingesting a document (create or update). Handles:
|
|
1022
|
+
-- - Create: insert document row, insert chunks, set review_status, create audit entry
|
|
1023
|
+
-- - Update: snapshot old version (archives the current chunks), update document row,
|
|
1024
|
+
-- insert new chunks, set review_status, create audit entry
|
|
1025
|
+
--
|
|
1026
|
+
-- Both the Python pipeline and the Edge Function call this after chunking and
|
|
1027
|
+
-- embedding. This is the single implementation of the ingestion write path.
|
|
1028
|
+
--
|
|
1029
|
+
-- Parameters:
|
|
1030
|
+
-- p_document_id : NULL for create, UUID for update
|
|
1031
|
+
-- p_title, p_source, p_source_path, p_content_hash, p_metadata : document fields
|
|
1032
|
+
-- p_review_status : 'approved' or 'pending_review' (based on author_type)
|
|
1033
|
+
-- p_chunks : JSONB array of chunk objects, each with:
|
|
1034
|
+
-- chunk_index, heading_path, heading_level, title,
|
|
1035
|
+
-- content, char_count, embedding (float[]), embedder (text)
|
|
1036
|
+
-- p_author, p_author_type : for audit entry
|
|
1037
|
+
-- p_source_label : version source label for snapshot ('file','paste','agent','manual')
|
|
1038
|
+
-- p_retention_hours : for version cleanup (default 48)
|
|
1039
|
+
-- p_cleanup_enabled : whether version cleanup runs (default true)
|
|
1040
|
+
--
|
|
1041
|
+
-- Returns: document_id, chunk_count, total_chars, operation ('create' or 'update-content'),
|
|
1042
|
+
-- version_id (UUID of snapshot, null on create)
|
|
1043
|
+
|
|
1044
|
+
DROP FUNCTION IF EXISTS cerefox_ingest_document(UUID, TEXT, TEXT, TEXT, TEXT, JSONB, TEXT, JSONB, TEXT, TEXT, TEXT, INT, BOOLEAN);
CREATE FUNCTION cerefox_ingest_document(
    p_document_id     UUID    DEFAULT NULL,
    p_title           TEXT    DEFAULT 'Untitled',
    p_source          TEXT    DEFAULT 'agent',
    p_source_path     TEXT    DEFAULT NULL,
    p_content_hash    TEXT    DEFAULT '',
    p_metadata        JSONB   DEFAULT '{}',
    p_review_status   TEXT    DEFAULT 'approved',
    p_chunks          JSONB   DEFAULT '[]',
    p_author          TEXT    DEFAULT 'unknown',
    p_author_type     TEXT    DEFAULT 'user',
    p_source_label    TEXT    DEFAULT 'manual',
    p_retention_hours INT     DEFAULT 48,
    p_cleanup_enabled BOOLEAN DEFAULT TRUE
)
RETURNS TABLE (
    document_id UUID,
    chunk_count INT,
    total_chars INT,
    operation   TEXT,
    version_id  UUID
)
LANGUAGE plpgsql
SECURITY DEFINER
SET search_path = public, pg_catalog
AS $$
DECLARE
    v_doc_id      UUID;
    v_chunk_count INT;
    v_total_chars INT;
    v_operation   TEXT;
    v_version_id  UUID := NULL;
    v_old_chars   INT  := 0;
    v_status      TEXT;
BEGIN
    -- ── Zero-chunk guard (v0.3.1) ────────────────────────────────────────
    -- Refuse to create or update a document with no chunks:
    --   1. A zero-chunk document has no body and no embeddings, so it cannot
    --      be searched.
    --   2. Every parameter has a DEFAULT, so a bare
    --      `SELECT cerefox_ingest_document()` used to create an orphan
    --      `Untitled` row.
    --   3. `list_documents` returns 0-chunk rows while `cerefox_get_document`
    --      does not; refusing the write is cheaper than fixing both queries.
    -- To clear a document's content, soft-delete it instead.
    IF p_chunks IS NULL OR jsonb_array_length(p_chunks) = 0 THEN
        RAISE EXCEPTION
            'cerefox_ingest_document: refusing to write a document with zero chunks (title=%, source=%). Supply at least one chunk, or use cerefox_delete_document to clear content.',
            p_title, p_source
            USING ERRCODE = '22023'; -- invalid_parameter_value
    END IF;

    -- Any review_status other than the two known values falls back to 'approved'.
    v_status := CASE WHEN p_review_status IN ('approved', 'pending_review')
                     THEN p_review_status ELSE 'approved' END;

    -- Chunk count and total size are derived from the input in one set-based
    -- pass; a chunk without char_count counts as 0 characters.
    v_chunk_count := jsonb_array_length(p_chunks);
    SELECT COALESCE(SUM(COALESCE((elem->>'char_count')::INT, 0)), 0)::INT
      INTO v_total_chars
      FROM jsonb_array_elements(p_chunks) AS elem;

    IF p_document_id IS NOT NULL THEN
        -- ── UPDATE PATH ──────────────────────────────────────────────
        v_doc_id    := p_document_id;
        v_operation := 'update-content';

        -- Previous size, for the audit entry.
        SELECT COALESCE(d.total_chars, 0)
          INTO v_old_chars
          FROM cerefox_documents d
         WHERE d.id = v_doc_id;

        -- Fail early with a clear message instead of snapshotting and
        -- inserting chunks for a document row that does not exist.
        IF NOT FOUND THEN
            RAISE EXCEPTION 'Document % not found', p_document_id;
        END IF;

        -- Snapshot the old version: archives the current chunks and runs
        -- retention cleanup.
        SELECT sv.version_id
          INTO v_version_id
          FROM cerefox_snapshot_version(
                   v_doc_id, p_source_label, p_retention_hours, p_cleanup_enabled
               ) sv;

        -- Update the document record. source_path is kept when the caller
        -- passes NULL.
        UPDATE cerefox_documents AS d SET
            title         = p_title,
            source        = p_source,
            source_path   = COALESCE(p_source_path, d.source_path),
            content_hash  = p_content_hash,
            metadata      = p_metadata,
            chunk_count   = v_chunk_count,
            total_chars   = v_total_chars,
            review_status = v_status,
            updated_at    = NOW()
        WHERE d.id = v_doc_id;

    ELSE
        -- ── CREATE PATH ──────────────────────────────────────────────
        v_operation := 'create';

        INSERT INTO cerefox_documents (
            title, source, source_path, content_hash, metadata,
            chunk_count, total_chars, review_status
        ) VALUES (
            p_title, p_source, p_source_path, p_content_hash, p_metadata,
            v_chunk_count, v_total_chars, v_status
        )
        RETURNING id INTO v_doc_id;
    END IF;

    -- ── Insert chunks ────────────────────────────────────────────────
    -- fts is computed here so callers do not pre-compute tsvectors.
    -- Formula: doc_title (A) || chunk_heading (A) || body_content (B)
    -- heading_path and embedding are rebuilt WITH ORDINALITY so the element
    -- order of the JSON arrays is preserved explicitly.
    INSERT INTO cerefox_chunks (
        document_id, chunk_index, heading_path, heading_level,
        title, content, char_count, embedding_primary, embedder_primary, fts
    )
    SELECT
        v_doc_id,
        (c->>'chunk_index')::INT,
        ARRAY(
            SELECT h.val
            FROM jsonb_array_elements_text(c->'heading_path')
                 WITH ORDINALITY AS h(val, ord)
            ORDER BY h.ord
        ),
        (c->>'heading_level')::INT,
        c->>'title',
        c->>'content',
        (c->>'char_count')::INT,
        (
            SELECT array_agg(e.val::FLOAT ORDER BY e.ord)::VECTOR(768)
            FROM jsonb_array_elements_text(c->'embedding')
                 WITH ORDINALITY AS e(val, ord)
        ),
        c->>'embedder',
        setweight(to_tsvector('english', COALESCE(p_title, '')), 'A') ||
        setweight(to_tsvector('english', COALESCE(c->>'title', '')), 'A') ||
        setweight(to_tsvector('english', COALESCE(c->>'content', '')), 'B')
    FROM jsonb_array_elements(p_chunks) AS c;

    -- ── Audit entry ──────────────────────────────────────────────────
    -- The title is COALESCEd so a NULL p_title cannot turn the whole
    -- description into NULL.
    PERFORM cerefox_create_audit_entry(
        p_document_id := v_doc_id,
        p_version_id  := v_version_id,
        p_operation   := v_operation,
        p_author      := p_author,
        p_author_type := p_author_type,
        p_size_before := CASE WHEN v_operation = 'create' THEN NULL ELSE v_old_chars END,
        p_size_after  := v_total_chars,
        p_description := format(
            '%s: %s (%s chunks, %s chars)',
            v_operation, COALESCE(p_title, '(untitled)'), v_chunk_count, v_total_chars
        )
    );

    RETURN QUERY
    SELECT v_doc_id, v_chunk_count, v_total_chars, v_operation, v_version_id;
END;
$$;
|
|
1191
|
+
|
|
1192
|
+
|
|
1193
|
+
-- ── cerefox_update_chunk_fts ──────────────────────────────────────────────────
|
|
1194
|
+
-- Updates the FTS tsvector for all current chunks of a document using a new
|
|
1195
|
+
-- document title. Called when a document's title changes without a content change
|
|
1196
|
+
-- (the content-unchanged path in the ingestion pipeline skips cerefox_ingest_document).
|
|
1197
|
+
--
|
|
1198
|
+
-- Formula: doc_title (A) || chunk_heading (A) || body_content (B)
|
|
1199
|
+
-- Reads chunk title and content directly from the DB -- caller only needs to
|
|
1200
|
+
-- supply the new document title.
|
|
1201
|
+
--
|
|
1202
|
+
-- Only affects current chunks (version_id IS NULL). Archived chunks retain their
|
|
1203
|
+
-- original tsvectors (they are excluded from all search indexes and require
|
|
1204
|
+
-- re-ingestion to restore anyway).
|
|
1205
|
+
|
|
1206
|
+
DROP FUNCTION IF EXISTS cerefox_update_chunk_fts(UUID, TEXT);
CREATE FUNCTION cerefox_update_chunk_fts(
    p_document_id UUID,
    p_new_title   TEXT
)
RETURNS VOID
LANGUAGE sql
SECURITY DEFINER
SET search_path = public, pg_catalog
AS $$
    -- Recompute fts for the document's current chunks (version_id IS NULL)
    -- from the new document title plus each chunk's stored title and content.
    -- Weights: document title = A, chunk title = A, chunk content = B.
    UPDATE cerefox_chunks AS ch
       SET fts = setweight(to_tsvector('english', COALESCE(p_new_title, '')), 'A')
              || setweight(to_tsvector('english', COALESCE(ch.title, '')), 'A')
              || setweight(to_tsvector('english', COALESCE(ch.content, '')), 'B')
     WHERE ch.version_id IS NULL
       AND ch.document_id = p_document_id;
$$;
|
|
1224
|
+
|
|
1225
|
+
|
|
1226
|
+
-- ── cerefox_create_audit_entry ────────────────────────────────────────────────
|
|
1227
|
+
-- Inserts an immutable audit log entry. Called by all access paths (Python
|
|
1228
|
+
-- pipeline, Edge Functions, MCP) to maintain the single implementation principle.
|
|
1229
|
+
-- Returns the created entry's id and created_at.
|
|
1230
|
+
|
|
1231
|
+
DROP FUNCTION IF EXISTS cerefox_create_audit_entry(UUID, UUID, TEXT, TEXT, TEXT, INT, INT, TEXT);
CREATE FUNCTION cerefox_create_audit_entry(
    p_document_id UUID DEFAULT NULL,
    p_version_id  UUID DEFAULT NULL,
    p_operation   TEXT DEFAULT 'create',
    p_author      TEXT DEFAULT 'unknown',
    p_author_type TEXT DEFAULT 'user',
    p_size_before INT  DEFAULT NULL,
    p_size_after  INT  DEFAULT NULL,
    p_description TEXT DEFAULT ''
)
RETURNS TABLE (
    audit_id   UUID,
    created_at TIMESTAMPTZ
)
LANGUAGE sql
SECURITY DEFINER
SET search_path = public, pg_catalog
AS $$
    -- Append one row to the audit log and hand back its id and timestamp.
    -- author_type is normalised: anything other than 'user' or 'agent'
    -- (including NULL) is stored as 'user'.
    INSERT INTO cerefox_audit_log AS al (
        document_id,
        version_id,
        operation,
        author,
        author_type,
        size_before,
        size_after,
        description
    )
    VALUES (
        p_document_id,
        p_version_id,
        p_operation,
        p_author,
        CASE p_author_type
            WHEN 'user'  THEN p_author_type
            WHEN 'agent' THEN p_author_type
            ELSE 'user'
        END,
        p_size_before,
        p_size_after,
        p_description
    )
    RETURNING al.id AS audit_id, al.created_at;
$$;
|
|
1261
|
+
|
|
1262
|
+
-- ── cerefox_list_audit_entries ────────────────────────────────────────────────
|
|
1263
|
+
-- Returns audit log entries with optional filters. Joins cerefox_documents to
|
|
1264
|
+
-- include doc_title. Used by the web UI, Edge Function, and MCP tool.
|
|
1265
|
+
--
|
|
1266
|
+
-- Parameters:
|
|
1267
|
+
-- p_document_id : Filter by document (NULL = all)
|
|
1268
|
+
-- p_author : Filter by author (NULL = all)
|
|
1269
|
+
-- p_operation : Filter by operation type (NULL = all)
|
|
1270
|
+
-- p_since : Return entries created at or after this timestamp (NULL = no lower bound)
|
|
1271
|
+
-- p_until : Return entries created at or before this timestamp (NULL = no upper bound)
|
|
1272
|
+
-- p_limit : Max entries to return (default: 50)
|
|
1273
|
+
|
|
1274
|
+
DROP FUNCTION IF EXISTS cerefox_list_audit_entries(UUID, TEXT, TEXT, TIMESTAMPTZ, TIMESTAMPTZ, INT);
CREATE FUNCTION cerefox_list_audit_entries(
    p_document_id UUID        DEFAULT NULL,
    p_author      TEXT        DEFAULT NULL,
    p_operation   TEXT        DEFAULT NULL,
    p_since       TIMESTAMPTZ DEFAULT NULL,
    p_until       TIMESTAMPTZ DEFAULT NULL,
    p_limit       INT         DEFAULT 50
)
RETURNS TABLE (
    id          UUID,
    document_id UUID,
    doc_title   TEXT,
    version_id  UUID,
    operation   TEXT,
    author      TEXT,
    author_type TEXT,
    size_before INT,
    size_after  INT,
    description TEXT,
    created_at  TIMESTAMPTZ
)
LANGUAGE sql
SECURITY DEFINER
STABLE
SET search_path = public, pg_catalog
AS $$
    -- Audit rows, newest first, capped at p_limit. Each filter is skipped
    -- when its parameter is NULL. The LEFT JOIN keeps audit rows whose
    -- document row no longer exists; doc_title is NULL for those.
    SELECT
        al.id,
        al.document_id,
        doc.title AS doc_title,
        al.version_id,
        al.operation,
        al.author,
        al.author_type,
        al.size_before,
        al.size_after,
        al.description,
        al.created_at
    FROM cerefox_audit_log al
    LEFT JOIN cerefox_documents doc
        ON doc.id = al.document_id
    WHERE (p_document_id IS NULL OR al.document_id = p_document_id)
      AND (p_author      IS NULL OR al.author      = p_author)
      AND (p_operation   IS NULL OR al.operation   = p_operation)
      AND (p_since       IS NULL OR al.created_at >= p_since)   -- inclusive lower bound
      AND (p_until       IS NULL OR al.created_at <= p_until)   -- inclusive upper bound
    ORDER BY al.created_at DESC
    LIMIT p_limit;
$$;
|
|
1323
|
+
|
|
1324
|
+
-- ── Metadata key discovery RPC ────────────────────────────────────────────────
|
|
1325
|
+
-- Derives metadata keys from actual document data (metadata JSONB column).
|
|
1326
|
+
-- No registry table needed; always accurate, zero maintenance.
|
|
1327
|
+
-- Used by CLI, MCP tools, web UI autocomplete.
|
|
1328
|
+
|
|
1329
|
+
-- cerefox_list_metadata_keys
-- Returns one row per top-level key found in cerefox_documents.metadata:
--   key            : the metadata key
--   doc_count      : number of distinct documents carrying that key
--   example_values : up to five distinct non-NULL values (as text) seen for the
--                    key; NULL when every occurrence is a JSON null
-- Rows are ordered by doc_count (highest first), then by key.
--
-- Only documents whose metadata is a JSON object contribute. jsonb_object_keys()
-- raises an error on arrays and scalars, so non-object metadata is skipped
-- instead of aborting the whole listing.
-- NOTE(review): there is no deleted_at predicate here, so soft-deleted documents
-- (if any) still contribute keys -- confirm that is intended.
DROP FUNCTION IF EXISTS cerefox_list_metadata_keys();
CREATE FUNCTION cerefox_list_metadata_keys()
RETURNS TABLE (
    key            TEXT,
    doc_count      BIGINT,
    example_values TEXT[]
)
LANGUAGE sql
SECURITY DEFINER
STABLE
SET search_path = public, pg_catalog
AS $$
    SELECT
        k.key,
        COUNT(DISTINCT d.id) AS doc_count,
        -- DISTINCT sorts the values, so the five-element slice is deterministic.
        (ARRAY_AGG(DISTINCT d.metadata ->> k.key)
            FILTER (WHERE d.metadata ->> k.key IS NOT NULL))[1:5] AS example_values
    FROM cerefox_documents AS d
    -- The CASE guarantees jsonb_object_keys() only ever receives an object,
    -- regardless of the order in which the planner evaluates the WHERE clause.
    CROSS JOIN LATERAL jsonb_object_keys(
        CASE
            WHEN jsonb_typeof(d.metadata) = 'object' THEN d.metadata
            ELSE '{}'::jsonb
        END
    ) AS k(key)
    -- Also excludes NULL metadata (jsonb_typeof(NULL) is NULL); empty objects
    -- pass the filter but expand to zero keys.
    WHERE jsonb_typeof(d.metadata) = 'object'
    GROUP BY k.key
    ORDER BY doc_count DESC, k.key;
$$;
|
|
1353
|
+
|
|
1354
|
+
-- ── cerefox_list_projects ────────────────────────────────────────────────────
|
|
1355
|
+
-- Lists all projects. Used by MCP tools for project discovery and by the
|
|
1356
|
+
-- web UI for project name dropdowns.
|
|
1357
|
+
|
|
1358
|
+
-- cerefox_list_projects
-- Returns the id, name and description of every row in cerefox_projects,
-- sorted by name. Takes no arguments and applies no filtering.
CREATE OR REPLACE FUNCTION cerefox_list_projects()
RETURNS TABLE (
    id          UUID,
    name        TEXT,
    description TEXT
)
LANGUAGE sql
SECURITY DEFINER
STABLE
SET search_path = public, pg_catalog
AS $$
    SELECT
        proj.id,
        proj.name,
        proj.description
    FROM cerefox_projects AS proj
    ORDER BY proj.name ASC;
$$;
|
|
1373
|
+
|
|
1374
|
+
-- ── cerefox_metadata_search ──────────────────────────────────────────────────
|
|
1375
|
+
-- Query documents by metadata key-value criteria without a text search term.
|
|
1376
|
+
-- Uses JSONB containment (@>) which leverages the existing GIN index on
|
|
1377
|
+
-- cerefox_documents.metadata.
|
|
1378
|
+
--
|
|
1379
|
+
-- Parameters:
|
|
1380
|
+
-- p_metadata_filter : JSONB containment filter (AND semantics for all keys)
|
|
1381
|
+
-- p_project_id : Optional project UUID filter
|
|
1382
|
+
-- p_updated_since : Only docs updated on or after this timestamp
|
|
1383
|
+
-- p_created_since : Only docs created on or after this timestamp
|
|
1384
|
+
-- p_limit : Max results (default 10)
|
|
1385
|
+
-- p_include_content : When TRUE, reconstruct full text from current chunks
|
|
1386
|
+
-- p_max_bytes : Byte budget for accumulated content (NULL = no limit)
|
|
1387
|
+
|
|
1388
|
+
-- cerefox_metadata_search
-- Returns non-deleted documents (deleted_at IS NULL) whose metadata contains
-- p_metadata_filter (JSONB @>), most recently updated first.
--
-- Arguments:
--   p_metadata_filter : containment filter; a NULL filter matches nothing
--                       because `metadata @> NULL` is NULL
--   p_project_id      : keep only documents linked to this project (NULL = any)
--   p_updated_since   : inclusive lower bound on updated_at (NULL = none)
--   p_created_since   : inclusive lower bound on created_at (NULL = none)
--   p_limit           : maximum number of candidate documents
--   p_include_content : when TRUE, `content` is the document's chunks with
--                       version_id IS NULL, joined by blank lines in
--                       chunk_index order; otherwise `content` is NULL
--   p_max_bytes       : cumulative octet_length budget for `content`. Rows are
--                       emitted in order until the next row would exceed the
--                       budget, then the function stops. Ignored unless
--                       p_include_content is TRUE; rows with NULL content do
--                       not consume budget.
--
-- project_ids and project_names are both ordered by project id so that element
-- i of one array describes the same project as element i of the other
-- (assuming every link row references an existing project -- TODO confirm FK).
CREATE OR REPLACE FUNCTION cerefox_metadata_search(
    p_metadata_filter JSONB,
    p_project_id      UUID        DEFAULT NULL,
    p_updated_since   TIMESTAMPTZ DEFAULT NULL,
    p_created_since   TIMESTAMPTZ DEFAULT NULL,
    p_limit           INT         DEFAULT 10,
    p_include_content BOOLEAN     DEFAULT FALSE,
    p_max_bytes       INT         DEFAULT NULL
)
RETURNS TABLE (
    document_id   UUID,
    title         TEXT,
    doc_metadata  JSONB,
    review_status TEXT,
    source        TEXT,
    created_at    TIMESTAMPTZ,
    updated_at    TIMESTAMPTZ,
    total_chars   INT,
    chunk_count   INT,
    project_ids   UUID[],
    project_names TEXT[],
    version_count INT,
    content       TEXT
)
LANGUAGE plpgsql
SECURITY DEFINER
SET search_path = public, pg_catalog
AS $$
DECLARE
    v_bytes_used INT := 0;   -- content bytes already emitted
    v_row        RECORD;
    v_row_bytes  INT;
BEGIN
    -- Every column reference below is table-qualified on purpose: the RETURNS
    -- TABLE column names are also PL/pgSQL variables in this scope.
    FOR v_row IN
        SELECT
            d.id       AS document_id,
            d.title,
            d.metadata AS doc_metadata,
            d.review_status,
            d.source,
            d.created_at,
            d.updated_at,
            d.total_chars,
            d.chunk_count,
            ARRAY(SELECT dp.project_id
                  FROM cerefox_document_projects dp
                  WHERE dp.document_id = d.id
                  ORDER BY dp.project_id) AS project_ids,
            ARRAY(SELECT p.name
                  FROM cerefox_projects p
                  JOIN cerefox_document_projects dp ON p.id = dp.project_id
                  WHERE dp.document_id = d.id
                  ORDER BY dp.project_id) AS project_names,
            (SELECT COUNT(*)::INT
             FROM cerefox_document_versions dv
             WHERE dv.document_id = d.id) AS version_count,
            CASE WHEN p_include_content THEN
                (SELECT STRING_AGG(c.content, E'\n\n' ORDER BY c.chunk_index)
                 FROM cerefox_chunks c
                 WHERE c.document_id = d.id AND c.version_id IS NULL)
            ELSE NULL END AS content
        FROM cerefox_documents d
        WHERE d.metadata @> p_metadata_filter
          AND d.deleted_at IS NULL
          AND (p_project_id IS NULL OR EXISTS (
                SELECT 1
                FROM cerefox_document_projects dp
                WHERE dp.document_id = d.id AND dp.project_id = p_project_id
          ))
          AND (p_updated_since IS NULL OR d.updated_at >= p_updated_since)
          AND (p_created_since IS NULL OR d.created_at >= p_created_since)
        -- d.id breaks ties so the LIMIT window is deterministic.
        ORDER BY d.updated_at DESC, d.id
        LIMIT p_limit
    LOOP
        -- Byte budget enforcement (only when a budget is set and content is included).
        IF p_max_bytes IS NOT NULL AND p_include_content AND v_row.content IS NOT NULL THEN
            v_row_bytes := octet_length(v_row.content);
            IF v_bytes_used + v_row_bytes > p_max_bytes THEN
                EXIT;  -- this row would overflow the budget: stop emitting rows
            END IF;
            v_bytes_used := v_bytes_used + v_row_bytes;
        END IF;

        document_id   := v_row.document_id;
        title         := v_row.title;
        doc_metadata  := v_row.doc_metadata;
        review_status := v_row.review_status;
        source        := v_row.source;
        created_at    := v_row.created_at;
        updated_at    := v_row.updated_at;
        total_chars   := v_row.total_chars;
        chunk_count   := v_row.chunk_count;
        project_ids   := v_row.project_ids;
        project_names := v_row.project_names;
        version_count := v_row.version_count;
        content       := v_row.content;
        RETURN NEXT;
    END LOOP;
END;
$$;
|
|
1482
|
+
|
|
1483
|
+
-- ── cerefox_get_config / cerefox_set_config ──────────────────────────────────
|
|
1484
|
+
-- Read/write key-value config from cerefox_config table.
|
|
1485
|
+
|
|
1486
|
+
-- cerefox_get_config
-- Looks up p_key in cerefox_config and returns its value. The result is NULL
-- when no row has that key (or when p_key itself is NULL).
CREATE OR REPLACE FUNCTION cerefox_get_config(p_key TEXT)
RETURNS TEXT
LANGUAGE sql
SECURITY DEFINER
STABLE
SET search_path = public, pg_catalog
AS $$
    SELECT cfg.value
    FROM cerefox_config AS cfg
    WHERE cfg.key = p_key;
$$;
|
|
1495
|
+
|
|
1496
|
+
-- cerefox_set_config
-- Upserts (p_key, p_value) into cerefox_config.
--
-- Arguments:
--   p_key   : must be one of the keys listed in v_allowed
--   p_value : stored as-is
-- Raises:
--   an exception ('Unknown config key: ...') when p_key is NULL or is not in
--   the allow-list; nothing is written in that case.
CREATE OR REPLACE FUNCTION cerefox_set_config(p_key TEXT, p_value TEXT)
RETURNS VOID
LANGUAGE plpgsql
SECURITY DEFINER
SET search_path = public, pg_catalog
AS $$
DECLARE
    v_allowed TEXT[] := ARRAY['usage_tracking_enabled', 'require_requestor_identity', 'requestor_identity_format'];
BEGIN
    -- IS NOT TRUE rather than NOT (...): with a NULL key the membership test
    -- evaluates to NULL, and a plain NOT would let it slip past the allow-list.
    IF (p_key = ANY(v_allowed)) IS NOT TRUE THEN
        RAISE EXCEPTION 'Unknown config key: %. Allowed keys: %', p_key, v_allowed;
    END IF;

    INSERT INTO cerefox_config (key, value)
    VALUES (p_key, p_value)
    ON CONFLICT (key) DO UPDATE SET value = EXCLUDED.value;
END;
$$;
|
|
1514
|
+
|
|
1515
|
+
-- ── cerefox_log_usage ────────────────────────────────────────────────────────
|
|
1516
|
+
-- Insert a usage log entry. Checks config first; no-op if tracking is disabled.
|
|
1517
|
+
|
|
1518
|
+
-- cerefox_log_usage
-- Appends one row to cerefox_usage_log, but only while the cerefox_config entry
-- 'usage_tracking_enabled' holds exactly the text 'true'. In every other case
-- (row missing, value NULL, any other text) the call returns without writing.
-- The arguments map one-to-one onto the inserted columns.
CREATE OR REPLACE FUNCTION cerefox_log_usage(
    p_operation    TEXT,
    p_access_path  TEXT,
    p_requestor    TEXT  DEFAULT NULL,
    p_document_id  UUID  DEFAULT NULL,
    p_project_id   UUID  DEFAULT NULL,
    p_query_text   TEXT  DEFAULT NULL,
    p_result_count INT   DEFAULT NULL,
    p_extra        JSONB DEFAULT '{}'::JSONB
)
RETURNS VOID
LANGUAGE plpgsql
SECURITY DEFINER
SET search_path = public, pg_catalog
AS $$
DECLARE
    v_tracking_flag TEXT;
BEGIN
    SELECT cfg.value
    INTO v_tracking_flag
    FROM cerefox_config AS cfg
    WHERE cfg.key = 'usage_tracking_enabled';

    -- A NULL flag makes this comparison NULL, which IF treats as false,
    -- so a missing or NULL setting skips the insert just like any non-'true' value.
    IF v_tracking_flag = 'true' THEN
        INSERT INTO cerefox_usage_log (
            operation,
            access_path,
            requestor,
            document_id,
            project_id,
            query_text,
            result_count,
            extra
        ) VALUES (
            p_operation,
            p_access_path,
            p_requestor,
            p_document_id,
            p_project_id,
            p_query_text,
            p_result_count,
            p_extra
        );
    END IF;
END;
$$;
|
|
1550
|
+
|
|
1551
|
+
-- ── cerefox_list_usage_log ───────────────────────────────────────────────────
|
|
1552
|
+
-- Query usage log with optional filters.
|
|
1553
|
+
|
|
1554
|
+
-- cerefox_list_usage_log
-- Lists rows of cerefox_usage_log, newest first, each paired with the title of
-- the referenced document (doc_title is NULL when no cerefox_documents row
-- matches). Every filter argument is optional: NULL switches that filter off.
-- p_start / p_end are both inclusive bounds on logged_at.
CREATE OR REPLACE FUNCTION cerefox_list_usage_log(
    p_start       TIMESTAMPTZ DEFAULT NULL,
    p_end         TIMESTAMPTZ DEFAULT NULL,
    p_operation   TEXT        DEFAULT NULL,
    p_access_path TEXT        DEFAULT NULL,
    p_requestor   TEXT        DEFAULT NULL,
    p_project_id  UUID        DEFAULT NULL,
    p_limit       INT         DEFAULT 100
)
RETURNS TABLE (
    id           UUID,
    logged_at    TIMESTAMPTZ,
    operation    TEXT,
    access_path  TEXT,
    requestor    TEXT,
    document_id  UUID,
    doc_title    TEXT,
    project_id   UUID,
    query_text   TEXT,
    result_count INT,
    extra        JSONB
)
LANGUAGE sql
SECURITY DEFINER
STABLE
SET search_path = public, pg_catalog
AS $$
    SELECT
        usage.id,
        usage.logged_at,
        usage.operation,
        usage.access_path,
        usage.requestor,
        usage.document_id,
        doc.title AS doc_title,
        usage.project_id,
        usage.query_text,
        usage.result_count,
        usage.extra
    FROM cerefox_usage_log AS usage
    LEFT OUTER JOIN cerefox_documents AS doc
        ON doc.id = usage.document_id
    -- Each predicate is "filter matches OR filter not supplied".
    WHERE (usage.logged_at  >= p_start       OR p_start       IS NULL)
      AND (usage.logged_at  <= p_end         OR p_end         IS NULL)
      AND (usage.operation   = p_operation   OR p_operation   IS NULL)
      AND (usage.access_path = p_access_path OR p_access_path IS NULL)
      AND (usage.requestor   = p_requestor   OR p_requestor   IS NULL)
      AND (usage.project_id  = p_project_id  OR p_project_id  IS NULL)
    ORDER BY usage.logged_at DESC
    LIMIT p_limit;
$$;
|
|
1604
|
+
|
|
1605
|
+
-- ── cerefox_usage_summary ────────────────────────────────────────────────────
|
|
1606
|
+
-- Returns a JSON object with aggregated stats for the analytics page.
|
|
1607
|
+
|
|
1608
|
+
-- cerefox_usage_summary
-- Aggregates cerefox_usage_log into a single JSON object.
--
-- Arguments (all optional; NULL disables the filter):
--   p_start / p_end : inclusive bounds on logged_at
--   p_project_id    : keep only rows for this project
--   p_access_path   : keep only rows for this access path
--
-- Returned object keys:
--   total_count        : number of log rows that passed the filters
--   ops_by_day         : [{day, count}]          ascending by day
--   ops_by_operation   : [{operation, count}]    descending by count
--   ops_by_access_path : [{access_path, count}]  descending by count
--   top_documents      : [{document_id, doc_title, count}] top 10 by count;
--                        only rows whose document still exists in
--                        cerefox_documents are counted (inner join)
--   top_requestors     : [{requestor, count}]    top 10 by count, NULL
--                        requestors excluded
-- Every list is '[]' rather than NULL when it has no entries.
--
-- Array order is fixed with ORDER BY inside each json_agg call: the ordering of
-- a CTE is not guaranteed to carry through to an aggregate that reads from it.
-- Secondary sort keys make ties (and the top-10 cut-offs) deterministic.
CREATE OR REPLACE FUNCTION cerefox_usage_summary(
    p_start       TIMESTAMPTZ DEFAULT NULL,
    p_end         TIMESTAMPTZ DEFAULT NULL,
    p_project_id  UUID        DEFAULT NULL,
    p_access_path TEXT        DEFAULT NULL
)
RETURNS JSON
LANGUAGE plpgsql
SECURITY DEFINER
STABLE
SET search_path = public, pg_catalog
AS $$
DECLARE
    v_result JSON;
BEGIN
    WITH filtered AS (
        -- Log rows inside the requested window / project / access path.
        SELECT
            ul.logged_at,
            ul.operation,
            ul.access_path,
            ul.requestor,
            ul.document_id
        FROM cerefox_usage_log ul
        WHERE (p_start IS NULL OR ul.logged_at >= p_start)
          AND (p_end IS NULL OR ul.logged_at <= p_end)
          AND (p_project_id IS NULL OR ul.project_id = p_project_id)
          AND (p_access_path IS NULL OR ul.access_path = p_access_path)
    ),
    ops_by_day AS (
        -- DATE() truncates using the session time zone.
        SELECT DATE(f.logged_at) AS log_day, COUNT(*) AS n
        FROM filtered f
        GROUP BY DATE(f.logged_at)
    ),
    ops_by_operation AS (
        SELECT f.operation, COUNT(*) AS n
        FROM filtered f
        GROUP BY f.operation
    ),
    ops_by_access_path AS (
        SELECT f.access_path, COUNT(*) AS n
        FROM filtered f
        GROUP BY f.access_path
    ),
    top_documents AS (
        SELECT f.document_id, d.title AS doc_title, COUNT(*) AS n
        FROM filtered f
        JOIN cerefox_documents d ON f.document_id = d.id
        WHERE f.document_id IS NOT NULL
        GROUP BY f.document_id, d.title
        ORDER BY COUNT(*) DESC, f.document_id
        LIMIT 10
    ),
    top_requestors AS (
        SELECT f.requestor, COUNT(*) AS n
        FROM filtered f
        WHERE f.requestor IS NOT NULL
        GROUP BY f.requestor
        ORDER BY COUNT(*) DESC, f.requestor
        LIMIT 10
    )
    SELECT json_build_object(
        'total_count', (SELECT COUNT(*) FROM filtered),
        'ops_by_day', COALESCE(
            (SELECT json_agg(json_build_object('day', t.log_day, 'count', t.n)
                             ORDER BY t.log_day)
             FROM ops_by_day t),
            '[]'::JSON),
        'ops_by_operation', COALESCE(
            (SELECT json_agg(json_build_object('operation', t.operation, 'count', t.n)
                             ORDER BY t.n DESC, t.operation)
             FROM ops_by_operation t),
            '[]'::JSON),
        'ops_by_access_path', COALESCE(
            (SELECT json_agg(json_build_object('access_path', t.access_path, 'count', t.n)
                             ORDER BY t.n DESC, t.access_path)
             FROM ops_by_access_path t),
            '[]'::JSON),
        'top_documents', COALESCE(
            (SELECT json_agg(json_build_object('document_id', t.document_id, 'doc_title', t.doc_title, 'count', t.n)
                             ORDER BY t.n DESC, t.document_id)
             FROM top_documents t),
            '[]'::JSON),
        'top_requestors', COALESCE(
            (SELECT json_agg(json_build_object('requestor', t.requestor, 'count', t.n)
                             ORDER BY t.n DESC, t.requestor)
             FROM top_requestors t),
            '[]'::JSON)
    ) INTO v_result;

    RETURN v_result;
END;
$$;
|
|
1678
|
+
|
|
1679
|
+
-- ─────────────────────────────────────────────────────────────────────────
|
|
1680
|
+
-- Schema version reporter
|
|
1681
|
+
-- ─────────────────────────────────────────────────────────────────────────
|
|
1682
|
+
-- Returns the schema version currently deployed in this database. The value
|
|
1683
|
+
-- must match the `@version` marker at the top of schema.sql.
|
|
1684
|
+
-- Bump both when schema.sql or rpcs.sql changes in a way that requires a
|
|
1685
|
+
-- redeploy. The web UI's /api/v1/schema-version endpoint compares the bundled
|
|
1686
|
+
-- and deployed values and surfaces a 'redeploy needed' banner on mismatch.
|
|
1687
|
+
|
|
1688
|
+
-- cerefox_schema_version
-- Returns the hard-coded schema version string of this deployment as TEXT.
-- Takes no arguments and reads no tables.
CREATE OR REPLACE FUNCTION cerefox_schema_version()
RETURNS TEXT
LANGUAGE sql
STABLE
SECURITY DEFINER
SET search_path = public, pg_catalog
AS $$
    SELECT CAST('0.3.1' AS TEXT);
$$;
|
|
1697
|
+
|
|
1698
|
+
|
|
1699
|
+
|
|
1700
|
+
-- ─────────────────────────────────────────────────────────────────────────
|
|
1701
|
+
-- Function-existence probe (introspection helper)
|
|
1702
|
+
-- ─────────────────────────────────────────────────────────────────────────
|
|
1703
|
+
-- Returns TRUE if a function with the given name exists in the public schema,
|
|
1704
|
+
-- regardless of its signature. Used by `db_status.ts` and `cerefox doctor`
|
|
1705
|
+
-- (v0.5) to verify schema health without having to know the parameter list
|
|
1706
|
+
-- of every RPC. Cheaper and more reliable than calling each RPC and parsing
|
|
1707
|
+
-- the error message.
|
|
1708
|
+
|
|
1709
|
+
-- cerefox_pg_function_exists
-- Returns TRUE when at least one pg_proc entry named p_name lives in the
-- `public` schema, whatever its argument list; FALSE otherwise (including when
-- p_name is NULL, since the name comparison then matches nothing).
--
-- The catalogs are schema-qualified on purpose: this function is SECURITY
-- DEFINER and its search_path lists `public` before `pg_catalog`, so an
-- unqualified pg_proc / pg_namespace would resolve to a same-named relation in
-- `public` if one were ever created there.
CREATE OR REPLACE FUNCTION cerefox_pg_function_exists(p_name TEXT)
RETURNS BOOLEAN
LANGUAGE sql
STABLE
SECURITY DEFINER
SET search_path = public, pg_catalog
AS $$
    SELECT EXISTS (
        SELECT 1
        FROM pg_catalog.pg_proc AS p
        INNER JOIN pg_catalog.pg_namespace AS n
            ON p.pronamespace = n.oid
        WHERE n.nspname = 'public'
          AND p.proname = p_name
    );
$$;
|