@cerefox/memory 0.7.1 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +62 -25
- package/dist/bin/cerefox.js +1163 -344
- package/dist/frontend/assets/{index-HNlMcvli.js → index-CAp2_lFX.js} +2 -2
- package/dist/frontend/assets/index-CAp2_lFX.js.map +1 -0
- package/dist/frontend/index.html +1 -1
- package/dist/server-assets/_shared/ef-meta/index.ts +97 -0
- package/dist/server-assets/_shared/embeddings/index.ts +175 -0
- package/dist/server-assets/_shared/mcp-tools/_chunker.ts +187 -0
- package/dist/server-assets/_shared/mcp-tools/_projects.ts +121 -0
- package/dist/server-assets/_shared/mcp-tools/_utils.ts +73 -0
- package/dist/server-assets/_shared/mcp-tools/audit-log.ts +95 -0
- package/dist/server-assets/_shared/mcp-tools/get-document.ts +73 -0
- package/dist/server-assets/_shared/mcp-tools/get-help-content.ts +26 -0
- package/dist/server-assets/_shared/mcp-tools/get-help.ts +90 -0
- package/dist/server-assets/_shared/mcp-tools/index.ts +67 -0
- package/dist/server-assets/_shared/mcp-tools/ingest.ts +315 -0
- package/dist/server-assets/_shared/mcp-tools/list-metadata-keys.ts +55 -0
- package/dist/server-assets/_shared/mcp-tools/list-projects.ts +59 -0
- package/dist/server-assets/_shared/mcp-tools/list-versions.ts +72 -0
- package/dist/server-assets/_shared/mcp-tools/metadata-search.ts +154 -0
- package/dist/server-assets/_shared/mcp-tools/search.ts +193 -0
- package/dist/server-assets/_shared/mcp-tools/set-document-projects.ts +163 -0
- package/dist/server-assets/_shared/mcp-tools/types.ts +92 -0
- package/dist/server-assets/db/migrations/0003_add_document_versions.sql +91 -0
- package/dist/server-assets/db/migrations/0004_add_audit_log_review_status_archived.sql +71 -0
- package/dist/server-assets/db/migrations/0005_metadata_search.sql +628 -0
- package/dist/server-assets/db/migrations/0006_usage_log.sql +255 -0
- package/dist/server-assets/db/migrations/0007_usage_log_requestor.sql +178 -0
- package/dist/server-assets/db/migrations/0008_soft_delete.sql +130 -0
- package/dist/server-assets/db/migrations/0009_audit_log_restore_operation.sql +20 -0
- package/dist/server-assets/db/migrations/0010_requestor_enforcement_config.sql +12 -0
- package/dist/server-assets/db/migrations/0011_title_boosting.sql +48 -0
- package/dist/server-assets/db/rpcs.sql +1723 -0
- package/dist/server-assets/db/schema.sql +380 -0
- package/dist/server-assets/supabase/functions/cerefox-get-audit-log/index.ts +117 -0
- package/dist/server-assets/supabase/functions/cerefox-get-document/index.ts +138 -0
- package/dist/server-assets/supabase/functions/cerefox-ingest/index.ts +819 -0
- package/dist/server-assets/supabase/functions/cerefox-list-projects/index.ts +96 -0
- package/dist/server-assets/supabase/functions/cerefox-list-versions/index.ts +113 -0
- package/dist/server-assets/supabase/functions/cerefox-mcp/index.ts +294 -0
- package/dist/server-assets/supabase/functions/cerefox-mcp/shared.ts +42 -0
- package/dist/server-assets/supabase/functions/cerefox-metadata/index.ts +99 -0
- package/dist/server-assets/supabase/functions/cerefox-metadata-search/index.ts +146 -0
- package/dist/server-assets/supabase/functions/cerefox-search/index.ts +382 -0
- package/docs/guides/connect-agents.md +58 -3
- package/docs/guides/migration-v0.5.md +50 -0
- package/package.json +3 -2
- package/dist/frontend/assets/index-HNlMcvli.js.map +0 -1
|
@@ -0,0 +1,819 @@
|
|
|
1
|
+
import "jsr:@supabase/functions-js/edge-runtime.d.ts";
|
|
2
|
+
import { createClient } from "jsr:@supabase/supabase-js@2";
|
|
3
|
+
import { isVersionRequest, versionResponse } from "../../../_shared/ef-meta/index.ts";
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* cerefox-ingest — Supabase Edge Function
|
|
7
|
+
*
|
|
8
|
+
* Quick-capture endpoint: accepts a markdown note, chunks it by headings,
|
|
9
|
+
* embeds each chunk with OpenAI, and stores everything in the knowledge base.
|
|
10
|
+
*
|
|
11
|
+
* This is the agent write path — use it for short notes captured during a
|
|
12
|
+
* conversation. For large batch ingestion (directories, PDFs, etc.) use the
|
|
13
|
+
* Python CLI: `cerefox ingest file.md`.
|
|
14
|
+
*
|
|
15
|
+
* Request body (JSON):
|
|
16
|
+
* title string required Document title
|
|
17
|
+
* content string required Markdown content
|
|
18
|
+
* project_name string optional Project to assign to (looked up by name, created if absent)
|
|
19
|
+
* source string optional Origin label (default: "agent")
|
|
20
|
+
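* metadata object optional Arbitrary JSONB metadata
* document_id string optional Update this exact document by id (takes precedence over update_if_exists)
* project_names string[] optional Full project set for the document; replaces existing memberships and wins over project_name
* update_if_exists boolean optional Update the most recently updated document with the same title instead of creating a new one (default: false)
* author string optional Caller identity recorded with the write (default: "agent")
* author_type string optional "user" or "agent" (default: "agent"); agent writes are stored as pending_review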
|
|
21
|
+
*
|
|
22
|
+
* Response: { document_id, title, chunk_count, project_id? }
|
|
23
|
+
*/
|
|
24
|
+
|
|
25
|
+
/** OpenAI REST endpoint that embedding requests are POSTed to. */
const OPENAI_EMBEDDING_URL = "https://api.openai.com/v1/embeddings";

/** Embedding model requested from OpenAI. */
const OPENAI_MODEL = "text-embedding-3-small";

/** Vector length requested through the `dimensions` field of the embedding request. */
const EMBEDDING_DIMENSIONS = 768;

/** Target upper bound, in characters, for one chunk produced by the chunker. */
const MAX_CHUNK_CHARS = 4000;

/**
 * Lower chunk-size bound, in characters.
 * NOTE(review): not referenced in the visible part of this file — confirm it is still used.
 */
const MIN_CHUNK_CHARS = 100;
|
|
31
|
+
|
|
32
|
+
/** JSON body accepted by the cerefox-ingest endpoint. */
interface IngestRequest {
  /** Document title (required; must not be blank). */
  title: string;
  /** Markdown content (required; must not be blank). */
  content: string;
  /** When set, that exact document is updated, regardless of `update_if_exists`. */
  document_id?: string;
  /** Single project to add the document to; existing memberships are kept. */
  project_name?: string;
  /** Full-set semantics; wins over `project_name` when both are provided. */
  project_names?: string[];
  /** Origin label (the handler defaults it to "agent"). */
  source?: string;
  /** Arbitrary metadata stored with the document. */
  metadata?: Record<string, unknown>;
  /** Update the latest document with the same title instead of creating a new one. */
  update_if_exists?: boolean;
  /** Caller identity (the handler defaults it to "agent"). */
  author?: string;
  /** 'user' | 'agent' — "agent" results in the review status "pending_review". */
  author_type?: string;
}
|
|
44
|
+
|
|
45
|
+
/** One chunk of a document as produced by the chunker. */
interface Chunk {
  /** Heading breadcrumb the chunk is anchored to (outermost heading first). */
  heading_path: string[];
  /** Heading level of the anchoring section; 0 when there is no heading. */
  heading_level: number;
  /** Last entry of `heading_path`, or "" when the path is empty. */
  title: string;
  /** Chunk text. */
  content: string;
  /** Length of `content` in characters. */
  char_count: number;
}
|
|
52
|
+
|
|
53
|
+
// ── Heading-aware chunker (mirrors Python logic) ───────────────────────────
|
|
54
|
+
//
|
|
55
|
+
// Design notes:
|
|
56
|
+
// • Short-circuit for small documents: if the entire document fits within
|
|
57
|
+
// MAX_CHUNK_CHARS, it is returned as a single chunk with no splitting.
|
|
58
|
+
// • Greedy accumulation: sections are collected into a buffer until adding
|
|
59
|
+
// the next would exceed MAX_CHUNK_CHARS. This keeps chunks close to the
|
|
60
|
+
// target size and avoids many tiny fragments at every heading boundary.
|
|
61
|
+
// All heading levels (H1/H2/H3) are treated equally — size alone controls
|
|
62
|
+
// when a chunk is flushed; there are no hard heading-level boundaries.
|
|
63
|
+
// • Oversized sections (> MAX_CHUNK_CHARS) are paragraph-split with no overlap.
|
|
64
|
+
// • The first section's heading metadata anchors each chunk's breadcrumb.
|
|
65
|
+
// • No overlaps between chunks — the heading breadcrumb in the content
|
|
66
|
+
// provides sufficient context. Overlaps caused duplication on reconstruction.
|
|
67
|
+
|
|
68
|
+
/** A heading-delimited slice of the source markdown. */
interface Section {
  /** Heading level of the section (1–3); 0 for text before the first heading. */
  level: number;
  /** Full heading stack in effect at this section. */
  headings: string[];
  /** Text of the section's own heading only ("" when there is none). */
  heading: string;
  /** Heading line followed by the body. */
  content: string;
  /** Body text without the heading line. */
  body: string;
}
|
|
75
|
+
|
|
76
|
+
/**
 * Splits markdown into heading-delimited sections (H1–H3; deeper headings stay
 * in the body of the enclosing section).
 *
 * Each section carries its heading level, the heading breadcrumb in effect,
 * the regenerated heading line plus body (`content`) and the trimmed body.
 * Sections whose content is blank are dropped.
 *
 * Differences from a naive line scan:
 *  - The breadcrumb is tracked per heading level, so a document without an H1
 *    (`## A` / `### B` / `### C`) yields ["A", "C"] for C rather than
 *    accumulating the sibling heading B.
 *  - Lines inside fenced code blocks (``` or ~~~) are never treated as
 *    headings, so `# comment` lines in code stay in the body untouched.
 *  - A heading marker followed only by whitespace is kept as ordinary body text.
 *
 * NOTE(review): the file header says this chunker mirrors the Python
 * implementation — confirm the Python side handles the cases above the same way.
 *
 * @param text Markdown source.
 * @returns Sections in document order.
 */
function parseSections(text: string): Section[] {
  const sections: Section[] = [];
  // headingSlots[i] holds the active heading text for level i + 1 ("" = none).
  const headingSlots: string[] = ["", "", ""];
  let currentLevel = 0;
  let bodyLines: string[] = [];
  // Marker of the currently open fenced code block, or null outside a fence.
  let openFence: { char: string; length: number } | null = null;

  // Emits the section accumulated so far (if it has any content) and resets the body.
  function collectSection(): void {
    const body = bodyLines.join("\n").trim();
    bodyLines = [];
    const headings = headingSlots.filter(Boolean);
    const heading = headings[headings.length - 1] ?? "";
    let content = body;
    if (currentLevel > 0) {
      const headerLine = "#".repeat(currentLevel) + " " + heading;
      content = body ? headerLine + "\n\n" + body : headerLine;
    }
    if (!content.trim()) return;
    sections.push({ level: currentLevel, headings, heading, content, body });
  }

  for (const line of text.split("\n")) {
    // Fence handling: an opening fence starts a code block; only a fence of the
    // same character, at least as long, with nothing after it closes the block.
    const fenceMatch = line.match(/^ {0,3}(`{3,}|~{3,})/);
    if (fenceMatch) {
      const marker = fenceMatch[1];
      if (openFence === null) {
        openFence = { char: marker[0], length: marker.length };
      } else if (
        marker[0] === openFence.char &&
        marker.length >= openFence.length &&
        line.slice(fenceMatch[0].length).trim() === ""
      ) {
        openFence = null;
      }
      bodyLines.push(line);
      continue;
    }
    if (openFence !== null) {
      bodyLines.push(line);
      continue;
    }

    const headingMatch = line.match(/^(#{1,3}) (.+)/);
    const headingText = headingMatch ? headingMatch[2].trim() : "";
    if (!headingMatch || !headingText) {
      bodyLines.push(line);
      continue;
    }

    // A new heading closes the previous section and resets all deeper levels.
    collectSection();
    const level = headingMatch[1].length;
    headingSlots[level - 1] = headingText;
    for (let deeper = level; deeper < headingSlots.length; deeper++) {
      headingSlots[deeper] = "";
    }
    currentLevel = level;
  }

  collectSection();
  return sections;
}
|
|
131
|
+
|
|
132
|
+
/**
 * Chunks markdown text for embedding.
 *
 *  - Blank input produces no chunks.
 *  - Input that fits within MAX_CHUNK_CHARS is returned as one heading-less chunk.
 *  - Otherwise sections are packed greedily: consecutive sections are joined
 *    with a blank line until the next one would push the chunk past
 *    MAX_CHUNK_CHARS. The first section of a chunk supplies its heading path
 *    and level.
 *  - A section that is itself larger than MAX_CHUNK_CHARS is split on blank
 *    lines; only the first piece carries the heading line.
 *
 * @param text Markdown source.
 * @returns Chunks in document order.
 */
function chunkMarkdown(text: string): Chunk[] {
  const source = text.trim();
  if (source.length === 0) return [];
  if (source.length <= MAX_CHUNK_CHARS) return [makeChunk([], 0, source)];

  const out: Chunk[] = [];

  // Sections waiting to be emitted together as one chunk.
  const pending = {
    parts: [] as string[],
    headings: [] as string[],
    level: 0,
    size: 0,
  };

  const flushPending = (): void => {
    if (pending.parts.length === 0) return;
    out.push(makeChunk(pending.headings, pending.level, pending.parts.join("\n\n")));
    pending.parts = [];
    pending.headings = [];
    pending.level = 0;
    pending.size = 0;
  };

  for (const section of parseSections(source)) {
    if (section.content.length > MAX_CHUNK_CHARS) {
      // Too large for any buffer: emit what is pending, then split by paragraph.
      flushPending();

      const header = section.level > 0
        ? "#".repeat(section.level) + " " + section.heading + "\n\n"
        : "";
      const paragraphs = (section.body || section.content).split(/\n\n+/);

      let piece = "";
      paragraphs.forEach((para, idx) => {
        // Only the very first paragraph is prefixed with the heading line.
        const lead = idx === 0 ? header : "";
        const overflows =
          piece.length > 0 &&
          piece.length + lead.length + para.length + 2 > MAX_CHUNK_CHARS;

        if (overflows) {
          out.push(makeChunk(section.headings, section.level, piece.trim()));
          piece = para;
        } else if (piece) {
          piece = piece + "\n\n" + para;
        } else {
          piece = lead + para;
        }
      });

      const tail = piece.trim();
      if (tail) out.push(makeChunk(section.headings, section.level, tail));
      continue;
    }

    // Joining onto a non-empty buffer costs two extra characters ("\n\n").
    const cost = section.content.length + (pending.parts.length > 0 ? 2 : 0);

    if (pending.size + cost > MAX_CHUNK_CHARS) {
      // Does not fit: start a fresh buffer anchored at this section.
      flushPending();
      pending.parts = [section.content];
      pending.headings = section.headings;
      pending.level = section.level;
      pending.size = section.content.length;
      continue;
    }

    if (pending.parts.length === 0) {
      pending.headings = section.headings;
      pending.level = section.level;
    }
    pending.parts.push(section.content);
    pending.size += cost;
  }

  flushPending();
  return out;
}
|
|
207
|
+
|
|
208
|
+
/**
 * Builds a Chunk record.
 *
 * @param headings Heading breadcrumb; copied, so later changes to the argument do not affect the chunk.
 * @param level Heading level stored as `heading_level`.
 * @param content Chunk text; its length becomes `char_count`.
 * @returns The chunk, titled with the last breadcrumb entry ("" when the breadcrumb is empty).
 */
function makeChunk(headings: string[], level: number, content: string): Chunk {
  const lastHeading = headings[headings.length - 1];
  return {
    heading_path: headings.slice(),
    heading_level: level,
    title: lastHeading ?? "",
    content,
    char_count: content.length,
  };
}
|
|
212
|
+
|
|
213
|
+
// ── Embedding ──────────────────────────────────────────────────────────────
|
|
214
|
+
|
|
215
|
+
/** Total number of attempts per embedding request (the first try included). */
const EMBEDDING_MAX_RETRIES = 3;
/** Delay before the second attempt; it doubles after each further failure (500ms, 1s, ...). */
const EMBEDDING_INITIAL_BACKOFF_MS = 500;

/**
 * Embeds a batch of texts with the OpenAI embeddings endpoint.
 *
 * Retry policy: network failures, unreadable responses, HTTP 5xx and HTTP 429
 * (rate limiting) are retried with exponential backoff, up to
 * EMBEDDING_MAX_RETRIES attempts in total. Other 4xx responses are permanent
 * and thrown immediately. No delay is spent after the final attempt.
 *
 * @param texts Inputs to embed; an empty list returns [] without calling the API.
 * @param apiKey OpenAI API key sent as a bearer token.
 * @returns One embedding per input, in input order.
 * @throws Error prefixed "OpenAI embedding error" for HTTP failures and for
 *   responses that do not contain exactly one embedding per input; otherwise
 *   the last network/parsing error once all attempts are used up.
 */
async function embedBatch(texts: string[], apiKey: string): Promise<number[][]> {
  if (texts.length === 0) return [];

  let lastError: Error | null = null;

  for (let attempt = 0; attempt < EMBEDDING_MAX_RETRIES; attempt++) {
    // Human-readable reason for this attempt's failure (used in the log line).
    let failure = "";

    try {
      const response = await fetch(OPENAI_EMBEDDING_URL, {
        method: "POST",
        headers: {
          "Authorization": `Bearer ${apiKey}`,
          "Content-Type": "application/json",
        },
        body: JSON.stringify({
          model: OPENAI_MODEL,
          input: texts,
          dimensions: EMBEDDING_DIMENSIONS,
        }),
      });

      if (response.ok) {
        const payload = (await response.json()) as
          | { data?: { index: number; embedding: number[] }[] }
          | null;
        const rawItems = payload?.data;
        const items = Array.isArray(rawItems) ? rawItems : [];
        if (items.length !== texts.length) {
          // Callers index embeddings by chunk position; a short or malformed
          // response must not be stored silently. The prefix marks it permanent.
          throw new Error(
            `OpenAI embedding error: expected ${texts.length} embeddings, received ${items.length}`,
          );
        }
        if (attempt > 0) {
          console.info(`Embedding API succeeded on retry ${attempt}`);
        }
        // Sort a copy so the parsed payload is not mutated.
        return [...items]
          .sort((a, b) => a.index - b.index)
          .map((item) => item.embedding);
      }

      const detail = await response.text();
      const httpError = new Error(`OpenAI embedding error ${response.status}: ${detail}`);
      // 429 is transient (rate limiting); every other 4xx is a caller error.
      if (response.status < 500 && response.status !== 429) {
        throw httpError;
      }
      lastError = httpError;
      failure = `returned ${response.status}`;
    } catch (err) {
      // Permanent errors raised above carry this prefix — do not retry them.
      if (err instanceof Error && err.message.startsWith("OpenAI embedding error")) {
        throw err;
      }
      lastError = err instanceof Error ? err : new Error(String(err));
      failure = `request failed: ${lastError.message}`;
    }

    const attemptLabel = `attempt ${attempt + 1}/${EMBEDDING_MAX_RETRIES}`;
    if (attempt === EMBEDDING_MAX_RETRIES - 1) {
      console.warn(`Embedding API ${failure} (${attemptLabel}), giving up`);
      break;
    }

    const backoff = EMBEDDING_INITIAL_BACKOFF_MS * Math.pow(2, attempt);
    console.warn(`Embedding API ${failure} (${attemptLabel}), retrying in ${backoff}ms`);
    await new Promise((resolve) => setTimeout(resolve, backoff));
  }

  throw lastError ?? new Error(`Embedding API failed after ${EMBEDDING_MAX_RETRIES} attempts`);
}
|
|
273
|
+
|
|
274
|
+
// ── Content normalisation + hash (SHA-256 hex) ────────────────────────────
|
|
275
|
+
// Must stay in sync with pipeline.py::_normalize / _hash.
|
|
276
|
+
// Converts CRLF (and bare CR) to LF, strips leading/trailing whitespace, and
|
|
277
|
+
// collapses 3+ consecutive newlines to two. The CRLF step is required because
|
|
278
|
+
// browsers submit textarea content with CRLF per the HTML spec, so a document
|
|
279
|
+
// first ingested via CLI/MCP (LF) must hash identically after a web edit.
|
|
280
|
+
|
|
281
|
+
/**
 * Canonicalises content before hashing: strips surrounding whitespace, turns
 * CRLF and bare CR into LF, and collapses runs of three or more newlines to two.
 *
 * @param text Raw document content.
 * @returns The normalised text.
 */
function normalizeContent(text: string): string {
  const unixNewlines = text.trim().replace(/\r\n?/g, "\n");
  return unixNewlines.replace(/\n{3,}/g, "\n\n");
}
|
|
284
|
+
|
|
285
|
+
/**
 * Computes the SHA-256 digest of the UTF-8 encoding of `text`.
 *
 * @param text Input string.
 * @returns The digest as 64 lowercase hexadecimal characters.
 */
async function sha256hex(text: string): Promise<string> {
  const digest = new Uint8Array(
    await crypto.subtle.digest("SHA-256", new TextEncoder().encode(text)),
  );
  let hex = "";
  for (let i = 0; i < digest.length; i++) {
    hex += digest[i].toString(16).padStart(2, "0");
  }
  return hex;
}
|
|
292
|
+
|
|
293
|
+
// ── Non-destructive project membership helper ─────────────────────────────
|
|
294
|
+
//
|
|
295
|
+
// Per issue #38: on UPDATE flows, passing project_name must not silently
|
|
296
|
+
// strip existing memberships. Semantics:
|
|
297
|
+
// - Look up (or create) the project by name → project_id.
|
|
298
|
+
// - If (document_id, project_id) row already exists → no-op (idempotent).
|
|
299
|
+
// - Otherwise INSERT a new row, preserving all other existing memberships.
|
|
300
|
+
//
|
|
301
|
+
// Used by both update branches AND the create path so resolution is consistent.
|
|
302
|
+
|
|
303
|
+
/**
 * Escapes the LIKE/ILIKE metacharacters (`\`, `%`, `_`) so that `value` is
 * matched literally (case-insensitively) rather than as a pattern.
 *
 * NOTE(review): PostgREST documents `*` as an alias for `%` in like/ilike
 * filters; names containing `*` may still match loosely — confirm if such
 * names are expected.
 */
function escapeIlikeLiteral(value: string): string {
  return value.replace(/[\\%_]/g, (ch) => "\\" + ch);
}

/**
 * Resolves a project name to its id, creating the project when no project
 * with that name (compared case-insensitively) exists.
 *
 * If the insert does not return a row (for example because a concurrent
 * request created the same project first), the lookup is repeated once before
 * giving up.
 *
 * @returns The project id, or null when it could be neither found nor created.
 */
async function resolveOrCreateProjectId(
  // deno-lint-ignore no-explicit-any
  supabase: any,
  projectName: string,
): Promise<string | null> {
  const lookup = async (): Promise<string | null> => {
    const { data, error } = await supabase
      .from("cerefox_projects")
      .select("id")
      .ilike("name", escapeIlikeLiteral(projectName))
      .limit(1);
    if (error) console.warn("resolveOrCreateProjectId: lookup failed", error);
    return data?.[0]?.id ?? null;
  };

  const existingId = await lookup();
  if (existingId) return existingId;

  const { data: created, error: createErr } = await supabase
    .from("cerefox_projects")
    .insert({ name: projectName })
    .select("id");
  if (created?.[0]?.id) return created[0].id;

  if (createErr) console.warn("resolveOrCreateProjectId: create failed", createErr);
  return await lookup();
}

/**
 * Non-destructively adds a document to a project.
 *
 * The project is looked up by name and created if absent. If the
 * (document, project) membership row already exists nothing is changed;
 * otherwise a row is inserted. All other memberships are left untouched.
 *
 * @param supabase Supabase client.
 * @param documentId Document to add.
 * @param projectName Project name (matched case-insensitively).
 * @returns The project id, or null when the project could not be resolved.
 */
async function ensureDocumentInProject(
  // deno-lint-ignore no-explicit-any
  supabase: any,
  documentId: string,
  projectName: string,
): Promise<string | null> {
  const projectId = await resolveOrCreateProjectId(supabase, projectName);
  if (!projectId) return null;

  // Check membership; INSERT only if missing.
  const { data: existing } = await supabase
    .from("cerefox_document_projects")
    .select("document_id")
    .eq("document_id", documentId)
    .eq("project_id", projectId)
    .limit(1);
  if (existing?.length) return projectId; // Already a member — non-destructive

  const { error: insertErr } = await supabase
    .from("cerefox_document_projects")
    .insert({ document_id: documentId, project_id: projectId });

  // A unique violation (SQLSTATE 23505) means a concurrent call inserted the
  // same membership first; the outcome is identical, so it is not reported.
  const alreadyMember =
    insertErr?.code === "23505" ||
    String(insertErr?.message ?? "").includes("duplicate key");
  if (insertErr && !alreadyMember) {
    console.warn("ensureDocumentInProject: insert failed", insertErr);
  }
  return projectId;
}

// ── Destructive set-the-full-list helper (project_names list form) ─────────
//
// Resolves each name to a project_id (creating if absent), then REPLACES the
// document's project memberships with exactly that set. Used by the
// project_names: string[] form on cerefox_ingest (full-set semantics).
//
// Empty list = remove from all projects.

/**
 * Replaces a document's project memberships with the projects named in
 * `projectNames` (DELETE then INSERT).
 *
 * Names resolving to the same project (e.g. "Alpha" and "alpha") are collapsed
 * to one id: inserting the same (document, project) pair twice would make the
 * whole insert fail after the old memberships were already deleted.
 *
 * The two statements are separate requests, so a failure between them cannot
 * be rolled back here; failures are logged.
 *
 * @param supabase Supabase client.
 * @param documentId Document whose memberships are replaced.
 * @param projectNames Target project names; blank entries are skipped.
 * @returns The distinct project ids that were resolved, in first-seen order.
 */
async function setDocumentProjectsByName(
  // deno-lint-ignore no-explicit-any
  supabase: any,
  documentId: string,
  projectNames: string[],
): Promise<string[]> {
  const uniqueIds = new Set<string>();
  for (const name of projectNames) {
    if (!name) continue;
    const id = await resolveOrCreateProjectId(supabase, name);
    if (id) {
      uniqueIds.add(id);
    } else {
      console.warn(`setDocumentProjectsByName: could not resolve project "${name}"`);
    }
  }
  const projectIds = [...uniqueIds];

  // DELETE-then-INSERT replace (matches Python assign_document_projects).
  const deleteResult = await supabase
    .from("cerefox_document_projects")
    .delete()
    .eq("document_id", documentId);
  if (deleteResult?.error) {
    console.warn("setDocumentProjectsByName: delete failed", deleteResult.error);
  }

  if (projectIds.length > 0) {
    const rows = projectIds.map((pid) => ({ document_id: documentId, project_id: pid }));
    const insertResult = await supabase.from("cerefox_document_projects").insert(rows);
    if (insertResult?.error) {
      console.warn("setDocumentProjectsByName: insert failed", insertResult.error);
    }
  }
  return projectIds;
}
|
|
394
|
+
|
|
395
|
+
// ── Main handler ───────────────────────────────────────────────────────────
|
|
396
|
+
|
|
397
|
+
Deno.serve(async (req: Request) => {
|
|
398
|
+
if (req.method === "OPTIONS") {
|
|
399
|
+
return new Response(null, {
|
|
400
|
+
headers: {
|
|
401
|
+
"Access-Control-Allow-Origin": "*",
|
|
402
|
+
"Access-Control-Allow-Headers": "authorization, x-client-info, apikey, content-type",
|
|
403
|
+
},
|
|
404
|
+
});
|
|
405
|
+
}
|
|
406
|
+
|
|
407
|
+
if (isVersionRequest(req)) {
|
|
408
|
+
return versionResponse("cerefox-ingest", {
|
|
409
|
+
"Content-Type": "application/json",
|
|
410
|
+
"Access-Control-Allow-Origin": "*",
|
|
411
|
+
});
|
|
412
|
+
}
|
|
413
|
+
|
|
414
|
+
if (req.method !== "POST") {
|
|
415
|
+
return new Response(JSON.stringify({ error: "POST required" }), {
|
|
416
|
+
status: 405,
|
|
417
|
+
headers: { "Content-Type": "application/json" },
|
|
418
|
+
});
|
|
419
|
+
}
|
|
420
|
+
|
|
421
|
+
let body: IngestRequest;
|
|
422
|
+
try {
|
|
423
|
+
body = await req.json();
|
|
424
|
+
} catch {
|
|
425
|
+
return new Response(JSON.stringify({ error: "Invalid JSON body" }), {
|
|
426
|
+
status: 400,
|
|
427
|
+
headers: { "Content-Type": "application/json" },
|
|
428
|
+
});
|
|
429
|
+
}
|
|
430
|
+
|
|
431
|
+
const { title, content, document_id = null, project_name, source = "agent", metadata = {}, update_if_exists = false, author = "agent", author_type = "agent" } = body;
|
|
432
|
+
|
|
433
|
+
// Validate + normalize project_names if provided (full-set destructive form)
|
|
434
|
+
let project_names: string[] | null = null;
|
|
435
|
+
if (body.project_names !== undefined && body.project_names !== null) {
|
|
436
|
+
if (!Array.isArray(body.project_names)) {
|
|
437
|
+
return new Response(
|
|
438
|
+
JSON.stringify({ error: "project_names must be an array of strings; use project_name (string) for a single project" }),
|
|
439
|
+
{ status: 400, headers: { "Content-Type": "application/json" } },
|
|
440
|
+
);
|
|
441
|
+
}
|
|
442
|
+
project_names = body.project_names.filter((s): s is string => typeof s === "string" && s.length > 0);
|
|
443
|
+
}
|
|
444
|
+
|
|
445
|
+
const supabaseUrl = Deno.env.get("SUPABASE_URL")!;
|
|
446
|
+
const supabaseKey = Deno.env.get("SUPABASE_SERVICE_ROLE_KEY")!;
|
|
447
|
+
const supabase = createClient(supabaseUrl, supabaseKey);
|
|
448
|
+
|
|
449
|
+
// Configurable requestor enforcement
|
|
450
|
+
{
|
|
451
|
+
const identityField = "author";
|
|
452
|
+
const identityValue = body[identityField as keyof IngestRequest] as string | undefined;
|
|
453
|
+
const { data: reqConfig } = await supabase.rpc("cerefox_get_config", { p_key: "require_requestor_identity" });
|
|
454
|
+
if (reqConfig === "true") {
|
|
455
|
+
if (!identityValue || (typeof identityValue === "string" && identityValue.trim() === "")) {
|
|
456
|
+
return new Response(
|
|
457
|
+
JSON.stringify({ error: `Missing required parameter "${identityField}". Server requires caller identity.` }),
|
|
458
|
+
{ status: 400, headers: { "Content-Type": "application/json", "Access-Control-Allow-Origin": "*" } },
|
|
459
|
+
);
|
|
460
|
+
}
|
|
461
|
+
const { data: fmtConfig } = await supabase.rpc("cerefox_get_config", { p_key: "requestor_identity_format" });
|
|
462
|
+
if (fmtConfig && typeof fmtConfig === "string" && fmtConfig.trim() !== "") {
|
|
463
|
+
if (!new RegExp(fmtConfig).test(identityValue)) {
|
|
464
|
+
return new Response(
|
|
465
|
+
JSON.stringify({ error: `Invalid "${identityField}" format. Does not match pattern: ${fmtConfig}` }),
|
|
466
|
+
{ status: 400, headers: { "Content-Type": "application/json", "Access-Control-Allow-Origin": "*" } },
|
|
467
|
+
);
|
|
468
|
+
}
|
|
469
|
+
}
|
|
470
|
+
}
|
|
471
|
+
}
|
|
472
|
+
|
|
473
|
+
if (!title?.trim() || !content?.trim()) {
|
|
474
|
+
return new Response(JSON.stringify({ error: "title and content are required" }), {
|
|
475
|
+
status: 400,
|
|
476
|
+
headers: { "Content-Type": "application/json" },
|
|
477
|
+
});
|
|
478
|
+
}
|
|
479
|
+
|
|
480
|
+
const openaiKey = Deno.env.get("OPENAI_API_KEY");
|
|
481
|
+
if (!openaiKey) {
|
|
482
|
+
return new Response(
|
|
483
|
+
JSON.stringify({ error: "OPENAI_API_KEY secret not set on this project" }),
|
|
484
|
+
{ status: 500, headers: { "Content-Type": "application/json" } },
|
|
485
|
+
);
|
|
486
|
+
}
|
|
487
|
+
|
|
488
|
+
const contentHash = await sha256hex(normalizeContent(content));
|
|
489
|
+
const headers = { "Content-Type": "application/json", "Access-Control-Allow-Origin": "*" };
|
|
490
|
+
const reviewStatus = author_type === "agent" ? "pending_review" : "approved";
|
|
491
|
+
|
|
492
|
+
// ── ID-based update path ────────────────────────────────────────────────────
|
|
493
|
+
// When document_id is provided, update that exact document regardless of
|
|
494
|
+
// update_if_exists. Skip hash dedup -- explicit ID = explicit intent to update.
|
|
495
|
+
if (document_id) {
|
|
496
|
+
const { data: existing } = await supabase
|
|
497
|
+
.from("cerefox_documents")
|
|
498
|
+
.select("id, title, content_hash")
|
|
499
|
+
.eq("id", document_id)
|
|
500
|
+
.is("deleted_at", null)
|
|
501
|
+
.limit(1);
|
|
502
|
+
|
|
503
|
+
if (!existing?.length) {
|
|
504
|
+
return new Response(
|
|
505
|
+
JSON.stringify({ error: `Document not found: ${document_id}` }),
|
|
506
|
+
{ status: 404, headers },
|
|
507
|
+
);
|
|
508
|
+
}
|
|
509
|
+
|
|
510
|
+
const existingDoc = existing[0];
|
|
511
|
+
|
|
512
|
+
// Content unchanged -- skip re-indexing
|
|
513
|
+
if (existingDoc.content_hash === contentHash) {
|
|
514
|
+
const note = update_if_exists ? undefined : "update_if_exists flag was overridden by document_id";
|
|
515
|
+
return new Response(
|
|
516
|
+
JSON.stringify({
|
|
517
|
+
document_id: existingDoc.id,
|
|
518
|
+
title: existingDoc.title,
|
|
519
|
+
skipped: true,
|
|
520
|
+
updated: false,
|
|
521
|
+
message: "Document already up-to-date (content hash match)",
|
|
522
|
+
...(note && { note }),
|
|
523
|
+
}),
|
|
524
|
+
{ headers },
|
|
525
|
+
);
|
|
526
|
+
}
|
|
527
|
+
|
|
528
|
+
// Content changed -- re-chunk, re-embed, ingest via RPC
|
|
529
|
+
const chunks = chunkMarkdown(content);
|
|
530
|
+
if (chunks.length === 0) {
|
|
531
|
+
return new Response(JSON.stringify({ error: "Content produced no chunks" }), { status: 422, headers });
|
|
532
|
+
}
|
|
533
|
+
|
|
534
|
+
const texts = chunks.map((c) => `# ${title.trim()}\n${c.content}`);
|
|
535
|
+
let embeddings: number[][];
|
|
536
|
+
try {
|
|
537
|
+
embeddings = await embedBatch(texts, openaiKey);
|
|
538
|
+
} catch (err) {
|
|
539
|
+
return new Response(JSON.stringify({ error: String(err) }), { status: 502, headers });
|
|
540
|
+
}
|
|
541
|
+
|
|
542
|
+
const totalChars = chunks.reduce((s, c) => s + c.char_count, 0);
|
|
543
|
+
const chunkData = chunks.map((chunk, i) => ({
|
|
544
|
+
chunk_index: i,
|
|
545
|
+
heading_path: chunk.heading_path,
|
|
546
|
+
heading_level: chunk.heading_level,
|
|
547
|
+
title: chunk.title,
|
|
548
|
+
content: chunk.content,
|
|
549
|
+
char_count: chunk.char_count,
|
|
550
|
+
embedding: embeddings[i],
|
|
551
|
+
embedder: OPENAI_MODEL,
|
|
552
|
+
}));
|
|
553
|
+
|
|
554
|
+
const { error: ingestErr } = await supabase.rpc("cerefox_ingest_document", {
|
|
555
|
+
p_document_id: existingDoc.id,
|
|
556
|
+
p_title: title.trim(),
|
|
557
|
+
p_source: source,
|
|
558
|
+
p_content_hash: contentHash,
|
|
559
|
+
p_metadata: metadata,
|
|
560
|
+
p_review_status: reviewStatus,
|
|
561
|
+
p_chunks: chunkData,
|
|
562
|
+
p_author: author,
|
|
563
|
+
p_author_type: author_type,
|
|
564
|
+
p_source_label: source,
|
|
565
|
+
});
|
|
566
|
+
|
|
567
|
+
if (ingestErr) {
|
|
568
|
+
return new Response(JSON.stringify({ error: `Ingest RPC failed: ${ingestErr.message}` }), { status: 500, headers });
|
|
569
|
+
}
|
|
570
|
+
|
|
571
|
+
Promise.resolve(supabase.rpc("cerefox_log_usage", {
|
|
572
|
+
p_operation: "ingest",
|
|
573
|
+
p_access_path: "edge-function",
|
|
574
|
+
p_requestor: author,
|
|
575
|
+
p_document_id: existingDoc.id,
|
|
576
|
+
p_result_count: chunks.length,
|
|
577
|
+
})).catch(() => {});
|
|
578
|
+
|
|
579
|
+
// Project membership semantics on update (issue #38):
|
|
580
|
+
// - project_names (list) → destructive replace (full-set semantics)
|
|
581
|
+
// - project_name (singular) → non-destructive add (only if project_names absent)
|
|
582
|
+
if (project_names !== null) {
|
|
583
|
+
await setDocumentProjectsByName(supabase, existingDoc.id, project_names);
|
|
584
|
+
} else if (project_name) {
|
|
585
|
+
await ensureDocumentInProject(supabase, existingDoc.id, project_name);
|
|
586
|
+
}
|
|
587
|
+
|
|
588
|
+
const note = update_if_exists ? undefined : "update_if_exists flag was overridden by document_id";
|
|
589
|
+
return new Response(
|
|
590
|
+
JSON.stringify({
|
|
591
|
+
document_id: existingDoc.id,
|
|
592
|
+
title: title.trim(),
|
|
593
|
+
chunk_count: chunks.length,
|
|
594
|
+
total_chars: totalChars,
|
|
595
|
+
updated: true,
|
|
596
|
+
...(note && { note }),
|
|
597
|
+
}),
|
|
598
|
+
{ headers },
|
|
599
|
+
);
|
|
600
|
+
}
|
|
601
|
+
|
|
602
|
+
// ── Update-existing path ────────────────────────────────────────────────────
|
|
603
|
+
if (update_if_exists) {
|
|
604
|
+
const { data: existing } = await supabase
|
|
605
|
+
.from("cerefox_documents")
|
|
606
|
+
.select("id, title, content_hash")
|
|
607
|
+
.eq("title", title.trim())
|
|
608
|
+
.order("updated_at", { ascending: false })
|
|
609
|
+
.limit(1);
|
|
610
|
+
|
|
611
|
+
if (existing?.length) {
|
|
612
|
+
const existingDoc = existing[0];
|
|
613
|
+
|
|
614
|
+
// Content unchanged — skip re-indexing
|
|
615
|
+
if (existingDoc.content_hash === contentHash) {
|
|
616
|
+
return new Response(
|
|
617
|
+
JSON.stringify({
|
|
618
|
+
document_id: existingDoc.id,
|
|
619
|
+
title: existingDoc.title,
|
|
620
|
+
skipped: true,
|
|
621
|
+
updated: false,
|
|
622
|
+
message: "Document already up-to-date (content hash match)",
|
|
623
|
+
}),
|
|
624
|
+
{ headers },
|
|
625
|
+
);
|
|
626
|
+
}
|
|
627
|
+
|
|
628
|
+
// Content changed — re-chunk, re-embed, ingest via RPC
|
|
629
|
+
const chunks = chunkMarkdown(content);
|
|
630
|
+
if (chunks.length === 0) {
|
|
631
|
+
return new Response(JSON.stringify({ error: "Content produced no chunks" }), {
|
|
632
|
+
status: 422, headers,
|
|
633
|
+
});
|
|
634
|
+
}
|
|
635
|
+
|
|
636
|
+
// Prepend document title for contextual enrichment (stored content unchanged)
|
|
637
|
+
const texts = chunks.map((c) => `# ${title.trim()}\n${c.content}`);
|
|
638
|
+
let embeddings: number[][];
|
|
639
|
+
try {
|
|
640
|
+
embeddings = await embedBatch(texts, openaiKey);
|
|
641
|
+
} catch (err) {
|
|
642
|
+
return new Response(JSON.stringify({ error: String(err) }), { status: 502, headers });
|
|
643
|
+
}
|
|
644
|
+
|
|
645
|
+
const totalChars = chunks.reduce((s, c) => s + c.char_count, 0);
|
|
646
|
+
|
|
647
|
+
// Single RPC handles: snapshot version, update doc, insert chunks, set review_status, audit entry
|
|
648
|
+
const chunkData = chunks.map((chunk, i) => ({
|
|
649
|
+
chunk_index: i,
|
|
650
|
+
heading_path: chunk.heading_path,
|
|
651
|
+
heading_level: chunk.heading_level,
|
|
652
|
+
title: chunk.title,
|
|
653
|
+
content: chunk.content,
|
|
654
|
+
char_count: chunk.char_count,
|
|
655
|
+
embedding: embeddings[i],
|
|
656
|
+
embedder: OPENAI_MODEL,
|
|
657
|
+
}));
|
|
658
|
+
|
|
659
|
+
const { data: ingestResult, error: ingestErr } = await supabase.rpc("cerefox_ingest_document", {
|
|
660
|
+
p_document_id: existingDoc.id,
|
|
661
|
+
p_title: existingDoc.title,
|
|
662
|
+
p_source: source,
|
|
663
|
+
p_content_hash: contentHash,
|
|
664
|
+
p_metadata: metadata,
|
|
665
|
+
p_review_status: reviewStatus,
|
|
666
|
+
p_chunks: chunkData,
|
|
667
|
+
p_author: author,
|
|
668
|
+
p_author_type: author_type,
|
|
669
|
+
p_source_label: source,
|
|
670
|
+
});
|
|
671
|
+
|
|
672
|
+
if (ingestErr) {
|
|
673
|
+
return new Response(
|
|
674
|
+
JSON.stringify({ error: `Ingest RPC failed: ${ingestErr.message}` }),
|
|
675
|
+
{ status: 500, headers },
|
|
676
|
+
);
|
|
677
|
+
}
|
|
678
|
+
|
|
679
|
+
// Fire-and-forget usage logging for update
|
|
680
|
+
Promise.resolve(supabase.rpc("cerefox_log_usage", {
|
|
681
|
+
p_operation: "ingest",
|
|
682
|
+
p_access_path: "edge-function",
|
|
683
|
+
p_requestor: author,
|
|
684
|
+
p_document_id: existingDoc.id,
|
|
685
|
+
p_result_count: chunks.length,
|
|
686
|
+
})).catch(() => {});
|
|
687
|
+
|
|
688
|
+
// Project membership semantics on update (issue #38):
|
|
689
|
+
// - project_names (list) → destructive replace (full-set semantics)
|
|
690
|
+
// - project_name (singular) → non-destructive add (only if project_names absent)
|
|
691
|
+
if (project_names !== null) {
|
|
692
|
+
await setDocumentProjectsByName(supabase, existingDoc.id, project_names);
|
|
693
|
+
} else if (project_name) {
|
|
694
|
+
await ensureDocumentInProject(supabase, existingDoc.id, project_name);
|
|
695
|
+
}
|
|
696
|
+
|
|
697
|
+
return new Response(
|
|
698
|
+
JSON.stringify({
|
|
699
|
+
document_id: existingDoc.id,
|
|
700
|
+
title: existingDoc.title,
|
|
701
|
+
chunk_count: chunks.length,
|
|
702
|
+
total_chars: totalChars,
|
|
703
|
+
updated: true,
|
|
704
|
+
}),
|
|
705
|
+
{ headers },
|
|
706
|
+
);
|
|
707
|
+
}
|
|
708
|
+
// No match found -- fall through to normal create below
|
|
709
|
+
}
|
|
710
|
+
|
|
711
|
+
// ── Hash deduplication (normal create path) ────────────────────────────────
|
|
712
|
+
const { data: hashMatch } = await supabase
|
|
713
|
+
.from("cerefox_documents")
|
|
714
|
+
.select("id, title")
|
|
715
|
+
.eq("content_hash", contentHash)
|
|
716
|
+
.limit(1);
|
|
717
|
+
|
|
718
|
+
if (hashMatch?.length) {
|
|
719
|
+
return new Response(
|
|
720
|
+
JSON.stringify({
|
|
721
|
+
document_id: hashMatch[0].id,
|
|
722
|
+
title: hashMatch[0].title,
|
|
723
|
+
skipped: true,
|
|
724
|
+
message: "Document already exists (content hash match)",
|
|
725
|
+
}),
|
|
726
|
+
{ headers },
|
|
727
|
+
);
|
|
728
|
+
}
|
|
729
|
+
|
|
730
|
+
// Chunk the content
|
|
731
|
+
const chunks = chunkMarkdown(content);
|
|
732
|
+
if (chunks.length === 0) {
|
|
733
|
+
return new Response(JSON.stringify({ error: "Content produced no chunks" }), {
|
|
734
|
+
status: 422,
|
|
735
|
+
headers,
|
|
736
|
+
});
|
|
737
|
+
}
|
|
738
|
+
|
|
739
|
+
// Embed all chunks with title prefix for contextual enrichment (stored content unchanged)
|
|
740
|
+
const texts = chunks.map((c) => `# ${title.trim()}\n${c.content}`);
|
|
741
|
+
let embeddings: number[][];
|
|
742
|
+
try {
|
|
743
|
+
embeddings = await embedBatch(texts, openaiKey);
|
|
744
|
+
} catch (err) {
|
|
745
|
+
return new Response(JSON.stringify({ error: String(err) }), {
|
|
746
|
+
status: 502,
|
|
747
|
+
headers,
|
|
748
|
+
});
|
|
749
|
+
}
|
|
750
|
+
|
|
751
|
+
const totalChars = chunks.reduce((s, c) => s + c.char_count, 0);
|
|
752
|
+
|
|
753
|
+
// Single RPC handles: insert doc, insert chunks, set review_status, audit entry
|
|
754
|
+
const chunkData = chunks.map((chunk, i) => ({
|
|
755
|
+
chunk_index: i,
|
|
756
|
+
heading_path: chunk.heading_path,
|
|
757
|
+
heading_level: chunk.heading_level,
|
|
758
|
+
title: chunk.title,
|
|
759
|
+
content: chunk.content,
|
|
760
|
+
char_count: chunk.char_count,
|
|
761
|
+
embedding: embeddings[i],
|
|
762
|
+
embedder: OPENAI_MODEL,
|
|
763
|
+
}));
|
|
764
|
+
|
|
765
|
+
const { data: ingestResult, error: ingestErr } = await supabase.rpc("cerefox_ingest_document", {
|
|
766
|
+
p_document_id: null,
|
|
767
|
+
p_title: title.trim(),
|
|
768
|
+
p_source: source,
|
|
769
|
+
p_content_hash: contentHash,
|
|
770
|
+
p_metadata: metadata,
|
|
771
|
+
p_review_status: reviewStatus,
|
|
772
|
+
p_chunks: chunkData,
|
|
773
|
+
p_author: author,
|
|
774
|
+
p_author_type: author_type,
|
|
775
|
+
});
|
|
776
|
+
|
|
777
|
+
if (ingestErr || !ingestResult?.length) {
|
|
778
|
+
return new Response(
|
|
779
|
+
JSON.stringify({ error: `Ingest RPC failed: ${ingestErr?.message ?? "no data returned"}` }),
|
|
780
|
+
{ status: 500, headers },
|
|
781
|
+
);
|
|
782
|
+
}
|
|
783
|
+
|
|
784
|
+
const documentId = ingestResult[0].document_id;
|
|
785
|
+
|
|
786
|
+
// Project assignment on CREATE:
|
|
787
|
+
// - project_names (list) → assign all
|
|
788
|
+
// - project_name (singular) → assign one via the non-destructive helper
|
|
789
|
+
let projectId: string | null = null;
|
|
790
|
+
if (project_names !== null && project_names.length > 0) {
|
|
791
|
+
await setDocumentProjectsByName(supabase, documentId, project_names);
|
|
792
|
+
} else if (project_name) {
|
|
793
|
+
projectId = await ensureDocumentInProject(supabase, documentId, project_name);
|
|
794
|
+
}
|
|
795
|
+
|
|
796
|
+
// Fire-and-forget usage logging for ingest
|
|
797
|
+
Promise.resolve(supabase.rpc("cerefox_log_usage", {
|
|
798
|
+
p_operation: "ingest",
|
|
799
|
+
p_access_path: "edge-function",
|
|
800
|
+
p_requestor: author,
|
|
801
|
+
p_document_id: documentId,
|
|
802
|
+
p_result_count: chunks.length,
|
|
803
|
+
})).catch(() => {});
|
|
804
|
+
|
|
805
|
+
return new Response(
|
|
806
|
+
JSON.stringify({
|
|
807
|
+
document_id: documentId,
|
|
808
|
+
title: title.trim(),
|
|
809
|
+
chunk_count: chunks.length,
|
|
810
|
+
total_chars: totalChars,
|
|
811
|
+
project_id: projectId,
|
|
812
|
+
project_name: project_name ?? null,
|
|
813
|
+
}),
|
|
814
|
+
{
|
|
815
|
+
status: 201,
|
|
816
|
+
headers,
|
|
817
|
+
},
|
|
818
|
+
);
|
|
819
|
+
});
|