@cerefox/memory 0.7.1 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. package/README.md +62 -25
  2. package/dist/bin/cerefox.js +1163 -344
  3. package/dist/frontend/assets/{index-HNlMcvli.js → index-CAp2_lFX.js} +2 -2
  4. package/dist/frontend/assets/index-CAp2_lFX.js.map +1 -0
  5. package/dist/frontend/index.html +1 -1
  6. package/dist/server-assets/_shared/ef-meta/index.ts +97 -0
  7. package/dist/server-assets/_shared/embeddings/index.ts +175 -0
  8. package/dist/server-assets/_shared/mcp-tools/_chunker.ts +187 -0
  9. package/dist/server-assets/_shared/mcp-tools/_projects.ts +121 -0
  10. package/dist/server-assets/_shared/mcp-tools/_utils.ts +73 -0
  11. package/dist/server-assets/_shared/mcp-tools/audit-log.ts +95 -0
  12. package/dist/server-assets/_shared/mcp-tools/get-document.ts +73 -0
  13. package/dist/server-assets/_shared/mcp-tools/get-help-content.ts +26 -0
  14. package/dist/server-assets/_shared/mcp-tools/get-help.ts +90 -0
  15. package/dist/server-assets/_shared/mcp-tools/index.ts +67 -0
  16. package/dist/server-assets/_shared/mcp-tools/ingest.ts +315 -0
  17. package/dist/server-assets/_shared/mcp-tools/list-metadata-keys.ts +55 -0
  18. package/dist/server-assets/_shared/mcp-tools/list-projects.ts +59 -0
  19. package/dist/server-assets/_shared/mcp-tools/list-versions.ts +72 -0
  20. package/dist/server-assets/_shared/mcp-tools/metadata-search.ts +154 -0
  21. package/dist/server-assets/_shared/mcp-tools/search.ts +193 -0
  22. package/dist/server-assets/_shared/mcp-tools/set-document-projects.ts +163 -0
  23. package/dist/server-assets/_shared/mcp-tools/types.ts +92 -0
  24. package/dist/server-assets/db/migrations/0003_add_document_versions.sql +91 -0
  25. package/dist/server-assets/db/migrations/0004_add_audit_log_review_status_archived.sql +71 -0
  26. package/dist/server-assets/db/migrations/0005_metadata_search.sql +628 -0
  27. package/dist/server-assets/db/migrations/0006_usage_log.sql +255 -0
  28. package/dist/server-assets/db/migrations/0007_usage_log_requestor.sql +178 -0
  29. package/dist/server-assets/db/migrations/0008_soft_delete.sql +130 -0
  30. package/dist/server-assets/db/migrations/0009_audit_log_restore_operation.sql +20 -0
  31. package/dist/server-assets/db/migrations/0010_requestor_enforcement_config.sql +12 -0
  32. package/dist/server-assets/db/migrations/0011_title_boosting.sql +48 -0
  33. package/dist/server-assets/db/rpcs.sql +1723 -0
  34. package/dist/server-assets/db/schema.sql +380 -0
  35. package/dist/server-assets/supabase/functions/cerefox-get-audit-log/index.ts +117 -0
  36. package/dist/server-assets/supabase/functions/cerefox-get-document/index.ts +138 -0
  37. package/dist/server-assets/supabase/functions/cerefox-ingest/index.ts +819 -0
  38. package/dist/server-assets/supabase/functions/cerefox-list-projects/index.ts +96 -0
  39. package/dist/server-assets/supabase/functions/cerefox-list-versions/index.ts +113 -0
  40. package/dist/server-assets/supabase/functions/cerefox-mcp/index.ts +294 -0
  41. package/dist/server-assets/supabase/functions/cerefox-mcp/shared.ts +42 -0
  42. package/dist/server-assets/supabase/functions/cerefox-metadata/index.ts +99 -0
  43. package/dist/server-assets/supabase/functions/cerefox-metadata-search/index.ts +146 -0
  44. package/dist/server-assets/supabase/functions/cerefox-search/index.ts +382 -0
  45. package/docs/guides/connect-agents.md +58 -3
  46. package/docs/guides/migration-v0.5.md +50 -0
  47. package/package.json +3 -2
  48. package/dist/frontend/assets/index-HNlMcvli.js.map +0 -1
@@ -0,0 +1,819 @@
1
+ import "jsr:@supabase/functions-js/edge-runtime.d.ts";
2
+ import { createClient } from "jsr:@supabase/supabase-js@2";
3
+ import { isVersionRequest, versionResponse } from "../../../_shared/ef-meta/index.ts";
4
+
5
+ /**
6
+ * cerefox-ingest — Supabase Edge Function
7
+ *
8
+ * Quick-capture endpoint: accepts a markdown note, chunks it by headings,
9
+ * embeds each chunk with OpenAI, and stores everything in the knowledge base.
10
+ *
11
+ * This is the agent write path — use it for short notes captured during a
12
+ * conversation. For large batch ingestion (directories, PDFs, etc.) use the
13
+ * Python CLI: `cerefox ingest file.md`.
14
+ *
15
+ * Request body (JSON):
16
+ * title string required Document title
17
+ * content string required Markdown content
18
+ * project_name string optional Project to assign to (looked up by name, created if absent)
19
+ * source string optional Origin label (default: "agent")
20
+ * metadata object optional Arbitrary JSONB metadata
21
+ *
22
+ * Response: { document_id, title, chunk_count, project_id? }
23
+ */
24
+
25
/** OpenAI REST endpoint that converts text into embedding vectors. */
const OPENAI_EMBEDDING_URL = "https://api.openai.com/v1/embeddings";

/** Embedding model requested from OpenAI. */
const OPENAI_MODEL = "text-embedding-3-small";

/** Vector length requested through the `dimensions` field of the embeddings request. */
const EMBEDDING_DIMENSIONS = 768;

/** Largest chunk, in characters, the chunker aims to produce. */
const MAX_CHUNK_CHARS = 4000;

/** Smallest chunk size, in characters. NOTE(review): not referenced anywhere in the visible code. */
const MIN_CHUNK_CHARS = 100;
31
+
32
/**
 * JSON body accepted by the ingest endpoint.
 *
 * Only `title` and `content` are mandatory; every other field is optional.
 */
interface IngestRequest {
  /** Document title. */
  title: string;
  /** Markdown content to chunk, embed and store. */
  content: string;
  /** When present, the document with this id is updated in place. */
  document_id?: string;
  /** Single project to add the document to (looked up by name). */
  project_name?: string;
  /** Full-set semantics; wins over project_name when both provided. */
  project_names?: string[];
  /** Origin label. */
  source?: string;
  /** Arbitrary metadata stored with the document. */
  metadata?: Record<string, unknown>;
  /** When true, an existing document with the same title is updated instead of creating a new one. */
  update_if_exists?: boolean;
  /** Identity of the caller. */
  author?: string;
  /** 'user' | 'agent' */
  author_type?: string;
}
44
+
45
/** One storable piece of a document, as produced by the chunker. */
interface Chunk {
  /** Heading breadcrumb that anchors this chunk (empty when there is none). */
  heading_path: string[];
  /** Markdown level of the anchoring heading; 0 when the chunk has no heading. */
  heading_level: number;
  /** Last entry of `heading_path`, or "" when the path is empty. */
  title: string;
  /** Markdown text of the chunk. */
  content: string;
  /** Length of `content` in UTF-16 code units. */
  char_count: number;
}
52
+
53
+ // ── Heading-aware chunker (mirrors Python logic) ───────────────────────────
54
+ //
55
+ // Design notes:
56
+ // • Short-circuit for small documents: if the entire document fits within
57
+ // MAX_CHUNK_CHARS, it is returned as a single chunk with no splitting.
58
+ // • Greedy accumulation: sections are collected into a buffer until adding
59
+ // the next would exceed MAX_CHUNK_CHARS. This keeps chunks close to the
60
+ // target size and avoids many tiny fragments at every heading boundary.
61
+ // All heading levels (H1/H2/H3) are treated equally — size alone controls
62
+ // when a chunk is flushed; there are no hard heading-level boundaries.
63
+ // • Oversized sections (> MAX_CHUNK_CHARS) are paragraph-split with no overlap.
64
+ // • The first section's heading metadata anchors each chunk's breadcrumb.
65
+ // • No overlaps between chunks — the heading breadcrumb in the content
66
+ // provides sufficient context. Overlaps caused duplication on reconstruction.
67
+
68
/** One heading-delimited slice of a markdown document. */
interface Section {
  level: number; // 0 for text before the first heading, otherwise 1-3
  headings: string[]; // full heading stack at this section
  heading: string; // just the current heading text
  content: string; // heading line + body
  body: string; // body only (no heading line)
}

/**
 * Splits markdown into sections at H1/H2/H3 headings.
 *
 * Each section carries its heading breadcrumb, its own heading text, the
 * trimmed body, and `content` (the heading line re-rendered above the body).
 * Text before the first heading becomes a level-0 section. Sections whose
 * content is empty are dropped.
 *
 * Two details that matter for correct breadcrumbs:
 *  - Headings are tracked in one slot per level, so a missing parent level
 *    never lets a sibling heading be mistaken for an ancestor (e.g. two H2s
 *    without an H1 yield ["A"] and ["B"], not ["A"] and ["A", "B"]).
 *  - Lines inside fenced code blocks (``` or ~~~) are always body text, so a
 *    shell or Python comment such as `# install deps` is not read as an H1.
 *    An unclosed fence runs to the end of the document.
 *
 * @param text Markdown source; lines are separated by "\n".
 * @returns Sections in document order.
 */
function parseSections(text: string): Section[] {
  const sections: Section[] = [];
  // slots[i] holds the active heading text for level i + 1 ("" when unset).
  const slots: string[] = ["", "", ""];
  let currentLevel = 0;
  let bodyLines: string[] = [];
  // Describes the currently open code fence, or null when outside any fence.
  let openFence: { marker: string; length: number } | null = null;

  function collectSection(): void {
    const body = bodyLines.join("\n").trim();
    bodyLines = [];
    const heading = currentLevel > 0 ? slots[currentLevel - 1] : "";
    let content: string;
    if (currentLevel > 0) {
      const headerLine = "#".repeat(currentLevel) + " " + heading;
      content = body ? headerLine + "\n\n" + body : headerLine;
    } else {
      content = body;
    }
    if (!content.trim()) return;
    sections.push({
      level: currentLevel,
      headings: slots.filter(Boolean),
      heading,
      content,
      body,
    });
  }

  for (const line of text.split("\n")) {
    const fence = line.match(/^ {0,3}(`{3,}|~{3,})/);

    if (openFence) {
      // A fence closes on a run of the same marker, at least as long as the
      // opener, followed by nothing but whitespace.
      if (
        fence &&
        fence[1][0] === openFence.marker &&
        fence[1].length >= openFence.length &&
        line.slice(fence[0].length).trim() === ""
      ) {
        openFence = null;
      }
      bodyLines.push(line);
      continue;
    }

    if (fence) {
      const marker = fence[1][0];
      const info = line.slice(fence[0].length);
      // A backtick run whose trailing text contains a backtick is inline code, not a fence.
      if (marker !== "`" || !info.includes("`")) {
        openFence = { marker, length: fence[1].length };
      }
      bodyLines.push(line);
      continue;
    }

    // One to three '#', a single space, then at least one character.
    const headingMatch = line.match(/^(#{1,3}) (.+)/);
    if (!headingMatch) {
      bodyLines.push(line);
      continue;
    }

    // Close the section that was being accumulated before switching headings.
    collectSection();
    const level = headingMatch[1].length;
    slots[level - 1] = headingMatch[2].trim();
    // A new heading invalidates every deeper heading that was active.
    for (let deeper = level; deeper < slots.length; deeper++) slots[deeper] = "";
    currentLevel = level;
  }

  collectSection();
  return sections;
}
131
+
132
/**
 * Turns a markdown document into size-bounded chunks.
 *
 * - Blank input yields no chunks.
 * - A document that fits within MAX_CHUNK_CHARS is returned as one chunk
 *   with an empty heading path.
 * - Otherwise sections are packed greedily: consecutive sections are joined
 *   with a blank line until the next one would push the chunk past
 *   MAX_CHUNK_CHARS. The first section in a chunk supplies its heading path
 *   and level.
 * - A section that is itself larger than MAX_CHUNK_CHARS is split on blank
 *   lines; the heading line is kept on the first piece only.
 *
 * @param text Raw markdown.
 * @returns Chunks in document order.
 */
function chunkMarkdown(text: string): Chunk[] {
  const source = text.trim();
  if (source.length === 0) return [];

  // Small document: no heading analysis needed.
  if (source.length <= MAX_CHUNK_CHARS) {
    return [makeChunk([], 0, source)];
  }

  const out: Chunk[] = [];

  // Sections waiting to be emitted together as a single chunk.
  const pending = {
    parts: [] as string[],
    headings: [] as string[],
    level: 0,
    chars: 0,
  };

  const emitPending = (): void => {
    if (pending.parts.length === 0) return;
    out.push(makeChunk(pending.headings, pending.level, pending.parts.join("\n\n")));
    pending.parts = [];
    pending.headings = [];
    pending.level = 0;
    pending.chars = 0;
  };

  // Splits one oversized section on blank lines and emits the pieces directly.
  const emitOversized = (
    level: number,
    headings: string[],
    heading: string,
    content: string,
    body: string,
  ): void => {
    const headerPrefix = level > 0 ? "#".repeat(level) + " " + heading + "\n\n" : "";
    const paragraphs = (body || content).split(/\n\n+/);
    let piece = "";
    paragraphs.forEach((para, index) => {
      // Only the very first paragraph is preceded by the heading line.
      const lead = index === 0 ? headerPrefix : "";
      const overflows = piece.length + lead.length + para.length + 2 > MAX_CHUNK_CHARS;
      if (overflows && piece.length > 0) {
        out.push(makeChunk(headings, level, piece.trim()));
        piece = para;
      } else if (piece) {
        piece = piece + "\n\n" + para;
      } else {
        piece = lead + para;
      }
    });
    if (piece.trim()) out.push(makeChunk(headings, level, piece.trim()));
  };

  for (const { level, headings, heading, content, body } of parseSections(source)) {
    if (content.length > MAX_CHUNK_CHARS) {
      emitPending();
      emitOversized(level, headings, heading, content, body);
      continue;
    }

    // The two extra characters account for the "\n\n" joiner between parts.
    const cost = content.length + (pending.parts.length > 0 ? 2 : 0);
    const fits = pending.chars + cost <= MAX_CHUNK_CHARS;

    if (!fits) {
      emitPending();
      pending.parts = [content];
      pending.headings = headings;
      pending.level = level;
      pending.chars = content.length;
      continue;
    }

    if (pending.parts.length === 0) {
      pending.headings = headings;
      pending.level = level;
    }
    pending.parts.push(content);
    pending.chars += cost;
  }

  emitPending();
  return out;
}
207
+
208
/**
 * Builds a Chunk record from a heading breadcrumb and its text.
 *
 * @param headings Heading breadcrumb; copied, so the caller's array is not shared.
 * @param level Heading level to record on the chunk.
 * @param content Chunk text.
 * @returns A chunk whose title is the last breadcrumb entry ("" when there is none)
 *          and whose char_count is the length of `content`.
 */
function makeChunk(headings: string[], level: number, content: string): Chunk {
  const lastHeading = headings[headings.length - 1];
  const result: Chunk = {
    heading_path: headings.slice(),
    heading_level: level,
    title: lastHeading ?? "",
    content,
    char_count: content.length,
  };
  return result;
}
212
+
213
+ // ── Embedding ──────────────────────────────────────────────────────────────
214
+
215
/** Total number of attempts made against the embeddings endpoint. */
const EMBEDDING_MAX_RETRIES = 3;
/** Delay before the first retry; doubled before each further retry. */
const EMBEDDING_INITIAL_BACKOFF_MS = 500;

/**
 * Embeds a batch of texts with a single OpenAI embeddings request.
 *
 * Transient failures are retried with exponential backoff: network or parsing
 * errors, HTTP 5xx, HTTP 429 (rate limited) and HTTP 408 (request timeout).
 * Any other HTTP error is treated as permanent and thrown immediately. No
 * delay is taken after the final attempt.
 *
 * @param texts Inputs to embed; the result is aligned with this array.
 * @param apiKey OpenAI API key sent as a bearer token.
 * @returns One vector per input, ordered by the `index` field of the response.
 * @throws Error with message "OpenAI embedding error <status>: <body>" for HTTP
 *         failures, or the last transient error once all attempts are used up.
 */
async function embedBatch(texts: string[], apiKey: string): Promise<number[][]> {
  let lastError: Error | null = null;

  for (let attempt = 0; attempt < EMBEDDING_MAX_RETRIES; attempt++) {
    let failure: Error;
    let summary: string;
    let retryable = true;

    try {
      const response = await fetch(OPENAI_EMBEDDING_URL, {
        method: "POST",
        headers: {
          "Authorization": `Bearer ${apiKey}`,
          "Content-Type": "application/json",
        },
        body: JSON.stringify({
          model: OPENAI_MODEL,
          input: texts,
          dimensions: EMBEDDING_DIMENSIONS,
        }),
      });

      if (response.ok) {
        const data = await response.json();
        // Copy before sorting so the parsed payload is left untouched.
        const sorted = [...data.data].sort(
          (a: { index: number }, b: { index: number }) => a.index - b.index,
        );
        if (sorted.length !== texts.length) {
          // Landing in the catch below makes a short response retryable.
          throw new Error(`expected ${texts.length} embeddings, received ${sorted.length}`);
        }
        if (attempt > 0) {
          console.info(`Embedding API succeeded on retry ${attempt}`);
        }
        return sorted.map((d: { embedding: number[] }) => d.embedding);
      }

      const detail = await response.text();
      failure = new Error(`OpenAI embedding error ${response.status}: ${detail}`);
      summary = `Embedding API returned ${response.status}`;
      // 429 and 408 are transient by definition; other 4xx will not succeed on retry.
      retryable = response.status >= 500 || response.status === 429 || response.status === 408;
    } catch (err) {
      failure = err instanceof Error ? err : new Error(String(err));
      summary = `Embedding API request failed: ${failure.message}`;
    }

    if (!retryable) throw failure;
    lastError = failure;

    const attemptsLeft = EMBEDDING_MAX_RETRIES - (attempt + 1);
    if (attemptsLeft === 0) {
      console.warn(`${summary} (attempt ${attempt + 1}/${EMBEDDING_MAX_RETRIES}), giving up`);
      break;
    }

    const backoff = EMBEDDING_INITIAL_BACKOFF_MS * Math.pow(2, attempt);
    console.warn(
      `${summary} (attempt ${attempt + 1}/${EMBEDDING_MAX_RETRIES}), retrying in ${backoff}ms`,
    );
    await new Promise((r) => setTimeout(r, backoff));
  }

  throw lastError ?? new Error(`Embedding API failed after ${EMBEDDING_MAX_RETRIES} attempts`);
}
273
+
274
+ // ── Content normalisation + hash (SHA-256 hex) ────────────────────────────
275
+ // Must stay in sync with pipeline.py::_normalize / _hash.
276
+ // Converts CRLF (and bare CR) to LF, strips leading/trailing whitespace, and
277
+ // collapses 3+ consecutive newlines to two. The CRLF step is required because
278
+ // browsers submit textarea content with CRLF per the HTML spec, so a document
279
+ // first ingested via CLI/MCP (LF) must hash identically after a web edit.
280
+
281
/**
 * Canonicalises document text before hashing.
 *
 * Steps, in order: strip leading/trailing whitespace, turn CRLF into LF, turn
 * any remaining bare CR into LF, then collapse runs of three or more newlines
 * into exactly two.
 *
 * @param text Raw document content.
 * @returns The normalised text.
 */
function normalizeContent(text: string): string {
  const stripped = text.trim();
  const unixNewlines = stripped.replace(/\r\n/g, "\n").replace(/\r/g, "\n");
  const collapsed = unixNewlines.replace(/\n{3,}/g, "\n\n");
  return collapsed;
}
284
+
285
/**
 * Computes the SHA-256 digest of a string's UTF-8 encoding.
 *
 * @param text Text to hash.
 * @returns The digest as 64 lowercase hexadecimal characters.
 */
async function sha256hex(text: string): Promise<string> {
  const utf8 = new TextEncoder().encode(text);
  const digest = new Uint8Array(await crypto.subtle.digest("SHA-256", utf8));
  let hex = "";
  for (const byte of digest) {
    // Each byte becomes exactly two hex digits.
    hex += byte.toString(16).padStart(2, "0");
  }
  return hex;
}
292
+
293
+ // ── Non-destructive project membership helper ─────────────────────────────
294
+ //
295
+ // Per issue #38: on UPDATE flows, passing project_name must not silently
296
+ // strip existing memberships. Semantics:
297
+ // - Look up (or create) the project by name → project_id.
298
+ // - If (document_id, project_id) row already exists → no-op (idempotent).
299
+ // - Otherwise INSERT a new row, preserving all other existing memberships.
300
+ //
301
+ // Used by both update branches AND the create path so resolution is consistent.
302
+
303
+ // deno-lint-ignore no-explicit-any
304
/**
 * Adds a document to a project without touching its other memberships.
 *
 * The project is found by case-insensitive name and created when absent.
 * If the (document, project) pair already exists nothing is written.
 *
 * @param supabase Supabase client used for all queries.
 * @param documentId Id of the document to attach.
 * @param projectName Project name, matched literally and case-insensitively.
 * @returns The project id, or null when the project could not be resolved.
 */
async function ensureDocumentInProject(
  // deno-lint-ignore no-explicit-any
  supabase: any,
  documentId: string,
  projectName: string,
): Promise<string | null> {
  // ILIKE treats "%" and "_" as wildcards, so an unescaped name such as
  // "my_proj" would also match "myXproj". Escape them (and the escape
  // character itself) so the name is compared literally.
  // NOTE(review): PostgREST documents "*" as an alias for "%" in like/ilike
  // patterns; names containing "*" may still match loosely — confirm.
  const namePattern = projectName.replace(/[\\%_]/g, "\\$&");

  // Resolve project name → id (look up; create if absent).
  let projectId: string | null = null;
  const { data: proj, error: lookupErr } = await supabase
    .from("cerefox_projects")
    .select("id")
    .ilike("name", namePattern)
    .limit(1);
  if (lookupErr) {
    // Without a reliable lookup, creating a project could duplicate an existing one.
    console.warn("ensureDocumentInProject: project lookup failed", lookupErr);
    return null;
  }
  if (proj?.length) {
    projectId = proj[0].id;
  } else {
    const { data: newProj, error: createErr } = await supabase
      .from("cerefox_projects")
      .insert({ name: projectName })
      .select("id");
    if (createErr) {
      console.warn("ensureDocumentInProject: project creation failed", createErr);
    }
    projectId = newProj?.[0]?.id ?? null;
  }
  if (!projectId) return null;

  // Check membership; INSERT only if missing.
  const { data: existing, error: memberErr } = await supabase
    .from("cerefox_document_projects")
    .select("document_id")
    .eq("document_id", documentId)
    .eq("project_id", projectId)
    .limit(1);
  if (memberErr) {
    // Fall through to the insert; a unique violation there means "already a member".
    console.warn("ensureDocumentInProject: membership lookup failed", memberErr);
  }
  if (existing?.length) return projectId; // Already a member — non-destructive

  const { error: insertErr } = await supabase
    .from("cerefox_document_projects")
    .insert({ document_id: documentId, project_id: projectId });
  if (insertErr) {
    // 23505 is PostgreSQL's unique_violation: a concurrent call inserted the same pair first.
    const alreadyMember = insertErr.code === "23505" ||
      String(insertErr.message ?? "").includes("duplicate key");
    if (!alreadyMember) {
      console.warn("ensureDocumentInProject: insert failed", insertErr);
    }
  }
  return projectId;
}
348
+
349
+ // ── Destructive set-the-full-list helper (project_names list form) ─────────
350
+ //
351
+ // Resolves each name to a project_id (creating if absent), then REPLACES the
352
+ // document's project memberships with exactly that set. Used by the
353
+ // project_names: string[] form on cerefox_ingest (full-set semantics).
354
+ //
355
+ // Empty list = remove from all projects.
356
+
357
+ // deno-lint-ignore no-explicit-any
358
/**
 * Replaces a document's project memberships with exactly the named projects.
 *
 * Each name is resolved case-insensitively to a project id (the project is
 * created when absent). All existing membership rows of the document are then
 * deleted and one row per resolved project is inserted. An empty list removes
 * the document from every project.
 *
 * Names that resolve to the same project (for example "Foo" and "foo") are
 * collapsed to one id: a duplicate row would violate the membership primary
 * key and fail the whole insert after the delete has already run.
 *
 * Failures are logged rather than thrown.
 *
 * @param supabase Supabase client used for all queries.
 * @param documentId Id of the document whose memberships are replaced.
 * @param projectNames Desired project names; empty strings are ignored.
 * @returns The distinct project ids that were resolved, in first-seen order.
 */
async function setDocumentProjectsByName(
  // deno-lint-ignore no-explicit-any
  supabase: any,
  documentId: string,
  projectNames: string[],
): Promise<string[]> {
  // A Set keeps first-seen order while dropping repeated ids.
  const resolved = new Set<string>();

  for (const name of projectNames) {
    if (!name) continue;

    // ILIKE treats "%" and "_" as wildcards; escape them so the name matches literally.
    const namePattern = name.replace(/[\\%_]/g, "\\$&");
    const { data: proj, error: lookupErr } = await supabase
      .from("cerefox_projects")
      .select("id")
      .ilike("name", namePattern)
      .limit(1);
    if (lookupErr) {
      // Without a reliable lookup, creating a project could duplicate an existing one.
      console.warn("setDocumentProjectsByName: project lookup failed", name, lookupErr);
      continue;
    }
    if (proj?.length) {
      resolved.add(proj[0].id);
      continue;
    }

    const { data: newProj, error: createErr } = await supabase
      .from("cerefox_projects")
      .insert({ name })
      .select("id");
    if (createErr) {
      console.warn("setDocumentProjectsByName: project creation failed", name, createErr);
    }
    if (newProj?.[0]?.id) resolved.add(newProj[0].id);
  }

  const projectIds = [...resolved];

  // DELETE-then-INSERT replace (matches Python assign_document_projects).
  const { error: deleteErr } = await supabase
    .from("cerefox_document_projects")
    .delete()
    .eq("document_id", documentId);
  if (deleteErr) {
    // Old rows are still present; inserting now could collide with them.
    console.warn("setDocumentProjectsByName: clearing memberships failed", deleteErr);
    return projectIds;
  }

  if (projectIds.length > 0) {
    const rows = projectIds.map((pid) => ({ document_id: documentId, project_id: pid }));
    const { error: insertErr } = await supabase.from("cerefox_document_projects").insert(rows);
    if (insertErr) {
      console.warn("setDocumentProjectsByName: inserting memberships failed", insertErr);
    }
  }
  return projectIds;
}
394
+
395
+ // ── Main handler ───────────────────────────────────────────────────────────
396
+
397
/** Headers sent with every JSON response, so browser callers can read errors as well as successes. */
const JSON_CORS_HEADERS = {
  "Content-Type": "application/json",
  "Access-Control-Allow-Origin": "*",
};

/** Serialises `payload` as a JSON response with the shared headers. */
function jsonResponse(payload: unknown, status = 200): Response {
  return new Response(JSON.stringify(payload), { status, headers: JSON_CORS_HEADERS });
}

/** Chunks and embeds content; returns RPC-ready chunk rows, or an error Response to send as-is. */
async function prepareChunks(
  title: string,
  content: string,
  openaiKey: string,
): Promise<
  | { chunks: ReturnType<typeof chunkMarkdown>; chunkData: Record<string, unknown>[]; totalChars: number }
  | Response
> {
  const chunks = chunkMarkdown(content);
  if (chunks.length === 0) {
    return jsonResponse({ error: "Content produced no chunks" }, 422);
  }

  // Prepend document title for contextual enrichment (stored content unchanged)
  const texts = chunks.map((c) => `# ${title.trim()}\n${c.content}`);
  let embeddings: number[][];
  try {
    embeddings = await embedBatch(texts, openaiKey);
  } catch (err) {
    return jsonResponse({ error: String(err) }, 502);
  }
  if (embeddings.length !== chunks.length) {
    // Never store a chunk without its vector.
    return jsonResponse(
      { error: `Embedding count mismatch: expected ${chunks.length}, received ${embeddings.length}` },
      502,
    );
  }

  const chunkData = chunks.map((chunk, i) => ({
    chunk_index: i,
    heading_path: chunk.heading_path,
    heading_level: chunk.heading_level,
    title: chunk.title,
    content: chunk.content,
    char_count: chunk.char_count,
    embedding: embeddings[i],
    embedder: OPENAI_MODEL,
  }));
  const totalChars = chunks.reduce((sum, c) => sum + c.char_count, 0);
  return { chunks, chunkData, totalChars };
}

/** Records an "ingest" usage event without waiting for it; failures are deliberately ignored. */
// deno-lint-ignore no-explicit-any
function logIngestUsage(supabase: any, requestor: string, documentId: string, chunkCount: number): void {
  Promise.resolve(supabase.rpc("cerefox_log_usage", {
    p_operation: "ingest",
    p_access_path: "edge-function",
    p_requestor: requestor,
    p_document_id: documentId,
    p_result_count: chunkCount,
  })).catch(() => {});
}

/** Applies project membership on UPDATE: a list replaces the full set, a single name only adds. */
async function applyProjectsOnUpdate(
  // deno-lint-ignore no-explicit-any
  supabase: any,
  documentId: string,
  projectNames: string[] | null,
  projectName: string | undefined,
): Promise<void> {
  if (projectNames !== null) {
    await setDocumentProjectsByName(supabase, documentId, projectNames);
  } else if (projectName) {
    await ensureDocumentInProject(supabase, documentId, projectName);
  }
}

/**
 * HTTP entry point of the ingest function.
 *
 * Flow for a POST request:
 *  1. Parse and validate the JSON body.
 *  2. Optionally enforce caller identity, driven by the
 *     `require_requestor_identity` / `requestor_identity_format` config values.
 *  3. `document_id` given → update that document (404 if absent or soft-deleted).
 *  4. `update_if_exists` → update the most recently updated live document
 *     with the same title, if any.
 *  5. Otherwise create a new document unless one with the same content hash exists.
 *
 * In the update paths, unchanged content (same hash) is reported as skipped
 * and nothing is re-embedded. Successful creates answer 201, updates and
 * skips answer 200.
 */
Deno.serve(async (req: Request) => {
  if (req.method === "OPTIONS") {
    return new Response(null, {
      headers: {
        "Access-Control-Allow-Origin": "*",
        "Access-Control-Allow-Headers": "authorization, x-client-info, apikey, content-type",
      },
    });
  }

  if (isVersionRequest(req)) {
    return versionResponse("cerefox-ingest", {
      "Content-Type": "application/json",
      "Access-Control-Allow-Origin": "*",
    });
  }

  if (req.method !== "POST") {
    return jsonResponse({ error: "POST required" }, 405);
  }

  let body: IngestRequest;
  try {
    body = await req.json();
  } catch {
    return jsonResponse({ error: "Invalid JSON body" }, 400);
  }
  // `null`, arrays and primitives are valid JSON but cannot carry the request fields.
  if (body === null || typeof body !== "object" || Array.isArray(body)) {
    return jsonResponse({ error: "Request body must be a JSON object" }, 400);
  }

  const {
    title,
    content,
    document_id = null,
    project_name,
    source = "agent",
    metadata = {},
    update_if_exists = false,
    author = "agent",
    author_type = "agent",
  } = body;

  // Validate + normalize project_names if provided (full-set destructive form)
  let project_names: string[] | null = null;
  if (body.project_names !== undefined && body.project_names !== null) {
    if (!Array.isArray(body.project_names)) {
      return jsonResponse(
        { error: "project_names must be an array of strings; use project_name (string) for a single project" },
        400,
      );
    }
    project_names = body.project_names.filter((s): s is string => typeof s === "string" && s.length > 0);
  }

  const supabaseUrl = Deno.env.get("SUPABASE_URL");
  const supabaseKey = Deno.env.get("SUPABASE_SERVICE_ROLE_KEY");
  if (!supabaseUrl || !supabaseKey) {
    return jsonResponse({ error: "SUPABASE_URL or SUPABASE_SERVICE_ROLE_KEY not set on this project" }, 500);
  }
  const supabase = createClient(supabaseUrl, supabaseKey);

  // Configurable requestor enforcement. The raw body value is inspected, not
  // the defaulted `author`, so an omitted author is detected.
  {
    const identityField = "author";
    const identityValue = body.author as string | undefined;
    const { data: reqConfig } = await supabase.rpc("cerefox_get_config", { p_key: "require_requestor_identity" });
    if (reqConfig === "true") {
      if (!identityValue || (typeof identityValue === "string" && identityValue.trim() === "")) {
        return jsonResponse(
          { error: `Missing required parameter "${identityField}". Server requires caller identity.` },
          400,
        );
      }
      const { data: fmtConfig } = await supabase.rpc("cerefox_get_config", { p_key: "requestor_identity_format" });
      if (fmtConfig && typeof fmtConfig === "string" && fmtConfig.trim() !== "") {
        let pattern: RegExp;
        try {
          pattern = new RegExp(fmtConfig);
        } catch {
          // A malformed configured pattern is a server fault, not a caller error.
          return jsonResponse(
            { error: "Server misconfiguration: requestor_identity_format is not a valid regular expression" },
            500,
          );
        }
        if (!pattern.test(identityValue)) {
          return jsonResponse(
            { error: `Invalid "${identityField}" format. Does not match pattern: ${fmtConfig}` },
            400,
          );
        }
      }
    }
  }

  // The type check guards `.trim()` against non-string JSON values such as numbers.
  if (typeof title !== "string" || typeof content !== "string" || !title.trim() || !content.trim()) {
    return jsonResponse({ error: "title and content are required" }, 400);
  }

  const openaiKey = Deno.env.get("OPENAI_API_KEY");
  if (!openaiKey) {
    return jsonResponse({ error: "OPENAI_API_KEY secret not set on this project" }, 500);
  }

  const contentHash = await sha256hex(normalizeContent(content));
  const reviewStatus = author_type === "agent" ? "pending_review" : "approved";

  // ── ID-based update path ────────────────────────────────────────────────────
  // When document_id is provided, update that exact document regardless of
  // update_if_exists. Skip hash dedup -- explicit ID = explicit intent to update.
  if (document_id) {
    const { data: existing } = await supabase
      .from("cerefox_documents")
      .select("id, title, content_hash")
      .eq("id", document_id)
      .is("deleted_at", null)
      .limit(1);

    if (!existing?.length) {
      return jsonResponse({ error: `Document not found: ${document_id}` }, 404);
    }

    const existingDoc = existing[0];
    const note = update_if_exists ? undefined : "update_if_exists flag was overridden by document_id";

    // Content unchanged -- skip re-indexing
    if (existingDoc.content_hash === contentHash) {
      return jsonResponse({
        document_id: existingDoc.id,
        title: existingDoc.title,
        skipped: true,
        updated: false,
        message: "Document already up-to-date (content hash match)",
        ...(note && { note }),
      });
    }

    // Content changed -- re-chunk, re-embed, ingest via RPC
    const prepared = await prepareChunks(title, content, openaiKey);
    if (prepared instanceof Response) return prepared;

    const { error: ingestErr } = await supabase.rpc("cerefox_ingest_document", {
      p_document_id: existingDoc.id,
      p_title: title.trim(),
      p_source: source,
      p_content_hash: contentHash,
      p_metadata: metadata,
      p_review_status: reviewStatus,
      p_chunks: prepared.chunkData,
      p_author: author,
      p_author_type: author_type,
      p_source_label: source,
    });
    if (ingestErr) {
      return jsonResponse({ error: `Ingest RPC failed: ${ingestErr.message}` }, 500);
    }

    logIngestUsage(supabase, author, existingDoc.id, prepared.chunks.length);
    await applyProjectsOnUpdate(supabase, existingDoc.id, project_names, project_name);

    return jsonResponse({
      document_id: existingDoc.id,
      title: title.trim(),
      chunk_count: prepared.chunks.length,
      total_chars: prepared.totalChars,
      updated: true,
      ...(note && { note }),
    });
  }

  // ── Update-existing path ────────────────────────────────────────────────────
  if (update_if_exists) {
    // Soft-deleted documents are excluded, matching the ID-based path; otherwise
    // an update could land on a document nobody can see.
    const { data: existing } = await supabase
      .from("cerefox_documents")
      .select("id, title, content_hash")
      .eq("title", title.trim())
      .is("deleted_at", null)
      .order("updated_at", { ascending: false })
      .limit(1);

    if (existing?.length) {
      const existingDoc = existing[0];

      // Content unchanged — skip re-indexing
      if (existingDoc.content_hash === contentHash) {
        return jsonResponse({
          document_id: existingDoc.id,
          title: existingDoc.title,
          skipped: true,
          updated: false,
          message: "Document already up-to-date (content hash match)",
        });
      }

      // Content changed — re-chunk, re-embed, ingest via RPC
      const prepared = await prepareChunks(title, content, openaiKey);
      if (prepared instanceof Response) return prepared;

      // Single RPC handles: snapshot version, update doc, insert chunks, set review_status, audit entry
      const { error: ingestErr } = await supabase.rpc("cerefox_ingest_document", {
        p_document_id: existingDoc.id,
        p_title: existingDoc.title,
        p_source: source,
        p_content_hash: contentHash,
        p_metadata: metadata,
        p_review_status: reviewStatus,
        p_chunks: prepared.chunkData,
        p_author: author,
        p_author_type: author_type,
        p_source_label: source,
      });
      if (ingestErr) {
        return jsonResponse({ error: `Ingest RPC failed: ${ingestErr.message}` }, 500);
      }

      logIngestUsage(supabase, author, existingDoc.id, prepared.chunks.length);
      await applyProjectsOnUpdate(supabase, existingDoc.id, project_names, project_name);

      return jsonResponse({
        document_id: existingDoc.id,
        title: existingDoc.title,
        chunk_count: prepared.chunks.length,
        total_chars: prepared.totalChars,
        updated: true,
      });
    }
    // No match found -- fall through to normal create below
  }

  // ── Hash deduplication (normal create path) ────────────────────────────────
  // NOTE(review): this lookup does not exclude soft-deleted rows, so identical
  // content of a deleted document is reported as already existing — confirm intent.
  const { data: hashMatch } = await supabase
    .from("cerefox_documents")
    .select("id, title")
    .eq("content_hash", contentHash)
    .limit(1);

  if (hashMatch?.length) {
    return jsonResponse({
      document_id: hashMatch[0].id,
      title: hashMatch[0].title,
      skipped: true,
      message: "Document already exists (content hash match)",
    });
  }

  const prepared = await prepareChunks(title, content, openaiKey);
  if (prepared instanceof Response) return prepared;

  // Single RPC handles: insert doc, insert chunks, set review_status, audit entry.
  // NOTE(review): unlike the update paths, no p_source_label is passed here — confirm intent.
  const { data: ingestResult, error: ingestErr } = await supabase.rpc("cerefox_ingest_document", {
    p_document_id: null,
    p_title: title.trim(),
    p_source: source,
    p_content_hash: contentHash,
    p_metadata: metadata,
    p_review_status: reviewStatus,
    p_chunks: prepared.chunkData,
    p_author: author,
    p_author_type: author_type,
  });

  if (ingestErr || !ingestResult?.length) {
    return jsonResponse({ error: `Ingest RPC failed: ${ingestErr?.message ?? "no data returned"}` }, 500);
  }

  const documentId = ingestResult[0].document_id;

  // Project assignment on CREATE:
  //   - project_names (list)    → assign all
  //   - project_name (singular) → assign one via the non-destructive helper
  let projectId: string | null = null;
  if (project_names !== null && project_names.length > 0) {
    await setDocumentProjectsByName(supabase, documentId, project_names);
  } else if (project_name) {
    projectId = await ensureDocumentInProject(supabase, documentId, project_name);
  }

  logIngestUsage(supabase, author, documentId, prepared.chunks.length);

  return jsonResponse(
    {
      document_id: documentId,
      title: title.trim(),
      chunk_count: prepared.chunks.length,
      total_chars: prepared.totalChars,
      project_id: projectId,
      project_name: project_name ?? null,
    },
    201,
  );
});