@cerefox/memory 0.7.2 → 0.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bin/cerefox.js +1357 -361
- package/dist/frontend/assets/{index-BzAPcCXA.js → index-CAp2_lFX.js} +2 -2
- package/dist/frontend/assets/index-CAp2_lFX.js.map +1 -0
- package/dist/frontend/index.html +1 -1
- package/dist/server-assets/_shared/ef-meta/index.ts +97 -0
- package/dist/server-assets/_shared/embeddings/index.ts +175 -0
- package/dist/server-assets/_shared/mcp-tools/_chunker.ts +187 -0
- package/dist/server-assets/_shared/mcp-tools/_projects.ts +121 -0
- package/dist/server-assets/_shared/mcp-tools/_utils.ts +73 -0
- package/dist/server-assets/_shared/mcp-tools/audit-log.ts +95 -0
- package/dist/server-assets/_shared/mcp-tools/get-document.ts +73 -0
- package/dist/server-assets/_shared/mcp-tools/get-help-content.ts +26 -0
- package/dist/server-assets/_shared/mcp-tools/get-help.ts +90 -0
- package/dist/server-assets/_shared/mcp-tools/index.ts +67 -0
- package/dist/server-assets/_shared/mcp-tools/ingest.ts +315 -0
- package/dist/server-assets/_shared/mcp-tools/list-metadata-keys.ts +55 -0
- package/dist/server-assets/_shared/mcp-tools/list-projects.ts +59 -0
- package/dist/server-assets/_shared/mcp-tools/list-versions.ts +72 -0
- package/dist/server-assets/_shared/mcp-tools/metadata-search.ts +154 -0
- package/dist/server-assets/_shared/mcp-tools/search.ts +193 -0
- package/dist/server-assets/_shared/mcp-tools/set-document-projects.ts +163 -0
- package/dist/server-assets/_shared/mcp-tools/types.ts +92 -0
- package/dist/server-assets/db/migrations/0003_add_document_versions.sql +91 -0
- package/dist/server-assets/db/migrations/0004_add_audit_log_review_status_archived.sql +71 -0
- package/dist/server-assets/db/migrations/0005_metadata_search.sql +628 -0
- package/dist/server-assets/db/migrations/0006_usage_log.sql +255 -0
- package/dist/server-assets/db/migrations/0007_usage_log_requestor.sql +178 -0
- package/dist/server-assets/db/migrations/0008_soft_delete.sql +130 -0
- package/dist/server-assets/db/migrations/0009_audit_log_restore_operation.sql +20 -0
- package/dist/server-assets/db/migrations/0010_requestor_enforcement_config.sql +12 -0
- package/dist/server-assets/db/migrations/0011_title_boosting.sql +48 -0
- package/dist/server-assets/db/rpcs.sql +1723 -0
- package/dist/server-assets/db/schema.sql +380 -0
- package/dist/server-assets/supabase/functions/cerefox-get-audit-log/index.ts +117 -0
- package/dist/server-assets/supabase/functions/cerefox-get-document/index.ts +138 -0
- package/dist/server-assets/supabase/functions/cerefox-ingest/index.ts +819 -0
- package/dist/server-assets/supabase/functions/cerefox-list-projects/index.ts +96 -0
- package/dist/server-assets/supabase/functions/cerefox-list-versions/index.ts +113 -0
- package/dist/server-assets/supabase/functions/cerefox-mcp/index.ts +294 -0
- package/dist/server-assets/supabase/functions/cerefox-mcp/shared.ts +42 -0
- package/dist/server-assets/supabase/functions/cerefox-metadata/index.ts +99 -0
- package/dist/server-assets/supabase/functions/cerefox-metadata-search/index.ts +146 -0
- package/dist/server-assets/supabase/functions/cerefox-search/index.ts +382 -0
- package/docs/guides/connect-agents.md +78 -3
- package/docs/guides/migration-v0.5.md +50 -0
- package/docs/guides/quickstart.md +6 -2
- package/package.json +3 -2
- package/dist/frontend/assets/index-BzAPcCXA.js.map +0 -1
package/dist/frontend/index.html
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
<link rel="icon" type="image/png" href="/app/cerefox_icon.png" />
|
|
6
6
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
|
7
7
|
<title>Cerefox</title>
|
|
8
|
-
<script type="module" crossorigin src="/app/assets/index-
|
|
8
|
+
<script type="module" crossorigin src="/app/assets/index-CAp2_lFX.js"></script>
|
|
9
9
|
<link rel="stylesheet" crossorigin href="/app/assets/index-DoDJGRih.css">
|
|
10
10
|
</head>
|
|
11
11
|
<body>
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared Edge Function metadata (iter-26 Part 26B).
|
|
3
|
+
*
|
|
4
|
+
* Every Cerefox Edge Function answers `GET <ef>/version` with
|
|
5
|
+
* `{ name, version }` so clients can detect server↔client drift
|
|
6
|
+
* (see `_shared/compatibility/`). `cerefox-mcp` additionally exposes an
|
|
7
|
+
* aggregator at `GET cerefox-mcp/version?peers=true` that probes every
|
|
8
|
+
* peer EF + the Postgres schema version, so `cerefox doctor` learns the
|
|
9
|
+
* whole server-side version picture in one round-trip.
|
|
10
|
+
*
|
|
11
|
+
* This module is Deno-runtime safe (no `node:` imports) so it can be
|
|
12
|
+
* imported by the EFs as well as the Node/Bun local client. It is one of
|
|
13
|
+
* the `_shared` subtrees bundled into the npm package's
|
|
14
|
+
* `dist/server-assets/_shared/` (Part 26A) so EFs deploy with it intact.
|
|
15
|
+
*
|
|
16
|
+
* `EF_VERSION` is bumped by `scripts/cut_release.ts` when EF source
|
|
17
|
+
* actually changed since the previous tag (guarded — a release that
|
|
18
|
+
* doesn't touch `supabase/functions/` leaves it alone).
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
export const EF_VERSION = "0.8.0";
|
|
22
|
+
|
|
23
|
+
/**
|
|
24
|
+
* The 8 peer EFs the cerefox-mcp aggregator probes (excludes cerefox-mcp
|
|
25
|
+
* itself). Order is the probe order.
|
|
26
|
+
*/
|
|
27
|
+
export const PEER_EF_NAMES = [
|
|
28
|
+
"cerefox-search",
|
|
29
|
+
"cerefox-ingest",
|
|
30
|
+
"cerefox-metadata",
|
|
31
|
+
"cerefox-get-document",
|
|
32
|
+
"cerefox-list-versions",
|
|
33
|
+
"cerefox-get-audit-log",
|
|
34
|
+
"cerefox-metadata-search",
|
|
35
|
+
"cerefox-list-projects",
|
|
36
|
+
] as const;
|
|
37
|
+
|
|
38
|
+
export interface EfVersionPayload {
|
|
39
|
+
name: string;
|
|
40
|
+
version: string;
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
/** A peer probe result for the aggregator response. */
|
|
44
|
+
export interface PeerVersion {
|
|
45
|
+
name: string;
|
|
46
|
+
version: string;
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
export interface PeerError {
|
|
50
|
+
name: string;
|
|
51
|
+
error: string;
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
export interface AggregatedVersions {
|
|
55
|
+
/** This EF (cerefox-mcp). */
|
|
56
|
+
name: string;
|
|
57
|
+
version: string;
|
|
58
|
+
/** Deployed Postgres schema version, or null if the probe failed. */
|
|
59
|
+
schema: string | null;
|
|
60
|
+
/** Successfully-probed peer EFs. */
|
|
61
|
+
efs: PeerVersion[];
|
|
62
|
+
/** Peers that failed to respond (timeout, 404, network). */
|
|
63
|
+
errors: PeerError[];
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
/**
 * Reports whether `req` is a version probe: an HTTP GET whose URL path
 * ends in `/version`. The query string plays no part in the decision.
 */
export function isVersionRequest(req: Request): boolean {
  const isGet = req.method === "GET";
  // Short-circuit keeps the original behaviour of not parsing the URL for non-GET requests.
  return isGet && new URL(req.url).pathname.endsWith("/version");
}
|
|
72
|
+
|
|
73
|
+
/**
 * Reports whether the caller asked for the aggregated view, i.e. the
 * request URL carries the query parameter `peers` with the exact value
 * `"true"` (case-sensitive).
 */
export function wantsPeers(req: Request): boolean {
  const { searchParams } = new URL(req.url);
  const peersFlag = searchParams.get("peers");
  return peersFlag === "true";
}
|
|
77
|
+
|
|
78
|
+
/**
 * Builds the 200 response for a single Edge Function's `/version` probe.
 *
 * @param name    The function's name, echoed back in the payload.
 * @param headers Response headers, passed through unchanged.
 * @returns A response whose body is the JSON `{ name, version }`, with
 *          `version` taken from `EF_VERSION`.
 */
export function versionResponse(
  name: string,
  headers: Record<string, string>,
): Response {
  const payload: EfVersionPayload = { name, version: EF_VERSION };
  const body = JSON.stringify(payload);
  return new Response(body, { headers, status: 200 });
}
|
|
86
|
+
|
|
87
|
+
/**
 * Computes the `/version` URL of a sibling Edge Function from the URL of
 * a request that reached cerefox-mcp.
 *
 * The path is cut at its `/cerefox-mcp` segment (that segment and anything
 * after it are dropped), then `/<peerName>/version` is appended. Origin and
 * any path prefix before `/cerefox-mcp` are kept; the query string is not.
 * When the path has no `/cerefox-mcp` segment, the whole path is used as
 * the base.
 */
export function peerVersionUrl(reqUrl: string, peerName: string): string {
  const { origin, pathname } = new URL(reqUrl);
  const functionsBase = pathname.replace(/\/cerefox-mcp(\/.*)?$/, "");
  return [origin, functionsBase, "/", peerName, "/version"].join("");
}
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared OpenAI-compatible embedding client.
|
|
3
|
+
*
|
|
4
|
+
* Used by `_shared/mcp-tools/search.ts` (query embedding) and
|
|
5
|
+
* `_shared/mcp-tools/ingest.ts` (chunk embeddings). Both the Edge Function
|
|
6
|
+
* and the local TS MCP server use this module via `_shared/mcp-tools/`.
|
|
7
|
+
*
|
|
8
|
+
* Runtime-neutral: uses only `fetch` and `setTimeout`. No Deno- or Bun-
|
|
9
|
+
* specific APIs. The OpenAI API key is always passed in by the caller —
|
|
10
|
+
* the module never reads env vars directly.
|
|
11
|
+
*
|
|
12
|
+
* Mirrors `supabase/functions/cerefox-mcp/embeddings.ts` exactly for v0.4.0
|
|
13
|
+
* (extraction commit; no behaviour change). Future tweaks live here.
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
export const OPENAI_EMBEDDING_URL = "https://api.openai.com/v1/embeddings";
export const OPENAI_MODEL = "text-embedding-3-small";
export const EMBEDDING_DIMENSIONS = 768;

const EMBEDDING_MAX_RETRIES = 3;
const EMBEDDING_INITIAL_BACKOFF_MS = 500; // 500ms → 1s between attempts

/**
 * Embed a single string. Used for the query vector in `cerefox_search`.
 *
 * Makes up to `EMBEDDING_MAX_RETRIES` attempts. Responses with status >= 500
 * and thrown request/parse failures are retried with exponential backoff;
 * any other non-OK status (< 500) is thrown immediately without retrying.
 * No backoff sleep is performed after the final attempt — the last error is
 * thrown straight away.
 *
 * @param text   The text to embed.
 * @param apiKey OpenAI API key, sent as a Bearer token.
 * @returns The `embedding` of the first item in the response's `data` array.
 * @throws Error `OpenAI embedding error <status>: <body>` for HTTP failures,
 *         or the last request error once all attempts are exhausted.
 */
export async function getEmbedding(text: string, apiKey: string): Promise<number[]> {
  let lastError: Error | null = null;

  for (let attempt = 0; attempt < EMBEDDING_MAX_RETRIES; attempt++) {
    let failure: string;

    try {
      const response = await fetch(OPENAI_EMBEDDING_URL, {
        method: "POST",
        headers: {
          "Authorization": `Bearer ${apiKey}`,
          "Content-Type": "application/json",
        },
        body: JSON.stringify({
          model: OPENAI_MODEL,
          input: text,
          dimensions: EMBEDDING_DIMENSIONS,
        }),
      });

      if (response.ok) {
        const data = (await response.json()) as { data: { embedding: number[] }[] };
        if (attempt > 0) console.info(`Embedding API succeeded on retry ${attempt}`);
        return data.data[0].embedding;
      }

      const err = await response.text();
      const httpError = new Error(`OpenAI embedding error ${response.status}: ${err}`);
      // Status < 500 — don't retry; throw immediately (re-thrown by the catch below).
      if (response.status < 500) throw httpError;
      lastError = httpError;
      failure = `Embedding API returned ${response.status}`;
    } catch (err) {
      // HTTP errors raised above are final; everything else (network, JSON
      // parse, malformed payload) is treated as transient.
      if (err instanceof Error && err.message.startsWith("OpenAI embedding error")) throw err;
      lastError = err instanceof Error ? err : new Error(String(err));
      failure = `Embedding API request failed: ${lastError.message}`;
    }

    const attemptLabel = `(attempt ${attempt + 1}/${EMBEDDING_MAX_RETRIES})`;
    if (attempt + 1 >= EMBEDDING_MAX_RETRIES) {
      // Out of attempts: sleeping here would only delay the inevitable throw.
      console.warn(`${failure} ${attemptLabel}, giving up`);
      break;
    }

    const backoff = EMBEDDING_INITIAL_BACKOFF_MS * Math.pow(2, attempt);
    console.warn(`${failure} ${attemptLabel}, retrying in ${backoff}ms`);
    await new Promise((r) => setTimeout(r, backoff));
  }

  throw lastError ?? new Error(`Embedding API failed after ${EMBEDDING_MAX_RETRIES} attempts`);
}
|
|
73
|
+
|
|
74
|
+
/**
|
|
75
|
+
* Per-API-call batch limit. Mirrors Python's `CloudEmbedder.BATCH_SIZE`
|
|
76
|
+
* (in `src/cerefox/embeddings/cloud.py`). OpenAI's `/v1/embeddings`
|
|
77
|
+
* accepts up to 2048 inputs per request, but 96 is the Python contract
|
|
78
|
+
* and matches what the existing corpus was embedded with.
|
|
79
|
+
*
|
|
80
|
+
* v0.7 (iter-25 / Part 25B) introduces this constant to TS — the v0.4
|
|
81
|
+
* `embedBatch` had no batching and would blow the API limit on bulk
|
|
82
|
+
* ingest of large documents.
|
|
83
|
+
*/
|
|
84
|
+
export const EMBEDDING_BATCH_SIZE = 96;
|
|
85
|
+
|
|
86
|
+
/**
 * Single API call to OpenAI's embeddings endpoint for a list of inputs.
 * Caller is responsible for staying within the API's per-request limit; in
 * practice, use `embedBatch`, which chunks calls at `EMBEDDING_BATCH_SIZE`.
 *
 * Makes up to `EMBEDDING_MAX_RETRIES` attempts. Responses with status >= 500
 * and thrown request/parse failures are retried with exponential backoff;
 * any other non-OK status (< 500) is thrown immediately. No backoff sleep is
 * performed after the final attempt.
 *
 * @param texts  Inputs to embed in one request.
 * @param apiKey OpenAI API key, sent as a Bearer token.
 * @returns One embedding per response item, ordered by the item's `index`.
 * @throws Error `OpenAI embedding error <status>: <body>` for HTTP failures,
 *         or the last request error once all attempts are exhausted.
 */
async function embedBatchSingleCall(
  texts: string[],
  apiKey: string,
): Promise<number[][]> {
  let lastError: Error | null = null;

  for (let attempt = 0; attempt < EMBEDDING_MAX_RETRIES; attempt++) {
    let failure: string;

    try {
      const response = await fetch(OPENAI_EMBEDDING_URL, {
        method: "POST",
        headers: {
          "Authorization": `Bearer ${apiKey}`,
          "Content-Type": "application/json",
        },
        body: JSON.stringify({
          model: OPENAI_MODEL,
          input: texts,
          dimensions: EMBEDDING_DIMENSIONS,
        }),
      });

      if (response.ok) {
        const data = (await response.json()) as {
          data: { index: number; embedding: number[] }[];
        };
        if (attempt > 0) console.info(`Embedding API succeeded on retry ${attempt}`);
        // Restore input order by sorting on each item's `index`.
        return data.data
          .sort((a, b) => a.index - b.index)
          .map((d) => d.embedding);
      }

      const err = await response.text();
      const httpError = new Error(`OpenAI embedding error ${response.status}: ${err}`);
      // Status < 500 — don't retry; throw immediately (re-thrown by the catch below).
      if (response.status < 500) throw httpError;
      lastError = httpError;
      failure = `Embedding API returned ${response.status}`;
    } catch (err) {
      // HTTP errors raised above are final; anything else is treated as transient.
      if (err instanceof Error && err.message.startsWith("OpenAI embedding error")) throw err;
      lastError = err instanceof Error ? err : new Error(String(err));
      failure = `Embedding API request failed: ${lastError.message}`;
    }

    const attemptLabel = `(attempt ${attempt + 1}/${EMBEDDING_MAX_RETRIES})`;
    if (attempt + 1 >= EMBEDDING_MAX_RETRIES) {
      // Out of attempts: sleeping here would only delay the inevitable throw.
      console.warn(`${failure} ${attemptLabel}, giving up`);
      break;
    }

    const backoff = EMBEDDING_INITIAL_BACKOFF_MS * Math.pow(2, attempt);
    console.warn(`${failure} ${attemptLabel}, retrying in ${backoff}ms`);
    await new Promise((r) => setTimeout(r, backoff));
  }

  throw lastError ?? new Error(`Embedding API failed after ${EMBEDDING_MAX_RETRIES} attempts`);
}
|
|
145
|
+
|
|
146
|
+
/**
 * Embed multiple strings, chunked into per-API-call batches of
 * `batchSize` (default `EMBEDDING_BATCH_SIZE`).
 *
 * Batches are sent sequentially and their results concatenated, so the
 * returned vectors are in input order. An empty `texts` returns `[]`
 * without making any API call.
 *
 * @param texts     Strings to embed.
 * @param apiKey    OpenAI API key.
 * @param batchSize Maximum number of inputs per API call; must be >= 1.
 * @returns One embedding per input, in input order.
 * @throws RangeError when `texts` is non-empty and `batchSize` is not >= 1
 *         (zero, negative or NaN). Such values previously produced empty or
 *         partial slices and a confusing upstream API error.
 */
export async function embedBatch(
  texts: string[],
  apiKey: string,
  batchSize: number = EMBEDDING_BATCH_SIZE,
): Promise<number[][]> {
  if (texts.length === 0) return [];
  // `!(x >= 1)` also rejects NaN, which every ordinary comparison lets through.
  if (!(batchSize >= 1)) {
    throw new RangeError(`embedBatch: batchSize must be >= 1, got ${batchSize}`);
  }

  const out: number[][] = [];
  for (let start = 0; start < texts.length; start += batchSize) {
    const slice = texts.slice(start, start + batchSize);
    const vectors = await embedBatchSingleCall(slice, apiKey);
    for (const v of vectors) out.push(v);
  }
  return out;
}
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Heading-aware markdown chunker.
|
|
3
|
+
*
|
|
4
|
+
* Mirrors:
|
|
5
|
+
* - `src/cerefox/chunking/markdown.py` (Python pipeline)
|
|
6
|
+
* - `supabase/functions/cerefox-ingest/index.ts` (standalone ingest EF)
|
|
7
|
+
*
|
|
8
|
+
* Greedy section accumulation: H1/H2/H3 sections are joined into a buffer
|
|
9
|
+
* until adding the next would exceed `MAX_CHUNK_CHARS`. Oversized sections
|
|
10
|
+
* are paragraph-split. Short documents collapse to a single chunk.
|
|
11
|
+
*
|
|
12
|
+
 * The hash of the chunked output (via `sha256hex(normalizeContent(...))`, both defined in this file)
|
|
13
|
+
* must match the Python pipeline byte-for-byte so dedup works across access
|
|
14
|
+
* paths. Don't change chunk boundaries without updating both.
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
export const MAX_CHUNK_CHARS = 4000;
|
|
18
|
+
|
|
19
|
+
interface Section {
|
|
20
|
+
level: number;
|
|
21
|
+
headings: string[];
|
|
22
|
+
heading: string;
|
|
23
|
+
content: string;
|
|
24
|
+
body: string;
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
export interface Chunk {
|
|
28
|
+
heading_path: string[];
|
|
29
|
+
heading_level: number;
|
|
30
|
+
title: string;
|
|
31
|
+
content: string;
|
|
32
|
+
char_count: number;
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
/**
 * Splits markdown into sections at H1/H2/H3 heading lines (`# `, `## `,
 * `### ` followed by at least one character). Every other line, including
 * deeper headings, is body text of the current section.
 *
 * Each section records its heading level (0 for text before any heading),
 * the heading path in effect, its own heading, the trimmed body, and
 * `content` (the heading line plus body). Sections with no content at all
 * are dropped.
 */
function parseSections(text: string): Section[] {
  const found: Section[] = [];
  let path: string[] = [];
  let depth = 0;
  let pending: string[] = [];

  // Emit the section accumulated so far (if non-empty) and reset the body buffer.
  const closeSection = (): void => {
    const body = pending.join("\n").trim();
    pending = [];

    const heading = path[path.length - 1] ?? "";
    let content = body;
    if (depth > 0) {
      const headerLine = "#".repeat(depth) + " " + heading;
      content = body ? headerLine + "\n\n" + body : headerLine;
    }
    if (!content.trim()) return;

    found.push({ level: depth, headings: [...path], heading, content, body });
  };

  // Close the previous section and start a new one at `level` titled `title`.
  const openSection = (level: number, title: string): void => {
    closeSection();
    // An H1 resets the path. H2/H3 keep up to (level - 1) leading entries of
    // the current path and drop empty entries.
    path = level === 1
      ? [title]
      : [...path.slice(0, level - 1), title].filter(Boolean);
    depth = level;
  };

  for (const line of text.split("\n")) {
    let m: RegExpExecArray | null;
    if ((m = /^# (.+)/.exec(line))) {
      openSection(1, m[1].trim());
    } else if ((m = /^## (.+)/.exec(line))) {
      openSection(2, m[1].trim());
    } else if ((m = /^### (.+)/.exec(line))) {
      openSection(3, m[1].trim());
    } else {
      pending.push(line);
    }
  }

  closeSection();
  return found;
}
|
|
91
|
+
|
|
92
|
+
/**
 * Assembles a `Chunk` record. The heading path is copied (not aliased),
 * the title is the last path entry (empty string when the path is empty),
 * and `char_count` is the content's length in UTF-16 code units.
 */
function makeChunk(headings: string[], level: number, content: string): Chunk {
  return {
    heading_path: headings.slice(),
    heading_level: level,
    title: headings[headings.length - 1] ?? "",
    content,
    char_count: content.length,
  };
}
|
|
102
|
+
|
|
103
|
+
/**
 * Splits markdown into chunks of at most `MAX_CHUNK_CHARS` where possible.
 *
 * - Blank input yields no chunks; input that fits in one chunk is returned
 *   whole with an empty heading path.
 * - Otherwise sections from `parseSections` are greedily joined with blank
 *   lines until the next one would push the chunk over the limit. A joined
 *   chunk carries the heading path and level of its first section.
 * - A section that is itself over the limit is split on blank-line paragraph
 *   boundaries; its heading line is prepended to the first piece only. A
 *   single paragraph longer than the limit is emitted as-is.
 */
export function chunkMarkdown(text: string): Chunk[] {
  const doc = text.trim();
  if (doc.length === 0) return [];
  if (doc.length <= MAX_CHUNK_CHARS) return [makeChunk([], 0, doc)];

  const result: Chunk[] = [];

  // State of the run of small sections currently being joined.
  let parts: string[] = [];
  let partHeadings: string[] = [];
  let partLevel = 0;
  let partChars = 0;

  // Emit the pending run (if any) as one chunk and reset the run state.
  const flush = (): void => {
    if (parts.length > 0) {
      result.push(makeChunk(partHeadings, partLevel, parts.join("\n\n")));
    }
    parts = [];
    partHeadings = [];
    partLevel = 0;
    partChars = 0;
  };

  // Paragraph-split one oversized section into as many chunks as needed.
  const splitOversized = (section: Section): void => {
    const { level, headings, heading, content, body } = section;
    const headerPrefix = level > 0 ? "#".repeat(level) + " " + heading + "\n\n" : "";
    const paragraphs = (body || content).split(/\n\n+/);

    let piece = "";
    paragraphs.forEach((para, idx) => {
      // Only the very first paragraph carries the heading line.
      const lead = idx === 0 ? headerPrefix : "";
      const wouldOverflow = piece.length > 0 &&
        piece.length + lead.length + para.length + 2 > MAX_CHUNK_CHARS;

      if (wouldOverflow) {
        result.push(makeChunk(headings, level, piece.trim()));
        piece = para;
      } else if (piece) {
        piece = piece + "\n\n" + para;
      } else {
        piece = lead + para;
      }
    });

    if (piece.trim()) result.push(makeChunk(headings, level, piece.trim()));
  };

  for (const section of parseSections(doc)) {
    const size = section.content.length;

    if (size > MAX_CHUNK_CHARS) {
      flush();
      splitOversized(section);
      continue;
    }

    // Joining costs two extra characters ("\n\n") when the run is non-empty.
    const joinCost = parts.length > 0 ? 2 : 0;
    if (partChars + size + joinCost > MAX_CHUNK_CHARS) flush();

    const separator = parts.length > 0 ? 2 : 0; // re-evaluated: flush() may have emptied the run
    if (parts.length === 0) {
      partHeadings = section.headings;
      partLevel = section.level;
    }
    parts.push(section.content);
    partChars += size + separator;
  }

  flush();
  return result;
}
|
|
174
|
+
|
|
175
|
+
/**
 * Content-hash normalization. Must match `pipeline.py::_normalize`
 * byte-for-byte so cross-runtime dedup works.
 *
 * Steps, in order: trim surrounding whitespace, convert CRLF and lone CR to
 * LF, then collapse runs of three or more newlines to exactly two.
 */
export function normalizeContent(text: string): string {
  const trimmed = text.trim();
  const unixNewlines = trimmed.replace(/\r\n/g, "\n").replace(/\r/g, "\n");
  return unixNewlines.replace(/\n{3,}/g, "\n\n");
}
|
|
180
|
+
|
|
181
|
+
/**
 * Returns the SHA-256 digest of `text` (encoded as UTF-8) as a 64-character
 * lowercase hex string, computed with the Web Crypto API.
 */
export async function sha256hex(text: string): Promise<string> {
  const digest = await crypto.subtle.digest("SHA-256", new TextEncoder().encode(text));
  let hex = "";
  for (const byte of new Uint8Array(digest)) {
    hex += byte.toString(16).padStart(2, "0");
  }
  return hex;
}
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Project-membership helpers shared by the `ingest` and
|
|
3
|
+
* `set-document-projects` tools.
|
|
4
|
+
*
|
|
5
|
+
* Two semantics are needed by callers:
|
|
6
|
+
*
|
|
7
|
+
* - **Non-destructive add** (`ensureDocumentInProject`): used when an ingest
|
|
8
|
+
* call supplies a singular `project_name`. Resolves (or creates) the
|
|
9
|
+
* project, then ensures the `(document, project)` row exists. Idempotent;
|
|
10
|
+
* does NOT remove any existing memberships. Per issue #38: the v0.1.20
|
|
11
|
+
* fix that stopped agent updates from silently wiping operator-curated
|
|
12
|
+
* memberships.
|
|
13
|
+
*
|
|
14
|
+
* - **Destructive replace** (`setDocumentProjectsByName`): used when a call
|
|
15
|
+
* supplies an explicit `project_names` list (or via the dedicated
|
|
16
|
+
* `cerefox_set_document_projects` tool). DELETE-then-INSERT replaces the
|
|
17
|
+
* document's memberships with exactly the given set.
|
|
18
|
+
*
|
|
19
|
+
* Both call sites need consistent name resolution (case-insensitive
|
|
20
|
+
* `ilike` match against `cerefox_projects.name`); centralising here
|
|
21
|
+
* prevents drift.
|
|
22
|
+
*/
|
|
23
|
+
|
|
24
|
+
import type { MCPSupabaseClient } from "./types.ts";
|
|
25
|
+
|
|
26
|
+
/**
 * Ensure `(documentId, project)` exists. Resolves the project by name
 * (case-insensitive); creates the project if missing. Idempotent; never
 * removes existing memberships.
 *
 * LIKE metacharacters in `projectName` (`\`, `%`, `_`) are escaped before the
 * `ilike` lookup so the match is a case-insensitive equality test — without
 * this, a name such as `my_project` would also match `my-project`.
 *
 * @returns The resolved project_id, or `null` if the project lookup failed
 *          or the project could not be created. A failed membership insert
 *          is logged (unless it is a duplicate-key race) and the project_id
 *          is still returned.
 */
export async function ensureDocumentInProject(
  supabase: MCPSupabaseClient,
  documentId: string,
  projectName: string,
): Promise<string | null> {
  const namePattern = projectName.replace(/[\\%_]/g, "\\$&");

  let projectId: string | null = null;
  const { data: proj, error: lookupErr } = await supabase
    .from("cerefox_projects")
    .select("id")
    .ilike("name", namePattern)
    .limit(1);
  if (lookupErr) {
    // Don't fall through to "create": the project may well exist already.
    console.warn("ensureDocumentInProject: project lookup failed", lookupErr);
    return null;
  }

  if (proj?.length) {
    projectId = proj[0].id;
  } else {
    const { data: newProj, error: createErr } = await supabase
      .from("cerefox_projects")
      .insert({ name: projectName })
      .select("id");
    if (createErr) {
      console.warn("ensureDocumentInProject: project creation failed", createErr);
    }
    projectId = newProj?.[0]?.id ?? null;
  }
  if (!projectId) return null;

  const { data: existing } = await supabase
    .from("cerefox_document_projects")
    .select("document_id")
    .eq("document_id", documentId)
    .eq("project_id", projectId)
    .limit(1);
  if (existing?.length) return projectId;

  const { error: insertErr } = await supabase
    .from("cerefox_document_projects")
    .insert({ document_id: documentId, project_id: projectId });
  // A duplicate-key error means a concurrent caller inserted the row first — fine.
  if (insertErr && !String(insertErr.message ?? "").includes("duplicate key")) {
    console.warn("ensureDocumentInProject: insert failed", insertErr);
  }
  return projectId;
}
|
|
67
|
+
|
|
68
|
+
/**
 * DELETE-then-INSERT replacement of a document's project memberships.
 *
 * Resolves each name → project_id (case-insensitive, creating the project if
 * absent), preserving input order. Empty names are skipped. Names that
 * resolve to the same project are collapsed to a single membership (first
 * occurrence wins) so the bulk insert never contains duplicate rows. LIKE
 * metacharacters (`\`, `%`, `_`) in names are escaped so the `ilike` lookup
 * is an equality test rather than a pattern match.
 *
 * An empty `projectNames` clears all memberships. Database failures are
 * logged with `console.warn` rather than thrown.
 *
 * @returns The distinct resolved project_ids, in input order.
 */
export async function setDocumentProjectsByName(
  supabase: MCPSupabaseClient,
  documentId: string,
  projectNames: string[],
): Promise<string[]> {
  const projectIds: string[] = [];
  const seen = new Set<string>();

  for (const name of projectNames) {
    if (!name) continue;
    const namePattern = name.replace(/[\\%_]/g, "\\$&");

    const { data: proj } = await supabase
      .from("cerefox_projects")
      .select("id")
      .ilike("name", namePattern)
      .limit(1);

    let projectId: string | null = proj?.length ? proj[0].id : null;
    if (!projectId) {
      const { data: newProj, error: createErr } = await supabase
        .from("cerefox_projects")
        .insert({ name })
        .select("id");
      projectId = newProj?.[0]?.id ?? null;
      if (!projectId) {
        console.warn("setDocumentProjectsByName: could not create project", name, createErr);
      }
    }

    if (projectId && !seen.has(projectId)) {
      seen.add(projectId);
      projectIds.push(projectId);
    }
  }

  const { error: deleteErr } = await supabase
    .from("cerefox_document_projects")
    .delete()
    .eq("document_id", documentId);
  if (deleteErr) {
    console.warn("setDocumentProjectsByName: clearing memberships failed", deleteErr);
  }

  if (projectIds.length > 0) {
    const rows = projectIds.map((pid) => ({ document_id: documentId, project_id: pid }));
    const { error: insertErr } = await supabase.from("cerefox_document_projects").insert(rows);
    if (insertErr) {
      console.warn("setDocumentProjectsByName: inserting memberships failed", insertErr);
    }
  }
  return projectIds;
}
|
|
106
|
+
|
|
107
|
+
/**
 * Resolve a project name → project_id (case-insensitive), or `null` if not
 * found or the query failed. Does NOT create.
 *
 * LIKE metacharacters in `projectName` (`\`, `%`, `_`) are escaped before the
 * `ilike` lookup so the match is a case-insensitive equality test; otherwise
 * a name such as `my_project` could resolve to a different project like
 * `my-project`.
 */
export async function lookupProjectId(
  supabase: MCPSupabaseClient,
  projectName: string,
): Promise<string | null> {
  const namePattern = projectName.replace(/[\\%_]/g, "\\$&");
  const { data, error } = await supabase
    .from("cerefox_projects")
    .select("id")
    .ilike("name", namePattern)
    .limit(1);
  if (error || !data?.length) return null;
  return data[0].id;
}
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Internal helpers shared by `_shared/mcp-tools/` handlers.
|
|
3
|
+
*
|
|
4
|
+
* - `applyByteBudget`: drop whole rows until the cumulative serialized size
|
|
5
|
+
* fits within the budget. Used by `search` and `metadata-search`.
|
|
6
|
+
* - `logUsage`: fire-and-forget write to `cerefox_usage_log` via RPC.
|
|
7
|
+
* Never blocks the tool response.
|
|
8
|
+
*
|
|
9
|
+
* Both helpers are mirrored from `supabase/functions/cerefox-mcp/shared.ts`
|
|
10
|
+
* for the v0.4.0 extraction — once `_shared/mcp-tools/` is the source of
|
|
11
|
+
* truth (after 22D refactors the EF to import from here), the EF's `shared.ts`
|
|
12
|
+
* removes its copies.
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
import type { MCPSupabaseClient } from "./types.ts";
|
|
16
|
+
|
|
17
|
+
/** Server-enforced response-size ceiling for MCP results. Agents can request
|
|
18
|
+
* smaller budgets via `max_bytes`; values above this are capped. */
|
|
19
|
+
export const MAX_RESPONSE_BYTES = 200_000;
|
|
20
|
+
|
|
21
|
+
/**
 * Keeps rows, in order, while their cumulative size stays within `maxBytes`.
 * A row's size is the UTF-8 byte length of its `JSON.stringify` output.
 * Processing stops at the first row that does not fit; that row and every
 * row after it are dropped, even if a later one would have fit.
 *
 * @returns `accepted` — the kept prefix of `rows`; `truncated` — whether any
 *          row was dropped; `usedBytes` — total size of the accepted rows.
 */
export function applyByteBudget(
  rows: unknown[],
  maxBytes: number,
): { accepted: unknown[]; truncated: boolean; usedBytes: number } {
  const encoder = new TextEncoder();
  const accepted: unknown[] = [];
  let usedBytes = 0;

  for (let i = 0; i < rows.length; i++) {
    const size = encoder.encode(JSON.stringify(rows[i])).length;
    if (usedBytes + size > maxBytes) {
      return { accepted, truncated: true, usedBytes };
    }
    accepted.push(rows[i]);
    usedBytes += size;
  }

  return { accepted, truncated: false, usedBytes };
}
|
|
41
|
+
|
|
42
|
+
import type { AccessPath } from "./types.ts";
|
|
43
|
+
|
|
44
|
+
export interface LogUsageParams {
|
|
45
|
+
operation: string;
|
|
46
|
+
accessPath: AccessPath;
|
|
47
|
+
query_text?: string | null;
|
|
48
|
+
document_id?: string | null;
|
|
49
|
+
project_id?: string | null;
|
|
50
|
+
result_count?: number | null;
|
|
51
|
+
requestor?: string | null;
|
|
52
|
+
extra?: Record<string, unknown>;
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
/**
 * Fire-and-forget usage logging via the `cerefox_log_usage` RPC. Never
 * throws and never blocks the response: the call is not awaited, and both
 * an asynchronous rejection and a synchronous throw from `supabase.rpc` are
 * swallowed — usage logging is best-effort by design.
 *
 * Differs from the EF's `logUsage` only in that `accessPath` is a required
 * parameter (was hardcoded to `"remote-mcp"` in the EF) so the local TS MCP
 * server can pass `"local-mcp"` for the same call site.
 *
 * Omitted optional fields are sent as `null`; `requestor` defaults to
 * `"mcp-agent"` and `extra` to `{}`.
 */
export function logUsage(supabase: MCPSupabaseClient, params: LogUsageParams): void {
  try {
    // Promise.resolve() adopts the RPC's thenable so `.catch` is available.
    void Promise.resolve(
      supabase.rpc("cerefox_log_usage", {
        p_operation: params.operation,
        p_access_path: params.accessPath,
        p_requestor: params.requestor ?? "mcp-agent",
        p_document_id: params.document_id ?? null,
        p_project_id: params.project_id ?? null,
        p_query_text: params.query_text ?? null,
        p_result_count: params.result_count ?? null,
        p_extra: params.extra ?? {},
      }),
    ).catch(() => {});
  } catch {
    // A synchronous failure building or dispatching the RPC must not break
    // the tool response either.
  }
}
|