membot 0.0.1 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/skills/membot.md +137 -0
- package/.cursor/rules/membot.mdc +137 -0
- package/README.md +131 -0
- package/package.json +83 -24
- package/patches/@huggingface%2Ftransformers@4.2.0.patch +137 -0
- package/scripts/apply-transformers-patch.sh +35 -0
- package/src/cli.ts +72 -0
- package/src/commands/check-update.ts +69 -0
- package/src/commands/mcpx.ts +112 -0
- package/src/commands/reindex.ts +53 -0
- package/src/commands/serve.ts +58 -0
- package/src/commands/skill.ts +131 -0
- package/src/commands/upgrade.ts +220 -0
- package/src/config/loader.ts +100 -0
- package/src/config/schemas.ts +39 -0
- package/src/constants.ts +42 -0
- package/src/context.ts +80 -0
- package/src/db/blobs.ts +53 -0
- package/src/db/chunks.ts +176 -0
- package/src/db/connection.ts +173 -0
- package/src/db/files.ts +325 -0
- package/src/db/migrations/001-init.ts +63 -0
- package/src/db/migrations/002-fts.ts +12 -0
- package/src/db/migrations.ts +45 -0
- package/src/errors.ts +87 -0
- package/src/ingest/chunker.ts +117 -0
- package/src/ingest/converter/docx.ts +15 -0
- package/src/ingest/converter/html.ts +20 -0
- package/src/ingest/converter/image.ts +71 -0
- package/src/ingest/converter/index.ts +119 -0
- package/src/ingest/converter/llm.ts +66 -0
- package/src/ingest/converter/ocr.ts +51 -0
- package/src/ingest/converter/pdf.ts +38 -0
- package/src/ingest/converter/text.ts +8 -0
- package/src/ingest/describer.ts +72 -0
- package/src/ingest/embedder.ts +98 -0
- package/src/ingest/fetcher.ts +280 -0
- package/src/ingest/ingest.ts +444 -0
- package/src/ingest/local-reader.ts +64 -0
- package/src/ingest/search-text.ts +18 -0
- package/src/ingest/source-resolver.ts +186 -0
- package/src/mcp/instructions.ts +34 -0
- package/src/mcp/server.ts +101 -0
- package/src/mount/commander.ts +174 -0
- package/src/mount/mcp.ts +111 -0
- package/src/mount/zod-to-cli.ts +158 -0
- package/src/operations/add.ts +69 -0
- package/src/operations/diff.ts +105 -0
- package/src/operations/index.ts +38 -0
- package/src/operations/info.ts +95 -0
- package/src/operations/list.ts +87 -0
- package/src/operations/move.ts +83 -0
- package/src/operations/prune.ts +80 -0
- package/src/operations/read.ts +102 -0
- package/src/operations/refresh.ts +72 -0
- package/src/operations/remove.ts +35 -0
- package/src/operations/search.ts +72 -0
- package/src/operations/tree.ts +103 -0
- package/src/operations/types.ts +81 -0
- package/src/operations/versions.ts +78 -0
- package/src/operations/write.ts +77 -0
- package/src/output/formatter.ts +68 -0
- package/src/output/logger.ts +114 -0
- package/src/output/progress.ts +78 -0
- package/src/output/tty.ts +91 -0
- package/src/refresh/runner.ts +296 -0
- package/src/refresh/scheduler.ts +54 -0
- package/src/sdk.ts +27 -0
- package/src/search/hybrid.ts +100 -0
- package/src/search/keyword.ts +62 -0
- package/src/search/semantic.ts +56 -0
- package/src/types/text-modules.d.ts +9 -0
- package/src/update/background.ts +73 -0
- package/src/update/cache.ts +40 -0
- package/src/update/checker.ts +117 -0
- package/.claude/settings.local.json +0 -7
- package/CLAUDE.md +0 -139
- package/docs/plan.md +0 -905
|
@@ -0,0 +1,444 @@
|
|
|
1
|
+
import type { AppContext } from "../context.ts";
|
|
2
|
+
import { upsertBlob } from "../db/blobs.ts";
|
|
3
|
+
import { insertChunksForVersion, rebuildFts } from "../db/chunks.ts";
|
|
4
|
+
import { type FetcherKind, getCurrent, insertVersion, millisIso, type SourceType } from "../db/files.ts";
|
|
5
|
+
import { asHelpful, HelpfulError } from "../errors.ts";
|
|
6
|
+
import { chunkDeterministic } from "./chunker.ts";
|
|
7
|
+
import { convert } from "./converter/index.ts";
|
|
8
|
+
import { describe } from "./describer.ts";
|
|
9
|
+
import { embed } from "./embedder.ts";
|
|
10
|
+
import { fetchRemote } from "./fetcher.ts";
|
|
11
|
+
import { readLocalFile, sha256Hex } from "./local-reader.ts";
|
|
12
|
+
import { buildSearchText } from "./search-text.ts";
|
|
13
|
+
import { type ResolvedLocalEntry, type ResolvedSource, resolveSource } from "./source-resolver.ts";
|
|
14
|
+
|
|
15
|
+
/** Arguments accepted by ingest(); field names match the CLI/MCP surface. */
export interface IngestInput {
  /** Source argument: "inline:<text>", an http(s) URL, a file/dir path, or a glob. */
  source: string;
  /** Explicit logical path; for multi-file ingests it is used as a prefix. */
  logical_path?: string;
  /** Comma-separated include globs, matched relative to the resolved base. */
  include?: string;
  /** Comma-separated exclude globs, added to the resolver's built-in defaults. */
  exclude?: string;
  /** Follow symlinks while walking directories (defaults to true). */
  follow_symlinks?: boolean;
  /** Refresh cadence such as "5m", "1h", "7d"; parsed by parseDuration. */
  refresh_frequency?: string;
  /** Hint forwarded to the remote fetcher for URL sources. */
  fetcher_hint?: string;
  /** Free-form note stored on the created version row. */
  change_note?: string;
}
|
|
25
|
+
|
|
26
|
+
/** Per-file outcome of an ingest; failures are reported here, not thrown. */
export interface IngestEntryResult {
  /** Origin: absolute file path, URL, or the literal "inline:". */
  source_path: string;
  /** Logical path the content was filed under. */
  logical_path: string;
  /** New version id, or null when status is "failed". */
  version_id: string | null;
  status: "ok" | "failed";
  /** Human-readable failure reason; set only when status is "failed". */
  error?: string;
  /** Detected MIME type; null until the source was read successfully. */
  mime_type: string | null;
  size_bytes: number;
  fetcher: FetcherKind;
  /** SHA-256 of the raw source bytes; "" until the source was read. */
  source_sha256: string;
}
|
|
37
|
+
|
|
38
|
+
/** Summary shape returned by ingest(): per-entry results plus counts. */
export interface IngestResult {
  ingested: IngestEntryResult[];
  /** Total entries attempted (ok + failed). */
  total: number;
  ok: number;
  failed: number;
}
|
|
44
|
+
|
|
45
|
+
/**
|
|
46
|
+
* Top-level ingest orchestrator. Resolves the source arg, dispatches to the
|
|
47
|
+
* right reader (local / remote / inline), runs the pipeline (convert →
|
|
48
|
+
* describe → chunk → embed → write), and returns one entry per matched
|
|
49
|
+
* file. Partial failures are reported per-entry; the entire call doesn't
|
|
50
|
+
* abort because one URL or PDF is bad.
|
|
51
|
+
*/
|
|
52
|
+
export async function ingest(input: IngestInput, ctx: AppContext): Promise<IngestResult> {
|
|
53
|
+
const resolved = await resolveSource(input.source, {
|
|
54
|
+
include: input.include,
|
|
55
|
+
exclude: input.exclude,
|
|
56
|
+
followSymlinks: input.follow_symlinks ?? true,
|
|
57
|
+
});
|
|
58
|
+
|
|
59
|
+
const refreshSec = parseDuration(input.refresh_frequency);
|
|
60
|
+
|
|
61
|
+
if (resolved.kind === "inline") {
|
|
62
|
+
return ingestInline(resolved.text, input, ctx, refreshSec);
|
|
63
|
+
}
|
|
64
|
+
if (resolved.kind === "url") {
|
|
65
|
+
return ingestUrl(resolved.url, input, ctx, refreshSec);
|
|
66
|
+
}
|
|
67
|
+
return ingestLocalFiles(resolved, input, ctx, refreshSec);
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
/** Ingest a single inline blob (source_type='inline'). */
|
|
71
|
+
async function ingestInline(
|
|
72
|
+
text: string,
|
|
73
|
+
input: IngestInput,
|
|
74
|
+
ctx: AppContext,
|
|
75
|
+
refreshSec: number | null,
|
|
76
|
+
): Promise<IngestResult> {
|
|
77
|
+
const logicalPath = input.logical_path ?? defaultInlinePath();
|
|
78
|
+
const bytes = new TextEncoder().encode(text);
|
|
79
|
+
const sha = sha256Hex(bytes);
|
|
80
|
+
const result: IngestEntryResult = {
|
|
81
|
+
source_path: "inline:",
|
|
82
|
+
logical_path: logicalPath,
|
|
83
|
+
version_id: null,
|
|
84
|
+
status: "ok",
|
|
85
|
+
mime_type: "text/markdown",
|
|
86
|
+
size_bytes: bytes.byteLength,
|
|
87
|
+
fetcher: "inline",
|
|
88
|
+
source_sha256: sha,
|
|
89
|
+
};
|
|
90
|
+
try {
|
|
91
|
+
const versionId = await persistVersion(ctx, {
|
|
92
|
+
logicalPath,
|
|
93
|
+
sourceType: "inline",
|
|
94
|
+
sourcePath: null,
|
|
95
|
+
sourceMtimeMs: null,
|
|
96
|
+
sourceSha: sha,
|
|
97
|
+
blobSha: null,
|
|
98
|
+
mime: "text/markdown",
|
|
99
|
+
bytes: null,
|
|
100
|
+
markdown: text,
|
|
101
|
+
fetcher: "inline",
|
|
102
|
+
fetcherServer: null,
|
|
103
|
+
fetcherTool: null,
|
|
104
|
+
fetcherArgs: null,
|
|
105
|
+
refreshSec,
|
|
106
|
+
changeNote: input.change_note ?? null,
|
|
107
|
+
});
|
|
108
|
+
result.version_id = versionId;
|
|
109
|
+
} catch (err) {
|
|
110
|
+
result.status = "failed";
|
|
111
|
+
result.error = errorMessage(err);
|
|
112
|
+
}
|
|
113
|
+
return summarize([result]);
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
/** Ingest one URL (source_type='remote'). */
|
|
117
|
+
async function ingestUrl(
|
|
118
|
+
url: string,
|
|
119
|
+
input: IngestInput,
|
|
120
|
+
ctx: AppContext,
|
|
121
|
+
refreshSec: number | null,
|
|
122
|
+
): Promise<IngestResult> {
|
|
123
|
+
const mcpxAdapter = ctx.mcpx
|
|
124
|
+
? {
|
|
125
|
+
async listTools() {
|
|
126
|
+
const tools = await ctx.mcpx!.listTools();
|
|
127
|
+
return tools;
|
|
128
|
+
},
|
|
129
|
+
async exec(server: string, tool: string, args: Record<string, unknown>) {
|
|
130
|
+
return ctx.mcpx!.exec(server, tool, args);
|
|
131
|
+
},
|
|
132
|
+
}
|
|
133
|
+
: null;
|
|
134
|
+
|
|
135
|
+
const logicalPath = input.logical_path ?? defaultLogicalForUrl(url);
|
|
136
|
+
const result: IngestEntryResult = {
|
|
137
|
+
source_path: url,
|
|
138
|
+
logical_path: logicalPath,
|
|
139
|
+
version_id: null,
|
|
140
|
+
status: "ok",
|
|
141
|
+
mime_type: null,
|
|
142
|
+
size_bytes: 0,
|
|
143
|
+
fetcher: "http",
|
|
144
|
+
source_sha256: "",
|
|
145
|
+
};
|
|
146
|
+
|
|
147
|
+
try {
|
|
148
|
+
const fetched = await fetchRemote(url, { hint: input.fetcher_hint, mcpx: mcpxAdapter });
|
|
149
|
+
result.mime_type = fetched.mimeType;
|
|
150
|
+
result.size_bytes = fetched.bytes.byteLength;
|
|
151
|
+
result.fetcher = fetched.fetcher;
|
|
152
|
+
result.source_sha256 = fetched.sha256;
|
|
153
|
+
|
|
154
|
+
const versionId = await pipelineForBytes(ctx, {
|
|
155
|
+
logicalPath,
|
|
156
|
+
bytes: fetched.bytes,
|
|
157
|
+
mime: fetched.mimeType,
|
|
158
|
+
source: url,
|
|
159
|
+
sourceType: "remote",
|
|
160
|
+
sourcePath: url,
|
|
161
|
+
sourceMtimeMs: null,
|
|
162
|
+
sourceSha: fetched.sha256,
|
|
163
|
+
fetcher: fetched.fetcher,
|
|
164
|
+
fetcherServer: fetched.fetcherServer,
|
|
165
|
+
fetcherTool: fetched.fetcherTool,
|
|
166
|
+
fetcherArgs: fetched.fetcherArgs,
|
|
167
|
+
refreshSec,
|
|
168
|
+
changeNote: input.change_note ?? null,
|
|
169
|
+
});
|
|
170
|
+
result.version_id = versionId;
|
|
171
|
+
} catch (err) {
|
|
172
|
+
result.status = "failed";
|
|
173
|
+
result.error = errorMessage(err);
|
|
174
|
+
}
|
|
175
|
+
return summarize([result]);
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
/** Ingest a list of local files (source_type='local'). One transaction per entry. */
|
|
179
|
+
async function ingestLocalFiles(
|
|
180
|
+
resolved: Extract<ResolvedSource, { kind: "local-files" }>,
|
|
181
|
+
input: IngestInput,
|
|
182
|
+
ctx: AppContext,
|
|
183
|
+
refreshSec: number | null,
|
|
184
|
+
): Promise<IngestResult> {
|
|
185
|
+
if (resolved.entries.length === 0) {
|
|
186
|
+
throw new HelpfulError({
|
|
187
|
+
kind: "input_error",
|
|
188
|
+
message: `Glob/path matched 0 files`,
|
|
189
|
+
hint: `Try a broader pattern (e.g. ./**/*.md) or relax --exclude.`,
|
|
190
|
+
});
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
const results: IngestEntryResult[] = [];
|
|
194
|
+
ctx.progress.start(resolved.entries.length, "ingest");
|
|
195
|
+
const isMulti = resolved.entries.length > 1;
|
|
196
|
+
|
|
197
|
+
for (const entry of resolved.entries) {
|
|
198
|
+
ctx.progress.tick(entry.relPath);
|
|
199
|
+
const logicalPath = pickLogicalPath(input.logical_path, entry, isMulti);
|
|
200
|
+
const result: IngestEntryResult = {
|
|
201
|
+
source_path: entry.absPath,
|
|
202
|
+
logical_path: logicalPath,
|
|
203
|
+
version_id: null,
|
|
204
|
+
status: "ok",
|
|
205
|
+
mime_type: null,
|
|
206
|
+
size_bytes: 0,
|
|
207
|
+
fetcher: "local",
|
|
208
|
+
source_sha256: "",
|
|
209
|
+
};
|
|
210
|
+
try {
|
|
211
|
+
const local = await readLocalFile(entry.absPath);
|
|
212
|
+
result.mime_type = local.mimeType;
|
|
213
|
+
result.size_bytes = local.sizeBytes;
|
|
214
|
+
result.source_sha256 = local.sha256;
|
|
215
|
+
|
|
216
|
+
const versionId = await pipelineForBytes(ctx, {
|
|
217
|
+
logicalPath,
|
|
218
|
+
bytes: local.bytes,
|
|
219
|
+
mime: local.mimeType,
|
|
220
|
+
source: entry.absPath,
|
|
221
|
+
sourceType: "local",
|
|
222
|
+
sourcePath: entry.absPath,
|
|
223
|
+
sourceMtimeMs: local.mtimeMs,
|
|
224
|
+
sourceSha: local.sha256,
|
|
225
|
+
fetcher: "local",
|
|
226
|
+
fetcherServer: null,
|
|
227
|
+
fetcherTool: null,
|
|
228
|
+
fetcherArgs: null,
|
|
229
|
+
refreshSec,
|
|
230
|
+
changeNote: input.change_note ?? null,
|
|
231
|
+
});
|
|
232
|
+
result.version_id = versionId;
|
|
233
|
+
} catch (err) {
|
|
234
|
+
result.status = "failed";
|
|
235
|
+
result.error = errorMessage(err);
|
|
236
|
+
}
|
|
237
|
+
results.push(result);
|
|
238
|
+
}
|
|
239
|
+
ctx.progress.done(`ingested ${results.filter((r) => r.status === "ok").length}/${results.length}`);
|
|
240
|
+
|
|
241
|
+
return summarize(results);
|
|
242
|
+
}
|
|
243
|
+
|
|
244
|
+
/** Inputs for pipelineForBytes (raw-bytes → version pipeline). */
interface PipelineParams {
  /** Logical path the new version is filed under. */
  logicalPath: string;
  /** Raw source bytes; also stored as the blob keyed by sourceSha. */
  bytes: Uint8Array;
  /** MIME type of `bytes`; drives converter dispatch. */
  mime: string;
  /** Human-readable origin (URL or absolute file path) passed to the converter. */
  source: string;
  sourceType: SourceType;
  sourcePath: string | null;
  /** Last-modified time in ms for local files; null for remote sources. */
  sourceMtimeMs: number | null;
  /** Hex SHA-256 of the raw source bytes. */
  sourceSha: string;
  fetcher: FetcherKind;
  fetcherServer: string | null;
  fetcherTool: string | null;
  fetcherArgs: Record<string, unknown> | null;
  /** Refresh cadence in seconds, or null for one-shot ingests. */
  refreshSec: number | null;
  changeNote: string | null;
}
|
|
260
|
+
|
|
261
|
+
/**
|
|
262
|
+
* Run the bytes-in / version-out pipeline: store the blob, convert to
|
|
263
|
+
* markdown, describe, chunk, embed, and write a new files row + chunks
|
|
264
|
+
* rows under a fresh version_id. Returns the version_id so callers can
|
|
265
|
+
* report it back.
|
|
266
|
+
*/
|
|
267
|
+
async function pipelineForBytes(ctx: AppContext, p: PipelineParams): Promise<string> {
|
|
268
|
+
await upsertBlob(ctx.db, {
|
|
269
|
+
sha256: p.sourceSha,
|
|
270
|
+
mime_type: p.mime,
|
|
271
|
+
size_bytes: p.bytes.byteLength,
|
|
272
|
+
bytes: p.bytes,
|
|
273
|
+
});
|
|
274
|
+
|
|
275
|
+
const conversion = await convert(p.bytes, p.mime, p.source, ctx.config.llm);
|
|
276
|
+
const markdown = conversion.markdown;
|
|
277
|
+
const contentSha = sha256Hex(new TextEncoder().encode(markdown));
|
|
278
|
+
|
|
279
|
+
return persistVersion(ctx, {
|
|
280
|
+
logicalPath: p.logicalPath,
|
|
281
|
+
sourceType: p.sourceType,
|
|
282
|
+
sourcePath: p.sourcePath,
|
|
283
|
+
sourceMtimeMs: p.sourceMtimeMs,
|
|
284
|
+
sourceSha: p.sourceSha,
|
|
285
|
+
blobSha: p.sourceSha,
|
|
286
|
+
mime: p.mime,
|
|
287
|
+
bytes: p.bytes,
|
|
288
|
+
markdown,
|
|
289
|
+
contentSha,
|
|
290
|
+
fetcher: p.fetcher,
|
|
291
|
+
fetcherServer: p.fetcherServer,
|
|
292
|
+
fetcherTool: p.fetcherTool,
|
|
293
|
+
fetcherArgs: p.fetcherArgs,
|
|
294
|
+
refreshSec: p.refreshSec,
|
|
295
|
+
changeNote: p.changeNote,
|
|
296
|
+
});
|
|
297
|
+
}
|
|
298
|
+
|
|
299
|
+
/** Inputs for persistVersion; names mirror the files-table columns. */
interface PersistParams {
  logicalPath: string;
  sourceType: SourceType;
  sourcePath: string | null;
  sourceMtimeMs: number | null;
  /** Hex SHA-256 of the raw source bytes. */
  sourceSha: string;
  /** Blob-store key; null for inline sources, which store no blob. */
  blobSha: string | null;
  mime: string;
  /** Raw bytes; null for inline sources (size falls back to markdown length). */
  bytes: Uint8Array | null;
  /** Converted markdown body that gets chunked and embedded. */
  markdown: string;
  /** Hex SHA-256 of `markdown`; recomputed when omitted. */
  contentSha?: string;
  fetcher: FetcherKind;
  fetcherServer: string | null;
  fetcherTool: string | null;
  fetcherArgs: Record<string, unknown> | null;
  /** Refresh cadence in seconds, or null for one-shot ingests. */
  refreshSec: number | null;
  changeNote: string | null;
}
|
|
317
|
+
|
|
318
|
+
/**
|
|
319
|
+
* Insert a new (logical_path, version_id) row plus its chunks. Description
|
|
320
|
+
* is generated on every ingest (LLM with deterministic fallback). The
|
|
321
|
+
* embedded text per chunk is `<path>\n<description>\n\n<body>`, stored
|
|
322
|
+
* verbatim as `chunks.search_text` and later FTS-indexed.
|
|
323
|
+
*/
|
|
324
|
+
async function persistVersion(ctx: AppContext, p: PersistParams): Promise<string> {
|
|
325
|
+
const description = await describe(p.logicalPath, p.mime, p.markdown, ctx.config.llm);
|
|
326
|
+
const chunks = chunkDeterministic(p.markdown, ctx.config.chunker);
|
|
327
|
+
const searchTexts = chunks.map((c) => buildSearchText(p.logicalPath, description, c.content));
|
|
328
|
+
let embeddings: number[][];
|
|
329
|
+
try {
|
|
330
|
+
embeddings = await embed(searchTexts, ctx.config.embedding_model);
|
|
331
|
+
} catch (err) {
|
|
332
|
+
throw asHelpful(
|
|
333
|
+
err,
|
|
334
|
+
`while embedding chunks for ${p.logicalPath}`,
|
|
335
|
+
"Run `bun run prebuild` to apply the transformers WASM patch, or set a different config.embedding_model.",
|
|
336
|
+
);
|
|
337
|
+
}
|
|
338
|
+
|
|
339
|
+
const versionId = millisIso(Date.now());
|
|
340
|
+
const contentSha = p.contentSha ?? sha256Hex(new TextEncoder().encode(p.markdown));
|
|
341
|
+
await insertVersion(ctx.db, {
|
|
342
|
+
logical_path: p.logicalPath,
|
|
343
|
+
version_id: versionId,
|
|
344
|
+
source_type: p.sourceType,
|
|
345
|
+
source_path: p.sourcePath,
|
|
346
|
+
source_mtime_ms: p.sourceMtimeMs,
|
|
347
|
+
source_sha256: p.sourceSha,
|
|
348
|
+
blob_sha256: p.blobSha,
|
|
349
|
+
content_sha256: contentSha,
|
|
350
|
+
content: p.markdown,
|
|
351
|
+
description,
|
|
352
|
+
mime_type: p.mime,
|
|
353
|
+
size_bytes: p.bytes?.byteLength ?? new TextEncoder().encode(p.markdown).byteLength,
|
|
354
|
+
fetcher: p.fetcher,
|
|
355
|
+
fetcher_server: p.fetcherServer,
|
|
356
|
+
fetcher_tool: p.fetcherTool,
|
|
357
|
+
fetcher_args: p.fetcherArgs,
|
|
358
|
+
refresh_frequency_sec: p.refreshSec,
|
|
359
|
+
refreshed_at: new Date().toISOString(),
|
|
360
|
+
last_refresh_status: "ok",
|
|
361
|
+
change_note: p.changeNote,
|
|
362
|
+
});
|
|
363
|
+
|
|
364
|
+
await insertChunksForVersion(
|
|
365
|
+
ctx.db,
|
|
366
|
+
p.logicalPath,
|
|
367
|
+
versionId,
|
|
368
|
+
chunks.map((c, i) => ({
|
|
369
|
+
chunk_index: c.index,
|
|
370
|
+
chunk_content: c.content,
|
|
371
|
+
search_text: searchTexts[i] ?? buildSearchText(p.logicalPath, description, c.content),
|
|
372
|
+
embedding: embeddings[i] ?? new Array(embeddings[0]?.length ?? 0).fill(0),
|
|
373
|
+
})),
|
|
374
|
+
);
|
|
375
|
+
await rebuildFts(ctx.db);
|
|
376
|
+
return versionId;
|
|
377
|
+
}
|
|
378
|
+
|
|
379
|
+
/**
|
|
380
|
+
* Pick the logical path for a single matched entry. For a single-file
|
|
381
|
+
* ingest with explicit `logical_path`, use it as-is. For multi-entry
|
|
382
|
+
* ingests with `logical_path` set, treat it as a *prefix* under which
|
|
383
|
+
* each entry's relative path is placed.
|
|
384
|
+
*/
|
|
385
|
+
function pickLogicalPath(explicit: string | undefined, entry: ResolvedLocalEntry, isMulti: boolean): string {
|
|
386
|
+
if (!explicit) return entry.relPath.replaceAll("\\", "/");
|
|
387
|
+
if (!isMulti) return explicit;
|
|
388
|
+
const prefix = explicit.endsWith("/") ? explicit.slice(0, -1) : explicit;
|
|
389
|
+
return `${prefix}/${entry.relPath.replaceAll("\\", "/")}`;
|
|
390
|
+
}
|
|
391
|
+
|
|
392
|
+
/** Default logical path for an ingested URL — host + path, sanitized. */
|
|
393
|
+
function defaultLogicalForUrl(url: string): string {
|
|
394
|
+
try {
|
|
395
|
+
const u = new URL(url);
|
|
396
|
+
const tail = u.pathname.replace(/^\/+/, "").replaceAll("/", "_") || "root";
|
|
397
|
+
return `urls/${u.hostname}/${tail || "root"}`;
|
|
398
|
+
} catch {
|
|
399
|
+
return `urls/${url.replace(/[^a-z0-9.-]/gi, "_")}`;
|
|
400
|
+
}
|
|
401
|
+
}
|
|
402
|
+
|
|
403
|
+
function defaultInlinePath(): string {
|
|
404
|
+
return `inline/${new Date().toISOString().replace(/[:.]/g, "-")}.md`;
|
|
405
|
+
}
|
|
406
|
+
|
|
407
|
+
/**
|
|
408
|
+
* Convert a duration string (`5m`, `1h`, `24h`, `7d`) to seconds. Returns
|
|
409
|
+
* null when the input is undefined / blank, throws a HelpfulError on
|
|
410
|
+
* malformed input.
|
|
411
|
+
*/
|
|
412
|
+
export function parseDuration(input: string | null | undefined): number | null {
|
|
413
|
+
if (!input?.trim()) return null;
|
|
414
|
+
const m = input.trim().match(/^(\d+)([smhd])$/i);
|
|
415
|
+
if (!m) {
|
|
416
|
+
throw new HelpfulError({
|
|
417
|
+
kind: "input_error",
|
|
418
|
+
message: `invalid duration: ${input}`,
|
|
419
|
+
hint: `Use forms like 5m, 1h, 24h, 7d.`,
|
|
420
|
+
});
|
|
421
|
+
}
|
|
422
|
+
const n = Number(m[1]);
|
|
423
|
+
const unit = m[2]?.toLowerCase() ?? "s";
|
|
424
|
+
const multiplier = unit === "s" ? 1 : unit === "m" ? 60 : unit === "h" ? 3600 : 86400;
|
|
425
|
+
return n * multiplier;
|
|
426
|
+
}
|
|
427
|
+
|
|
428
|
+
/** Roll a list of per-entry results into the top-level summary shape. */
|
|
429
|
+
function summarize(entries: IngestEntryResult[]): IngestResult {
|
|
430
|
+
let ok = 0;
|
|
431
|
+
let failed = 0;
|
|
432
|
+
for (const e of entries) {
|
|
433
|
+
if (e.status === "ok") ok += 1;
|
|
434
|
+
else failed += 1;
|
|
435
|
+
}
|
|
436
|
+
return { ingested: entries, total: entries.length, ok, failed };
|
|
437
|
+
}
|
|
438
|
+
|
|
439
|
+
function errorMessage(err: unknown): string {
|
|
440
|
+
if (err instanceof Error) return err.message;
|
|
441
|
+
return String(err);
|
|
442
|
+
}
|
|
443
|
+
|
|
444
|
+
export { getCurrent };
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import { createHash } from "node:crypto";
|
|
2
|
+
import { stat } from "node:fs/promises";
|
|
3
|
+
import { asHelpful } from "../errors.ts";
|
|
4
|
+
|
|
5
|
+
/** Everything the ingest pipeline needs from one local file read. */
export interface LocalRead {
  bytes: Uint8Array;
  /** Hex SHA-256 of `bytes`. */
  sha256: string;
  /** Last-modified time in milliseconds (from stat). */
  mtimeMs: number;
  /** File size in bytes (from stat). */
  sizeBytes: number;
  /** MIME inferred from the file extension (see mimeFromPath). */
  mimeType: string;
}
|
|
12
|
+
|
|
13
|
+
/**
|
|
14
|
+
* Best-effort filename → MIME mapping using Bun's built-in resolver. Strips
|
|
15
|
+
* the `;charset=...` suffix Bun adds for text types so the value is safe to
|
|
16
|
+
* compare with `===` against fixed MIME strings in the converter dispatch.
|
|
17
|
+
* Lowercases the path's basename so `.PNG` and `.png` resolve identically.
|
|
18
|
+
*/
|
|
19
|
+
export function mimeFromPath(path: string): string {
|
|
20
|
+
const lastSlash = Math.max(path.lastIndexOf("/"), path.lastIndexOf("\\"));
|
|
21
|
+
const base =
|
|
22
|
+
lastSlash >= 0 ? path.slice(0, lastSlash + 1) + path.slice(lastSlash + 1).toLowerCase() : path.toLowerCase();
|
|
23
|
+
const raw = Bun.file(base).type;
|
|
24
|
+
const mime = raw.split(";")[0]?.trim();
|
|
25
|
+
if (!mime || mime === "application/octet-stream") return "application/octet-stream";
|
|
26
|
+
return mime;
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
/**
|
|
30
|
+
* Read a local file: bytes, sha256, last-mtime, size, and an inferred MIME
|
|
31
|
+
* type. Used by the ingest pipeline as the universal entry point for
|
|
32
|
+
* source_type='local'.
|
|
33
|
+
*/
|
|
34
|
+
export async function readLocalFile(path: string): Promise<LocalRead> {
|
|
35
|
+
let stats: Awaited<ReturnType<typeof stat>>;
|
|
36
|
+
try {
|
|
37
|
+
stats = await stat(path);
|
|
38
|
+
} catch (err) {
|
|
39
|
+
throw asHelpful(
|
|
40
|
+
err,
|
|
41
|
+
`while stat'ing ${path}`,
|
|
42
|
+
`Check that the path exists and you have read access. \`ls -la ${path}\`.`,
|
|
43
|
+
"not_found",
|
|
44
|
+
);
|
|
45
|
+
}
|
|
46
|
+
const file = Bun.file(path);
|
|
47
|
+
const ab = await file.arrayBuffer();
|
|
48
|
+
const bytes = new Uint8Array(ab);
|
|
49
|
+
const sha256 = sha256Hex(bytes);
|
|
50
|
+
return {
|
|
51
|
+
bytes,
|
|
52
|
+
sha256,
|
|
53
|
+
mtimeMs: stats.mtimeMs,
|
|
54
|
+
sizeBytes: stats.size,
|
|
55
|
+
mimeType: mimeFromPath(path),
|
|
56
|
+
};
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
/** Compute a hex SHA-256 over the provided bytes. */
|
|
60
|
+
export function sha256Hex(bytes: Uint8Array): string {
|
|
61
|
+
const hash = createHash("sha256");
|
|
62
|
+
hash.update(bytes);
|
|
63
|
+
return hash.digest("hex");
|
|
64
|
+
}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Build the exact string that gets embedded AND FTS-indexed for a chunk.
|
|
3
|
+
* Format:
|
|
4
|
+
* <logical_path>
|
|
5
|
+
* <description>
|
|
6
|
+
* <blank line>
|
|
7
|
+
* <chunk_content>
|
|
8
|
+
*
|
|
9
|
+
* The path + description prefix lifts recall on filename-only or
|
|
10
|
+
* description-only queries (e.g. "the OAuth diagram" finds an empty PNG
|
|
11
|
+
* row whose body is only its caption). Stored verbatim as
|
|
12
|
+
* `chunks.search_text` so retrieval consumers can return clean snippets
|
|
13
|
+
* by reading `chunks.chunk_content` separately.
|
|
14
|
+
*/
|
|
15
|
+
export function buildSearchText(logicalPath: string, description: string | null, chunkContent: string): string {
|
|
16
|
+
const desc = (description ?? "").trim();
|
|
17
|
+
return `${logicalPath}\n${desc}\n\n${chunkContent}`;
|
|
18
|
+
}
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
import { realpath, stat } from "node:fs/promises";
|
|
2
|
+
import { isAbsolute, join, relative, resolve, sep } from "node:path";
|
|
3
|
+
import picomatch from "picomatch";
|
|
4
|
+
import { asHelpful, HelpfulError } from "../errors.ts";
|
|
5
|
+
|
|
6
|
+
/** Discriminated result of expanding the polymorphic `source` argument. */
export type ResolvedSource =
  | { kind: "inline"; text: string; logicalHint: string | null }
  | { kind: "url"; url: string; logicalHint: string | null }
  | { kind: "local-files"; entries: ResolvedLocalEntry[]; basePath: string };

/** One matched local file. */
export interface ResolvedLocalEntry {
  // Fully resolved (realpath'd) absolute path on disk.
  absPath: string;
  /** Path relative to the base; used to derive a default logical_path. */
  relPath: string;
}

/** Options controlling local file matching. */
export interface ResolveOptions {
  // Comma-separated include globs; defaults to matching every file.
  include?: string;
  // Comma-separated exclude globs, added on top of DEFAULT_EXCLUDES.
  exclude?: string;
  // Follow symlinks while walking (default true).
  followSymlinks?: boolean;
}

// Noise directories/files always excluded from local walks.
const DEFAULT_EXCLUDES = ["**/node_modules/**", "**/.git/**", "**/.DS_Store", "**/dist/**", "**/.cache/**"];
|
|
24
|
+
|
|
25
|
+
/**
|
|
26
|
+
* Polymorphic source-arg expander. Accepts:
|
|
27
|
+
* - "inline:<text>" → inline literal
|
|
28
|
+
* - "http://..." | "https://..." → URL (fetched downstream by fetcher)
|
|
29
|
+
* - existing file → single-file local
|
|
30
|
+
* - existing directory → recursive walk (symlinks via realpath cache)
|
|
31
|
+
* - glob pattern → picomatch-filtered walk
|
|
32
|
+
*
|
|
33
|
+
* Symlink loops are broken via a realpath cache. Include / exclude are
|
|
34
|
+
* applied to the entry's path *relative to the base* so users don't need
|
|
35
|
+
* absolute-path globs.
|
|
36
|
+
*/
|
|
37
|
+
export async function resolveSource(source: string, options: ResolveOptions = {}): Promise<ResolvedSource> {
|
|
38
|
+
if (source.startsWith("inline:")) {
|
|
39
|
+
return { kind: "inline", text: source.slice("inline:".length), logicalHint: null };
|
|
40
|
+
}
|
|
41
|
+
if (source.startsWith("http://") || source.startsWith("https://")) {
|
|
42
|
+
return { kind: "url", url: source, logicalHint: null };
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
const followSymlinks = options.followSymlinks !== false;
|
|
46
|
+
const includeMatchers = (options.include ?? "**/*")
|
|
47
|
+
.split(",")
|
|
48
|
+
.map((g) => g.trim())
|
|
49
|
+
.filter(Boolean);
|
|
50
|
+
const excludeMatchers = [
|
|
51
|
+
...DEFAULT_EXCLUDES,
|
|
52
|
+
...(options.exclude ?? "")
|
|
53
|
+
.split(",")
|
|
54
|
+
.map((g) => g.trim())
|
|
55
|
+
.filter(Boolean),
|
|
56
|
+
];
|
|
57
|
+
|
|
58
|
+
if (isGlob(source)) {
|
|
59
|
+
const base = globBase(source);
|
|
60
|
+
try {
|
|
61
|
+
const realBase = await realpath(base);
|
|
62
|
+
return walk(realBase, [source, ...includeMatchers], excludeMatchers, followSymlinks);
|
|
63
|
+
} catch (err) {
|
|
64
|
+
throw asHelpful(
|
|
65
|
+
err,
|
|
66
|
+
`while resolving glob base ${base}`,
|
|
67
|
+
`Check that the directory ${base} exists.`,
|
|
68
|
+
"not_found",
|
|
69
|
+
);
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
const abs = resolve(source);
|
|
74
|
+
let st: Awaited<ReturnType<typeof stat>>;
|
|
75
|
+
try {
|
|
76
|
+
st = await stat(abs);
|
|
77
|
+
} catch (err) {
|
|
78
|
+
throw asHelpful(
|
|
79
|
+
err,
|
|
80
|
+
`while stat'ing ${source}`,
|
|
81
|
+
`Check that the path exists. Run \`ls ${source}\`. For URLs, prefix with http:// or https://.`,
|
|
82
|
+
"not_found",
|
|
83
|
+
);
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
if (st.isFile()) {
|
|
87
|
+
return {
|
|
88
|
+
kind: "local-files",
|
|
89
|
+
basePath: abs,
|
|
90
|
+
entries: [{ absPath: abs, relPath: source.split(sep).pop() ?? source }],
|
|
91
|
+
};
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
if (st.isDirectory()) {
|
|
95
|
+
const realBase = await realpath(abs);
|
|
96
|
+
return walk(realBase, includeMatchers, excludeMatchers, followSymlinks);
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
throw new HelpfulError({
|
|
100
|
+
kind: "input_error",
|
|
101
|
+
message: `${source} is neither a file, directory, nor URL`,
|
|
102
|
+
hint: `Pass a file path, directory, glob (e.g. "docs/**/*.md"), URL, or "inline:<text>".`,
|
|
103
|
+
});
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
/** Crude glob detector — matches what picomatch treats as a pattern. */
|
|
107
|
+
export function isGlob(s: string): boolean {
|
|
108
|
+
return /[*?[\]{}!]/.test(s);
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
/** Take the static directory prefix of a glob (everything before the first wildcard). */
|
|
112
|
+
export function globBase(glob: string): string {
|
|
113
|
+
const parts = glob.split(sep);
|
|
114
|
+
const out: string[] = [];
|
|
115
|
+
for (const p of parts) {
|
|
116
|
+
if (/[*?[\]{}!]/.test(p)) break;
|
|
117
|
+
out.push(p);
|
|
118
|
+
}
|
|
119
|
+
const base = out.join(sep);
|
|
120
|
+
return base.length === 0 || !isAbsolute(base) ? resolve(base || ".") : base;
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
/**
|
|
124
|
+
* Recursively walk `base`, returning files matched by `includes` and not
|
|
125
|
+
* matched by `excludes`. Both globsets match against the entry's path
|
|
126
|
+
* relative to `base`. Symlinks are followed when `followSymlinks` is true,
|
|
127
|
+
* with cycles detected via a realpath cache.
|
|
128
|
+
*/
|
|
129
|
+
async function walk(
|
|
130
|
+
base: string,
|
|
131
|
+
includes: string[],
|
|
132
|
+
excludes: string[],
|
|
133
|
+
followSymlinks: boolean,
|
|
134
|
+
): Promise<ResolvedSource> {
|
|
135
|
+
const seen = new Set<string>();
|
|
136
|
+
const entries: ResolvedLocalEntry[] = [];
|
|
137
|
+
|
|
138
|
+
const isInclude = picomatch(includes, { dot: false, nocase: false });
|
|
139
|
+
const isExclude = excludes.length ? picomatch(excludes, { dot: false }) : null;
|
|
140
|
+
|
|
141
|
+
const queue: string[] = [base];
|
|
142
|
+
while (queue.length > 0) {
|
|
143
|
+
const cur = queue.shift();
|
|
144
|
+
if (cur === undefined) break;
|
|
145
|
+
let real: string;
|
|
146
|
+
try {
|
|
147
|
+
real = await realpath(cur);
|
|
148
|
+
} catch {
|
|
149
|
+
continue;
|
|
150
|
+
}
|
|
151
|
+
if (seen.has(real)) continue;
|
|
152
|
+
seen.add(real);
|
|
153
|
+
let st: Awaited<ReturnType<typeof stat>>;
|
|
154
|
+
try {
|
|
155
|
+
st = await stat(real);
|
|
156
|
+
} catch {
|
|
157
|
+
continue;
|
|
158
|
+
}
|
|
159
|
+
if (st.isSymbolicLink() && !followSymlinks) continue;
|
|
160
|
+
if (st.isDirectory()) {
|
|
161
|
+
let names: string[];
|
|
162
|
+
try {
|
|
163
|
+
names = await readdir(real);
|
|
164
|
+
} catch {
|
|
165
|
+
continue;
|
|
166
|
+
}
|
|
167
|
+
for (const name of names) {
|
|
168
|
+
queue.push(join(real, name));
|
|
169
|
+
}
|
|
170
|
+
continue;
|
|
171
|
+
}
|
|
172
|
+
if (!st.isFile()) continue;
|
|
173
|
+
const rel = relative(base, real);
|
|
174
|
+
const relForMatch = rel.length === 0 ? (cur.split(sep).pop() ?? cur) : rel;
|
|
175
|
+
if (isExclude?.(relForMatch)) continue;
|
|
176
|
+
if (!isInclude(relForMatch)) continue;
|
|
177
|
+
entries.push({ absPath: real, relPath: relForMatch });
|
|
178
|
+
}
|
|
179
|
+
|
|
180
|
+
return { kind: "local-files", basePath: base, entries };
|
|
181
|
+
}
|
|
182
|
+
|
|
183
|
+
async function readdir(path: string): Promise<string[]> {
|
|
184
|
+
const fs = await import("node:fs/promises");
|
|
185
|
+
return fs.readdir(path);
|
|
186
|
+
}
|