membot 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. package/package.json +81 -24
  2. package/patches/@huggingface%2Ftransformers@4.2.0.patch +137 -0
  3. package/scripts/apply-transformers-patch.sh +35 -0
  4. package/src/cli.ts +70 -0
  5. package/src/commands/check-update.ts +69 -0
  6. package/src/commands/mcpx.ts +112 -0
  7. package/src/commands/reindex.ts +53 -0
  8. package/src/commands/serve.ts +58 -0
  9. package/src/commands/upgrade.ts +220 -0
  10. package/src/config/loader.ts +100 -0
  11. package/src/config/schemas.ts +39 -0
  12. package/src/constants.ts +42 -0
  13. package/src/context.ts +80 -0
  14. package/src/db/blobs.ts +53 -0
  15. package/src/db/chunks.ts +176 -0
  16. package/src/db/connection.ts +173 -0
  17. package/src/db/files.ts +325 -0
  18. package/src/db/migrations/001-init.ts +63 -0
  19. package/src/db/migrations/002-fts.ts +12 -0
  20. package/src/db/migrations.ts +45 -0
  21. package/src/errors.ts +87 -0
  22. package/src/ingest/chunker.ts +117 -0
  23. package/src/ingest/converter/docx.ts +15 -0
  24. package/src/ingest/converter/html.ts +20 -0
  25. package/src/ingest/converter/image.ts +71 -0
  26. package/src/ingest/converter/index.ts +119 -0
  27. package/src/ingest/converter/llm.ts +66 -0
  28. package/src/ingest/converter/ocr.ts +51 -0
  29. package/src/ingest/converter/pdf.ts +38 -0
  30. package/src/ingest/converter/text.ts +8 -0
  31. package/src/ingest/describer.ts +72 -0
  32. package/src/ingest/embedder.ts +83 -0
  33. package/src/ingest/fetcher.ts +280 -0
  34. package/src/ingest/ingest.ts +444 -0
  35. package/src/ingest/local-reader.ts +64 -0
  36. package/src/ingest/search-text.ts +18 -0
  37. package/src/ingest/source-resolver.ts +186 -0
  38. package/src/mcp/instructions.ts +34 -0
  39. package/src/mcp/server.ts +101 -0
  40. package/src/mount/commander.ts +174 -0
  41. package/src/mount/mcp.ts +111 -0
  42. package/src/mount/zod-to-cli.ts +158 -0
  43. package/src/operations/add.ts +69 -0
  44. package/src/operations/diff.ts +105 -0
  45. package/src/operations/index.ts +38 -0
  46. package/src/operations/info.ts +95 -0
  47. package/src/operations/list.ts +87 -0
  48. package/src/operations/move.ts +83 -0
  49. package/src/operations/prune.ts +80 -0
  50. package/src/operations/read.ts +102 -0
  51. package/src/operations/refresh.ts +72 -0
  52. package/src/operations/remove.ts +35 -0
  53. package/src/operations/search.ts +72 -0
  54. package/src/operations/tree.ts +103 -0
  55. package/src/operations/types.ts +81 -0
  56. package/src/operations/versions.ts +78 -0
  57. package/src/operations/write.ts +77 -0
  58. package/src/output/formatter.ts +68 -0
  59. package/src/output/logger.ts +114 -0
  60. package/src/output/progress.ts +78 -0
  61. package/src/output/tty.ts +91 -0
  62. package/src/refresh/runner.ts +296 -0
  63. package/src/refresh/scheduler.ts +54 -0
  64. package/src/sdk.ts +27 -0
  65. package/src/search/hybrid.ts +100 -0
  66. package/src/search/keyword.ts +62 -0
  67. package/src/search/semantic.ts +56 -0
  68. package/src/update/background.ts +73 -0
  69. package/src/update/cache.ts +40 -0
  70. package/src/update/checker.ts +117 -0
  71. package/.claude/settings.local.json +0 -7
  72. package/CLAUDE.md +0 -139
  73. package/docs/plan.md +0 -905
@@ -0,0 +1,444 @@
1
+ import type { AppContext } from "../context.ts";
2
+ import { upsertBlob } from "../db/blobs.ts";
3
+ import { insertChunksForVersion, rebuildFts } from "../db/chunks.ts";
4
+ import { type FetcherKind, getCurrent, insertVersion, millisIso, type SourceType } from "../db/files.ts";
5
+ import { asHelpful, HelpfulError } from "../errors.ts";
6
+ import { chunkDeterministic } from "./chunker.ts";
7
+ import { convert } from "./converter/index.ts";
8
+ import { describe } from "./describer.ts";
9
+ import { embed } from "./embedder.ts";
10
+ import { fetchRemote } from "./fetcher.ts";
11
+ import { readLocalFile, sha256Hex } from "./local-reader.ts";
12
+ import { buildSearchText } from "./search-text.ts";
13
+ import { type ResolvedLocalEntry, type ResolvedSource, resolveSource } from "./source-resolver.ts";
14
+
15
+ export interface IngestInput {
16
+ source: string;
17
+ logical_path?: string;
18
+ include?: string;
19
+ exclude?: string;
20
+ follow_symlinks?: boolean;
21
+ refresh_frequency?: string;
22
+ fetcher_hint?: string;
23
+ change_note?: string;
24
+ }
25
+
26
+ export interface IngestEntryResult {
27
+ source_path: string;
28
+ logical_path: string;
29
+ version_id: string | null;
30
+ status: "ok" | "failed";
31
+ error?: string;
32
+ mime_type: string | null;
33
+ size_bytes: number;
34
+ fetcher: FetcherKind;
35
+ source_sha256: string;
36
+ }
37
+
38
+ export interface IngestResult {
39
+ ingested: IngestEntryResult[];
40
+ total: number;
41
+ ok: number;
42
+ failed: number;
43
+ }
44
+
45
+ /**
46
+ * Top-level ingest orchestrator. Resolves the source arg, dispatches to the
47
+ * right reader (local / remote / inline), runs the pipeline (convert →
48
+ * describe → chunk → embed → write), and returns one entry per matched
49
+ * file. Partial failures are reported per-entry; the entire call doesn't
50
+ * abort because one URL or PDF is bad.
51
+ */
52
+ export async function ingest(input: IngestInput, ctx: AppContext): Promise<IngestResult> {
53
+ const resolved = await resolveSource(input.source, {
54
+ include: input.include,
55
+ exclude: input.exclude,
56
+ followSymlinks: input.follow_symlinks ?? true,
57
+ });
58
+
59
+ const refreshSec = parseDuration(input.refresh_frequency);
60
+
61
+ if (resolved.kind === "inline") {
62
+ return ingestInline(resolved.text, input, ctx, refreshSec);
63
+ }
64
+ if (resolved.kind === "url") {
65
+ return ingestUrl(resolved.url, input, ctx, refreshSec);
66
+ }
67
+ return ingestLocalFiles(resolved, input, ctx, refreshSec);
68
+ }
69
+
70
+ /** Ingest a single inline blob (source_type='inline'). */
71
+ async function ingestInline(
72
+ text: string,
73
+ input: IngestInput,
74
+ ctx: AppContext,
75
+ refreshSec: number | null,
76
+ ): Promise<IngestResult> {
77
+ const logicalPath = input.logical_path ?? defaultInlinePath();
78
+ const bytes = new TextEncoder().encode(text);
79
+ const sha = sha256Hex(bytes);
80
+ const result: IngestEntryResult = {
81
+ source_path: "inline:",
82
+ logical_path: logicalPath,
83
+ version_id: null,
84
+ status: "ok",
85
+ mime_type: "text/markdown",
86
+ size_bytes: bytes.byteLength,
87
+ fetcher: "inline",
88
+ source_sha256: sha,
89
+ };
90
+ try {
91
+ const versionId = await persistVersion(ctx, {
92
+ logicalPath,
93
+ sourceType: "inline",
94
+ sourcePath: null,
95
+ sourceMtimeMs: null,
96
+ sourceSha: sha,
97
+ blobSha: null,
98
+ mime: "text/markdown",
99
+ bytes: null,
100
+ markdown: text,
101
+ fetcher: "inline",
102
+ fetcherServer: null,
103
+ fetcherTool: null,
104
+ fetcherArgs: null,
105
+ refreshSec,
106
+ changeNote: input.change_note ?? null,
107
+ });
108
+ result.version_id = versionId;
109
+ } catch (err) {
110
+ result.status = "failed";
111
+ result.error = errorMessage(err);
112
+ }
113
+ return summarize([result]);
114
+ }
115
+
116
+ /** Ingest one URL (source_type='remote'). */
117
+ async function ingestUrl(
118
+ url: string,
119
+ input: IngestInput,
120
+ ctx: AppContext,
121
+ refreshSec: number | null,
122
+ ): Promise<IngestResult> {
123
+ const mcpxAdapter = ctx.mcpx
124
+ ? {
125
+ async listTools() {
126
+ const tools = await ctx.mcpx!.listTools();
127
+ return tools;
128
+ },
129
+ async exec(server: string, tool: string, args: Record<string, unknown>) {
130
+ return ctx.mcpx!.exec(server, tool, args);
131
+ },
132
+ }
133
+ : null;
134
+
135
+ const logicalPath = input.logical_path ?? defaultLogicalForUrl(url);
136
+ const result: IngestEntryResult = {
137
+ source_path: url,
138
+ logical_path: logicalPath,
139
+ version_id: null,
140
+ status: "ok",
141
+ mime_type: null,
142
+ size_bytes: 0,
143
+ fetcher: "http",
144
+ source_sha256: "",
145
+ };
146
+
147
+ try {
148
+ const fetched = await fetchRemote(url, { hint: input.fetcher_hint, mcpx: mcpxAdapter });
149
+ result.mime_type = fetched.mimeType;
150
+ result.size_bytes = fetched.bytes.byteLength;
151
+ result.fetcher = fetched.fetcher;
152
+ result.source_sha256 = fetched.sha256;
153
+
154
+ const versionId = await pipelineForBytes(ctx, {
155
+ logicalPath,
156
+ bytes: fetched.bytes,
157
+ mime: fetched.mimeType,
158
+ source: url,
159
+ sourceType: "remote",
160
+ sourcePath: url,
161
+ sourceMtimeMs: null,
162
+ sourceSha: fetched.sha256,
163
+ fetcher: fetched.fetcher,
164
+ fetcherServer: fetched.fetcherServer,
165
+ fetcherTool: fetched.fetcherTool,
166
+ fetcherArgs: fetched.fetcherArgs,
167
+ refreshSec,
168
+ changeNote: input.change_note ?? null,
169
+ });
170
+ result.version_id = versionId;
171
+ } catch (err) {
172
+ result.status = "failed";
173
+ result.error = errorMessage(err);
174
+ }
175
+ return summarize([result]);
176
+ }
177
+
178
+ /** Ingest a list of local files (source_type='local'). One transaction per entry. */
179
+ async function ingestLocalFiles(
180
+ resolved: Extract<ResolvedSource, { kind: "local-files" }>,
181
+ input: IngestInput,
182
+ ctx: AppContext,
183
+ refreshSec: number | null,
184
+ ): Promise<IngestResult> {
185
+ if (resolved.entries.length === 0) {
186
+ throw new HelpfulError({
187
+ kind: "input_error",
188
+ message: `Glob/path matched 0 files`,
189
+ hint: `Try a broader pattern (e.g. ./**/*.md) or relax --exclude.`,
190
+ });
191
+ }
192
+
193
+ const results: IngestEntryResult[] = [];
194
+ ctx.progress.start(resolved.entries.length, "ingest");
195
+ const isMulti = resolved.entries.length > 1;
196
+
197
+ for (const entry of resolved.entries) {
198
+ ctx.progress.tick(entry.relPath);
199
+ const logicalPath = pickLogicalPath(input.logical_path, entry, isMulti);
200
+ const result: IngestEntryResult = {
201
+ source_path: entry.absPath,
202
+ logical_path: logicalPath,
203
+ version_id: null,
204
+ status: "ok",
205
+ mime_type: null,
206
+ size_bytes: 0,
207
+ fetcher: "local",
208
+ source_sha256: "",
209
+ };
210
+ try {
211
+ const local = await readLocalFile(entry.absPath);
212
+ result.mime_type = local.mimeType;
213
+ result.size_bytes = local.sizeBytes;
214
+ result.source_sha256 = local.sha256;
215
+
216
+ const versionId = await pipelineForBytes(ctx, {
217
+ logicalPath,
218
+ bytes: local.bytes,
219
+ mime: local.mimeType,
220
+ source: entry.absPath,
221
+ sourceType: "local",
222
+ sourcePath: entry.absPath,
223
+ sourceMtimeMs: local.mtimeMs,
224
+ sourceSha: local.sha256,
225
+ fetcher: "local",
226
+ fetcherServer: null,
227
+ fetcherTool: null,
228
+ fetcherArgs: null,
229
+ refreshSec,
230
+ changeNote: input.change_note ?? null,
231
+ });
232
+ result.version_id = versionId;
233
+ } catch (err) {
234
+ result.status = "failed";
235
+ result.error = errorMessage(err);
236
+ }
237
+ results.push(result);
238
+ }
239
+ ctx.progress.done(`ingested ${results.filter((r) => r.status === "ok").length}/${results.length}`);
240
+
241
+ return summarize(results);
242
+ }
243
+
244
+ interface PipelineParams {
245
+ logicalPath: string;
246
+ bytes: Uint8Array;
247
+ mime: string;
248
+ source: string;
249
+ sourceType: SourceType;
250
+ sourcePath: string | null;
251
+ sourceMtimeMs: number | null;
252
+ sourceSha: string;
253
+ fetcher: FetcherKind;
254
+ fetcherServer: string | null;
255
+ fetcherTool: string | null;
256
+ fetcherArgs: Record<string, unknown> | null;
257
+ refreshSec: number | null;
258
+ changeNote: string | null;
259
+ }
260
+
261
+ /**
262
+ * Run the bytes-in / version-out pipeline: store the blob, convert to
263
+ * markdown, describe, chunk, embed, and write a new files row + chunks
264
+ * rows under a fresh version_id. Returns the version_id so callers can
265
+ * report it back.
266
+ */
267
+ async function pipelineForBytes(ctx: AppContext, p: PipelineParams): Promise<string> {
268
+ await upsertBlob(ctx.db, {
269
+ sha256: p.sourceSha,
270
+ mime_type: p.mime,
271
+ size_bytes: p.bytes.byteLength,
272
+ bytes: p.bytes,
273
+ });
274
+
275
+ const conversion = await convert(p.bytes, p.mime, p.source, ctx.config.llm);
276
+ const markdown = conversion.markdown;
277
+ const contentSha = sha256Hex(new TextEncoder().encode(markdown));
278
+
279
+ return persistVersion(ctx, {
280
+ logicalPath: p.logicalPath,
281
+ sourceType: p.sourceType,
282
+ sourcePath: p.sourcePath,
283
+ sourceMtimeMs: p.sourceMtimeMs,
284
+ sourceSha: p.sourceSha,
285
+ blobSha: p.sourceSha,
286
+ mime: p.mime,
287
+ bytes: p.bytes,
288
+ markdown,
289
+ contentSha,
290
+ fetcher: p.fetcher,
291
+ fetcherServer: p.fetcherServer,
292
+ fetcherTool: p.fetcherTool,
293
+ fetcherArgs: p.fetcherArgs,
294
+ refreshSec: p.refreshSec,
295
+ changeNote: p.changeNote,
296
+ });
297
+ }
298
+
299
+ interface PersistParams {
300
+ logicalPath: string;
301
+ sourceType: SourceType;
302
+ sourcePath: string | null;
303
+ sourceMtimeMs: number | null;
304
+ sourceSha: string;
305
+ blobSha: string | null;
306
+ mime: string;
307
+ bytes: Uint8Array | null;
308
+ markdown: string;
309
+ contentSha?: string;
310
+ fetcher: FetcherKind;
311
+ fetcherServer: string | null;
312
+ fetcherTool: string | null;
313
+ fetcherArgs: Record<string, unknown> | null;
314
+ refreshSec: number | null;
315
+ changeNote: string | null;
316
+ }
317
+
318
+ /**
319
+ * Insert a new (logical_path, version_id) row plus its chunks. Description
320
+ * is generated on every ingest (LLM with deterministic fallback). The
321
+ * embedded text per chunk is `<path>\n<description>\n\n<body>`, stored
322
+ * verbatim as `chunks.search_text` and later FTS-indexed.
323
+ */
324
+ async function persistVersion(ctx: AppContext, p: PersistParams): Promise<string> {
325
+ const description = await describe(p.logicalPath, p.mime, p.markdown, ctx.config.llm);
326
+ const chunks = chunkDeterministic(p.markdown, ctx.config.chunker);
327
+ const searchTexts = chunks.map((c) => buildSearchText(p.logicalPath, description, c.content));
328
+ let embeddings: number[][];
329
+ try {
330
+ embeddings = await embed(searchTexts, ctx.config.embedding_model);
331
+ } catch (err) {
332
+ throw asHelpful(
333
+ err,
334
+ `while embedding chunks for ${p.logicalPath}`,
335
+ "Run `bun run prebuild` to apply the transformers WASM patch, or set a different config.embedding_model.",
336
+ );
337
+ }
338
+
339
+ const versionId = millisIso(Date.now());
340
+ const contentSha = p.contentSha ?? sha256Hex(new TextEncoder().encode(p.markdown));
341
+ await insertVersion(ctx.db, {
342
+ logical_path: p.logicalPath,
343
+ version_id: versionId,
344
+ source_type: p.sourceType,
345
+ source_path: p.sourcePath,
346
+ source_mtime_ms: p.sourceMtimeMs,
347
+ source_sha256: p.sourceSha,
348
+ blob_sha256: p.blobSha,
349
+ content_sha256: contentSha,
350
+ content: p.markdown,
351
+ description,
352
+ mime_type: p.mime,
353
+ size_bytes: p.bytes?.byteLength ?? new TextEncoder().encode(p.markdown).byteLength,
354
+ fetcher: p.fetcher,
355
+ fetcher_server: p.fetcherServer,
356
+ fetcher_tool: p.fetcherTool,
357
+ fetcher_args: p.fetcherArgs,
358
+ refresh_frequency_sec: p.refreshSec,
359
+ refreshed_at: new Date().toISOString(),
360
+ last_refresh_status: "ok",
361
+ change_note: p.changeNote,
362
+ });
363
+
364
+ await insertChunksForVersion(
365
+ ctx.db,
366
+ p.logicalPath,
367
+ versionId,
368
+ chunks.map((c, i) => ({
369
+ chunk_index: c.index,
370
+ chunk_content: c.content,
371
+ search_text: searchTexts[i] ?? buildSearchText(p.logicalPath, description, c.content),
372
+ embedding: embeddings[i] ?? new Array(embeddings[0]?.length ?? 0).fill(0),
373
+ })),
374
+ );
375
+ await rebuildFts(ctx.db);
376
+ return versionId;
377
+ }
378
+
379
+ /**
380
+ * Pick the logical path for a single matched entry. For a single-file
381
+ * ingest with explicit `logical_path`, use it as-is. For multi-entry
382
+ * ingests with `logical_path` set, treat it as a *prefix* under which
383
+ * each entry's relative path is placed.
384
+ */
385
+ function pickLogicalPath(explicit: string | undefined, entry: ResolvedLocalEntry, isMulti: boolean): string {
386
+ if (!explicit) return entry.relPath.replaceAll("\\", "/");
387
+ if (!isMulti) return explicit;
388
+ const prefix = explicit.endsWith("/") ? explicit.slice(0, -1) : explicit;
389
+ return `${prefix}/${entry.relPath.replaceAll("\\", "/")}`;
390
+ }
391
+
392
+ /** Default logical path for an ingested URL — host + path, sanitized. */
393
+ function defaultLogicalForUrl(url: string): string {
394
+ try {
395
+ const u = new URL(url);
396
+ const tail = u.pathname.replace(/^\/+/, "").replaceAll("/", "_") || "root";
397
+ return `urls/${u.hostname}/${tail || "root"}`;
398
+ } catch {
399
+ return `urls/${url.replace(/[^a-z0-9.-]/gi, "_")}`;
400
+ }
401
+ }
402
+
403
+ function defaultInlinePath(): string {
404
+ return `inline/${new Date().toISOString().replace(/[:.]/g, "-")}.md`;
405
+ }
406
+
407
+ /**
408
+ * Convert a duration string (`5m`, `1h`, `24h`, `7d`) to seconds. Returns
409
+ * null when the input is undefined / blank, throws a HelpfulError on
410
+ * malformed input.
411
+ */
412
+ export function parseDuration(input: string | null | undefined): number | null {
413
+ if (!input?.trim()) return null;
414
+ const m = input.trim().match(/^(\d+)([smhd])$/i);
415
+ if (!m) {
416
+ throw new HelpfulError({
417
+ kind: "input_error",
418
+ message: `invalid duration: ${input}`,
419
+ hint: `Use forms like 5m, 1h, 24h, 7d.`,
420
+ });
421
+ }
422
+ const n = Number(m[1]);
423
+ const unit = m[2]?.toLowerCase() ?? "s";
424
+ const multiplier = unit === "s" ? 1 : unit === "m" ? 60 : unit === "h" ? 3600 : 86400;
425
+ return n * multiplier;
426
+ }
427
+
428
+ /** Roll a list of per-entry results into the top-level summary shape. */
429
+ function summarize(entries: IngestEntryResult[]): IngestResult {
430
+ let ok = 0;
431
+ let failed = 0;
432
+ for (const e of entries) {
433
+ if (e.status === "ok") ok += 1;
434
+ else failed += 1;
435
+ }
436
+ return { ingested: entries, total: entries.length, ok, failed };
437
+ }
438
+
439
+ function errorMessage(err: unknown): string {
440
+ if (err instanceof Error) return err.message;
441
+ return String(err);
442
+ }
443
+
444
+ export { getCurrent };
@@ -0,0 +1,64 @@
1
+ import { createHash } from "node:crypto";
2
+ import { stat } from "node:fs/promises";
3
+ import { asHelpful } from "../errors.ts";
4
+
5
+ export interface LocalRead {
6
+ bytes: Uint8Array;
7
+ sha256: string;
8
+ mtimeMs: number;
9
+ sizeBytes: number;
10
+ mimeType: string;
11
+ }
12
+
13
+ /**
14
+ * Best-effort filename → MIME mapping using Bun's built-in resolver. Strips
15
+ * the `;charset=...` suffix Bun adds for text types so the value is safe to
16
+ * compare with `===` against fixed MIME strings in the converter dispatch.
17
+ * Lowercases the path's basename so `.PNG` and `.png` resolve identically.
18
+ */
19
+ export function mimeFromPath(path: string): string {
20
+ const lastSlash = Math.max(path.lastIndexOf("/"), path.lastIndexOf("\\"));
21
+ const base =
22
+ lastSlash >= 0 ? path.slice(0, lastSlash + 1) + path.slice(lastSlash + 1).toLowerCase() : path.toLowerCase();
23
+ const raw = Bun.file(base).type;
24
+ const mime = raw.split(";")[0]?.trim();
25
+ if (!mime || mime === "application/octet-stream") return "application/octet-stream";
26
+ return mime;
27
+ }
28
+
29
+ /**
30
+ * Read a local file: bytes, sha256, last-mtime, size, and an inferred MIME
31
+ * type. Used by the ingest pipeline as the universal entry point for
32
+ * source_type='local'.
33
+ */
34
+ export async function readLocalFile(path: string): Promise<LocalRead> {
35
+ let stats: Awaited<ReturnType<typeof stat>>;
36
+ try {
37
+ stats = await stat(path);
38
+ } catch (err) {
39
+ throw asHelpful(
40
+ err,
41
+ `while stat'ing ${path}`,
42
+ `Check that the path exists and you have read access. \`ls -la ${path}\`.`,
43
+ "not_found",
44
+ );
45
+ }
46
+ const file = Bun.file(path);
47
+ const ab = await file.arrayBuffer();
48
+ const bytes = new Uint8Array(ab);
49
+ const sha256 = sha256Hex(bytes);
50
+ return {
51
+ bytes,
52
+ sha256,
53
+ mtimeMs: stats.mtimeMs,
54
+ sizeBytes: stats.size,
55
+ mimeType: mimeFromPath(path),
56
+ };
57
+ }
58
+
59
+ /** Compute a hex SHA-256 over the provided bytes. */
60
+ export function sha256Hex(bytes: Uint8Array): string {
61
+ const hash = createHash("sha256");
62
+ hash.update(bytes);
63
+ return hash.digest("hex");
64
+ }
@@ -0,0 +1,18 @@
1
+ /**
2
+ * Build the exact string that gets embedded AND FTS-indexed for a chunk.
3
+ * Format:
4
+ * <logical_path>
5
+ * <description>
6
+ * <blank line>
7
+ * <chunk_content>
8
+ *
9
+ * The path + description prefix lifts recall on filename-only or
10
+ * description-only queries (e.g. "the OAuth diagram" finds an empty PNG
11
+ * row whose body is only its caption). Stored verbatim as
12
+ * `chunks.search_text` so retrieval consumers can return clean snippets
13
+ * by reading `chunks.chunk_content` separately.
14
+ */
15
+ export function buildSearchText(logicalPath: string, description: string | null, chunkContent: string): string {
16
+ const desc = (description ?? "").trim();
17
+ return `${logicalPath}\n${desc}\n\n${chunkContent}`;
18
+ }
@@ -0,0 +1,186 @@
1
+ import { realpath, stat } from "node:fs/promises";
2
+ import { isAbsolute, join, relative, resolve, sep } from "node:path";
3
+ import picomatch from "picomatch";
4
+ import { asHelpful, HelpfulError } from "../errors.ts";
5
+
6
+ export type ResolvedSource =
7
+ | { kind: "inline"; text: string; logicalHint: string | null }
8
+ | { kind: "url"; url: string; logicalHint: string | null }
9
+ | { kind: "local-files"; entries: ResolvedLocalEntry[]; basePath: string };
10
+
11
+ export interface ResolvedLocalEntry {
12
+ absPath: string;
13
+ /** Path relative to the base; used to derive a default logical_path. */
14
+ relPath: string;
15
+ }
16
+
17
+ export interface ResolveOptions {
18
+ include?: string;
19
+ exclude?: string;
20
+ followSymlinks?: boolean;
21
+ }
22
+
23
+ const DEFAULT_EXCLUDES = ["**/node_modules/**", "**/.git/**", "**/.DS_Store", "**/dist/**", "**/.cache/**"];
24
+
25
+ /**
26
+ * Polymorphic source-arg expander. Accepts:
27
+ * - "inline:<text>" → inline literal
28
+ * - "http://..." | "https://..." → URL (fetched downstream by fetcher)
29
+ * - existing file → single-file local
30
+ * - existing directory → recursive walk (symlinks via realpath cache)
31
+ * - glob pattern → picomatch-filtered walk
32
+ *
33
+ * Symlink loops are broken via a realpath cache. Include / exclude are
34
+ * applied to the entry's path *relative to the base* so users don't need
35
+ * absolute-path globs.
36
+ */
37
+ export async function resolveSource(source: string, options: ResolveOptions = {}): Promise<ResolvedSource> {
38
+ if (source.startsWith("inline:")) {
39
+ return { kind: "inline", text: source.slice("inline:".length), logicalHint: null };
40
+ }
41
+ if (source.startsWith("http://") || source.startsWith("https://")) {
42
+ return { kind: "url", url: source, logicalHint: null };
43
+ }
44
+
45
+ const followSymlinks = options.followSymlinks !== false;
46
+ const includeMatchers = (options.include ?? "**/*")
47
+ .split(",")
48
+ .map((g) => g.trim())
49
+ .filter(Boolean);
50
+ const excludeMatchers = [
51
+ ...DEFAULT_EXCLUDES,
52
+ ...(options.exclude ?? "")
53
+ .split(",")
54
+ .map((g) => g.trim())
55
+ .filter(Boolean),
56
+ ];
57
+
58
+ if (isGlob(source)) {
59
+ const base = globBase(source);
60
+ try {
61
+ const realBase = await realpath(base);
62
+ return walk(realBase, [source, ...includeMatchers], excludeMatchers, followSymlinks);
63
+ } catch (err) {
64
+ throw asHelpful(
65
+ err,
66
+ `while resolving glob base ${base}`,
67
+ `Check that the directory ${base} exists.`,
68
+ "not_found",
69
+ );
70
+ }
71
+ }
72
+
73
+ const abs = resolve(source);
74
+ let st: Awaited<ReturnType<typeof stat>>;
75
+ try {
76
+ st = await stat(abs);
77
+ } catch (err) {
78
+ throw asHelpful(
79
+ err,
80
+ `while stat'ing ${source}`,
81
+ `Check that the path exists. Run \`ls ${source}\`. For URLs, prefix with http:// or https://.`,
82
+ "not_found",
83
+ );
84
+ }
85
+
86
+ if (st.isFile()) {
87
+ return {
88
+ kind: "local-files",
89
+ basePath: abs,
90
+ entries: [{ absPath: abs, relPath: source.split(sep).pop() ?? source }],
91
+ };
92
+ }
93
+
94
+ if (st.isDirectory()) {
95
+ const realBase = await realpath(abs);
96
+ return walk(realBase, includeMatchers, excludeMatchers, followSymlinks);
97
+ }
98
+
99
+ throw new HelpfulError({
100
+ kind: "input_error",
101
+ message: `${source} is neither a file, directory, nor URL`,
102
+ hint: `Pass a file path, directory, glob (e.g. "docs/**/*.md"), URL, or "inline:<text>".`,
103
+ });
104
+ }
105
+
106
+ /** Crude glob detector — matches what picomatch treats as a pattern. */
107
+ export function isGlob(s: string): boolean {
108
+ return /[*?[\]{}!]/.test(s);
109
+ }
110
+
111
+ /** Take the static directory prefix of a glob (everything before the first wildcard). */
112
+ export function globBase(glob: string): string {
113
+ const parts = glob.split(sep);
114
+ const out: string[] = [];
115
+ for (const p of parts) {
116
+ if (/[*?[\]{}!]/.test(p)) break;
117
+ out.push(p);
118
+ }
119
+ const base = out.join(sep);
120
+ return base.length === 0 || !isAbsolute(base) ? resolve(base || ".") : base;
121
+ }
122
+
123
+ /**
124
+ * Recursively walk `base`, returning files matched by `includes` and not
125
+ * matched by `excludes`. Both globsets match against the entry's path
126
+ * relative to `base`. Symlinks are followed when `followSymlinks` is true,
127
+ * with cycles detected via a realpath cache.
128
+ */
129
+ async function walk(
130
+ base: string,
131
+ includes: string[],
132
+ excludes: string[],
133
+ followSymlinks: boolean,
134
+ ): Promise<ResolvedSource> {
135
+ const seen = new Set<string>();
136
+ const entries: ResolvedLocalEntry[] = [];
137
+
138
+ const isInclude = picomatch(includes, { dot: false, nocase: false });
139
+ const isExclude = excludes.length ? picomatch(excludes, { dot: false }) : null;
140
+
141
+ const queue: string[] = [base];
142
+ while (queue.length > 0) {
143
+ const cur = queue.shift();
144
+ if (cur === undefined) break;
145
+ let real: string;
146
+ try {
147
+ real = await realpath(cur);
148
+ } catch {
149
+ continue;
150
+ }
151
+ if (seen.has(real)) continue;
152
+ seen.add(real);
153
+ let st: Awaited<ReturnType<typeof stat>>;
154
+ try {
155
+ st = await stat(real);
156
+ } catch {
157
+ continue;
158
+ }
159
+ if (st.isSymbolicLink() && !followSymlinks) continue;
160
+ if (st.isDirectory()) {
161
+ let names: string[];
162
+ try {
163
+ names = await readdir(real);
164
+ } catch {
165
+ continue;
166
+ }
167
+ for (const name of names) {
168
+ queue.push(join(real, name));
169
+ }
170
+ continue;
171
+ }
172
+ if (!st.isFile()) continue;
173
+ const rel = relative(base, real);
174
+ const relForMatch = rel.length === 0 ? (cur.split(sep).pop() ?? cur) : rel;
175
+ if (isExclude?.(relForMatch)) continue;
176
+ if (!isInclude(relForMatch)) continue;
177
+ entries.push({ absPath: real, relPath: relForMatch });
178
+ }
179
+
180
+ return { kind: "local-files", basePath: base, entries };
181
+ }
182
+
183
+ async function readdir(path: string): Promise<string[]> {
184
+ const fs = await import("node:fs/promises");
185
+ return fs.readdir(path);
186
+ }