@crewhaus/tool-document-ingest 0.1.4 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,37 @@
1
+ import { CrewhausError } from "@crewhaus/errors";
2
+ import type { RegisteredTool } from "@crewhaus/tool-catalog";
3
+ export type DocumentParserResult = {
4
+ readonly content: string;
5
+ /** Optional structured metadata to surface to the model. */
6
+ readonly metadata?: Record<string, unknown>;
7
+ };
8
+ export type DocumentParser = (path: string) => Promise<DocumentParserResult> | DocumentParserResult;
9
+ export declare class DocumentIngestError extends CrewhausError {
10
+ readonly name = "DocumentIngestError";
11
+ constructor(message: string, cause?: unknown);
12
+ }
13
+ export declare class ToolPermissionError extends CrewhausError {
14
+ readonly name = "ToolPermissionError";
15
+ readonly toolName: string;
16
+ readonly path: string;
17
+ constructor(toolName: string, attemptedPath: string);
18
+ }
19
+ /**
20
+ * Register a parser for a file extension. Operators who need PDF/docx
21
+ * support wire their preferred library here:
22
+ *
23
+ * import pdfParse from "pdf-parse";
24
+ * registerDocumentParser(".pdf", async (path) => {
25
+ * const buf = await fs.readFile(path);
26
+ * const { text, numpages } = await pdfParse(buf);
27
+ * return { content: text, metadata: { pages: numpages } };
28
+ * });
29
+ *
30
+ * Extension is matched case-insensitively. Must start with ".".
31
+ */
32
+ export declare function registerDocumentParser(ext: string, parser: DocumentParser): void;
33
+ /**
34
+ * Clear all registered parsers. For tests.
35
+ */
36
+ export declare function clearDocumentParsers(): void;
37
+ export declare const ingestDocument: RegisteredTool;
package/dist/index.js ADDED
@@ -0,0 +1,268 @@
1
+ /**
2
+ * Catalog R3 — tool-document-ingest. M4.3 of the heavy-hitter plan.
3
+ *
4
+ * `IngestDocument(path)` reads a file from the user's host and returns
5
+ * its content plus structured metadata (line count, byte size, MIME
6
+ * guess, optional chunks).
7
+ *
8
+ * v0 supported formats — handled inline, zero extra deps:
9
+ * - .txt, .md, .mdx — plain UTF-8 text
10
+ * - .csv, .tsv — text plus row count
11
+ * - .json, .yaml, .yml — text plus parse-validation
12
+ * - .log, .out — plain text
13
+ *
14
+ * Stubbed formats — return a clear "needs operator-registered parser"
15
+ * error pointing at `registerDocumentParser(ext, parser)`:
16
+ * - .pdf, .docx, .doc, .xlsx, .xls, .pptx, .epub
17
+ *
18
+ * Why no pdf-parse / mammoth / xlsx deps in v0: those packages weigh
19
+ * several MB each and have native sub-deps. Operators who need them
20
+ * can register their own parser via `registerDocumentParser`; the
21
+ * tool's contract stays the same.
22
+ *
23
+ * Security note: the path is user-controlled (the model supplies it, and
24
+ * the model may be steered by injected content). Two defenses apply:
25
+ * 1. Containment — the path is resolved against `process.cwd()` and
26
+ * rejected if it escapes the workspace root, lexically (`..` or
27
+ * absolute escapes) or via an in-root symlink whose real target
28
+ * lies outside (CWE-59). See `resolveSafe` below.
29
+ * 2. Output classification — the runtime classifies the OUTPUT via
30
+ * boundary-classifier with the existing `tool` origin (Pillar 3);
31
+ * file contents may contain anything (e.g. a prompt-injecting PDF).
32
+ */
33
+ import { closeSync, existsSync, constants as fsConstants, fstatSync, openSync, readSync, realpathSync, statSync, } from "node:fs";
34
+ import { basename, dirname, extname, join, resolve, sep } from "node:path";
35
+ import { CrewhausError } from "@crewhaus/errors";
36
+ import { buildTool } from "@crewhaus/tool-builder";
37
+ import { z } from "zod";
38
+ export class DocumentIngestError extends CrewhausError {
39
+ name = "DocumentIngestError";
40
+ constructor(message, cause) {
41
+ super("tool", message, cause);
42
+ }
43
+ }
44
+ export class ToolPermissionError extends CrewhausError {
45
+ name = "ToolPermissionError";
46
+ toolName;
47
+ path;
48
+ constructor(toolName, attemptedPath) {
49
+ super("tool", `tool "${toolName}" rejected path "${attemptedPath}": resolved location escapes the workspace root`);
50
+ this.toolName = toolName;
51
+ this.path = attemptedPath;
52
+ }
53
+ }
54
+ /**
55
+ * Resolve `rel` against the workspace root and reject anything that escapes.
56
+ * Mirrors `tool-fs`'s `resolveSafe` — duplicated here (like `tool-image`)
57
+ * rather than extracted to a shared package; keep the copies in sync.
58
+ */
59
+ function resolveSafe(toolName, rel, root = process.cwd()) {
60
+ const rootResolved = resolve(root);
61
+ const abs = resolve(rootResolved, rel);
62
+ // 1) Lexical containment — fast path; rejects `..` and absolute escapes.
63
+ // The trailing `sep` avoids the `/root` vs `/root-sibling` pitfall.
64
+ if (abs !== rootResolved && !abs.startsWith(`${rootResolved}${sep}`)) {
65
+ throw new ToolPermissionError(toolName, rel);
66
+ }
67
+ // 2) Symlink-aware containment (CWE-59). The lexical check above is fooled
68
+ // by an in-root symlink that points outside the workspace, so re-check
69
+ // the REAL path. The leaf may not exist (the file-not-found error comes
70
+ // after containment so escaping paths never leak existence info), so
71
+ // resolve the deepest existing ancestor and re-append the missing tail.
72
+ // Fails closed if realpath errors for any reason other than the walk.
73
+ let real;
74
+ try {
75
+ const rootReal = realpathSync(rootResolved);
76
+ let probe = abs;
77
+ const tail = [];
78
+ while (!existsSync(probe)) {
79
+ tail.unshift(basename(probe));
80
+ const parent = dirname(probe);
81
+ if (parent === probe)
82
+ break; // reached the filesystem root
83
+ probe = parent;
84
+ }
85
+ real = tail.length > 0 ? join(realpathSync(probe), ...tail) : realpathSync(probe);
86
+ if (real !== rootReal && !real.startsWith(`${rootReal}${sep}`)) {
87
+ throw new ToolPermissionError(toolName, rel);
88
+ }
89
+ }
90
+ catch (err) {
91
+ if (err instanceof ToolPermissionError)
92
+ throw err;
93
+ throw new ToolPermissionError(toolName, rel);
94
+ }
95
+ // Return the validated REAL path; the read below opens it with O_NOFOLLOW so a
96
+ // leaf swapped to a symlink after this check (TOCTOU/CWE-367) is rejected.
97
+ return real;
98
+ }
99
+ /**
100
+ * Read a resolveSafe-validated text file with O_NOFOLLOW so a leaf swapped to a
101
+ * symlink after the containment check is rejected rather than followed out of
102
+ * the workspace. (Custom parsers read the resolved realpath themselves and are
103
+ * out of this guard's scope.)
104
+ */
105
+ function readTextNoFollow(absPath) {
106
+ let fd;
107
+ try {
108
+ fd = openSync(absPath, fsConstants.O_RDONLY | fsConstants.O_NOFOLLOW);
109
+ }
110
+ catch (err) {
111
+ if (err.code === "ELOOP") {
112
+ throw new ToolPermissionError("IngestDocument", absPath);
113
+ }
114
+ throw err;
115
+ }
116
+ try {
117
+ const { size } = fstatSync(fd);
118
+ const b = Buffer.allocUnsafe(size);
119
+ let offset = 0;
120
+ while (offset < size) {
121
+ const n = readSync(fd, b, offset, size - offset, offset);
122
+ if (n === 0)
123
+ break;
124
+ offset += n;
125
+ }
126
+ return b.toString("utf-8", 0, offset);
127
+ }
128
+ finally {
129
+ closeSync(fd);
130
+ }
131
+ }
132
+ const TEXT_EXTENSIONS = new Set([".txt", ".md", ".mdx", ".log", ".out", ".rst"]);
133
+ const TABULAR_EXTENSIONS = new Set([".csv", ".tsv"]);
134
+ const STRUCTURED_EXTENSIONS = new Set([".json", ".yaml", ".yml"]);
135
+ const STUB_EXTENSIONS = new Set([".pdf", ".docx", ".doc", ".xlsx", ".xls", ".pptx", ".epub"]);
136
+ const customParsers = new Map();
137
+ /**
138
+ * Register a parser for a file extension. Operators who need PDF/docx
139
+ * support wire their preferred library here:
140
+ *
141
+ * import pdfParse from "pdf-parse";
142
+ * registerDocumentParser(".pdf", async (path) => {
143
+ * const buf = await fs.readFile(path);
144
+ * const { text, numpages } = await pdfParse(buf);
145
+ * return { content: text, metadata: { pages: numpages } };
146
+ * });
147
+ *
148
+ * Extension is matched case-insensitively. Must start with ".".
149
+ */
150
+ export function registerDocumentParser(ext, parser) {
151
+ if (!ext.startsWith(".")) {
152
+ throw new DocumentIngestError(`extension must start with "." (got "${ext}")`);
153
+ }
154
+ customParsers.set(ext.toLowerCase(), parser);
155
+ }
156
+ /**
157
+ * Clear all registered parsers. For tests.
158
+ */
159
+ export function clearDocumentParsers() {
160
+ customParsers.clear();
161
+ }
162
+ const inputSchema = z.object({
163
+ path: z
164
+ .string()
165
+ .min(1)
166
+ .describe("Workspace-relative path to the file. Paths escaping the workspace are rejected."),
167
+ maxBytes: z
168
+ .number()
169
+ .int()
170
+ .positive()
171
+ .max(10_000_000)
172
+ .optional()
173
+ .describe("Hard cap. Default 1MB. Files larger than this are truncated with a notice."),
174
+ });
175
+ const DEFAULT_MAX_BYTES = 1_000_000;
176
+ export const ingestDocument = buildTool({
177
+ name: "IngestDocument",
178
+ description: "Read a file inside the workspace and return its content with structured metadata. Paths escaping the workspace root are rejected. Supports plain text, CSV/TSV, JSON, YAML out of the box; PDF/docx/xlsx need an operator-registered parser.",
179
+ inputSchema,
180
+ readOnly: true,
181
+ destructive: false,
182
+ execute: async (input) => {
183
+ const abs = resolveSafe("IngestDocument", input.path);
184
+ if (!existsSync(abs)) {
185
+ throw new DocumentIngestError(`file not found: ${abs}`);
186
+ }
187
+ const stat = statSync(abs);
188
+ if (!stat.isFile()) {
189
+ throw new DocumentIngestError(`not a regular file: ${abs}`);
190
+ }
191
+ const ext = extname(abs).toLowerCase();
192
+ const maxBytes = input.maxBytes ?? DEFAULT_MAX_BYTES;
193
+ // Operator-registered parser takes priority over built-in handling.
194
+ const customParser = customParsers.get(ext);
195
+ if (customParser !== undefined) {
196
+ const result = await customParser(abs);
197
+ return renderResult({
198
+ path: abs,
199
+ content: result.content,
200
+ metadata: { ext, size: stat.size, ...result.metadata },
201
+ maxBytes,
202
+ });
203
+ }
204
+ if (STUB_EXTENSIONS.has(ext)) {
205
+ throw new DocumentIngestError(`extension "${ext}" needs a parser registered via registerDocumentParser(). See @crewhaus/tool-document-ingest README for the pdf-parse / mammoth / xlsx setup.`);
206
+ }
207
+ if (TEXT_EXTENSIONS.has(ext) ||
208
+ TABULAR_EXTENSIONS.has(ext) ||
209
+ STRUCTURED_EXTENSIONS.has(ext) ||
210
+ ext === "") {
211
+ const raw = readTextNoFollow(abs);
212
+ const metadata = {
213
+ ext: ext || "(none)",
214
+ size: stat.size,
215
+ lines: countLines(raw),
216
+ };
217
+ if (TABULAR_EXTENSIONS.has(ext)) {
218
+ const delim = ext === ".tsv" ? "\t" : ",";
219
+ const lines = raw.split("\n").filter((l) => l.length > 0);
220
+ metadata["rows"] = lines.length;
221
+ metadata["columns"] = (lines[0] ?? "").split(delim).length;
222
+ }
223
+ if (STRUCTURED_EXTENSIONS.has(ext)) {
224
+ if (ext === ".json") {
225
+ try {
226
+ JSON.parse(raw);
227
+ metadata["valid_json"] = true;
228
+ }
229
+ catch (err) {
230
+ metadata["valid_json"] = false;
231
+ metadata["parse_error"] = err.message.slice(0, 200);
232
+ }
233
+ }
234
+ }
235
+ return renderResult({ path: abs, content: raw, metadata, maxBytes });
236
+ }
237
+ throw new DocumentIngestError(`extension "${ext}" is not handled by built-in ingest. Register a parser via registerDocumentParser("${ext}", …), or rename to .txt/.md if it's plain text.`);
238
+ },
239
+ });
240
+ function renderResult(args) {
241
+ let content = args.content;
242
+ let truncated = false;
243
+ if (Buffer.byteLength(content, "utf8") > args.maxBytes) {
244
+ content = Buffer.from(content, "utf-8").subarray(0, args.maxBytes).toString("utf-8");
245
+ truncated = true;
246
+ }
247
+ const lines = [
248
+ `<document path="${args.path}" name="${basename(args.path)}">`,
249
+ `metadata: ${JSON.stringify(args.metadata)}${truncated ? ` (TRUNCATED to ${args.maxBytes} bytes)` : ""}`,
250
+ "---",
251
+ content,
252
+ "</document>",
253
+ ];
254
+ return lines.join("\n");
255
+ }
256
+ function countLines(s) {
257
+ if (s.length === 0)
258
+ return 0;
259
+ let n = 1;
260
+ for (const ch of s) {
261
+ if (ch === "\n")
262
+ n++;
263
+ }
264
+ // A trailing newline is a line terminator, not a 4th empty line.
265
+ if (s.endsWith("\n"))
266
+ n--;
267
+ return n;
268
+ }
package/package.json CHANGED
@@ -1,20 +1,23 @@
1
1
  {
2
2
  "name": "@crewhaus/tool-document-ingest",
3
- "version": "0.1.4",
3
+ "version": "0.1.5",
4
4
  "type": "module",
5
5
  "description": "M4.3 — IngestDocument tool. Reads .txt/.md/.csv/.json/.yaml inline; PDF/docx/xlsx via pluggable parsers (deferred to operator-provided handlers).",
6
- "main": "src/index.ts",
7
- "types": "src/index.ts",
6
+ "main": "dist/index.js",
7
+ "types": "dist/index.d.ts",
8
8
  "exports": {
9
- ".": "./src/index.ts"
9
+ ".": {
10
+ "types": "./dist/index.d.ts",
11
+ "import": "./dist/index.js"
12
+ }
10
13
  },
11
14
  "scripts": {
12
15
  "test": "bun test src"
13
16
  },
14
17
  "dependencies": {
15
- "@crewhaus/errors": "0.1.4",
16
- "@crewhaus/tool-builder": "0.1.4",
17
- "@crewhaus/tool-catalog": "0.1.4",
18
+ "@crewhaus/errors": "0.1.5",
19
+ "@crewhaus/tool-builder": "0.1.5",
20
+ "@crewhaus/tool-catalog": "0.1.5",
18
21
  "zod": "^3.23.8"
19
22
  },
20
23
  "license": "Apache-2.0",
@@ -35,5 +38,5 @@
35
38
  "publishConfig": {
36
39
  "access": "public"
37
40
  },
38
- "files": ["src", "README.md", "LICENSE", "NOTICE"]
41
+ "files": ["dist", "README.md", "LICENSE", "NOTICE"]
39
42
  }
package/src/index.test.ts DELETED
@@ -1,217 +0,0 @@
1
- /**
2
- * Tests for tool-document-ingest. Built-in handling for plain text /
3
- * structured / tabular formats, operator-registered parsers, and
4
- * workspace path containment (mirrors tool-fs's traversal-rejection
5
- * cases).
6
- *
7
- * The harness chdirs into a temp workspace because IngestDocument is
8
- * sandboxed to `process.cwd()` — tests address files by workspace-
9
- * relative path.
10
- */
11
- import { afterEach, beforeEach, describe, expect, test } from "bun:test";
12
- import { mkdirSync, mkdtempSync, realpathSync, rmSync, symlinkSync, writeFileSync } from "node:fs";
13
- import { tmpdir } from "node:os";
14
- import { join } from "node:path";
15
- import { CrewhausError } from "@crewhaus/errors";
16
- import {
17
- DocumentIngestError,
18
- ToolPermissionError,
19
- clearDocumentParsers,
20
- ingestDocument,
21
- registerDocumentParser,
22
- } from "./index";
23
-
24
- let tmp: string;
25
- let originalCwd: string;
26
-
27
- beforeEach(() => {
28
- originalCwd = process.cwd();
29
- // realpath so absolute-path-inside-workspace assertions hold on macOS,
30
- // where tmpdir() lives behind the /var → /private/var symlink.
31
- tmp = realpathSync(mkdtempSync(join(tmpdir(), "doc-ingest-")));
32
- process.chdir(tmp);
33
- clearDocumentParsers();
34
- });
35
-
36
- afterEach(() => {
37
- process.chdir(originalCwd);
38
- rmSync(tmp, { recursive: true, force: true });
39
- clearDocumentParsers();
40
- });
41
-
42
- /** Write `content` into the temp workspace; returns the workspace-relative path. */
43
- function writeFile(name: string, content: string): string {
44
- writeFileSync(join(tmp, name), content);
45
- return name;
46
- }
47
-
48
- describe("ingestDocument — basics", () => {
49
- test("tool flags: read-only, non-destructive, named 'IngestDocument'", () => {
50
- expect(ingestDocument.name).toBe("IngestDocument");
51
- expect(ingestDocument.readOnly).toBe(true);
52
- expect(ingestDocument.destructive).toBe(false);
53
- });
54
-
55
- test("throws when file does not exist", async () => {
56
- await expect(ingestDocument.execute({ path: "does/not/exist.txt" })).rejects.toThrow(
57
- DocumentIngestError,
58
- );
59
- });
60
-
61
- test("ingests a plain .txt file with the document envelope", async () => {
62
- const path = writeFile("note.txt", "hello\nworld\n");
63
- const result = await ingestDocument.execute({ path });
64
- expect(result).toContain("<document path=");
65
- expect(result).toContain("hello");
66
- expect(result).toContain("</document>");
67
- expect(result).toContain("metadata:");
68
- });
69
-
70
- test("emits line count + ext in metadata for .md", async () => {
71
- const path = writeFile("doc.md", "# title\n\nbody.\n");
72
- const result = await ingestDocument.execute({ path });
73
- expect(result).toMatch(/"ext":"\.md"/);
74
- expect(result).toMatch(/"lines":3/);
75
- });
76
- });
77
-
78
- describe("ingestDocument — path containment", () => {
79
- test("ToolPermissionError is a CrewhausError with code 'tool'", () => {
80
- const err = new ToolPermissionError("IngestDocument", "../../escape");
81
- expect(err).toBeInstanceOf(CrewhausError);
82
- expect(err.code).toBe("tool");
83
- expect(err.toolName).toBe("IngestDocument");
84
- expect(err.path).toBe("../../escape");
85
- expect(err.message).toContain("escapes the workspace root");
86
- });
87
-
88
- test("rejects parent-directory traversal", async () => {
89
- await expect(ingestDocument.execute({ path: "../../../etc/passwd" })).rejects.toBeInstanceOf(
90
- ToolPermissionError,
91
- );
92
- });
93
-
94
- test("rejects absolute path outside workspace", async () => {
95
- await expect(ingestDocument.execute({ path: "/etc/passwd" })).rejects.toBeInstanceOf(
96
- ToolPermissionError,
97
- );
98
- });
99
-
100
- test("rejects subdir-then-traversal", async () => {
101
- mkdirSync(join(tmp, "sub"));
102
- await expect(ingestDocument.execute({ path: "sub/../../escape.txt" })).rejects.toBeInstanceOf(
103
- ToolPermissionError,
104
- );
105
- });
106
-
107
- test("rejects an in-root symlink whose target escapes the workspace", async () => {
108
- const outside = mkdtempSync(join(tmpdir(), "doc-ingest-outside-"));
109
- try {
110
- writeFileSync(join(outside, "secret.txt"), "top secret");
111
- symlinkSync(join(outside, "secret.txt"), join(tmp, "link.txt"));
112
- await expect(ingestDocument.execute({ path: "link.txt" })).rejects.toBeInstanceOf(
113
- ToolPermissionError,
114
- );
115
- } finally {
116
- rmSync(outside, { recursive: true, force: true });
117
- }
118
- });
119
-
120
- test("allows an absolute path inside the workspace", async () => {
121
- writeFile("inside.txt", "in-root content");
122
- const result = await ingestDocument.execute({ path: join(tmp, "inside.txt") });
123
- expect(result).toContain("in-root content");
124
- });
125
-
126
- test("allows an in-root symlink to an in-root file (no over-blocking)", async () => {
127
- writeFile("real.txt", "linked content");
128
- symlinkSync(join(tmp, "real.txt"), join(tmp, "good-link.txt"));
129
- const result = await ingestDocument.execute({ path: "good-link.txt" });
130
- expect(result).toContain("linked content");
131
- });
132
- });
133
-
134
- describe("ingestDocument — tabular formats", () => {
135
- test("counts rows + columns in CSV", async () => {
136
- const path = writeFile("data.csv", "a,b,c\n1,2,3\n4,5,6\n");
137
- const result = await ingestDocument.execute({ path });
138
- expect(result).toMatch(/"rows":3/);
139
- expect(result).toMatch(/"columns":3/);
140
- });
141
-
142
- test("uses tab delimiter for .tsv", async () => {
143
- const path = writeFile("data.tsv", "a\tb\n1\t2\n");
144
- const result = await ingestDocument.execute({ path });
145
- expect(result).toMatch(/"columns":2/);
146
- });
147
- });
148
-
149
- describe("ingestDocument — structured formats", () => {
150
- test("validates JSON parse for .json files", async () => {
151
- const path = writeFile("config.json", '{"k":1}');
152
- const result = await ingestDocument.execute({ path });
153
- expect(result).toMatch(/"valid_json":true/);
154
- });
155
-
156
- test("flags malformed JSON without throwing", async () => {
157
- const path = writeFile("config.json", "{not json}");
158
- const result = await ingestDocument.execute({ path });
159
- expect(result).toMatch(/"valid_json":false/);
160
- expect(result).toContain("parse_error");
161
- });
162
- });
163
-
164
- describe("ingestDocument — stubbed extensions", () => {
165
- test(".pdf raises with a pointer to registerDocumentParser", async () => {
166
- const path = writeFile("doc.pdf", "%PDF-1.4 fake content");
167
- await expect(ingestDocument.execute({ path })).rejects.toThrow(
168
- /needs a parser registered via registerDocumentParser/,
169
- );
170
- });
171
-
172
- test(".docx and .xlsx similarly throw with extension-specific message", async () => {
173
- const docx = writeFile("a.docx", "fake docx");
174
- await expect(ingestDocument.execute({ path: docx })).rejects.toThrow(/"\.docx"/);
175
- const xlsx = writeFile("b.xlsx", "fake xlsx");
176
- await expect(ingestDocument.execute({ path: xlsx })).rejects.toThrow(/"\.xlsx"/);
177
- });
178
- });
179
-
180
- describe("ingestDocument — operator-registered parsers", () => {
181
- test("registerDocumentParser overrides built-in handling", async () => {
182
- registerDocumentParser(".pdf", async (path) => ({
183
- content: `parsed PDF text from ${path}`,
184
- metadata: { pages: 42 },
185
- }));
186
- const path = writeFile("doc.pdf", "fake content");
187
- const result = await ingestDocument.execute({ path });
188
- expect(result).toContain("parsed PDF text from");
189
- expect(result).toMatch(/"pages":42/);
190
- });
191
-
192
- test("registerDocumentParser rejects extensions without leading dot", () => {
193
- expect(() => registerDocumentParser("pdf", async () => ({ content: "" }))).toThrow(
194
- DocumentIngestError,
195
- );
196
- });
197
-
198
- test("ext matching is case-insensitive", async () => {
199
- registerDocumentParser(".PDF", async () => ({ content: "uppercase ext" }));
200
- const path = writeFile("doc.pdf", "x");
201
- const result = await ingestDocument.execute({ path });
202
- expect(result).toContain("uppercase ext");
203
- });
204
- });
205
-
206
- describe("ingestDocument — size cap", () => {
207
- test("truncates content above maxBytes with a TRUNCATED notice", async () => {
208
- const path = writeFile("big.txt", "x".repeat(200));
209
- const result = await ingestDocument.execute({ path, maxBytes: 50 });
210
- expect(result).toContain("TRUNCATED to 50 bytes");
211
- });
212
-
213
- test("default maxBytes is 1MB", () => {
214
- const parsed = ingestDocument.inputSchema.parse({ path: "x" });
215
- expect(parsed.maxBytes).toBeUndefined();
216
- });
217
- });
package/src/index.ts DELETED
@@ -1,309 +0,0 @@
1
- /**
2
- * Catalog R3 — tool-document-ingest. M4.3 of the heavy-hitter plan.
3
- *
4
- * `IngestDocument(path)` reads a file from the user's host and returns
5
- * its content plus structured metadata (line count, byte size, MIME
6
- * guess, optional chunks).
7
- *
8
- * v0 supported formats — handled inline, zero extra deps:
9
- * - .txt, .md, .mdx — plain UTF-8 text
10
- * - .csv, .tsv — text plus row count
11
- * - .json, .yaml, .yml — text plus parse-validation
12
- * - .log, .out — plain text
13
- *
14
- * Stubbed formats — return a clear "needs operator-registered parser"
15
- * error pointing at `registerDocumentParser(ext, parser)`:
16
- * - .pdf, .docx, .doc, .xlsx, .xls, .pptx, .epub
17
- *
18
- * Why no pdf-parse / mammoth / xlsx deps in v0: those packages weigh
19
- * several MB each and have native sub-deps. Operators who need them
20
- * can register their own parser via `registerDocumentParser`; the
21
- * tool's contract stays the same.
22
- *
23
- * Security note: the path is user-controlled (the model supplies it, and
24
- * the model may be steered by injected content). Two defenses apply:
25
- * 1. Containment — the path is resolved against `process.cwd()` and
26
- * rejected if it escapes the workspace root, lexically (`..` or
27
- * absolute escapes) or via an in-root symlink whose real target
28
- * lies outside (CWE-59). See `resolveSafe` below.
29
- * 2. Output classification — the runtime classifies the OUTPUT via
30
- * boundary-classifier with the existing `tool` origin (Pillar 3);
31
- * file contents may contain anything (e.g. a prompt-injecting PDF).
32
- */
33
- import {
34
- closeSync,
35
- existsSync,
36
- constants as fsConstants,
37
- fstatSync,
38
- openSync,
39
- readSync,
40
- realpathSync,
41
- statSync,
42
- } from "node:fs";
43
- import { basename, dirname, extname, join, resolve, sep } from "node:path";
44
- import { CrewhausError } from "@crewhaus/errors";
45
- import { buildTool } from "@crewhaus/tool-builder";
46
- import type { RegisteredTool } from "@crewhaus/tool-catalog";
47
- import { z } from "zod";
48
-
49
- export type DocumentParserResult = {
50
- readonly content: string;
51
- /** Optional structured metadata to surface to the model. */
52
- readonly metadata?: Record<string, unknown>;
53
- };
54
-
55
- export type DocumentParser = (path: string) => Promise<DocumentParserResult> | DocumentParserResult;
56
-
57
- export class DocumentIngestError extends CrewhausError {
58
- override readonly name = "DocumentIngestError";
59
- constructor(message: string, cause?: unknown) {
60
- super("tool", message, cause);
61
- }
62
- }
63
-
64
- export class ToolPermissionError extends CrewhausError {
65
- override readonly name = "ToolPermissionError";
66
- readonly toolName: string;
67
- readonly path: string;
68
-
69
- constructor(toolName: string, attemptedPath: string) {
70
- super(
71
- "tool",
72
- `tool "${toolName}" rejected path "${attemptedPath}": resolved location escapes the workspace root`,
73
- );
74
- this.toolName = toolName;
75
- this.path = attemptedPath;
76
- }
77
- }
78
-
79
- /**
80
- * Resolve `rel` against the workspace root and reject anything that escapes.
81
- * Mirrors `tool-fs`'s `resolveSafe` — duplicated here (like `tool-image`)
82
- * rather than extracted to a shared package; keep the copies in sync.
83
- */
84
- function resolveSafe(toolName: string, rel: string, root: string = process.cwd()): string {
85
- const rootResolved = resolve(root);
86
- const abs = resolve(rootResolved, rel);
87
- // 1) Lexical containment — fast path; rejects `..` and absolute escapes.
88
- // The trailing `sep` avoids the `/root` vs `/root-sibling` pitfall.
89
- if (abs !== rootResolved && !abs.startsWith(`${rootResolved}${sep}`)) {
90
- throw new ToolPermissionError(toolName, rel);
91
- }
92
- // 2) Symlink-aware containment (CWE-59). The lexical check above is fooled
93
- // by an in-root symlink that points outside the workspace, so re-check
94
- // the REAL path. The leaf may not exist (the file-not-found error comes
95
- // after containment so escaping paths never leak existence info), so
96
- // resolve the deepest existing ancestor and re-append the missing tail.
97
- // Fails closed if realpath errors for any reason other than the walk.
98
- let real: string;
99
- try {
100
- const rootReal = realpathSync(rootResolved);
101
- let probe = abs;
102
- const tail: string[] = [];
103
- while (!existsSync(probe)) {
104
- tail.unshift(basename(probe));
105
- const parent = dirname(probe);
106
- if (parent === probe) break; // reached the filesystem root
107
- probe = parent;
108
- }
109
- real = tail.length > 0 ? join(realpathSync(probe), ...tail) : realpathSync(probe);
110
- if (real !== rootReal && !real.startsWith(`${rootReal}${sep}`)) {
111
- throw new ToolPermissionError(toolName, rel);
112
- }
113
- } catch (err) {
114
- if (err instanceof ToolPermissionError) throw err;
115
- throw new ToolPermissionError(toolName, rel);
116
- }
117
- // Return the validated REAL path; the read below opens it with O_NOFOLLOW so a
118
- // leaf swapped to a symlink after this check (TOCTOU/CWE-367) is rejected.
119
- return real;
120
- }
121
-
122
- /**
123
- * Read a resolveSafe-validated text file with O_NOFOLLOW so a leaf swapped to a
124
- * symlink after the containment check is rejected rather than followed out of
125
- * the workspace. (Custom parsers read the resolved realpath themselves and are
126
- * out of this guard's scope.)
127
- */
128
- function readTextNoFollow(absPath: string): string {
129
- let fd: number;
130
- try {
131
- fd = openSync(absPath, fsConstants.O_RDONLY | fsConstants.O_NOFOLLOW);
132
- } catch (err) {
133
- if ((err as NodeJS.ErrnoException).code === "ELOOP") {
134
- throw new ToolPermissionError("IngestDocument", absPath);
135
- }
136
- throw err;
137
- }
138
- try {
139
- const { size } = fstatSync(fd);
140
- const b = Buffer.allocUnsafe(size);
141
- let offset = 0;
142
- while (offset < size) {
143
- const n = readSync(fd, b, offset, size - offset, offset);
144
- if (n === 0) break;
145
- offset += n;
146
- }
147
- return b.toString("utf-8", 0, offset);
148
- } finally {
149
- closeSync(fd);
150
- }
151
- }
152
-
153
- const TEXT_EXTENSIONS = new Set([".txt", ".md", ".mdx", ".log", ".out", ".rst"]);
154
- const TABULAR_EXTENSIONS = new Set([".csv", ".tsv"]);
155
- const STRUCTURED_EXTENSIONS = new Set([".json", ".yaml", ".yml"]);
156
- const STUB_EXTENSIONS = new Set([".pdf", ".docx", ".doc", ".xlsx", ".xls", ".pptx", ".epub"]);
157
-
158
- const customParsers = new Map<string, DocumentParser>();
159
-
160
- /**
161
- * Register a parser for a file extension. Operators who need PDF/docx
162
- * support wire their preferred library here:
163
- *
164
- * import pdfParse from "pdf-parse";
165
- * registerDocumentParser(".pdf", async (path) => {
166
- * const buf = await fs.readFile(path);
167
- * const { text, numpages } = await pdfParse(buf);
168
- * return { content: text, metadata: { pages: numpages } };
169
- * });
170
- *
171
- * Extension is matched case-insensitively. Must start with ".".
172
- */
173
- export function registerDocumentParser(ext: string, parser: DocumentParser): void {
174
- if (!ext.startsWith(".")) {
175
- throw new DocumentIngestError(`extension must start with "." (got "${ext}")`);
176
- }
177
- customParsers.set(ext.toLowerCase(), parser);
178
- }
179
-
180
- /**
181
- * Clear all registered parsers. For tests.
182
- */
183
- export function clearDocumentParsers(): void {
184
- customParsers.clear();
185
- }
186
-
187
- const inputSchema = z.object({
188
- path: z
189
- .string()
190
- .min(1)
191
- .describe("Workspace-relative path to the file. Paths escaping the workspace are rejected."),
192
- maxBytes: z
193
- .number()
194
- .int()
195
- .positive()
196
- .max(10_000_000)
197
- .optional()
198
- .describe("Hard cap. Default 1MB. Files larger than this are truncated with a notice."),
199
- });
200
-
201
- const DEFAULT_MAX_BYTES = 1_000_000;
202
-
203
- export const ingestDocument: RegisteredTool = buildTool({
204
- name: "IngestDocument",
205
- description:
206
- "Read a file inside the workspace and return its content with structured metadata. Paths escaping the workspace root are rejected. Supports plain text, CSV/TSV, JSON, YAML out of the box; PDF/docx/xlsx need an operator-registered parser.",
207
- inputSchema,
208
- readOnly: true,
209
- destructive: false,
210
- execute: async (input) => {
211
- const abs = resolveSafe("IngestDocument", input.path);
212
- if (!existsSync(abs)) {
213
- throw new DocumentIngestError(`file not found: ${abs}`);
214
- }
215
- const stat = statSync(abs);
216
- if (!stat.isFile()) {
217
- throw new DocumentIngestError(`not a regular file: ${abs}`);
218
- }
219
- const ext = extname(abs).toLowerCase();
220
- const maxBytes = input.maxBytes ?? DEFAULT_MAX_BYTES;
221
-
222
- // Operator-registered parser takes priority over built-in handling.
223
- const customParser = customParsers.get(ext);
224
- if (customParser !== undefined) {
225
- const result = await customParser(abs);
226
- return renderResult({
227
- path: abs,
228
- content: result.content,
229
- metadata: { ext, size: stat.size, ...result.metadata },
230
- maxBytes,
231
- });
232
- }
233
-
234
- if (STUB_EXTENSIONS.has(ext)) {
235
- throw new DocumentIngestError(
236
- `extension "${ext}" needs a parser registered via registerDocumentParser(). See @crewhaus/tool-document-ingest README for the pdf-parse / mammoth / xlsx setup.`,
237
- );
238
- }
239
-
240
- if (
241
- TEXT_EXTENSIONS.has(ext) ||
242
- TABULAR_EXTENSIONS.has(ext) ||
243
- STRUCTURED_EXTENSIONS.has(ext) ||
244
- ext === ""
245
- ) {
246
- const raw = readTextNoFollow(abs);
247
- const metadata: Record<string, unknown> = {
248
- ext: ext || "(none)",
249
- size: stat.size,
250
- lines: countLines(raw),
251
- };
252
- if (TABULAR_EXTENSIONS.has(ext)) {
253
- const delim = ext === ".tsv" ? "\t" : ",";
254
- const lines = raw.split("\n").filter((l) => l.length > 0);
255
- metadata["rows"] = lines.length;
256
- metadata["columns"] = (lines[0] ?? "").split(delim).length;
257
- }
258
- if (STRUCTURED_EXTENSIONS.has(ext)) {
259
- if (ext === ".json") {
260
- try {
261
- JSON.parse(raw);
262
- metadata["valid_json"] = true;
263
- } catch (err) {
264
- metadata["valid_json"] = false;
265
- metadata["parse_error"] = (err as Error).message.slice(0, 200);
266
- }
267
- }
268
- }
269
- return renderResult({ path: abs, content: raw, metadata, maxBytes });
270
- }
271
-
272
- throw new DocumentIngestError(
273
- `extension "${ext}" is not handled by built-in ingest. Register a parser via registerDocumentParser("${ext}", …), or rename to .txt/.md if it's plain text.`,
274
- );
275
- },
276
- });
277
-
278
- function renderResult(args: {
279
- readonly path: string;
280
- readonly content: string;
281
- readonly metadata: Record<string, unknown>;
282
- readonly maxBytes: number;
283
- }): string {
284
- let content = args.content;
285
- let truncated = false;
286
- if (Buffer.byteLength(content, "utf8") > args.maxBytes) {
287
- content = Buffer.from(content, "utf-8").subarray(0, args.maxBytes).toString("utf-8");
288
- truncated = true;
289
- }
290
- const lines: string[] = [
291
- `<document path="${args.path}" name="${basename(args.path)}">`,
292
- `metadata: ${JSON.stringify(args.metadata)}${truncated ? ` (TRUNCATED to ${args.maxBytes} bytes)` : ""}`,
293
- "---",
294
- content,
295
- "</document>",
296
- ];
297
- return lines.join("\n");
298
- }
299
-
300
- function countLines(s: string): number {
301
- if (s.length === 0) return 0;
302
- let n = 1;
303
- for (const ch of s) {
304
- if (ch === "\n") n++;
305
- }
306
- // A trailing newline is a line terminator, not a 4th empty line.
307
- if (s.endsWith("\n")) n--;
308
- return n;
309
- }