@crewhaus/tool-document-ingest 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@crewhaus/tool-document-ingest",
3
- "version": "0.1.1",
3
+ "version": "0.1.2",
4
4
  "type": "module",
5
5
  "description": "M4.3 — IngestDocument tool. Reads .txt/.md/.csv/.json/.yaml inline; PDF/docx/xlsx via pluggable parsers (deferred to operator-provided handlers).",
6
6
  "main": "src/index.ts",
@@ -12,16 +12,16 @@
12
12
  "test": "bun test src"
13
13
  },
14
14
  "dependencies": {
15
- "@crewhaus/errors": "0.1.1",
16
- "@crewhaus/tool-builder": "0.1.1",
17
- "@crewhaus/tool-catalog": "0.1.1",
15
+ "@crewhaus/errors": "0.1.2",
16
+ "@crewhaus/tool-builder": "0.1.2",
17
+ "@crewhaus/tool-catalog": "0.1.2",
18
18
  "zod": "^3.23.8"
19
19
  },
20
20
  "license": "Apache-2.0",
21
21
  "author": {
22
22
  "name": "Max Meier",
23
- "email": "max@studiomax.io",
24
- "url": "https://studiomax.io"
23
+ "email": "max@crewhaus.ai",
24
+ "url": "https://crewhaus.ai"
25
25
  },
26
26
  "repository": {
27
27
  "type": "git",
@@ -33,12 +33,7 @@
33
33
  "url": "https://github.com/crewhaus/factory/issues"
34
34
  },
35
35
  "publishConfig": {
36
- "access": "restricted"
36
+ "access": "public"
37
37
  },
38
- "files": [
39
- "src",
40
- "README.md",
41
- "LICENSE",
42
- "NOTICE"
43
- ]
38
+ "files": ["src", "README.md", "LICENSE", "NOTICE"]
44
39
  }
package/src/index.test.ts CHANGED
@@ -1,34 +1,48 @@
1
1
  /**
2
2
  * Tests for tool-document-ingest. Built-in handling for plain text /
3
- * structured / tabular formats, plus operator-registered parsers.
3
+ * structured / tabular formats, operator-registered parsers, and
4
+ * workspace path containment (mirrors tool-fs's traversal-rejection
5
+ * cases).
6
+ *
7
+ * The harness chdirs into a temp workspace because IngestDocument is
8
+ * sandboxed to `process.cwd()` — tests address files by workspace-
9
+ * relative path.
4
10
  */
5
11
  import { afterEach, beforeEach, describe, expect, test } from "bun:test";
6
- import { mkdtempSync, rmSync, writeFileSync } from "node:fs";
12
+ import { mkdirSync, mkdtempSync, realpathSync, rmSync, symlinkSync, writeFileSync } from "node:fs";
7
13
  import { tmpdir } from "node:os";
8
14
  import { join } from "node:path";
15
+ import { CrewhausError } from "@crewhaus/errors";
9
16
  import {
10
17
  DocumentIngestError,
18
+ ToolPermissionError,
11
19
  clearDocumentParsers,
12
20
  ingestDocument,
13
21
  registerDocumentParser,
14
22
  } from "./index";
15
23
 
16
24
  let tmp: string;
25
+ let originalCwd: string;
17
26
 
18
27
  beforeEach(() => {
19
- tmp = mkdtempSync(join(tmpdir(), "doc-ingest-"));
28
+ originalCwd = process.cwd();
29
+ // realpath so absolute-path-inside-workspace assertions hold on macOS,
30
+ // where tmpdir() lives behind the /var → /private/var symlink.
31
+ tmp = realpathSync(mkdtempSync(join(tmpdir(), "doc-ingest-")));
32
+ process.chdir(tmp);
20
33
  clearDocumentParsers();
21
34
  });
22
35
 
23
36
  afterEach(() => {
37
+ process.chdir(originalCwd);
24
38
  rmSync(tmp, { recursive: true, force: true });
25
39
  clearDocumentParsers();
26
40
  });
27
41
 
42
+ /** Write `content` into the temp workspace; returns the workspace-relative path. */
28
43
  function writeFile(name: string, content: string): string {
29
- const path = join(tmp, name);
30
- writeFileSync(path, content);
31
- return path;
44
+ writeFileSync(join(tmp, name), content);
45
+ return name;
32
46
  }
33
47
 
34
48
  describe("ingestDocument — basics", () => {
@@ -39,7 +53,7 @@ describe("ingestDocument — basics", () => {
39
53
  });
40
54
 
41
55
  test("throws when file does not exist", async () => {
42
- await expect(ingestDocument.execute({ path: "/does/not/exist.txt" })).rejects.toThrow(
56
+ await expect(ingestDocument.execute({ path: "does/not/exist.txt" })).rejects.toThrow(
43
57
  DocumentIngestError,
44
58
  );
45
59
  });
@@ -61,6 +75,62 @@ describe("ingestDocument — basics", () => {
61
75
  });
62
76
  });
63
77
 
78
+ describe("ingestDocument — path containment", () => {
79
+ test("ToolPermissionError is a CrewhausError with code 'tool'", () => {
80
+ const err = new ToolPermissionError("IngestDocument", "../../escape");
81
+ expect(err).toBeInstanceOf(CrewhausError);
82
+ expect(err.code).toBe("tool");
83
+ expect(err.toolName).toBe("IngestDocument");
84
+ expect(err.path).toBe("../../escape");
85
+ expect(err.message).toContain("escapes the workspace root");
86
+ });
87
+
88
+ test("rejects parent-directory traversal", async () => {
89
+ await expect(ingestDocument.execute({ path: "../../../etc/passwd" })).rejects.toBeInstanceOf(
90
+ ToolPermissionError,
91
+ );
92
+ });
93
+
94
+ test("rejects absolute path outside workspace", async () => {
95
+ await expect(ingestDocument.execute({ path: "/etc/passwd" })).rejects.toBeInstanceOf(
96
+ ToolPermissionError,
97
+ );
98
+ });
99
+
100
+ test("rejects subdir-then-traversal", async () => {
101
+ mkdirSync(join(tmp, "sub"));
102
+ await expect(ingestDocument.execute({ path: "sub/../../escape.txt" })).rejects.toBeInstanceOf(
103
+ ToolPermissionError,
104
+ );
105
+ });
106
+
107
+ test("rejects an in-root symlink whose target escapes the workspace", async () => {
108
+ const outside = mkdtempSync(join(tmpdir(), "doc-ingest-outside-"));
109
+ try {
110
+ writeFileSync(join(outside, "secret.txt"), "top secret");
111
+ symlinkSync(join(outside, "secret.txt"), join(tmp, "link.txt"));
112
+ await expect(ingestDocument.execute({ path: "link.txt" })).rejects.toBeInstanceOf(
113
+ ToolPermissionError,
114
+ );
115
+ } finally {
116
+ rmSync(outside, { recursive: true, force: true });
117
+ }
118
+ });
119
+
120
+ test("allows an absolute path inside the workspace", async () => {
121
+ writeFile("inside.txt", "in-root content");
122
+ const result = await ingestDocument.execute({ path: join(tmp, "inside.txt") });
123
+ expect(result).toContain("in-root content");
124
+ });
125
+
126
+ test("allows an in-root symlink to an in-root file (no over-blocking)", async () => {
127
+ writeFile("real.txt", "linked content");
128
+ symlinkSync(join(tmp, "real.txt"), join(tmp, "good-link.txt"));
129
+ const result = await ingestDocument.execute({ path: "good-link.txt" });
130
+ expect(result).toContain("linked content");
131
+ });
132
+ });
133
+
64
134
  describe("ingestDocument — tabular formats", () => {
65
135
  test("counts rows + columns in CSV", async () => {
66
136
  const path = writeFile("data.csv", "a,b,c\n1,2,3\n4,5,6\n");
package/src/index.ts CHANGED
@@ -20,15 +20,27 @@
20
20
  * can register their own parser via `registerDocumentParser`; the
21
21
  * tool's contract stays the same.
22
22
  *
23
- * Security note: the path is a Bun.file() read of a user-controlled
24
- * string. The runtime should classify the OUTPUT via boundary-classifier
25
- * with origin "user" — files in the user's host are developer-trusted,
26
- * but the contents may contain anything (e.g. a prompt-injecting PDF).
27
- * Pillar 3 compliance is satisfied by the existing `tool` origin
28
- * classifier in runtime-core.
23
+ * Security note: the path is user-controlled (the model supplies it, and
24
+ * the model may be steered by injected content). Two defenses apply:
25
+ * 1. Containment — the path is resolved against `process.cwd()` and
26
+ * rejected if it escapes the workspace root, lexically (`..` or
27
+ * absolute escapes) or via an in-root symlink whose real target
28
+ * lies outside (CWE-59). See `resolveSafe` below.
29
+ * 2. Output classification — the runtime classifies the OUTPUT via
30
+ * boundary-classifier with the existing `tool` origin (Pillar 3);
31
+ * file contents may contain anything (e.g. a prompt-injecting PDF).
29
32
  */
30
- import { existsSync, readFileSync, statSync } from "node:fs";
31
- import { basename, extname, resolve } from "node:path";
33
+ import {
34
+ closeSync,
35
+ existsSync,
36
+ constants as fsConstants,
37
+ fstatSync,
38
+ openSync,
39
+ readSync,
40
+ realpathSync,
41
+ statSync,
42
+ } from "node:fs";
43
+ import { basename, dirname, extname, join, resolve, sep } from "node:path";
32
44
  import { CrewhausError } from "@crewhaus/errors";
33
45
  import { buildTool } from "@crewhaus/tool-builder";
34
46
  import type { RegisteredTool } from "@crewhaus/tool-catalog";
@@ -49,6 +61,95 @@ export class DocumentIngestError extends CrewhausError {
49
61
  }
50
62
  }
51
63
 
64
+ export class ToolPermissionError extends CrewhausError {
65
+ override readonly name = "ToolPermissionError";
66
+ readonly toolName: string;
67
+ readonly path: string;
68
+
69
+ constructor(toolName: string, attemptedPath: string) {
70
+ super(
71
+ "tool",
72
+ `tool "${toolName}" rejected path "${attemptedPath}": resolved location escapes the workspace root`,
73
+ );
74
+ this.toolName = toolName;
75
+ this.path = attemptedPath;
76
+ }
77
+ }
78
+
79
+ /**
80
+ * Resolve `rel` against the workspace root and reject anything that escapes.
81
+ * Mirrors `tool-fs`'s `resolveSafe` — duplicated here (like `tool-image`)
82
+ * rather than extracted to a shared package; keep the copies in sync.
83
+ */
84
+ function resolveSafe(toolName: string, rel: string, root: string = process.cwd()): string {
85
+ const rootResolved = resolve(root);
86
+ const abs = resolve(rootResolved, rel);
87
+ // 1) Lexical containment — fast path; rejects `..` and absolute escapes.
88
+ // The trailing `sep` avoids the `/root` vs `/root-sibling` pitfall.
89
+ if (abs !== rootResolved && !abs.startsWith(`${rootResolved}${sep}`)) {
90
+ throw new ToolPermissionError(toolName, rel);
91
+ }
92
+ // 2) Symlink-aware containment (CWE-59). The lexical check above is fooled
93
+ // by an in-root symlink that points outside the workspace, so re-check
94
+ // the REAL path. The leaf may not exist (the file-not-found error comes
95
+ // after containment so escaping paths never leak existence info), so
96
+ // resolve the deepest existing ancestor and re-append the missing tail.
97
+ // Fails closed if realpath errors for any reason other than the walk.
98
+ let real: string;
99
+ try {
100
+ const rootReal = realpathSync(rootResolved);
101
+ let probe = abs;
102
+ const tail: string[] = [];
103
+ while (!existsSync(probe)) {
104
+ tail.unshift(basename(probe));
105
+ const parent = dirname(probe);
106
+ if (parent === probe) break; // reached the filesystem root
107
+ probe = parent;
108
+ }
109
+ real = tail.length > 0 ? join(realpathSync(probe), ...tail) : realpathSync(probe);
110
+ if (real !== rootReal && !real.startsWith(`${rootReal}${sep}`)) {
111
+ throw new ToolPermissionError(toolName, rel);
112
+ }
113
+ } catch (err) {
114
+ if (err instanceof ToolPermissionError) throw err;
115
+ throw new ToolPermissionError(toolName, rel);
116
+ }
117
+ // Return the validated REAL path; the read below opens it with O_NOFOLLOW so a
118
+ // leaf swapped to a symlink after this check (TOCTOU/CWE-367) is rejected.
119
+ return real;
120
+ }
121
+
122
+ /**
123
+ * Read a resolveSafe-validated text file with O_NOFOLLOW so a leaf swapped to a
124
+ * symlink after the containment check is rejected rather than followed out of
125
+ * the workspace. (Custom parsers read the resolved realpath themselves and are
126
+ * out of this guard's scope.)
127
+ */
128
+ function readTextNoFollow(absPath: string): string {
129
+ let fd: number;
130
+ try {
131
+ fd = openSync(absPath, fsConstants.O_RDONLY | fsConstants.O_NOFOLLOW);
132
+ } catch (err) {
133
+ if ((err as NodeJS.ErrnoException).code === "ELOOP") {
134
+ throw new ToolPermissionError("IngestDocument", absPath);
135
+ }
136
+ throw err;
137
+ }
138
+ try {
139
+ const { size } = fstatSync(fd);
140
+ const b = Buffer.allocUnsafe(size);
141
+ let offset = 0;
142
+ while (offset < size) {
143
+ const n = readSync(fd, b, offset, size - offset, offset);
144
+ if (n === 0) break;
145
+ offset += n;
146
+ }
147
+ return b.toString("utf-8", 0, offset);
148
+ } finally {
149
+ closeSync(fd);
150
+ }
151
+ }
152
+
52
153
  const TEXT_EXTENSIONS = new Set([".txt", ".md", ".mdx", ".log", ".out", ".rst"]);
53
154
  const TABULAR_EXTENSIONS = new Set([".csv", ".tsv"]);
54
155
  const STRUCTURED_EXTENSIONS = new Set([".json", ".yaml", ".yml"]);
@@ -84,7 +185,10 @@ export function clearDocumentParsers(): void {
84
185
  }
85
186
 
86
187
  const inputSchema = z.object({
87
- path: z.string().min(1).describe("Absolute or cwd-relative path to the file."),
188
+ path: z
189
+ .string()
190
+ .min(1)
191
+ .describe("Workspace-relative path to the file. Paths escaping the workspace are rejected."),
88
192
  maxBytes: z
89
193
  .number()
90
194
  .int()
@@ -99,12 +203,12 @@ const DEFAULT_MAX_BYTES = 1_000_000;
99
203
  export const ingestDocument: RegisteredTool = buildTool({
100
204
  name: "IngestDocument",
101
205
  description:
102
- "Read a file from the host filesystem and return its content with structured metadata. Supports plain text, CSV/TSV, JSON, YAML out of the box; PDF/docx/xlsx need an operator-registered parser.",
206
+ "Read a file inside the workspace and return its content with structured metadata. Paths escaping the workspace root are rejected. Supports plain text, CSV/TSV, JSON, YAML out of the box; PDF/docx/xlsx need an operator-registered parser.",
103
207
  inputSchema,
104
208
  readOnly: true,
105
209
  destructive: false,
106
210
  execute: async (input) => {
107
- const abs = resolve(input.path);
211
+ const abs = resolveSafe("IngestDocument", input.path);
108
212
  if (!existsSync(abs)) {
109
213
  throw new DocumentIngestError(`file not found: ${abs}`);
110
214
  }
@@ -139,7 +243,7 @@ export const ingestDocument: RegisteredTool = buildTool({
139
243
  STRUCTURED_EXTENSIONS.has(ext) ||
140
244
  ext === ""
141
245
  ) {
142
- const raw = readFileSync(abs, "utf-8");
246
+ const raw = readTextNoFollow(abs);
143
247
  const metadata: Record<string, unknown> = {
144
248
  ext: ext || "(none)",
145
249
  size: stat.size,