@crewhaus/tool-document-ingest 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +8 -13
- package/src/index.test.ts +77 -7
- package/src/index.ts +116 -12
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@crewhaus/tool-document-ingest",
|
|
3
|
-
"version": "0.1.1",
|
|
3
|
+
"version": "0.1.3",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "M4.3 — IngestDocument tool. Reads .txt/.md/.csv/.json/.yaml inline; PDF/docx/xlsx via pluggable parsers (deferred to operator-provided handlers).",
|
|
6
6
|
"main": "src/index.ts",
|
|
@@ -12,16 +12,16 @@
|
|
|
12
12
|
"test": "bun test src"
|
|
13
13
|
},
|
|
14
14
|
"dependencies": {
|
|
15
|
-
"@crewhaus/errors": "0.1.
|
|
16
|
-
"@crewhaus/tool-builder": "0.1.
|
|
17
|
-
"@crewhaus/tool-catalog": "0.1.
|
|
15
|
+
"@crewhaus/errors": "0.1.3",
|
|
16
|
+
"@crewhaus/tool-builder": "0.1.3",
|
|
17
|
+
"@crewhaus/tool-catalog": "0.1.3",
|
|
18
18
|
"zod": "^3.23.8"
|
|
19
19
|
},
|
|
20
20
|
"license": "Apache-2.0",
|
|
21
21
|
"author": {
|
|
22
22
|
"name": "Max Meier",
|
|
23
|
-
"email": "max@
|
|
24
|
-
"url": "https://
|
|
23
|
+
"email": "max@crewhaus.ai",
|
|
24
|
+
"url": "https://crewhaus.ai"
|
|
25
25
|
},
|
|
26
26
|
"repository": {
|
|
27
27
|
"type": "git",
|
|
@@ -33,12 +33,7 @@
|
|
|
33
33
|
"url": "https://github.com/crewhaus/factory/issues"
|
|
34
34
|
},
|
|
35
35
|
"publishConfig": {
|
|
36
|
-
"access": "
|
|
36
|
+
"access": "public"
|
|
37
37
|
},
|
|
38
|
-
"files": [
|
|
39
|
-
"src",
|
|
40
|
-
"README.md",
|
|
41
|
-
"LICENSE",
|
|
42
|
-
"NOTICE"
|
|
43
|
-
]
|
|
38
|
+
"files": ["src", "README.md", "LICENSE", "NOTICE"]
|
|
44
39
|
}
|
package/src/index.test.ts
CHANGED
|
@@ -1,34 +1,48 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Tests for tool-document-ingest. Built-in handling for plain text /
|
|
3
|
-
* structured / tabular formats,
|
|
3
|
+
* structured / tabular formats, operator-registered parsers, and
|
|
4
|
+
* workspace path containment (mirrors tool-fs's traversal-rejection
|
|
5
|
+
* cases).
|
|
6
|
+
*
|
|
7
|
+
* The harness chdirs into a temp workspace because IngestDocument is
|
|
8
|
+
* sandboxed to `process.cwd()` — tests address files by workspace-
|
|
9
|
+
* relative path.
|
|
4
10
|
*/
|
|
5
11
|
import { afterEach, beforeEach, describe, expect, test } from "bun:test";
|
|
6
|
-
import { mkdtempSync, rmSync, writeFileSync } from "node:fs";
|
|
12
|
+
import { mkdirSync, mkdtempSync, realpathSync, rmSync, symlinkSync, writeFileSync } from "node:fs";
|
|
7
13
|
import { tmpdir } from "node:os";
|
|
8
14
|
import { join } from "node:path";
|
|
15
|
+
import { CrewhausError } from "@crewhaus/errors";
|
|
9
16
|
import {
|
|
10
17
|
DocumentIngestError,
|
|
18
|
+
ToolPermissionError,
|
|
11
19
|
clearDocumentParsers,
|
|
12
20
|
ingestDocument,
|
|
13
21
|
registerDocumentParser,
|
|
14
22
|
} from "./index";
|
|
15
23
|
|
|
16
24
|
let tmp: string;
|
|
25
|
+
let originalCwd: string;
|
|
17
26
|
|
|
18
27
|
beforeEach(() => {
|
|
19
|
-
|
|
28
|
+
originalCwd = process.cwd();
|
|
29
|
+
// realpath so absolute-path-inside-workspace assertions hold on macOS,
|
|
30
|
+
// where tmpdir() lives behind the /var → /private/var symlink.
|
|
31
|
+
tmp = realpathSync(mkdtempSync(join(tmpdir(), "doc-ingest-")));
|
|
32
|
+
process.chdir(tmp);
|
|
20
33
|
clearDocumentParsers();
|
|
21
34
|
});
|
|
22
35
|
|
|
23
36
|
afterEach(() => {
|
|
37
|
+
process.chdir(originalCwd);
|
|
24
38
|
rmSync(tmp, { recursive: true, force: true });
|
|
25
39
|
clearDocumentParsers();
|
|
26
40
|
});
|
|
27
41
|
|
|
42
|
+
/** Write `content` into the temp workspace; returns the workspace-relative path. */
|
|
28
43
|
function writeFile(name: string, content: string): string {
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
return path;
|
|
44
|
+
writeFileSync(join(tmp, name), content);
|
|
45
|
+
return name;
|
|
32
46
|
}
|
|
33
47
|
|
|
34
48
|
describe("ingestDocument — basics", () => {
|
|
@@ -39,7 +53,7 @@ describe("ingestDocument — basics", () => {
|
|
|
39
53
|
});
|
|
40
54
|
|
|
41
55
|
test("throws when file does not exist", async () => {
|
|
42
|
-
await expect(ingestDocument.execute({ path: "
|
|
56
|
+
await expect(ingestDocument.execute({ path: "does/not/exist.txt" })).rejects.toThrow(
|
|
43
57
|
DocumentIngestError,
|
|
44
58
|
);
|
|
45
59
|
});
|
|
@@ -61,6 +75,62 @@ describe("ingestDocument — basics", () => {
|
|
|
61
75
|
});
|
|
62
76
|
});
|
|
63
77
|
|
|
78
|
+
describe("ingestDocument — path containment", () => {
|
|
79
|
+
test("ToolPermissionError is a CrewhausError with code 'tool'", () => {
|
|
80
|
+
const err = new ToolPermissionError("IngestDocument", "../../escape");
|
|
81
|
+
expect(err).toBeInstanceOf(CrewhausError);
|
|
82
|
+
expect(err.code).toBe("tool");
|
|
83
|
+
expect(err.toolName).toBe("IngestDocument");
|
|
84
|
+
expect(err.path).toBe("../../escape");
|
|
85
|
+
expect(err.message).toContain("escapes the workspace root");
|
|
86
|
+
});
|
|
87
|
+
|
|
88
|
+
test("rejects parent-directory traversal", async () => {
|
|
89
|
+
await expect(ingestDocument.execute({ path: "../../../etc/passwd" })).rejects.toBeInstanceOf(
|
|
90
|
+
ToolPermissionError,
|
|
91
|
+
);
|
|
92
|
+
});
|
|
93
|
+
|
|
94
|
+
test("rejects absolute path outside workspace", async () => {
|
|
95
|
+
await expect(ingestDocument.execute({ path: "/etc/passwd" })).rejects.toBeInstanceOf(
|
|
96
|
+
ToolPermissionError,
|
|
97
|
+
);
|
|
98
|
+
});
|
|
99
|
+
|
|
100
|
+
test("rejects subdir-then-traversal", async () => {
|
|
101
|
+
mkdirSync(join(tmp, "sub"));
|
|
102
|
+
await expect(ingestDocument.execute({ path: "sub/../../escape.txt" })).rejects.toBeInstanceOf(
|
|
103
|
+
ToolPermissionError,
|
|
104
|
+
);
|
|
105
|
+
});
|
|
106
|
+
|
|
107
|
+
test("rejects an in-root symlink whose target escapes the workspace", async () => {
|
|
108
|
+
const outside = mkdtempSync(join(tmpdir(), "doc-ingest-outside-"));
|
|
109
|
+
try {
|
|
110
|
+
writeFileSync(join(outside, "secret.txt"), "top secret");
|
|
111
|
+
symlinkSync(join(outside, "secret.txt"), join(tmp, "link.txt"));
|
|
112
|
+
await expect(ingestDocument.execute({ path: "link.txt" })).rejects.toBeInstanceOf(
|
|
113
|
+
ToolPermissionError,
|
|
114
|
+
);
|
|
115
|
+
} finally {
|
|
116
|
+
rmSync(outside, { recursive: true, force: true });
|
|
117
|
+
}
|
|
118
|
+
});
|
|
119
|
+
|
|
120
|
+
test("allows an absolute path inside the workspace", async () => {
|
|
121
|
+
writeFile("inside.txt", "in-root content");
|
|
122
|
+
const result = await ingestDocument.execute({ path: join(tmp, "inside.txt") });
|
|
123
|
+
expect(result).toContain("in-root content");
|
|
124
|
+
});
|
|
125
|
+
|
|
126
|
+
test("allows an in-root symlink to an in-root file (no over-blocking)", async () => {
|
|
127
|
+
writeFile("real.txt", "linked content");
|
|
128
|
+
symlinkSync(join(tmp, "real.txt"), join(tmp, "good-link.txt"));
|
|
129
|
+
const result = await ingestDocument.execute({ path: "good-link.txt" });
|
|
130
|
+
expect(result).toContain("linked content");
|
|
131
|
+
});
|
|
132
|
+
});
|
|
133
|
+
|
|
64
134
|
describe("ingestDocument — tabular formats", () => {
|
|
65
135
|
test("counts rows + columns in CSV", async () => {
|
|
66
136
|
const path = writeFile("data.csv", "a,b,c\n1,2,3\n4,5,6\n");
|
package/src/index.ts
CHANGED
|
@@ -20,15 +20,27 @@
|
|
|
20
20
|
* can register their own parser via `registerDocumentParser`; the
|
|
21
21
|
* tool's contract stays the same.
|
|
22
22
|
*
|
|
23
|
-
* Security note: the path is
|
|
24
|
-
*
|
|
25
|
-
*
|
|
26
|
-
*
|
|
27
|
-
*
|
|
28
|
-
*
|
|
23
|
+
* Security note: the path is user-controlled (the model supplies it, and
|
|
24
|
+
* the model may be steered by injected content). Two defenses apply:
|
|
25
|
+
* 1. Containment — the path is resolved against `process.cwd()` and
|
|
26
|
+
* rejected if it escapes the workspace root, lexically (`..` or
|
|
27
|
+
* absolute escapes) or via an in-root symlink whose real target
|
|
28
|
+
* lies outside (CWE-59). See `resolveSafe` below.
|
|
29
|
+
* 2. Output classification — the runtime classifies the OUTPUT via
|
|
30
|
+
* boundary-classifier with the existing `tool` origin (Pillar 3);
|
|
31
|
+
* file contents may contain anything (e.g. a prompt-injecting PDF).
|
|
29
32
|
*/
|
|
30
|
-
import {
|
|
31
|
-
|
|
33
|
+
import {
|
|
34
|
+
closeSync,
|
|
35
|
+
existsSync,
|
|
36
|
+
constants as fsConstants,
|
|
37
|
+
fstatSync,
|
|
38
|
+
openSync,
|
|
39
|
+
readSync,
|
|
40
|
+
realpathSync,
|
|
41
|
+
statSync,
|
|
42
|
+
} from "node:fs";
|
|
43
|
+
import { basename, dirname, extname, join, resolve, sep } from "node:path";
|
|
32
44
|
import { CrewhausError } from "@crewhaus/errors";
|
|
33
45
|
import { buildTool } from "@crewhaus/tool-builder";
|
|
34
46
|
import type { RegisteredTool } from "@crewhaus/tool-catalog";
|
|
@@ -49,6 +61,95 @@ export class DocumentIngestError extends CrewhausError {
|
|
|
49
61
|
}
|
|
50
62
|
}
|
|
51
63
|
|
|
64
|
+
export class ToolPermissionError extends CrewhausError {
|
|
65
|
+
override readonly name = "ToolPermissionError";
|
|
66
|
+
readonly toolName: string;
|
|
67
|
+
readonly path: string;
|
|
68
|
+
|
|
69
|
+
constructor(toolName: string, attemptedPath: string) {
|
|
70
|
+
super(
|
|
71
|
+
"tool",
|
|
72
|
+
`tool "${toolName}" rejected path "${attemptedPath}": resolved location escapes the workspace root`,
|
|
73
|
+
);
|
|
74
|
+
this.toolName = toolName;
|
|
75
|
+
this.path = attemptedPath;
|
|
76
|
+
}
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
/**
|
|
80
|
+
* Resolve `rel` against the workspace root and reject anything that escapes.
|
|
81
|
+
* Mirrors `tool-fs`'s `resolveSafe` — duplicated here (like `tool-image`)
|
|
82
|
+
* rather than extracted to a shared package; keep the copies in sync.
|
|
83
|
+
*/
|
|
84
|
+
function resolveSafe(toolName: string, rel: string, root: string = process.cwd()): string {
|
|
85
|
+
const rootResolved = resolve(root);
|
|
86
|
+
const abs = resolve(rootResolved, rel);
|
|
87
|
+
// 1) Lexical containment — fast path; rejects `..` and absolute escapes.
|
|
88
|
+
// The trailing `sep` avoids the `/root` vs `/root-sibling` pitfall.
|
|
89
|
+
if (abs !== rootResolved && !abs.startsWith(`${rootResolved}${sep}`)) {
|
|
90
|
+
throw new ToolPermissionError(toolName, rel);
|
|
91
|
+
}
|
|
92
|
+
// 2) Symlink-aware containment (CWE-59). The lexical check above is fooled
|
|
93
|
+
// by an in-root symlink that points outside the workspace, so re-check
|
|
94
|
+
// the REAL path. The leaf may not exist (the file-not-found error comes
|
|
95
|
+
// after containment so escaping paths never leak existence info), so
|
|
96
|
+
// resolve the deepest existing ancestor and re-append the missing tail.
|
|
97
|
+
// Fails closed if realpath errors for any reason other than the walk.
|
|
98
|
+
let real: string;
|
|
99
|
+
try {
|
|
100
|
+
const rootReal = realpathSync(rootResolved);
|
|
101
|
+
let probe = abs;
|
|
102
|
+
const tail: string[] = [];
|
|
103
|
+
while (!existsSync(probe)) {
|
|
104
|
+
tail.unshift(basename(probe));
|
|
105
|
+
const parent = dirname(probe);
|
|
106
|
+
if (parent === probe) break; // reached the filesystem root
|
|
107
|
+
probe = parent;
|
|
108
|
+
}
|
|
109
|
+
real = tail.length > 0 ? join(realpathSync(probe), ...tail) : realpathSync(probe);
|
|
110
|
+
if (real !== rootReal && !real.startsWith(`${rootReal}${sep}`)) {
|
|
111
|
+
throw new ToolPermissionError(toolName, rel);
|
|
112
|
+
}
|
|
113
|
+
} catch (err) {
|
|
114
|
+
if (err instanceof ToolPermissionError) throw err;
|
|
115
|
+
throw new ToolPermissionError(toolName, rel);
|
|
116
|
+
}
|
|
117
|
+
// Return the validated REAL path; the read below opens it with O_NOFOLLOW so a
|
|
118
|
+
// leaf swapped to a symlink after this check (TOCTOU/CWE-367) is rejected.
|
|
119
|
+
return real;
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
/**
|
|
123
|
+
* Read a resolveSafe-validated text file with O_NOFOLLOW so a leaf swapped to a
|
|
124
|
+
* symlink after the containment check is rejected rather than followed out of
|
|
125
|
+
* the workspace. (Custom parsers read the resolved realpath themselves and are
|
|
126
|
+
* out of this guard's scope.)
|
|
127
|
+
*/
|
|
128
|
+
function readTextNoFollow(absPath: string): string {
|
|
129
|
+
let fd: number;
|
|
130
|
+
try {
|
|
131
|
+
fd = openSync(absPath, fsConstants.O_RDONLY | fsConstants.O_NOFOLLOW);
|
|
132
|
+
} catch (err) {
|
|
133
|
+
if ((err as NodeJS.ErrnoException).code === "ELOOP") {
|
|
134
|
+
throw new ToolPermissionError("IngestDocument", absPath);
|
|
135
|
+
}
|
|
136
|
+
throw err;
|
|
137
|
+
}
|
|
138
|
+
try {
|
|
139
|
+
const { size } = fstatSync(fd);
|
|
140
|
+
const b = Buffer.allocUnsafe(size);
|
|
141
|
+
let offset = 0;
|
|
142
|
+
while (offset < size) {
|
|
143
|
+
const n = readSync(fd, b, offset, size - offset, offset);
|
|
144
|
+
if (n === 0) break;
|
|
145
|
+
offset += n;
|
|
146
|
+
}
|
|
147
|
+
return b.toString("utf-8", 0, offset);
|
|
148
|
+
} finally {
|
|
149
|
+
closeSync(fd);
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
|
|
52
153
|
const TEXT_EXTENSIONS = new Set([".txt", ".md", ".mdx", ".log", ".out", ".rst"]);
|
|
53
154
|
const TABULAR_EXTENSIONS = new Set([".csv", ".tsv"]);
|
|
54
155
|
const STRUCTURED_EXTENSIONS = new Set([".json", ".yaml", ".yml"]);
|
|
@@ -84,7 +185,10 @@ export function clearDocumentParsers(): void {
|
|
|
84
185
|
}
|
|
85
186
|
|
|
86
187
|
const inputSchema = z.object({
|
|
87
|
-
path: z
|
|
188
|
+
path: z
|
|
189
|
+
.string()
|
|
190
|
+
.min(1)
|
|
191
|
+
.describe("Workspace-relative path to the file. Paths escaping the workspace are rejected."),
|
|
88
192
|
maxBytes: z
|
|
89
193
|
.number()
|
|
90
194
|
.int()
|
|
@@ -99,12 +203,12 @@ const DEFAULT_MAX_BYTES = 1_000_000;
|
|
|
99
203
|
export const ingestDocument: RegisteredTool = buildTool({
|
|
100
204
|
name: "IngestDocument",
|
|
101
205
|
description:
|
|
102
|
-
"Read a file
|
|
206
|
+
"Read a file inside the workspace and return its content with structured metadata. Paths escaping the workspace root are rejected. Supports plain text, CSV/TSV, JSON, YAML out of the box; PDF/docx/xlsx need an operator-registered parser.",
|
|
103
207
|
inputSchema,
|
|
104
208
|
readOnly: true,
|
|
105
209
|
destructive: false,
|
|
106
210
|
execute: async (input) => {
|
|
107
|
-
const abs =
|
|
211
|
+
const abs = resolveSafe("IngestDocument", input.path);
|
|
108
212
|
if (!existsSync(abs)) {
|
|
109
213
|
throw new DocumentIngestError(`file not found: ${abs}`);
|
|
110
214
|
}
|
|
@@ -139,7 +243,7 @@ export const ingestDocument: RegisteredTool = buildTool({
|
|
|
139
243
|
STRUCTURED_EXTENSIONS.has(ext) ||
|
|
140
244
|
ext === ""
|
|
141
245
|
) {
|
|
142
|
-
const raw =
|
|
246
|
+
const raw = readTextNoFollow(abs);
|
|
143
247
|
const metadata: Record<string, unknown> = {
|
|
144
248
|
ext: ext || "(none)",
|
|
145
249
|
size: stat.size,
|