@crewhaus/tool-document-ingest 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +44 -0
- package/src/index.test.ts +147 -0
- package/src/index.ts +205 -0
package/package.json
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@crewhaus/tool-document-ingest",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"type": "module",
|
|
5
|
+
"description": "M4.3 — IngestDocument tool. Reads .txt/.md/.csv/.json/.yaml inline; PDF/docx/xlsx via pluggable parsers (deferred to operator-provided handlers).",
|
|
6
|
+
"main": "src/index.ts",
|
|
7
|
+
"types": "src/index.ts",
|
|
8
|
+
"exports": {
|
|
9
|
+
".": "./src/index.ts"
|
|
10
|
+
},
|
|
11
|
+
"scripts": {
|
|
12
|
+
"test": "bun test src"
|
|
13
|
+
},
|
|
14
|
+
"dependencies": {
|
|
15
|
+
"@crewhaus/errors": "0.0.0",
|
|
16
|
+
"@crewhaus/tool-builder": "0.0.0",
|
|
17
|
+
"@crewhaus/tool-catalog": "0.0.0",
|
|
18
|
+
"zod": "^3.23.8"
|
|
19
|
+
},
|
|
20
|
+
"license": "Apache-2.0",
|
|
21
|
+
"author": {
|
|
22
|
+
"name": "Max Meier",
|
|
23
|
+
"email": "max@studiomax.io",
|
|
24
|
+
"url": "https://studiomax.io"
|
|
25
|
+
},
|
|
26
|
+
"repository": {
|
|
27
|
+
"type": "git",
|
|
28
|
+
"url": "git+https://github.com/crewhaus/factory.git",
|
|
29
|
+
"directory": "packages/tool-document-ingest"
|
|
30
|
+
},
|
|
31
|
+
"homepage": "https://github.com/crewhaus/factory/tree/main/packages/tool-document-ingest#readme",
|
|
32
|
+
"bugs": {
|
|
33
|
+
"url": "https://github.com/crewhaus/factory/issues"
|
|
34
|
+
},
|
|
35
|
+
"publishConfig": {
|
|
36
|
+
"access": "restricted"
|
|
37
|
+
},
|
|
38
|
+
"files": [
|
|
39
|
+
"src",
|
|
40
|
+
"README.md",
|
|
41
|
+
"LICENSE",
|
|
42
|
+
"NOTICE"
|
|
43
|
+
]
|
|
44
|
+
}
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tests for tool-document-ingest. Built-in handling for plain text /
|
|
3
|
+
* structured / tabular formats, plus operator-registered parsers.
|
|
4
|
+
*/
|
|
5
|
+
import { afterEach, beforeEach, describe, expect, test } from "bun:test";
|
|
6
|
+
import { mkdtempSync, rmSync, writeFileSync } from "node:fs";
|
|
7
|
+
import { tmpdir } from "node:os";
|
|
8
|
+
import { join } from "node:path";
|
|
9
|
+
import {
|
|
10
|
+
DocumentIngestError,
|
|
11
|
+
clearDocumentParsers,
|
|
12
|
+
ingestDocument,
|
|
13
|
+
registerDocumentParser,
|
|
14
|
+
} from "./index";
|
|
15
|
+
|
|
16
|
+
// Scratch directory for the test currently running. A fresh one is created
// before each test and deleted afterwards, so files never leak between tests.
let tmp: string;

beforeEach(() => {
  const scratchPrefix = join(tmpdir(), "doc-ingest-");
  tmp = mkdtempSync(scratchPrefix);
  // The parser registry is module-level state; start every test from empty.
  clearDocumentParsers();
});

afterEach(() => {
  // `force` keeps cleanup quiet even if a test already removed the directory.
  rmSync(tmp, { recursive: true, force: true });
  clearDocumentParsers();
});
|
|
27
|
+
|
|
28
|
+
/**
 * Create `name` inside the current scratch directory with the given content.
 *
 * @returns the full path of the file that was written.
 */
function writeFile(name: string, content: string): string {
  const target = join(tmp, name);
  writeFileSync(target, content);
  return target;
}
|
|
33
|
+
|
|
34
|
+
describe("ingestDocument — basics", () => {
  // Static descriptor: the tool's name and its safety flags.
  test("tool flags: read-only, non-destructive, named 'IngestDocument'", () => {
    const { name, readOnly, destructive } = ingestDocument;
    expect(name).toBe("IngestDocument");
    expect(readOnly).toBe(true);
    expect(destructive).toBe(false);
  });

  test("throws when file does not exist", async () => {
    const pending = ingestDocument.execute({ path: "/does/not/exist.txt" });
    await expect(pending).rejects.toThrow(DocumentIngestError);
  });

  // The rendered envelope must carry the opening tag, the body, the closing
  // tag and the metadata line.
  test("ingests a plain .txt file with the document envelope", async () => {
    const file = writeFile("note.txt", "hello\nworld\n");
    const rendered = await ingestDocument.execute({ path: file });
    for (const fragment of ["<document path=", "hello", "</document>", "metadata:"]) {
      expect(rendered).toContain(fragment);
    }
  });

  test("emits line count + ext in metadata for .md", async () => {
    const file = writeFile("doc.md", "# title\n\nbody.\n");
    const rendered = await ingestDocument.execute({ path: file });
    expect(rendered).toMatch(/"ext":"\.md"/);
    // Three lines: the trailing newline terminates the last one.
    expect(rendered).toMatch(/"lines":3/);
  });
});
|
|
63
|
+
|
|
64
|
+
describe("ingestDocument — tabular formats", () => {
  // The header line counts as a row: three non-empty lines, three fields each.
  test("counts rows + columns in CSV", async () => {
    const csvFile = writeFile("data.csv", "a,b,c\n1,2,3\n4,5,6\n");
    const rendered = await ingestDocument.execute({ path: csvFile });
    expect(rendered).toMatch(/"rows":3/);
    expect(rendered).toMatch(/"columns":3/);
  });

  test("uses tab delimiter for .tsv", async () => {
    const tsvFile = writeFile("data.tsv", "a\tb\n1\t2\n");
    const rendered = await ingestDocument.execute({ path: tsvFile });
    expect(rendered).toMatch(/"columns":2/);
  });
});
|
|
78
|
+
|
|
79
|
+
describe("ingestDocument — structured formats", () => {
  test("validates JSON parse for .json files", async () => {
    const jsonFile = writeFile("config.json", '{"k":1}');
    const rendered = await ingestDocument.execute({ path: jsonFile });
    expect(rendered).toMatch(/"valid_json":true/);
  });

  // Broken JSON is reported through metadata; the content is still returned.
  test("flags malformed JSON without throwing", async () => {
    const jsonFile = writeFile("config.json", "{not json}");
    const rendered = await ingestDocument.execute({ path: jsonFile });
    expect(rendered).toMatch(/"valid_json":false/);
    expect(rendered).toContain("parse_error");
  });
});
|
|
93
|
+
|
|
94
|
+
describe("ingestDocument — stubbed extensions", () => {
  test(".pdf raises with a pointer to registerDocumentParser", async () => {
    const pdfFile = writeFile("doc.pdf", "%PDF-1.4 fake content");
    const pending = ingestDocument.execute({ path: pdfFile });
    await expect(pending).rejects.toThrow(/needs a parser registered via registerDocumentParser/);
  });

  // Each stubbed extension must be quoted in its own error message.
  test(".docx and .xlsx similarly throw with extension-specific message", async () => {
    const cases: ReadonlyArray<readonly [string, string, RegExp]> = [
      ["a.docx", "fake docx", /"\.docx"/],
      ["b.xlsx", "fake xlsx", /"\.xlsx"/],
    ];
    for (const [fileName, body, expectedMessage] of cases) {
      const file = writeFile(fileName, body);
      await expect(ingestDocument.execute({ path: file })).rejects.toThrow(expectedMessage);
    }
  });
});
|
|
109
|
+
|
|
110
|
+
describe("ingestDocument — operator-registered parsers", () => {
  // A registered parser wins over the built-in ".pdf" stub error.
  test("registerDocumentParser overrides built-in handling", async () => {
    const fakePdfParser = async (path: string) => ({
      content: `parsed PDF text from ${path}`,
      metadata: { pages: 42 },
    });
    registerDocumentParser(".pdf", fakePdfParser);

    const pdfFile = writeFile("doc.pdf", "fake content");
    const rendered = await ingestDocument.execute({ path: pdfFile });
    expect(rendered).toContain("parsed PDF text from");
    expect(rendered).toMatch(/"pages":42/);
  });

  test("registerDocumentParser rejects extensions without leading dot", () => {
    const registerWithoutDot = () => registerDocumentParser("pdf", async () => ({ content: "" }));
    expect(registerWithoutDot).toThrow(DocumentIngestError);
  });

  // Registered under ".PDF", looked up for a lowercase ".pdf" file.
  test("ext matching is case-insensitive", async () => {
    registerDocumentParser(".PDF", async () => ({ content: "uppercase ext" }));
    const pdfFile = writeFile("doc.pdf", "x");
    const rendered = await ingestDocument.execute({ path: pdfFile });
    expect(rendered).toContain("uppercase ext");
  });
});
|
|
135
|
+
|
|
136
|
+
describe("ingestDocument — size cap", () => {
  test("truncates content above maxBytes with a TRUNCATED notice", async () => {
    const path = writeFile("big.txt", "x".repeat(200));
    const result = await ingestDocument.execute({ path, maxBytes: 50 });
    expect(result).toContain("TRUNCATED to 50 bytes");
  });

  // The schema itself carries no default: `maxBytes` stays undefined when
  // omitted, and the 1MB fallback is applied when the tool executes.
  test("schema leaves maxBytes unset when omitted", () => {
    const parsed = ingestDocument.inputSchema.parse({ path: "x" });
    expect(parsed.maxBytes).toBeUndefined();
  });

  // Exercise the fallback itself: one byte over 1MB must be truncated to
  // exactly 1_000_000 bytes.
  test("default maxBytes is 1MB", async () => {
    const path = writeFile("huge.txt", "x".repeat(1_000_001));
    const result = await ingestDocument.execute({ path });
    expect(result).toContain("TRUNCATED to 1000000 bytes");
  });

  // The cap is inclusive: content of exactly 1MB is returned whole.
  test("content of exactly the default cap is not truncated", async () => {
    const path = writeFile("exact.txt", "x".repeat(1_000_000));
    const result = await ingestDocument.execute({ path });
    expect(result).not.toContain("TRUNCATED");
  });
});
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Catalog R3 — tool-document-ingest. M4.3 of the heavy-hitter plan.
|
|
3
|
+
*
|
|
4
|
+
* `IngestDocument(path)` reads a file from the user's host and returns
|
|
5
|
+
* its content plus structured metadata (extension, byte size, line
|
|
6
|
+
* count, and format-specific fields such as CSV row/column counts).
|
|
7
|
+
*
|
|
8
|
+
* v0 supported formats — handled inline, zero extra deps:
|
|
9
|
+
* - .txt, .md, .mdx — plain UTF-8 text
|
|
10
|
+
* - .csv, .tsv — text plus row and column counts
|
|
11
|
+
* - .json, .yaml, .yml — text; .json is additionally parse-validated (YAML is returned unvalidated)
|
|
12
|
+
* - .log, .out, .rst, extensionless files — plain text
|
|
13
|
+
*
|
|
14
|
+
* Stubbed formats — return a clear "needs operator-registered parser"
|
|
15
|
+
* error pointing at `registerDocumentParser(ext, parser)`:
|
|
16
|
+
* - .pdf, .docx, .doc, .xlsx, .xls, .pptx, .epub
|
|
17
|
+
*
|
|
18
|
+
* Why no pdf-parse / mammoth / xlsx deps in v0: those packages weigh
|
|
19
|
+
* several MB each and have native sub-deps. Operators who need them
|
|
20
|
+
* can register their own parser via `registerDocumentParser`; the
|
|
21
|
+
* tool's contract stays the same.
|
|
22
|
+
*
|
|
23
|
+
* Security note: the path is a node:fs read of a user-controlled
|
|
24
|
+
* string. The runtime should classify the OUTPUT via boundary-classifier
|
|
25
|
+
* with origin "user" — files in the user's host are developer-trusted,
|
|
26
|
+
* but the contents may contain anything (e.g. a prompt-injecting PDF).
|
|
27
|
+
* Pillar 3 compliance is satisfied by the existing `tool` origin
|
|
28
|
+
* classifier in runtime-core.
|
|
29
|
+
*/
|
|
30
|
+
import { existsSync, readFileSync, statSync } from "node:fs";
|
|
31
|
+
import { basename, extname, resolve } from "node:path";
|
|
32
|
+
import { CrewhausError } from "@crewhaus/errors";
|
|
33
|
+
import { buildTool } from "@crewhaus/tool-builder";
|
|
34
|
+
import type { RegisteredTool } from "@crewhaus/tool-catalog";
|
|
35
|
+
import { z } from "zod";
|
|
36
|
+
|
|
37
|
+
/**
 * What a document parser hands back to the ingest tool.
 *
 * `content` is the extracted text. `metadata`, when present, is merged into
 * the metadata line of the rendered document so the model can see it.
 */
export type DocumentParserResult = Readonly<{
  content: string;
  /** Optional structured metadata to surface to the model. */
  metadata?: Record<string, unknown>;
}>;

/**
 * A parser for one file extension. It receives the resolved path of the file
 * and may answer synchronously or with a promise.
 */
export type DocumentParser = (path: string) => DocumentParserResult | Promise<DocumentParserResult>;
|
|
44
|
+
|
|
45
|
+
/**
 * Error type for failures this module reports itself: a missing or
 * non-regular file, an extension with no parser, or an invalid extension
 * passed to `registerDocumentParser`.
 */
export class DocumentIngestError extends CrewhausError {
  override readonly name = "DocumentIngestError";

  /**
   * @param message human-readable description of the failure.
   * @param cause optional underlying error, forwarded to the base class.
   */
  constructor(message: string, cause?: unknown) {
    // NOTE(review): "tool" is presumably the error category expected by
    // CrewhausError — confirm against @crewhaus/errors.
    super("tool", message, cause);
  }
}
|
|
51
|
+
|
|
52
|
+
// Lowercase, dot-prefixed extensions grouped by how the built-in ingest
// treats them.

/** Read as plain UTF-8 text. */
const TEXT_EXTENSIONS = new Set<string>([
  ".txt",
  ".md",
  ".mdx",
  ".log",
  ".out",
  ".rst",
]);

/** Read as text, with row and column counts added to the metadata. */
const TABULAR_EXTENSIONS = new Set<string>([".csv", ".tsv"]);

/** Read as text; `.json` is additionally checked with `JSON.parse`. */
const STRUCTURED_EXTENSIONS = new Set<string>([".json", ".yaml", ".yml"]);

/** Recognised but not parsed built-in: these need an operator-registered parser. */
const STUB_EXTENSIONS = new Set<string>([
  ".pdf",
  ".docx",
  ".doc",
  ".xlsx",
  ".xls",
  ".pptx",
  ".epub",
]);
|
|
56
|
+
|
|
57
|
+
/** Operator-registered parsers, keyed by lowercase dot-prefixed extension. */
const customParsers: Map<string, DocumentParser> = new Map();
|
|
58
|
+
|
|
59
|
+
/**
 * Install `parser` as the handler for files ending in `ext`.
 *
 * A registered parser takes priority over the built-in handling for the same
 * extension, and registering again for an extension replaces the previous
 * parser. Typical use is wiring in a PDF/docx library of the operator's
 * choice:
 *
 *   import pdfParse from "pdf-parse";
 *   registerDocumentParser(".pdf", async (path) => {
 *     const buf = await fs.readFile(path);
 *     const { text, numpages } = await pdfParse(buf);
 *     return { content: text, metadata: { pages: numpages } };
 *   });
 *
 * @param ext extension including the leading "."; stored lowercased, so
 *   matching is case-insensitive.
 * @param parser function that turns a file path into content plus metadata.
 * @throws DocumentIngestError when `ext` does not start with ".".
 */
export function registerDocumentParser(ext: string, parser: DocumentParser): void {
  const hasLeadingDot = ext.startsWith(".");
  if (hasLeadingDot) {
    customParsers.set(ext.toLowerCase(), parser);
    return;
  }
  throw new DocumentIngestError(`extension must start with "." (got "${ext}")`);
}
|
|
78
|
+
|
|
79
|
+
/**
 * Remove every operator-registered parser, restoring built-in behaviour for
 * all extensions. Intended for test setup and teardown.
 */
export function clearDocumentParsers(): void {
  customParsers.clear();
}
|
|
85
|
+
|
|
86
|
+
/** Byte cap applied at execution time when the caller does not pass `maxBytes`. */
const DEFAULT_MAX_BYTES = 1_000_000;

// Location of the file to ingest; must be a non-empty string.
const pathField = z.string().min(1).describe("Absolute or cwd-relative path to the file.");

// Optional positive integer cap, at most 10,000,000 bytes. The schema leaves
// it undefined when omitted.
const maxBytesField = z
  .number()
  .int()
  .positive()
  .max(10_000_000)
  .optional()
  .describe("Hard cap. Default 1MB. Files larger than this are truncated with a notice.");

/** Input accepted by the IngestDocument tool. */
const inputSchema = z.object({
  path: pathField,
  maxBytes: maxBytesField,
});
|
|
98
|
+
|
|
99
|
+
/**
 * The `IngestDocument` tool: reads one file from the host filesystem and
 * returns its content wrapped in a `<document>` envelope with a metadata line.
 *
 * Resolution order for a given file extension (compared lowercased):
 *   1. an operator-registered parser, if one exists for the extension;
 *   2. stubbed binary formats (.pdf, .docx, ...) fail with a pointer to
 *      `registerDocumentParser`;
 *   3. built-in text / tabular / structured extensions, plus files with no
 *      extension, are read as UTF-8 text;
 *   4. anything else fails with an "is not handled" error.
 *
 * Built-in metadata always has `ext`, `size` and `lines`. CSV/TSV add `rows`
 * and `columns`; `.json` adds `valid_json` (and `parse_error` when invalid).
 * YAML is returned without validation.
 *
 * Content longer than `maxBytes` (default 1MB) is truncated by `renderResult`.
 *
 * @throws DocumentIngestError when the file is missing, is not a regular
 *   file, cannot be stat'ed or read, or has an extension with no handler.
 *   Errors thrown by an operator-registered parser propagate unchanged.
 */
export const ingestDocument: RegisteredTool = buildTool({
  name: "IngestDocument",
  description:
    "Read a file from the host filesystem and return its content with structured metadata. Supports plain text, CSV/TSV, JSON, YAML out of the box; PDF/docx/xlsx need an operator-registered parser.",
  inputSchema,
  readOnly: true,
  destructive: false,
  execute: async (input) => {
    const abs = resolve(input.path);
    if (!existsSync(abs)) {
      throw new DocumentIngestError(`file not found: ${abs}`);
    }
    const stat = statFileOrThrow(abs);
    if (!stat.isFile()) {
      throw new DocumentIngestError(`not a regular file: ${abs}`);
    }
    const ext = extname(abs).toLowerCase();
    const maxBytes = input.maxBytes ?? DEFAULT_MAX_BYTES;

    // Operator-registered parser takes priority over built-in handling.
    const customParser = customParsers.get(ext);
    if (customParser !== undefined) {
      const result = await customParser(abs);
      return renderResult({
        path: abs,
        content: result.content,
        // Parser metadata is spread last, so it may override `ext` / `size`.
        metadata: { ext, size: stat.size, ...result.metadata },
        maxBytes,
      });
    }

    if (STUB_EXTENSIONS.has(ext)) {
      throw new DocumentIngestError(
        `extension "${ext}" needs a parser registered via registerDocumentParser(). See @crewhaus/tool-document-ingest README for the pdf-parse / mammoth / xlsx setup.`,
      );
    }

    const isTabular = TABULAR_EXTENSIONS.has(ext);
    const isBuiltin =
      TEXT_EXTENSIONS.has(ext) || isTabular || STRUCTURED_EXTENSIONS.has(ext) || ext === "";
    if (!isBuiltin) {
      throw new DocumentIngestError(
        `extension "${ext}" is not handled by built-in ingest. Register a parser via registerDocumentParser("${ext}", …), or rename to .txt/.md if it's plain text.`,
      );
    }

    const raw = readUtf8OrThrow(abs);
    const metadata: Record<string, unknown> = {
      ext: ext || "(none)",
      size: stat.size,
      lines: countLines(raw),
    };
    if (isTabular) {
      Object.assign(metadata, tabularMetadata(raw, ext));
    }
    if (ext === ".json") {
      Object.assign(metadata, jsonMetadata(raw));
    }
    return renderResult({ path: abs, content: raw, metadata, maxBytes });
  },
});

/**
 * Stat `abs`, converting any filesystem failure (permissions, or the file
 * vanishing after the existence check) into a DocumentIngestError.
 */
function statFileOrThrow(abs: string) {
  try {
    return statSync(abs);
  } catch (err) {
    throw new DocumentIngestError(`cannot stat file: ${abs}`, err);
  }
}

/** Read `abs` as UTF-8 text, converting read failures into a DocumentIngestError. */
function readUtf8OrThrow(abs: string): string {
  try {
    return readFileSync(abs, "utf-8");
  } catch (err) {
    throw new DocumentIngestError(`cannot read file: ${abs}`, err);
  }
}

/**
 * Row and column counts for delimited text. Rows are the non-empty lines
 * (LF or CRLF terminated); columns come from a plain split of the first row,
 * so quoted delimiters are not understood. An empty file has 0 columns.
 */
function tabularMetadata(raw: string, ext: string): { rows: number; columns: number } {
  const delimiter = ext === ".tsv" ? "\t" : ",";
  const rows = raw.split(/\r?\n/).filter((line) => line.length > 0);
  const header = rows[0];
  return {
    rows: rows.length,
    columns: header === undefined ? 0 : header.split(delimiter).length,
  };
}

/**
 * Report whether `raw` parses as JSON. A leading UTF-8 byte order mark is
 * ignored for the check, because `JSON.parse` rejects it even though the
 * document behind it is valid.
 */
function jsonMetadata(raw: string): Record<string, unknown> {
  const text = raw.charCodeAt(0) === 0xfeff ? raw.slice(1) : raw;
  try {
    JSON.parse(text);
    return { valid_json: true };
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    // Cap the message so a pathological parse error cannot bloat the metadata.
    return { valid_json: false, parse_error: message.slice(0, 200) };
  }
}
|
|
173
|
+
|
|
174
|
+
/**
 * Wrap document content and metadata in the `<document>` envelope returned
 * to the model.
 *
 * Output lines, joined with "\n":
 *   1. `<document path="..." name="...">` (name is the basename of the path)
 *   2. `metadata: <JSON>`, followed by ` (TRUNCATED to N bytes)` if cut
 *   3. `---`
 *   4. the (possibly truncated) content
 *   5. `</document>`
 *
 * Content whose UTF-8 encoding exceeds `maxBytes` is cut to at most
 * `maxBytes` bytes. The cut is moved back to the nearest code-point boundary
 * so a multi-byte character is never split; a split sequence would decode to
 * U+FFFD and corrupt the tail of the content.
 *
 * @param args.path resolved path of the document.
 * @param args.content text to embed.
 * @param args.metadata JSON-serializable metadata for the metadata line.
 * @param args.maxBytes byte cap on the embedded content.
 * @returns the rendered envelope as a single string.
 */
function renderResult(args: {
  readonly path: string;
  readonly content: string;
  readonly metadata: Record<string, unknown>;
  readonly maxBytes: number;
}): string {
  const encoded = Buffer.from(args.content, "utf-8");
  const truncated = encoded.byteLength > args.maxBytes;
  let content = args.content;
  if (truncated) {
    let end = args.maxBytes;
    // `encoded[end]` is the first byte that would be dropped. While it is a
    // continuation byte (10xxxxxx) the cut sits inside a character, so back up.
    while (end > 0 && ((encoded[end] ?? 0) & 0xc0) === 0x80) {
      end--;
    }
    content = encoded.subarray(0, end).toString("utf-8");
  }
  const notice = truncated ? ` (TRUNCATED to ${args.maxBytes} bytes)` : "";
  return [
    `<document path="${args.path}" name="${basename(args.path)}">`,
    `metadata: ${JSON.stringify(args.metadata)}${notice}`,
    "---",
    content,
    "</document>",
  ].join("\n");
}
|
|
195
|
+
|
|
196
|
+
/**
 * Count the lines in `s`, using "\n" as the line terminator.
 *
 * An empty string has zero lines. A final "\n" ends the last line rather
 * than starting another, so "a\n" and "a" both count as one line.
 */
function countLines(s: string): number {
  if (s === "") return 0;
  // Splitting yields one more piece than there are newlines.
  const pieces = s.split("\n").length;
  // The piece after a trailing newline is empty and is not a line.
  return s.endsWith("\n") ? pieces - 1 : pieces;
}
|