pi-docparser 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +17 -0
- package/LICENSE +21 -0
- package/README.md +219 -0
- package/THIRD_PARTY_NOTICES.md +29 -0
- package/assets/pi-docparser-preview.jpg +0 -0
- package/extensions/docparser/constants.ts +71 -0
- package/extensions/docparser/deps.ts +522 -0
- package/extensions/docparser/doctor.ts +291 -0
- package/extensions/docparser/index.ts +9 -0
- package/extensions/docparser/input.ts +230 -0
- package/extensions/docparser/request.ts +67 -0
- package/extensions/docparser/schema.ts +82 -0
- package/extensions/docparser/tool.ts +305 -0
- package/extensions/docparser/types.ts +100 -0
- package/licenses/LiteParse-APACHE-2.0.txt +201 -0
- package/package.json +66 -0
- package/skills/parse-document/SKILL.md +139 -0
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
import type { ExtensionAPI, ExtensionCommandContext } from "@mariozechner/pi-coding-agent";
|
|
2
|
+
|
|
3
|
+
import { DOCTOR_COMMAND, DOCTOR_COMMAND_NAME, INSTALL_COMMAND_TIMEOUT_MS } from "./constants.ts";
|
|
4
|
+
import {
|
|
5
|
+
buildInstallStrategies,
|
|
6
|
+
diagnoseDependencies,
|
|
7
|
+
getInputCategoryLabel,
|
|
8
|
+
getPlatformLabel,
|
|
9
|
+
getPreferredStrategies,
|
|
10
|
+
getRelevantDependencyNames,
|
|
11
|
+
summarizeInstallOutput,
|
|
12
|
+
} from "./deps.ts";
|
|
13
|
+
import { resolveDocumentTarget } from "./input.ts";
|
|
14
|
+
import type {
|
|
15
|
+
DependencyDiagnosis,
|
|
16
|
+
InputInspection,
|
|
17
|
+
InstallStrategy,
|
|
18
|
+
UiMessageLevel,
|
|
19
|
+
} from "./types.ts";
|
|
20
|
+
|
|
21
|
+
/**
 * Normalizes the raw argument text passed to the doctor command.
 *
 * Surrounding whitespace is removed and one pair of matching single or double
 * quotes that wraps the whole argument is stripped.
 *
 * @param input Raw argument string as received by the command handler.
 * @returns The cleaned argument, or `undefined` when nothing meaningful remains
 *   (blank input or an empty quoted string).
 */
function normalizeDoctorArgument(input: string): string | undefined {
  const trimmed = input.trim();
  if (!trimmed) {
    return undefined;
  }

  // A lone quote character both starts and ends with itself, so require at least
  // two characters before treating the argument as quoted.
  const quote = trimmed.charAt(0);
  const isQuoted =
    trimmed.length >= 2 && (quote === '"' || quote === "'") && trimmed.endsWith(quote);
  if (!isQuoted) {
    return trimmed;
  }

  // Empty quotes carry no argument; report them the same way as blank input.
  const unquoted = trimmed.slice(1, -1).trim();
  return unquoted ? unquoted : undefined;
}
|
|
33
|
+
|
|
34
|
+
/**
 * Renders the doctor report as plain text together with a UI severity level.
 *
 * The level is "error" when at least one missing dependency is flagged as
 * relevant, "warning" when only non-relevant dependencies are missing, and
 * "info" when nothing is missing.
 *
 * @param options.inspection Inspection of the target file, when one was given.
 * @param options.sourcePath Path as supplied by the user.
 * @param options.resolvedPath Path after resolution; printed only if it differs
 *   from `sourcePath`.
 * @param options.diagnoses Dependency diagnoses to list.
 * @param options.strategies Install strategies used for the suggested commands.
 * @param options.installSummary Per-command results of an install attempt.
 * @returns The joined report text and its severity level.
 */
function formatDoctorReport(options: {
  inspection?: InputInspection;
  sourcePath?: string;
  resolvedPath?: string;
  diagnoses: DependencyDiagnosis[];
  strategies: InstallStrategy[];
  installSummary?: string[];
}): { text: string; level: UiMessageLevel } {
  const inspection = options.inspection;
  const missing = options.diagnoses.filter((entry) => !entry.installed);

  let level: UiMessageLevel = "info";
  if (missing.some((entry) => entry.relevant)) {
    level = "error";
  } else if (missing.length > 0) {
    level = "warning";
  }

  const report: string[] = ["docparser doctor", `Platform: ${getPlatformLabel()}`];

  if (options.sourcePath) {
    report.push(`Target: ${options.sourcePath}`);
  }
  if (options.resolvedPath && options.resolvedPath !== options.sourcePath) {
    report.push(`Resolved path: ${options.resolvedPath}`);
  }

  if (inspection) {
    report.push(`Detected input type: ${getInputCategoryLabel(inspection.category)}`);
    if (inspection.extension) {
      report.push(`Detected extension: ${inspection.extension}`);
    }
    if (getRelevantDependencyNames(inspection).size === 0) {
      report.push(
        "This input type does not require extra host conversion packages for normal parsing.",
      );
    }
  }

  // Status wording depends on whether a concrete input was inspected.
  const describeStatus = (entry: DependencyDiagnosis): string => {
    if (entry.installed) {
      return entry.detectedCommand ? `installed (${entry.detectedCommand})` : "installed";
    }
    if (!entry.relevant) {
      return "missing — optional";
    }
    return inspection ? "missing — required for this input" : "missing — relevant";
  };

  report.push("Dependency status:");
  for (const entry of options.diagnoses) {
    report.push(`- ${entry.label}: ${describeStatus(entry)}`);
    report.push(` ${entry.summary}`);
  }

  if (options.installSummary?.length) {
    report.push("Installation attempt:");
    for (const summaryEntry of options.installSummary) {
      // Multi-line entries: first line becomes the bullet, the rest is indented.
      const [headline, ...continuations] = summaryEntry.split("\n");
      report.push(`- ${headline}`);
      report.push(...continuations.map((text) => ` ${text}`));
    }
  }

  if (missing.length > 0) {
    const preferred = getPreferredStrategies(options.strategies);
    if (preferred.length > 0) {
      report.push("Suggested setup commands:");
      // Only the first two preferred strategies are shown.
      for (const strategy of preferred.slice(0, 2)) {
        report.push(`- ${strategy.label}:`);
        report.push(...strategy.commands.map((command) => ` ${command.display}`));
        if (strategy.autoRunBlockedReason) {
          report.push(` Note: ${strategy.autoRunBlockedReason}`);
        }
      }
    } else {
      // No strategy available: fall back to a generic per-platform hint.
      switch (process.platform) {
        case "linux":
          report.push(
            "Suggested setup: install the missing packages with your distribution package manager.",
          );
          break;
        case "darwin":
          report.push(
            "Suggested setup: install Homebrew, then run the appropriate brew install commands.",
          );
          break;
        case "win32":
          report.push("Suggested setup: use winget or Chocolatey to install the missing packages.");
          break;
        default:
          break;
      }
    }
  }

  return { text: report.join("\n"), level };
}
|
|
127
|
+
|
|
128
|
+
/**
 * Gathers the dependency state the doctor command works from.
 *
 * @param inspection Optional inspection of the target file. When present, only
 *   missing dependencies flagged as relevant become install candidates;
 *   otherwise every missing dependency is a candidate.
 * @returns All diagnoses, the missing subset, the install candidates, and the
 *   install strategies built for the candidates (or for all missing
 *   dependencies when there are no candidates).
 */
async function collectDoctorState(inspection?: InputInspection): Promise<{
  diagnoses: DependencyDiagnosis[];
  missingDependencies: DependencyDiagnosis[];
  installCandidates: DependencyDiagnosis[];
  strategies: InstallStrategy[];
}> {
  const diagnoses = await diagnoseDependencies(inspection);
  const missingDependencies = diagnoses.filter((entry) => !entry.installed);

  let installCandidates = missingDependencies;
  if (inspection) {
    installCandidates = missingDependencies.filter((entry) => entry.relevant);
  }

  // Strategies fall back to all missing dependencies when nothing is a candidate.
  const strategyInput = installCandidates.length > 0 ? installCandidates : missingDependencies;
  const strategies = await buildInstallStrategies(strategyInput);

  return { diagnoses, missingDependencies, installCandidates, strategies };
}
|
|
148
|
+
|
|
149
|
+
/**
 * Picks the install strategy to run.
 *
 * A single strategy is returned directly without prompting. Otherwise the user
 * is asked to choose by label through `ctx.ui.select`.
 *
 * @param strategies Strategies to choose from.
 * @param ctx Command context providing the selection UI.
 * @returns The chosen strategy, or `undefined` when the prompt yields no label
 *   or the label matches no strategy.
 */
async function selectInstallStrategy(
  strategies: InstallStrategy[],
  ctx: ExtensionCommandContext,
): Promise<InstallStrategy | undefined> {
  if (strategies.length === 1) {
    return strategies[0];
  }

  const choice = await ctx.ui.select(
    "Choose an install strategy",
    strategies.map((candidate) => candidate.label),
  );

  return choice ? strategies.find((candidate) => candidate.label === choice) : undefined;
}
|
|
165
|
+
|
|
166
|
+
/**
 * Registers the docparser doctor command on the extension API.
 *
 * The handler:
 *  1. returns immediately when the context has no UI;
 *  2. prints usage for `help` / `--help`;
 *  3. optionally resolves a target document from the argument;
 *  4. reports the dependency state;
 *  5. if install candidates and auto-runnable strategies exist, asks for
 *     confirmation, runs the strategy's commands, and reports the state again.
 *
 * Any unexpected error is surfaced as an error notification.
 *
 * @param pi Extension API used to register the command and execute installers.
 */
export function registerDoctorCommand(pi: ExtensionAPI) {
  pi.registerCommand(DOCTOR_COMMAND_NAME, {
    description:
      "Diagnose docparser host dependencies and optionally try to install missing packages",
    handler: async (args, ctx) => {
      // Everything below communicates through the UI, so bail out without one.
      if (!ctx.hasUI) {
        return;
      }

      const normalizedArg = normalizeDoctorArgument(args);

      const wantsHelp = normalizedArg === "help" || normalizedArg === "--help";
      if (wantsHelp) {
        const usage = [
          `${DOCTOR_COMMAND} usage`,
          `- ${DOCTOR_COMMAND}`,
          `- ${DOCTOR_COMMAND} @path/to/file.docx`,
          "",
          "With a file path, the doctor focuses on the dependencies relevant to that input.",
        ].join("\n");
        ctx.ui.notify(usage, "info");
        return;
      }

      try {
        let target: Awaited<ReturnType<typeof resolveDocumentTarget>> | undefined;
        if (normalizedArg) {
          try {
            target = await resolveDocumentTarget(normalizedArg, ctx.cwd);
          } catch {
            // Any resolution failure is reported as an unreadable/missing document.
            ctx.ui.notify(`Document file not found or not readable: ${normalizedArg}`, "error");
            return;
          }
        }

        // Fields shared by the initial and the post-install report.
        const reportTarget = {
          inspection: target?.inspection,
          sourcePath: target?.sourcePath,
          resolvedPath: target?.resolvedPath,
        };

        const initialState = await collectDoctorState(reportTarget.inspection);
        const initialReport = formatDoctorReport({
          ...reportTarget,
          diagnoses: initialState.diagnoses,
          strategies: initialState.strategies,
        });
        ctx.ui.notify(initialReport.text, initialReport.level);

        const { missingDependencies, installCandidates } = initialState;
        if (missingDependencies.length === 0 || installCandidates.length === 0) {
          return;
        }

        const runnable = initialState.strategies.filter((strategy) => strategy.autoRunnable);
        const autoRunnableStrategies = getPreferredStrategies(runnable);
        if (autoRunnableStrategies.length === 0) {
          ctx.ui.notify(
            `No automatic install strategy is safely available right now. Follow the suggested commands above or install the packages manually, then rerun ${DOCTOR_COMMAND}.`,
            "warning",
          );
          return;
        }

        const selectedStrategy = await selectInstallStrategy(autoRunnableStrategies, ctx);
        if (!selectedStrategy) {
          return;
        }

        const installList = installCandidates.map((dependency) => dependency.label).join(", ");
        const plannedCommands = selectedStrategy.commands.map(
          (command) => `- ${command.display}`,
        );
        const confirmText = [
          `Missing packages: ${installList}`,
          `Installer: ${selectedStrategy.label}`,
          "",
          "Commands that will be attempted:",
          ...plannedCommands,
          "",
          "Try running them now?",
        ].join("\n");

        if (!(await ctx.ui.confirm("docparser doctor", confirmText))) {
          return;
        }

        // Commands run sequentially; a failure is recorded but does not stop the rest.
        const installSummary: string[] = [];
        for (const command of selectedStrategy.commands) {
          ctx.ui.notify(`Running: ${command.display}`, "info");
          const result = await pi.exec(command.command, command.args, {
            timeout: command.timeoutMs ?? INSTALL_COMMAND_TIMEOUT_MS,
          });

          const succeeded = result.code === 0 && !result.killed;
          const outcome = succeeded
            ? "ok"
            : `failed (exit ${result.code}${result.killed ? ", killed" : ""})`;
          installSummary.push(`${command.description}: ${outcome}`);

          if (succeeded) {
            continue;
          }

          const outputSummary = summarizeInstallOutput(result.stdout, result.stderr);
          if (outputSummary) {
            installSummary.push(`Command output:\n${outputSummary}`);
          }
        }

        // Re-diagnose so the final report reflects what the install attempt changed.
        const finalState = await collectDoctorState(reportTarget.inspection);
        const finalReport = formatDoctorReport({
          ...reportTarget,
          diagnoses: finalState.diagnoses,
          strategies: finalState.strategies,
          installSummary,
        });
        ctx.ui.notify(finalReport.text, finalReport.level);
      } catch (error) {
        const reason = error instanceof Error ? error.message : String(error);
        ctx.ui.notify(`docparser doctor failed: ${reason}`, "error");
      }
    },
  });
}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
|
|
2
|
+
|
|
3
|
+
import { registerDoctorCommand } from "./doctor.ts";
|
|
4
|
+
import { registerDocumentParseTool } from "./tool.ts";
|
|
5
|
+
|
|
6
|
+
/**
 * Extension entry point: registers the document parse tool first, then the
 * doctor command, on the given extension API.
 *
 * @param pi Extension API handed to each registration function.
 */
export default function parseDocumentExtension(pi: ExtensionAPI) {
  const registrars = [registerDocumentParseTool, registerDoctorCommand];
  for (const register of registrars) {
    register(pi);
  }
}
|
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
import { constants as fsConstants } from "node:fs";
|
|
2
|
+
import { access, open } from "node:fs/promises";
|
|
3
|
+
import { extname, isAbsolute, resolve as resolvePath } from "node:path";
|
|
4
|
+
import { homedir } from "node:os";
|
|
5
|
+
|
|
6
|
+
import { IMAGE_EXTENSIONS, OFFICE_EXTENSIONS, SPREADSHEET_EXTENSIONS } from "./constants.ts";
|
|
7
|
+
import type { InputCategory, InputInspection, ScreenshotSelection } from "./types.ts";
|
|
8
|
+
|
|
9
|
+
/** U+202F NARROW NO-BREAK SPACE; substituted before AM/PM when probing filename variants. */
const NARROW_NO_BREAK_SPACE = "\u202F";

/**
 * Matches every occurrence of these Unicode space characters: U+00A0,
 * U+2000–U+200A, U+202F, U+205F and U+3000. Used to replace them with a plain
 * ASCII space in user-supplied paths.
 */
const UNICODE_SPACES = /[\u00A0\u2000-\u200A\u202F\u205F\u3000]/g;
|
|
11
|
+
|
|
12
|
+
/**
 * Cleans a user-supplied document path: trims it, drops a single leading `@`,
 * and replaces Unicode space characters with plain spaces.
 *
 * @param input Raw path text.
 * @returns The normalized path string.
 */
function normalizeDocumentPathInput(input: string): string {
  const trimmed = input.trim();
  const withoutPrefix = trimmed.startsWith("@") ? trimmed.slice(1) : trimmed;
  return withoutPrefix.replace(UNICODE_SPACES, " ");
}
|
|
15
|
+
|
|
16
|
+
/**
 * Expands a leading `~` to the current user's home directory.
 *
 * Only the exact string `~` and paths starting with `~/` are expanded; anything
 * else (including `~user/...`) is returned unchanged.
 *
 * @param filePath Path that may start with `~`.
 * @returns The path with the home directory substituted where applicable.
 */
function expandHomeDirectory(filePath: string): string {
  const refersToHome = filePath === "~" || filePath.startsWith("~/");
  if (!refersToHome) {
    return filePath;
  }

  // Dropping the "~" leaves either "" or the "/rest" suffix to append.
  return homedir() + filePath.slice(1);
}
|
|
27
|
+
|
|
28
|
+
/**
 * Produces a variant of the path in which the regular space before each `AM.`
 * or `PM.` is replaced by a narrow no-break space.
 *
 * NOTE(review): presumably targets macOS-generated file names, as the function
 * name suggests — confirm.
 *
 * @param filePath Path to transform.
 * @returns The transformed path (unchanged when nothing matches).
 */
function tryMacOsAmPmVariant(filePath: string): string {
  return filePath.replace(
    / (AM|PM)\./g,
    (_match, meridiem: string) => `${NARROW_NO_BREAK_SPACE}${meridiem}.`,
  );
}
|
|
31
|
+
|
|
32
|
+
/**
 * Returns the path in Unicode NFD (canonically decomposed) form.
 *
 * @param filePath Path to normalize.
 * @returns The NFD-normalized path.
 */
function tryNfdVariant(filePath: string): string {
  const decomposed = filePath.normalize("NFD");
  return decomposed;
}
|
|
35
|
+
|
|
36
|
+
/**
 * Returns the path with every straight apostrophe (`'`) replaced by the right
 * single quotation mark (U+2019).
 *
 * @param filePath Path to transform.
 * @returns The transformed path.
 */
function tryCurlyQuoteVariant(filePath: string): string {
  return filePath.split("'").join("\u2019");
}
|
|
39
|
+
|
|
40
|
+
/**
 * Checks whether a path exists, using `access` with `F_OK`.
 *
 * @param filePath Path to probe.
 * @returns `true` when the access check succeeds, `false` on any failure.
 */
async function pathExists(filePath: string): Promise<boolean> {
  return access(filePath, fsConstants.F_OK).then(
    () => true,
    () => false,
  );
}
|
|
48
|
+
|
|
49
|
+
/**
 * Resolves a user-supplied path to an existing file-system entry where possible.
 *
 * The path is home-expanded and made absolute against `cwd`. Then, in order,
 * the path itself, its AM/PM narrow-space variant, its NFD form, its
 * curly-quote variant, and the curly-quote variant of the NFD form are probed.
 * The first candidate that exists wins.
 *
 * @param filePath Path as entered (may be relative or start with `~`).
 * @param cwd Directory that relative paths are resolved against.
 * @returns The first existing candidate, or the plain resolved path when none
 *   of the candidates exists.
 */
async function resolveExistingPath(filePath: string, cwd: string): Promise<string> {
  const homeExpanded = expandHomeDirectory(filePath);
  const basePath = isAbsolute(homeExpanded) ? homeExpanded : resolvePath(cwd, homeExpanded);
  const decomposed = tryNfdVariant(basePath);

  // A Set keeps insertion order and skips variants that turned out identical.
  const candidates = new Set<string>();
  candidates.add(basePath);
  candidates.add(tryMacOsAmPmVariant(basePath));
  candidates.add(decomposed);
  candidates.add(tryCurlyQuoteVariant(basePath));
  candidates.add(tryCurlyQuoteVariant(decomposed));

  for (const candidate of candidates) {
    const found = await pathExists(candidate);
    if (found) {
      return candidate;
    }
  }

  return basePath;
}
|
|
68
|
+
|
|
69
|
+
/**
 * Verifies that the path passes an `access` check with `R_OK`.
 *
 * @param filePath Resolved path to check.
 * @param sourcePath Path as supplied by the user; used in the error message.
 * @throws Error with "Document file not found or not readable: <sourcePath>"
 *   when the access check fails for any reason.
 */
async function ensureReadableFile(filePath: string, sourcePath: string): Promise<void> {
  await access(filePath, fsConstants.R_OK).catch(() => {
    throw new Error(`Document file not found or not readable: ${sourcePath}`);
  });
}
|
|
76
|
+
|
|
77
|
+
/**
 * Maps a file extension to an input category.
 *
 * `.pdf` is checked first, then the office, spreadsheet, and image extension
 * sets in that order.
 *
 * @param extension Extension to classify, compared as given (callers lowercase
 *   it beforehand).
 * @returns The matching category, or `undefined` for an unrecognized extension.
 */
function getInputCategory(extension: string): InputCategory | undefined {
  if (extension === ".pdf") {
    return "pdf";
  }

  // Ordered lookup table: the first set containing the extension decides.
  const lookups: Array<[{ has(value: string): boolean }, InputCategory]> = [
    [OFFICE_EXTENSIONS, "office"],
    [SPREADSHEET_EXTENSIONS, "spreadsheet"],
    [IMAGE_EXTENSIONS, "image"],
  ];

  for (const [extensions, category] of lookups) {
    if (extensions.has(extension)) {
      return category;
    }
  }

  return undefined;
}
|
|
96
|
+
|
|
97
|
+
/**
 * Reads up to `length` bytes from the start of a file.
 *
 * @param filePath File to read from.
 * @param length Maximum number of bytes to read.
 * @returns A buffer holding only the bytes actually read (shorter than `length`
 *   for small files).
 * @throws Whatever `open` / `read` reject with; the handle is always closed.
 */
async function readFileHeader(filePath: string, length: number): Promise<Buffer> {
  const file = await open(filePath, "r");

  try {
    const scratch = Buffer.alloc(length);
    const outcome = await file.read(scratch, 0, length, 0);
    return scratch.subarray(0, outcome.bytesRead);
  } finally {
    await file.close();
  }
}
|
|
108
|
+
|
|
109
|
+
/**
 * Tells whether the header starts with the four ASCII bytes `%PDF`.
 *
 * @param header Leading bytes of a file.
 * @returns `true` when at least four bytes are present and they spell `%PDF`.
 */
function isPdfHeader(header: Buffer): boolean {
  // "%PDF" as raw bytes.
  const signature = [0x25, 0x50, 0x44, 0x46];
  if (header.length < signature.length) {
    return false;
  }
  return signature.every((byte, index) => header[index] === byte);
}
|
|
112
|
+
|
|
113
|
+
/**
 * Tells whether the header starts with the bytes `0x89 'P' 'N' 'G'`.
 *
 * Only these first four bytes are compared.
 *
 * @param header Leading bytes of a file.
 * @returns `true` when at least four bytes are present and they match.
 */
function isPngHeader(header: Buffer): boolean {
  const signature = [0x89, 0x50, 0x4e, 0x47];
  if (header.length < signature.length) {
    return false;
  }
  return signature.every((byte, index) => header[index] === byte);
}
|
|
122
|
+
|
|
123
|
+
/**
 * Tells whether the header starts with the bytes `0xFF 0xD8 0xFF`.
 *
 * @param header Leading bytes of a file.
 * @returns `true` when at least three bytes are present and they match.
 */
function isJpegHeader(header: Buffer): boolean {
  const signature = [0xff, 0xd8, 0xff];
  if (header.length < signature.length) {
    return false;
  }
  return signature.every((byte, index) => header[index] === byte);
}
|
|
126
|
+
|
|
127
|
+
/**
 * Classifies an input file by extension, falling back to its leading bytes.
 *
 * A recognized (lowercased) extension decides the category outright. Otherwise
 * the first 16 bytes are read: a PDF header yields "pdf" (keeping the existing
 * extension, or `.pdf` when there is none). PNG and JPEG headers are only
 * honored for files with no extension at all. Everything else is "other".
 *
 * Failures while reading the header are ignored; this is best-effort only.
 *
 * @param filePath Path of the file to inspect.
 * @returns The detected extension and category.
 */
async function inspectInputFile(filePath: string): Promise<InputInspection> {
  const extension = extname(filePath).toLowerCase();

  const knownCategory = getInputCategory(extension);
  if (knownCategory) {
    return { extension, category: knownCategory };
  }

  let header: Buffer | undefined;
  try {
    header = await readFileHeader(filePath, 16);
  } catch {
    // Best-effort inspection only. Readability is validated separately.
    header = undefined;
  }

  if (header) {
    if (isPdfHeader(header)) {
      return { extension: extension || ".pdf", category: "pdf" };
    }

    if (!extension) {
      if (isPngHeader(header)) {
        return { extension: ".png", category: "image" };
      }
      if (isJpegHeader(header)) {
        return { extension: ".jpg", category: "image" };
      }
    }
  }

  return { extension, category: "other" };
}
|
|
155
|
+
|
|
156
|
+
/**
 * Turns user input into a verified, inspected document target.
 *
 * The input is normalized, resolved to a path (trying known filename variants),
 * checked for readability, and finally inspected for its input category.
 *
 * @param input Path text as supplied by the user.
 * @param cwd Directory that relative paths are resolved against.
 * @returns The normalized source path, the resolved path, and the inspection.
 * @throws Error when the resolved path fails the readability check.
 */
export async function resolveDocumentTarget(
  input: string,
  cwd: string,
): Promise<{
  sourcePath: string;
  resolvedPath: string;
  inspection: InputInspection;
}> {
  const sourcePath = normalizeDocumentPathInput(input);
  const resolvedPath = await resolveExistingPath(sourcePath, cwd);

  // Fail early, with the user's original spelling, before touching the file.
  await ensureReadableFile(resolvedPath, sourcePath);

  const inspection = await inspectInputFile(resolvedPath);
  return { sourcePath, resolvedPath, inspection };
}
|
|
175
|
+
|
|
176
|
+
/**
 * Parses a page selection such as `"1-5,10,15-20"` into a sorted list of
 * unique page numbers.
 *
 * Entries are comma-separated; blank entries are skipped. Each entry is either
 * a single positive integer or an inclusive `start-end` range (whitespace
 * around the numbers and the dash is allowed). Parsing is strict: entries with
 * trailing characters (`"1abc"`), fractions (`"1.5"`), signs (`"+3"`), or extra
 * range segments (`"1-2-3"`) are rejected instead of being partially accepted.
 *
 * @param selection Selection string to parse.
 * @returns Ascending, de-duplicated page numbers (all >= 1).
 * @throws Error "Invalid page range: <entry>" for a malformed or descending range.
 * @throws Error "Invalid page number: <entry>" for a malformed single page.
 * @throws Error "No valid page numbers were provided." when nothing was selected.
 */
export function parsePageSelection(selection: string): number[] {
  const pages = new Set<number>();

  for (const rawPart of selection.split(",")) {
    const part = rawPart.trim();
    if (!part) continue;

    if (part.includes("-")) {
      // Whole-entry match so that "1-2-3" or "2-5x" cannot slip through.
      const range = /^(\d+)\s*-\s*(\d+)$/.exec(part);
      const start = range ? Number(range[1]) : Number.NaN;
      const end = range ? Number(range[2]) : Number.NaN;

      if (
        !Number.isSafeInteger(start) ||
        !Number.isSafeInteger(end) ||
        start < 1 ||
        end < start
      ) {
        throw new Error(`Invalid page range: ${part}`);
      }

      for (let page = start; page <= end; page++) {
        pages.add(page);
      }
      continue;
    }

    // Digits only: Number.parseInt would accept "1abc" or "1.5" as page 1.
    const page = /^\d+$/.test(part) ? Number(part) : Number.NaN;
    if (!Number.isSafeInteger(page) || page < 1) {
      throw new Error(`Invalid page number: ${part}`);
    }
    pages.add(page);
  }

  const result = Array.from(pages).sort((a, b) => a - b);
  if (result.length === 0) {
    throw new Error("No valid page numbers were provided.");
  }

  return result;
}
|
|
212
|
+
|
|
213
|
+
/**
 * Interprets the screenshot page selection.
 *
 * `all` (any letter case) and `*` select every page, represented by
 * `pageNumbers: undefined`. Any other value is parsed as a page selection.
 *
 * @param selection Selection text; surrounding whitespace is ignored.
 * @returns The page numbers (or `undefined` for all pages) plus a description.
 * @throws Error when the selection is blank, or when page parsing fails.
 */
export function resolveScreenshotSelection(selection: string): ScreenshotSelection {
  const requested = selection.trim();
  if (requested.length === 0) {
    throw new Error("Screenshot page selection must not be empty.");
  }

  const keyword = requested.toLowerCase();
  if (keyword === "all" || keyword === "*") {
    return { pageNumbers: undefined, description: "all pages" };
  }

  const pageNumbers = parsePageSelection(requested);
  return { pageNumbers, description: `pages ${requested}` };
}
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import { DEFAULT_DPI, DEFAULT_MAX_PAGES, DEFAULT_NUM_WORKERS } from "./constants.ts";
|
|
2
|
+
import { resolveScreenshotSelection } from "./input.ts";
|
|
3
|
+
import type { DocumentParseParams, DocumentParsePlan, LiteParseToolConfig } from "./types.ts";
|
|
4
|
+
|
|
5
|
+
/**
 * Trims an optional string and collapses "nothing there" to `undefined`.
 *
 * @param value String to normalize; may be absent.
 * @returns The trimmed string, or `undefined` when the value is absent or
 *   contains only whitespace.
 */
function normalizeOptionalString(value: string | undefined): string | undefined {
  if (value == null) {
    return undefined;
  }

  const trimmed = value.trim();
  return trimmed.length > 0 ? trimmed : undefined;
}
|
|
9
|
+
|
|
10
|
+
/**
 * Decides which OCR language value goes into the parser configuration.
 *
 * - With no usable entries in `ocrLanguages`, the single `ocrLanguage` (if
 *   any) is used.
 * - With an OCR server URL, only the first list entry is used; a warning is
 *   added when more were supplied.
 * - Otherwise the list entries are joined with `+`.
 *
 * A warning is also added when both `ocrLanguage` and `ocrLanguages` are set,
 * in which case the list wins.
 *
 * @param params Tool parameters carrying `ocrLanguage` / `ocrLanguages`.
 * @param ocrServerUrl Normalized OCR server URL, if one is configured.
 * @param warnings Collector that receives warning messages (mutated).
 * @returns The resolved language value, or `undefined` when none was given.
 */
function resolveOcrLanguage(
  params: DocumentParseParams,
  ocrServerUrl: string | undefined,
  warnings: string[],
): LiteParseToolConfig["ocrLanguage"] | undefined {
  const fallbackLanguage = normalizeOptionalString(params.ocrLanguage);

  // Keep only entries that are non-empty after trimming.
  const requested: string[] = [];
  for (const language of params.ocrLanguages ?? []) {
    const code = language.trim();
    if (code) {
      requested.push(code);
    }
  }

  const hasList = requested.length > 0;
  if (fallbackLanguage && hasList) {
    warnings.push("Both ocrLanguage and ocrLanguages were provided. Using ocrLanguages.");
  }

  if (!hasList) {
    return fallbackLanguage;
  }

  if (!ocrServerUrl) {
    return requested.join("+");
  }

  if (requested.length > 1) {
    warnings.push(
      "Multiple OCR languages were provided, but HTTP OCR servers currently receive only the first language code.",
    );
  }
  return requested[0];
}
|
|
40
|
+
|
|
41
|
+
/**
 * Translates tool parameters into a parse plan: the parser configuration with
 * defaults applied, the optional screenshot selection, and collected warnings.
 *
 * @param params Parameters supplied to the document parse tool.
 * @returns The plan consumed by the parse tool.
 * @throws Error when `screenshotPages` is set but cannot be resolved.
 */
export function buildDocumentParsePlan(params: DocumentParseParams): DocumentParsePlan {
  const warnings: string[] = [];

  const ocrServerUrl = normalizeOptionalString(params.ocrServerUrl);
  const ocrLanguage = resolveOcrLanguage(params, ocrServerUrl, warnings);

  // OCR stays enabled unless the caller explicitly turned it off.
  const ocrMode = params.ocr ?? "auto";
  const targetPages = normalizeOptionalString(params.targetPages);

  const parserConfig: LiteParseToolConfig = {
    outputFormat: params.format ?? "text",
    ocrEnabled: ocrMode !== "off",
    ocrLanguage,
    ocrServerUrl,
    numWorkers: params.numWorkers ?? DEFAULT_NUM_WORKERS,
    maxPages: params.maxPages ?? DEFAULT_MAX_PAGES,
    targetPages,
    dpi: params.dpi ?? DEFAULT_DPI,
    preciseBoundingBox: params.preciseBoundingBox ?? true,
    preserveVerySmallText: params.preserveSmallText ?? false,
    preserveLayoutAlignmentAcrossPages: params.preserveLayoutAlignmentAcrossPages ?? false,
  };

  let screenshotSelection: DocumentParsePlan["screenshotSelection"] = undefined;
  if (params.screenshotPages) {
    screenshotSelection = resolveScreenshotSelection(params.screenshotPages);
  }

  return { parserConfig, screenshotSelection, warnings };
}
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
import { StringEnum, Type } from "@mariozechner/pi-ai";
|
|
2
|
+
|
|
3
|
+
/** Allowed values for the `format` parameter. */
const OUTPUT_FORMATS = ["text", "json"] as const;

/** Allowed values for the `ocr` parameter. */
const OCR_MODES = ["auto", "off"] as const;

/**
 * Parameter schema of the document parse tool. Only `path` is required; every
 * other field is optional.
 */
export const DocumentParseSchema = Type.Object({
  // Required: the document to parse.
  path: Type.String({
    description:
      "Path to the document file to parse (PDF, DOCX, PPTX, XLSX, CSV, PNG, JPG, TIFF, WebP, etc.)",
  }),

  // Output shape.
  format: Type.Optional(
    StringEnum(OUTPUT_FORMATS, {
      description: "Output format for the parsed document (default: text)",
    }),
  ),

  // Page selection for parsing and for screenshots.
  targetPages: Type.Optional(
    Type.String({
      description: 'Optional page selection for parsing, e.g. "1-5,10,15-20"',
    }),
  ),
  screenshotPages: Type.Optional(
    Type.String({
      description:
        'Optional PDF page selection for screenshots, e.g. "1-3,8" or "all". Screenshots are currently generated only for PDF inputs and are saved as PNG files.',
    }),
  ),

  // OCR controls.
  ocr: Type.Optional(
    StringEnum(OCR_MODES, {
      description:
        "OCR mode: auto uses LiteParse OCR behavior, off disables OCR for faster parsing",
    }),
  ),
  ocrLanguage: Type.Optional(
    Type.String({
      description:
        "Optional single OCR language code. Built-in Tesseract typically uses ISO 639-3 codes such as eng, deu, fra, jpn. Many HTTP OCR servers instead expect ISO 639-1 codes such as en, de, fr, ja.",
    }),
  ),
  ocrLanguages: Type.Optional(
    Type.Array(Type.String(), {
      minItems: 1,
      description:
        "Optional multiple OCR language codes. For built-in Tesseract they are joined into a multilingual language string. For HTTP OCR servers, only the first code is forwarded.",
    }),
  ),
  ocrServerUrl: Type.Optional(
    Type.String({
      description: "Optional HTTP OCR server URL implementing the LiteParse OCR API",
    }),
  ),

  // Resource and rendering limits.
  numWorkers: Type.Optional(
    Type.Integer({
      minimum: 1,
      description: "Optional OCR worker count (default: CPU cores - 1)",
    }),
  ),
  maxPages: Type.Optional(
    Type.Integer({
      minimum: 1,
      description: "Maximum number of pages to parse (default: 10000)",
    }),
  ),
  dpi: Type.Optional(
    Type.Integer({
      minimum: 72,
      description: "Rendering DPI for OCR and screenshots (default: 150)",
    }),
  ),

  // Layout and text-extraction switches.
  preciseBoundingBox: Type.Optional(
    Type.Boolean({
      description: "Whether to compute precise bounding boxes (default: true)",
    }),
  ),
  preserveSmallText: Type.Optional(
    Type.Boolean({
      description: "Whether to preserve very small text that would otherwise be filtered out",
    }),
  ),
  preserveLayoutAlignmentAcrossPages: Type.Optional(
    Type.Boolean({
      description:
        "Whether to preserve text alignment consistently across page boundaries (default: false)",
    }),
  ),
});
|