pi-docparser 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,522 @@
1
+ import { truncateTail } from "@mariozechner/pi-coding-agent";
2
+ import { spawn } from "node:child_process";
3
+ import { constants as fsConstants } from "node:fs";
4
+ import { access } from "node:fs/promises";
5
+
6
+ import {
7
+ DOCTOR_COMMAND,
8
+ GHOSTSCRIPT_MISSING_MESSAGE,
9
+ GHOSTSCRIPT_REQUIRED_EXTENSIONS,
10
+ IMAGEMAGICK_MISSING_MESSAGE,
11
+ INSTALL_COMMAND_TIMEOUT_MS,
12
+ LIBREOFFICE_MISSING_MESSAGE,
13
+ } from "./constants.ts";
14
+ import type {
15
+ DependencyDiagnosis,
16
+ DependencyName,
17
+ InputCategory,
18
+ InputInspection,
19
+ InstallCommandSpec,
20
+ InstallStrategy,
21
+ PackageManagerId,
22
+ UnixPrivilegeContext,
23
+ } from "./types.ts";
24
+
25
/** Host dependencies handled by this module, in the order they are diagnosed. */
const DEPENDENCY_NAMES = ["libreoffice", "imagemagick", "ghostscript"] as const;

/** Friendly names for the `process.platform` values that have one. */
const PLATFORM_LABELS = { darwin: "macOS", linux: "Linux", win32: "Windows" } as const;

/** User-facing label for every input category. */
const INPUT_CATEGORY_LABELS: Record<InputCategory, string> = {
  pdf: "PDF",
  office: "Office document",
  spreadsheet: "Spreadsheet / tabular document",
  image: "Image",
  other: "Other / unknown",
};

/** Package names shared by every manager that needs no manager-specific spelling. */
const COMMON_PACKAGE_NAMES: Record<DependencyName, string> = {
  libreoffice: "libreoffice",
  imagemagick: "imagemagick",
  ghostscript: "ghostscript",
};

/**
 * Package identifier of each dependency, per package manager.
 * Entries start from the common names and list only the spellings that differ.
 */
const PACKAGE_NAMES: Record<PackageManagerId, Record<DependencyName, string>> = {
  brew: { ...COMMON_PACKAGE_NAMES },
  "apt-get": { ...COMMON_PACKAGE_NAMES },
  dnf: { ...COMMON_PACKAGE_NAMES, imagemagick: "ImageMagick" },
  yum: { ...COMMON_PACKAGE_NAMES, imagemagick: "ImageMagick" },
  pacman: { ...COMMON_PACKAGE_NAMES, libreoffice: "libreoffice-fresh" },
  zypper: { ...COMMON_PACKAGE_NAMES, imagemagick: "ImageMagick" },
  apk: { ...COMMON_PACKAGE_NAMES },
  winget: {
    libreoffice: "TheDocumentFoundation.LibreOffice",
    imagemagick: "ImageMagick.Q16",
    ghostscript: "ArtifexSoftware.GhostScript",
  },
  choco: {
    ...COMMON_PACKAGE_NAMES,
    libreoffice: "libreoffice-fresh",
    imagemagick: "imagemagick.app",
  },
};

/** Linux package managers probed by `buildInstallStrategies`, in probing order. */
const LINUX_MANAGERS: Array<{ id: PackageManagerId; label: string }> = [
  { id: "apt-get", label: "APT" },
  { id: "dnf", label: "DNF" },
  { id: "yum", label: "YUM" },
  { id: "pacman", label: "pacman" },
  { id: "zypper", label: "zypper" },
  { id: "apk", label: "apk" },
];

/** Substrings that identify a message as a dependency-setup message. */
const DEPENDENCY_SETUP_PATTERNS = [
  "LibreOffice is not installed",
  "ImageMagick is not installed",
  "Ghostscript is required",
];
98
+
99
/**
 * Runs `command` with `args`, discarding all of its output, and reports whether
 * it exited with status 0.
 *
 * The returned promise never rejects: a spawn error resolves to `false`, as
 * does any non-zero exit code.
 */
async function spawnSucceeded(command: string, args: string[]): Promise<boolean> {
  const exitedCleanly = new Promise<boolean>((settle) => {
    const proc = spawn(command, args, { stdio: "ignore", windowsHide: true });

    proc.on("error", () => {
      settle(false);
    });
    proc.on("close", (exitCode) => {
      settle(exitCode === 0);
    });
  });

  return exitedCleanly;
}
110
+
111
/**
 * Reports whether `binary` can be located by the platform's lookup tool:
 * `where` on Windows, `which` everywhere else.
 */
async function runBinaryLookup(binary: string): Promise<boolean> {
  const lookupTool = process.platform === "win32" ? "where" : "which";
  return spawnSucceeded(lookupTool, [binary]);
}
114
+
115
/**
 * Reports whether `filePath` is accessible: it must exist on Windows and be
 * executable by the current process on every other platform.
 * Any access failure is reported as `false` rather than thrown.
 */
async function isExecutablePathAvailable(filePath: string): Promise<boolean> {
  const requiredAccess = process.platform === "win32" ? fsConstants.F_OK : fsConstants.X_OK;

  try {
    await access(filePath, requiredAccess);
  } catch {
    return false;
  }

  return true;
}
123
+
124
/**
 * Returns the first usable candidate, or `undefined` when none is available.
 *
 * Candidates are probed one at a time: every entry of `commandNames` is looked
 * up by name first, and only if none is found are the `candidatePaths` checked
 * on disk. The matching name or path is returned exactly as it was passed in.
 */
async function findFirstAvailableCommand(
  commandNames: string[],
  candidatePaths: string[] = [],
): Promise<string | undefined> {
  const probes: Array<[string[], (candidate: string) => Promise<boolean>]> = [
    [commandNames, runBinaryLookup],
    [candidatePaths, isExecutablePathAvailable],
  ];

  for (const [candidates, isAvailable] of probes) {
    for (const candidate of candidates) {
      if (await isAvailable(candidate)) {
        return candidate;
      }
    }
  }

  return undefined;
}
142
+
143
/**
 * Fills the `%s` placeholder of `GHOSTSCRIPT_MISSING_MESSAGE` with an
 * upper-cased file-type label derived from `extension`.
 *
 * @param extension File extension, with or without a leading dot; an empty
 *   string falls back to the label "VECTOR".
 * @returns The message with the first `%s` replaced by the label.
 */
function formatGhostscriptMissingMessage(extension: string): string {
  const fileTypeLabel = (extension || "vector").replace(/^\./, "").toUpperCase();

  // A replacer function inserts the label verbatim. A plain replacement string
  // would expand "$&", "$$", "$`" and similar sequences found in the extension.
  return GHOSTSCRIPT_MISSING_MESSAGE.replace("%s", () => fileTypeLabel);
}
147
+
148
/** Descriptive data and probing hooks kept for one host dependency. */
type DependencyMetadataEntry = {
  label: string;
  summary: string;
  findCommand: () => Promise<string | undefined>;
  getMissingMessage: (inspection?: InputInspection) => string;
};

/**
 * Fixed LibreOffice install locations checked when no command name resolves.
 * Only macOS and Windows have such locations; other platforms get an empty list.
 */
function getLibreOfficeCandidatePaths(): string[] {
  switch (process.platform) {
    case "darwin":
      return [
        "/Applications/LibreOffice.app/Contents/MacOS/soffice",
        "/Applications/LibreOffice.app/Contents/MacOS/libreoffice",
      ];
    case "win32":
      return [
        "C:\\Program Files\\LibreOffice\\program\\soffice.exe",
        "C:\\Program Files\\LibreOffice\\program\\libreoffice.exe",
        "C:\\Program Files (x86)\\LibreOffice\\program\\soffice.exe",
        "C:\\Program Files (x86)\\LibreOffice\\program\\libreoffice.exe",
      ];
    default:
      return [];
  }
}

/**
 * Label, summary, detection routine and "missing" message for each dependency.
 * The platform is read when `findCommand` runs, not when this table is built.
 */
const DEPENDENCY_METADATA: Record<DependencyName, DependencyMetadataEntry> = {
  libreoffice: {
    label: "LibreOffice",
    summary:
      "Needed for Office documents and spreadsheets such as DOCX, PPTX, XLSX, CSV, and similar formats.",
    findCommand: () =>
      findFirstAvailableCommand(["libreoffice", "soffice"], getLibreOfficeCandidatePaths()),
    getMissingMessage: () => LIBREOFFICE_MISSING_MESSAGE,
  },
  imagemagick: {
    label: "ImageMagick",
    summary:
      "Needed for image inputs such as PNG, JPG, TIFF, WebP, SVG, and similar formats that must be converted before parsing.",
    findCommand: () => {
      // `convert` is accepted everywhere except Windows.
      // NOTE(review): presumably because Windows ships an unrelated convert.exe — confirm.
      const commandNames = process.platform === "win32" ? ["magick"] : ["magick", "convert"];
      return findFirstAvailableCommand(commandNames);
    },
    getMissingMessage: () => IMAGEMAGICK_MISSING_MESSAGE,
  },
  ghostscript: {
    label: "Ghostscript",
    summary:
      "Needed for vector image conversion paths such as SVG, EPS, PS, and AI when ImageMagick delegates rendering.",
    findCommand: () => findFirstAvailableCommand(["gs", "gswin64c", "gswin32c"]),
    // Without an inspection (or with an empty extension) the message is worded for ".svg".
    getMissingMessage: (inspection) => {
      const extension = inspection?.extension || ".svg";
      return formatGhostscriptMissingMessage(extension);
    },
  },
};
197
+
198
/**
 * Translates dependency names into the package identifiers `manager` uses,
 * keeping the order of `dependencyNames`.
 */
function getPackageNames(manager: PackageManagerId, dependencyNames: DependencyName[]): string[] {
  const namesForManager = PACKAGE_NAMES[manager];
  return dependencyNames.map((name) => namesForManager[name]);
}
201
+
202
/**
 * Builds the argument list that installs `packageNames` with a Linux package manager.
 *
 * pacman receives `-Sy --noconfirm`, apk receives `add`, and every other
 * manager receives `install -y`; the package names always come last.
 *
 * NOTE(review): `pacman -Sy <pkg>` refreshes the package database without
 * upgrading installed packages — confirm that this is the intended behavior.
 */
function getLinuxInstallArgs(managerId: PackageManagerId, packageNames: string[]): string[] {
  switch (managerId) {
    case "pacman":
      return ["-Sy", "--noconfirm", ...packageNames];
    case "apk":
      return ["add", ...packageNames];
    default:
      return ["install", "-y", ...packageNames];
  }
}
213
+
214
/**
 * Renders a command line for display: `displayPrefix` immediately followed by
 * `command`, then each argument separated by a single space. Arguments are
 * joined as-is, without quoting.
 */
function buildCommandDisplay(command: string, args: string[], displayPrefix = ""): string {
  const head = `${displayPrefix}${command}`;
  if (args.length === 0) {
    return head;
  }

  return [head, ...args].join(" ");
}
217
+
218
/**
 * Assembles an `InstallCommandSpec`.
 *
 * Without a prefix the spec runs `command` with `args` directly. With a prefix,
 * the first prefix entry becomes the executable and the remaining prefix
 * entries, `command`, and `args` become its arguments. The display string
 * always shows `command` and `args`; `displayPrefix` is prepended only when a
 * prefix is in use.
 */
function createCommandSpec(
  description: string,
  command: string,
  args: string[],
  options: { prefix?: string[]; displayPrefix?: string; timeoutMs?: number } = {},
): InstallCommandSpec {
  const wrapper = options.prefix ?? [];
  const shownPrefix = options.displayPrefix ?? "";
  const isWrapped = wrapper.length > 0;

  return {
    description,
    command: isWrapped ? wrapper[0] : command,
    args: isWrapped ? [...wrapper.slice(1), command, ...args] : args,
    display: isWrapped
      ? buildCommandDisplay(command, args, shownPrefix)
      : buildCommandDisplay(command, args),
    timeoutMs: options.timeoutMs,
  };
}
245
+
246
/**
 * Determines how privileged install commands can be run on this host.
 *
 * - Running as uid 0: no prefix is needed and commands are auto-runnable.
 * - `sudo` is found and `sudo -n true` succeeds: commands are prefixed with
 *   `sudo -n` and are auto-runnable.
 * - Otherwise: the same `sudo -n` prefix is reported, but auto-run is blocked
 *   with an explanatory reason.
 */
async function getUnixPrivilegeContext(): Promise<UnixPrivilegeContext> {
  const runningAsRoot = typeof process.getuid === "function" && process.getuid() === 0;
  if (runningAsRoot) {
    return { prefix: [], displayPrefix: "", autoRunnable: true };
  }

  const sudoInvocation = { prefix: ["sudo", "-n"], displayPrefix: "sudo " };

  // `sudo -n true` is attempted only when the sudo lookup itself succeeded.
  const hasNonInteractiveSudo =
    (await runBinaryLookup("sudo")) && (await spawnSucceeded("sudo", ["-n", "true"]));
  if (hasNonInteractiveSudo) {
    return { ...sudoInvocation, autoRunnable: true };
  }

  return {
    ...sudoInvocation,
    autoRunnable: false,
    blockedReason: "Automatic install on Linux requires root privileges or passwordless sudo.",
  };
}
270
+
271
/**
 * Builds the ordered commands that install `dependencyNames` with `manager`.
 *
 * For apt-get an `apt-get update` command comes first; every manager then gets
 * one install command. All commands share the privilege prefix, display prefix
 * and install timeout.
 */
function buildLinuxInstallCommands(
  manager: { id: PackageManagerId; label: string },
  dependencyNames: DependencyName[],
  privilegeContext: UnixPrivilegeContext,
): InstallCommandSpec[] {
  const sharedOptions = {
    prefix: privilegeContext.prefix,
    displayPrefix: privilegeContext.displayPrefix,
    timeoutMs: INSTALL_COMMAND_TIMEOUT_MS,
  };

  const installArgs = getLinuxInstallArgs(manager.id, getPackageNames(manager.id, dependencyNames));
  const installCommand = createCommandSpec(
    `Install missing document parser dependencies via ${manager.label}`,
    manager.id,
    installArgs,
    sharedOptions,
  );

  if (manager.id !== "apt-get") {
    return [installCommand];
  }

  const refreshCommand = createCommandSpec(
    "Refresh apt package metadata",
    "apt-get",
    ["update"],
    sharedOptions,
  );
  return [refreshCommand, installCommand];
}
304
+
305
/**
 * Builds one `winget install` command per dependency, in input order.
 * Each command selects the package by exact id and accepts the package and
 * source agreements up front.
 */
function buildWingetCommands(dependencyNames: DependencyName[]): InstallCommandSpec[] {
  const commands: InstallCommandSpec[] = [];

  for (const dependencyName of dependencyNames) {
    const { label } = DEPENDENCY_METADATA[dependencyName];
    const packageId = PACKAGE_NAMES.winget[dependencyName];
    const wingetArgs = [
      "install",
      "-e",
      "--id",
      packageId,
      "--accept-package-agreements",
      "--accept-source-agreements",
    ];

    commands.push(
      createCommandSpec(`Install ${label} via winget`, "winget", wingetArgs, {
        timeoutMs: INSTALL_COMMAND_TIMEOUT_MS,
      }),
    );
  }

  return commands;
}
322
+
323
/**
 * Lists the dependencies that matter for a given input.
 *
 * - No inspection: every known dependency.
 * - Office and spreadsheet inputs: LibreOffice.
 * - Image inputs: ImageMagick, plus Ghostscript when the extension is listed
 *   in `GHOSTSCRIPT_REQUIRED_EXTENSIONS`.
 * - Any other category: nothing.
 */
export function getRelevantDependencyNames(inspection?: InputInspection): Set<DependencyName> {
  if (!inspection) {
    return new Set(DEPENDENCY_NAMES);
  }

  const relevant = new Set<DependencyName>();

  switch (inspection.category) {
    case "office":
    case "spreadsheet":
      relevant.add("libreoffice");
      break;
    case "image":
      relevant.add("imagemagick");
      if (GHOSTSCRIPT_REQUIRED_EXTENSIONS.has(inspection.extension)) {
        relevant.add("ghostscript");
      }
      break;
    default:
      break;
  }

  return relevant;
}
344
+
345
/**
 * Probes every known dependency and describes each one.
 *
 * All lookups are started together. The result has one entry per dependency in
 * `DEPENDENCY_NAMES` order, stating whether a command was found, which one, and
 * whether the dependency is relevant to `inspection`.
 */
export async function diagnoseDependencies(
  inspection?: InputInspection,
): Promise<DependencyDiagnosis[]> {
  const relevantNames = getRelevantDependencyNames(inspection);

  const diagnoseOne = async (name: DependencyName): Promise<DependencyDiagnosis> => {
    const metadata = DEPENDENCY_METADATA[name];
    const detectedCommand = await metadata.findCommand();

    return {
      name,
      label: metadata.label,
      installed: Boolean(detectedCommand),
      detectedCommand,
      relevant: relevantNames.has(name),
      summary: metadata.summary,
      missingMessage: metadata.getMissingMessage(inspection),
    };
  };

  return Promise.all(DEPENDENCY_NAMES.map(diagnoseOne));
}
363
+
364
/**
 * Returns the "missing dependency" message for the first dependency, in
 * `DEPENDENCY_NAMES` order, that is relevant to `inspection` but cannot be found.
 *
 * Only relevant dependencies are probed, so inputs that need no host tool
 * return without any lookup being started.
 *
 * @param inspection The inspected input whose requirements are checked.
 * @returns The message for the first missing relevant dependency, or
 *   `undefined` when nothing relevant is missing.
 */
export async function getMissingHostDependencyMessage(
  inspection: InputInspection,
): Promise<string | undefined> {
  const relevantNames = getRelevantDependencyNames(inspection);
  if (relevantNames.size === 0) {
    return undefined;
  }

  // Filter the canonical list, not the set, so reporting order stays stable.
  const namesToProbe = DEPENDENCY_NAMES.filter((name) => relevantNames.has(name));
  const detectedCommands = await Promise.all(
    namesToProbe.map((name) => DEPENDENCY_METADATA[name].findCommand()),
  );

  const firstMissing = namesToProbe.find((_, index) => !detectedCommands[index]);
  return firstMissing === undefined
    ? undefined
    : DEPENDENCY_METADATA[firstMissing].getMissingMessage(inspection);
}
370
+
371
/** Reports whether `message` contains any of the known dependency-setup phrases. */
export function isDependencySetupMessage(message: string): boolean {
  for (const pattern of DEPENDENCY_SETUP_PATTERNS) {
    if (message.includes(pattern)) {
      return true;
    }
  }

  return false;
}
374
+
375
/**
 * Appends a pointer to the doctor command to `message`, unless the message
 * already mentions that command, in which case it is returned unchanged.
 */
export function appendDoctorHint(message: string): string {
  if (message.includes(DOCTOR_COMMAND)) {
    return message;
  }

  return `${message} Run ${DOCTOR_COMMAND} for guided setup.`;
}
380
+
381
/**
 * Returns the friendly name of the current platform, or the raw
 * `process.platform` value when no friendly name is defined for it.
 */
export function getPlatformLabel(): string {
  const platform = process.platform;
  const knownLabels: Partial<Record<string, string>> = PLATFORM_LABELS;

  return knownLabels[platform] ?? platform;
}
384
+
385
/** Looks up the user-facing label for an input category. */
export function getInputCategoryLabel(category: InputCategory): string {
  const label = INPUT_CATEGORY_LABELS[category];
  return label;
}
388
+
389
/**
 * Builds the install strategies that could provide the missing dependencies on
 * the current platform.
 *
 * Entries already marked as installed are ignored and duplicate names are
 * collapsed. The result is empty when nothing is missing or when the platform
 * is not macOS, Linux or Windows.
 *
 * @param missingDependencies Diagnoses to act on; installed entries are skipped.
 * @returns Zero or more strategies, each stating whether it can be run
 *   automatically and, if not, why.
 */
export async function buildInstallStrategies(
  missingDependencies: DependencyDiagnosis[],
): Promise<InstallStrategy[]> {
  const missingNames = [
    ...new Set(
      missingDependencies
        .filter((dependency) => !dependency.installed)
        .map((dependency) => dependency.name),
    ),
  ];
  if (missingNames.length === 0) {
    return [];
  }

  switch (process.platform) {
    case "darwin":
      return buildMacInstallStrategies(missingNames);
    case "linux":
      return buildLinuxInstallStrategies(missingNames);
    case "win32":
      return buildWindowsInstallStrategies(missingNames);
    default:
      return [];
  }
}

/** Builds the single Homebrew strategy for macOS; it is not auto-runnable when `brew` is not found. */
async function buildMacInstallStrategies(
  missingNames: DependencyName[],
): Promise<InstallStrategy[]> {
  const brewAvailable = await runBinaryLookup("brew");

  return [
    {
      id: "brew",
      label: "Homebrew",
      autoRunnable: brewAvailable,
      autoRunBlockedReason: brewAvailable ? undefined : "Homebrew was not detected on PATH.",
      commands: [
        createCommandSpec(
          "Install missing document parser dependencies via Homebrew",
          "brew",
          ["install", ...getPackageNames("brew", missingNames)],
          { timeoutMs: INSTALL_COMMAND_TIMEOUT_MS },
        ),
      ],
    },
  ];
}

/** Builds one strategy per detected Linux package manager, in `LINUX_MANAGERS` order. */
async function buildLinuxInstallStrategies(
  missingNames: DependencyName[],
): Promise<InstallStrategy[]> {
  // The privilege check and the per-manager lookups are independent, so they
  // run concurrently; results are matched back to managers by index.
  const [privilegeContext, managerAvailability] = await Promise.all([
    getUnixPrivilegeContext(),
    Promise.all(LINUX_MANAGERS.map((manager) => runBinaryLookup(manager.id))),
  ]);

  return LINUX_MANAGERS.filter((_, index) => managerAvailability[index]).map((manager) => ({
    id: manager.id,
    label: manager.label,
    autoRunnable: privilegeContext.autoRunnable,
    autoRunBlockedReason: privilegeContext.autoRunnable
      ? undefined
      : privilegeContext.blockedReason,
    commands: buildLinuxInstallCommands(manager, missingNames, privilegeContext),
  }));
}

/**
 * Builds strategies for winget and Chocolatey, whichever are detected. When
 * neither is found, a non-runnable winget strategy is returned so its commands
 * can still be shown.
 */
async function buildWindowsInstallStrategies(
  missingNames: DependencyName[],
): Promise<InstallStrategy[]> {
  const [wingetAvailable, chocoAvailable] = await Promise.all([
    runBinaryLookup("winget"),
    runBinaryLookup("choco"),
  ]);
  const strategies: InstallStrategy[] = [];

  if (wingetAvailable) {
    strategies.push({
      id: "winget",
      label: "winget",
      autoRunnable: true,
      commands: buildWingetCommands(missingNames),
    });
  }

  if (chocoAvailable) {
    strategies.push({
      id: "choco",
      label: "Chocolatey",
      autoRunnable: true,
      commands: [
        createCommandSpec(
          "Install missing document parser dependencies via Chocolatey",
          "choco",
          ["install", "-y", ...getPackageNames("choco", missingNames)],
          { timeoutMs: INSTALL_COMMAND_TIMEOUT_MS },
        ),
      ],
    });
  }

  if (strategies.length > 0) {
    return strategies;
  }

  return [
    {
      id: "winget",
      label: "winget",
      autoRunnable: false,
      autoRunBlockedReason: "Neither winget nor Chocolatey was detected on PATH.",
      commands: buildWingetCommands(missingNames),
    },
  ];
}
494
+
495
/**
 * Returns a sorted copy of `strategies`, ordered by the current platform's
 * package-manager preference. Strategies whose id is not in the preference
 * list sort after all listed ones. The input array is left untouched.
 */
export function getPreferredStrategies(strategies: InstallStrategy[]): InstallStrategy[] {
  let preferredOrder: string[];
  switch (process.platform) {
    case "darwin":
      preferredOrder = ["brew"];
      break;
    case "linux":
      preferredOrder = ["apt-get", "dnf", "yum", "pacman", "zypper", "apk"];
      break;
    default:
      preferredOrder = ["winget", "choco"];
      break;
  }

  const rankOf = (strategy: InstallStrategy): number => {
    const position = preferredOrder.indexOf(strategy.id);
    return position < 0 ? Number.MAX_SAFE_INTEGER : position;
  };

  const ranked = strategies.slice();
  ranked.sort((left, right) => rankOf(left) - rankOf(right));
  return ranked;
}
509
+
510
/**
 * Condenses the output of an install command into a short summary.
 *
 * Both streams are trimmed, empty ones are dropped, and the rest are joined
 * with a blank line (stdout first). The combined text is passed to
 * `truncateTail` with a limit of 20 lines and 2 KiB, and the trimmed result is
 * returned. Returns `undefined` when both streams are empty after trimming.
 */
export function summarizeInstallOutput(stdout: string, stderr: string): string | undefined {
  const sections: string[] = [];
  for (const stream of [stdout, stderr]) {
    const trimmed = stream.trim();
    if (trimmed) {
      sections.push(trimmed);
    }
  }

  if (sections.length === 0) {
    return undefined;
  }

  const { content } = truncateTail(sections.join("\n\n"), {
    maxLines: 20,
    maxBytes: 2 * 1024,
  });

  return content.trim();
}