cerfaparse 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js ADDED
@@ -0,0 +1,620 @@
1
+ #!/usr/bin/env node
2
+
3
+ // src/cli.ts
4
import { program } from "commander";
import { realpathSync } from "fs";
import { readFile as readFile2, writeFile } from "fs/promises";
import { basename, dirname, join as join2 } from "path";
import { fileURLToPath } from "url";
7
+
8
+ // src/poppler.ts
9
+ import { execa } from "execa";
10
+ import { access, mkdtemp, readFile, rm } from "fs/promises";
11
+ import { tmpdir } from "os";
12
+ import { join } from "path";
13
/**
 * Verifies that the Poppler command-line tools used by this module can be run.
 *
 * Each tool is invoked with `-v`, one after another; a tool whose invocation
 * fails for any reason is reported as missing.
 *
 * @returns {Promise<void>}
 * @throws {Error} Listing every tool that could not be run, with install hints.
 */
async function checkPoppler() {
  const canRun = async (binary) => {
    try {
      await execa(binary, ["-v"]);
      return true;
    } catch {
      return false;
    }
  };

  const unavailable = [];
  for (const binary of ["pdftocairo", "pdftotext", "pdfinfo"]) {
    if (!(await canRun(binary))) {
      unavailable.push(binary);
    }
  }

  if (unavailable.length === 0) {
    return;
  }
  throw new Error(
    `Poppler tools not found: ${unavailable.join(", ")}. Install with:
macOS: brew install poppler
Linux: apt install poppler-utils`
  );
}
31
/**
 * Reads the page count of a PDF from the `Pages:` line printed by `pdfinfo`.
 *
 * @param {string} pdfPath - Path of the PDF to inspect.
 * @returns {Promise<number>} The page count parsed as a base-10 integer.
 * @throws {Error} If the file is not accessible or no `Pages:` line is found.
 */
async function getPageCount(pdfPath) {
  await validateFileExists(pdfPath);
  const info = await execa("pdfinfo", [pdfPath]);
  const pagesLine = info.stdout.match(/^Pages:\s+(\d+)/m);
  if (pagesLine === null) {
    throw new Error("Could not determine page count from pdfinfo");
  }
  const [, digits] = pagesLine;
  return parseInt(digits, 10);
}
38
/**
 * Renders a single PDF page to SVG with `pdftocairo` and returns the SVG text.
 *
 * The SVG is written into a fresh temporary directory which is removed again
 * whether or not rendering succeeds.
 *
 * @param {string} pdfPath - Path of the source PDF.
 * @param {number} page - Page number passed to `pdftocairo` as both `-f` and `-l`.
 * @returns {Promise<string>} The SVG file content decoded as UTF-8.
 */
async function extractSvg(pdfPath, page) {
  const workDir = await mkdtemp(join(tmpdir(), "cerfaparse-"));
  const svgFile = join(workDir, `page-${page}.svg`);
  const pageArg = String(page);
  try {
    const args = ["-svg", "-f", pageArg, "-l", pageArg, pdfPath, svgFile];
    await execa("pdftocairo", args);
    return await readFile(svgFile, "utf-8");
  } finally {
    // Best-effort cleanup: a failure to delete must not mask the real result.
    await rm(workDir, { recursive: true, force: true }).catch(() => {});
  }
}
59
/**
 * Runs `pdftotext -bbox-layout` on a PDF and returns what it prints to stdout.
 *
 * @param {string} pdfPath - Path of the PDF to analyse.
 * @returns {Promise<string>} The captured standard output.
 */
async function extractBbox(pdfPath) {
  const args = ["-bbox-layout", pdfPath, "-"];
  const result = await execa("pdftotext", args);
  return result.stdout;
}
67
/**
 * Ensures that `path` can be accessed.
 *
 * @param {string} path - File system path to probe.
 * @returns {Promise<void>}
 * @throws {Error} `File not found: <path>` when the access check fails.
 */
async function validateFileExists(path) {
  const reachable = await access(path).then(
    () => true,
    () => false
  );
  if (!reachable) {
    throw new Error(`File not found: ${path}`);
  }
}
74
+
75
+ // src/extract-boxes.ts
76
+ import * as cheerio from "cheerio";
77
// Colour channels are compared as fractions in the range 0..1.
// A colour counts as white when every channel is at least this value.
const WHITE_THRESHOLD = 0.95;
// A stroke counts as dark when every channel is below this value.
const DARK_STROKE_THRESHOLD = 0.2;
// Candidate rectangles wider or taller than these limits are ignored.
const MAX_BOX_WIDTH = 50;
const MAX_BOX_HEIGHT = 50;
// Candidate rectangles narrower or shorter than this are ignored.
const MIN_BOX_SIZE = 3;
82
/**
 * Finds small white-filled rectangular paths in an SVG page and classifies
 * them as input cells or checkboxes.
 *
 * The page height is taken from the fourth `viewBox` field, falling back to
 * the `height` attribute. The effective transform of the first accepted box
 * becomes the page transform; later boxes whose effective transform differs
 * are skipped with a console warning.
 *
 * @param {string} svgXml - SVG document text.
 * @returns {{boxes: Array<object>, transform: (object|null), pageHeight: number}}
 *   Accepted boxes (`x`, `y`, `width`, `height`, `type`), the shared transform
 *   (null when no box was accepted) and the page height.
 * @throws {Error} If no positive finite page height can be read.
 */
function extractBoxes(svgXml) {
  const $ = cheerio.load(svgXml, { xml: true });
  const isUsableHeight = (value) => Number.isFinite(value) && value > 0;

  const viewBoxFields = ($("svg").attr("viewBox") ?? "").split(/\s+/);
  let pageHeight = viewBoxFields.length >= 4 ? parseFloat(viewBoxFields[3]) : 0;
  if (!isUsableHeight(pageHeight)) {
    pageHeight = parseFloat($("svg").attr("height") ?? "");
  }
  if (!isUsableHeight(pageHeight)) {
    throw new Error("Could not extract page height from SVG viewBox or height attribute");
  }

  const boxes = [];
  let transform = null;
  for (const el of $("path").toArray()) {
    const node = $(el);
    const d = node.attr("d");
    if (!d || !isWhiteColor(node.attr("fill") ?? "")) continue;

    const rect = parseRectPath(d);
    if (!rect) continue;
    const tooLarge = rect.width > MAX_BOX_WIDTH || rect.height > MAX_BOX_HEIGHT;
    const tooSmall = rect.width < MIN_BOX_SIZE || rect.height < MIN_BOX_SIZE;
    if (tooLarge || tooSmall) continue;

    const type = classifyBox(
      node.attr("stroke") ?? "",
      parseFloat(node.attr("stroke-width") ?? "0")
    );
    if (!type) continue;

    // Paths without a parseable transform of their own are not considered.
    const ownTransform = parseTransformFromAttr(node.attr("transform") ?? "");
    if (!ownTransform) continue;
    const inherited = collectAncestorTransform($, el);
    const effective = inherited ? composeTransforms(inherited, ownTransform) : ownTransform;

    if (transform && !matricesEqual(transform, effective)) {
      console.warn(
        "Warning: found input box with different transform matrix \u2014 skipping. This may indicate rotated sections."
      );
      continue;
    }
    if (!transform) {
      transform = effective;
    }
    boxes.push({ ...rect, type });
  }
  return { boxes, transform, pageHeight };
}
128
/**
 * Combines the transforms of the `<g>` ancestors of an element.
 *
 * The walk goes from the element's parent upwards and stops at `<svg>`.
 * If a `<defs>` ancestor is met first, the group transforms are ignored and
 * the first parseable transform of a `<use>` that is a direct child of
 * `<svg>` is returned instead.
 *
 * @param {Function} $ - Cheerio instance for the document.
 * @param {object} el - The element whose ancestors are inspected.
 * @returns {object|null} The composed matrix (outermost ancestor applied
 *   first), or null when no transform applies.
 */
function collectAncestorTransform($, el) {
  const innerToOuter = [];
  for (let node = el.parent; node; node = node.parent) {
    const tag = node.tagName ?? node.name ?? "";
    if (tag === "svg") break;
    if (tag === "defs") {
      for (const child of $("svg").children().toArray()) {
        if ((child.tagName ?? "") !== "use") continue;
        const useMatrix = parseTransformAttr($(child).attr("transform") ?? "");
        if (useMatrix) return useMatrix;
      }
      return null;
    }
    if (tag === "g") {
      const groupMatrix = parseTransformAttr($(node).attr("transform") ?? "");
      if (groupMatrix) innerToOuter.push(groupMatrix);
    }
  }
  if (innerToOuter.length === 0) return null;
  // Fold from the outermost group inwards: ((outer * ...) * inner).
  return innerToOuter.reduceRight((outer, inner) => composeTransforms(outer, inner));
}
160
/**
 * Parses an SVG `transform` attribute containing `matrix(a, b, c, d, e, f)`
 * or `translate(tx[, ty])` into a matrix object.
 *
 * A `matrix(...)` match takes precedence; `translate(...)` is only tried when
 * no matrix is present. A missing `ty` defaults to 0.
 *
 * @param {string} attr - Raw attribute text.
 * @returns {{a:number,b:number,c:number,d:number,e:number,f:number}|null}
 *   The matrix, or null when nothing matches or a number is not finite.
 */
function parseTransformAttr(attr) {
  const allFinite = (values) => values.every((v) => Number.isFinite(v));

  const asMatrix = attr.match(MATRIX_RE);
  if (asMatrix) {
    const [a, b, c, d, e, f] = asMatrix.slice(1, 7).map((text) => parseFloat(text));
    return allFinite([a, b, c, d, e, f]) ? { a, b, c, d, e, f } : null;
  }

  const asTranslate = attr.match(
    /translate\(\s*([-\d.]+)(?:\s*,\s*([-\d.]+))?\s*\)/
  );
  if (!asTranslate) return null;
  const tx = parseFloat(asTranslate[1]);
  const ty = parseFloat(asTranslate[2] ?? "0");
  return allFinite([tx, ty]) ? { a: 1, b: 0, c: 0, d: 1, e: tx, f: ty } : null;
}
185
/**
 * Multiplies two 2D affine matrices given as `{a, b, c, d, e, f}`.
 *
 * The result applies `m2` first and `m1` second.
 *
 * @param {object} m1 - Outer (left) matrix.
 * @param {object} m2 - Inner (right) matrix.
 * @returns {{a:number,b:number,c:number,d:number,e:number,f:number}}
 */
function composeTransforms(m1, m2) {
  const { a: a1, b: b1, c: c1, d: d1, e: e1, f: f1 } = m1;
  const { a: a2, b: b2, c: c2, d: d2, e: e2, f: f2 } = m2;
  const a = a1 * a2 + c1 * b2;
  const b = b1 * a2 + d1 * b2;
  const c = a1 * c2 + c1 * d2;
  const d = b1 * c2 + d1 * d2;
  const e = a1 * e2 + c1 * f2 + e1;
  const f = b1 * e2 + d1 * f2 + f1;
  return { a, b, c, d, e, f };
}
195
// Matches `matrix(a, b, c, d, e, f)` with comma-separated numbers made of
// digits, `-` and `.`; the six values are captured in groups 1..6.
const MATRIX_RE = /matrix\(\s*([-\d.]+)\s*,\s*([-\d.]+)\s*,\s*([-\d.]+)\s*,\s*([-\d.]+)\s*,\s*([-\d.]+)\s*,\s*([-\d.]+)\s*\)/;
196
/**
 * Parses a path's own `transform` attribute by delegating to
 * `parseTransformAttr`.
 *
 * @param {string} attr - Raw attribute text.
 * @returns {object|null} Whatever `parseTransformAttr` returns for `attr`.
 */
function parseTransformFromAttr(attr) {
  const parsed = parseTransformAttr(attr);
  return parsed;
}
199
/**
 * Compares two `{a, b, c, d, e, f}` matrices component by component.
 *
 * @param {object} a - First matrix.
 * @param {object} b - Second matrix.
 * @returns {boolean} True when every component differs by less than 1e-3.
 */
function matricesEqual(a, b) {
  const tolerance = 1e-3;
  const components = ["a", "b", "c", "d", "e", "f"];
  return components.every((key) => Math.abs(a[key] - b[key]) < tolerance);
}
203
/**
 * Parses SVG path data that draws an axis-aligned rectangle.
 *
 * Only absolute `M`/`L` commands and the close command (`Z` or `z`) are
 * accepted. Coordinates may be separated by whitespace and/or commas, and
 * additional coordinate pairs after `M`/`L` are treated as implicit line-to
 * points. Every point must sit on a corner of the bounding box and all four
 * corners must be visited, so triangles, diamonds and L-shapes are rejected
 * instead of being reported as their bounding box.
 *
 * @param {string} d - Content of a path's `d` attribute.
 * @returns {{x:number,y:number,width:number,height:number}|null} The
 *   rectangle in path coordinates, or null when the path is not a rectangle
 *   at least 1 unit wide and 1 unit tall.
 */
function parseRectPath(d) {
  const COMMAND = /([MLHVCSQTAZ])([^MLHVCSQTAZ]*)/gi;
  const NUMBER = /[-+]?(?:\d+\.?\d*|\.\d+)(?:e[-+]?\d+)?/gi;
  // Slack allowed between a point and the box edge it is expected to lie on.
  const EDGE_TOLERANCE = 0.01;

  const source = d.trim();
  // Path data must begin with a command letter.
  if (!/^[MLHVCSQTAZ]/i.test(source)) return null;

  const points = [];
  for (const [, command, rawArgs] of source.matchAll(COMMAND)) {
    if (command === "Z" || command === "z") {
      // The close command takes no arguments; both spellings mean the same.
      if (rawArgs.trim() !== "") return null;
      continue;
    }
    // Relative (lowercase) and curve/arc/H/V commands are not supported.
    if (command !== "M" && command !== "L") return null;

    const numbers = rawArgs.match(NUMBER) ?? [];
    const residue = rawArgs.replace(NUMBER, "").replace(/[\s,]/g, "");
    if (residue !== "" || numbers.length === 0 || numbers.length % 2 !== 0) {
      return null;
    }
    for (let i = 0; i < numbers.length; i += 2) {
      const x = parseFloat(numbers[i]);
      const y = parseFloat(numbers[i + 1]);
      if (!Number.isFinite(x) || !Number.isFinite(y)) return null;
      points.push([x, y]);
    }
  }
  if (points.length < 4) return null;

  const xs = points.map(([x]) => x);
  const ys = points.map(([, y]) => y);
  const minX = Math.min(...xs);
  const maxX = Math.max(...xs);
  const minY = Math.min(...ys);
  const maxY = Math.max(...ys);
  const width = maxX - minX;
  const height = maxY - minY;
  if (width < 1 || height < 1) return null;

  // Bit i of `visited` is set once corner i of the bounding box was seen.
  let visited = 0;
  for (const [x, y] of points) {
    const onLeft = Math.abs(x - minX) <= EDGE_TOLERANCE;
    const onRight = Math.abs(x - maxX) <= EDGE_TOLERANCE;
    const onTop = Math.abs(y - minY) <= EDGE_TOLERANCE;
    const onBottom = Math.abs(y - maxY) <= EDGE_TOLERANCE;
    if (!(onLeft || onRight) || !(onTop || onBottom)) return null;
    visited |= 1 << ((onRight ? 1 : 0) | (onBottom ? 2 : 0));
  }
  if (visited !== 0b1111) return null;

  return { x: minX, y: minY, width, height };
}
236
/**
 * Parses a colour written as `rgb(R%, G%, B%)`.
 *
 * @param {string} color - Colour text.
 * @returns {number[]|null} The three channels divided by 100, or null when
 *   the text does not contain that form.
 */
function parseRgbPercent(color) {
  const found = color.match(
    /rgb\(\s*([\d.]+)%\s*,\s*([\d.]+)%\s*,\s*([\d.]+)%\s*\)/
  );
  if (found === null) return null;
  const [, red, green, blue] = found;
  return [red, green, blue].map((channel) => parseFloat(channel) / 100);
}
247
/**
 * Tells whether a percent-RGB colour is (nearly) white.
 *
 * @param {string} color - Colour text understood by `parseRgbPercent`.
 * @returns {boolean} True when all three channels are at least
 *   `WHITE_THRESHOLD`; false otherwise, including for unparseable text.
 */
function isWhiteColor(color) {
  const channels = parseRgbPercent(color);
  if (channels === null) return false;
  return channels.every((channel) => channel >= WHITE_THRESHOLD);
}
252
/**
 * Tells whether a percent-RGB colour is dark.
 *
 * @param {string} color - Colour text understood by `parseRgbPercent`.
 * @returns {boolean} True when all three channels are below
 *   `DARK_STROKE_THRESHOLD`; false otherwise, including for unparseable text.
 */
function isDarkColor(color) {
  const channels = parseRgbPercent(color);
  if (channels === null) return false;
  return channels.every((channel) => channel < DARK_STROKE_THRESHOLD);
}
257
/**
 * Classifies a box from its stroke colour and stroke width.
 *
 * @param {string} stroke - Stroke colour text.
 * @param {number} strokeWidth - Stroke width.
 * @returns {"cell"|"checkbox"|null} `"cell"` for a white stroke at least 0.9
 *   wide, `"checkbox"` for a dark stroke, otherwise null.
 */
function classifyBox(stroke, strokeWidth) {
  const hasThickWhiteStroke = isWhiteColor(stroke) && strokeWidth >= 0.9;
  if (hasThickWhiteStroke) return "cell";
  return isDarkColor(stroke) ? "checkbox" : null;
}
266
+
267
+ // src/extract-labels.ts
268
+ import * as cheerio2 from "cheerio";
269
// Words on one line are merged into a label while the horizontal gap between
// them does not exceed this value.
const WORD_JOIN_GAP = 10;
// Words whose top edges differ by at most this value share a line.
const LINE_Y_TOLERANCE = 2;
271
/**
 * Builds text labels for every `<page>` element of a bbox document.
 *
 * Each `<word>` contributes its trimmed text and its `xmin`/`ymin`/`xmax`/
 * `ymax` attributes (a missing attribute is read as 0). Pages are numbered
 * from 1 in document order.
 *
 * @param {string} bboxHtml - Markup containing `<page>` and `<word>` elements.
 * @returns {Map<number, Array<object>>} Labels per page number, as produced
 *   by `assembleLabels`.
 */
function extractLabels(bboxHtml) {
  const $ = cheerio2.load(bboxHtml, { xml: false });
  const readCoord = (node, name) => parseFloat(node.attr(name) ?? "0");

  const pageLabels = new Map();
  $("page").toArray().forEach((pageEl, pageIdx) => {
    const pageNum = pageIdx + 1;
    const words = $(pageEl)
      .find("word")
      .toArray()
      .map((wordEl) => {
        const $w = $(wordEl);
        return {
          text: $w.text().trim(),
          xMin: readCoord($w, "xmin"),
          yMin: readCoord($w, "ymin"),
          xMax: readCoord($w, "xmax"),
          yMax: readCoord($w, "ymax")
        };
      });
    pageLabels.set(pageNum, assembleLabels(words, pageNum));
  });
  return pageLabels;
}
292
/**
 * Groups words into lines and then into labels.
 *
 * Words are ordered by `yMin` (ties by `xMin`) with an exact comparator; a
 * tolerance inside the comparator would make it non-transitive and leave the
 * sort order implementation-defined. A word joins the current line while its
 * `yMin` is within `LINE_Y_TOLERANCE` of the line's first (top-most) word, so
 * a line cannot drift across many slightly offset words. Within a line,
 * words are ordered by `xMin` and merged while the gap to the previous word
 * is at most `WORD_JOIN_GAP`.
 *
 * @param {Array<{text:string,xMin:number,yMin:number,xMax:number,yMax:number}>} words
 * @param {number} page - Page number stored on every label.
 * @returns {Array<object>} Labels built by `wordsToLabel`, top to bottom and
 *   left to right.
 */
function assembleLabels(words, page) {
  if (words.length === 0) return [];

  const topDown = [...words].sort(
    (a, b) => a.yMin - b.yMin || a.xMin - b.xMin
  );

  const lines = [];
  for (const word of topDown) {
    const line = lines[lines.length - 1];
    if (line && Math.abs(word.yMin - line[0].yMin) <= LINE_Y_TOLERANCE) {
      line.push(word);
    } else {
      lines.push([word]);
    }
  }

  const labels = [];
  for (const line of lines) {
    const leftToRight = [...line].sort((a, b) => a.xMin - b.xMin);
    let current = [leftToRight[0]];
    for (const word of leftToRight.slice(1)) {
      const previous = current[current.length - 1];
      if (word.xMin - previous.xMax <= WORD_JOIN_GAP) {
        current.push(word);
      } else {
        labels.push(wordsToLabel(current, page));
        current = [word];
      }
    }
    labels.push(wordsToLabel(current, page));
  }
  return labels;
}
329
/**
 * Merges words into one label.
 *
 * @param {Array<{text:string,xMin:number,yMin:number,xMax:number,yMax:number}>} words
 * @param {number} page - Page number stored on the label.
 * @returns {{text:string,xMin:number,yMin:number,xMax:number,yMax:number,page:number}}
 *   The word texts joined by single spaces and the bounding box of all words.
 */
function wordsToLabel(words, page) {
  const bounds = words.reduce(
    (acc, w) => ({
      xMin: w.xMin < acc.xMin ? w.xMin : acc.xMin,
      yMin: w.yMin < acc.yMin ? w.yMin : acc.yMin,
      xMax: w.xMax > acc.xMax ? w.xMax : acc.xMax,
      yMax: w.yMax > acc.yMax ? w.yMax : acc.yMax
    }),
    { xMin: Infinity, yMin: Infinity, xMax: -Infinity, yMax: -Infinity }
  );
  const text = words.map((w) => w.text).join(" ");
  return { text, ...bounds, page };
}
340
+
341
+ // src/group-rows.ts
342
// Boxes whose `y` differs from a row's `y` by at most this value join the row.
const Y_TOLERANCE = 2;
// A horizontal gap larger than this between neighbouring boxes starts a new field.
const FIELD_GAP_THRESHOLD = 10;
344
/**
 * Groups boxes into fields and returns them in reading order.
 *
 * Boxes are grouped into rows by `groupByY`; each row is ordered by `x` and
 * split into fields by `splitByXGap`. The fields are then ordered top to
 * bottom and, within one visual line, left to right. Visual lines are formed
 * explicitly (a field joins a line while the `y` of its first box is within
 * `Y_TOLERANCE` of the line's first field) because a sort comparator that
 * applies a tolerance is not transitive and would give an
 * implementation-defined order.
 *
 * @param {Array<{x:number,y:number,width:number,height:number,type:string}>} boxes
 * @param {number} page - Page number recorded on each row.
 * @returns {Array<{boxes:Array<object>,row:object,boxCount:number}>}
 */
function groupBoxesIntoFields(boxes, page) {
  const fieldY = (field) => field.boxes[0].y;
  const fieldX = (field) => field.boxes[0].x;

  const fields = groupByY(boxes, page).flatMap((row) => {
    const leftToRight = [...row.boxes].sort((a, b) => a.x - b.x);
    return splitByXGap(leftToRight, row);
  });

  fields.sort((a, b) => fieldY(a) - fieldY(b) || fieldX(a) - fieldX(b));

  const visualLines = [];
  for (const field of fields) {
    const line = visualLines[visualLines.length - 1];
    if (line && Math.abs(fieldY(field) - fieldY(line[0])) <= Y_TOLERANCE) {
      line.push(field);
    } else {
      visualLines.push([field]);
    }
  }
  return visualLines.flatMap((line) =>
    line.sort((a, b) => fieldX(a) - fieldX(b))
  );
}
359
/**
 * Groups boxes of the same type into rows by vertical position.
 *
 * Boxes are visited in ascending `y`. A box joins the first existing row of
 * its type whose `y` is within `Y_TOLERANCE`; that row's `y` is then set to
 * the mean `y` of its boxes. Otherwise the box starts a new row.
 *
 * @param {Array<{y:number,type:string}>} boxes
 * @param {number} page - Page number stored on each row.
 * @returns {Array<{boxes:Array<object>,y:number,type:string,page:number}>}
 */
function groupByY(boxes, page) {
  const rows = [];
  const topDown = [...boxes].sort((a, b) => a.y - b.y);
  for (const box of topDown) {
    const row = rows.find(
      (candidate) =>
        Math.abs(candidate.y - box.y) <= Y_TOLERANCE && candidate.type === box.type
    );
    if (!row) {
      rows.push({ boxes: [box], y: box.y, type: box.type, page });
      continue;
    }
    row.boxes.push(box);
    let total = 0;
    for (const member of row.boxes) {
      total += member.y;
    }
    row.y = total / row.boxes.length;
  }
  return rows;
}
376
/**
 * Splits a row of boxes, already ordered by `x`, into fields.
 *
 * A new field starts whenever the gap between a box's left edge and the
 * previous box's right edge exceeds `FIELD_GAP_THRESHOLD`.
 *
 * @param {Array<{x:number,width:number}>} sortedBoxes - Boxes ordered by `x`.
 * @param {object} row - Row the boxes belong to; copied into each field with
 *   `boxes` replaced by the field's own boxes.
 * @returns {Array<{boxes:Array<object>,row:object,boxCount:number}>}
 */
function splitByXGap(sortedBoxes, row) {
  const runs = [];
  let previous = null;
  for (const box of sortedBoxes) {
    const startsNewRun =
      previous === null ||
      box.x - (previous.x + previous.width) > FIELD_GAP_THRESHOLD;
    if (startsNewRun) {
      runs.push([box]);
    } else {
      runs[runs.length - 1].push(box);
    }
    previous = box;
  }
  return runs.map((boxes) => ({
    boxes,
    row: { ...row, boxes },
    boxCount: boxes.length
  }));
}
395
+
396
+ // src/transform.ts
397
/**
 * Applies an affine matrix `{a, b, c, d, e, f}` to a point.
 *
 * @param {number} svgX - X coordinate before the transform.
 * @param {number} svgY - Y coordinate before the transform.
 * @param {object} matrix - Matrix to apply.
 * @returns {{x:number,y:number}} The transformed point.
 */
function svgPointToViewport(svgX, svgY, matrix) {
  const { a, b, c, d, e, f } = matrix;
  const x = a * svgX + c * svgY + e;
  const y = b * svgX + d * svgY + f;
  return { x, y };
}
403
/**
 * Converts a box in SVG path coordinates into a rectangle whose `y` is
 * measured from the bottom of the page.
 *
 * Two opposite corners are transformed with `matrix`; their bounding box
 * gives the position and size, and `y` becomes `pageHeight` minus the lower
 * edge of that bounding box.
 *
 * @param {{x:number,y:number,width:number,height:number}} box
 * @param {object} matrix - Matrix passed to `svgPointToViewport`.
 * @param {number} pageHeight - Page height used to flip the vertical axis.
 * @returns {{x:number,y:number,width:number,height:number}}
 */
function svgBoxToPdfRect(box, matrix, pageHeight) {
  const near = svgPointToViewport(box.x, box.y, matrix);
  const far = svgPointToViewport(box.x + box.width, box.y + box.height, matrix);
  const xs = [near.x, far.x];
  const ys = [near.y, far.y];
  const left = Math.min(...xs);
  const lower = Math.max(...ys);
  return {
    x: left,
    y: pageHeight - lower,
    width: Math.max(...xs) - left,
    height: lower - Math.min(...ys)
  };
}
420
+
421
+ // src/map-labels.ts
422
/**
 * Maps a box type to a field type.
 *
 * @param {string} boxType - Box type.
 * @returns {"input"|"checkbox"} `"input"` for `"cell"`, otherwise `"checkbox"`.
 */
function toFieldType(boxType) {
  if (boxType === "cell") {
    return "input";
  }
  return "checkbox";
}
425
// Largest allowed vertical distance between a label and the field it names.
const LABEL_Y_MAX_DISTANCE = 25;
426
/**
 * Flips a label's vertical extent so it is measured from the page bottom.
 *
 * @param {{xMin:number,xMax:number,yMin:number,yMax:number}} label
 * @param {number} pageHeight - Page height used for the flip.
 * @returns {{xMin:number,xMax:number,yBottom:number,yTop:number}} The
 *   horizontal extent unchanged; `yBottom` is `pageHeight - yMax` and `yTop`
 *   is `pageHeight - yMin`.
 */
function labelToPdfCoords(label, pageHeight) {
  const { xMin, xMax, yMin, yMax } = label;
  const yBottom = pageHeight - yMax;
  const yTop = pageHeight - yMin;
  return { xMin, xMax, yBottom, yTop };
}
436
/**
 * Turns field groups into field definitions, naming each one after the best
 * matching label.
 *
 * Keys are made unique within this call. Fields built from `"cell"` rows also
 * get `maxLength` set to the number of boxes in the group.
 *
 * @param {Array<object>} fieldGroups - Groups with `boxes`, `row` and `boxCount`.
 * @param {Array<object>} labels - Candidate labels.
 * @param {object} transform - Matrix used to place the boxes.
 * @param {number} page - Page number stored on each field.
 * @param {number} pageHeight - Page height used for the vertical flip.
 * @returns {Array<{key:string,type:string,props:object}>}
 */
function mapLabelsToFields(fieldGroups, labels, transform, page, pageHeight) {
  const takenKeys = new Set();
  return fieldGroups.map((group) => {
    const boxType = group.row.type;
    const pdfRect = computeGroupPdfRect(group, transform, pageHeight);
    const label = findBestLabel(pdfRect, labels, pageHeight)?.text ?? "";
    const key = deduplicateName(generateFieldName(label, boxType, page), takenKeys);
    takenKeys.add(key);

    const props = { label, page, pdfRect };
    if (boxType === "cell") {
      props.maxLength = group.boxCount;
    }
    return { key, type: toFieldType(boxType), props };
  });
}
460
/**
 * Computes the bounding rectangle of all boxes of a group after conversion
 * with `svgBoxToPdfRect`.
 *
 * @param {{boxes:Array<object>}} group - Group whose boxes are combined.
 * @param {object} transform - Matrix passed to `svgBoxToPdfRect`.
 * @param {number} pageHeight - Page height passed to `svgBoxToPdfRect`.
 * @returns {{x:number,y:number,width:number,height:number}}
 */
function computeGroupPdfRect(group, transform, pageHeight) {
  const bounds = { left: Infinity, bottom: Infinity, right: -Infinity, top: -Infinity };
  for (const box of group.boxes) {
    const rect = svgBoxToPdfRect(box, transform, pageHeight);
    const rectRight = rect.x + rect.width;
    const rectTop = rect.y + rect.height;
    if (rect.x < bounds.left) bounds.left = rect.x;
    if (rect.y < bounds.bottom) bounds.bottom = rect.y;
    if (rectRight > bounds.right) bounds.right = rectRight;
    if (rectTop > bounds.top) bounds.top = rectTop;
  }
  return {
    x: bounds.left,
    y: bounds.bottom,
    width: bounds.right - bounds.left,
    height: bounds.top - bounds.bottom
  };
}
473
/**
 * Picks the label that best describes a field.
 *
 * A label qualifies when its bottom edge is between 0 and
 * `LABEL_Y_MAX_DISTANCE` above the field's top edge and it either overlaps
 * the field horizontally or ends less than 50 units to the left of it. The
 * score is the vertical distance, plus 20 for labels that do not overlap;
 * the lowest score wins and the first label wins ties.
 *
 * @param {{x:number,y:number,width:number,height:number}} fieldRect
 * @param {Array<object>} labels - Candidate labels.
 * @param {number} pageHeight - Page height passed to `labelToPdfCoords`.
 * @returns {object|null} The chosen label, or null when none qualifies.
 */
function findBestLabel(fieldRect, labels, pageHeight) {
  const left = fieldRect.x;
  const right = fieldRect.x + fieldRect.width;
  const top = fieldRect.y + fieldRect.height;

  let winner = null;
  let lowestScore = Infinity;
  for (const label of labels) {
    const position = labelToPdfCoords(label, pageHeight);
    const gapAbove = position.yBottom - top;
    if (gapAbove < 0 || gapAbove > LABEL_Y_MAX_DISTANCE) continue;

    const overlaps = position.xMin < right && position.xMax > left;
    if (!overlaps) {
      const endsJustLeft = position.xMax <= left && left - position.xMax < 50;
      if (!endsJustLeft) continue;
    }

    const score = gapAbove + (overlaps ? 0 : 20);
    if (score < lowestScore) {
      lowestScore = score;
      winner = label;
    }
  }
  return winner;
}
494
/**
 * Derives a field name of the form `p<page>_<camelCaseLabel>`.
 *
 * A trailing colon is dropped, diacritics are removed, every character that
 * is not an ASCII letter, digit or whitespace is discarded, and the remaining
 * words are joined in camelCase. When nothing remains the name is
 * `p<page>_field`.
 *
 * @param {string} labelText - Label text; may be empty.
 * @param {string} _type - Unused.
 * @param {number} page - Page number used as prefix.
 * @returns {string}
 */
function generateFieldName(labelText, _type, page) {
  const fallback = `p${page}_field`;
  if (!labelText) return fallback;

  const withoutColon = labelText.replace(/\s*:\s*$/, "").trim();
  const asciiOnly = withoutColon
    .normalize("NFD")
    .replace(/[\u0300-\u036f]/g, "")
    .replace(/[^a-zA-Z0-9\s]/g, "")
    .trim();
  const [head, ...tail] = asciiOnly.split(/\s+/);
  const capitalised = tail.map(
    (word) => word.charAt(0).toUpperCase() + word.slice(1).toLowerCase()
  );
  const camel = head.toLowerCase() + capitalised.join("");

  return camel ? `p${page}_${camel}` : fallback;
}
507
/**
 * Returns a name that is not yet in `used`.
 *
 * @param {string} baseName - Preferred name.
 * @param {Set<string>} used - Names already taken; not modified.
 * @returns {string} `baseName` when free, otherwise `baseName_<n>` with the
 *   smallest free `n` starting at 2.
 */
function deduplicateName(baseName, used) {
  if (!used.has(baseName)) return baseName;
  for (let suffix = 2; ; suffix++) {
    const candidate = `${baseName}_${suffix}`;
    if (!used.has(candidate)) return candidate;
  }
}
513
+
514
+ // src/inject-fields.ts
515
+ import { PDFDocument, PDFName, StandardFonts } from "pdf-lib";
516
/**
 * Removes the `BG` and `BC` entries from the `MK` dictionary of every widget
 * of a form field.
 *
 * Widgets without an `MK` entry, or whose `MK` value has no `delete` method,
 * are left untouched.
 *
 * @param {object} acroField - Object exposing `getWidgets()`.
 * @returns {void}
 */
function clearWidgetAppearance(acroField) {
  const entriesToDrop = ["BG", "BC"];
  for (const widget of acroField.getWidgets()) {
    const mk = widget.dict.lookup(PDFName.of("MK"));
    if (!mk || typeof mk.delete !== "function") continue;
    for (const entry of entriesToDrop) {
      mk.delete(PDFName.of(entry));
    }
  }
}
525
/**
 * Adds a form field to a PDF for every field definition and returns the
 * saved document.
 *
 * `"input"` fields become text fields (with a maximum length and combing when
 * `props.maxLength` is set); every other type becomes a checkbox. Fields
 * whose page number is outside the document are skipped.
 *
 * @param {Uint8Array} pdfBytes - Source PDF content.
 * @param {Array<{key:string,type:string,props:object}>} fields
 * @returns {Promise<Uint8Array>} Result of saving the modified document.
 */
async function injectFields(pdfBytes, fields) {
  const pdfDoc = await PDFDocument.load(pdfBytes);
  const form = pdfDoc.getForm();
  const font = await pdfDoc.embedFont(StandardFonts.Helvetica);
  const pages = pdfDoc.getPages();

  const addTextField = (field, page, rect) => {
    const textField = form.createTextField(field.key);
    textField.addToPage(page, { ...rect, font, borderWidth: 0 });
    if (field.props.maxLength) {
      textField.setMaxLength(field.props.maxLength);
      textField.enableCombing();
    }
    textField.setFontSize(0);
    clearWidgetAppearance(textField.acroField);
    textField.updateAppearances(font);
  };

  const addCheckBox = (field, page, rect) => {
    const checkbox = form.createCheckBox(field.key);
    checkbox.addToPage(page, { ...rect, borderWidth: 0 });
    clearWidgetAppearance(checkbox.acroField);
    checkbox.updateAppearances();
  };

  for (const field of fields) {
    const pageIndex = field.props.page - 1;
    if (pageIndex < 0 || pageIndex >= pages.length) continue;
    const { x, y, width, height } = field.props.pdfRect;
    const rect = { x, y, width, height };
    if (field.type === "input") {
      addTextField(field, pages[pageIndex], rect);
    } else {
      addCheckBox(field, pages[pageIndex], rect);
    }
  }
  return pdfDoc.save();
}
554
+
555
+ // src/cli.ts
556
/**
 * Converts a flat PDF into a fillable PDF and writes a JSON description of
 * the detected fields next to it.
 *
 * @param {string} inputPath - Path of the source PDF.
 * @param {string} [outputPath] - Path of the PDF to write; defaults to
 *   `<input name>-fillable.pdf` in the input's directory. The JSON file is
 *   written beside the output PDF as `<output name>.fields.json`.
 * @returns {Promise<{pdfOut:string,jsonOut:string,fields:Array<object>}>}
 */
async function convert(inputPath, outputPath) {
  await checkPoppler();

  const defaultPdfOut = join2(
    dirname(inputPath),
    `${basename(inputPath, ".pdf")}-fillable.pdf`
  );
  const pdfOut = outputPath ?? defaultPdfOut;
  const jsonOut = join2(dirname(pdfOut), `${basename(pdfOut, ".pdf")}.fields.json`);

  const pageCount = await getPageCount(inputPath);
  console.log(`Processing ${pageCount} page(s)...`);
  const labelsByPage = extractLabels(await extractBbox(inputPath));

  const countOfType = (list, type) => list.filter((f) => f.type === type).length;
  const allFields = [];
  for (let page = 1; page <= pageCount; page++) {
    console.log(` Page ${page}: extracting geometry...`);
    const svgXml = await extractSvg(inputPath, page);
    const { boxes, transform, pageHeight } = extractBoxes(svgXml);
    if (!transform || boxes.length === 0) {
      console.log(` Page ${page}: no input boxes found, skipping`);
      continue;
    }
    const fields = mapLabelsToFields(
      groupBoxesIntoFields(boxes, page),
      labelsByPage.get(page) ?? [],
      transform,
      page,
      pageHeight
    );
    const cellCount = countOfType(fields, "input");
    const checkboxCount = countOfType(fields, "checkbox");
    console.log(` Page ${page}: ${cellCount} text fields, ${checkboxCount} checkboxes`);
    allFields.push(...fields);
  }

  console.log("Injecting AcroForm fields...");
  const pdfBytes = new Uint8Array(await readFile2(inputPath));
  await writeFile(pdfOut, await injectFields(pdfBytes, allFields));

  // One entry per page of the document, including pages without fields.
  const pages = [];
  for (let pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
    pages.push({
      pageNumber,
      fields: allFields.filter((f) => f.props.page === pageNumber)
    });
  }
  await writeFile(jsonOut, JSON.stringify({ pages }, null, 2));

  console.log(`
Done!`);
  console.log(` PDF: ${pdfOut}`);
  console.log(` JSON: ${jsonOut}`);
  console.log(` Total fields: ${allFields.length}`);
  return { pdfOut, jsonOut, fields: allFields };
}
601
/**
 * Declares the command-line interface and parses the process arguments.
 *
 * The `convert` command runs `convert(input, opts.output)`; on failure the
 * error message is printed and the process exits with status 1.
 *
 * @returns {void}
 */
function main() {
  const runConvert = async (input, opts) => {
    try {
      await convert(input, opts.output);
    } catch (err) {
      console.error(`Error: ${err.message}`);
      process.exit(1);
    }
  };

  program
    .name("cerfaparse")
    .description("Convert flat CERFA PDFs into fillable AcroForm PDFs with field definitions")
    .version("0.1.0");

  program
    .command("convert")
    .description("Convert a flat CERFA PDF to a fillable AcroForm PDF + JSON field definitions")
    .argument("<input>", "Path to the input CERFA PDF")
    .option("-o, --output <path>", "Output PDF path")
    .action(runConvert);

  program.parse();
}
613
// Run the CLI only when this file is the program's entry point.
//
// `process.argv[1]` is the path the script was launched with, which may be a
// symlink (for example a package-manager `bin` shim) or contain characters
// that are percent-encoded in `import.meta.url`. Both sides are therefore
// resolved to real file system paths before comparing. If resolution fails,
// the plain textual comparison is used instead.
const isMainModule = (() => {
  const entryPath = process.argv[1];
  if (!entryPath) return false;
  try {
    return realpathSync(entryPath) === realpathSync(fileURLToPath(import.meta.url));
  } catch {
    return (
      import.meta.url.endsWith(entryPath) ||
      import.meta.url === `file://${entryPath}`
    );
  }
})();
if (isMainModule) {
  main();
}
617
+ export {
618
+ convert
619
+ };
620
+ //# sourceMappingURL=cli.js.map