@heripo/pdf-parser 0.1.9 → 0.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -43,11 +43,17 @@ var DOCLING_ENVIRONMENT = {
43
43
  */
44
44
  STARTUP_DELAY_MS: 2e3
45
45
  };
46
+ var PAGE_RENDERING = {
47
+ /** Default rendering DPI for VLM text recognition quality */
48
+ DEFAULT_DPI: 200,
49
+ /** Low-resolution DPI for OCR strategy sampling */
50
+ SAMPLE_DPI: 150
51
+ };
46
52
  var IMAGE_PDF_CONVERTER = {
47
53
  /**
48
54
  * ImageMagick density option (DPI) for PDF to image conversion
49
55
  */
50
- DENSITY: 300,
56
+ DENSITY: PAGE_RENDERING.DEFAULT_DPI,
51
57
  /**
52
58
  * ImageMagick quality option (1-100)
53
59
  */
@@ -837,16 +843,10 @@ var DoclingEnvironment = class _DoclingEnvironment {
837
843
 
838
844
  // src/core/pdf-converter.ts
839
845
  import { omit } from "es-toolkit";
840
- import {
841
- copyFileSync,
842
- createWriteStream as createWriteStream2,
843
- existsSync as existsSync4,
844
- readFileSync as readFileSync4,
845
- rmSync as rmSync3
846
- } from "fs";
847
- import { writeFile } from "fs/promises";
846
+ import { copyFileSync, createWriteStream as createWriteStream3, existsSync as existsSync4, rmSync as rmSync3 } from "fs";
847
+ import { rename as rename2, writeFile } from "fs/promises";
848
848
  import { join as join6 } from "path";
849
- import { pipeline } from "stream/promises";
849
+ import { pipeline as pipeline3 } from "stream/promises";
850
850
 
851
851
  // src/errors/image-pdf-fallback-error.ts
852
852
  var ImagePdfFallbackError = class extends Error {
@@ -862,19 +862,24 @@ var ImagePdfFallbackError = class extends Error {
862
862
 
863
863
  // src/processors/image-extractor.ts
864
864
  import {
865
- createWriteStream,
865
+ createReadStream,
866
+ createWriteStream as createWriteStream2,
866
867
  existsSync,
867
868
  mkdirSync,
868
- readFileSync,
869
869
  readdirSync,
870
870
  rmSync,
871
871
  writeFileSync
872
872
  } from "fs";
873
873
  import { extname, join as join2 } from "path";
874
+ import { Transform } from "stream";
875
+ import { pipeline as pipeline2 } from "stream/promises";
874
876
  import * as yauzl from "yauzl";
875
877
 
876
878
  // src/utils/jq.ts
877
879
  import { spawn as spawn3 } from "child_process";
880
+ import { createWriteStream } from "fs";
881
+ import { rename } from "fs/promises";
882
+ import { pipeline } from "stream/promises";
878
883
  function getJqPath() {
879
884
  const p = process.env.JQ_PATH?.trim();
880
885
  return p && p.length > 0 ? p : "jq";
@@ -926,25 +931,139 @@ function runJqFileJson(program, filePath) {
926
931
  });
927
932
  });
928
933
  }
929
- function jqExtractBase64PngStrings(filePath) {
930
- const program = `
931
- [
932
- .. |
933
- select(type == "string" and startswith("data:image/png;base64"))
934
- ]
935
- `;
936
- return runJqFileJson(program, filePath);
934
+ function runJqFileToFile(program, inputPath, outputPath) {
935
+ return new Promise((resolve, reject) => {
936
+ const jqPath = getJqPath();
937
+ const args = [program, inputPath];
938
+ const child = spawn3(jqPath, args, {
939
+ stdio: ["ignore", "pipe", "pipe"],
940
+ env: process.env
941
+ });
942
+ let stderr = "";
943
+ let exitCode = null;
944
+ let pipelineDone = false;
945
+ let settled = false;
946
+ child.stderr.setEncoding("utf-8");
947
+ child.stderr.on("data", (chunk) => {
948
+ stderr += chunk;
949
+ });
950
+ const ws = createWriteStream(outputPath);
951
+ function trySettle() {
952
+ if (settled) return;
953
+ if (!pipelineDone || exitCode === null) return;
954
+ settled = true;
955
+ if (exitCode !== 0) {
956
+ reject(
957
+ new Error(
958
+ `jq exited with code ${exitCode}. ${stderr ? "Stderr: " + stderr : ""}`
959
+ )
960
+ );
961
+ } else {
962
+ resolve();
963
+ }
964
+ }
965
+ child.on("error", (err) => {
966
+ if (settled) return;
967
+ settled = true;
968
+ ws.destroy();
969
+ reject(err);
970
+ });
971
+ pipeline(child.stdout, ws).then(() => {
972
+ pipelineDone = true;
973
+ trySettle();
974
+ }).catch((err) => {
975
+ if (settled) return;
976
+ settled = true;
977
+ reject(err);
978
+ });
979
+ child.on("close", (code) => {
980
+ exitCode = code ?? 1;
981
+ trySettle();
982
+ });
983
+ });
937
984
  }
938
- function jqReplaceBase64WithPaths(filePath, dirName, prefix) {
985
+ function runJqFileLines(program, filePath, onLine) {
986
+ return new Promise((resolve, reject) => {
987
+ const jqPath = getJqPath();
988
+ const args = ["-r", program, filePath];
989
+ const child = spawn3(jqPath, args, {
990
+ stdio: ["ignore", "pipe", "pipe"],
991
+ env: process.env
992
+ });
993
+ let stderr = "";
994
+ let buffer = "";
995
+ let callbackError = false;
996
+ child.stdout.setEncoding("utf-8");
997
+ child.stderr.setEncoding("utf-8");
998
+ function safeOnLine(line) {
999
+ if (callbackError) return;
1000
+ try {
1001
+ onLine(line);
1002
+ } catch (err) {
1003
+ callbackError = true;
1004
+ child.kill();
1005
+ reject(err);
1006
+ }
1007
+ }
1008
+ child.stdout.on("data", (chunk) => {
1009
+ buffer += chunk;
1010
+ let newlineIdx;
1011
+ while ((newlineIdx = buffer.indexOf("\n")) !== -1) {
1012
+ const line = buffer.slice(0, newlineIdx);
1013
+ buffer = buffer.slice(newlineIdx + 1);
1014
+ if (line.length > 0) {
1015
+ safeOnLine(line);
1016
+ }
1017
+ }
1018
+ });
1019
+ child.stderr.on("data", (chunk) => {
1020
+ stderr += chunk;
1021
+ });
1022
+ child.on("error", (err) => {
1023
+ if (!callbackError) reject(err);
1024
+ });
1025
+ child.on("close", (code) => {
1026
+ if (callbackError) return;
1027
+ if (buffer.length > 0) {
1028
+ safeOnLine(buffer);
1029
+ }
1030
+ if (callbackError) return;
1031
+ if (code !== 0) {
1032
+ reject(
1033
+ new Error(
1034
+ `jq exited with code ${code}. ${stderr ? "Stderr: " + stderr : ""}`
1035
+ )
1036
+ );
1037
+ } else {
1038
+ resolve();
1039
+ }
1040
+ });
1041
+ });
1042
+ }
1043
+ async function jqExtractBase64PngStringsStreaming(filePath, onImage) {
1044
+ let index = 0;
1045
+ await runJqFileLines(
1046
+ '.. | select(type == "string" and startswith("data:image/png;base64"))',
1047
+ filePath,
1048
+ (line) => {
1049
+ onImage(line, index);
1050
+ index++;
1051
+ }
1052
+ );
1053
+ return index;
1054
+ }
1055
+ async function jqReplaceBase64WithPathsToFile(inputPath, outputPath, dirName, prefix) {
939
1056
  const program = `
940
1057
  reduce paths(type == "string" and startswith("data:image/png;base64")) as $p (
941
1058
  {data: ., counter: 0};
942
1059
  .counter as $idx |
943
1060
  .data |= setpath($p; "${dirName}/${prefix}_\\($idx).png") |
944
1061
  .counter += 1
945
- ) | {data: .data, count: .counter}
1062
+ ) | .data
946
1063
  `;
947
- return runJqFileJson(program, filePath);
1064
+ const tmpPath = outputPath + ".tmp";
1065
+ await runJqFileToFile(program, inputPath, tmpPath);
1066
+ await rename(tmpPath, outputPath);
948
1067
  }
949
1068
 
950
1069
  // src/processors/image-extractor.ts
@@ -972,7 +1091,7 @@ var ImageExtractor = class _ImageExtractor {
972
1091
  return;
973
1092
  }
974
1093
  mkdirSync(join2(entryPath, ".."), { recursive: true });
975
- const writeStream = createWriteStream(entryPath);
1094
+ const writeStream = createWriteStream2(entryPath);
976
1095
  readStream.pipe(writeStream);
977
1096
  writeStream.on("finish", () => {
978
1097
  zipfile.readEntry();
@@ -988,26 +1107,6 @@ var ImageExtractor = class _ImageExtractor {
988
1107
  });
989
1108
  });
990
1109
  }
991
- /**
992
- * Extract base64 images from JSON file using jq (for large files)
993
- * Returns array of base64 data strings
994
- */
995
- static async extractBase64ImagesFromJsonWithJq(jsonPath) {
996
- return jqExtractBase64PngStrings(jsonPath);
997
- }
998
- /**
999
- * Replace base64 images with file paths in JSON using jq (for large files)
1000
- * Uses reduce to maintain counter state while walking the JSON
1001
- */
1002
- static async replaceBase64ImagesInJsonWithJq(jsonPath, outputPath, dirName, prefix) {
1003
- const { data, count } = await jqReplaceBase64WithPaths(
1004
- jsonPath,
1005
- dirName,
1006
- prefix
1007
- );
1008
- writeFileSync(outputPath, JSON.stringify(data, null, 2), "utf-8");
1009
- return count;
1010
- }
1011
1110
  /**
1012
1111
  * Extract a base64-encoded image to a file and return the relative path
1013
1112
  */
@@ -1021,8 +1120,66 @@ var ImageExtractor = class _ImageExtractor {
1021
1120
  return `${dirName}/${filename}`;
1022
1121
  }
1023
1122
  /**
1024
- * Save JSON and HTML documents with base64 images extracted to separate files
1025
- * Uses jq for JSON processing to handle large files
1123
+ * Extract base64 images from HTML using streaming.
1124
+ * Reads HTML file as a stream, extracts base64 images from src attributes,
1125
+ * saves them as PNG files, and replaces with file paths in the output HTML.
1126
+ * Returns the number of images extracted.
1127
+ */
1128
+ static async extractImagesFromHtmlStream(htmlInputPath, htmlOutputPath, imagesDir) {
1129
+ let imageIndex = 0;
1130
+ let pending = "";
1131
+ const MARKER = 'src="data:image/png;base64,';
1132
+ const transform = new Transform({
1133
+ decodeStrings: false,
1134
+ encoding: "utf-8",
1135
+ transform(chunk, _encoding, callback) {
1136
+ pending += chunk;
1137
+ let result = "";
1138
+ while (true) {
1139
+ const markerIdx = pending.indexOf(MARKER);
1140
+ if (markerIdx === -1) {
1141
+ const safeEnd = Math.max(0, pending.length - MARKER.length);
1142
+ result += pending.slice(0, safeEnd);
1143
+ pending = pending.slice(safeEnd);
1144
+ break;
1145
+ }
1146
+ result += pending.slice(0, markerIdx);
1147
+ const dataStart = markerIdx + MARKER.length;
1148
+ const quoteIdx = pending.indexOf('"', dataStart);
1149
+ if (quoteIdx === -1) {
1150
+ pending = pending.slice(markerIdx);
1151
+ break;
1152
+ }
1153
+ const base64Content = pending.slice(dataStart, quoteIdx);
1154
+ const filename = `image_${imageIndex}.png`;
1155
+ const filepath = join2(imagesDir, filename);
1156
+ const buf = Buffer.from(base64Content, "base64");
1157
+ writeFileSync(filepath, buf);
1158
+ const relativePath = `images/${filename}`;
1159
+ result += `src="${relativePath}"`;
1160
+ imageIndex++;
1161
+ pending = pending.slice(quoteIdx + 1);
1162
+ }
1163
+ if (result.length > 0) {
1164
+ this.push(result);
1165
+ }
1166
+ callback();
1167
+ },
1168
+ flush(callback) {
1169
+ if (pending.length > 0) {
1170
+ this.push(pending);
1171
+ }
1172
+ callback();
1173
+ }
1174
+ });
1175
+ const rs = createReadStream(htmlInputPath, { encoding: "utf-8" });
1176
+ const ws = createWriteStream2(htmlOutputPath, { encoding: "utf-8" });
1177
+ await pipeline2(rs, transform, ws);
1178
+ return imageIndex;
1179
+ }
1180
+ /**
1181
+ * Save JSON and HTML documents with base64 images extracted to separate files.
1182
+ * Uses jq for JSON processing and streaming for HTML to handle large files.
1026
1183
  *
1027
1184
  * This method:
1028
1185
  * 1. Extracts base64-encoded images from JSON and HTML content
@@ -1030,7 +1187,7 @@ var ImageExtractor = class _ImageExtractor {
1030
1187
  * 3. Replaces base64 data with relative file paths
1031
1188
  * 4. Saves the transformed documents to the output directory
1032
1189
  */
1033
- static async saveDocumentsWithExtractedImages(logger, outputDir, filename, jsonSourcePath, htmlContent) {
1190
+ static async saveDocumentsWithExtractedImages(logger, outputDir, filename, jsonSourcePath, htmlSourcePath) {
1034
1191
  try {
1035
1192
  if (existsSync(outputDir)) {
1036
1193
  rmSync(outputDir, { recursive: true, force: true });
@@ -1046,27 +1203,29 @@ var ImageExtractor = class _ImageExtractor {
1046
1203
  if (!existsSync(imagesDir)) {
1047
1204
  mkdirSync(imagesDir, { recursive: true });
1048
1205
  }
1049
- const base64Images = await _ImageExtractor.extractBase64ImagesFromJsonWithJq(jsonSourcePath);
1050
- base64Images.forEach((base64Data, index) => {
1051
- _ImageExtractor.extractBase64ImageToFile(
1052
- base64Data,
1053
- imagesDir,
1054
- index,
1055
- "pic",
1056
- "images"
1057
- );
1058
- });
1206
+ const imageCount = await jqExtractBase64PngStringsStreaming(
1207
+ jsonSourcePath,
1208
+ (base64Data, index) => {
1209
+ _ImageExtractor.extractBase64ImageToFile(
1210
+ base64Data,
1211
+ imagesDir,
1212
+ index,
1213
+ "pic",
1214
+ "images"
1215
+ );
1216
+ }
1217
+ );
1059
1218
  logger.info(
1060
- `[PDFConverter] Extracted ${base64Images.length} picture images from JSON to ${imagesDir}`
1219
+ `[PDFConverter] Extracted ${imageCount} picture images from JSON to ${imagesDir}`
1061
1220
  );
1062
- const replacedCount = await _ImageExtractor.replaceBase64ImagesInJsonWithJq(
1221
+ await jqReplaceBase64WithPathsToFile(
1063
1222
  jsonSourcePath,
1064
1223
  jsonPath,
1065
1224
  "images",
1066
1225
  "pic"
1067
1226
  );
1068
1227
  logger.info(
1069
- `[PDFConverter] Replaced ${replacedCount} base64 images with file paths`
1228
+ `[PDFConverter] Replaced ${imageCount} base64 images with file paths`
1070
1229
  );
1071
1230
  } catch (e) {
1072
1231
  logger.warn(
@@ -1082,42 +1241,36 @@ var ImageExtractor = class _ImageExtractor {
1082
1241
  if (!existsSync(imagesDir)) {
1083
1242
  mkdirSync(imagesDir, { recursive: true });
1084
1243
  }
1085
- let imageIndex = 0;
1086
- const transformedHtml = htmlContent.replace(
1087
- /src="data:image\/png;base64,([^"]+)"/g,
1088
- (_, base64Content) => {
1089
- const filename2 = `image_${imageIndex}.png`;
1090
- const filepath = join2(imagesDir, filename2);
1091
- const buffer = Buffer.from(base64Content, "base64");
1092
- writeFileSync(filepath, buffer);
1093
- const relativePath = `images/${filename2}`;
1094
- imageIndex += 1;
1095
- return `src="${relativePath}"`;
1096
- }
1244
+ const htmlImageCount = await _ImageExtractor.extractImagesFromHtmlStream(
1245
+ htmlSourcePath,
1246
+ htmlPath,
1247
+ imagesDir
1097
1248
  );
1098
1249
  logger.info(
1099
- `[PDFConverter] Extracted ${imageIndex} images from HTML to ${imagesDir}`
1250
+ `[PDFConverter] Extracted ${htmlImageCount} images from HTML to ${imagesDir}`
1100
1251
  );
1101
- writeFileSync(htmlPath, transformedHtml, "utf-8");
1102
1252
  } catch (e) {
1103
1253
  logger.warn(
1104
- "[PDFConverter] Failed to extract images from HTML, writing original. Error:",
1254
+ "[PDFConverter] Failed to extract images from HTML, copying original. Error:",
1105
1255
  e
1106
1256
  );
1107
- writeFileSync(htmlPath, htmlContent, "utf-8");
1257
+ const rs = createReadStream(htmlSourcePath);
1258
+ const ws = createWriteStream2(htmlPath);
1259
+ await pipeline2(rs, ws);
1108
1260
  }
1109
1261
  logger.info("[PDFConverter] Saved HTML:", htmlPath);
1110
1262
  }
1111
1263
  /**
1112
1264
  * Extract documents from ZIP and save with extracted images
1113
- * Uses jq for JSON processing to handle large files without loading into Node.js memory
1265
+ * Uses jq for JSON processing and streaming for HTML to handle large files
1266
+ * without loading into Node.js memory
1114
1267
  *
1115
1268
  * Complete workflow:
1116
1269
  * 1. Extract ZIP file to temporary directory
1117
1270
  * 2. Find JSON and HTML files from extracted files
1118
- * 3. Use jq to extract base64 images from JSON and save as separate files
1119
- * 4. Use jq to replace base64 with file paths in JSON
1120
- * 5. Process HTML with regex to extract and replace images
1271
+ * 3. Use jq to stream-extract base64 images from JSON and save as separate files
1272
+ * 4. Use jq to replace base64 with file paths in JSON (piped to file)
1273
+ * 5. Process HTML with streaming Transform to extract and replace images
1121
1274
  * 6. Save transformed documents to output directory (as result.json and result.html)
1122
1275
  */
1123
1276
  static async extractAndSaveDocumentsFromZip(logger, zipPath, extractDir, outputDir) {
@@ -1133,14 +1286,13 @@ var ImageExtractor = class _ImageExtractor {
1133
1286
  }
1134
1287
  const jsonPath = join2(extractDir, jsonFile);
1135
1288
  const htmlPath = join2(extractDir, htmlFile);
1136
- const htmlContent = readFileSync(htmlPath, "utf-8");
1137
1289
  logger.info("[PDFConverter] Saving converted files to output...");
1138
1290
  await _ImageExtractor.saveDocumentsWithExtractedImages(
1139
1291
  logger,
1140
1292
  outputDir,
1141
1293
  "result",
1142
1294
  jsonPath,
1143
- htmlContent
1295
+ htmlPath
1144
1296
  );
1145
1297
  logger.info("[PDFConverter] Files saved to:", outputDir);
1146
1298
  }
@@ -1149,43 +1301,82 @@ var ImageExtractor = class _ImageExtractor {
1149
1301
  // src/processors/page-renderer.ts
1150
1302
  import { existsSync as existsSync2, mkdirSync as mkdirSync2, readdirSync as readdirSync2 } from "fs";
1151
1303
  import { join as join3 } from "path";
1152
- var DEFAULT_DPI = 300;
1304
+ var PROGRESS_LOG_PERCENT_STEP = 10;
1153
1305
  var PageRenderer = class {
1154
1306
  constructor(logger) {
1155
1307
  this.logger = logger;
1156
1308
  }
1309
+ lastLoggedPercent = 0;
1157
1310
  /**
1158
1311
  * Render all pages of a PDF to individual PNG files.
1159
1312
  *
1313
+ * Uses per-page rendering (`magick 'input.pdf[N]'`) when page count is known,
1314
+ * limiting peak memory to ~15MB/page instead of loading all pages at once.
1315
+ *
1160
1316
  * @param pdfPath - Absolute path to the source PDF file
1161
1317
  * @param outputDir - Directory where pages/ subdirectory will be created
1162
1318
  * @param options - Rendering options
1163
1319
  * @returns Render result with page count and file paths
1164
1320
  */
1165
1321
  async renderPages(pdfPath, outputDir, options) {
1166
- const dpi = options?.dpi ?? DEFAULT_DPI;
1322
+ const dpi = options?.dpi ?? PAGE_RENDERING.DEFAULT_DPI;
1167
1323
  const pagesDir = join3(outputDir, "pages");
1168
1324
  if (!existsSync2(pagesDir)) {
1169
1325
  mkdirSync2(pagesDir, { recursive: true });
1170
1326
  }
1171
- this.logger.info(`[PageRenderer] Rendering PDF at ${dpi} DPI...`);
1172
- const outputPattern = join3(pagesDir, "page_%d.png");
1173
- const result = await spawnAsync("magick", [
1174
- "-density",
1175
- dpi.toString(),
1176
- pdfPath,
1177
- "-background",
1178
- "white",
1179
- "-alpha",
1180
- "remove",
1181
- "-alpha",
1182
- "off",
1183
- outputPattern
1184
- ]);
1185
- if (result.code !== 0) {
1186
- throw new Error(
1187
- `[PageRenderer] Failed to render PDF pages: ${result.stderr || "Unknown error"}`
1327
+ const totalPages = await this.getPageCount(pdfPath);
1328
+ if (totalPages > 0) {
1329
+ this.logger.info(
1330
+ `[PageRenderer] Rendering ${totalPages} pages at ${dpi} DPI...`
1188
1331
  );
1332
+ this.lastLoggedPercent = 0;
1333
+ for (let i = 0; i < totalPages; i++) {
1334
+ const result = await spawnAsync(
1335
+ "magick",
1336
+ [
1337
+ "-density",
1338
+ dpi.toString(),
1339
+ `${pdfPath}[${i}]`,
1340
+ "-background",
1341
+ "white",
1342
+ "-alpha",
1343
+ "remove",
1344
+ "-alpha",
1345
+ "off",
1346
+ join3(pagesDir, `page_${i}.png`)
1347
+ ],
1348
+ { captureStdout: false }
1349
+ );
1350
+ if (result.code !== 0) {
1351
+ throw new Error(
1352
+ `[PageRenderer] Failed to render page ${i + 1}/${totalPages}: ${result.stderr || "Unknown error"}`
1353
+ );
1354
+ }
1355
+ this.logProgress(i + 1, totalPages);
1356
+ }
1357
+ } else {
1358
+ this.logger.info(`[PageRenderer] Rendering PDF at ${dpi} DPI...`);
1359
+ const result = await spawnAsync(
1360
+ "magick",
1361
+ [
1362
+ "-density",
1363
+ dpi.toString(),
1364
+ pdfPath,
1365
+ "-background",
1366
+ "white",
1367
+ "-alpha",
1368
+ "remove",
1369
+ "-alpha",
1370
+ "off",
1371
+ join3(pagesDir, "page_%d.png")
1372
+ ],
1373
+ { captureStdout: false }
1374
+ );
1375
+ if (result.code !== 0) {
1376
+ throw new Error(
1377
+ `[PageRenderer] Failed to render PDF pages: ${result.stderr || "Unknown error"}`
1378
+ );
1379
+ }
1189
1380
  }
1190
1381
  const pageFiles = readdirSync2(pagesDir).filter((f) => f.startsWith("page_") && f.endsWith(".png")).sort((a, b) => {
1191
1382
  const numA = parseInt(a.replace("page_", "").replace(".png", ""), 10);
@@ -1201,6 +1392,32 @@ var PageRenderer = class {
1201
1392
  pageFiles
1202
1393
  };
1203
1394
  }
1395
+ /**
1396
+ * Log rendering progress at appropriate intervals (every 10%).
1397
+ */
1398
+ logProgress(current, total) {
1399
+ const percent = Math.floor(current / total * 100);
1400
+ if (percent >= this.lastLoggedPercent + PROGRESS_LOG_PERCENT_STEP || current === total) {
1401
+ this.lastLoggedPercent = percent;
1402
+ this.logger.info(
1403
+ `[PageRenderer] Rendering pages: ${current}/${total} (${percent}%)`
1404
+ );
1405
+ }
1406
+ }
1407
+ /**
1408
+ * Get total page count using pdfinfo.
1409
+ * Returns 0 on failure (progress logging will be skipped).
1410
+ */
1411
+ async getPageCount(pdfPath) {
1412
+ try {
1413
+ const result = await spawnAsync("pdfinfo", [pdfPath]);
1414
+ if (result.code !== 0) return 0;
1415
+ const match = result.stdout.match(/^Pages:\s+(\d+)/m);
1416
+ return match ? parseInt(match[1], 10) : 0;
1417
+ } catch {
1418
+ return 0;
1419
+ }
1420
+ }
1204
1421
  };
1205
1422
 
1206
1423
  // src/processors/pdf-text-extractor.ts
@@ -1286,7 +1503,7 @@ var PdfTextExtractor = class {
1286
1503
  };
1287
1504
 
1288
1505
  // src/processors/vlm-text-corrector.ts
1289
- import { readFileSync as readFileSync2, writeFileSync as writeFileSync2 } from "fs";
1506
+ import { readFileSync, writeFileSync as writeFileSync2 } from "fs";
1290
1507
  import { join as join4 } from "path";
1291
1508
 
1292
1509
  // src/types/vlm-text-correction-schema.ts
@@ -1418,7 +1635,7 @@ var VlmTextCorrector = class {
1418
1635
  async correctAndSave(outputDir, model, options) {
1419
1636
  this.logger.info("[VlmTextCorrector] Starting text correction...");
1420
1637
  const resultPath = join4(outputDir, "result.json");
1421
- const doc = JSON.parse(readFileSync2(resultPath, "utf-8"));
1638
+ const doc = JSON.parse(readFileSync(resultPath, "utf-8"));
1422
1639
  let pageNumbers = this.getPageNumbers(doc);
1423
1640
  if (pageNumbers.length === 0) {
1424
1641
  this.logger.info("[VlmTextCorrector] No pages to process");
@@ -1745,7 +1962,7 @@ var VlmTextCorrector = class {
1745
1962
  */
1746
1963
  readPageImage(outputDir, pageNo) {
1747
1964
  const imagePath = join4(outputDir, "pages", `page_${pageNo - 1}.png`);
1748
- return readFileSync2(imagePath).toString("base64");
1965
+ return readFileSync(imagePath).toString("base64");
1749
1966
  }
1750
1967
  /**
1751
1968
  * Apply VLM corrections to the DoclingDocument.
@@ -1800,9 +2017,8 @@ var VlmTextCorrector = class {
1800
2017
 
1801
2018
  // src/samplers/ocr-strategy-sampler.ts
1802
2019
  import { normalizeToBcp47 } from "@heripo/model";
1803
- import { readFileSync as readFileSync3 } from "fs";
2020
+ import { readFileSync as readFileSync2 } from "fs";
1804
2021
  import { z as z2 } from "zod/v4";
1805
- var SAMPLE_DPI = 150;
1806
2022
  var EDGE_TRIM_RATIO = 0.1;
1807
2023
  var DEFAULT_MAX_SAMPLE_PAGES = 15;
1808
2024
  var DEFAULT_MAX_RETRIES2 = 3;
@@ -1853,7 +2069,7 @@ var OcrStrategySampler = class {
1853
2069
  const renderResult = await this.pageRenderer.renderPages(
1854
2070
  pdfPath,
1855
2071
  outputDir,
1856
- { dpi: SAMPLE_DPI }
2072
+ { dpi: PAGE_RENDERING.SAMPLE_DPI }
1857
2073
  );
1858
2074
  if (renderResult.pageCount === 0) {
1859
2075
  this.logger.info("[OcrStrategySampler] No pages found in PDF");
@@ -2015,7 +2231,7 @@ var OcrStrategySampler = class {
2015
2231
  this.logger.debug(
2016
2232
  `[OcrStrategySampler] Analyzing page ${pageNo} for Korean-Hanja mix and language...`
2017
2233
  );
2018
- const base64Image = readFileSync3(pageFile).toString("base64");
2234
+ const base64Image = readFileSync2(pageFile).toString("base64");
2019
2235
  const messages = [
2020
2236
  {
2021
2237
  role: "user",
@@ -2063,7 +2279,7 @@ var OcrStrategySampler = class {
2063
2279
  };
2064
2280
 
2065
2281
  // src/utils/local-file-server.ts
2066
- import { createReadStream, statSync } from "fs";
2282
+ import { createReadStream as createReadStream2, statSync } from "fs";
2067
2283
  import { createServer } from "http";
2068
2284
  import { basename } from "path";
2069
2285
  var LocalFileServer = class {
@@ -2085,7 +2301,7 @@ var LocalFileServer = class {
2085
2301
  "Content-Type": "application/pdf",
2086
2302
  "Content-Length": stat.size
2087
2303
  });
2088
- createReadStream(filePath).pipe(res);
2304
+ createReadStream2(filePath).pipe(res);
2089
2305
  } else {
2090
2306
  res.writeHead(404);
2091
2307
  res.end("Not Found");
@@ -2370,8 +2586,10 @@ var PDFConverter = class {
2370
2586
  let pageTexts;
2371
2587
  try {
2372
2588
  const resultPath2 = join6(outputDir, "result.json");
2373
- const doc = JSON.parse(readFileSync4(resultPath2, "utf-8"));
2374
- const totalPages = Object.keys(doc.pages).length;
2589
+ const totalPages = await runJqFileJson(
2590
+ ".pages | length",
2591
+ resultPath2
2592
+ );
2375
2593
  const textExtractor = new PdfTextExtractor(this.logger);
2376
2594
  pageTexts = await textExtractor.extractText(pdfPath, totalPages);
2377
2595
  } catch {
@@ -2703,8 +2921,8 @@ var PDFConverter = class {
2703
2921
  const zipPath = join6(process.cwd(), "result.zip");
2704
2922
  this.logger.info("[PDFConverter] Saving ZIP file to:", zipPath);
2705
2923
  if (zipResult.fileStream) {
2706
- const writeStream = createWriteStream2(zipPath);
2707
- await pipeline(zipResult.fileStream, writeStream);
2924
+ const writeStream = createWriteStream3(zipPath);
2925
+ await pipeline3(zipResult.fileStream, writeStream);
2708
2926
  return;
2709
2927
  }
2710
2928
  if (zipResult.data) {
@@ -2736,6 +2954,7 @@ var PDFConverter = class {
2736
2954
  }
2737
2955
  /**
2738
2956
  * Render page images from the source PDF using ImageMagick and update result.json.
2957
+ * Uses jq to update the JSON file without loading it into Node.js memory.
2739
2958
  * Replaces Docling's generate_page_images which fails on large PDFs
2740
2959
  * due to memory limits when embedding all page images as base64.
2741
2960
  */
@@ -2753,17 +2972,18 @@ var PDFConverter = class {
2753
2972
  const renderer = new PageRenderer(this.logger);
2754
2973
  const renderResult = await renderer.renderPages(pdfPath, outputDir);
2755
2974
  const resultPath = join6(outputDir, "result.json");
2756
- const doc = JSON.parse(readFileSync4(resultPath, "utf-8"));
2757
- for (const page of Object.values(doc.pages)) {
2758
- const pageNo = page.page_no;
2759
- const fileIndex = pageNo - 1;
2760
- if (fileIndex >= 0 && fileIndex < renderResult.pageCount) {
2761
- page.image.uri = `pages/page_${fileIndex}.png`;
2762
- page.image.mimetype = "image/png";
2763
- page.image.dpi = 300;
2764
- }
2765
- }
2766
- await writeFile(resultPath, JSON.stringify(doc, null, 2));
2975
+ const tmpPath = resultPath + ".tmp";
2976
+ const jqProgram = `
2977
+ .pages |= with_entries(
2978
+ if (.value.page_no - 1) >= 0 and (.value.page_no - 1) < ${renderResult.pageCount} then
2979
+ .value.image.uri = "pages/page_\\(.value.page_no - 1).png" |
2980
+ .value.image.mimetype = "image/png" |
2981
+ .value.image.dpi = ${PAGE_RENDERING.DEFAULT_DPI}
2982
+ else . end
2983
+ )
2984
+ `;
2985
+ await runJqFileToFile(jqProgram, resultPath, tmpPath);
2986
+ await rename2(tmpPath, resultPath);
2767
2987
  this.logger.info(
2768
2988
  `[PDFConverter] Rendered ${renderResult.pageCount} page images`
2769
2989
  );