@heripo/pdf-parser 0.1.9 → 0.1.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -81,11 +81,17 @@ var DOCLING_ENVIRONMENT = {
81
81
  */
82
82
  STARTUP_DELAY_MS: 2e3
83
83
  };
84
+ var PAGE_RENDERING = {
85
+ /** Default rendering DPI for VLM text recognition quality */
86
+ DEFAULT_DPI: 200,
87
+ /** Low-resolution DPI for OCR strategy sampling */
88
+ SAMPLE_DPI: 150
89
+ };
84
90
  var IMAGE_PDF_CONVERTER = {
85
91
  /**
86
92
  * ImageMagick density option (DPI) for PDF to image conversion
87
93
  */
88
- DENSITY: 300,
94
+ DENSITY: PAGE_RENDERING.DEFAULT_DPI,
89
95
  /**
90
96
  * ImageMagick quality option (1-100)
91
97
  */
@@ -869,10 +875,10 @@ var DoclingEnvironment = class _DoclingEnvironment {
869
875
 
870
876
  // src/core/pdf-converter.ts
871
877
  var import_es_toolkit = require("es-toolkit");
872
- var import_node_fs7 = require("fs");
873
- var import_promises = require("fs/promises");
878
+ var import_node_fs8 = require("fs");
879
+ var import_promises4 = require("fs/promises");
874
880
  var import_node_path7 = require("path");
875
- var import_promises2 = require("stream/promises");
881
+ var import_promises5 = require("stream/promises");
876
882
 
877
883
  // src/errors/image-pdf-fallback-error.ts
878
884
  var ImagePdfFallbackError = class extends Error {
@@ -887,12 +893,17 @@ var ImagePdfFallbackError = class extends Error {
887
893
  };
888
894
 
889
895
  // src/processors/image-extractor.ts
890
- var import_node_fs = require("fs");
896
+ var import_node_fs2 = require("fs");
891
897
  var import_node_path2 = require("path");
898
+ var import_node_stream = require("stream");
899
+ var import_promises3 = require("stream/promises");
892
900
  var yauzl = __toESM(require("yauzl"), 1);
893
901
 
894
902
  // src/utils/jq.ts
895
903
  var import_node_child_process2 = require("child_process");
904
+ var import_node_fs = require("fs");
905
+ var import_promises = require("fs/promises");
906
+ var import_promises2 = require("stream/promises");
896
907
  function getJqPath() {
897
908
  const p = process.env.JQ_PATH?.trim();
898
909
  return p && p.length > 0 ? p : "jq";
@@ -944,25 +955,139 @@ function runJqFileJson(program, filePath) {
944
955
  });
945
956
  });
946
957
  }
947
- function jqExtractBase64PngStrings(filePath) {
948
- const program = `
949
- [
950
- .. |
951
- select(type == "string" and startswith("data:image/png;base64"))
952
- ]
953
- `;
954
- return runJqFileJson(program, filePath);
958
+ function runJqFileToFile(program, inputPath, outputPath) {
959
+ return new Promise((resolve, reject) => {
960
+ const jqPath = getJqPath();
961
+ const args = [program, inputPath];
962
+ const child = (0, import_node_child_process2.spawn)(jqPath, args, {
963
+ stdio: ["ignore", "pipe", "pipe"],
964
+ env: process.env
965
+ });
966
+ let stderr = "";
967
+ let exitCode = null;
968
+ let pipelineDone = false;
969
+ let settled = false;
970
+ child.stderr.setEncoding("utf-8");
971
+ child.stderr.on("data", (chunk) => {
972
+ stderr += chunk;
973
+ });
974
+ const ws = (0, import_node_fs.createWriteStream)(outputPath);
975
+ function trySettle() {
976
+ if (settled) return;
977
+ if (!pipelineDone || exitCode === null) return;
978
+ settled = true;
979
+ if (exitCode !== 0) {
980
+ reject(
981
+ new Error(
982
+ `jq exited with code ${exitCode}. ${stderr ? "Stderr: " + stderr : ""}`
983
+ )
984
+ );
985
+ } else {
986
+ resolve();
987
+ }
988
+ }
989
+ child.on("error", (err) => {
990
+ if (settled) return;
991
+ settled = true;
992
+ ws.destroy();
993
+ reject(err);
994
+ });
995
+ (0, import_promises2.pipeline)(child.stdout, ws).then(() => {
996
+ pipelineDone = true;
997
+ trySettle();
998
+ }).catch((err) => {
999
+ if (settled) return;
1000
+ settled = true;
1001
+ reject(err);
1002
+ });
1003
+ child.on("close", (code) => {
1004
+ exitCode = code ?? 1;
1005
+ trySettle();
1006
+ });
1007
+ });
1008
+ }
1009
+ function runJqFileLines(program, filePath, onLine) {
1010
+ return new Promise((resolve, reject) => {
1011
+ const jqPath = getJqPath();
1012
+ const args = ["-r", program, filePath];
1013
+ const child = (0, import_node_child_process2.spawn)(jqPath, args, {
1014
+ stdio: ["ignore", "pipe", "pipe"],
1015
+ env: process.env
1016
+ });
1017
+ let stderr = "";
1018
+ let buffer = "";
1019
+ let callbackError = false;
1020
+ child.stdout.setEncoding("utf-8");
1021
+ child.stderr.setEncoding("utf-8");
1022
+ function safeOnLine(line) {
1023
+ if (callbackError) return;
1024
+ try {
1025
+ onLine(line);
1026
+ } catch (err) {
1027
+ callbackError = true;
1028
+ child.kill();
1029
+ reject(err);
1030
+ }
1031
+ }
1032
+ child.stdout.on("data", (chunk) => {
1033
+ buffer += chunk;
1034
+ let newlineIdx;
1035
+ while ((newlineIdx = buffer.indexOf("\n")) !== -1) {
1036
+ const line = buffer.slice(0, newlineIdx);
1037
+ buffer = buffer.slice(newlineIdx + 1);
1038
+ if (line.length > 0) {
1039
+ safeOnLine(line);
1040
+ }
1041
+ }
1042
+ });
1043
+ child.stderr.on("data", (chunk) => {
1044
+ stderr += chunk;
1045
+ });
1046
+ child.on("error", (err) => {
1047
+ if (!callbackError) reject(err);
1048
+ });
1049
+ child.on("close", (code) => {
1050
+ if (callbackError) return;
1051
+ if (buffer.length > 0) {
1052
+ safeOnLine(buffer);
1053
+ }
1054
+ if (callbackError) return;
1055
+ if (code !== 0) {
1056
+ reject(
1057
+ new Error(
1058
+ `jq exited with code ${code}. ${stderr ? "Stderr: " + stderr : ""}`
1059
+ )
1060
+ );
1061
+ } else {
1062
+ resolve();
1063
+ }
1064
+ });
1065
+ });
955
1066
  }
956
- function jqReplaceBase64WithPaths(filePath, dirName, prefix) {
1067
+ async function jqExtractBase64PngStringsStreaming(filePath, onImage) {
1068
+ let index = 0;
1069
+ await runJqFileLines(
1070
+ '.. | select(type == "string" and startswith("data:image/png;base64"))',
1071
+ filePath,
1072
+ (line) => {
1073
+ onImage(line, index);
1074
+ index++;
1075
+ }
1076
+ );
1077
+ return index;
1078
+ }
1079
+ async function jqReplaceBase64WithPathsToFile(inputPath, outputPath, dirName, prefix) {
957
1080
  const program = `
958
1081
  reduce paths(type == "string" and startswith("data:image/png;base64")) as $p (
959
1082
  {data: ., counter: 0};
960
1083
  .counter as $idx |
961
1084
  .data |= setpath($p; "${dirName}/${prefix}_\\($idx).png") |
962
1085
  .counter += 1
963
- ) | {data: .data, count: .counter}
1086
+ ) | .data
964
1087
  `;
965
- return runJqFileJson(program, filePath);
1088
+ const tmpPath = outputPath + ".tmp";
1089
+ await runJqFileToFile(program, inputPath, tmpPath);
1090
+ await (0, import_promises.rename)(tmpPath, outputPath);
966
1091
  }
967
1092
 
968
1093
  // src/processors/image-extractor.ts
@@ -981,7 +1106,7 @@ var ImageExtractor = class _ImageExtractor {
981
1106
  zipfile.on("entry", (entry) => {
982
1107
  const entryPath = (0, import_node_path2.join)(targetDir, entry.fileName);
983
1108
  if (/\/$/.test(entry.fileName)) {
984
- (0, import_node_fs.mkdirSync)(entryPath, { recursive: true });
1109
+ (0, import_node_fs2.mkdirSync)(entryPath, { recursive: true });
985
1110
  zipfile.readEntry();
986
1111
  } else {
987
1112
  zipfile.openReadStream(entry, (err2, readStream) => {
@@ -989,8 +1114,8 @@ var ImageExtractor = class _ImageExtractor {
989
1114
  reject(err2 || new Error("Failed to open read stream"));
990
1115
  return;
991
1116
  }
992
- (0, import_node_fs.mkdirSync)((0, import_node_path2.join)(entryPath, ".."), { recursive: true });
993
- const writeStream = (0, import_node_fs.createWriteStream)(entryPath);
1117
+ (0, import_node_fs2.mkdirSync)((0, import_node_path2.join)(entryPath, ".."), { recursive: true });
1118
+ const writeStream = (0, import_node_fs2.createWriteStream)(entryPath);
994
1119
  readStream.pipe(writeStream);
995
1120
  writeStream.on("finish", () => {
996
1121
  zipfile.readEntry();
@@ -1006,26 +1131,6 @@ var ImageExtractor = class _ImageExtractor {
1006
1131
  });
1007
1132
  });
1008
1133
  }
1009
- /**
1010
- * Extract base64 images from JSON file using jq (for large files)
1011
- * Returns array of base64 data strings
1012
- */
1013
- static async extractBase64ImagesFromJsonWithJq(jsonPath) {
1014
- return jqExtractBase64PngStrings(jsonPath);
1015
- }
1016
- /**
1017
- * Replace base64 images with file paths in JSON using jq (for large files)
1018
- * Uses reduce to maintain counter state while walking the JSON
1019
- */
1020
- static async replaceBase64ImagesInJsonWithJq(jsonPath, outputPath, dirName, prefix) {
1021
- const { data, count } = await jqReplaceBase64WithPaths(
1022
- jsonPath,
1023
- dirName,
1024
- prefix
1025
- );
1026
- (0, import_node_fs.writeFileSync)(outputPath, JSON.stringify(data, null, 2), "utf-8");
1027
- return count;
1028
- }
1029
1134
  /**
1030
1135
  * Extract a base64-encoded image to a file and return the relative path
1031
1136
  */
@@ -1035,12 +1140,70 @@ var ImageExtractor = class _ImageExtractor {
1035
1140
  const filename = `${prefix}_${index}.png`;
1036
1141
  const filepath = (0, import_node_path2.join)(imagesDir, filename);
1037
1142
  const buffer = Buffer.from(base64Content, "base64");
1038
- (0, import_node_fs.writeFileSync)(filepath, buffer);
1143
+ (0, import_node_fs2.writeFileSync)(filepath, buffer);
1039
1144
  return `${dirName}/${filename}`;
1040
1145
  }
1041
1146
  /**
1042
- * Save JSON and HTML documents with base64 images extracted to separate files
1043
- * Uses jq for JSON processing to handle large files
1147
+ * Extract base64 images from HTML using streaming.
1148
+ * Reads HTML file as a stream, extracts base64 images from src attributes,
1149
+ * saves them as PNG files, and replaces with file paths in the output HTML.
1150
+ * Returns the number of images extracted.
1151
+ */
1152
+ static async extractImagesFromHtmlStream(htmlInputPath, htmlOutputPath, imagesDir) {
1153
+ let imageIndex = 0;
1154
+ let pending = "";
1155
+ const MARKER = 'src="data:image/png;base64,';
1156
+ const transform = new import_node_stream.Transform({
1157
+ decodeStrings: false,
1158
+ encoding: "utf-8",
1159
+ transform(chunk, _encoding, callback) {
1160
+ pending += chunk;
1161
+ let result = "";
1162
+ while (true) {
1163
+ const markerIdx = pending.indexOf(MARKER);
1164
+ if (markerIdx === -1) {
1165
+ const safeEnd = Math.max(0, pending.length - MARKER.length);
1166
+ result += pending.slice(0, safeEnd);
1167
+ pending = pending.slice(safeEnd);
1168
+ break;
1169
+ }
1170
+ result += pending.slice(0, markerIdx);
1171
+ const dataStart = markerIdx + MARKER.length;
1172
+ const quoteIdx = pending.indexOf('"', dataStart);
1173
+ if (quoteIdx === -1) {
1174
+ pending = pending.slice(markerIdx);
1175
+ break;
1176
+ }
1177
+ const base64Content = pending.slice(dataStart, quoteIdx);
1178
+ const filename = `image_${imageIndex}.png`;
1179
+ const filepath = (0, import_node_path2.join)(imagesDir, filename);
1180
+ const buf = Buffer.from(base64Content, "base64");
1181
+ (0, import_node_fs2.writeFileSync)(filepath, buf);
1182
+ const relativePath = `images/${filename}`;
1183
+ result += `src="${relativePath}"`;
1184
+ imageIndex++;
1185
+ pending = pending.slice(quoteIdx + 1);
1186
+ }
1187
+ if (result.length > 0) {
1188
+ this.push(result);
1189
+ }
1190
+ callback();
1191
+ },
1192
+ flush(callback) {
1193
+ if (pending.length > 0) {
1194
+ this.push(pending);
1195
+ }
1196
+ callback();
1197
+ }
1198
+ });
1199
+ const rs = (0, import_node_fs2.createReadStream)(htmlInputPath, { encoding: "utf-8" });
1200
+ const ws = (0, import_node_fs2.createWriteStream)(htmlOutputPath, { encoding: "utf-8" });
1201
+ await (0, import_promises3.pipeline)(rs, transform, ws);
1202
+ return imageIndex;
1203
+ }
1204
+ /**
1205
+ * Save JSON and HTML documents with base64 images extracted to separate files.
1206
+ * Uses jq for JSON processing and streaming for HTML to handle large files.
1044
1207
  *
1045
1208
  * This method:
1046
1209
  * 1. Extracts base64-encoded images from JSON and HTML content
@@ -1048,43 +1211,45 @@ var ImageExtractor = class _ImageExtractor {
1048
1211
  * 3. Replaces base64 data with relative file paths
1049
1212
  * 4. Saves the transformed documents to the output directory
1050
1213
  */
1051
- static async saveDocumentsWithExtractedImages(logger, outputDir, filename, jsonSourcePath, htmlContent) {
1214
+ static async saveDocumentsWithExtractedImages(logger, outputDir, filename, jsonSourcePath, htmlSourcePath) {
1052
1215
  try {
1053
- if ((0, import_node_fs.existsSync)(outputDir)) {
1054
- (0, import_node_fs.rmSync)(outputDir, { recursive: true, force: true });
1216
+ if ((0, import_node_fs2.existsSync)(outputDir)) {
1217
+ (0, import_node_fs2.rmSync)(outputDir, { recursive: true, force: true });
1055
1218
  }
1056
1219
  } catch (e) {
1057
1220
  logger.warn("[PDFConverter] Failed to clear output directory:", e);
1058
1221
  }
1059
- (0, import_node_fs.mkdirSync)(outputDir, { recursive: true });
1222
+ (0, import_node_fs2.mkdirSync)(outputDir, { recursive: true });
1060
1223
  const baseName = filename.replace((0, import_node_path2.extname)(filename), "");
1061
1224
  const jsonPath = (0, import_node_path2.join)(outputDir, `${baseName}.json`);
1062
1225
  try {
1063
1226
  const imagesDir = (0, import_node_path2.join)(outputDir, "images");
1064
- if (!(0, import_node_fs.existsSync)(imagesDir)) {
1065
- (0, import_node_fs.mkdirSync)(imagesDir, { recursive: true });
1066
- }
1067
- const base64Images = await _ImageExtractor.extractBase64ImagesFromJsonWithJq(jsonSourcePath);
1068
- base64Images.forEach((base64Data, index) => {
1069
- _ImageExtractor.extractBase64ImageToFile(
1070
- base64Data,
1071
- imagesDir,
1072
- index,
1073
- "pic",
1074
- "images"
1075
- );
1076
- });
1227
+ if (!(0, import_node_fs2.existsSync)(imagesDir)) {
1228
+ (0, import_node_fs2.mkdirSync)(imagesDir, { recursive: true });
1229
+ }
1230
+ const imageCount = await jqExtractBase64PngStringsStreaming(
1231
+ jsonSourcePath,
1232
+ (base64Data, index) => {
1233
+ _ImageExtractor.extractBase64ImageToFile(
1234
+ base64Data,
1235
+ imagesDir,
1236
+ index,
1237
+ "pic",
1238
+ "images"
1239
+ );
1240
+ }
1241
+ );
1077
1242
  logger.info(
1078
- `[PDFConverter] Extracted ${base64Images.length} picture images from JSON to ${imagesDir}`
1243
+ `[PDFConverter] Extracted ${imageCount} picture images from JSON to ${imagesDir}`
1079
1244
  );
1080
- const replacedCount = await _ImageExtractor.replaceBase64ImagesInJsonWithJq(
1245
+ await jqReplaceBase64WithPathsToFile(
1081
1246
  jsonSourcePath,
1082
1247
  jsonPath,
1083
1248
  "images",
1084
1249
  "pic"
1085
1250
  );
1086
1251
  logger.info(
1087
- `[PDFConverter] Replaced ${replacedCount} base64 images with file paths`
1252
+ `[PDFConverter] Replaced ${imageCount} base64 images with file paths`
1088
1253
  );
1089
1254
  } catch (e) {
1090
1255
  logger.warn(
@@ -1097,51 +1262,45 @@ var ImageExtractor = class _ImageExtractor {
1097
1262
  const htmlPath = (0, import_node_path2.join)(outputDir, `${baseName}.html`);
1098
1263
  try {
1099
1264
  const imagesDir = (0, import_node_path2.join)(outputDir, "images");
1100
- if (!(0, import_node_fs.existsSync)(imagesDir)) {
1101
- (0, import_node_fs.mkdirSync)(imagesDir, { recursive: true });
1102
- }
1103
- let imageIndex = 0;
1104
- const transformedHtml = htmlContent.replace(
1105
- /src="data:image\/png;base64,([^"]+)"/g,
1106
- (_, base64Content) => {
1107
- const filename2 = `image_${imageIndex}.png`;
1108
- const filepath = (0, import_node_path2.join)(imagesDir, filename2);
1109
- const buffer = Buffer.from(base64Content, "base64");
1110
- (0, import_node_fs.writeFileSync)(filepath, buffer);
1111
- const relativePath = `images/${filename2}`;
1112
- imageIndex += 1;
1113
- return `src="${relativePath}"`;
1114
- }
1265
+ if (!(0, import_node_fs2.existsSync)(imagesDir)) {
1266
+ (0, import_node_fs2.mkdirSync)(imagesDir, { recursive: true });
1267
+ }
1268
+ const htmlImageCount = await _ImageExtractor.extractImagesFromHtmlStream(
1269
+ htmlSourcePath,
1270
+ htmlPath,
1271
+ imagesDir
1115
1272
  );
1116
1273
  logger.info(
1117
- `[PDFConverter] Extracted ${imageIndex} images from HTML to ${imagesDir}`
1274
+ `[PDFConverter] Extracted ${htmlImageCount} images from HTML to ${imagesDir}`
1118
1275
  );
1119
- (0, import_node_fs.writeFileSync)(htmlPath, transformedHtml, "utf-8");
1120
1276
  } catch (e) {
1121
1277
  logger.warn(
1122
- "[PDFConverter] Failed to extract images from HTML, writing original. Error:",
1278
+ "[PDFConverter] Failed to extract images from HTML, copying original. Error:",
1123
1279
  e
1124
1280
  );
1125
- (0, import_node_fs.writeFileSync)(htmlPath, htmlContent, "utf-8");
1281
+ const rs = (0, import_node_fs2.createReadStream)(htmlSourcePath);
1282
+ const ws = (0, import_node_fs2.createWriteStream)(htmlPath);
1283
+ await (0, import_promises3.pipeline)(rs, ws);
1126
1284
  }
1127
1285
  logger.info("[PDFConverter] Saved HTML:", htmlPath);
1128
1286
  }
1129
1287
  /**
1130
1288
  * Extract documents from ZIP and save with extracted images
1131
- * Uses jq for JSON processing to handle large files without loading into Node.js memory
1289
+ * Uses jq for JSON processing and streaming for HTML to handle large files
1290
+ * without loading into Node.js memory
1132
1291
  *
1133
1292
  * Complete workflow:
1134
1293
  * 1. Extract ZIP file to temporary directory
1135
1294
  * 2. Find JSON and HTML files from extracted files
1136
- * 3. Use jq to extract base64 images from JSON and save as separate files
1137
- * 4. Use jq to replace base64 with file paths in JSON
1138
- * 5. Process HTML with regex to extract and replace images
1295
+ * 3. Use jq to stream-extract base64 images from JSON and save as separate files
1296
+ * 4. Use jq to replace base64 with file paths in JSON (piped to file)
1297
+ * 5. Process HTML with streaming Transform to extract and replace images
1139
1298
  * 6. Save transformed documents to output directory (as result.json and result.html)
1140
1299
  */
1141
1300
  static async extractAndSaveDocumentsFromZip(logger, zipPath, extractDir, outputDir) {
1142
1301
  logger.info("[PDFConverter] Extracting ZIP file...");
1143
1302
  await _ImageExtractor.extractZip(zipPath, extractDir);
1144
- const files = (0, import_node_fs.readdirSync)(extractDir);
1303
+ const files = (0, import_node_fs2.readdirSync)(extractDir);
1145
1304
  const jsonFile = files.find((f) => (0, import_node_path2.extname)(f).toLowerCase() === ".json");
1146
1305
  const htmlFile = files.find((f) => (0, import_node_path2.extname)(f).toLowerCase() === ".html");
1147
1306
  if (!jsonFile || !htmlFile) {
@@ -1151,23 +1310,22 @@ var ImageExtractor = class _ImageExtractor {
1151
1310
  }
1152
1311
  const jsonPath = (0, import_node_path2.join)(extractDir, jsonFile);
1153
1312
  const htmlPath = (0, import_node_path2.join)(extractDir, htmlFile);
1154
- const htmlContent = (0, import_node_fs.readFileSync)(htmlPath, "utf-8");
1155
1313
  logger.info("[PDFConverter] Saving converted files to output...");
1156
1314
  await _ImageExtractor.saveDocumentsWithExtractedImages(
1157
1315
  logger,
1158
1316
  outputDir,
1159
1317
  "result",
1160
1318
  jsonPath,
1161
- htmlContent
1319
+ htmlPath
1162
1320
  );
1163
1321
  logger.info("[PDFConverter] Files saved to:", outputDir);
1164
1322
  }
1165
1323
  };
1166
1324
 
1167
1325
  // src/processors/page-renderer.ts
1168
- var import_node_fs2 = require("fs");
1326
+ var import_node_fs3 = require("fs");
1169
1327
  var import_node_path3 = require("path");
1170
- var DEFAULT_DPI = 300;
1328
+ var PROGRESS_POLL_INTERVAL_MS = 2e3;
1171
1329
  var PageRenderer = class {
1172
1330
  constructor(logger) {
1173
1331
  this.logger = logger;
@@ -1181,31 +1339,62 @@ var PageRenderer = class {
1181
1339
  * @returns Render result with page count and file paths
1182
1340
  */
1183
1341
  async renderPages(pdfPath, outputDir, options) {
1184
- const dpi = options?.dpi ?? DEFAULT_DPI;
1342
+ const dpi = options?.dpi ?? PAGE_RENDERING.DEFAULT_DPI;
1185
1343
  const pagesDir = (0, import_node_path3.join)(outputDir, "pages");
1186
- if (!(0, import_node_fs2.existsSync)(pagesDir)) {
1187
- (0, import_node_fs2.mkdirSync)(pagesDir, { recursive: true });
1344
+ if (!(0, import_node_fs3.existsSync)(pagesDir)) {
1345
+ (0, import_node_fs3.mkdirSync)(pagesDir, { recursive: true });
1188
1346
  }
1189
- this.logger.info(`[PageRenderer] Rendering PDF at ${dpi} DPI...`);
1190
- const outputPattern = (0, import_node_path3.join)(pagesDir, "page_%d.png");
1191
- const result = await spawnAsync("magick", [
1192
- "-density",
1193
- dpi.toString(),
1194
- pdfPath,
1195
- "-background",
1196
- "white",
1197
- "-alpha",
1198
- "remove",
1199
- "-alpha",
1200
- "off",
1201
- outputPattern
1202
- ]);
1203
- if (result.code !== 0) {
1204
- throw new Error(
1205
- `[PageRenderer] Failed to render PDF pages: ${result.stderr || "Unknown error"}`
1347
+ const totalPages = await this.getPageCount(pdfPath);
1348
+ if (totalPages > 0) {
1349
+ this.logger.info(
1350
+ `[PageRenderer] Rendering ${totalPages} pages at ${dpi} DPI...`
1206
1351
  );
1352
+ } else {
1353
+ this.logger.info(`[PageRenderer] Rendering PDF at ${dpi} DPI...`);
1354
+ }
1355
+ const outputPattern = (0, import_node_path3.join)(pagesDir, "page_%d.png");
1356
+ let progressInterval = null;
1357
+ if (totalPages > 0) {
1358
+ let lastLoggedCount = 0;
1359
+ progressInterval = setInterval(() => {
1360
+ try {
1361
+ const rendered = (0, import_node_fs3.readdirSync)(pagesDir).filter(
1362
+ (f) => f.startsWith("page_") && f.endsWith(".png")
1363
+ ).length;
1364
+ if (rendered > 0 && rendered !== lastLoggedCount) {
1365
+ lastLoggedCount = rendered;
1366
+ this.logger.info(
1367
+ `[PageRenderer] Rendering pages: ${rendered}/${totalPages}`
1368
+ );
1369
+ }
1370
+ } catch {
1371
+ }
1372
+ }, PROGRESS_POLL_INTERVAL_MS);
1373
+ }
1374
+ try {
1375
+ const result = await spawnAsync("magick", [
1376
+ "-density",
1377
+ dpi.toString(),
1378
+ pdfPath,
1379
+ "-background",
1380
+ "white",
1381
+ "-alpha",
1382
+ "remove",
1383
+ "-alpha",
1384
+ "off",
1385
+ outputPattern
1386
+ ]);
1387
+ if (result.code !== 0) {
1388
+ throw new Error(
1389
+ `[PageRenderer] Failed to render PDF pages: ${result.stderr || "Unknown error"}`
1390
+ );
1391
+ }
1392
+ } finally {
1393
+ if (progressInterval) {
1394
+ clearInterval(progressInterval);
1395
+ }
1207
1396
  }
1208
- const pageFiles = (0, import_node_fs2.readdirSync)(pagesDir).filter((f) => f.startsWith("page_") && f.endsWith(".png")).sort((a, b) => {
1397
+ const pageFiles = (0, import_node_fs3.readdirSync)(pagesDir).filter((f) => f.startsWith("page_") && f.endsWith(".png")).sort((a, b) => {
1209
1398
  const numA = parseInt(a.replace("page_", "").replace(".png", ""), 10);
1210
1399
  const numB = parseInt(b.replace("page_", "").replace(".png", ""), 10);
1211
1400
  return numA - numB;
@@ -1219,6 +1408,20 @@ var PageRenderer = class {
1219
1408
  pageFiles
1220
1409
  };
1221
1410
  }
1411
+ /**
1412
+ * Get total page count using pdfinfo.
1413
+ * Returns 0 on failure (progress logging will be skipped).
1414
+ */
1415
+ async getPageCount(pdfPath) {
1416
+ try {
1417
+ const result = await spawnAsync("pdfinfo", [pdfPath]);
1418
+ if (result.code !== 0) return 0;
1419
+ const match = result.stdout.match(/^Pages:\s+(\d+)/m);
1420
+ return match ? parseInt(match[1], 10) : 0;
1421
+ } catch {
1422
+ return 0;
1423
+ }
1424
+ }
1222
1425
  };
1223
1426
 
1224
1427
  // src/processors/pdf-text-extractor.ts
@@ -1304,7 +1507,7 @@ var PdfTextExtractor = class {
1304
1507
  };
1305
1508
 
1306
1509
  // src/processors/vlm-text-corrector.ts
1307
- var import_node_fs3 = require("fs");
1510
+ var import_node_fs4 = require("fs");
1308
1511
  var import_node_path4 = require("path");
1309
1512
 
1310
1513
  // src/types/vlm-text-correction-schema.ts
@@ -1436,7 +1639,7 @@ var VlmTextCorrector = class {
1436
1639
  async correctAndSave(outputDir, model, options) {
1437
1640
  this.logger.info("[VlmTextCorrector] Starting text correction...");
1438
1641
  const resultPath = (0, import_node_path4.join)(outputDir, "result.json");
1439
- const doc = JSON.parse((0, import_node_fs3.readFileSync)(resultPath, "utf-8"));
1642
+ const doc = JSON.parse((0, import_node_fs4.readFileSync)(resultPath, "utf-8"));
1440
1643
  let pageNumbers = this.getPageNumbers(doc);
1441
1644
  if (pageNumbers.length === 0) {
1442
1645
  this.logger.info("[VlmTextCorrector] No pages to process");
@@ -1487,7 +1690,7 @@ var VlmTextCorrector = class {
1487
1690
  if (corrections === null) continue;
1488
1691
  this.applyCorrections(doc, pageNumbers[i], corrections);
1489
1692
  }
1490
- (0, import_node_fs3.writeFileSync)(resultPath, JSON.stringify(doc, null, 2));
1693
+ (0, import_node_fs4.writeFileSync)(resultPath, JSON.stringify(doc, null, 2));
1491
1694
  this.logger.info(
1492
1695
  `[VlmTextCorrector] Correction complete: ${totalTextCorrections} text, ${totalCellCorrections} cell corrections across ${pageNumbers.length} pages (${pagesFailed} failed)`
1493
1696
  );
@@ -1763,7 +1966,7 @@ var VlmTextCorrector = class {
1763
1966
  */
1764
1967
  readPageImage(outputDir, pageNo) {
1765
1968
  const imagePath = (0, import_node_path4.join)(outputDir, "pages", `page_${pageNo - 1}.png`);
1766
- return (0, import_node_fs3.readFileSync)(imagePath).toString("base64");
1969
+ return (0, import_node_fs4.readFileSync)(imagePath).toString("base64");
1767
1970
  }
1768
1971
  /**
1769
1972
  * Apply VLM corrections to the DoclingDocument.
@@ -1818,9 +2021,8 @@ var VlmTextCorrector = class {
1818
2021
 
1819
2022
  // src/samplers/ocr-strategy-sampler.ts
1820
2023
  var import_model = require("@heripo/model");
1821
- var import_node_fs4 = require("fs");
2024
+ var import_node_fs5 = require("fs");
1822
2025
  var import_v42 = require("zod/v4");
1823
- var SAMPLE_DPI = 150;
1824
2026
  var EDGE_TRIM_RATIO = 0.1;
1825
2027
  var DEFAULT_MAX_SAMPLE_PAGES = 15;
1826
2028
  var DEFAULT_MAX_RETRIES2 = 3;
@@ -1871,7 +2073,7 @@ var OcrStrategySampler = class {
1871
2073
  const renderResult = await this.pageRenderer.renderPages(
1872
2074
  pdfPath,
1873
2075
  outputDir,
1874
- { dpi: SAMPLE_DPI }
2076
+ { dpi: PAGE_RENDERING.SAMPLE_DPI }
1875
2077
  );
1876
2078
  if (renderResult.pageCount === 0) {
1877
2079
  this.logger.info("[OcrStrategySampler] No pages found in PDF");
@@ -2033,7 +2235,7 @@ var OcrStrategySampler = class {
2033
2235
  this.logger.debug(
2034
2236
  `[OcrStrategySampler] Analyzing page ${pageNo} for Korean-Hanja mix and language...`
2035
2237
  );
2036
- const base64Image = (0, import_node_fs4.readFileSync)(pageFile).toString("base64");
2238
+ const base64Image = (0, import_node_fs5.readFileSync)(pageFile).toString("base64");
2037
2239
  const messages = [
2038
2240
  {
2039
2241
  role: "user",
@@ -2081,7 +2283,7 @@ var OcrStrategySampler = class {
2081
2283
  };
2082
2284
 
2083
2285
  // src/utils/local-file-server.ts
2084
- var import_node_fs5 = require("fs");
2286
+ var import_node_fs6 = require("fs");
2085
2287
  var import_node_http = require("http");
2086
2288
  var import_node_path5 = require("path");
2087
2289
  var LocalFileServer = class {
@@ -2095,7 +2297,7 @@ var LocalFileServer = class {
2095
2297
  */
2096
2298
  async start(filePath) {
2097
2299
  const filename = (0, import_node_path5.basename)(filePath);
2098
- const stat = (0, import_node_fs5.statSync)(filePath);
2300
+ const stat = (0, import_node_fs6.statSync)(filePath);
2099
2301
  return new Promise((resolve, reject) => {
2100
2302
  this.server = (0, import_node_http.createServer)((req, res) => {
2101
2303
  if (req.url === `/${filename}`) {
@@ -2103,7 +2305,7 @@ var LocalFileServer = class {
2103
2305
  "Content-Type": "application/pdf",
2104
2306
  "Content-Length": stat.size
2105
2307
  });
2106
- (0, import_node_fs5.createReadStream)(filePath).pipe(res);
2308
+ (0, import_node_fs6.createReadStream)(filePath).pipe(res);
2107
2309
  } else {
2108
2310
  res.writeHead(404);
2109
2311
  res.end("Not Found");
@@ -2140,7 +2342,7 @@ var LocalFileServer = class {
2140
2342
  };
2141
2343
 
2142
2344
  // src/core/image-pdf-converter.ts
2143
- var import_node_fs6 = require("fs");
2345
+ var import_node_fs7 = require("fs");
2144
2346
  var import_node_os = require("os");
2145
2347
  var import_node_path6 = require("path");
2146
2348
  var ImagePdfConverter = class {
@@ -2168,8 +2370,8 @@ var ImagePdfConverter = class {
2168
2370
  this.logger.info("[ImagePdfConverter] Image PDF created:", outputPath);
2169
2371
  return outputPath;
2170
2372
  } finally {
2171
- if ((0, import_node_fs6.existsSync)(inputPath)) {
2172
- (0, import_node_fs6.rmSync)(inputPath, { force: true });
2373
+ if ((0, import_node_fs7.existsSync)(inputPath)) {
2374
+ (0, import_node_fs7.rmSync)(inputPath, { force: true });
2173
2375
  }
2174
2376
  }
2175
2377
  }
@@ -2216,12 +2418,12 @@ var ImagePdfConverter = class {
2216
2418
  * Cleanup the temporary image PDF file
2217
2419
  */
2218
2420
  cleanup(imagePdfPath) {
2219
- if ((0, import_node_fs6.existsSync)(imagePdfPath)) {
2421
+ if ((0, import_node_fs7.existsSync)(imagePdfPath)) {
2220
2422
  this.logger.info(
2221
2423
  "[ImagePdfConverter] Cleaning up temp file:",
2222
2424
  imagePdfPath
2223
2425
  );
2224
- (0, import_node_fs6.rmSync)(imagePdfPath, { force: true });
2426
+ (0, import_node_fs7.rmSync)(imagePdfPath, { force: true });
2225
2427
  }
2226
2428
  }
2227
2429
  };
@@ -2365,8 +2567,8 @@ var PDFConverter = class {
2365
2567
  }
2366
2568
  return strategy;
2367
2569
  } finally {
2368
- if ((0, import_node_fs7.existsSync)(samplingDir)) {
2369
- (0, import_node_fs7.rmSync)(samplingDir, { recursive: true, force: true });
2570
+ if ((0, import_node_fs8.existsSync)(samplingDir)) {
2571
+ (0, import_node_fs8.rmSync)(samplingDir, { recursive: true, force: true });
2370
2572
  }
2371
2573
  }
2372
2574
  }
@@ -2388,8 +2590,10 @@ var PDFConverter = class {
2388
2590
  let pageTexts;
2389
2591
  try {
2390
2592
  const resultPath2 = (0, import_node_path7.join)(outputDir, "result.json");
2391
- const doc = JSON.parse((0, import_node_fs7.readFileSync)(resultPath2, "utf-8"));
2392
- const totalPages = Object.keys(doc.pages).length;
2593
+ const totalPages = await runJqFileJson(
2594
+ ".pages | length",
2595
+ resultPath2
2596
+ );
2393
2597
  const textExtractor = new PdfTextExtractor(this.logger);
2394
2598
  pageTexts = await textExtractor.extractText(pdfPath, totalPages);
2395
2599
  } catch {
@@ -2399,7 +2603,7 @@ var PDFConverter = class {
2399
2603
  }
2400
2604
  const resultPath = (0, import_node_path7.join)(outputDir, "result.json");
2401
2605
  const ocrOriginPath = (0, import_node_path7.join)(outputDir, "result_ocr_origin.json");
2402
- (0, import_node_fs7.copyFileSync)(resultPath, ocrOriginPath);
2606
+ (0, import_node_fs8.copyFileSync)(resultPath, ocrOriginPath);
2403
2607
  const corrector = new VlmTextCorrector(this.logger);
2404
2608
  await corrector.correctAndSave(outputDir, options.vlmProcessorModel, {
2405
2609
  concurrency: options.vlmConcurrency,
@@ -2560,19 +2764,19 @@ var PDFConverter = class {
2560
2764
  this.logger.info("[PDFConverter] Total time:", duration, "ms");
2561
2765
  } finally {
2562
2766
  this.logger.info("[PDFConverter] Cleaning up temporary files...");
2563
- if ((0, import_node_fs7.existsSync)(zipPath)) {
2564
- (0, import_node_fs7.rmSync)(zipPath, { force: true });
2767
+ if ((0, import_node_fs8.existsSync)(zipPath)) {
2768
+ (0, import_node_fs8.rmSync)(zipPath, { force: true });
2565
2769
  }
2566
- if ((0, import_node_fs7.existsSync)(extractDir)) {
2567
- (0, import_node_fs7.rmSync)(extractDir, { recursive: true, force: true });
2770
+ if ((0, import_node_fs8.existsSync)(extractDir)) {
2771
+ (0, import_node_fs8.rmSync)(extractDir, { recursive: true, force: true });
2568
2772
  }
2569
2773
  if (cleanupAfterCallback) {
2570
2774
  this.logger.info(
2571
2775
  "[PDFConverter] Cleaning up output directory:",
2572
2776
  outputDir
2573
2777
  );
2574
- if ((0, import_node_fs7.existsSync)(outputDir)) {
2575
- (0, import_node_fs7.rmSync)(outputDir, { recursive: true, force: true });
2778
+ if ((0, import_node_fs8.existsSync)(outputDir)) {
2779
+ (0, import_node_fs8.rmSync)(outputDir, { recursive: true, force: true });
2576
2780
  }
2577
2781
  } else {
2578
2782
  this.logger.info("[PDFConverter] Output preserved at:", outputDir);
@@ -2721,12 +2925,12 @@ var PDFConverter = class {
2721
2925
  const zipPath = (0, import_node_path7.join)(process.cwd(), "result.zip");
2722
2926
  this.logger.info("[PDFConverter] Saving ZIP file to:", zipPath);
2723
2927
  if (zipResult.fileStream) {
2724
- const writeStream = (0, import_node_fs7.createWriteStream)(zipPath);
2725
- await (0, import_promises2.pipeline)(zipResult.fileStream, writeStream);
2928
+ const writeStream = (0, import_node_fs8.createWriteStream)(zipPath);
2929
+ await (0, import_promises5.pipeline)(zipResult.fileStream, writeStream);
2726
2930
  return;
2727
2931
  }
2728
2932
  if (zipResult.data) {
2729
- await (0, import_promises.writeFile)(zipPath, zipResult.data);
2933
+ await (0, import_promises4.writeFile)(zipPath, zipResult.data);
2730
2934
  return;
2731
2935
  }
2732
2936
  this.logger.warn(
@@ -2742,7 +2946,7 @@ var PDFConverter = class {
2742
2946
  );
2743
2947
  }
2744
2948
  const buffer = new Uint8Array(await response.arrayBuffer());
2745
- await (0, import_promises.writeFile)(zipPath, buffer);
2949
+ await (0, import_promises4.writeFile)(zipPath, buffer);
2746
2950
  }
2747
2951
  async processConvertedFiles(zipPath, extractDir, outputDir) {
2748
2952
  await ImageExtractor.extractAndSaveDocumentsFromZip(
@@ -2754,6 +2958,7 @@ var PDFConverter = class {
2754
2958
  }
2755
2959
  /**
2756
2960
  * Render page images from the source PDF using ImageMagick and update result.json.
2961
+ * Uses jq to update the JSON file without loading it into Node.js memory.
2757
2962
  * Replaces Docling's generate_page_images which fails on large PDFs
2758
2963
  * due to memory limits when embedding all page images as base64.
2759
2964
  */
@@ -2771,17 +2976,18 @@ var PDFConverter = class {
2771
2976
  const renderer = new PageRenderer(this.logger);
2772
2977
  const renderResult = await renderer.renderPages(pdfPath, outputDir);
2773
2978
  const resultPath = (0, import_node_path7.join)(outputDir, "result.json");
2774
- const doc = JSON.parse((0, import_node_fs7.readFileSync)(resultPath, "utf-8"));
2775
- for (const page of Object.values(doc.pages)) {
2776
- const pageNo = page.page_no;
2777
- const fileIndex = pageNo - 1;
2778
- if (fileIndex >= 0 && fileIndex < renderResult.pageCount) {
2779
- page.image.uri = `pages/page_${fileIndex}.png`;
2780
- page.image.mimetype = "image/png";
2781
- page.image.dpi = 300;
2782
- }
2783
- }
2784
- await (0, import_promises.writeFile)(resultPath, JSON.stringify(doc, null, 2));
2979
+ const tmpPath = resultPath + ".tmp";
2980
+ const jqProgram = `
2981
+ .pages |= with_entries(
2982
+ if (.value.page_no - 1) >= 0 and (.value.page_no - 1) < ${renderResult.pageCount} then
2983
+ .value.image.uri = "pages/page_\\(.value.page_no - 1).png" |
2984
+ .value.image.mimetype = "image/png" |
2985
+ .value.image.dpi = ${PAGE_RENDERING.DEFAULT_DPI}
2986
+ else . end
2987
+ )
2988
+ `;
2989
+ await runJqFileToFile(jqProgram, resultPath, tmpPath);
2990
+ await (0, import_promises4.rename)(tmpPath, resultPath);
2785
2991
  this.logger.info(
2786
2992
  `[PDFConverter] Rendered ${renderResult.pageCount} page images`
2787
2993
  );