@heripo/pdf-parser 0.1.9 → 0.1.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -43,11 +43,17 @@ var DOCLING_ENVIRONMENT = {
43
43
  */
44
44
  STARTUP_DELAY_MS: 2e3
45
45
  };
46
/**
 * DPI presets shared by the page-rendering code paths.
 */
var PAGE_RENDERING = {
  // DPI used when the caller does not request one (tuned for VLM text recognition quality).
  DEFAULT_DPI: 200,
  // Reduced DPI used when rendering pages for OCR strategy sampling.
  SAMPLE_DPI: 150
};
46
52
  var IMAGE_PDF_CONVERTER = {
47
53
  /**
48
54
  * ImageMagick density option (DPI) for PDF to image conversion
49
55
  */
50
- DENSITY: 300,
56
+ DENSITY: PAGE_RENDERING.DEFAULT_DPI,
51
57
  /**
52
58
  * ImageMagick quality option (1-100)
53
59
  */
@@ -837,16 +843,10 @@ var DoclingEnvironment = class _DoclingEnvironment {
837
843
 
838
844
  // src/core/pdf-converter.ts
839
845
  import { omit } from "es-toolkit";
840
- import {
841
- copyFileSync,
842
- createWriteStream as createWriteStream2,
843
- existsSync as existsSync4,
844
- readFileSync as readFileSync4,
845
- rmSync as rmSync3
846
- } from "fs";
847
- import { writeFile } from "fs/promises";
846
+ import { copyFileSync, createWriteStream as createWriteStream3, existsSync as existsSync4, rmSync as rmSync3 } from "fs";
847
+ import { rename as rename2, writeFile } from "fs/promises";
848
848
  import { join as join6 } from "path";
849
- import { pipeline } from "stream/promises";
849
+ import { pipeline as pipeline3 } from "stream/promises";
850
850
 
851
851
  // src/errors/image-pdf-fallback-error.ts
852
852
  var ImagePdfFallbackError = class extends Error {
@@ -862,19 +862,24 @@ var ImagePdfFallbackError = class extends Error {
862
862
 
863
863
  // src/processors/image-extractor.ts
864
864
  import {
865
- createWriteStream,
865
+ createReadStream,
866
+ createWriteStream as createWriteStream2,
866
867
  existsSync,
867
868
  mkdirSync,
868
- readFileSync,
869
869
  readdirSync,
870
870
  rmSync,
871
871
  writeFileSync
872
872
  } from "fs";
873
873
  import { extname, join as join2 } from "path";
874
+ import { Transform } from "stream";
875
+ import { pipeline as pipeline2 } from "stream/promises";
874
876
  import * as yauzl from "yauzl";
875
877
 
876
878
  // src/utils/jq.ts
877
879
  import { spawn as spawn3 } from "child_process";
880
+ import { createWriteStream } from "fs";
881
+ import { rename } from "fs/promises";
882
+ import { pipeline } from "stream/promises";
878
883
  function getJqPath() {
879
884
  const p = process.env.JQ_PATH?.trim();
880
885
  return p && p.length > 0 ? p : "jq";
@@ -926,25 +931,139 @@ function runJqFileJson(program, filePath) {
926
931
  });
927
932
  });
928
933
  }
929
- function jqExtractBase64PngStrings(filePath) {
930
- const program = `
931
- [
932
- .. |
933
- select(type == "string" and startswith("data:image/png;base64"))
934
- ]
935
- `;
936
- return runJqFileJson(program, filePath);
934
/**
 * Run a jq program against a JSON file and stream jq's stdout directly into
 * `outputPath`, so the result never has to be held in Node.js memory.
 *
 * The promise resolves only after BOTH the stdout -> file pipeline has
 * finished and the jq process has closed with exit code 0.
 *
 * On any failure (jq cannot be spawned, the write fails, jq exits non-zero or
 * is killed by a signal) jq is stopped, the partially written output file is
 * removed, and the promise rejects. Callers therefore never find a truncated
 * file at `outputPath`.
 *
 * @param {string} program - jq filter to execute
 * @param {string} inputPath - path of the JSON file jq reads
 * @param {string} outputPath - path of the file that receives jq's stdout
 * @returns {Promise<void>}
 */
function runJqFileToFile(program, inputPath, outputPath) {
  return new Promise((resolve, reject) => {
    const child = spawn3(getJqPath(), [program, inputPath], {
      stdio: ["ignore", "pipe", "pipe"],
      env: process.env
    });
    const ws = createWriteStream(outputPath);
    let stderr = "";
    let exitCode = null;
    let exitSignal = null;
    let childClosed = false;
    let pipelineDone = false;
    let wsClosed = false;
    let settled = false;

    // Track closure ourselves so the failure path knows whether the file
    // descriptor is already released before unlinking the file.
    ws.once("close", () => {
      wsClosed = true;
    });
    child.stderr.setEncoding("utf-8");
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
    });

    function fail(err) {
      if (settled) return;
      settled = true;
      // No-op when jq never started or has already exited.
      child.kill();
      const discardAndReject = () => {
        try {
          // rmSync comes from the bundle's existing "fs" import.
          rmSync(outputPath, { force: true });
        } catch {
          // Best effort: the original error is more useful to the caller.
        }
        reject(err);
      };
      if (wsClosed) {
        discardAndReject();
      } else {
        // Unlink only after the stream has closed; an open() still in flight
        // would otherwise re-create the file after we removed it.
        ws.once("close", discardAndReject);
        ws.destroy();
      }
    }

    function trySettle() {
      if (settled || !pipelineDone || !childClosed) return;
      if (exitCode === 0) {
        settled = true;
        resolve();
        return;
      }
      const how = exitCode === null ? `was terminated by signal ${exitSignal}` : `exited with code ${exitCode}`;
      fail(new Error(`jq ${how}. ${stderr ? "Stderr: " + stderr : ""}`));
    }

    child.on("error", fail);
    pipeline(child.stdout, ws).then(() => {
      pipelineDone = true;
      trySettle();
    }).catch(fail);
    child.on("close", (code, signal) => {
      exitCode = code;
      exitSignal = signal;
      childClosed = true;
      trySettle();
    });
  });
}
938
- function jqReplaceBase64WithPaths(filePath, dirName, prefix) {
985
/**
 * Run a jq program in raw-output mode (`-r`) against a file and hand every
 * non-empty output line to `onLine` as soon as it is complete.
 *
 * Only the newly received chunk is searched for newlines; fragments of a
 * line that spans several chunks are kept in a list and joined once, so a
 * multi-megabyte line (e.g. a base64 image) is not re-scanned on every chunk.
 *
 * If `onLine` throws, jq is killed and the promise rejects with that error.
 * The promise also rejects when jq cannot be spawned, exits non-zero, or is
 * terminated by a signal. It settles exactly once.
 *
 * @param {string} program - jq filter to execute
 * @param {string} filePath - path of the JSON file jq reads
 * @param {(line: string) => void} onLine - synchronous per-line callback
 * @returns {Promise<void>}
 */
function runJqFileLines(program, filePath, onLine) {
  return new Promise((resolve, reject) => {
    const child = spawn3(getJqPath(), ["-r", program, filePath], {
      stdio: ["ignore", "pipe", "pipe"],
      env: process.env
    });
    let stderr = "";
    let settled = false;
    // Pieces of the current, not yet newline-terminated line.
    const carry = [];

    const fail = (err) => {
      if (settled) return;
      settled = true;
      reject(err);
    };
    const deliver = (line) => {
      if (settled || line.length === 0) return;
      try {
        onLine(line);
      } catch (err) {
        child.kill();
        fail(err);
      }
    };

    child.stdout.setEncoding("utf-8");
    child.stderr.setEncoding("utf-8");
    child.stdout.on("data", (chunk) => {
      if (settled) return;
      let start = 0;
      let newlineIdx;
      while ((newlineIdx = chunk.indexOf("\n", start)) !== -1) {
        carry.push(chunk.slice(start, newlineIdx));
        deliver(carry.length === 1 ? carry[0] : carry.join(""));
        carry.length = 0;
        start = newlineIdx + 1;
      }
      if (start < chunk.length) {
        carry.push(chunk.slice(start));
      }
    });
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
    });
    child.on("error", fail);
    child.on("close", (code, signal) => {
      if (settled) return;
      // Output that did not end with a newline still counts as a line.
      if (carry.length > 0) {
        deliver(carry.join(""));
      }
      if (settled) return;
      if (code !== 0) {
        const how = code === null ? `was terminated by signal ${signal}` : `exited with code ${code}`;
        fail(new Error(`jq ${how}. ${stderr ? "Stderr: " + stderr : ""}`));
        return;
      }
      settled = true;
      resolve();
    });
  });
}
1043
/**
 * Stream every string value that starts with "data:image/png;base64" out of
 * a JSON file (via runJqFileLines) and pass each one to `onImage` together
 * with its zero-based position.
 *
 * @param {string} filePath - JSON file to scan
 * @param {(dataUri: string, index: number) => void} onImage - per-image callback
 * @returns {Promise<number>} how many strings were passed to `onImage`
 */
async function jqExtractBase64PngStringsStreaming(filePath, onImage) {
  const pngDataUriFilter = '.. | select(type == "string" and startswith("data:image/png;base64"))';
  let delivered = 0;
  const forward = (dataUri) => {
    onImage(dataUri, delivered);
    // Counted only after the callback returned normally.
    delivered += 1;
  };
  await runJqFileLines(pngDataUriFilter, filePath, forward);
  return delivered;
}
1055
/**
 * Replace every base64 PNG data-URI string in a JSON file with a relative
 * file path of the form `<dirName>/<prefix>_<n>.png` (n counts from 0 in jq
 * path order) and write the resulting document to `outputPath`.
 *
 * jq writes to `<outputPath>.tmp`, which is renamed over `outputPath` only
 * after jq succeeded; on failure the temporary file is removed and the error
 * is rethrown.
 *
 * @param {string} inputPath - source JSON file
 * @param {string} outputPath - destination JSON file
 * @param {string} dirName - directory part of the generated paths
 * @param {string} prefix - file-name prefix of the generated paths
 * @returns {Promise<void>}
 */
async function jqReplaceBase64WithPathsToFile(inputPath, outputPath, dirName, prefix) {
  // JSON string syntax is valid jq string syntax and never produces the
  // "\(" interpolation sequence, so quotes or backslashes in dirName/prefix
  // cannot break out of the literal or inject jq code.
  const pathStem = JSON.stringify(`${dirName}/${prefix}_`);
  const program = `
    reduce paths(type == "string" and startswith("data:image/png;base64")) as $p (
      {data: ., counter: 0};
      .counter as $idx |
      .data |= setpath($p; ${pathStem} + ($idx | tostring) + ".png") |
      .counter += 1
    ) | .data
  `;
  const tmpPath = outputPath + ".tmp";
  try {
    await runJqFileToFile(program, inputPath, tmpPath);
    await rename(tmpPath, outputPath);
  } catch (err) {
    try {
      // rmSync comes from the bundle's existing "fs" import.
      rmSync(tmpPath, { force: true });
    } catch {
      // Best effort: report the original failure, not the cleanup failure.
    }
    throw err;
  }
}
949
1068
 
950
1069
  // src/processors/image-extractor.ts
@@ -972,7 +1091,7 @@ var ImageExtractor = class _ImageExtractor {
972
1091
  return;
973
1092
  }
974
1093
  mkdirSync(join2(entryPath, ".."), { recursive: true });
975
- const writeStream = createWriteStream(entryPath);
1094
+ const writeStream = createWriteStream2(entryPath);
976
1095
  readStream.pipe(writeStream);
977
1096
  writeStream.on("finish", () => {
978
1097
  zipfile.readEntry();
@@ -988,26 +1107,6 @@ var ImageExtractor = class _ImageExtractor {
988
1107
  });
989
1108
  });
990
1109
  }
991
- /**
992
- * Extract base64 images from JSON file using jq (for large files)
993
- * Returns array of base64 data strings
994
- */
995
- static async extractBase64ImagesFromJsonWithJq(jsonPath) {
996
- return jqExtractBase64PngStrings(jsonPath);
997
- }
998
- /**
999
- * Replace base64 images with file paths in JSON using jq (for large files)
1000
- * Uses reduce to maintain counter state while walking the JSON
1001
- */
1002
- static async replaceBase64ImagesInJsonWithJq(jsonPath, outputPath, dirName, prefix) {
1003
- const { data, count } = await jqReplaceBase64WithPaths(
1004
- jsonPath,
1005
- dirName,
1006
- prefix
1007
- );
1008
- writeFileSync(outputPath, JSON.stringify(data, null, 2), "utf-8");
1009
- return count;
1010
- }
1011
1110
  /**
1012
1111
  * Extract a base64-encoded image to a file and return the relative path
1013
1112
  */
@@ -1021,8 +1120,66 @@ var ImageExtractor = class _ImageExtractor {
1021
1120
  return `${dirName}/${filename}`;
1022
1121
  }
1023
1122
  /**
1024
- * Save JSON and HTML documents with base64 images extracted to separate files
1025
- * Uses jq for JSON processing to handle large files
1123
+ * Extract base64 images from HTML using streaming.
1124
+ * Reads HTML file as a stream, extracts base64 images from src attributes,
1125
+ * saves them as PNG files, and replaces with file paths in the output HTML.
1126
+ * Returns the number of images extracted.
1127
+ */
1128
+ static async extractImagesFromHtmlStream(htmlInputPath, htmlOutputPath, imagesDir) {
1129
+ let imageIndex = 0;
1130
+ let pending = "";
1131
+ const MARKER = 'src="data:image/png;base64,';
1132
+ const transform = new Transform({
1133
+ decodeStrings: false,
1134
+ encoding: "utf-8",
1135
+ transform(chunk, _encoding, callback) {
1136
+ pending += chunk;
1137
+ let result = "";
1138
+ while (true) {
1139
+ const markerIdx = pending.indexOf(MARKER);
1140
+ if (markerIdx === -1) {
1141
+ const safeEnd = Math.max(0, pending.length - MARKER.length);
1142
+ result += pending.slice(0, safeEnd);
1143
+ pending = pending.slice(safeEnd);
1144
+ break;
1145
+ }
1146
+ result += pending.slice(0, markerIdx);
1147
+ const dataStart = markerIdx + MARKER.length;
1148
+ const quoteIdx = pending.indexOf('"', dataStart);
1149
+ if (quoteIdx === -1) {
1150
+ pending = pending.slice(markerIdx);
1151
+ break;
1152
+ }
1153
+ const base64Content = pending.slice(dataStart, quoteIdx);
1154
+ const filename = `image_${imageIndex}.png`;
1155
+ const filepath = join2(imagesDir, filename);
1156
+ const buf = Buffer.from(base64Content, "base64");
1157
+ writeFileSync(filepath, buf);
1158
+ const relativePath = `images/${filename}`;
1159
+ result += `src="${relativePath}"`;
1160
+ imageIndex++;
1161
+ pending = pending.slice(quoteIdx + 1);
1162
+ }
1163
+ if (result.length > 0) {
1164
+ this.push(result);
1165
+ }
1166
+ callback();
1167
+ },
1168
+ flush(callback) {
1169
+ if (pending.length > 0) {
1170
+ this.push(pending);
1171
+ }
1172
+ callback();
1173
+ }
1174
+ });
1175
+ const rs = createReadStream(htmlInputPath, { encoding: "utf-8" });
1176
+ const ws = createWriteStream2(htmlOutputPath, { encoding: "utf-8" });
1177
+ await pipeline2(rs, transform, ws);
1178
+ return imageIndex;
1179
+ }
1180
+ /**
1181
+ * Save JSON and HTML documents with base64 images extracted to separate files.
1182
+ * Uses jq for JSON processing and streaming for HTML to handle large files.
1026
1183
  *
1027
1184
  * This method:
1028
1185
  * 1. Extracts base64-encoded images from JSON and HTML content
@@ -1030,7 +1187,7 @@ var ImageExtractor = class _ImageExtractor {
1030
1187
  * 3. Replaces base64 data with relative file paths
1031
1188
  * 4. Saves the transformed documents to the output directory
1032
1189
  */
1033
- static async saveDocumentsWithExtractedImages(logger, outputDir, filename, jsonSourcePath, htmlContent) {
1190
+ static async saveDocumentsWithExtractedImages(logger, outputDir, filename, jsonSourcePath, htmlSourcePath) {
1034
1191
  try {
1035
1192
  if (existsSync(outputDir)) {
1036
1193
  rmSync(outputDir, { recursive: true, force: true });
@@ -1046,27 +1203,29 @@ var ImageExtractor = class _ImageExtractor {
1046
1203
  if (!existsSync(imagesDir)) {
1047
1204
  mkdirSync(imagesDir, { recursive: true });
1048
1205
  }
1049
- const base64Images = await _ImageExtractor.extractBase64ImagesFromJsonWithJq(jsonSourcePath);
1050
- base64Images.forEach((base64Data, index) => {
1051
- _ImageExtractor.extractBase64ImageToFile(
1052
- base64Data,
1053
- imagesDir,
1054
- index,
1055
- "pic",
1056
- "images"
1057
- );
1058
- });
1206
+ const imageCount = await jqExtractBase64PngStringsStreaming(
1207
+ jsonSourcePath,
1208
+ (base64Data, index) => {
1209
+ _ImageExtractor.extractBase64ImageToFile(
1210
+ base64Data,
1211
+ imagesDir,
1212
+ index,
1213
+ "pic",
1214
+ "images"
1215
+ );
1216
+ }
1217
+ );
1059
1218
  logger.info(
1060
- `[PDFConverter] Extracted ${base64Images.length} picture images from JSON to ${imagesDir}`
1219
+ `[PDFConverter] Extracted ${imageCount} picture images from JSON to ${imagesDir}`
1061
1220
  );
1062
- const replacedCount = await _ImageExtractor.replaceBase64ImagesInJsonWithJq(
1221
+ await jqReplaceBase64WithPathsToFile(
1063
1222
  jsonSourcePath,
1064
1223
  jsonPath,
1065
1224
  "images",
1066
1225
  "pic"
1067
1226
  );
1068
1227
  logger.info(
1069
- `[PDFConverter] Replaced ${replacedCount} base64 images with file paths`
1228
+ `[PDFConverter] Replaced ${imageCount} base64 images with file paths`
1070
1229
  );
1071
1230
  } catch (e) {
1072
1231
  logger.warn(
@@ -1082,42 +1241,36 @@ var ImageExtractor = class _ImageExtractor {
1082
1241
  if (!existsSync(imagesDir)) {
1083
1242
  mkdirSync(imagesDir, { recursive: true });
1084
1243
  }
1085
- let imageIndex = 0;
1086
- const transformedHtml = htmlContent.replace(
1087
- /src="data:image\/png;base64,([^"]+)"/g,
1088
- (_, base64Content) => {
1089
- const filename2 = `image_${imageIndex}.png`;
1090
- const filepath = join2(imagesDir, filename2);
1091
- const buffer = Buffer.from(base64Content, "base64");
1092
- writeFileSync(filepath, buffer);
1093
- const relativePath = `images/${filename2}`;
1094
- imageIndex += 1;
1095
- return `src="${relativePath}"`;
1096
- }
1244
+ const htmlImageCount = await _ImageExtractor.extractImagesFromHtmlStream(
1245
+ htmlSourcePath,
1246
+ htmlPath,
1247
+ imagesDir
1097
1248
  );
1098
1249
  logger.info(
1099
- `[PDFConverter] Extracted ${imageIndex} images from HTML to ${imagesDir}`
1250
+ `[PDFConverter] Extracted ${htmlImageCount} images from HTML to ${imagesDir}`
1100
1251
  );
1101
- writeFileSync(htmlPath, transformedHtml, "utf-8");
1102
1252
  } catch (e) {
1103
1253
  logger.warn(
1104
- "[PDFConverter] Failed to extract images from HTML, writing original. Error:",
1254
+ "[PDFConverter] Failed to extract images from HTML, copying original. Error:",
1105
1255
  e
1106
1256
  );
1107
- writeFileSync(htmlPath, htmlContent, "utf-8");
1257
+ const rs = createReadStream(htmlSourcePath);
1258
+ const ws = createWriteStream2(htmlPath);
1259
+ await pipeline2(rs, ws);
1108
1260
  }
1109
1261
  logger.info("[PDFConverter] Saved HTML:", htmlPath);
1110
1262
  }
1111
1263
  /**
1112
1264
  * Extract documents from ZIP and save with extracted images
1113
- * Uses jq for JSON processing to handle large files without loading into Node.js memory
1265
+ * Uses jq for JSON processing and streaming for HTML to handle large files
1266
+ * without loading into Node.js memory
1114
1267
  *
1115
1268
  * Complete workflow:
1116
1269
  * 1. Extract ZIP file to temporary directory
1117
1270
  * 2. Find JSON and HTML files from extracted files
1118
- * 3. Use jq to extract base64 images from JSON and save as separate files
1119
- * 4. Use jq to replace base64 with file paths in JSON
1120
- * 5. Process HTML with regex to extract and replace images
1271
+ * 3. Use jq to stream-extract base64 images from JSON and save as separate files
1272
+ * 4. Use jq to replace base64 with file paths in JSON (piped to file)
1273
+ * 5. Process HTML with streaming Transform to extract and replace images
1121
1274
  * 6. Save transformed documents to output directory (as result.json and result.html)
1122
1275
  */
1123
1276
  static async extractAndSaveDocumentsFromZip(logger, zipPath, extractDir, outputDir) {
@@ -1133,14 +1286,13 @@ var ImageExtractor = class _ImageExtractor {
1133
1286
  }
1134
1287
  const jsonPath = join2(extractDir, jsonFile);
1135
1288
  const htmlPath = join2(extractDir, htmlFile);
1136
- const htmlContent = readFileSync(htmlPath, "utf-8");
1137
1289
  logger.info("[PDFConverter] Saving converted files to output...");
1138
1290
  await _ImageExtractor.saveDocumentsWithExtractedImages(
1139
1291
  logger,
1140
1292
  outputDir,
1141
1293
  "result",
1142
1294
  jsonPath,
1143
- htmlContent
1295
+ htmlPath
1144
1296
  );
1145
1297
  logger.info("[PDFConverter] Files saved to:", outputDir);
1146
1298
  }
@@ -1149,7 +1301,7 @@ var ImageExtractor = class _ImageExtractor {
1149
1301
  // src/processors/page-renderer.ts
1150
1302
  import { existsSync as existsSync2, mkdirSync as mkdirSync2, readdirSync as readdirSync2 } from "fs";
1151
1303
  import { join as join3 } from "path";
1152
- var DEFAULT_DPI = 300;
1304
+ var PROGRESS_POLL_INTERVAL_MS = 2e3;
1153
1305
  var PageRenderer = class {
1154
1306
  constructor(logger) {
1155
1307
  this.logger = logger;
@@ -1163,29 +1315,60 @@ var PageRenderer = class {
1163
1315
  * @returns Render result with page count and file paths
1164
1316
  */
1165
1317
  async renderPages(pdfPath, outputDir, options) {
1166
- const dpi = options?.dpi ?? DEFAULT_DPI;
1318
+ const dpi = options?.dpi ?? PAGE_RENDERING.DEFAULT_DPI;
1167
1319
  const pagesDir = join3(outputDir, "pages");
1168
1320
  if (!existsSync2(pagesDir)) {
1169
1321
  mkdirSync2(pagesDir, { recursive: true });
1170
1322
  }
1171
- this.logger.info(`[PageRenderer] Rendering PDF at ${dpi} DPI...`);
1172
- const outputPattern = join3(pagesDir, "page_%d.png");
1173
- const result = await spawnAsync("magick", [
1174
- "-density",
1175
- dpi.toString(),
1176
- pdfPath,
1177
- "-background",
1178
- "white",
1179
- "-alpha",
1180
- "remove",
1181
- "-alpha",
1182
- "off",
1183
- outputPattern
1184
- ]);
1185
- if (result.code !== 0) {
1186
- throw new Error(
1187
- `[PageRenderer] Failed to render PDF pages: ${result.stderr || "Unknown error"}`
1323
+ const totalPages = await this.getPageCount(pdfPath);
1324
+ if (totalPages > 0) {
1325
+ this.logger.info(
1326
+ `[PageRenderer] Rendering ${totalPages} pages at ${dpi} DPI...`
1188
1327
  );
1328
+ } else {
1329
+ this.logger.info(`[PageRenderer] Rendering PDF at ${dpi} DPI...`);
1330
+ }
1331
+ const outputPattern = join3(pagesDir, "page_%d.png");
1332
+ let progressInterval = null;
1333
+ if (totalPages > 0) {
1334
+ let lastLoggedCount = 0;
1335
+ progressInterval = setInterval(() => {
1336
+ try {
1337
+ const rendered = readdirSync2(pagesDir).filter(
1338
+ (f) => f.startsWith("page_") && f.endsWith(".png")
1339
+ ).length;
1340
+ if (rendered > 0 && rendered !== lastLoggedCount) {
1341
+ lastLoggedCount = rendered;
1342
+ this.logger.info(
1343
+ `[PageRenderer] Rendering pages: ${rendered}/${totalPages}`
1344
+ );
1345
+ }
1346
+ } catch {
1347
+ }
1348
+ }, PROGRESS_POLL_INTERVAL_MS);
1349
+ }
1350
+ try {
1351
+ const result = await spawnAsync("magick", [
1352
+ "-density",
1353
+ dpi.toString(),
1354
+ pdfPath,
1355
+ "-background",
1356
+ "white",
1357
+ "-alpha",
1358
+ "remove",
1359
+ "-alpha",
1360
+ "off",
1361
+ outputPattern
1362
+ ]);
1363
+ if (result.code !== 0) {
1364
+ throw new Error(
1365
+ `[PageRenderer] Failed to render PDF pages: ${result.stderr || "Unknown error"}`
1366
+ );
1367
+ }
1368
+ } finally {
1369
+ if (progressInterval) {
1370
+ clearInterval(progressInterval);
1371
+ }
1189
1372
  }
1190
1373
  const pageFiles = readdirSync2(pagesDir).filter((f) => f.startsWith("page_") && f.endsWith(".png")).sort((a, b) => {
1191
1374
  const numA = parseInt(a.replace("page_", "").replace(".png", ""), 10);
@@ -1201,6 +1384,20 @@ var PageRenderer = class {
1201
1384
  pageFiles
1202
1385
  };
1203
1386
  }
1387
+ /**
1388
+ * Get total page count using pdfinfo.
1389
+ * Returns 0 on failure (progress logging will be skipped).
1390
+ */
1391
+ async getPageCount(pdfPath) {
1392
+ try {
1393
+ const result = await spawnAsync("pdfinfo", [pdfPath]);
1394
+ if (result.code !== 0) return 0;
1395
+ const match = result.stdout.match(/^Pages:\s+(\d+)/m);
1396
+ return match ? parseInt(match[1], 10) : 0;
1397
+ } catch {
1398
+ return 0;
1399
+ }
1400
+ }
1204
1401
  };
1205
1402
 
1206
1403
  // src/processors/pdf-text-extractor.ts
@@ -1286,7 +1483,7 @@ var PdfTextExtractor = class {
1286
1483
  };
1287
1484
 
1288
1485
  // src/processors/vlm-text-corrector.ts
1289
- import { readFileSync as readFileSync2, writeFileSync as writeFileSync2 } from "fs";
1486
+ import { readFileSync, writeFileSync as writeFileSync2 } from "fs";
1290
1487
  import { join as join4 } from "path";
1291
1488
 
1292
1489
  // src/types/vlm-text-correction-schema.ts
@@ -1418,7 +1615,7 @@ var VlmTextCorrector = class {
1418
1615
  async correctAndSave(outputDir, model, options) {
1419
1616
  this.logger.info("[VlmTextCorrector] Starting text correction...");
1420
1617
  const resultPath = join4(outputDir, "result.json");
1421
- const doc = JSON.parse(readFileSync2(resultPath, "utf-8"));
1618
+ const doc = JSON.parse(readFileSync(resultPath, "utf-8"));
1422
1619
  let pageNumbers = this.getPageNumbers(doc);
1423
1620
  if (pageNumbers.length === 0) {
1424
1621
  this.logger.info("[VlmTextCorrector] No pages to process");
@@ -1745,7 +1942,7 @@ var VlmTextCorrector = class {
1745
1942
  */
1746
1943
  readPageImage(outputDir, pageNo) {
1747
1944
  const imagePath = join4(outputDir, "pages", `page_${pageNo - 1}.png`);
1748
- return readFileSync2(imagePath).toString("base64");
1945
+ return readFileSync(imagePath).toString("base64");
1749
1946
  }
1750
1947
  /**
1751
1948
  * Apply VLM corrections to the DoclingDocument.
@@ -1800,9 +1997,8 @@ var VlmTextCorrector = class {
1800
1997
 
1801
1998
  // src/samplers/ocr-strategy-sampler.ts
1802
1999
  import { normalizeToBcp47 } from "@heripo/model";
1803
- import { readFileSync as readFileSync3 } from "fs";
2000
+ import { readFileSync as readFileSync2 } from "fs";
1804
2001
  import { z as z2 } from "zod/v4";
1805
- var SAMPLE_DPI = 150;
1806
2002
  var EDGE_TRIM_RATIO = 0.1;
1807
2003
  var DEFAULT_MAX_SAMPLE_PAGES = 15;
1808
2004
  var DEFAULT_MAX_RETRIES2 = 3;
@@ -1853,7 +2049,7 @@ var OcrStrategySampler = class {
1853
2049
  const renderResult = await this.pageRenderer.renderPages(
1854
2050
  pdfPath,
1855
2051
  outputDir,
1856
- { dpi: SAMPLE_DPI }
2052
+ { dpi: PAGE_RENDERING.SAMPLE_DPI }
1857
2053
  );
1858
2054
  if (renderResult.pageCount === 0) {
1859
2055
  this.logger.info("[OcrStrategySampler] No pages found in PDF");
@@ -2015,7 +2211,7 @@ var OcrStrategySampler = class {
2015
2211
  this.logger.debug(
2016
2212
  `[OcrStrategySampler] Analyzing page ${pageNo} for Korean-Hanja mix and language...`
2017
2213
  );
2018
- const base64Image = readFileSync3(pageFile).toString("base64");
2214
+ const base64Image = readFileSync2(pageFile).toString("base64");
2019
2215
  const messages = [
2020
2216
  {
2021
2217
  role: "user",
@@ -2063,7 +2259,7 @@ var OcrStrategySampler = class {
2063
2259
  };
2064
2260
 
2065
2261
  // src/utils/local-file-server.ts
2066
- import { createReadStream, statSync } from "fs";
2262
+ import { createReadStream as createReadStream2, statSync } from "fs";
2067
2263
  import { createServer } from "http";
2068
2264
  import { basename } from "path";
2069
2265
  var LocalFileServer = class {
@@ -2085,7 +2281,7 @@ var LocalFileServer = class {
2085
2281
  "Content-Type": "application/pdf",
2086
2282
  "Content-Length": stat.size
2087
2283
  });
2088
- createReadStream(filePath).pipe(res);
2284
+ createReadStream2(filePath).pipe(res);
2089
2285
  } else {
2090
2286
  res.writeHead(404);
2091
2287
  res.end("Not Found");
@@ -2370,8 +2566,10 @@ var PDFConverter = class {
2370
2566
  let pageTexts;
2371
2567
  try {
2372
2568
  const resultPath2 = join6(outputDir, "result.json");
2373
- const doc = JSON.parse(readFileSync4(resultPath2, "utf-8"));
2374
- const totalPages = Object.keys(doc.pages).length;
2569
+ const totalPages = await runJqFileJson(
2570
+ ".pages | length",
2571
+ resultPath2
2572
+ );
2375
2573
  const textExtractor = new PdfTextExtractor(this.logger);
2376
2574
  pageTexts = await textExtractor.extractText(pdfPath, totalPages);
2377
2575
  } catch {
@@ -2703,8 +2901,8 @@ var PDFConverter = class {
2703
2901
  const zipPath = join6(process.cwd(), "result.zip");
2704
2902
  this.logger.info("[PDFConverter] Saving ZIP file to:", zipPath);
2705
2903
  if (zipResult.fileStream) {
2706
- const writeStream = createWriteStream2(zipPath);
2707
- await pipeline(zipResult.fileStream, writeStream);
2904
+ const writeStream = createWriteStream3(zipPath);
2905
+ await pipeline3(zipResult.fileStream, writeStream);
2708
2906
  return;
2709
2907
  }
2710
2908
  if (zipResult.data) {
@@ -2736,6 +2934,7 @@ var PDFConverter = class {
2736
2934
  }
2737
2935
  /**
2738
2936
  * Render page images from the source PDF using ImageMagick and update result.json.
2937
+ * Uses jq to update the JSON file without loading it into Node.js memory.
2739
2938
  * Replaces Docling's generate_page_images which fails on large PDFs
2740
2939
  * due to memory limits when embedding all page images as base64.
2741
2940
  */
@@ -2753,17 +2952,18 @@ var PDFConverter = class {
2753
2952
  const renderer = new PageRenderer(this.logger);
2754
2953
  const renderResult = await renderer.renderPages(pdfPath, outputDir);
2755
2954
  const resultPath = join6(outputDir, "result.json");
2756
- const doc = JSON.parse(readFileSync4(resultPath, "utf-8"));
2757
- for (const page of Object.values(doc.pages)) {
2758
- const pageNo = page.page_no;
2759
- const fileIndex = pageNo - 1;
2760
- if (fileIndex >= 0 && fileIndex < renderResult.pageCount) {
2761
- page.image.uri = `pages/page_${fileIndex}.png`;
2762
- page.image.mimetype = "image/png";
2763
- page.image.dpi = 300;
2764
- }
2765
- }
2766
- await writeFile(resultPath, JSON.stringify(doc, null, 2));
2955
+ const tmpPath = resultPath + ".tmp";
2956
+ const jqProgram = `
2957
+ .pages |= with_entries(
2958
+ if (.value.page_no - 1) >= 0 and (.value.page_no - 1) < ${renderResult.pageCount} then
2959
+ .value.image.uri = "pages/page_\\(.value.page_no - 1).png" |
2960
+ .value.image.mimetype = "image/png" |
2961
+ .value.image.dpi = ${PAGE_RENDERING.DEFAULT_DPI}
2962
+ else . end
2963
+ )
2964
+ `;
2965
+ await runJqFileToFile(jqProgram, resultPath, tmpPath);
2966
+ await rename2(tmpPath, resultPath);
2767
2967
  this.logger.info(
2768
2968
  `[PDFConverter] Rendered ${renderResult.pageCount} page images`
2769
2969
  );