@heripo/pdf-parser 0.1.8 → 0.1.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -43,11 +43,17 @@ var DOCLING_ENVIRONMENT = {
43
43
  */
44
44
  STARTUP_DELAY_MS: 2e3
45
45
  };
46
/**
 * DPI settings shared by the page-rendering code paths in this module.
 * The object is frozen so that no consumer can alter the values at runtime
 * and silently change the resolution used elsewhere.
 */
var PAGE_RENDERING = Object.freeze({
  /** Default rendering DPI for VLM text recognition quality */
  DEFAULT_DPI: 200,
  /** Low-resolution DPI for OCR strategy sampling */
  SAMPLE_DPI: 150
});
46
52
  var IMAGE_PDF_CONVERTER = {
47
53
  /**
48
54
  * ImageMagick density option (DPI) for PDF to image conversion
49
55
  */
50
- DENSITY: 300,
56
+ DENSITY: PAGE_RENDERING.DEFAULT_DPI,
51
57
  /**
52
58
  * ImageMagick quality option (1-100)
53
59
  */
@@ -837,16 +843,10 @@ var DoclingEnvironment = class _DoclingEnvironment {
837
843
 
838
844
  // src/core/pdf-converter.ts
839
845
  import { omit } from "es-toolkit";
840
- import {
841
- copyFileSync,
842
- createWriteStream as createWriteStream2,
843
- existsSync as existsSync4,
844
- readFileSync as readFileSync4,
845
- rmSync as rmSync3
846
- } from "fs";
847
- import { writeFile } from "fs/promises";
846
+ import { copyFileSync, createWriteStream as createWriteStream3, existsSync as existsSync4, rmSync as rmSync3 } from "fs";
847
+ import { rename as rename2, writeFile } from "fs/promises";
848
848
  import { join as join6 } from "path";
849
- import { pipeline } from "stream/promises";
849
+ import { pipeline as pipeline3 } from "stream/promises";
850
850
 
851
851
  // src/errors/image-pdf-fallback-error.ts
852
852
  var ImagePdfFallbackError = class extends Error {
@@ -862,19 +862,24 @@ var ImagePdfFallbackError = class extends Error {
862
862
 
863
863
  // src/processors/image-extractor.ts
864
864
  import {
865
- createWriteStream,
865
+ createReadStream,
866
+ createWriteStream as createWriteStream2,
866
867
  existsSync,
867
868
  mkdirSync,
868
- readFileSync,
869
869
  readdirSync,
870
870
  rmSync,
871
871
  writeFileSync
872
872
  } from "fs";
873
873
  import { extname, join as join2 } from "path";
874
+ import { Transform } from "stream";
875
+ import { pipeline as pipeline2 } from "stream/promises";
874
876
  import * as yauzl from "yauzl";
875
877
 
876
878
  // src/utils/jq.ts
877
879
  import { spawn as spawn3 } from "child_process";
880
+ import { createWriteStream } from "fs";
881
+ import { rename } from "fs/promises";
882
+ import { pipeline } from "stream/promises";
878
883
  function getJqPath() {
879
884
  const p = process.env.JQ_PATH?.trim();
880
885
  return p && p.length > 0 ? p : "jq";
@@ -926,25 +931,139 @@ function runJqFileJson(program, filePath) {
926
931
  });
927
932
  });
928
933
  }
929
- function jqExtractBase64PngStrings(filePath) {
930
- const program = `
931
- [
932
- .. |
933
- select(type == "string" and startswith("data:image/png;base64"))
934
- ]
935
- `;
936
- return runJqFileJson(program, filePath);
934
/**
 * Run a jq program against a JSON file and stream jq's stdout into a file,
 * so the result never has to be held in Node.js memory.
 *
 * The promise resolves once the output has been written AND jq has exited
 * with code 0. On any failure (spawn error, write error, non-zero exit) the
 * jq process is killed, the partially written output file is removed, and
 * the promise rejects.
 *
 * @param program jq filter to execute
 * @param inputPath path of the JSON file passed to jq
 * @param outputPath path of the file that receives jq's stdout
 * @returns promise that resolves with no value on success
 */
function runJqFileToFile(program, inputPath, outputPath) {
  return new Promise((resolve, reject) => {
    const child = spawn3(getJqPath(), [program, inputPath], {
      stdio: ["ignore", "pipe", "pipe"],
      env: process.env
    });
    const ws = createWriteStream(outputPath);
    let stderr = "";
    let exitCode = null;
    let pipelineDone = false;
    let settled = false;
    // Tracked by hand so the failure path knows whether the file handle is
    // already released before it deletes the file.
    let outputClosed = false;
    ws.once("close", () => {
      outputClosed = true;
    });
    child.stderr.setEncoding("utf-8");
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
    });
    // Single failure path: stop jq, release the output file, delete the
    // partial result, then reject.
    function fail(err) {
      if (settled) return;
      settled = true;
      child.kill();
      const cleanupAndReject = () => {
        try {
          rmSync(outputPath, { force: true });
        } catch {
          // Best effort only; the original error is the one worth reporting.
        }
        reject(err);
      };
      if (outputClosed) {
        cleanupAndReject();
      } else {
        ws.once("close", cleanupAndReject);
        ws.destroy();
      }
    }
    // Success needs both signals: the pipeline has finished writing and the
    // process has reported its exit code.
    function trySettle() {
      if (settled) return;
      if (!pipelineDone || exitCode === null) return;
      if (exitCode !== 0) {
        fail(
          new Error(
            `jq exited with code ${exitCode}. ${stderr ? "Stderr: " + stderr : ""}`
          )
        );
        return;
      }
      settled = true;
      resolve();
    }
    child.on("error", fail);
    pipeline(child.stdout, ws).then(() => {
      pipelineDone = true;
      trySettle();
    }).catch(fail);
    child.on("close", (code) => {
      // A null code means jq was terminated by a signal; treat it as a failure.
      exitCode = code ?? 1;
      trySettle();
    });
  });
}
985
/**
 * Run a jq program in raw-output mode (`-r`) against a JSON file and hand
 * every non-empty output line to `onLine` as soon as it is complete.
 *
 * Only the newly received chunk is scanned for line breaks, so a very long
 * line (for example a multi-megabyte base64 string) is not re-scanned each
 * time more data arrives.
 *
 * @param program jq filter to execute
 * @param filePath path of the JSON file passed to jq
 * @param onLine synchronous callback invoked once per non-empty line
 * @returns promise that resolves when jq exits with code 0; it rejects when
 *          jq fails to start, exits with another code, or `onLine` throws
 */
function runJqFileLines(program, filePath, onLine) {
  return new Promise((resolve, reject) => {
    const child = spawn3(getJqPath(), ["-r", program, filePath], {
      stdio: ["ignore", "pipe", "pipe"],
      env: process.env
    });
    let stderr = "";
    // Text of the current line that has not been terminated yet. It never
    // contains a newline, which is why only new chunks need to be searched.
    let buffer = "";
    let callbackError = false;
    child.stdout.setEncoding("utf-8");
    child.stderr.setEncoding("utf-8");
    // Invoke the callback; on the first throw stop jq and reject.
    function safeOnLine(line) {
      if (callbackError) return;
      try {
        onLine(line);
      } catch (err) {
        callbackError = true;
        child.kill();
        reject(err);
      }
    }
    child.stdout.on("data", (chunk) => {
      // Output that arrives after a callback failure is discarded.
      if (callbackError) return;
      let start = 0;
      let newlineIdx;
      while ((newlineIdx = chunk.indexOf("\n", start)) !== -1) {
        const line = buffer + chunk.slice(start, newlineIdx);
        buffer = "";
        start = newlineIdx + 1;
        if (line.length > 0) {
          safeOnLine(line);
        }
      }
      buffer += chunk.slice(start);
    });
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
    });
    child.on("error", (err) => {
      if (!callbackError) reject(err);
    });
    child.on("close", (code) => {
      if (callbackError) return;
      // Deliver a final line that was not terminated by a newline.
      if (buffer.length > 0) {
        safeOnLine(buffer);
      }
      if (callbackError) return;
      if (code !== 0) {
        reject(
          new Error(
            `jq exited with code ${code}. ${stderr ? "Stderr: " + stderr : ""}`
          )
        );
      } else {
        resolve();
      }
    });
  });
}
938
- function jqReplaceBase64WithPaths(filePath, dirName, prefix) {
1043
/**
 * Stream every string value in a JSON file that starts with
 * "data:image/png;base64" to `onImage`, one at a time, in the order jq
 * emits them.
 *
 * @param filePath path of the JSON file to scan
 * @param onImage callback receiving the string and its zero-based position
 * @returns the number of strings that were passed to `onImage`
 */
async function jqExtractBase64PngStringsStreaming(filePath, onImage) {
  const program = '.. | select(type == "string" and startswith("data:image/png;base64"))';
  let count = 0;
  const handleLine = (dataUri) => {
    onImage(dataUri, count);
    count += 1;
  };
  await runJqFileLines(program, filePath, handleLine);
  return count;
}
1055
/**
 * Rewrite a JSON file so that every string starting with
 * "data:image/png;base64" is replaced by the path
 * `<dirName>/<prefix>_<n>.png`, where n counts the replaced strings from 0.
 *
 * jq writes to `<outputPath>.tmp`, which is renamed onto `outputPath` only
 * after jq succeeded; the temporary file is removed if anything fails.
 *
 * @param inputPath JSON file to read
 * @param outputPath destination of the rewritten JSON
 * @param dirName directory part of the generated paths
 * @param prefix file-name prefix of the generated paths
 */
async function jqReplaceBase64WithPathsToFile(inputPath, outputPath, dirName, prefix) {
  // JSON string syntax is also valid jq string syntax, and JSON.stringify
  // escapes quotes and backslashes, so neither argument can terminate the
  // literal or start a jq "\(...)" interpolation.
  const pathPrefix = JSON.stringify(`${dirName}/${prefix}_`);
  const program = `
    reduce paths(type == "string" and startswith("data:image/png;base64")) as $p (
      {data: ., counter: 0};
      .counter as $idx |
      .data |= setpath($p; ${pathPrefix} + ($idx | tostring) + ".png") |
      .counter += 1
    ) | .data
  `;
  const tmpPath = outputPath + ".tmp";
  try {
    await runJqFileToFile(program, inputPath, tmpPath);
    await rename(tmpPath, outputPath);
  } catch (err) {
    try {
      rmSync(tmpPath, { force: true });
    } catch {
      // Best effort only; report the original failure.
    }
    throw err;
  }
}
949
1068
 
950
1069
  // src/processors/image-extractor.ts
@@ -972,7 +1091,7 @@ var ImageExtractor = class _ImageExtractor {
972
1091
  return;
973
1092
  }
974
1093
  mkdirSync(join2(entryPath, ".."), { recursive: true });
975
- const writeStream = createWriteStream(entryPath);
1094
+ const writeStream = createWriteStream2(entryPath);
976
1095
  readStream.pipe(writeStream);
977
1096
  writeStream.on("finish", () => {
978
1097
  zipfile.readEntry();
@@ -988,26 +1107,6 @@ var ImageExtractor = class _ImageExtractor {
988
1107
  });
989
1108
  });
990
1109
  }
991
- /**
992
- * Extract base64 images from JSON file using jq (for large files)
993
- * Returns array of base64 data strings
994
- */
995
- static async extractBase64ImagesFromJsonWithJq(jsonPath) {
996
- return jqExtractBase64PngStrings(jsonPath);
997
- }
998
- /**
999
- * Replace base64 images with file paths in JSON using jq (for large files)
1000
- * Uses reduce to maintain counter state while walking the JSON
1001
- */
1002
- static async replaceBase64ImagesInJsonWithJq(jsonPath, outputPath, dirName, prefix) {
1003
- const { data, count } = await jqReplaceBase64WithPaths(
1004
- jsonPath,
1005
- dirName,
1006
- prefix
1007
- );
1008
- writeFileSync(outputPath, JSON.stringify(data, null, 2), "utf-8");
1009
- return count;
1010
- }
1011
1110
  /**
1012
1111
  * Extract a base64-encoded image to a file and return the relative path
1013
1112
  */
@@ -1021,8 +1120,66 @@ var ImageExtractor = class _ImageExtractor {
1021
1120
  return `${dirName}/${filename}`;
1022
1121
  }
1023
1122
  /**
1024
- * Save JSON and HTML documents with base64 images extracted to separate files
1025
- * Uses jq for JSON processing to handle large files
1123
+ * Extract base64 images from HTML using streaming.
1124
+ * Reads HTML file as a stream, extracts base64 images from src attributes,
1125
+ * saves them as PNG files, and replaces with file paths in the output HTML.
1126
+ * Returns the number of images extracted.
1127
+ */
1128
+ static async extractImagesFromHtmlStream(htmlInputPath, htmlOutputPath, imagesDir) {
1129
+ let imageIndex = 0;
1130
+ let pending = "";
1131
+ const MARKER = 'src="data:image/png;base64,';
1132
+ const transform = new Transform({
1133
+ decodeStrings: false,
1134
+ encoding: "utf-8",
1135
+ transform(chunk, _encoding, callback) {
1136
+ pending += chunk;
1137
+ let result = "";
1138
+ while (true) {
1139
+ const markerIdx = pending.indexOf(MARKER);
1140
+ if (markerIdx === -1) {
1141
+ const safeEnd = Math.max(0, pending.length - MARKER.length);
1142
+ result += pending.slice(0, safeEnd);
1143
+ pending = pending.slice(safeEnd);
1144
+ break;
1145
+ }
1146
+ result += pending.slice(0, markerIdx);
1147
+ const dataStart = markerIdx + MARKER.length;
1148
+ const quoteIdx = pending.indexOf('"', dataStart);
1149
+ if (quoteIdx === -1) {
1150
+ pending = pending.slice(markerIdx);
1151
+ break;
1152
+ }
1153
+ const base64Content = pending.slice(dataStart, quoteIdx);
1154
+ const filename = `image_${imageIndex}.png`;
1155
+ const filepath = join2(imagesDir, filename);
1156
+ const buf = Buffer.from(base64Content, "base64");
1157
+ writeFileSync(filepath, buf);
1158
+ const relativePath = `images/${filename}`;
1159
+ result += `src="${relativePath}"`;
1160
+ imageIndex++;
1161
+ pending = pending.slice(quoteIdx + 1);
1162
+ }
1163
+ if (result.length > 0) {
1164
+ this.push(result);
1165
+ }
1166
+ callback();
1167
+ },
1168
+ flush(callback) {
1169
+ if (pending.length > 0) {
1170
+ this.push(pending);
1171
+ }
1172
+ callback();
1173
+ }
1174
+ });
1175
+ const rs = createReadStream(htmlInputPath, { encoding: "utf-8" });
1176
+ const ws = createWriteStream2(htmlOutputPath, { encoding: "utf-8" });
1177
+ await pipeline2(rs, transform, ws);
1178
+ return imageIndex;
1179
+ }
1180
+ /**
1181
+ * Save JSON and HTML documents with base64 images extracted to separate files.
1182
+ * Uses jq for JSON processing and streaming for HTML to handle large files.
1026
1183
  *
1027
1184
  * This method:
1028
1185
  * 1. Extracts base64-encoded images from JSON and HTML content
@@ -1030,7 +1187,7 @@ var ImageExtractor = class _ImageExtractor {
1030
1187
  * 3. Replaces base64 data with relative file paths
1031
1188
  * 4. Saves the transformed documents to the output directory
1032
1189
  */
1033
- static async saveDocumentsWithExtractedImages(logger, outputDir, filename, jsonSourcePath, htmlContent) {
1190
+ static async saveDocumentsWithExtractedImages(logger, outputDir, filename, jsonSourcePath, htmlSourcePath) {
1034
1191
  try {
1035
1192
  if (existsSync(outputDir)) {
1036
1193
  rmSync(outputDir, { recursive: true, force: true });
@@ -1042,31 +1199,33 @@ var ImageExtractor = class _ImageExtractor {
1042
1199
  const baseName = filename.replace(extname(filename), "");
1043
1200
  const jsonPath = join2(outputDir, `${baseName}.json`);
1044
1201
  try {
1045
- const pagesDir = join2(outputDir, "pages");
1046
- if (!existsSync(pagesDir)) {
1047
- mkdirSync(pagesDir, { recursive: true });
1048
- }
1049
- const base64Images = await _ImageExtractor.extractBase64ImagesFromJsonWithJq(jsonSourcePath);
1050
- base64Images.forEach((base64Data, index) => {
1051
- _ImageExtractor.extractBase64ImageToFile(
1052
- base64Data,
1053
- pagesDir,
1054
- index,
1055
- "page",
1056
- "pages"
1057
- );
1058
- });
1202
+ const imagesDir = join2(outputDir, "images");
1203
+ if (!existsSync(imagesDir)) {
1204
+ mkdirSync(imagesDir, { recursive: true });
1205
+ }
1206
+ const imageCount = await jqExtractBase64PngStringsStreaming(
1207
+ jsonSourcePath,
1208
+ (base64Data, index) => {
1209
+ _ImageExtractor.extractBase64ImageToFile(
1210
+ base64Data,
1211
+ imagesDir,
1212
+ index,
1213
+ "pic",
1214
+ "images"
1215
+ );
1216
+ }
1217
+ );
1059
1218
  logger.info(
1060
- `[PDFConverter] Extracted ${base64Images.length} images from JSON to ${pagesDir}`
1219
+ `[PDFConverter] Extracted ${imageCount} picture images from JSON to ${imagesDir}`
1061
1220
  );
1062
- const replacedCount = await _ImageExtractor.replaceBase64ImagesInJsonWithJq(
1221
+ await jqReplaceBase64WithPathsToFile(
1063
1222
  jsonSourcePath,
1064
1223
  jsonPath,
1065
- "pages",
1066
- "page"
1224
+ "images",
1225
+ "pic"
1067
1226
  );
1068
1227
  logger.info(
1069
- `[PDFConverter] Replaced ${replacedCount} base64 images with file paths`
1228
+ `[PDFConverter] Replaced ${imageCount} base64 images with file paths`
1070
1229
  );
1071
1230
  } catch (e) {
1072
1231
  logger.warn(
@@ -1082,42 +1241,36 @@ var ImageExtractor = class _ImageExtractor {
1082
1241
  if (!existsSync(imagesDir)) {
1083
1242
  mkdirSync(imagesDir, { recursive: true });
1084
1243
  }
1085
- let imageIndex = 0;
1086
- const transformedHtml = htmlContent.replace(
1087
- /src="data:image\/png;base64,([^"]+)"/g,
1088
- (_, base64Content) => {
1089
- const filename2 = `image_${imageIndex}.png`;
1090
- const filepath = join2(imagesDir, filename2);
1091
- const buffer = Buffer.from(base64Content, "base64");
1092
- writeFileSync(filepath, buffer);
1093
- const relativePath = `images/${filename2}`;
1094
- imageIndex += 1;
1095
- return `src="${relativePath}"`;
1096
- }
1244
+ const htmlImageCount = await _ImageExtractor.extractImagesFromHtmlStream(
1245
+ htmlSourcePath,
1246
+ htmlPath,
1247
+ imagesDir
1097
1248
  );
1098
1249
  logger.info(
1099
- `[PDFConverter] Extracted ${imageIndex} images from HTML to ${imagesDir}`
1250
+ `[PDFConverter] Extracted ${htmlImageCount} images from HTML to ${imagesDir}`
1100
1251
  );
1101
- writeFileSync(htmlPath, transformedHtml, "utf-8");
1102
1252
  } catch (e) {
1103
1253
  logger.warn(
1104
- "[PDFConverter] Failed to extract images from HTML, writing original. Error:",
1254
+ "[PDFConverter] Failed to extract images from HTML, copying original. Error:",
1105
1255
  e
1106
1256
  );
1107
- writeFileSync(htmlPath, htmlContent, "utf-8");
1257
+ const rs = createReadStream(htmlSourcePath);
1258
+ const ws = createWriteStream2(htmlPath);
1259
+ await pipeline2(rs, ws);
1108
1260
  }
1109
1261
  logger.info("[PDFConverter] Saved HTML:", htmlPath);
1110
1262
  }
1111
1263
  /**
1112
1264
  * Extract documents from ZIP and save with extracted images
1113
- * Uses jq for JSON processing to handle large files without loading into Node.js memory
1265
+ * Uses jq for JSON processing and streaming for HTML to handle large files
1266
+ * without loading into Node.js memory
1114
1267
  *
1115
1268
  * Complete workflow:
1116
1269
  * 1. Extract ZIP file to temporary directory
1117
1270
  * 2. Find JSON and HTML files from extracted files
1118
- * 3. Use jq to extract base64 images from JSON and save as separate files
1119
- * 4. Use jq to replace base64 with file paths in JSON
1120
- * 5. Process HTML with regex to extract and replace images
1271
+ * 3. Use jq to stream-extract base64 images from JSON and save as separate files
1272
+ * 4. Use jq to replace base64 with file paths in JSON (piped to file)
1273
+ * 5. Process HTML with streaming Transform to extract and replace images
1121
1274
  * 6. Save transformed documents to output directory (as result.json and result.html)
1122
1275
  */
1123
1276
  static async extractAndSaveDocumentsFromZip(logger, zipPath, extractDir, outputDir) {
@@ -1133,14 +1286,13 @@ var ImageExtractor = class _ImageExtractor {
1133
1286
  }
1134
1287
  const jsonPath = join2(extractDir, jsonFile);
1135
1288
  const htmlPath = join2(extractDir, htmlFile);
1136
- const htmlContent = readFileSync(htmlPath, "utf-8");
1137
1289
  logger.info("[PDFConverter] Saving converted files to output...");
1138
1290
  await _ImageExtractor.saveDocumentsWithExtractedImages(
1139
1291
  logger,
1140
1292
  outputDir,
1141
1293
  "result",
1142
1294
  jsonPath,
1143
- htmlContent
1295
+ htmlPath
1144
1296
  );
1145
1297
  logger.info("[PDFConverter] Files saved to:", outputDir);
1146
1298
  }
@@ -1149,7 +1301,7 @@ var ImageExtractor = class _ImageExtractor {
1149
1301
  // src/processors/page-renderer.ts
1150
1302
  import { existsSync as existsSync2, mkdirSync as mkdirSync2, readdirSync as readdirSync2 } from "fs";
1151
1303
  import { join as join3 } from "path";
1152
- var DEFAULT_DPI = 300;
1304
+ var PROGRESS_POLL_INTERVAL_MS = 2e3;
1153
1305
  var PageRenderer = class {
1154
1306
  constructor(logger) {
1155
1307
  this.logger = logger;
@@ -1163,29 +1315,60 @@ var PageRenderer = class {
1163
1315
  * @returns Render result with page count and file paths
1164
1316
  */
1165
1317
  async renderPages(pdfPath, outputDir, options) {
1166
- const dpi = options?.dpi ?? DEFAULT_DPI;
1318
+ const dpi = options?.dpi ?? PAGE_RENDERING.DEFAULT_DPI;
1167
1319
  const pagesDir = join3(outputDir, "pages");
1168
1320
  if (!existsSync2(pagesDir)) {
1169
1321
  mkdirSync2(pagesDir, { recursive: true });
1170
1322
  }
1171
- this.logger.info(`[PageRenderer] Rendering PDF at ${dpi} DPI...`);
1172
- const outputPattern = join3(pagesDir, "page_%d.png");
1173
- const result = await spawnAsync("magick", [
1174
- "-density",
1175
- dpi.toString(),
1176
- pdfPath,
1177
- "-background",
1178
- "white",
1179
- "-alpha",
1180
- "remove",
1181
- "-alpha",
1182
- "off",
1183
- outputPattern
1184
- ]);
1185
- if (result.code !== 0) {
1186
- throw new Error(
1187
- `[PageRenderer] Failed to render PDF pages: ${result.stderr || "Unknown error"}`
1323
+ const totalPages = await this.getPageCount(pdfPath);
1324
+ if (totalPages > 0) {
1325
+ this.logger.info(
1326
+ `[PageRenderer] Rendering ${totalPages} pages at ${dpi} DPI...`
1188
1327
  );
1328
+ } else {
1329
+ this.logger.info(`[PageRenderer] Rendering PDF at ${dpi} DPI...`);
1330
+ }
1331
+ const outputPattern = join3(pagesDir, "page_%d.png");
1332
+ let progressInterval = null;
1333
+ if (totalPages > 0) {
1334
+ let lastLoggedCount = 0;
1335
+ progressInterval = setInterval(() => {
1336
+ try {
1337
+ const rendered = readdirSync2(pagesDir).filter(
1338
+ (f) => f.startsWith("page_") && f.endsWith(".png")
1339
+ ).length;
1340
+ if (rendered > 0 && rendered !== lastLoggedCount) {
1341
+ lastLoggedCount = rendered;
1342
+ this.logger.info(
1343
+ `[PageRenderer] Rendering pages: ${rendered}/${totalPages}`
1344
+ );
1345
+ }
1346
+ } catch {
1347
+ }
1348
+ }, PROGRESS_POLL_INTERVAL_MS);
1349
+ }
1350
+ try {
1351
+ const result = await spawnAsync("magick", [
1352
+ "-density",
1353
+ dpi.toString(),
1354
+ pdfPath,
1355
+ "-background",
1356
+ "white",
1357
+ "-alpha",
1358
+ "remove",
1359
+ "-alpha",
1360
+ "off",
1361
+ outputPattern
1362
+ ]);
1363
+ if (result.code !== 0) {
1364
+ throw new Error(
1365
+ `[PageRenderer] Failed to render PDF pages: ${result.stderr || "Unknown error"}`
1366
+ );
1367
+ }
1368
+ } finally {
1369
+ if (progressInterval) {
1370
+ clearInterval(progressInterval);
1371
+ }
1189
1372
  }
1190
1373
  const pageFiles = readdirSync2(pagesDir).filter((f) => f.startsWith("page_") && f.endsWith(".png")).sort((a, b) => {
1191
1374
  const numA = parseInt(a.replace("page_", "").replace(".png", ""), 10);
@@ -1201,6 +1384,20 @@ var PageRenderer = class {
1201
1384
  pageFiles
1202
1385
  };
1203
1386
  }
1387
+ /**
1388
+ * Get total page count using pdfinfo.
1389
+ * Returns 0 on failure (progress logging will be skipped).
1390
+ */
1391
+ async getPageCount(pdfPath) {
1392
+ try {
1393
+ const result = await spawnAsync("pdfinfo", [pdfPath]);
1394
+ if (result.code !== 0) return 0;
1395
+ const match = result.stdout.match(/^Pages:\s+(\d+)/m);
1396
+ return match ? parseInt(match[1], 10) : 0;
1397
+ } catch {
1398
+ return 0;
1399
+ }
1400
+ }
1204
1401
  };
1205
1402
 
1206
1403
  // src/processors/pdf-text-extractor.ts
@@ -1286,7 +1483,7 @@ var PdfTextExtractor = class {
1286
1483
  };
1287
1484
 
1288
1485
  // src/processors/vlm-text-corrector.ts
1289
- import { readFileSync as readFileSync2, writeFileSync as writeFileSync2 } from "fs";
1486
+ import { readFileSync, writeFileSync as writeFileSync2 } from "fs";
1290
1487
  import { join as join4 } from "path";
1291
1488
 
1292
1489
  // src/types/vlm-text-correction-schema.ts
@@ -1418,7 +1615,7 @@ var VlmTextCorrector = class {
1418
1615
  async correctAndSave(outputDir, model, options) {
1419
1616
  this.logger.info("[VlmTextCorrector] Starting text correction...");
1420
1617
  const resultPath = join4(outputDir, "result.json");
1421
- const doc = JSON.parse(readFileSync2(resultPath, "utf-8"));
1618
+ const doc = JSON.parse(readFileSync(resultPath, "utf-8"));
1422
1619
  let pageNumbers = this.getPageNumbers(doc);
1423
1620
  if (pageNumbers.length === 0) {
1424
1621
  this.logger.info("[VlmTextCorrector] No pages to process");
@@ -1745,7 +1942,7 @@ var VlmTextCorrector = class {
1745
1942
  */
1746
1943
  readPageImage(outputDir, pageNo) {
1747
1944
  const imagePath = join4(outputDir, "pages", `page_${pageNo - 1}.png`);
1748
- return readFileSync2(imagePath).toString("base64");
1945
+ return readFileSync(imagePath).toString("base64");
1749
1946
  }
1750
1947
  /**
1751
1948
  * Apply VLM corrections to the DoclingDocument.
@@ -1799,9 +1996,9 @@ var VlmTextCorrector = class {
1799
1996
  };
1800
1997
 
1801
1998
  // src/samplers/ocr-strategy-sampler.ts
1802
- import { readFileSync as readFileSync3 } from "fs";
1999
+ import { normalizeToBcp47 } from "@heripo/model";
2000
+ import { readFileSync as readFileSync2 } from "fs";
1803
2001
  import { z as z2 } from "zod/v4";
1804
- var SAMPLE_DPI = 150;
1805
2002
  var EDGE_TRIM_RATIO = 0.1;
1806
2003
  var DEFAULT_MAX_SAMPLE_PAGES = 15;
1807
2004
  var DEFAULT_MAX_RETRIES2 = 3;
@@ -1852,7 +2049,7 @@ var OcrStrategySampler = class {
1852
2049
  const renderResult = await this.pageRenderer.renderPages(
1853
2050
  pdfPath,
1854
2051
  outputDir,
1855
- { dpi: SAMPLE_DPI }
2052
+ { dpi: PAGE_RENDERING.SAMPLE_DPI }
1856
2053
  );
1857
2054
  if (renderResult.pageCount === 0) {
1858
2055
  this.logger.info("[OcrStrategySampler] No pages found in PDF");
@@ -1871,7 +2068,7 @@ var OcrStrategySampler = class {
1871
2068
  `[OcrStrategySampler] Sampling ${sampleIndices.length} of ${renderResult.pageCount} pages: [${sampleIndices.map((i) => i + 1).join(", ")}]`
1872
2069
  );
1873
2070
  let sampledCount = 0;
1874
- let detectedLanguages;
2071
+ const languageFrequency = /* @__PURE__ */ new Map();
1875
2072
  for (const idx of sampleIndices) {
1876
2073
  sampledCount++;
1877
2074
  const pageFile = renderResult.pageFiles[idx];
@@ -1881,14 +2078,17 @@ var OcrStrategySampler = class {
1881
2078
  model,
1882
2079
  options
1883
2080
  );
1884
- detectedLanguages = pageAnalysis.detectedLanguages;
2081
+ for (const lang of pageAnalysis.detectedLanguages) {
2082
+ languageFrequency.set(lang, (languageFrequency.get(lang) ?? 0) + 1);
2083
+ }
1885
2084
  if (pageAnalysis.hasKoreanHanjaMix) {
1886
2085
  this.logger.info(
1887
2086
  `[OcrStrategySampler] Korean-Hanja mix detected on page ${idx + 1} \u2192 VLM strategy`
1888
2087
  );
2088
+ const detectedLanguages2 = this.aggregateLanguages(languageFrequency);
1889
2089
  return {
1890
2090
  method: "vlm",
1891
- detectedLanguages,
2091
+ detectedLanguages: detectedLanguages2,
1892
2092
  reason: `Korean-Hanja mix detected on page ${idx + 1}`,
1893
2093
  sampledPages: sampledCount,
1894
2094
  totalPages: renderResult.pageCount
@@ -1898,6 +2098,7 @@ var OcrStrategySampler = class {
1898
2098
  this.logger.info(
1899
2099
  "[OcrStrategySampler] No Korean-Hanja mix detected \u2192 ocrmac strategy"
1900
2100
  );
2101
+ const detectedLanguages = this.aggregateLanguages(languageFrequency);
1901
2102
  return {
1902
2103
  method: "ocrmac",
1903
2104
  detectedLanguages,
@@ -2002,14 +2203,15 @@ var OcrStrategySampler = class {
2002
2203
  }
2003
2204
  /**
2004
2205
  * Analyze a single sample page for Korean-Hanja mixed script and primary language.
2206
+ * Normalizes raw VLM language responses to valid BCP 47 tags, filtering out invalid ones.
2005
2207
  *
2006
- * @returns Object with Korean-Hanja detection result and detected languages
2208
+ * @returns Object with Korean-Hanja detection result and normalized detected languages
2007
2209
  */
2008
2210
  async analyzeSamplePage(pageFile, pageNo, model, options) {
2009
2211
  this.logger.debug(
2010
2212
  `[OcrStrategySampler] Analyzing page ${pageNo} for Korean-Hanja mix and language...`
2011
2213
  );
2012
- const base64Image = readFileSync3(pageFile).toString("base64");
2214
+ const base64Image = readFileSync2(pageFile).toString("base64");
2013
2215
  const messages = [
2014
2216
  {
2015
2217
  role: "user",
@@ -2037,18 +2239,27 @@ var OcrStrategySampler = class {
2037
2239
  options.aggregator.track(result.usage);
2038
2240
  }
2039
2241
  const output = result.output;
2242
+ const normalizedLanguages = output.detectedLanguages.map(normalizeToBcp47).filter((tag) => tag !== null);
2040
2243
  this.logger.debug(
2041
- `[OcrStrategySampler] Page ${pageNo}: hasKoreanHanjaMix=${output.hasKoreanHanjaMix}, detectedLanguages=${output.detectedLanguages.join(",")}`
2244
+ `[OcrStrategySampler] Page ${pageNo}: hasKoreanHanjaMix=${output.hasKoreanHanjaMix}, detectedLanguages=${normalizedLanguages.join(",")}`
2042
2245
  );
2043
2246
  return {
2044
2247
  hasKoreanHanjaMix: output.hasKoreanHanjaMix,
2045
- detectedLanguages: output.detectedLanguages
2248
+ detectedLanguages: normalizedLanguages
2046
2249
  };
2047
2250
  }
2251
+ /**
2252
+ * Aggregate language frequency map into a sorted array.
2253
+ * Returns languages sorted by frequency (descending), or undefined if empty.
2254
+ */
2255
+ aggregateLanguages(frequencyMap) {
2256
+ if (frequencyMap.size === 0) return void 0;
2257
+ return [...frequencyMap.entries()].sort((a, b) => b[1] - a[1]).map(([lang]) => lang);
2258
+ }
2048
2259
  };
2049
2260
 
2050
2261
  // src/utils/local-file-server.ts
2051
- import { createReadStream, statSync } from "fs";
2262
+ import { createReadStream as createReadStream2, statSync } from "fs";
2052
2263
  import { createServer } from "http";
2053
2264
  import { basename } from "path";
2054
2265
  var LocalFileServer = class {
@@ -2070,7 +2281,7 @@ var LocalFileServer = class {
2070
2281
  "Content-Type": "application/pdf",
2071
2282
  "Content-Length": stat.size
2072
2283
  });
2073
- createReadStream(filePath).pipe(res);
2284
+ createReadStream2(filePath).pipe(res);
2074
2285
  } else {
2075
2286
  res.writeHead(404);
2076
2287
  res.end("Not Found");
@@ -2355,8 +2566,10 @@ var PDFConverter = class {
2355
2566
  let pageTexts;
2356
2567
  try {
2357
2568
  const resultPath2 = join6(outputDir, "result.json");
2358
- const doc = JSON.parse(readFileSync4(resultPath2, "utf-8"));
2359
- const totalPages = Object.keys(doc.pages).length;
2569
+ const totalPages = await runJqFileJson(
2570
+ ".pages | length",
2571
+ resultPath2
2572
+ );
2360
2573
  const textExtractor = new PdfTextExtractor(this.logger);
2361
2574
  pageTexts = await textExtractor.extractText(pdfPath, totalPages);
2362
2575
  } catch {
@@ -2513,6 +2726,7 @@ var PDFConverter = class {
2513
2726
  const outputDir = join6(cwd, "output", reportId);
2514
2727
  try {
2515
2728
  await this.processConvertedFiles(zipPath, extractDir, outputDir);
2729
+ await this.renderPageImages(url, outputDir);
2516
2730
  if (abortSignal?.aborted) {
2517
2731
  this.logger.info("[PDFConverter] Conversion aborted before callback");
2518
2732
  const error = new Error("PDF conversion was aborted");
@@ -2568,6 +2782,8 @@ var PDFConverter = class {
2568
2782
  framework: "livetext"
2569
2783
  },
2570
2784
  generate_picture_images: true,
2785
+ generate_page_images: false,
2786
+ // Page images are rendered by PageRenderer (ImageMagick) after conversion
2571
2787
  images_scale: 2,
2572
2788
  /**
2573
2789
  * While disabling this option yields the most accurate text extraction for readable PDFs,
@@ -2685,8 +2901,8 @@ var PDFConverter = class {
2685
2901
  const zipPath = join6(process.cwd(), "result.zip");
2686
2902
  this.logger.info("[PDFConverter] Saving ZIP file to:", zipPath);
2687
2903
  if (zipResult.fileStream) {
2688
- const writeStream = createWriteStream2(zipPath);
2689
- await pipeline(zipResult.fileStream, writeStream);
2904
+ const writeStream = createWriteStream3(zipPath);
2905
+ await pipeline3(zipResult.fileStream, writeStream);
2690
2906
  return;
2691
2907
  }
2692
2908
  if (zipResult.data) {
@@ -2716,6 +2932,42 @@ var PDFConverter = class {
2716
2932
  outputDir
2717
2933
  );
2718
2934
  }
2935
+ /**
2936
+ * Render page images from the source PDF using ImageMagick and update result.json.
2937
+ * Uses jq to update the JSON file without loading it into Node.js memory.
2938
+ * Replaces Docling's generate_page_images which fails on large PDFs
2939
+ * due to memory limits when embedding all page images as base64.
2940
+ */
2941
+ async renderPageImages(url, outputDir) {
2942
+ if (!url.startsWith("file://")) {
2943
+ this.logger.warn(
2944
+ "[PDFConverter] Page image rendering skipped: only supported for local files (file:// URLs)"
2945
+ );
2946
+ return;
2947
+ }
2948
+ const pdfPath = url.slice(7);
2949
+ this.logger.info(
2950
+ "[PDFConverter] Rendering page images with ImageMagick..."
2951
+ );
2952
+ const renderer = new PageRenderer(this.logger);
2953
+ const renderResult = await renderer.renderPages(pdfPath, outputDir);
2954
+ const resultPath = join6(outputDir, "result.json");
2955
+ const tmpPath = resultPath + ".tmp";
2956
+ const jqProgram = `
2957
+ .pages |= with_entries(
2958
+ if (.value.page_no - 1) >= 0 and (.value.page_no - 1) < ${renderResult.pageCount} then
2959
+ .value.image.uri = "pages/page_\\(.value.page_no - 1).png" |
2960
+ .value.image.mimetype = "image/png" |
2961
+ .value.image.dpi = ${PAGE_RENDERING.DEFAULT_DPI}
2962
+ else . end
2963
+ )
2964
+ `;
2965
+ await runJqFileToFile(jqProgram, resultPath, tmpPath);
2966
+ await rename2(tmpPath, resultPath);
2967
+ this.logger.info(
2968
+ `[PDFConverter] Rendered ${renderResult.pageCount} page images`
2969
+ );
2970
+ }
2719
2971
  };
2720
2972
 
2721
2973
  // src/core/pdf-parser.ts
@@ -2754,6 +3006,7 @@ var PDFParser = class {
2754
3006
  this.logger.info("[PDFParser] Initializing...");
2755
3007
  this.checkOperatingSystem();
2756
3008
  this.checkJqInstalled();
3009
+ this.checkPopplerInstalled();
2757
3010
  this.checkMacOSVersion();
2758
3011
  if (this.enableImagePdfFallback && !this.baseUrl) {
2759
3012
  this.checkImageMagickInstalled();
@@ -2810,6 +3063,15 @@ var PDFParser = class {
2810
3063
  );
2811
3064
  }
2812
3065
  }
3066
+ checkPopplerInstalled() {
3067
+ try {
3068
+ execSync("which pdftotext", { stdio: "ignore" });
3069
+ } catch {
3070
+ throw new Error(
3071
+ "poppler is not installed. Please install poppler using: brew install poppler"
3072
+ );
3073
+ }
3074
+ }
2813
3075
  checkMacOSVersion() {
2814
3076
  try {
2815
3077
  const versionOutput = execSync("sw_vers -productVersion", {