@heripo/pdf-parser 0.1.9 → 0.1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +379 -153
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +345 -125
- package/dist/index.js.map +1 -1
- package/package.json +5 -5
package/dist/index.cjs
CHANGED
|
@@ -81,11 +81,17 @@ var DOCLING_ENVIRONMENT = {
|
|
|
81
81
|
*/
|
|
82
82
|
STARTUP_DELAY_MS: 2e3
|
|
83
83
|
};
|
|
84
|
+
var PAGE_RENDERING = {
|
|
85
|
+
/** Default rendering DPI for VLM text recognition quality */
|
|
86
|
+
DEFAULT_DPI: 200,
|
|
87
|
+
/** Low-resolution DPI for OCR strategy sampling */
|
|
88
|
+
SAMPLE_DPI: 150
|
|
89
|
+
};
|
|
84
90
|
var IMAGE_PDF_CONVERTER = {
|
|
85
91
|
/**
|
|
86
92
|
* ImageMagick density option (DPI) for PDF to image conversion
|
|
87
93
|
*/
|
|
88
|
-
DENSITY:
|
|
94
|
+
DENSITY: PAGE_RENDERING.DEFAULT_DPI,
|
|
89
95
|
/**
|
|
90
96
|
* ImageMagick quality option (1-100)
|
|
91
97
|
*/
|
|
@@ -869,10 +875,10 @@ var DoclingEnvironment = class _DoclingEnvironment {
|
|
|
869
875
|
|
|
870
876
|
// src/core/pdf-converter.ts
|
|
871
877
|
var import_es_toolkit = require("es-toolkit");
|
|
872
|
-
var
|
|
873
|
-
var
|
|
878
|
+
var import_node_fs8 = require("fs");
|
|
879
|
+
var import_promises4 = require("fs/promises");
|
|
874
880
|
var import_node_path7 = require("path");
|
|
875
|
-
var
|
|
881
|
+
var import_promises5 = require("stream/promises");
|
|
876
882
|
|
|
877
883
|
// src/errors/image-pdf-fallback-error.ts
|
|
878
884
|
var ImagePdfFallbackError = class extends Error {
|
|
@@ -887,12 +893,17 @@ var ImagePdfFallbackError = class extends Error {
|
|
|
887
893
|
};
|
|
888
894
|
|
|
889
895
|
// src/processors/image-extractor.ts
|
|
890
|
-
var
|
|
896
|
+
var import_node_fs2 = require("fs");
|
|
891
897
|
var import_node_path2 = require("path");
|
|
898
|
+
var import_node_stream = require("stream");
|
|
899
|
+
var import_promises3 = require("stream/promises");
|
|
892
900
|
var yauzl = __toESM(require("yauzl"), 1);
|
|
893
901
|
|
|
894
902
|
// src/utils/jq.ts
|
|
895
903
|
var import_node_child_process2 = require("child_process");
|
|
904
|
+
var import_node_fs = require("fs");
|
|
905
|
+
var import_promises = require("fs/promises");
|
|
906
|
+
var import_promises2 = require("stream/promises");
|
|
896
907
|
function getJqPath() {
|
|
897
908
|
const p = process.env.JQ_PATH?.trim();
|
|
898
909
|
return p && p.length > 0 ? p : "jq";
|
|
@@ -944,25 +955,139 @@ function runJqFileJson(program, filePath) {
|
|
|
944
955
|
});
|
|
945
956
|
});
|
|
946
957
|
}
|
|
947
|
-
function
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
]
|
|
953
|
-
|
|
954
|
-
|
|
958
|
+
function runJqFileToFile(program, inputPath, outputPath) {
|
|
959
|
+
return new Promise((resolve, reject) => {
|
|
960
|
+
const jqPath = getJqPath();
|
|
961
|
+
const args = [program, inputPath];
|
|
962
|
+
const child = (0, import_node_child_process2.spawn)(jqPath, args, {
|
|
963
|
+
stdio: ["ignore", "pipe", "pipe"],
|
|
964
|
+
env: process.env
|
|
965
|
+
});
|
|
966
|
+
let stderr = "";
|
|
967
|
+
let exitCode = null;
|
|
968
|
+
let pipelineDone = false;
|
|
969
|
+
let settled = false;
|
|
970
|
+
child.stderr.setEncoding("utf-8");
|
|
971
|
+
child.stderr.on("data", (chunk) => {
|
|
972
|
+
stderr += chunk;
|
|
973
|
+
});
|
|
974
|
+
const ws = (0, import_node_fs.createWriteStream)(outputPath);
|
|
975
|
+
function trySettle() {
|
|
976
|
+
if (settled) return;
|
|
977
|
+
if (!pipelineDone || exitCode === null) return;
|
|
978
|
+
settled = true;
|
|
979
|
+
if (exitCode !== 0) {
|
|
980
|
+
reject(
|
|
981
|
+
new Error(
|
|
982
|
+
`jq exited with code ${exitCode}. ${stderr ? "Stderr: " + stderr : ""}`
|
|
983
|
+
)
|
|
984
|
+
);
|
|
985
|
+
} else {
|
|
986
|
+
resolve();
|
|
987
|
+
}
|
|
988
|
+
}
|
|
989
|
+
child.on("error", (err) => {
|
|
990
|
+
if (settled) return;
|
|
991
|
+
settled = true;
|
|
992
|
+
ws.destroy();
|
|
993
|
+
reject(err);
|
|
994
|
+
});
|
|
995
|
+
(0, import_promises2.pipeline)(child.stdout, ws).then(() => {
|
|
996
|
+
pipelineDone = true;
|
|
997
|
+
trySettle();
|
|
998
|
+
}).catch((err) => {
|
|
999
|
+
if (settled) return;
|
|
1000
|
+
settled = true;
|
|
1001
|
+
reject(err);
|
|
1002
|
+
});
|
|
1003
|
+
child.on("close", (code) => {
|
|
1004
|
+
exitCode = code ?? 1;
|
|
1005
|
+
trySettle();
|
|
1006
|
+
});
|
|
1007
|
+
});
|
|
955
1008
|
}
|
|
956
|
-
function
|
|
1009
|
+
function runJqFileLines(program, filePath, onLine) {
|
|
1010
|
+
return new Promise((resolve, reject) => {
|
|
1011
|
+
const jqPath = getJqPath();
|
|
1012
|
+
const args = ["-r", program, filePath];
|
|
1013
|
+
const child = (0, import_node_child_process2.spawn)(jqPath, args, {
|
|
1014
|
+
stdio: ["ignore", "pipe", "pipe"],
|
|
1015
|
+
env: process.env
|
|
1016
|
+
});
|
|
1017
|
+
let stderr = "";
|
|
1018
|
+
let buffer = "";
|
|
1019
|
+
let callbackError = false;
|
|
1020
|
+
child.stdout.setEncoding("utf-8");
|
|
1021
|
+
child.stderr.setEncoding("utf-8");
|
|
1022
|
+
function safeOnLine(line) {
|
|
1023
|
+
if (callbackError) return;
|
|
1024
|
+
try {
|
|
1025
|
+
onLine(line);
|
|
1026
|
+
} catch (err) {
|
|
1027
|
+
callbackError = true;
|
|
1028
|
+
child.kill();
|
|
1029
|
+
reject(err);
|
|
1030
|
+
}
|
|
1031
|
+
}
|
|
1032
|
+
child.stdout.on("data", (chunk) => {
|
|
1033
|
+
buffer += chunk;
|
|
1034
|
+
let newlineIdx;
|
|
1035
|
+
while ((newlineIdx = buffer.indexOf("\n")) !== -1) {
|
|
1036
|
+
const line = buffer.slice(0, newlineIdx);
|
|
1037
|
+
buffer = buffer.slice(newlineIdx + 1);
|
|
1038
|
+
if (line.length > 0) {
|
|
1039
|
+
safeOnLine(line);
|
|
1040
|
+
}
|
|
1041
|
+
}
|
|
1042
|
+
});
|
|
1043
|
+
child.stderr.on("data", (chunk) => {
|
|
1044
|
+
stderr += chunk;
|
|
1045
|
+
});
|
|
1046
|
+
child.on("error", (err) => {
|
|
1047
|
+
if (!callbackError) reject(err);
|
|
1048
|
+
});
|
|
1049
|
+
child.on("close", (code) => {
|
|
1050
|
+
if (callbackError) return;
|
|
1051
|
+
if (buffer.length > 0) {
|
|
1052
|
+
safeOnLine(buffer);
|
|
1053
|
+
}
|
|
1054
|
+
if (callbackError) return;
|
|
1055
|
+
if (code !== 0) {
|
|
1056
|
+
reject(
|
|
1057
|
+
new Error(
|
|
1058
|
+
`jq exited with code ${code}. ${stderr ? "Stderr: " + stderr : ""}`
|
|
1059
|
+
)
|
|
1060
|
+
);
|
|
1061
|
+
} else {
|
|
1062
|
+
resolve();
|
|
1063
|
+
}
|
|
1064
|
+
});
|
|
1065
|
+
});
|
|
1066
|
+
}
|
|
1067
|
+
async function jqExtractBase64PngStringsStreaming(filePath, onImage) {
|
|
1068
|
+
let index = 0;
|
|
1069
|
+
await runJqFileLines(
|
|
1070
|
+
'.. | select(type == "string" and startswith("data:image/png;base64"))',
|
|
1071
|
+
filePath,
|
|
1072
|
+
(line) => {
|
|
1073
|
+
onImage(line, index);
|
|
1074
|
+
index++;
|
|
1075
|
+
}
|
|
1076
|
+
);
|
|
1077
|
+
return index;
|
|
1078
|
+
}
|
|
1079
|
+
async function jqReplaceBase64WithPathsToFile(inputPath, outputPath, dirName, prefix) {
|
|
957
1080
|
const program = `
|
|
958
1081
|
reduce paths(type == "string" and startswith("data:image/png;base64")) as $p (
|
|
959
1082
|
{data: ., counter: 0};
|
|
960
1083
|
.counter as $idx |
|
|
961
1084
|
.data |= setpath($p; "${dirName}/${prefix}_\\($idx).png") |
|
|
962
1085
|
.counter += 1
|
|
963
|
-
) |
|
|
1086
|
+
) | .data
|
|
964
1087
|
`;
|
|
965
|
-
|
|
1088
|
+
const tmpPath = outputPath + ".tmp";
|
|
1089
|
+
await runJqFileToFile(program, inputPath, tmpPath);
|
|
1090
|
+
await (0, import_promises.rename)(tmpPath, outputPath);
|
|
966
1091
|
}
|
|
967
1092
|
|
|
968
1093
|
// src/processors/image-extractor.ts
|
|
@@ -981,7 +1106,7 @@ var ImageExtractor = class _ImageExtractor {
|
|
|
981
1106
|
zipfile.on("entry", (entry) => {
|
|
982
1107
|
const entryPath = (0, import_node_path2.join)(targetDir, entry.fileName);
|
|
983
1108
|
if (/\/$/.test(entry.fileName)) {
|
|
984
|
-
(0,
|
|
1109
|
+
(0, import_node_fs2.mkdirSync)(entryPath, { recursive: true });
|
|
985
1110
|
zipfile.readEntry();
|
|
986
1111
|
} else {
|
|
987
1112
|
zipfile.openReadStream(entry, (err2, readStream) => {
|
|
@@ -989,8 +1114,8 @@ var ImageExtractor = class _ImageExtractor {
|
|
|
989
1114
|
reject(err2 || new Error("Failed to open read stream"));
|
|
990
1115
|
return;
|
|
991
1116
|
}
|
|
992
|
-
(0,
|
|
993
|
-
const writeStream = (0,
|
|
1117
|
+
(0, import_node_fs2.mkdirSync)((0, import_node_path2.join)(entryPath, ".."), { recursive: true });
|
|
1118
|
+
const writeStream = (0, import_node_fs2.createWriteStream)(entryPath);
|
|
994
1119
|
readStream.pipe(writeStream);
|
|
995
1120
|
writeStream.on("finish", () => {
|
|
996
1121
|
zipfile.readEntry();
|
|
@@ -1006,26 +1131,6 @@ var ImageExtractor = class _ImageExtractor {
|
|
|
1006
1131
|
});
|
|
1007
1132
|
});
|
|
1008
1133
|
}
|
|
1009
|
-
/**
|
|
1010
|
-
* Extract base64 images from JSON file using jq (for large files)
|
|
1011
|
-
* Returns array of base64 data strings
|
|
1012
|
-
*/
|
|
1013
|
-
static async extractBase64ImagesFromJsonWithJq(jsonPath) {
|
|
1014
|
-
return jqExtractBase64PngStrings(jsonPath);
|
|
1015
|
-
}
|
|
1016
|
-
/**
|
|
1017
|
-
* Replace base64 images with file paths in JSON using jq (for large files)
|
|
1018
|
-
* Uses reduce to maintain counter state while walking the JSON
|
|
1019
|
-
*/
|
|
1020
|
-
static async replaceBase64ImagesInJsonWithJq(jsonPath, outputPath, dirName, prefix) {
|
|
1021
|
-
const { data, count } = await jqReplaceBase64WithPaths(
|
|
1022
|
-
jsonPath,
|
|
1023
|
-
dirName,
|
|
1024
|
-
prefix
|
|
1025
|
-
);
|
|
1026
|
-
(0, import_node_fs.writeFileSync)(outputPath, JSON.stringify(data, null, 2), "utf-8");
|
|
1027
|
-
return count;
|
|
1028
|
-
}
|
|
1029
1134
|
/**
|
|
1030
1135
|
* Extract a base64-encoded image to a file and return the relative path
|
|
1031
1136
|
*/
|
|
@@ -1035,12 +1140,70 @@ var ImageExtractor = class _ImageExtractor {
|
|
|
1035
1140
|
const filename = `${prefix}_${index}.png`;
|
|
1036
1141
|
const filepath = (0, import_node_path2.join)(imagesDir, filename);
|
|
1037
1142
|
const buffer = Buffer.from(base64Content, "base64");
|
|
1038
|
-
(0,
|
|
1143
|
+
(0, import_node_fs2.writeFileSync)(filepath, buffer);
|
|
1039
1144
|
return `${dirName}/${filename}`;
|
|
1040
1145
|
}
|
|
1041
1146
|
/**
|
|
1042
|
-
*
|
|
1043
|
-
*
|
|
1147
|
+
* Extract base64 images from HTML using streaming.
|
|
1148
|
+
* Reads HTML file as a stream, extracts base64 images from src attributes,
|
|
1149
|
+
* saves them as PNG files, and replaces with file paths in the output HTML.
|
|
1150
|
+
* Returns the number of images extracted.
|
|
1151
|
+
*/
|
|
1152
|
+
static async extractImagesFromHtmlStream(htmlInputPath, htmlOutputPath, imagesDir) {
|
|
1153
|
+
let imageIndex = 0;
|
|
1154
|
+
let pending = "";
|
|
1155
|
+
const MARKER = 'src="data:image/png;base64,';
|
|
1156
|
+
const transform = new import_node_stream.Transform({
|
|
1157
|
+
decodeStrings: false,
|
|
1158
|
+
encoding: "utf-8",
|
|
1159
|
+
transform(chunk, _encoding, callback) {
|
|
1160
|
+
pending += chunk;
|
|
1161
|
+
let result = "";
|
|
1162
|
+
while (true) {
|
|
1163
|
+
const markerIdx = pending.indexOf(MARKER);
|
|
1164
|
+
if (markerIdx === -1) {
|
|
1165
|
+
const safeEnd = Math.max(0, pending.length - MARKER.length);
|
|
1166
|
+
result += pending.slice(0, safeEnd);
|
|
1167
|
+
pending = pending.slice(safeEnd);
|
|
1168
|
+
break;
|
|
1169
|
+
}
|
|
1170
|
+
result += pending.slice(0, markerIdx);
|
|
1171
|
+
const dataStart = markerIdx + MARKER.length;
|
|
1172
|
+
const quoteIdx = pending.indexOf('"', dataStart);
|
|
1173
|
+
if (quoteIdx === -1) {
|
|
1174
|
+
pending = pending.slice(markerIdx);
|
|
1175
|
+
break;
|
|
1176
|
+
}
|
|
1177
|
+
const base64Content = pending.slice(dataStart, quoteIdx);
|
|
1178
|
+
const filename = `image_${imageIndex}.png`;
|
|
1179
|
+
const filepath = (0, import_node_path2.join)(imagesDir, filename);
|
|
1180
|
+
const buf = Buffer.from(base64Content, "base64");
|
|
1181
|
+
(0, import_node_fs2.writeFileSync)(filepath, buf);
|
|
1182
|
+
const relativePath = `images/${filename}`;
|
|
1183
|
+
result += `src="${relativePath}"`;
|
|
1184
|
+
imageIndex++;
|
|
1185
|
+
pending = pending.slice(quoteIdx + 1);
|
|
1186
|
+
}
|
|
1187
|
+
if (result.length > 0) {
|
|
1188
|
+
this.push(result);
|
|
1189
|
+
}
|
|
1190
|
+
callback();
|
|
1191
|
+
},
|
|
1192
|
+
flush(callback) {
|
|
1193
|
+
if (pending.length > 0) {
|
|
1194
|
+
this.push(pending);
|
|
1195
|
+
}
|
|
1196
|
+
callback();
|
|
1197
|
+
}
|
|
1198
|
+
});
|
|
1199
|
+
const rs = (0, import_node_fs2.createReadStream)(htmlInputPath, { encoding: "utf-8" });
|
|
1200
|
+
const ws = (0, import_node_fs2.createWriteStream)(htmlOutputPath, { encoding: "utf-8" });
|
|
1201
|
+
await (0, import_promises3.pipeline)(rs, transform, ws);
|
|
1202
|
+
return imageIndex;
|
|
1203
|
+
}
|
|
1204
|
+
/**
|
|
1205
|
+
* Save JSON and HTML documents with base64 images extracted to separate files.
|
|
1206
|
+
* Uses jq for JSON processing and streaming for HTML to handle large files.
|
|
1044
1207
|
*
|
|
1045
1208
|
* This method:
|
|
1046
1209
|
* 1. Extracts base64-encoded images from JSON and HTML content
|
|
@@ -1048,43 +1211,45 @@ var ImageExtractor = class _ImageExtractor {
|
|
|
1048
1211
|
* 3. Replaces base64 data with relative file paths
|
|
1049
1212
|
* 4. Saves the transformed documents to the output directory
|
|
1050
1213
|
*/
|
|
1051
|
-
static async saveDocumentsWithExtractedImages(logger, outputDir, filename, jsonSourcePath,
|
|
1214
|
+
static async saveDocumentsWithExtractedImages(logger, outputDir, filename, jsonSourcePath, htmlSourcePath) {
|
|
1052
1215
|
try {
|
|
1053
|
-
if ((0,
|
|
1054
|
-
(0,
|
|
1216
|
+
if ((0, import_node_fs2.existsSync)(outputDir)) {
|
|
1217
|
+
(0, import_node_fs2.rmSync)(outputDir, { recursive: true, force: true });
|
|
1055
1218
|
}
|
|
1056
1219
|
} catch (e) {
|
|
1057
1220
|
logger.warn("[PDFConverter] Failed to clear output directory:", e);
|
|
1058
1221
|
}
|
|
1059
|
-
(0,
|
|
1222
|
+
(0, import_node_fs2.mkdirSync)(outputDir, { recursive: true });
|
|
1060
1223
|
const baseName = filename.replace((0, import_node_path2.extname)(filename), "");
|
|
1061
1224
|
const jsonPath = (0, import_node_path2.join)(outputDir, `${baseName}.json`);
|
|
1062
1225
|
try {
|
|
1063
1226
|
const imagesDir = (0, import_node_path2.join)(outputDir, "images");
|
|
1064
|
-
if (!(0,
|
|
1065
|
-
(0,
|
|
1066
|
-
}
|
|
1067
|
-
const
|
|
1068
|
-
|
|
1069
|
-
|
|
1070
|
-
|
|
1071
|
-
|
|
1072
|
-
|
|
1073
|
-
|
|
1074
|
-
|
|
1075
|
-
|
|
1076
|
-
|
|
1227
|
+
if (!(0, import_node_fs2.existsSync)(imagesDir)) {
|
|
1228
|
+
(0, import_node_fs2.mkdirSync)(imagesDir, { recursive: true });
|
|
1229
|
+
}
|
|
1230
|
+
const imageCount = await jqExtractBase64PngStringsStreaming(
|
|
1231
|
+
jsonSourcePath,
|
|
1232
|
+
(base64Data, index) => {
|
|
1233
|
+
_ImageExtractor.extractBase64ImageToFile(
|
|
1234
|
+
base64Data,
|
|
1235
|
+
imagesDir,
|
|
1236
|
+
index,
|
|
1237
|
+
"pic",
|
|
1238
|
+
"images"
|
|
1239
|
+
);
|
|
1240
|
+
}
|
|
1241
|
+
);
|
|
1077
1242
|
logger.info(
|
|
1078
|
-
`[PDFConverter] Extracted ${
|
|
1243
|
+
`[PDFConverter] Extracted ${imageCount} picture images from JSON to ${imagesDir}`
|
|
1079
1244
|
);
|
|
1080
|
-
|
|
1245
|
+
await jqReplaceBase64WithPathsToFile(
|
|
1081
1246
|
jsonSourcePath,
|
|
1082
1247
|
jsonPath,
|
|
1083
1248
|
"images",
|
|
1084
1249
|
"pic"
|
|
1085
1250
|
);
|
|
1086
1251
|
logger.info(
|
|
1087
|
-
`[PDFConverter] Replaced ${
|
|
1252
|
+
`[PDFConverter] Replaced ${imageCount} base64 images with file paths`
|
|
1088
1253
|
);
|
|
1089
1254
|
} catch (e) {
|
|
1090
1255
|
logger.warn(
|
|
@@ -1097,51 +1262,45 @@ var ImageExtractor = class _ImageExtractor {
|
|
|
1097
1262
|
const htmlPath = (0, import_node_path2.join)(outputDir, `${baseName}.html`);
|
|
1098
1263
|
try {
|
|
1099
1264
|
const imagesDir = (0, import_node_path2.join)(outputDir, "images");
|
|
1100
|
-
if (!(0,
|
|
1101
|
-
(0,
|
|
1102
|
-
}
|
|
1103
|
-
|
|
1104
|
-
|
|
1105
|
-
|
|
1106
|
-
|
|
1107
|
-
const filename2 = `image_${imageIndex}.png`;
|
|
1108
|
-
const filepath = (0, import_node_path2.join)(imagesDir, filename2);
|
|
1109
|
-
const buffer = Buffer.from(base64Content, "base64");
|
|
1110
|
-
(0, import_node_fs.writeFileSync)(filepath, buffer);
|
|
1111
|
-
const relativePath = `images/${filename2}`;
|
|
1112
|
-
imageIndex += 1;
|
|
1113
|
-
return `src="${relativePath}"`;
|
|
1114
|
-
}
|
|
1265
|
+
if (!(0, import_node_fs2.existsSync)(imagesDir)) {
|
|
1266
|
+
(0, import_node_fs2.mkdirSync)(imagesDir, { recursive: true });
|
|
1267
|
+
}
|
|
1268
|
+
const htmlImageCount = await _ImageExtractor.extractImagesFromHtmlStream(
|
|
1269
|
+
htmlSourcePath,
|
|
1270
|
+
htmlPath,
|
|
1271
|
+
imagesDir
|
|
1115
1272
|
);
|
|
1116
1273
|
logger.info(
|
|
1117
|
-
`[PDFConverter] Extracted ${
|
|
1274
|
+
`[PDFConverter] Extracted ${htmlImageCount} images from HTML to ${imagesDir}`
|
|
1118
1275
|
);
|
|
1119
|
-
(0, import_node_fs.writeFileSync)(htmlPath, transformedHtml, "utf-8");
|
|
1120
1276
|
} catch (e) {
|
|
1121
1277
|
logger.warn(
|
|
1122
|
-
"[PDFConverter] Failed to extract images from HTML,
|
|
1278
|
+
"[PDFConverter] Failed to extract images from HTML, copying original. Error:",
|
|
1123
1279
|
e
|
|
1124
1280
|
);
|
|
1125
|
-
(0,
|
|
1281
|
+
const rs = (0, import_node_fs2.createReadStream)(htmlSourcePath);
|
|
1282
|
+
const ws = (0, import_node_fs2.createWriteStream)(htmlPath);
|
|
1283
|
+
await (0, import_promises3.pipeline)(rs, ws);
|
|
1126
1284
|
}
|
|
1127
1285
|
logger.info("[PDFConverter] Saved HTML:", htmlPath);
|
|
1128
1286
|
}
|
|
1129
1287
|
/**
|
|
1130
1288
|
* Extract documents from ZIP and save with extracted images
|
|
1131
|
-
* Uses jq for JSON processing to handle large files
|
|
1289
|
+
* Uses jq for JSON processing and streaming for HTML to handle large files
|
|
1290
|
+
* without loading into Node.js memory
|
|
1132
1291
|
*
|
|
1133
1292
|
* Complete workflow:
|
|
1134
1293
|
* 1. Extract ZIP file to temporary directory
|
|
1135
1294
|
* 2. Find JSON and HTML files from extracted files
|
|
1136
|
-
* 3. Use jq to extract base64 images from JSON and save as separate files
|
|
1137
|
-
* 4. Use jq to replace base64 with file paths in JSON
|
|
1138
|
-
* 5. Process HTML with
|
|
1295
|
+
* 3. Use jq to stream-extract base64 images from JSON and save as separate files
|
|
1296
|
+
* 4. Use jq to replace base64 with file paths in JSON (piped to file)
|
|
1297
|
+
* 5. Process HTML with streaming Transform to extract and replace images
|
|
1139
1298
|
* 6. Save transformed documents to output directory (as result.json and result.html)
|
|
1140
1299
|
*/
|
|
1141
1300
|
static async extractAndSaveDocumentsFromZip(logger, zipPath, extractDir, outputDir) {
|
|
1142
1301
|
logger.info("[PDFConverter] Extracting ZIP file...");
|
|
1143
1302
|
await _ImageExtractor.extractZip(zipPath, extractDir);
|
|
1144
|
-
const files = (0,
|
|
1303
|
+
const files = (0, import_node_fs2.readdirSync)(extractDir);
|
|
1145
1304
|
const jsonFile = files.find((f) => (0, import_node_path2.extname)(f).toLowerCase() === ".json");
|
|
1146
1305
|
const htmlFile = files.find((f) => (0, import_node_path2.extname)(f).toLowerCase() === ".html");
|
|
1147
1306
|
if (!jsonFile || !htmlFile) {
|
|
@@ -1151,61 +1310,99 @@ var ImageExtractor = class _ImageExtractor {
|
|
|
1151
1310
|
}
|
|
1152
1311
|
const jsonPath = (0, import_node_path2.join)(extractDir, jsonFile);
|
|
1153
1312
|
const htmlPath = (0, import_node_path2.join)(extractDir, htmlFile);
|
|
1154
|
-
const htmlContent = (0, import_node_fs.readFileSync)(htmlPath, "utf-8");
|
|
1155
1313
|
logger.info("[PDFConverter] Saving converted files to output...");
|
|
1156
1314
|
await _ImageExtractor.saveDocumentsWithExtractedImages(
|
|
1157
1315
|
logger,
|
|
1158
1316
|
outputDir,
|
|
1159
1317
|
"result",
|
|
1160
1318
|
jsonPath,
|
|
1161
|
-
|
|
1319
|
+
htmlPath
|
|
1162
1320
|
);
|
|
1163
1321
|
logger.info("[PDFConverter] Files saved to:", outputDir);
|
|
1164
1322
|
}
|
|
1165
1323
|
};
|
|
1166
1324
|
|
|
1167
1325
|
// src/processors/page-renderer.ts
|
|
1168
|
-
var
|
|
1326
|
+
var import_node_fs3 = require("fs");
|
|
1169
1327
|
var import_node_path3 = require("path");
|
|
1170
|
-
var
|
|
1328
|
+
var PROGRESS_LOG_PERCENT_STEP = 10;
|
|
1171
1329
|
var PageRenderer = class {
|
|
1172
1330
|
constructor(logger) {
|
|
1173
1331
|
this.logger = logger;
|
|
1174
1332
|
}
|
|
1333
|
+
lastLoggedPercent = 0;
|
|
1175
1334
|
/**
|
|
1176
1335
|
* Render all pages of a PDF to individual PNG files.
|
|
1177
1336
|
*
|
|
1337
|
+
* Uses per-page rendering (`magick 'input.pdf[N]'`) when page count is known,
|
|
1338
|
+
* limiting peak memory to ~15MB/page instead of loading all pages at once.
|
|
1339
|
+
*
|
|
1178
1340
|
* @param pdfPath - Absolute path to the source PDF file
|
|
1179
1341
|
* @param outputDir - Directory where pages/ subdirectory will be created
|
|
1180
1342
|
* @param options - Rendering options
|
|
1181
1343
|
* @returns Render result with page count and file paths
|
|
1182
1344
|
*/
|
|
1183
1345
|
async renderPages(pdfPath, outputDir, options) {
|
|
1184
|
-
const dpi = options?.dpi ?? DEFAULT_DPI;
|
|
1346
|
+
const dpi = options?.dpi ?? PAGE_RENDERING.DEFAULT_DPI;
|
|
1185
1347
|
const pagesDir = (0, import_node_path3.join)(outputDir, "pages");
|
|
1186
|
-
if (!(0,
|
|
1187
|
-
(0,
|
|
1348
|
+
if (!(0, import_node_fs3.existsSync)(pagesDir)) {
|
|
1349
|
+
(0, import_node_fs3.mkdirSync)(pagesDir, { recursive: true });
|
|
1188
1350
|
}
|
|
1189
|
-
|
|
1190
|
-
|
|
1191
|
-
|
|
1192
|
-
|
|
1193
|
-
dpi.toString(),
|
|
1194
|
-
pdfPath,
|
|
1195
|
-
"-background",
|
|
1196
|
-
"white",
|
|
1197
|
-
"-alpha",
|
|
1198
|
-
"remove",
|
|
1199
|
-
"-alpha",
|
|
1200
|
-
"off",
|
|
1201
|
-
outputPattern
|
|
1202
|
-
]);
|
|
1203
|
-
if (result.code !== 0) {
|
|
1204
|
-
throw new Error(
|
|
1205
|
-
`[PageRenderer] Failed to render PDF pages: ${result.stderr || "Unknown error"}`
|
|
1351
|
+
const totalPages = await this.getPageCount(pdfPath);
|
|
1352
|
+
if (totalPages > 0) {
|
|
1353
|
+
this.logger.info(
|
|
1354
|
+
`[PageRenderer] Rendering ${totalPages} pages at ${dpi} DPI...`
|
|
1206
1355
|
);
|
|
1356
|
+
this.lastLoggedPercent = 0;
|
|
1357
|
+
for (let i = 0; i < totalPages; i++) {
|
|
1358
|
+
const result = await spawnAsync(
|
|
1359
|
+
"magick",
|
|
1360
|
+
[
|
|
1361
|
+
"-density",
|
|
1362
|
+
dpi.toString(),
|
|
1363
|
+
`${pdfPath}[${i}]`,
|
|
1364
|
+
"-background",
|
|
1365
|
+
"white",
|
|
1366
|
+
"-alpha",
|
|
1367
|
+
"remove",
|
|
1368
|
+
"-alpha",
|
|
1369
|
+
"off",
|
|
1370
|
+
(0, import_node_path3.join)(pagesDir, `page_${i}.png`)
|
|
1371
|
+
],
|
|
1372
|
+
{ captureStdout: false }
|
|
1373
|
+
);
|
|
1374
|
+
if (result.code !== 0) {
|
|
1375
|
+
throw new Error(
|
|
1376
|
+
`[PageRenderer] Failed to render page ${i + 1}/${totalPages}: ${result.stderr || "Unknown error"}`
|
|
1377
|
+
);
|
|
1378
|
+
}
|
|
1379
|
+
this.logProgress(i + 1, totalPages);
|
|
1380
|
+
}
|
|
1381
|
+
} else {
|
|
1382
|
+
this.logger.info(`[PageRenderer] Rendering PDF at ${dpi} DPI...`);
|
|
1383
|
+
const result = await spawnAsync(
|
|
1384
|
+
"magick",
|
|
1385
|
+
[
|
|
1386
|
+
"-density",
|
|
1387
|
+
dpi.toString(),
|
|
1388
|
+
pdfPath,
|
|
1389
|
+
"-background",
|
|
1390
|
+
"white",
|
|
1391
|
+
"-alpha",
|
|
1392
|
+
"remove",
|
|
1393
|
+
"-alpha",
|
|
1394
|
+
"off",
|
|
1395
|
+
(0, import_node_path3.join)(pagesDir, "page_%d.png")
|
|
1396
|
+
],
|
|
1397
|
+
{ captureStdout: false }
|
|
1398
|
+
);
|
|
1399
|
+
if (result.code !== 0) {
|
|
1400
|
+
throw new Error(
|
|
1401
|
+
`[PageRenderer] Failed to render PDF pages: ${result.stderr || "Unknown error"}`
|
|
1402
|
+
);
|
|
1403
|
+
}
|
|
1207
1404
|
}
|
|
1208
|
-
const pageFiles = (0,
|
|
1405
|
+
const pageFiles = (0, import_node_fs3.readdirSync)(pagesDir).filter((f) => f.startsWith("page_") && f.endsWith(".png")).sort((a, b) => {
|
|
1209
1406
|
const numA = parseInt(a.replace("page_", "").replace(".png", ""), 10);
|
|
1210
1407
|
const numB = parseInt(b.replace("page_", "").replace(".png", ""), 10);
|
|
1211
1408
|
return numA - numB;
|
|
@@ -1219,6 +1416,32 @@ var PageRenderer = class {
|
|
|
1219
1416
|
pageFiles
|
|
1220
1417
|
};
|
|
1221
1418
|
}
|
|
1419
|
+
/**
|
|
1420
|
+
* Log rendering progress at appropriate intervals (every 10%).
|
|
1421
|
+
*/
|
|
1422
|
+
logProgress(current, total) {
|
|
1423
|
+
const percent = Math.floor(current / total * 100);
|
|
1424
|
+
if (percent >= this.lastLoggedPercent + PROGRESS_LOG_PERCENT_STEP || current === total) {
|
|
1425
|
+
this.lastLoggedPercent = percent;
|
|
1426
|
+
this.logger.info(
|
|
1427
|
+
`[PageRenderer] Rendering pages: ${current}/${total} (${percent}%)`
|
|
1428
|
+
);
|
|
1429
|
+
}
|
|
1430
|
+
}
|
|
1431
|
+
/**
|
|
1432
|
+
* Get total page count using pdfinfo.
|
|
1433
|
+
* Returns 0 on failure (progress logging will be skipped).
|
|
1434
|
+
*/
|
|
1435
|
+
async getPageCount(pdfPath) {
|
|
1436
|
+
try {
|
|
1437
|
+
const result = await spawnAsync("pdfinfo", [pdfPath]);
|
|
1438
|
+
if (result.code !== 0) return 0;
|
|
1439
|
+
const match = result.stdout.match(/^Pages:\s+(\d+)/m);
|
|
1440
|
+
return match ? parseInt(match[1], 10) : 0;
|
|
1441
|
+
} catch {
|
|
1442
|
+
return 0;
|
|
1443
|
+
}
|
|
1444
|
+
}
|
|
1222
1445
|
};
|
|
1223
1446
|
|
|
1224
1447
|
// src/processors/pdf-text-extractor.ts
|
|
@@ -1304,7 +1527,7 @@ var PdfTextExtractor = class {
|
|
|
1304
1527
|
};
|
|
1305
1528
|
|
|
1306
1529
|
// src/processors/vlm-text-corrector.ts
|
|
1307
|
-
var
|
|
1530
|
+
var import_node_fs4 = require("fs");
|
|
1308
1531
|
var import_node_path4 = require("path");
|
|
1309
1532
|
|
|
1310
1533
|
// src/types/vlm-text-correction-schema.ts
|
|
@@ -1436,7 +1659,7 @@ var VlmTextCorrector = class {
|
|
|
1436
1659
|
async correctAndSave(outputDir, model, options) {
|
|
1437
1660
|
this.logger.info("[VlmTextCorrector] Starting text correction...");
|
|
1438
1661
|
const resultPath = (0, import_node_path4.join)(outputDir, "result.json");
|
|
1439
|
-
const doc = JSON.parse((0,
|
|
1662
|
+
const doc = JSON.parse((0, import_node_fs4.readFileSync)(resultPath, "utf-8"));
|
|
1440
1663
|
let pageNumbers = this.getPageNumbers(doc);
|
|
1441
1664
|
if (pageNumbers.length === 0) {
|
|
1442
1665
|
this.logger.info("[VlmTextCorrector] No pages to process");
|
|
@@ -1487,7 +1710,7 @@ var VlmTextCorrector = class {
|
|
|
1487
1710
|
if (corrections === null) continue;
|
|
1488
1711
|
this.applyCorrections(doc, pageNumbers[i], corrections);
|
|
1489
1712
|
}
|
|
1490
|
-
(0,
|
|
1713
|
+
(0, import_node_fs4.writeFileSync)(resultPath, JSON.stringify(doc, null, 2));
|
|
1491
1714
|
this.logger.info(
|
|
1492
1715
|
`[VlmTextCorrector] Correction complete: ${totalTextCorrections} text, ${totalCellCorrections} cell corrections across ${pageNumbers.length} pages (${pagesFailed} failed)`
|
|
1493
1716
|
);
|
|
@@ -1763,7 +1986,7 @@ var VlmTextCorrector = class {
|
|
|
1763
1986
|
*/
|
|
1764
1987
|
readPageImage(outputDir, pageNo) {
|
|
1765
1988
|
const imagePath = (0, import_node_path4.join)(outputDir, "pages", `page_${pageNo - 1}.png`);
|
|
1766
|
-
return (0,
|
|
1989
|
+
return (0, import_node_fs4.readFileSync)(imagePath).toString("base64");
|
|
1767
1990
|
}
|
|
1768
1991
|
/**
|
|
1769
1992
|
* Apply VLM corrections to the DoclingDocument.
|
|
@@ -1818,9 +2041,8 @@ var VlmTextCorrector = class {
|
|
|
1818
2041
|
|
|
1819
2042
|
// src/samplers/ocr-strategy-sampler.ts
|
|
1820
2043
|
var import_model = require("@heripo/model");
|
|
1821
|
-
var
|
|
2044
|
+
var import_node_fs5 = require("fs");
|
|
1822
2045
|
var import_v42 = require("zod/v4");
|
|
1823
|
-
var SAMPLE_DPI = 150;
|
|
1824
2046
|
var EDGE_TRIM_RATIO = 0.1;
|
|
1825
2047
|
var DEFAULT_MAX_SAMPLE_PAGES = 15;
|
|
1826
2048
|
var DEFAULT_MAX_RETRIES2 = 3;
|
|
@@ -1871,7 +2093,7 @@ var OcrStrategySampler = class {
|
|
|
1871
2093
|
const renderResult = await this.pageRenderer.renderPages(
|
|
1872
2094
|
pdfPath,
|
|
1873
2095
|
outputDir,
|
|
1874
|
-
{ dpi: SAMPLE_DPI }
|
|
2096
|
+
{ dpi: PAGE_RENDERING.SAMPLE_DPI }
|
|
1875
2097
|
);
|
|
1876
2098
|
if (renderResult.pageCount === 0) {
|
|
1877
2099
|
this.logger.info("[OcrStrategySampler] No pages found in PDF");
|
|
@@ -2033,7 +2255,7 @@ var OcrStrategySampler = class {
|
|
|
2033
2255
|
this.logger.debug(
|
|
2034
2256
|
`[OcrStrategySampler] Analyzing page ${pageNo} for Korean-Hanja mix and language...`
|
|
2035
2257
|
);
|
|
2036
|
-
const base64Image = (0,
|
|
2258
|
+
const base64Image = (0, import_node_fs5.readFileSync)(pageFile).toString("base64");
|
|
2037
2259
|
const messages = [
|
|
2038
2260
|
{
|
|
2039
2261
|
role: "user",
|
|
@@ -2081,7 +2303,7 @@ var OcrStrategySampler = class {
|
|
|
2081
2303
|
};
|
|
2082
2304
|
|
|
2083
2305
|
// src/utils/local-file-server.ts
|
|
2084
|
-
var
|
|
2306
|
+
var import_node_fs6 = require("fs");
|
|
2085
2307
|
var import_node_http = require("http");
|
|
2086
2308
|
var import_node_path5 = require("path");
|
|
2087
2309
|
var LocalFileServer = class {
|
|
@@ -2095,7 +2317,7 @@ var LocalFileServer = class {
|
|
|
2095
2317
|
*/
|
|
2096
2318
|
async start(filePath) {
|
|
2097
2319
|
const filename = (0, import_node_path5.basename)(filePath);
|
|
2098
|
-
const stat = (0,
|
|
2320
|
+
const stat = (0, import_node_fs6.statSync)(filePath);
|
|
2099
2321
|
return new Promise((resolve, reject) => {
|
|
2100
2322
|
this.server = (0, import_node_http.createServer)((req, res) => {
|
|
2101
2323
|
if (req.url === `/${filename}`) {
|
|
@@ -2103,7 +2325,7 @@ var LocalFileServer = class {
|
|
|
2103
2325
|
"Content-Type": "application/pdf",
|
|
2104
2326
|
"Content-Length": stat.size
|
|
2105
2327
|
});
|
|
2106
|
-
(0,
|
|
2328
|
+
(0, import_node_fs6.createReadStream)(filePath).pipe(res);
|
|
2107
2329
|
} else {
|
|
2108
2330
|
res.writeHead(404);
|
|
2109
2331
|
res.end("Not Found");
|
|
@@ -2140,7 +2362,7 @@ var LocalFileServer = class {
|
|
|
2140
2362
|
};
|
|
2141
2363
|
|
|
2142
2364
|
// src/core/image-pdf-converter.ts
|
|
2143
|
-
var
|
|
2365
|
+
var import_node_fs7 = require("fs");
|
|
2144
2366
|
var import_node_os = require("os");
|
|
2145
2367
|
var import_node_path6 = require("path");
|
|
2146
2368
|
var ImagePdfConverter = class {
|
|
@@ -2168,8 +2390,8 @@ var ImagePdfConverter = class {
|
|
|
2168
2390
|
this.logger.info("[ImagePdfConverter] Image PDF created:", outputPath);
|
|
2169
2391
|
return outputPath;
|
|
2170
2392
|
} finally {
|
|
2171
|
-
if ((0,
|
|
2172
|
-
(0,
|
|
2393
|
+
if ((0, import_node_fs7.existsSync)(inputPath)) {
|
|
2394
|
+
(0, import_node_fs7.rmSync)(inputPath, { force: true });
|
|
2173
2395
|
}
|
|
2174
2396
|
}
|
|
2175
2397
|
}
|
|
@@ -2216,12 +2438,12 @@ var ImagePdfConverter = class {
|
|
|
2216
2438
|
* Cleanup the temporary image PDF file
|
|
2217
2439
|
*/
|
|
2218
2440
|
cleanup(imagePdfPath) {
|
|
2219
|
-
if ((0,
|
|
2441
|
+
if ((0, import_node_fs7.existsSync)(imagePdfPath)) {
|
|
2220
2442
|
this.logger.info(
|
|
2221
2443
|
"[ImagePdfConverter] Cleaning up temp file:",
|
|
2222
2444
|
imagePdfPath
|
|
2223
2445
|
);
|
|
2224
|
-
(0,
|
|
2446
|
+
(0, import_node_fs7.rmSync)(imagePdfPath, { force: true });
|
|
2225
2447
|
}
|
|
2226
2448
|
}
|
|
2227
2449
|
};
|
|
@@ -2365,8 +2587,8 @@ var PDFConverter = class {
|
|
|
2365
2587
|
}
|
|
2366
2588
|
return strategy;
|
|
2367
2589
|
} finally {
|
|
2368
|
-
if ((0,
|
|
2369
|
-
(0,
|
|
2590
|
+
if ((0, import_node_fs8.existsSync)(samplingDir)) {
|
|
2591
|
+
(0, import_node_fs8.rmSync)(samplingDir, { recursive: true, force: true });
|
|
2370
2592
|
}
|
|
2371
2593
|
}
|
|
2372
2594
|
}
|
|
@@ -2388,8 +2610,10 @@ var PDFConverter = class {
|
|
|
2388
2610
|
let pageTexts;
|
|
2389
2611
|
try {
|
|
2390
2612
|
const resultPath2 = (0, import_node_path7.join)(outputDir, "result.json");
|
|
2391
|
-
const
|
|
2392
|
-
|
|
2613
|
+
const totalPages = await runJqFileJson(
|
|
2614
|
+
".pages | length",
|
|
2615
|
+
resultPath2
|
|
2616
|
+
);
|
|
2393
2617
|
const textExtractor = new PdfTextExtractor(this.logger);
|
|
2394
2618
|
pageTexts = await textExtractor.extractText(pdfPath, totalPages);
|
|
2395
2619
|
} catch {
|
|
@@ -2399,7 +2623,7 @@ var PDFConverter = class {
|
|
|
2399
2623
|
}
|
|
2400
2624
|
const resultPath = (0, import_node_path7.join)(outputDir, "result.json");
|
|
2401
2625
|
const ocrOriginPath = (0, import_node_path7.join)(outputDir, "result_ocr_origin.json");
|
|
2402
|
-
(0,
|
|
2626
|
+
(0, import_node_fs8.copyFileSync)(resultPath, ocrOriginPath);
|
|
2403
2627
|
const corrector = new VlmTextCorrector(this.logger);
|
|
2404
2628
|
await corrector.correctAndSave(outputDir, options.vlmProcessorModel, {
|
|
2405
2629
|
concurrency: options.vlmConcurrency,
|
|
@@ -2560,19 +2784,19 @@ var PDFConverter = class {
|
|
|
2560
2784
|
this.logger.info("[PDFConverter] Total time:", duration, "ms");
|
|
2561
2785
|
} finally {
|
|
2562
2786
|
this.logger.info("[PDFConverter] Cleaning up temporary files...");
|
|
2563
|
-
if ((0,
|
|
2564
|
-
(0,
|
|
2787
|
+
if ((0, import_node_fs8.existsSync)(zipPath)) {
|
|
2788
|
+
(0, import_node_fs8.rmSync)(zipPath, { force: true });
|
|
2565
2789
|
}
|
|
2566
|
-
if ((0,
|
|
2567
|
-
(0,
|
|
2790
|
+
if ((0, import_node_fs8.existsSync)(extractDir)) {
|
|
2791
|
+
(0, import_node_fs8.rmSync)(extractDir, { recursive: true, force: true });
|
|
2568
2792
|
}
|
|
2569
2793
|
if (cleanupAfterCallback) {
|
|
2570
2794
|
this.logger.info(
|
|
2571
2795
|
"[PDFConverter] Cleaning up output directory:",
|
|
2572
2796
|
outputDir
|
|
2573
2797
|
);
|
|
2574
|
-
if ((0,
|
|
2575
|
-
(0,
|
|
2798
|
+
if ((0, import_node_fs8.existsSync)(outputDir)) {
|
|
2799
|
+
(0, import_node_fs8.rmSync)(outputDir, { recursive: true, force: true });
|
|
2576
2800
|
}
|
|
2577
2801
|
} else {
|
|
2578
2802
|
this.logger.info("[PDFConverter] Output preserved at:", outputDir);
|
|
@@ -2721,12 +2945,12 @@ var PDFConverter = class {
|
|
|
2721
2945
|
const zipPath = (0, import_node_path7.join)(process.cwd(), "result.zip");
|
|
2722
2946
|
this.logger.info("[PDFConverter] Saving ZIP file to:", zipPath);
|
|
2723
2947
|
if (zipResult.fileStream) {
|
|
2724
|
-
const writeStream = (0,
|
|
2725
|
-
await (0,
|
|
2948
|
+
const writeStream = (0, import_node_fs8.createWriteStream)(zipPath);
|
|
2949
|
+
await (0, import_promises5.pipeline)(zipResult.fileStream, writeStream);
|
|
2726
2950
|
return;
|
|
2727
2951
|
}
|
|
2728
2952
|
if (zipResult.data) {
|
|
2729
|
-
await (0,
|
|
2953
|
+
await (0, import_promises4.writeFile)(zipPath, zipResult.data);
|
|
2730
2954
|
return;
|
|
2731
2955
|
}
|
|
2732
2956
|
this.logger.warn(
|
|
@@ -2742,7 +2966,7 @@ var PDFConverter = class {
|
|
|
2742
2966
|
);
|
|
2743
2967
|
}
|
|
2744
2968
|
const buffer = new Uint8Array(await response.arrayBuffer());
|
|
2745
|
-
await (0,
|
|
2969
|
+
await (0, import_promises4.writeFile)(zipPath, buffer);
|
|
2746
2970
|
}
|
|
2747
2971
|
async processConvertedFiles(zipPath, extractDir, outputDir) {
|
|
2748
2972
|
await ImageExtractor.extractAndSaveDocumentsFromZip(
|
|
@@ -2754,6 +2978,7 @@ var PDFConverter = class {
|
|
|
2754
2978
|
}
|
|
2755
2979
|
/**
|
|
2756
2980
|
* Render page images from the source PDF using ImageMagick and update result.json.
|
|
2981
|
+
* Uses jq to update the JSON file without loading it into Node.js memory.
|
|
2757
2982
|
* Replaces Docling's generate_page_images which fails on large PDFs
|
|
2758
2983
|
* due to memory limits when embedding all page images as base64.
|
|
2759
2984
|
*/
|
|
@@ -2771,17 +2996,18 @@ var PDFConverter = class {
|
|
|
2771
2996
|
const renderer = new PageRenderer(this.logger);
|
|
2772
2997
|
const renderResult = await renderer.renderPages(pdfPath, outputDir);
|
|
2773
2998
|
const resultPath = (0, import_node_path7.join)(outputDir, "result.json");
|
|
2774
|
-
const
|
|
2775
|
-
|
|
2776
|
-
|
|
2777
|
-
|
|
2778
|
-
|
|
2779
|
-
|
|
2780
|
-
|
|
2781
|
-
|
|
2782
|
-
|
|
2783
|
-
|
|
2784
|
-
await (
|
|
2999
|
+
const tmpPath = resultPath + ".tmp";
|
|
3000
|
+
const jqProgram = `
|
|
3001
|
+
.pages |= with_entries(
|
|
3002
|
+
if (.value.page_no - 1) >= 0 and (.value.page_no - 1) < ${renderResult.pageCount} then
|
|
3003
|
+
.value.image.uri = "pages/page_\\(.value.page_no - 1).png" |
|
|
3004
|
+
.value.image.mimetype = "image/png" |
|
|
3005
|
+
.value.image.dpi = ${PAGE_RENDERING.DEFAULT_DPI}
|
|
3006
|
+
else . end
|
|
3007
|
+
)
|
|
3008
|
+
`;
|
|
3009
|
+
await runJqFileToFile(jqProgram, resultPath, tmpPath);
|
|
3010
|
+
await (0, import_promises4.rename)(tmpPath, resultPath);
|
|
2785
3011
|
this.logger.info(
|
|
2786
3012
|
`[PDFConverter] Rendered ${renderResult.pageCount} page images`
|
|
2787
3013
|
);
|