@heripo/pdf-parser 0.1.8 → 0.1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.ko.md +19 -9
- package/README.md +19 -9
- package/dist/index.cjs +419 -151
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +4 -1
- package/dist/index.d.ts +4 -1
- package/dist/index.js +388 -126
- package/dist/index.js.map +1 -1
- package/package.json +4 -4
package/dist/index.cjs
CHANGED
|
@@ -81,11 +81,17 @@ var DOCLING_ENVIRONMENT = {
|
|
|
81
81
|
*/
|
|
82
82
|
STARTUP_DELAY_MS: 2e3
|
|
83
83
|
};
|
|
84
|
+
var PAGE_RENDERING = {
|
|
85
|
+
/** Default rendering DPI for VLM text recognition quality */
|
|
86
|
+
DEFAULT_DPI: 200,
|
|
87
|
+
/** Low-resolution DPI for OCR strategy sampling */
|
|
88
|
+
SAMPLE_DPI: 150
|
|
89
|
+
};
|
|
84
90
|
var IMAGE_PDF_CONVERTER = {
|
|
85
91
|
/**
|
|
86
92
|
* ImageMagick density option (DPI) for PDF to image conversion
|
|
87
93
|
*/
|
|
88
|
-
DENSITY:
|
|
94
|
+
DENSITY: PAGE_RENDERING.DEFAULT_DPI,
|
|
89
95
|
/**
|
|
90
96
|
* ImageMagick quality option (1-100)
|
|
91
97
|
*/
|
|
@@ -869,10 +875,10 @@ var DoclingEnvironment = class _DoclingEnvironment {
|
|
|
869
875
|
|
|
870
876
|
// src/core/pdf-converter.ts
|
|
871
877
|
var import_es_toolkit = require("es-toolkit");
|
|
872
|
-
var
|
|
873
|
-
var
|
|
878
|
+
var import_node_fs8 = require("fs");
|
|
879
|
+
var import_promises4 = require("fs/promises");
|
|
874
880
|
var import_node_path7 = require("path");
|
|
875
|
-
var
|
|
881
|
+
var import_promises5 = require("stream/promises");
|
|
876
882
|
|
|
877
883
|
// src/errors/image-pdf-fallback-error.ts
|
|
878
884
|
var ImagePdfFallbackError = class extends Error {
|
|
@@ -887,12 +893,17 @@ var ImagePdfFallbackError = class extends Error {
|
|
|
887
893
|
};
|
|
888
894
|
|
|
889
895
|
// src/processors/image-extractor.ts
|
|
890
|
-
var
|
|
896
|
+
var import_node_fs2 = require("fs");
|
|
891
897
|
var import_node_path2 = require("path");
|
|
898
|
+
var import_node_stream = require("stream");
|
|
899
|
+
var import_promises3 = require("stream/promises");
|
|
892
900
|
var yauzl = __toESM(require("yauzl"), 1);
|
|
893
901
|
|
|
894
902
|
// src/utils/jq.ts
|
|
895
903
|
var import_node_child_process2 = require("child_process");
|
|
904
|
+
var import_node_fs = require("fs");
|
|
905
|
+
var import_promises = require("fs/promises");
|
|
906
|
+
var import_promises2 = require("stream/promises");
|
|
896
907
|
function getJqPath() {
|
|
897
908
|
const p = process.env.JQ_PATH?.trim();
|
|
898
909
|
return p && p.length > 0 ? p : "jq";
|
|
@@ -944,25 +955,139 @@ function runJqFileJson(program, filePath) {
|
|
|
944
955
|
});
|
|
945
956
|
});
|
|
946
957
|
}
|
|
947
|
-
function
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
]
|
|
953
|
-
|
|
954
|
-
|
|
958
|
+
function runJqFileToFile(program, inputPath, outputPath) {
|
|
959
|
+
return new Promise((resolve, reject) => {
|
|
960
|
+
const jqPath = getJqPath();
|
|
961
|
+
const args = [program, inputPath];
|
|
962
|
+
const child = (0, import_node_child_process2.spawn)(jqPath, args, {
|
|
963
|
+
stdio: ["ignore", "pipe", "pipe"],
|
|
964
|
+
env: process.env
|
|
965
|
+
});
|
|
966
|
+
let stderr = "";
|
|
967
|
+
let exitCode = null;
|
|
968
|
+
let pipelineDone = false;
|
|
969
|
+
let settled = false;
|
|
970
|
+
child.stderr.setEncoding("utf-8");
|
|
971
|
+
child.stderr.on("data", (chunk) => {
|
|
972
|
+
stderr += chunk;
|
|
973
|
+
});
|
|
974
|
+
const ws = (0, import_node_fs.createWriteStream)(outputPath);
|
|
975
|
+
function trySettle() {
|
|
976
|
+
if (settled) return;
|
|
977
|
+
if (!pipelineDone || exitCode === null) return;
|
|
978
|
+
settled = true;
|
|
979
|
+
if (exitCode !== 0) {
|
|
980
|
+
reject(
|
|
981
|
+
new Error(
|
|
982
|
+
`jq exited with code ${exitCode}. ${stderr ? "Stderr: " + stderr : ""}`
|
|
983
|
+
)
|
|
984
|
+
);
|
|
985
|
+
} else {
|
|
986
|
+
resolve();
|
|
987
|
+
}
|
|
988
|
+
}
|
|
989
|
+
child.on("error", (err) => {
|
|
990
|
+
if (settled) return;
|
|
991
|
+
settled = true;
|
|
992
|
+
ws.destroy();
|
|
993
|
+
reject(err);
|
|
994
|
+
});
|
|
995
|
+
(0, import_promises2.pipeline)(child.stdout, ws).then(() => {
|
|
996
|
+
pipelineDone = true;
|
|
997
|
+
trySettle();
|
|
998
|
+
}).catch((err) => {
|
|
999
|
+
if (settled) return;
|
|
1000
|
+
settled = true;
|
|
1001
|
+
reject(err);
|
|
1002
|
+
});
|
|
1003
|
+
child.on("close", (code) => {
|
|
1004
|
+
exitCode = code ?? 1;
|
|
1005
|
+
trySettle();
|
|
1006
|
+
});
|
|
1007
|
+
});
|
|
1008
|
+
}
|
|
1009
|
+
function runJqFileLines(program, filePath, onLine) {
|
|
1010
|
+
return new Promise((resolve, reject) => {
|
|
1011
|
+
const jqPath = getJqPath();
|
|
1012
|
+
const args = ["-r", program, filePath];
|
|
1013
|
+
const child = (0, import_node_child_process2.spawn)(jqPath, args, {
|
|
1014
|
+
stdio: ["ignore", "pipe", "pipe"],
|
|
1015
|
+
env: process.env
|
|
1016
|
+
});
|
|
1017
|
+
let stderr = "";
|
|
1018
|
+
let buffer = "";
|
|
1019
|
+
let callbackError = false;
|
|
1020
|
+
child.stdout.setEncoding("utf-8");
|
|
1021
|
+
child.stderr.setEncoding("utf-8");
|
|
1022
|
+
function safeOnLine(line) {
|
|
1023
|
+
if (callbackError) return;
|
|
1024
|
+
try {
|
|
1025
|
+
onLine(line);
|
|
1026
|
+
} catch (err) {
|
|
1027
|
+
callbackError = true;
|
|
1028
|
+
child.kill();
|
|
1029
|
+
reject(err);
|
|
1030
|
+
}
|
|
1031
|
+
}
|
|
1032
|
+
child.stdout.on("data", (chunk) => {
|
|
1033
|
+
buffer += chunk;
|
|
1034
|
+
let newlineIdx;
|
|
1035
|
+
while ((newlineIdx = buffer.indexOf("\n")) !== -1) {
|
|
1036
|
+
const line = buffer.slice(0, newlineIdx);
|
|
1037
|
+
buffer = buffer.slice(newlineIdx + 1);
|
|
1038
|
+
if (line.length > 0) {
|
|
1039
|
+
safeOnLine(line);
|
|
1040
|
+
}
|
|
1041
|
+
}
|
|
1042
|
+
});
|
|
1043
|
+
child.stderr.on("data", (chunk) => {
|
|
1044
|
+
stderr += chunk;
|
|
1045
|
+
});
|
|
1046
|
+
child.on("error", (err) => {
|
|
1047
|
+
if (!callbackError) reject(err);
|
|
1048
|
+
});
|
|
1049
|
+
child.on("close", (code) => {
|
|
1050
|
+
if (callbackError) return;
|
|
1051
|
+
if (buffer.length > 0) {
|
|
1052
|
+
safeOnLine(buffer);
|
|
1053
|
+
}
|
|
1054
|
+
if (callbackError) return;
|
|
1055
|
+
if (code !== 0) {
|
|
1056
|
+
reject(
|
|
1057
|
+
new Error(
|
|
1058
|
+
`jq exited with code ${code}. ${stderr ? "Stderr: " + stderr : ""}`
|
|
1059
|
+
)
|
|
1060
|
+
);
|
|
1061
|
+
} else {
|
|
1062
|
+
resolve();
|
|
1063
|
+
}
|
|
1064
|
+
});
|
|
1065
|
+
});
|
|
955
1066
|
}
|
|
956
|
-
function
|
|
1067
|
+
async function jqExtractBase64PngStringsStreaming(filePath, onImage) {
|
|
1068
|
+
let index = 0;
|
|
1069
|
+
await runJqFileLines(
|
|
1070
|
+
'.. | select(type == "string" and startswith("data:image/png;base64"))',
|
|
1071
|
+
filePath,
|
|
1072
|
+
(line) => {
|
|
1073
|
+
onImage(line, index);
|
|
1074
|
+
index++;
|
|
1075
|
+
}
|
|
1076
|
+
);
|
|
1077
|
+
return index;
|
|
1078
|
+
}
|
|
1079
|
+
async function jqReplaceBase64WithPathsToFile(inputPath, outputPath, dirName, prefix) {
|
|
957
1080
|
const program = `
|
|
958
1081
|
reduce paths(type == "string" and startswith("data:image/png;base64")) as $p (
|
|
959
1082
|
{data: ., counter: 0};
|
|
960
1083
|
.counter as $idx |
|
|
961
1084
|
.data |= setpath($p; "${dirName}/${prefix}_\\($idx).png") |
|
|
962
1085
|
.counter += 1
|
|
963
|
-
) |
|
|
1086
|
+
) | .data
|
|
964
1087
|
`;
|
|
965
|
-
|
|
1088
|
+
const tmpPath = outputPath + ".tmp";
|
|
1089
|
+
await runJqFileToFile(program, inputPath, tmpPath);
|
|
1090
|
+
await (0, import_promises.rename)(tmpPath, outputPath);
|
|
966
1091
|
}
|
|
967
1092
|
|
|
968
1093
|
// src/processors/image-extractor.ts
|
|
@@ -981,7 +1106,7 @@ var ImageExtractor = class _ImageExtractor {
|
|
|
981
1106
|
zipfile.on("entry", (entry) => {
|
|
982
1107
|
const entryPath = (0, import_node_path2.join)(targetDir, entry.fileName);
|
|
983
1108
|
if (/\/$/.test(entry.fileName)) {
|
|
984
|
-
(0,
|
|
1109
|
+
(0, import_node_fs2.mkdirSync)(entryPath, { recursive: true });
|
|
985
1110
|
zipfile.readEntry();
|
|
986
1111
|
} else {
|
|
987
1112
|
zipfile.openReadStream(entry, (err2, readStream) => {
|
|
@@ -989,8 +1114,8 @@ var ImageExtractor = class _ImageExtractor {
|
|
|
989
1114
|
reject(err2 || new Error("Failed to open read stream"));
|
|
990
1115
|
return;
|
|
991
1116
|
}
|
|
992
|
-
(0,
|
|
993
|
-
const writeStream = (0,
|
|
1117
|
+
(0, import_node_fs2.mkdirSync)((0, import_node_path2.join)(entryPath, ".."), { recursive: true });
|
|
1118
|
+
const writeStream = (0, import_node_fs2.createWriteStream)(entryPath);
|
|
994
1119
|
readStream.pipe(writeStream);
|
|
995
1120
|
writeStream.on("finish", () => {
|
|
996
1121
|
zipfile.readEntry();
|
|
@@ -1006,26 +1131,6 @@ var ImageExtractor = class _ImageExtractor {
|
|
|
1006
1131
|
});
|
|
1007
1132
|
});
|
|
1008
1133
|
}
|
|
1009
|
-
/**
|
|
1010
|
-
* Extract base64 images from JSON file using jq (for large files)
|
|
1011
|
-
* Returns array of base64 data strings
|
|
1012
|
-
*/
|
|
1013
|
-
static async extractBase64ImagesFromJsonWithJq(jsonPath) {
|
|
1014
|
-
return jqExtractBase64PngStrings(jsonPath);
|
|
1015
|
-
}
|
|
1016
|
-
/**
|
|
1017
|
-
* Replace base64 images with file paths in JSON using jq (for large files)
|
|
1018
|
-
* Uses reduce to maintain counter state while walking the JSON
|
|
1019
|
-
*/
|
|
1020
|
-
static async replaceBase64ImagesInJsonWithJq(jsonPath, outputPath, dirName, prefix) {
|
|
1021
|
-
const { data, count } = await jqReplaceBase64WithPaths(
|
|
1022
|
-
jsonPath,
|
|
1023
|
-
dirName,
|
|
1024
|
-
prefix
|
|
1025
|
-
);
|
|
1026
|
-
(0, import_node_fs.writeFileSync)(outputPath, JSON.stringify(data, null, 2), "utf-8");
|
|
1027
|
-
return count;
|
|
1028
|
-
}
|
|
1029
1134
|
/**
|
|
1030
1135
|
* Extract a base64-encoded image to a file and return the relative path
|
|
1031
1136
|
*/
|
|
@@ -1035,12 +1140,70 @@ var ImageExtractor = class _ImageExtractor {
|
|
|
1035
1140
|
const filename = `${prefix}_${index}.png`;
|
|
1036
1141
|
const filepath = (0, import_node_path2.join)(imagesDir, filename);
|
|
1037
1142
|
const buffer = Buffer.from(base64Content, "base64");
|
|
1038
|
-
(0,
|
|
1143
|
+
(0, import_node_fs2.writeFileSync)(filepath, buffer);
|
|
1039
1144
|
return `${dirName}/${filename}`;
|
|
1040
1145
|
}
|
|
1041
1146
|
/**
|
|
1042
|
-
*
|
|
1043
|
-
*
|
|
1147
|
+
* Extract base64 images from HTML using streaming.
|
|
1148
|
+
* Reads HTML file as a stream, extracts base64 images from src attributes,
|
|
1149
|
+
* saves them as PNG files, and replaces with file paths in the output HTML.
|
|
1150
|
+
* Returns the number of images extracted.
|
|
1151
|
+
*/
|
|
1152
|
+
static async extractImagesFromHtmlStream(htmlInputPath, htmlOutputPath, imagesDir) {
|
|
1153
|
+
let imageIndex = 0;
|
|
1154
|
+
let pending = "";
|
|
1155
|
+
const MARKER = 'src="data:image/png;base64,';
|
|
1156
|
+
const transform = new import_node_stream.Transform({
|
|
1157
|
+
decodeStrings: false,
|
|
1158
|
+
encoding: "utf-8",
|
|
1159
|
+
transform(chunk, _encoding, callback) {
|
|
1160
|
+
pending += chunk;
|
|
1161
|
+
let result = "";
|
|
1162
|
+
while (true) {
|
|
1163
|
+
const markerIdx = pending.indexOf(MARKER);
|
|
1164
|
+
if (markerIdx === -1) {
|
|
1165
|
+
const safeEnd = Math.max(0, pending.length - MARKER.length);
|
|
1166
|
+
result += pending.slice(0, safeEnd);
|
|
1167
|
+
pending = pending.slice(safeEnd);
|
|
1168
|
+
break;
|
|
1169
|
+
}
|
|
1170
|
+
result += pending.slice(0, markerIdx);
|
|
1171
|
+
const dataStart = markerIdx + MARKER.length;
|
|
1172
|
+
const quoteIdx = pending.indexOf('"', dataStart);
|
|
1173
|
+
if (quoteIdx === -1) {
|
|
1174
|
+
pending = pending.slice(markerIdx);
|
|
1175
|
+
break;
|
|
1176
|
+
}
|
|
1177
|
+
const base64Content = pending.slice(dataStart, quoteIdx);
|
|
1178
|
+
const filename = `image_${imageIndex}.png`;
|
|
1179
|
+
const filepath = (0, import_node_path2.join)(imagesDir, filename);
|
|
1180
|
+
const buf = Buffer.from(base64Content, "base64");
|
|
1181
|
+
(0, import_node_fs2.writeFileSync)(filepath, buf);
|
|
1182
|
+
const relativePath = `images/${filename}`;
|
|
1183
|
+
result += `src="${relativePath}"`;
|
|
1184
|
+
imageIndex++;
|
|
1185
|
+
pending = pending.slice(quoteIdx + 1);
|
|
1186
|
+
}
|
|
1187
|
+
if (result.length > 0) {
|
|
1188
|
+
this.push(result);
|
|
1189
|
+
}
|
|
1190
|
+
callback();
|
|
1191
|
+
},
|
|
1192
|
+
flush(callback) {
|
|
1193
|
+
if (pending.length > 0) {
|
|
1194
|
+
this.push(pending);
|
|
1195
|
+
}
|
|
1196
|
+
callback();
|
|
1197
|
+
}
|
|
1198
|
+
});
|
|
1199
|
+
const rs = (0, import_node_fs2.createReadStream)(htmlInputPath, { encoding: "utf-8" });
|
|
1200
|
+
const ws = (0, import_node_fs2.createWriteStream)(htmlOutputPath, { encoding: "utf-8" });
|
|
1201
|
+
await (0, import_promises3.pipeline)(rs, transform, ws);
|
|
1202
|
+
return imageIndex;
|
|
1203
|
+
}
|
|
1204
|
+
/**
|
|
1205
|
+
* Save JSON and HTML documents with base64 images extracted to separate files.
|
|
1206
|
+
* Uses jq for JSON processing and streaming for HTML to handle large files.
|
|
1044
1207
|
*
|
|
1045
1208
|
* This method:
|
|
1046
1209
|
* 1. Extracts base64-encoded images from JSON and HTML content
|
|
@@ -1048,43 +1211,45 @@ var ImageExtractor = class _ImageExtractor {
|
|
|
1048
1211
|
* 3. Replaces base64 data with relative file paths
|
|
1049
1212
|
* 4. Saves the transformed documents to the output directory
|
|
1050
1213
|
*/
|
|
1051
|
-
static async saveDocumentsWithExtractedImages(logger, outputDir, filename, jsonSourcePath,
|
|
1214
|
+
static async saveDocumentsWithExtractedImages(logger, outputDir, filename, jsonSourcePath, htmlSourcePath) {
|
|
1052
1215
|
try {
|
|
1053
|
-
if ((0,
|
|
1054
|
-
(0,
|
|
1216
|
+
if ((0, import_node_fs2.existsSync)(outputDir)) {
|
|
1217
|
+
(0, import_node_fs2.rmSync)(outputDir, { recursive: true, force: true });
|
|
1055
1218
|
}
|
|
1056
1219
|
} catch (e) {
|
|
1057
1220
|
logger.warn("[PDFConverter] Failed to clear output directory:", e);
|
|
1058
1221
|
}
|
|
1059
|
-
(0,
|
|
1222
|
+
(0, import_node_fs2.mkdirSync)(outputDir, { recursive: true });
|
|
1060
1223
|
const baseName = filename.replace((0, import_node_path2.extname)(filename), "");
|
|
1061
1224
|
const jsonPath = (0, import_node_path2.join)(outputDir, `${baseName}.json`);
|
|
1062
1225
|
try {
|
|
1063
|
-
const
|
|
1064
|
-
if (!(0,
|
|
1065
|
-
(0,
|
|
1066
|
-
}
|
|
1067
|
-
const
|
|
1068
|
-
|
|
1069
|
-
|
|
1070
|
-
|
|
1071
|
-
|
|
1072
|
-
|
|
1073
|
-
|
|
1074
|
-
|
|
1075
|
-
|
|
1076
|
-
|
|
1226
|
+
const imagesDir = (0, import_node_path2.join)(outputDir, "images");
|
|
1227
|
+
if (!(0, import_node_fs2.existsSync)(imagesDir)) {
|
|
1228
|
+
(0, import_node_fs2.mkdirSync)(imagesDir, { recursive: true });
|
|
1229
|
+
}
|
|
1230
|
+
const imageCount = await jqExtractBase64PngStringsStreaming(
|
|
1231
|
+
jsonSourcePath,
|
|
1232
|
+
(base64Data, index) => {
|
|
1233
|
+
_ImageExtractor.extractBase64ImageToFile(
|
|
1234
|
+
base64Data,
|
|
1235
|
+
imagesDir,
|
|
1236
|
+
index,
|
|
1237
|
+
"pic",
|
|
1238
|
+
"images"
|
|
1239
|
+
);
|
|
1240
|
+
}
|
|
1241
|
+
);
|
|
1077
1242
|
logger.info(
|
|
1078
|
-
`[PDFConverter] Extracted ${
|
|
1243
|
+
`[PDFConverter] Extracted ${imageCount} picture images from JSON to ${imagesDir}`
|
|
1079
1244
|
);
|
|
1080
|
-
|
|
1245
|
+
await jqReplaceBase64WithPathsToFile(
|
|
1081
1246
|
jsonSourcePath,
|
|
1082
1247
|
jsonPath,
|
|
1083
|
-
"
|
|
1084
|
-
"
|
|
1248
|
+
"images",
|
|
1249
|
+
"pic"
|
|
1085
1250
|
);
|
|
1086
1251
|
logger.info(
|
|
1087
|
-
`[PDFConverter] Replaced ${
|
|
1252
|
+
`[PDFConverter] Replaced ${imageCount} base64 images with file paths`
|
|
1088
1253
|
);
|
|
1089
1254
|
} catch (e) {
|
|
1090
1255
|
logger.warn(
|
|
@@ -1097,51 +1262,45 @@ var ImageExtractor = class _ImageExtractor {
|
|
|
1097
1262
|
const htmlPath = (0, import_node_path2.join)(outputDir, `${baseName}.html`);
|
|
1098
1263
|
try {
|
|
1099
1264
|
const imagesDir = (0, import_node_path2.join)(outputDir, "images");
|
|
1100
|
-
if (!(0,
|
|
1101
|
-
(0,
|
|
1102
|
-
}
|
|
1103
|
-
|
|
1104
|
-
|
|
1105
|
-
|
|
1106
|
-
|
|
1107
|
-
const filename2 = `image_${imageIndex}.png`;
|
|
1108
|
-
const filepath = (0, import_node_path2.join)(imagesDir, filename2);
|
|
1109
|
-
const buffer = Buffer.from(base64Content, "base64");
|
|
1110
|
-
(0, import_node_fs.writeFileSync)(filepath, buffer);
|
|
1111
|
-
const relativePath = `images/${filename2}`;
|
|
1112
|
-
imageIndex += 1;
|
|
1113
|
-
return `src="${relativePath}"`;
|
|
1114
|
-
}
|
|
1265
|
+
if (!(0, import_node_fs2.existsSync)(imagesDir)) {
|
|
1266
|
+
(0, import_node_fs2.mkdirSync)(imagesDir, { recursive: true });
|
|
1267
|
+
}
|
|
1268
|
+
const htmlImageCount = await _ImageExtractor.extractImagesFromHtmlStream(
|
|
1269
|
+
htmlSourcePath,
|
|
1270
|
+
htmlPath,
|
|
1271
|
+
imagesDir
|
|
1115
1272
|
);
|
|
1116
1273
|
logger.info(
|
|
1117
|
-
`[PDFConverter] Extracted ${
|
|
1274
|
+
`[PDFConverter] Extracted ${htmlImageCount} images from HTML to ${imagesDir}`
|
|
1118
1275
|
);
|
|
1119
|
-
(0, import_node_fs.writeFileSync)(htmlPath, transformedHtml, "utf-8");
|
|
1120
1276
|
} catch (e) {
|
|
1121
1277
|
logger.warn(
|
|
1122
|
-
"[PDFConverter] Failed to extract images from HTML,
|
|
1278
|
+
"[PDFConverter] Failed to extract images from HTML, copying original. Error:",
|
|
1123
1279
|
e
|
|
1124
1280
|
);
|
|
1125
|
-
(0,
|
|
1281
|
+
const rs = (0, import_node_fs2.createReadStream)(htmlSourcePath);
|
|
1282
|
+
const ws = (0, import_node_fs2.createWriteStream)(htmlPath);
|
|
1283
|
+
await (0, import_promises3.pipeline)(rs, ws);
|
|
1126
1284
|
}
|
|
1127
1285
|
logger.info("[PDFConverter] Saved HTML:", htmlPath);
|
|
1128
1286
|
}
|
|
1129
1287
|
/**
|
|
1130
1288
|
* Extract documents from ZIP and save with extracted images
|
|
1131
|
-
* Uses jq for JSON processing to handle large files
|
|
1289
|
+
* Uses jq for JSON processing and streaming for HTML to handle large files
|
|
1290
|
+
* without loading into Node.js memory
|
|
1132
1291
|
*
|
|
1133
1292
|
* Complete workflow:
|
|
1134
1293
|
* 1. Extract ZIP file to temporary directory
|
|
1135
1294
|
* 2. Find JSON and HTML files from extracted files
|
|
1136
|
-
* 3. Use jq to extract base64 images from JSON and save as separate files
|
|
1137
|
-
* 4. Use jq to replace base64 with file paths in JSON
|
|
1138
|
-
* 5. Process HTML with
|
|
1295
|
+
* 3. Use jq to stream-extract base64 images from JSON and save as separate files
|
|
1296
|
+
* 4. Use jq to replace base64 with file paths in JSON (piped to file)
|
|
1297
|
+
* 5. Process HTML with streaming Transform to extract and replace images
|
|
1139
1298
|
* 6. Save transformed documents to output directory (as result.json and result.html)
|
|
1140
1299
|
*/
|
|
1141
1300
|
static async extractAndSaveDocumentsFromZip(logger, zipPath, extractDir, outputDir) {
|
|
1142
1301
|
logger.info("[PDFConverter] Extracting ZIP file...");
|
|
1143
1302
|
await _ImageExtractor.extractZip(zipPath, extractDir);
|
|
1144
|
-
const files = (0,
|
|
1303
|
+
const files = (0, import_node_fs2.readdirSync)(extractDir);
|
|
1145
1304
|
const jsonFile = files.find((f) => (0, import_node_path2.extname)(f).toLowerCase() === ".json");
|
|
1146
1305
|
const htmlFile = files.find((f) => (0, import_node_path2.extname)(f).toLowerCase() === ".html");
|
|
1147
1306
|
if (!jsonFile || !htmlFile) {
|
|
@@ -1151,23 +1310,22 @@ var ImageExtractor = class _ImageExtractor {
|
|
|
1151
1310
|
}
|
|
1152
1311
|
const jsonPath = (0, import_node_path2.join)(extractDir, jsonFile);
|
|
1153
1312
|
const htmlPath = (0, import_node_path2.join)(extractDir, htmlFile);
|
|
1154
|
-
const htmlContent = (0, import_node_fs.readFileSync)(htmlPath, "utf-8");
|
|
1155
1313
|
logger.info("[PDFConverter] Saving converted files to output...");
|
|
1156
1314
|
await _ImageExtractor.saveDocumentsWithExtractedImages(
|
|
1157
1315
|
logger,
|
|
1158
1316
|
outputDir,
|
|
1159
1317
|
"result",
|
|
1160
1318
|
jsonPath,
|
|
1161
|
-
|
|
1319
|
+
htmlPath
|
|
1162
1320
|
);
|
|
1163
1321
|
logger.info("[PDFConverter] Files saved to:", outputDir);
|
|
1164
1322
|
}
|
|
1165
1323
|
};
|
|
1166
1324
|
|
|
1167
1325
|
// src/processors/page-renderer.ts
|
|
1168
|
-
var
|
|
1326
|
+
var import_node_fs3 = require("fs");
|
|
1169
1327
|
var import_node_path3 = require("path");
|
|
1170
|
-
var
|
|
1328
|
+
var PROGRESS_POLL_INTERVAL_MS = 2e3;
|
|
1171
1329
|
var PageRenderer = class {
|
|
1172
1330
|
constructor(logger) {
|
|
1173
1331
|
this.logger = logger;
|
|
@@ -1181,31 +1339,62 @@ var PageRenderer = class {
|
|
|
1181
1339
|
* @returns Render result with page count and file paths
|
|
1182
1340
|
*/
|
|
1183
1341
|
async renderPages(pdfPath, outputDir, options) {
|
|
1184
|
-
const dpi = options?.dpi ?? DEFAULT_DPI;
|
|
1342
|
+
const dpi = options?.dpi ?? PAGE_RENDERING.DEFAULT_DPI;
|
|
1185
1343
|
const pagesDir = (0, import_node_path3.join)(outputDir, "pages");
|
|
1186
|
-
if (!(0,
|
|
1187
|
-
(0,
|
|
1344
|
+
if (!(0, import_node_fs3.existsSync)(pagesDir)) {
|
|
1345
|
+
(0, import_node_fs3.mkdirSync)(pagesDir, { recursive: true });
|
|
1188
1346
|
}
|
|
1189
|
-
|
|
1190
|
-
|
|
1191
|
-
|
|
1192
|
-
|
|
1193
|
-
dpi.toString(),
|
|
1194
|
-
pdfPath,
|
|
1195
|
-
"-background",
|
|
1196
|
-
"white",
|
|
1197
|
-
"-alpha",
|
|
1198
|
-
"remove",
|
|
1199
|
-
"-alpha",
|
|
1200
|
-
"off",
|
|
1201
|
-
outputPattern
|
|
1202
|
-
]);
|
|
1203
|
-
if (result.code !== 0) {
|
|
1204
|
-
throw new Error(
|
|
1205
|
-
`[PageRenderer] Failed to render PDF pages: ${result.stderr || "Unknown error"}`
|
|
1347
|
+
const totalPages = await this.getPageCount(pdfPath);
|
|
1348
|
+
if (totalPages > 0) {
|
|
1349
|
+
this.logger.info(
|
|
1350
|
+
`[PageRenderer] Rendering ${totalPages} pages at ${dpi} DPI...`
|
|
1206
1351
|
);
|
|
1352
|
+
} else {
|
|
1353
|
+
this.logger.info(`[PageRenderer] Rendering PDF at ${dpi} DPI...`);
|
|
1354
|
+
}
|
|
1355
|
+
const outputPattern = (0, import_node_path3.join)(pagesDir, "page_%d.png");
|
|
1356
|
+
let progressInterval = null;
|
|
1357
|
+
if (totalPages > 0) {
|
|
1358
|
+
let lastLoggedCount = 0;
|
|
1359
|
+
progressInterval = setInterval(() => {
|
|
1360
|
+
try {
|
|
1361
|
+
const rendered = (0, import_node_fs3.readdirSync)(pagesDir).filter(
|
|
1362
|
+
(f) => f.startsWith("page_") && f.endsWith(".png")
|
|
1363
|
+
).length;
|
|
1364
|
+
if (rendered > 0 && rendered !== lastLoggedCount) {
|
|
1365
|
+
lastLoggedCount = rendered;
|
|
1366
|
+
this.logger.info(
|
|
1367
|
+
`[PageRenderer] Rendering pages: ${rendered}/${totalPages}`
|
|
1368
|
+
);
|
|
1369
|
+
}
|
|
1370
|
+
} catch {
|
|
1371
|
+
}
|
|
1372
|
+
}, PROGRESS_POLL_INTERVAL_MS);
|
|
1373
|
+
}
|
|
1374
|
+
try {
|
|
1375
|
+
const result = await spawnAsync("magick", [
|
|
1376
|
+
"-density",
|
|
1377
|
+
dpi.toString(),
|
|
1378
|
+
pdfPath,
|
|
1379
|
+
"-background",
|
|
1380
|
+
"white",
|
|
1381
|
+
"-alpha",
|
|
1382
|
+
"remove",
|
|
1383
|
+
"-alpha",
|
|
1384
|
+
"off",
|
|
1385
|
+
outputPattern
|
|
1386
|
+
]);
|
|
1387
|
+
if (result.code !== 0) {
|
|
1388
|
+
throw new Error(
|
|
1389
|
+
`[PageRenderer] Failed to render PDF pages: ${result.stderr || "Unknown error"}`
|
|
1390
|
+
);
|
|
1391
|
+
}
|
|
1392
|
+
} finally {
|
|
1393
|
+
if (progressInterval) {
|
|
1394
|
+
clearInterval(progressInterval);
|
|
1395
|
+
}
|
|
1207
1396
|
}
|
|
1208
|
-
const pageFiles = (0,
|
|
1397
|
+
const pageFiles = (0, import_node_fs3.readdirSync)(pagesDir).filter((f) => f.startsWith("page_") && f.endsWith(".png")).sort((a, b) => {
|
|
1209
1398
|
const numA = parseInt(a.replace("page_", "").replace(".png", ""), 10);
|
|
1210
1399
|
const numB = parseInt(b.replace("page_", "").replace(".png", ""), 10);
|
|
1211
1400
|
return numA - numB;
|
|
@@ -1219,6 +1408,20 @@ var PageRenderer = class {
|
|
|
1219
1408
|
pageFiles
|
|
1220
1409
|
};
|
|
1221
1410
|
}
|
|
1411
|
+
/**
|
|
1412
|
+
* Get total page count using pdfinfo.
|
|
1413
|
+
* Returns 0 on failure (progress logging will be skipped).
|
|
1414
|
+
*/
|
|
1415
|
+
async getPageCount(pdfPath) {
|
|
1416
|
+
try {
|
|
1417
|
+
const result = await spawnAsync("pdfinfo", [pdfPath]);
|
|
1418
|
+
if (result.code !== 0) return 0;
|
|
1419
|
+
const match = result.stdout.match(/^Pages:\s+(\d+)/m);
|
|
1420
|
+
return match ? parseInt(match[1], 10) : 0;
|
|
1421
|
+
} catch {
|
|
1422
|
+
return 0;
|
|
1423
|
+
}
|
|
1424
|
+
}
|
|
1222
1425
|
};
|
|
1223
1426
|
|
|
1224
1427
|
// src/processors/pdf-text-extractor.ts
|
|
@@ -1304,7 +1507,7 @@ var PdfTextExtractor = class {
|
|
|
1304
1507
|
};
|
|
1305
1508
|
|
|
1306
1509
|
// src/processors/vlm-text-corrector.ts
|
|
1307
|
-
var
|
|
1510
|
+
var import_node_fs4 = require("fs");
|
|
1308
1511
|
var import_node_path4 = require("path");
|
|
1309
1512
|
|
|
1310
1513
|
// src/types/vlm-text-correction-schema.ts
|
|
@@ -1436,7 +1639,7 @@ var VlmTextCorrector = class {
|
|
|
1436
1639
|
async correctAndSave(outputDir, model, options) {
|
|
1437
1640
|
this.logger.info("[VlmTextCorrector] Starting text correction...");
|
|
1438
1641
|
const resultPath = (0, import_node_path4.join)(outputDir, "result.json");
|
|
1439
|
-
const doc = JSON.parse((0,
|
|
1642
|
+
const doc = JSON.parse((0, import_node_fs4.readFileSync)(resultPath, "utf-8"));
|
|
1440
1643
|
let pageNumbers = this.getPageNumbers(doc);
|
|
1441
1644
|
if (pageNumbers.length === 0) {
|
|
1442
1645
|
this.logger.info("[VlmTextCorrector] No pages to process");
|
|
@@ -1487,7 +1690,7 @@ var VlmTextCorrector = class {
|
|
|
1487
1690
|
if (corrections === null) continue;
|
|
1488
1691
|
this.applyCorrections(doc, pageNumbers[i], corrections);
|
|
1489
1692
|
}
|
|
1490
|
-
(0,
|
|
1693
|
+
(0, import_node_fs4.writeFileSync)(resultPath, JSON.stringify(doc, null, 2));
|
|
1491
1694
|
this.logger.info(
|
|
1492
1695
|
`[VlmTextCorrector] Correction complete: ${totalTextCorrections} text, ${totalCellCorrections} cell corrections across ${pageNumbers.length} pages (${pagesFailed} failed)`
|
|
1493
1696
|
);
|
|
@@ -1763,7 +1966,7 @@ var VlmTextCorrector = class {
|
|
|
1763
1966
|
*/
|
|
1764
1967
|
readPageImage(outputDir, pageNo) {
|
|
1765
1968
|
const imagePath = (0, import_node_path4.join)(outputDir, "pages", `page_${pageNo - 1}.png`);
|
|
1766
|
-
return (0,
|
|
1969
|
+
return (0, import_node_fs4.readFileSync)(imagePath).toString("base64");
|
|
1767
1970
|
}
|
|
1768
1971
|
/**
|
|
1769
1972
|
* Apply VLM corrections to the DoclingDocument.
|
|
@@ -1817,9 +2020,9 @@ var VlmTextCorrector = class {
|
|
|
1817
2020
|
};
|
|
1818
2021
|
|
|
1819
2022
|
// src/samplers/ocr-strategy-sampler.ts
|
|
1820
|
-
var
|
|
2023
|
+
var import_model = require("@heripo/model");
|
|
2024
|
+
var import_node_fs5 = require("fs");
|
|
1821
2025
|
var import_v42 = require("zod/v4");
|
|
1822
|
-
var SAMPLE_DPI = 150;
|
|
1823
2026
|
var EDGE_TRIM_RATIO = 0.1;
|
|
1824
2027
|
var DEFAULT_MAX_SAMPLE_PAGES = 15;
|
|
1825
2028
|
var DEFAULT_MAX_RETRIES2 = 3;
|
|
@@ -1870,7 +2073,7 @@ var OcrStrategySampler = class {
|
|
|
1870
2073
|
const renderResult = await this.pageRenderer.renderPages(
|
|
1871
2074
|
pdfPath,
|
|
1872
2075
|
outputDir,
|
|
1873
|
-
{ dpi: SAMPLE_DPI }
|
|
2076
|
+
{ dpi: PAGE_RENDERING.SAMPLE_DPI }
|
|
1874
2077
|
);
|
|
1875
2078
|
if (renderResult.pageCount === 0) {
|
|
1876
2079
|
this.logger.info("[OcrStrategySampler] No pages found in PDF");
|
|
@@ -1889,7 +2092,7 @@ var OcrStrategySampler = class {
|
|
|
1889
2092
|
`[OcrStrategySampler] Sampling ${sampleIndices.length} of ${renderResult.pageCount} pages: [${sampleIndices.map((i) => i + 1).join(", ")}]`
|
|
1890
2093
|
);
|
|
1891
2094
|
let sampledCount = 0;
|
|
1892
|
-
|
|
2095
|
+
const languageFrequency = /* @__PURE__ */ new Map();
|
|
1893
2096
|
for (const idx of sampleIndices) {
|
|
1894
2097
|
sampledCount++;
|
|
1895
2098
|
const pageFile = renderResult.pageFiles[idx];
|
|
@@ -1899,14 +2102,17 @@ var OcrStrategySampler = class {
|
|
|
1899
2102
|
model,
|
|
1900
2103
|
options
|
|
1901
2104
|
);
|
|
1902
|
-
|
|
2105
|
+
for (const lang of pageAnalysis.detectedLanguages) {
|
|
2106
|
+
languageFrequency.set(lang, (languageFrequency.get(lang) ?? 0) + 1);
|
|
2107
|
+
}
|
|
1903
2108
|
if (pageAnalysis.hasKoreanHanjaMix) {
|
|
1904
2109
|
this.logger.info(
|
|
1905
2110
|
`[OcrStrategySampler] Korean-Hanja mix detected on page ${idx + 1} \u2192 VLM strategy`
|
|
1906
2111
|
);
|
|
2112
|
+
const detectedLanguages2 = this.aggregateLanguages(languageFrequency);
|
|
1907
2113
|
return {
|
|
1908
2114
|
method: "vlm",
|
|
1909
|
-
detectedLanguages,
|
|
2115
|
+
detectedLanguages: detectedLanguages2,
|
|
1910
2116
|
reason: `Korean-Hanja mix detected on page ${idx + 1}`,
|
|
1911
2117
|
sampledPages: sampledCount,
|
|
1912
2118
|
totalPages: renderResult.pageCount
|
|
@@ -1916,6 +2122,7 @@ var OcrStrategySampler = class {
|
|
|
1916
2122
|
this.logger.info(
|
|
1917
2123
|
"[OcrStrategySampler] No Korean-Hanja mix detected \u2192 ocrmac strategy"
|
|
1918
2124
|
);
|
|
2125
|
+
const detectedLanguages = this.aggregateLanguages(languageFrequency);
|
|
1919
2126
|
return {
|
|
1920
2127
|
method: "ocrmac",
|
|
1921
2128
|
detectedLanguages,
|
|
@@ -2020,14 +2227,15 @@ var OcrStrategySampler = class {
|
|
|
2020
2227
|
}
|
|
2021
2228
|
/**
|
|
2022
2229
|
* Analyze a single sample page for Korean-Hanja mixed script and primary language.
|
|
2230
|
+
* Normalizes raw VLM language responses to valid BCP 47 tags, filtering out invalid ones.
|
|
2023
2231
|
*
|
|
2024
|
-
* @returns Object with Korean-Hanja detection result and detected languages
|
|
2232
|
+
* @returns Object with Korean-Hanja detection result and normalized detected languages
|
|
2025
2233
|
*/
|
|
2026
2234
|
async analyzeSamplePage(pageFile, pageNo, model, options) {
|
|
2027
2235
|
this.logger.debug(
|
|
2028
2236
|
`[OcrStrategySampler] Analyzing page ${pageNo} for Korean-Hanja mix and language...`
|
|
2029
2237
|
);
|
|
2030
|
-
const base64Image = (0,
|
|
2238
|
+
const base64Image = (0, import_node_fs5.readFileSync)(pageFile).toString("base64");
|
|
2031
2239
|
const messages = [
|
|
2032
2240
|
{
|
|
2033
2241
|
role: "user",
|
|
@@ -2055,18 +2263,27 @@ var OcrStrategySampler = class {
|
|
|
2055
2263
|
options.aggregator.track(result.usage);
|
|
2056
2264
|
}
|
|
2057
2265
|
const output = result.output;
|
|
2266
|
+
const normalizedLanguages = output.detectedLanguages.map(import_model.normalizeToBcp47).filter((tag) => tag !== null);
|
|
2058
2267
|
this.logger.debug(
|
|
2059
|
-
`[OcrStrategySampler] Page ${pageNo}: hasKoreanHanjaMix=${output.hasKoreanHanjaMix}, detectedLanguages=${
|
|
2268
|
+
`[OcrStrategySampler] Page ${pageNo}: hasKoreanHanjaMix=${output.hasKoreanHanjaMix}, detectedLanguages=${normalizedLanguages.join(",")}`
|
|
2060
2269
|
);
|
|
2061
2270
|
return {
|
|
2062
2271
|
hasKoreanHanjaMix: output.hasKoreanHanjaMix,
|
|
2063
|
-
detectedLanguages:
|
|
2272
|
+
detectedLanguages: normalizedLanguages
|
|
2064
2273
|
};
|
|
2065
2274
|
}
|
|
2275
|
+
/**
|
|
2276
|
+
* Aggregate language frequency map into a sorted array.
|
|
2277
|
+
* Returns languages sorted by frequency (descending), or undefined if empty.
|
|
2278
|
+
*/
|
|
2279
|
+
aggregateLanguages(frequencyMap) {
|
|
2280
|
+
if (frequencyMap.size === 0) return void 0;
|
|
2281
|
+
return [...frequencyMap.entries()].sort((a, b) => b[1] - a[1]).map(([lang]) => lang);
|
|
2282
|
+
}
|
|
2066
2283
|
};
|
|
2067
2284
|
|
|
2068
2285
|
// src/utils/local-file-server.ts
|
|
2069
|
-
var
|
|
2286
|
+
var import_node_fs6 = require("fs");
|
|
2070
2287
|
var import_node_http = require("http");
|
|
2071
2288
|
var import_node_path5 = require("path");
|
|
2072
2289
|
var LocalFileServer = class {
|
|
@@ -2080,7 +2297,7 @@ var LocalFileServer = class {
|
|
|
2080
2297
|
*/
|
|
2081
2298
|
async start(filePath) {
|
|
2082
2299
|
const filename = (0, import_node_path5.basename)(filePath);
|
|
2083
|
-
const stat = (0,
|
|
2300
|
+
const stat = (0, import_node_fs6.statSync)(filePath);
|
|
2084
2301
|
return new Promise((resolve, reject) => {
|
|
2085
2302
|
this.server = (0, import_node_http.createServer)((req, res) => {
|
|
2086
2303
|
if (req.url === `/${filename}`) {
|
|
@@ -2088,7 +2305,7 @@ var LocalFileServer = class {
|
|
|
2088
2305
|
"Content-Type": "application/pdf",
|
|
2089
2306
|
"Content-Length": stat.size
|
|
2090
2307
|
});
|
|
2091
|
-
(0,
|
|
2308
|
+
(0, import_node_fs6.createReadStream)(filePath).pipe(res);
|
|
2092
2309
|
} else {
|
|
2093
2310
|
res.writeHead(404);
|
|
2094
2311
|
res.end("Not Found");
|
|
@@ -2125,7 +2342,7 @@ var LocalFileServer = class {
|
|
|
2125
2342
|
};
|
|
2126
2343
|
|
|
2127
2344
|
// src/core/image-pdf-converter.ts
|
|
2128
|
-
var
|
|
2345
|
+
var import_node_fs7 = require("fs");
|
|
2129
2346
|
var import_node_os = require("os");
|
|
2130
2347
|
var import_node_path6 = require("path");
|
|
2131
2348
|
var ImagePdfConverter = class {
|
|
@@ -2153,8 +2370,8 @@ var ImagePdfConverter = class {
|
|
|
2153
2370
|
this.logger.info("[ImagePdfConverter] Image PDF created:", outputPath);
|
|
2154
2371
|
return outputPath;
|
|
2155
2372
|
} finally {
|
|
2156
|
-
if ((0,
|
|
2157
|
-
(0,
|
|
2373
|
+
if ((0, import_node_fs7.existsSync)(inputPath)) {
|
|
2374
|
+
(0, import_node_fs7.rmSync)(inputPath, { force: true });
|
|
2158
2375
|
}
|
|
2159
2376
|
}
|
|
2160
2377
|
}
|
|
@@ -2201,12 +2418,12 @@ var ImagePdfConverter = class {
|
|
|
2201
2418
|
* Cleanup the temporary image PDF file
|
|
2202
2419
|
*/
|
|
2203
2420
|
cleanup(imagePdfPath) {
|
|
2204
|
-
if ((0,
|
|
2421
|
+
if ((0, import_node_fs7.existsSync)(imagePdfPath)) {
|
|
2205
2422
|
this.logger.info(
|
|
2206
2423
|
"[ImagePdfConverter] Cleaning up temp file:",
|
|
2207
2424
|
imagePdfPath
|
|
2208
2425
|
);
|
|
2209
|
-
(0,
|
|
2426
|
+
(0, import_node_fs7.rmSync)(imagePdfPath, { force: true });
|
|
2210
2427
|
}
|
|
2211
2428
|
}
|
|
2212
2429
|
};
|
|
@@ -2350,8 +2567,8 @@ var PDFConverter = class {
|
|
|
2350
2567
|
}
|
|
2351
2568
|
return strategy;
|
|
2352
2569
|
} finally {
|
|
2353
|
-
if ((0,
|
|
2354
|
-
(0,
|
|
2570
|
+
if ((0, import_node_fs8.existsSync)(samplingDir)) {
|
|
2571
|
+
(0, import_node_fs8.rmSync)(samplingDir, { recursive: true, force: true });
|
|
2355
2572
|
}
|
|
2356
2573
|
}
|
|
2357
2574
|
}
|
|
@@ -2373,8 +2590,10 @@ var PDFConverter = class {
|
|
|
2373
2590
|
let pageTexts;
|
|
2374
2591
|
try {
|
|
2375
2592
|
const resultPath2 = (0, import_node_path7.join)(outputDir, "result.json");
|
|
2376
|
-
const
|
|
2377
|
-
|
|
2593
|
+
const totalPages = await runJqFileJson(
|
|
2594
|
+
".pages | length",
|
|
2595
|
+
resultPath2
|
|
2596
|
+
);
|
|
2378
2597
|
const textExtractor = new PdfTextExtractor(this.logger);
|
|
2379
2598
|
pageTexts = await textExtractor.extractText(pdfPath, totalPages);
|
|
2380
2599
|
} catch {
|
|
@@ -2384,7 +2603,7 @@ var PDFConverter = class {
|
|
|
2384
2603
|
}
|
|
2385
2604
|
const resultPath = (0, import_node_path7.join)(outputDir, "result.json");
|
|
2386
2605
|
const ocrOriginPath = (0, import_node_path7.join)(outputDir, "result_ocr_origin.json");
|
|
2387
|
-
(0,
|
|
2606
|
+
(0, import_node_fs8.copyFileSync)(resultPath, ocrOriginPath);
|
|
2388
2607
|
const corrector = new VlmTextCorrector(this.logger);
|
|
2389
2608
|
await corrector.correctAndSave(outputDir, options.vlmProcessorModel, {
|
|
2390
2609
|
concurrency: options.vlmConcurrency,
|
|
@@ -2531,6 +2750,7 @@ var PDFConverter = class {
|
|
|
2531
2750
|
const outputDir = (0, import_node_path7.join)(cwd, "output", reportId);
|
|
2532
2751
|
try {
|
|
2533
2752
|
await this.processConvertedFiles(zipPath, extractDir, outputDir);
|
|
2753
|
+
await this.renderPageImages(url, outputDir);
|
|
2534
2754
|
if (abortSignal?.aborted) {
|
|
2535
2755
|
this.logger.info("[PDFConverter] Conversion aborted before callback");
|
|
2536
2756
|
const error = new Error("PDF conversion was aborted");
|
|
@@ -2544,19 +2764,19 @@ var PDFConverter = class {
|
|
|
2544
2764
|
this.logger.info("[PDFConverter] Total time:", duration, "ms");
|
|
2545
2765
|
} finally {
|
|
2546
2766
|
this.logger.info("[PDFConverter] Cleaning up temporary files...");
|
|
2547
|
-
if ((0,
|
|
2548
|
-
(0,
|
|
2767
|
+
if ((0, import_node_fs8.existsSync)(zipPath)) {
|
|
2768
|
+
(0, import_node_fs8.rmSync)(zipPath, { force: true });
|
|
2549
2769
|
}
|
|
2550
|
-
if ((0,
|
|
2551
|
-
(0,
|
|
2770
|
+
if ((0, import_node_fs8.existsSync)(extractDir)) {
|
|
2771
|
+
(0, import_node_fs8.rmSync)(extractDir, { recursive: true, force: true });
|
|
2552
2772
|
}
|
|
2553
2773
|
if (cleanupAfterCallback) {
|
|
2554
2774
|
this.logger.info(
|
|
2555
2775
|
"[PDFConverter] Cleaning up output directory:",
|
|
2556
2776
|
outputDir
|
|
2557
2777
|
);
|
|
2558
|
-
if ((0,
|
|
2559
|
-
(0,
|
|
2778
|
+
if ((0, import_node_fs8.existsSync)(outputDir)) {
|
|
2779
|
+
(0, import_node_fs8.rmSync)(outputDir, { recursive: true, force: true });
|
|
2560
2780
|
}
|
|
2561
2781
|
} else {
|
|
2562
2782
|
this.logger.info("[PDFConverter] Output preserved at:", outputDir);
|
|
@@ -2586,6 +2806,8 @@ var PDFConverter = class {
|
|
|
2586
2806
|
framework: "livetext"
|
|
2587
2807
|
},
|
|
2588
2808
|
generate_picture_images: true,
|
|
2809
|
+
generate_page_images: false,
|
|
2810
|
+
// Page images are rendered by PageRenderer (ImageMagick) after conversion
|
|
2589
2811
|
images_scale: 2,
|
|
2590
2812
|
/**
|
|
2591
2813
|
* While disabling this option yields the most accurate text extraction for readable PDFs,
|
|
@@ -2703,12 +2925,12 @@ var PDFConverter = class {
|
|
|
2703
2925
|
const zipPath = (0, import_node_path7.join)(process.cwd(), "result.zip");
|
|
2704
2926
|
this.logger.info("[PDFConverter] Saving ZIP file to:", zipPath);
|
|
2705
2927
|
if (zipResult.fileStream) {
|
|
2706
|
-
const writeStream = (0,
|
|
2707
|
-
await (0,
|
|
2928
|
+
const writeStream = (0, import_node_fs8.createWriteStream)(zipPath);
|
|
2929
|
+
await (0, import_promises5.pipeline)(zipResult.fileStream, writeStream);
|
|
2708
2930
|
return;
|
|
2709
2931
|
}
|
|
2710
2932
|
if (zipResult.data) {
|
|
2711
|
-
await (0,
|
|
2933
|
+
await (0, import_promises4.writeFile)(zipPath, zipResult.data);
|
|
2712
2934
|
return;
|
|
2713
2935
|
}
|
|
2714
2936
|
this.logger.warn(
|
|
@@ -2724,7 +2946,7 @@ var PDFConverter = class {
|
|
|
2724
2946
|
);
|
|
2725
2947
|
}
|
|
2726
2948
|
const buffer = new Uint8Array(await response.arrayBuffer());
|
|
2727
|
-
await (0,
|
|
2949
|
+
await (0, import_promises4.writeFile)(zipPath, buffer);
|
|
2728
2950
|
}
|
|
2729
2951
|
async processConvertedFiles(zipPath, extractDir, outputDir) {
|
|
2730
2952
|
await ImageExtractor.extractAndSaveDocumentsFromZip(
|
|
@@ -2734,6 +2956,42 @@ var PDFConverter = class {
|
|
|
2734
2956
|
outputDir
|
|
2735
2957
|
);
|
|
2736
2958
|
}
|
|
2959
|
+
/**
|
|
2960
|
+
* Render page images from the source PDF using ImageMagick and update result.json.
|
|
2961
|
+
* Uses jq to update the JSON file without loading it into Node.js memory.
|
|
2962
|
+
* Replaces Docling's generate_page_images which fails on large PDFs
|
|
2963
|
+
* due to memory limits when embedding all page images as base64.
|
|
2964
|
+
*/
|
|
2965
|
+
async renderPageImages(url, outputDir) {
|
|
2966
|
+
if (!url.startsWith("file://")) {
|
|
2967
|
+
this.logger.warn(
|
|
2968
|
+
"[PDFConverter] Page image rendering skipped: only supported for local files (file:// URLs)"
|
|
2969
|
+
);
|
|
2970
|
+
return;
|
|
2971
|
+
}
|
|
2972
|
+
const pdfPath = url.slice(7);
|
|
2973
|
+
this.logger.info(
|
|
2974
|
+
"[PDFConverter] Rendering page images with ImageMagick..."
|
|
2975
|
+
);
|
|
2976
|
+
const renderer = new PageRenderer(this.logger);
|
|
2977
|
+
const renderResult = await renderer.renderPages(pdfPath, outputDir);
|
|
2978
|
+
const resultPath = (0, import_node_path7.join)(outputDir, "result.json");
|
|
2979
|
+
const tmpPath = resultPath + ".tmp";
|
|
2980
|
+
const jqProgram = `
|
|
2981
|
+
.pages |= with_entries(
|
|
2982
|
+
if (.value.page_no - 1) >= 0 and (.value.page_no - 1) < ${renderResult.pageCount} then
|
|
2983
|
+
.value.image.uri = "pages/page_\\(.value.page_no - 1).png" |
|
|
2984
|
+
.value.image.mimetype = "image/png" |
|
|
2985
|
+
.value.image.dpi = ${PAGE_RENDERING.DEFAULT_DPI}
|
|
2986
|
+
else . end
|
|
2987
|
+
)
|
|
2988
|
+
`;
|
|
2989
|
+
await runJqFileToFile(jqProgram, resultPath, tmpPath);
|
|
2990
|
+
await (0, import_promises4.rename)(tmpPath, resultPath);
|
|
2991
|
+
this.logger.info(
|
|
2992
|
+
`[PDFConverter] Rendered ${renderResult.pageCount} page images`
|
|
2993
|
+
);
|
|
2994
|
+
}
|
|
2737
2995
|
};
|
|
2738
2996
|
|
|
2739
2997
|
// src/core/pdf-parser.ts
|
|
@@ -2772,6 +3030,7 @@ var PDFParser = class {
|
|
|
2772
3030
|
this.logger.info("[PDFParser] Initializing...");
|
|
2773
3031
|
this.checkOperatingSystem();
|
|
2774
3032
|
this.checkJqInstalled();
|
|
3033
|
+
this.checkPopplerInstalled();
|
|
2775
3034
|
this.checkMacOSVersion();
|
|
2776
3035
|
if (this.enableImagePdfFallback && !this.baseUrl) {
|
|
2777
3036
|
this.checkImageMagickInstalled();
|
|
@@ -2828,6 +3087,15 @@ var PDFParser = class {
|
|
|
2828
3087
|
);
|
|
2829
3088
|
}
|
|
2830
3089
|
}
|
|
3090
|
+
checkPopplerInstalled() {
|
|
3091
|
+
try {
|
|
3092
|
+
(0, import_node_child_process3.execSync)("which pdftotext", { stdio: "ignore" });
|
|
3093
|
+
} catch {
|
|
3094
|
+
throw new Error(
|
|
3095
|
+
"poppler is not installed. Please install poppler using: brew install poppler"
|
|
3096
|
+
);
|
|
3097
|
+
}
|
|
3098
|
+
}
|
|
2831
3099
|
checkMacOSVersion() {
|
|
2832
3100
|
try {
|
|
2833
3101
|
const versionOutput = (0, import_node_child_process3.execSync)("sw_vers -productVersion", {
|