@heripo/pdf-parser 0.1.9 → 0.1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +379 -153
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +345 -125
- package/dist/index.js.map +1 -1
- package/package.json +5 -5
package/dist/index.js
CHANGED
|
@@ -43,11 +43,17 @@ var DOCLING_ENVIRONMENT = {
|
|
|
43
43
|
*/
|
|
44
44
|
STARTUP_DELAY_MS: 2e3
|
|
45
45
|
};
|
|
46
|
+
/**
 * DPI settings shared by the page-rendering code paths.
 */
var PAGE_RENDERING = {
  /** DPI used when the caller does not request one (chosen for VLM text recognition quality). */
  DEFAULT_DPI: 200,
  /** Lower DPI used when rendering pages for OCR strategy sampling. */
  SAMPLE_DPI: 150
};
|
|
46
52
|
var IMAGE_PDF_CONVERTER = {
|
|
47
53
|
/**
|
|
48
54
|
* ImageMagick density option (DPI) for PDF to image conversion
|
|
49
55
|
*/
|
|
50
|
-
DENSITY:
|
|
56
|
+
DENSITY: PAGE_RENDERING.DEFAULT_DPI,
|
|
51
57
|
/**
|
|
52
58
|
* ImageMagick quality option (1-100)
|
|
53
59
|
*/
|
|
@@ -837,16 +843,10 @@ var DoclingEnvironment = class _DoclingEnvironment {
|
|
|
837
843
|
|
|
838
844
|
// src/core/pdf-converter.ts
|
|
839
845
|
import { omit } from "es-toolkit";
|
|
840
|
-
import {
|
|
841
|
-
|
|
842
|
-
createWriteStream as createWriteStream2,
|
|
843
|
-
existsSync as existsSync4,
|
|
844
|
-
readFileSync as readFileSync4,
|
|
845
|
-
rmSync as rmSync3
|
|
846
|
-
} from "fs";
|
|
847
|
-
import { writeFile } from "fs/promises";
|
|
846
|
+
import { copyFileSync, createWriteStream as createWriteStream3, existsSync as existsSync4, rmSync as rmSync3 } from "fs";
|
|
847
|
+
import { rename as rename2, writeFile } from "fs/promises";
|
|
848
848
|
import { join as join6 } from "path";
|
|
849
|
-
import { pipeline } from "stream/promises";
|
|
849
|
+
import { pipeline as pipeline3 } from "stream/promises";
|
|
850
850
|
|
|
851
851
|
// src/errors/image-pdf-fallback-error.ts
|
|
852
852
|
var ImagePdfFallbackError = class extends Error {
|
|
@@ -862,19 +862,24 @@ var ImagePdfFallbackError = class extends Error {
|
|
|
862
862
|
|
|
863
863
|
// src/processors/image-extractor.ts
|
|
864
864
|
import {
|
|
865
|
-
|
|
865
|
+
createReadStream,
|
|
866
|
+
createWriteStream as createWriteStream2,
|
|
866
867
|
existsSync,
|
|
867
868
|
mkdirSync,
|
|
868
|
-
readFileSync,
|
|
869
869
|
readdirSync,
|
|
870
870
|
rmSync,
|
|
871
871
|
writeFileSync
|
|
872
872
|
} from "fs";
|
|
873
873
|
import { extname, join as join2 } from "path";
|
|
874
|
+
import { Transform } from "stream";
|
|
875
|
+
import { pipeline as pipeline2 } from "stream/promises";
|
|
874
876
|
import * as yauzl from "yauzl";
|
|
875
877
|
|
|
876
878
|
// src/utils/jq.ts
|
|
877
879
|
import { spawn as spawn3 } from "child_process";
|
|
880
|
+
import { createWriteStream } from "fs";
|
|
881
|
+
import { rename } from "fs/promises";
|
|
882
|
+
import { pipeline } from "stream/promises";
|
|
878
883
|
function getJqPath() {
|
|
879
884
|
const p = process.env.JQ_PATH?.trim();
|
|
880
885
|
return p && p.length > 0 ? p : "jq";
|
|
@@ -926,25 +931,139 @@ function runJqFileJson(program, filePath) {
|
|
|
926
931
|
});
|
|
927
932
|
});
|
|
928
933
|
}
|
|
929
|
-
function
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
|
|
934
|
-
]
|
|
935
|
-
|
|
936
|
-
|
|
934
|
+
/**
 * Run jq on `inputPath` and stream its stdout straight into `outputPath`,
 * so the JSON never has to be held in Node.js memory.
 *
 * The promise resolves only after BOTH the stdout→file pipeline has finished
 * and the process has closed with exit code 0.
 *
 * On any failure (spawn error, pipeline/write error, non-zero exit) the child
 * is killed, the write stream is destroyed, and the partially written output
 * file is removed (best effort) before the promise rejects.
 *
 * @param {string} program - jq filter program
 * @param {string} inputPath - path of the JSON file handed to jq
 * @param {string} outputPath - path the jq output is written to
 * @returns {Promise<void>}
 */
function runJqFileToFile(program, inputPath, outputPath) {
  return new Promise((resolve, reject) => {
    const child = spawn3(getJqPath(), [program, inputPath], {
      stdio: ["ignore", "pipe", "pipe"],
      env: process.env
    });
    const ws = createWriteStream(outputPath);
    let stderr = "";
    let exitCode = null;
    let pipelineDone = false;
    let settled = false;
    // Tracked manually so the failure path knows whether a "close" event is still to come.
    let wsClosed = false;
    ws.once("close", () => {
      wsClosed = true;
    });
    child.stderr.setEncoding("utf-8");
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
    });
    // Single failure path: stop jq, release the file handle, drop the partial output.
    function fail(err) {
      if (settled) return;
      settled = true;
      child.kill();
      const discardAndReject = () => {
        try {
          rmSync(outputPath, { force: true });
        } catch {
          // Best-effort cleanup; the original error is what the caller needs to see.
        }
        reject(err);
      };
      if (wsClosed) {
        discardAndReject();
      } else {
        // Remove the file only after the descriptor is closed.
        ws.once("close", discardAndReject);
        ws.destroy();
      }
    }
    function trySettle() {
      if (settled) return;
      // Wait until both the pipeline and the process have reported in.
      if (!pipelineDone || exitCode === null) return;
      if (exitCode !== 0) {
        fail(
          new Error(
            `jq exited with code ${exitCode}. ${stderr ? "Stderr: " + stderr : ""}`
          )
        );
        return;
      }
      settled = true;
      resolve();
    }
    child.on("error", fail);
    pipeline(child.stdout, ws).then(() => {
      pipelineDone = true;
      trySettle();
    }, fail);
    child.on("close", (code) => {
      // A null code (terminated by signal) is treated as a failure.
      exitCode = code ?? 1;
      trySettle();
    });
  });
}
|
|
938
|
-
function
|
|
985
|
+
/**
 * Run jq with `-r` on `filePath` and hand each non-empty output line to
 * `onLine` as soon as it is complete.
 *
 * Only the newly received chunk is scanned for newlines and partial lines are
 * kept as a list of fragments, so a very long line arriving over many chunks
 * is not rescanned from the start each time.
 *
 * The promise settles exactly once:
 * - rejects with the error thrown by `onLine` (the child is killed and no
 *   further lines are delivered),
 * - rejects on a process error or a non-zero exit code,
 * - resolves when jq closes with exit code 0.
 *
 * @param {string} program - jq filter program
 * @param {string} filePath - path of the JSON file handed to jq
 * @param {(line: string) => void} onLine - synchronous per-line callback
 * @returns {Promise<void>}
 */
function runJqFileLines(program, filePath, onLine) {
  return new Promise((resolve, reject) => {
    const child = spawn3(getJqPath(), ["-r", program, filePath], {
      stdio: ["ignore", "pipe", "pipe"],
      env: process.env
    });
    let stderr = "";
    // Fragments of the line currently being assembled (never contain "\n").
    let lineParts = [];
    let settled = false;
    child.stdout.setEncoding("utf-8");
    child.stderr.setEncoding("utf-8");
    function fail(err) {
      if (settled) return;
      settled = true;
      child.kill();
      reject(err);
    }
    function deliver(line) {
      // Empty lines are skipped; nothing is delivered once the promise has settled.
      if (settled || line.length === 0) return;
      try {
        onLine(line);
      } catch (err) {
        fail(err);
      }
    }
    child.stdout.on("data", (chunk) => {
      if (settled) return;
      let start = 0;
      let newlineIdx;
      while ((newlineIdx = chunk.indexOf("\n", start)) !== -1) {
        lineParts.push(chunk.slice(start, newlineIdx));
        deliver(lineParts.join(""));
        lineParts = [];
        start = newlineIdx + 1;
      }
      if (start < chunk.length) {
        lineParts.push(chunk.slice(start));
      }
    });
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
    });
    child.on("error", fail);
    child.on("close", (code) => {
      // Flush a final line that was not newline-terminated.
      deliver(lineParts.join(""));
      lineParts = [];
      if (settled) return;
      settled = true;
      if (code !== 0) {
        reject(
          new Error(
            `jq exited with code ${code}. ${stderr ? "Stderr: " + stderr : ""}`
          )
        );
      } else {
        resolve();
      }
    });
  });
}
|
|
1043
|
+
/**
 * Stream every string value in the JSON file that starts with
 * `data:image/png;base64` to `onImage`, one at a time.
 *
 * @param {string} filePath - path of the JSON file to scan
 * @param {(dataUri: string, index: number) => void} onImage - called with each
 *   matching string and its zero-based position in output order
 * @returns {Promise<number>} number of strings passed to `onImage`
 */
async function jqExtractBase64PngStringsStreaming(filePath, onImage) {
  const program = '.. | select(type == "string" and startswith("data:image/png;base64"))';
  let emitted = 0;
  const forward = (dataUri) => {
    onImage(dataUri, emitted);
    // Counted only after the callback returned without throwing.
    emitted += 1;
  };
  await runJqFileLines(program, filePath, forward);
  return emitted;
}
|
|
1055
|
+
/**
 * Rewrite a JSON file so that every string starting with
 * `data:image/png;base64` is replaced by `<dirName>/<prefix>_<n>.png`, where
 * `n` counts the replaced strings from 0 in jq `paths` order.
 *
 * jq writes the result to `<outputPath>.tmp`, which is then renamed onto
 * `outputPath`, so `outputPath` is only replaced after jq succeeded.
 *
 * `dirName` and `prefix` are embedded as a JSON-escaped string literal, so
 * quotes or backslashes in them cannot break or alter the jq program.
 *
 * @param {string} inputPath - source JSON file
 * @param {string} outputPath - destination JSON file
 * @param {string} dirName - directory part of the generated image paths
 * @param {string} prefix - file-name prefix of the generated image paths
 * @returns {Promise<void>}
 */
async function jqReplaceBase64WithPathsToFile(inputPath, outputPath, dirName, prefix) {
  // JSON string escapes are valid inside jq string literals; a backslash is
  // always doubled, so no "\(" interpolation can be smuggled in.
  const pathPrefixLiteral = JSON.stringify(`${dirName}/${prefix}_`);
  const program = `
    reduce paths(type == "string" and startswith("data:image/png;base64")) as $p (
      {data: ., counter: 0};
      .counter as $idx |
      .data |= setpath($p; ${pathPrefixLiteral} + "\\($idx).png") |
      .counter += 1
    ) | .data
  `;
  const tmpPath = outputPath + ".tmp";
  await runJqFileToFile(program, inputPath, tmpPath);
  await rename(tmpPath, outputPath);
}
|
|
949
1068
|
|
|
950
1069
|
// src/processors/image-extractor.ts
|
|
@@ -972,7 +1091,7 @@ var ImageExtractor = class _ImageExtractor {
|
|
|
972
1091
|
return;
|
|
973
1092
|
}
|
|
974
1093
|
mkdirSync(join2(entryPath, ".."), { recursive: true });
|
|
975
|
-
const writeStream =
|
|
1094
|
+
const writeStream = createWriteStream2(entryPath);
|
|
976
1095
|
readStream.pipe(writeStream);
|
|
977
1096
|
writeStream.on("finish", () => {
|
|
978
1097
|
zipfile.readEntry();
|
|
@@ -988,26 +1107,6 @@ var ImageExtractor = class _ImageExtractor {
|
|
|
988
1107
|
});
|
|
989
1108
|
});
|
|
990
1109
|
}
|
|
991
|
-
/**
|
|
992
|
-
* Extract base64 images from JSON file using jq (for large files)
|
|
993
|
-
* Returns array of base64 data strings
|
|
994
|
-
*/
|
|
995
|
-
static async extractBase64ImagesFromJsonWithJq(jsonPath) {
|
|
996
|
-
return jqExtractBase64PngStrings(jsonPath);
|
|
997
|
-
}
|
|
998
|
-
/**
|
|
999
|
-
* Replace base64 images with file paths in JSON using jq (for large files)
|
|
1000
|
-
* Uses reduce to maintain counter state while walking the JSON
|
|
1001
|
-
*/
|
|
1002
|
-
static async replaceBase64ImagesInJsonWithJq(jsonPath, outputPath, dirName, prefix) {
|
|
1003
|
-
const { data, count } = await jqReplaceBase64WithPaths(
|
|
1004
|
-
jsonPath,
|
|
1005
|
-
dirName,
|
|
1006
|
-
prefix
|
|
1007
|
-
);
|
|
1008
|
-
writeFileSync(outputPath, JSON.stringify(data, null, 2), "utf-8");
|
|
1009
|
-
return count;
|
|
1010
|
-
}
|
|
1011
1110
|
/**
|
|
1012
1111
|
* Extract a base64-encoded image to a file and return the relative path
|
|
1013
1112
|
*/
|
|
@@ -1021,8 +1120,66 @@ var ImageExtractor = class _ImageExtractor {
|
|
|
1021
1120
|
return `${dirName}/${filename}`;
|
|
1022
1121
|
}
|
|
1023
1122
|
/**
|
|
1024
|
-
*
|
|
1025
|
-
*
|
|
1123
|
+
* Extract base64 images from HTML using streaming.
|
|
1124
|
+
* Reads HTML file as a stream, extracts base64 images from src attributes,
|
|
1125
|
+
* saves them as PNG files, and replaces with file paths in the output HTML.
|
|
1126
|
+
* Returns the number of images extracted.
|
|
1127
|
+
*/
|
|
1128
|
+
static async extractImagesFromHtmlStream(htmlInputPath, htmlOutputPath, imagesDir) {
|
|
1129
|
+
let imageIndex = 0;
|
|
1130
|
+
let pending = "";
|
|
1131
|
+
const MARKER = 'src="data:image/png;base64,';
|
|
1132
|
+
const transform = new Transform({
|
|
1133
|
+
decodeStrings: false,
|
|
1134
|
+
encoding: "utf-8",
|
|
1135
|
+
transform(chunk, _encoding, callback) {
|
|
1136
|
+
pending += chunk;
|
|
1137
|
+
let result = "";
|
|
1138
|
+
while (true) {
|
|
1139
|
+
const markerIdx = pending.indexOf(MARKER);
|
|
1140
|
+
if (markerIdx === -1) {
|
|
1141
|
+
const safeEnd = Math.max(0, pending.length - MARKER.length);
|
|
1142
|
+
result += pending.slice(0, safeEnd);
|
|
1143
|
+
pending = pending.slice(safeEnd);
|
|
1144
|
+
break;
|
|
1145
|
+
}
|
|
1146
|
+
result += pending.slice(0, markerIdx);
|
|
1147
|
+
const dataStart = markerIdx + MARKER.length;
|
|
1148
|
+
const quoteIdx = pending.indexOf('"', dataStart);
|
|
1149
|
+
if (quoteIdx === -1) {
|
|
1150
|
+
pending = pending.slice(markerIdx);
|
|
1151
|
+
break;
|
|
1152
|
+
}
|
|
1153
|
+
const base64Content = pending.slice(dataStart, quoteIdx);
|
|
1154
|
+
const filename = `image_${imageIndex}.png`;
|
|
1155
|
+
const filepath = join2(imagesDir, filename);
|
|
1156
|
+
const buf = Buffer.from(base64Content, "base64");
|
|
1157
|
+
writeFileSync(filepath, buf);
|
|
1158
|
+
const relativePath = `images/${filename}`;
|
|
1159
|
+
result += `src="${relativePath}"`;
|
|
1160
|
+
imageIndex++;
|
|
1161
|
+
pending = pending.slice(quoteIdx + 1);
|
|
1162
|
+
}
|
|
1163
|
+
if (result.length > 0) {
|
|
1164
|
+
this.push(result);
|
|
1165
|
+
}
|
|
1166
|
+
callback();
|
|
1167
|
+
},
|
|
1168
|
+
flush(callback) {
|
|
1169
|
+
if (pending.length > 0) {
|
|
1170
|
+
this.push(pending);
|
|
1171
|
+
}
|
|
1172
|
+
callback();
|
|
1173
|
+
}
|
|
1174
|
+
});
|
|
1175
|
+
const rs = createReadStream(htmlInputPath, { encoding: "utf-8" });
|
|
1176
|
+
const ws = createWriteStream2(htmlOutputPath, { encoding: "utf-8" });
|
|
1177
|
+
await pipeline2(rs, transform, ws);
|
|
1178
|
+
return imageIndex;
|
|
1179
|
+
}
|
|
1180
|
+
/**
|
|
1181
|
+
* Save JSON and HTML documents with base64 images extracted to separate files.
|
|
1182
|
+
* Uses jq for JSON processing and streaming for HTML to handle large files.
|
|
1026
1183
|
*
|
|
1027
1184
|
* This method:
|
|
1028
1185
|
* 1. Extracts base64-encoded images from JSON and HTML content
|
|
@@ -1030,7 +1187,7 @@ var ImageExtractor = class _ImageExtractor {
|
|
|
1030
1187
|
* 3. Replaces base64 data with relative file paths
|
|
1031
1188
|
* 4. Saves the transformed documents to the output directory
|
|
1032
1189
|
*/
|
|
1033
|
-
static async saveDocumentsWithExtractedImages(logger, outputDir, filename, jsonSourcePath,
|
|
1190
|
+
static async saveDocumentsWithExtractedImages(logger, outputDir, filename, jsonSourcePath, htmlSourcePath) {
|
|
1034
1191
|
try {
|
|
1035
1192
|
if (existsSync(outputDir)) {
|
|
1036
1193
|
rmSync(outputDir, { recursive: true, force: true });
|
|
@@ -1046,27 +1203,29 @@ var ImageExtractor = class _ImageExtractor {
|
|
|
1046
1203
|
if (!existsSync(imagesDir)) {
|
|
1047
1204
|
mkdirSync(imagesDir, { recursive: true });
|
|
1048
1205
|
}
|
|
1049
|
-
const
|
|
1050
|
-
|
|
1051
|
-
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
|
|
1057
|
-
|
|
1058
|
-
|
|
1206
|
+
const imageCount = await jqExtractBase64PngStringsStreaming(
|
|
1207
|
+
jsonSourcePath,
|
|
1208
|
+
(base64Data, index) => {
|
|
1209
|
+
_ImageExtractor.extractBase64ImageToFile(
|
|
1210
|
+
base64Data,
|
|
1211
|
+
imagesDir,
|
|
1212
|
+
index,
|
|
1213
|
+
"pic",
|
|
1214
|
+
"images"
|
|
1215
|
+
);
|
|
1216
|
+
}
|
|
1217
|
+
);
|
|
1059
1218
|
logger.info(
|
|
1060
|
-
`[PDFConverter] Extracted ${
|
|
1219
|
+
`[PDFConverter] Extracted ${imageCount} picture images from JSON to ${imagesDir}`
|
|
1061
1220
|
);
|
|
1062
|
-
|
|
1221
|
+
await jqReplaceBase64WithPathsToFile(
|
|
1063
1222
|
jsonSourcePath,
|
|
1064
1223
|
jsonPath,
|
|
1065
1224
|
"images",
|
|
1066
1225
|
"pic"
|
|
1067
1226
|
);
|
|
1068
1227
|
logger.info(
|
|
1069
|
-
`[PDFConverter] Replaced ${
|
|
1228
|
+
`[PDFConverter] Replaced ${imageCount} base64 images with file paths`
|
|
1070
1229
|
);
|
|
1071
1230
|
} catch (e) {
|
|
1072
1231
|
logger.warn(
|
|
@@ -1082,42 +1241,36 @@ var ImageExtractor = class _ImageExtractor {
|
|
|
1082
1241
|
if (!existsSync(imagesDir)) {
|
|
1083
1242
|
mkdirSync(imagesDir, { recursive: true });
|
|
1084
1243
|
}
|
|
1085
|
-
|
|
1086
|
-
|
|
1087
|
-
|
|
1088
|
-
|
|
1089
|
-
const filename2 = `image_${imageIndex}.png`;
|
|
1090
|
-
const filepath = join2(imagesDir, filename2);
|
|
1091
|
-
const buffer = Buffer.from(base64Content, "base64");
|
|
1092
|
-
writeFileSync(filepath, buffer);
|
|
1093
|
-
const relativePath = `images/${filename2}`;
|
|
1094
|
-
imageIndex += 1;
|
|
1095
|
-
return `src="${relativePath}"`;
|
|
1096
|
-
}
|
|
1244
|
+
const htmlImageCount = await _ImageExtractor.extractImagesFromHtmlStream(
|
|
1245
|
+
htmlSourcePath,
|
|
1246
|
+
htmlPath,
|
|
1247
|
+
imagesDir
|
|
1097
1248
|
);
|
|
1098
1249
|
logger.info(
|
|
1099
|
-
`[PDFConverter] Extracted ${
|
|
1250
|
+
`[PDFConverter] Extracted ${htmlImageCount} images from HTML to ${imagesDir}`
|
|
1100
1251
|
);
|
|
1101
|
-
writeFileSync(htmlPath, transformedHtml, "utf-8");
|
|
1102
1252
|
} catch (e) {
|
|
1103
1253
|
logger.warn(
|
|
1104
|
-
"[PDFConverter] Failed to extract images from HTML,
|
|
1254
|
+
"[PDFConverter] Failed to extract images from HTML, copying original. Error:",
|
|
1105
1255
|
e
|
|
1106
1256
|
);
|
|
1107
|
-
|
|
1257
|
+
const rs = createReadStream(htmlSourcePath);
|
|
1258
|
+
const ws = createWriteStream2(htmlPath);
|
|
1259
|
+
await pipeline2(rs, ws);
|
|
1108
1260
|
}
|
|
1109
1261
|
logger.info("[PDFConverter] Saved HTML:", htmlPath);
|
|
1110
1262
|
}
|
|
1111
1263
|
/**
|
|
1112
1264
|
* Extract documents from ZIP and save with extracted images
|
|
1113
|
-
* Uses jq for JSON processing to handle large files
|
|
1265
|
+
* Uses jq for JSON processing and streaming for HTML to handle large files
|
|
1266
|
+
* without loading into Node.js memory
|
|
1114
1267
|
*
|
|
1115
1268
|
* Complete workflow:
|
|
1116
1269
|
* 1. Extract ZIP file to temporary directory
|
|
1117
1270
|
* 2. Find JSON and HTML files from extracted files
|
|
1118
|
-
* 3. Use jq to extract base64 images from JSON and save as separate files
|
|
1119
|
-
* 4. Use jq to replace base64 with file paths in JSON
|
|
1120
|
-
* 5. Process HTML with
|
|
1271
|
+
* 3. Use jq to stream-extract base64 images from JSON and save as separate files
|
|
1272
|
+
* 4. Use jq to replace base64 with file paths in JSON (piped to file)
|
|
1273
|
+
* 5. Process HTML with streaming Transform to extract and replace images
|
|
1121
1274
|
* 6. Save transformed documents to output directory (as result.json and result.html)
|
|
1122
1275
|
*/
|
|
1123
1276
|
static async extractAndSaveDocumentsFromZip(logger, zipPath, extractDir, outputDir) {
|
|
@@ -1133,14 +1286,13 @@ var ImageExtractor = class _ImageExtractor {
|
|
|
1133
1286
|
}
|
|
1134
1287
|
const jsonPath = join2(extractDir, jsonFile);
|
|
1135
1288
|
const htmlPath = join2(extractDir, htmlFile);
|
|
1136
|
-
const htmlContent = readFileSync(htmlPath, "utf-8");
|
|
1137
1289
|
logger.info("[PDFConverter] Saving converted files to output...");
|
|
1138
1290
|
await _ImageExtractor.saveDocumentsWithExtractedImages(
|
|
1139
1291
|
logger,
|
|
1140
1292
|
outputDir,
|
|
1141
1293
|
"result",
|
|
1142
1294
|
jsonPath,
|
|
1143
|
-
|
|
1295
|
+
htmlPath
|
|
1144
1296
|
);
|
|
1145
1297
|
logger.info("[PDFConverter] Files saved to:", outputDir);
|
|
1146
1298
|
}
|
|
@@ -1149,43 +1301,82 @@ var ImageExtractor = class _ImageExtractor {
|
|
|
1149
1301
|
// src/processors/page-renderer.ts
|
|
1150
1302
|
import { existsSync as existsSync2, mkdirSync as mkdirSync2, readdirSync as readdirSync2 } from "fs";
|
|
1151
1303
|
import { join as join3 } from "path";
|
|
1152
|
-
var
|
|
1304
|
+
var PROGRESS_LOG_PERCENT_STEP = 10;
|
|
1153
1305
|
var PageRenderer = class {
|
|
1154
1306
|
constructor(logger) {
|
|
1155
1307
|
this.logger = logger;
|
|
1156
1308
|
}
|
|
1309
|
+
lastLoggedPercent = 0;
|
|
1157
1310
|
/**
|
|
1158
1311
|
* Render all pages of a PDF to individual PNG files.
|
|
1159
1312
|
*
|
|
1313
|
+
* Uses per-page rendering (`magick 'input.pdf[N]'`) when page count is known,
|
|
1314
|
+
* limiting peak memory to ~15MB/page instead of loading all pages at once.
|
|
1315
|
+
*
|
|
1160
1316
|
* @param pdfPath - Absolute path to the source PDF file
|
|
1161
1317
|
* @param outputDir - Directory where pages/ subdirectory will be created
|
|
1162
1318
|
* @param options - Rendering options
|
|
1163
1319
|
* @returns Render result with page count and file paths
|
|
1164
1320
|
*/
|
|
1165
1321
|
async renderPages(pdfPath, outputDir, options) {
|
|
1166
|
-
const dpi = options?.dpi ?? DEFAULT_DPI;
|
|
1322
|
+
const dpi = options?.dpi ?? PAGE_RENDERING.DEFAULT_DPI;
|
|
1167
1323
|
const pagesDir = join3(outputDir, "pages");
|
|
1168
1324
|
if (!existsSync2(pagesDir)) {
|
|
1169
1325
|
mkdirSync2(pagesDir, { recursive: true });
|
|
1170
1326
|
}
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
|
|
1174
|
-
|
|
1175
|
-
dpi.toString(),
|
|
1176
|
-
pdfPath,
|
|
1177
|
-
"-background",
|
|
1178
|
-
"white",
|
|
1179
|
-
"-alpha",
|
|
1180
|
-
"remove",
|
|
1181
|
-
"-alpha",
|
|
1182
|
-
"off",
|
|
1183
|
-
outputPattern
|
|
1184
|
-
]);
|
|
1185
|
-
if (result.code !== 0) {
|
|
1186
|
-
throw new Error(
|
|
1187
|
-
`[PageRenderer] Failed to render PDF pages: ${result.stderr || "Unknown error"}`
|
|
1327
|
+
const totalPages = await this.getPageCount(pdfPath);
|
|
1328
|
+
if (totalPages > 0) {
|
|
1329
|
+
this.logger.info(
|
|
1330
|
+
`[PageRenderer] Rendering ${totalPages} pages at ${dpi} DPI...`
|
|
1188
1331
|
);
|
|
1332
|
+
this.lastLoggedPercent = 0;
|
|
1333
|
+
for (let i = 0; i < totalPages; i++) {
|
|
1334
|
+
const result = await spawnAsync(
|
|
1335
|
+
"magick",
|
|
1336
|
+
[
|
|
1337
|
+
"-density",
|
|
1338
|
+
dpi.toString(),
|
|
1339
|
+
`${pdfPath}[${i}]`,
|
|
1340
|
+
"-background",
|
|
1341
|
+
"white",
|
|
1342
|
+
"-alpha",
|
|
1343
|
+
"remove",
|
|
1344
|
+
"-alpha",
|
|
1345
|
+
"off",
|
|
1346
|
+
join3(pagesDir, `page_${i}.png`)
|
|
1347
|
+
],
|
|
1348
|
+
{ captureStdout: false }
|
|
1349
|
+
);
|
|
1350
|
+
if (result.code !== 0) {
|
|
1351
|
+
throw new Error(
|
|
1352
|
+
`[PageRenderer] Failed to render page ${i + 1}/${totalPages}: ${result.stderr || "Unknown error"}`
|
|
1353
|
+
);
|
|
1354
|
+
}
|
|
1355
|
+
this.logProgress(i + 1, totalPages);
|
|
1356
|
+
}
|
|
1357
|
+
} else {
|
|
1358
|
+
this.logger.info(`[PageRenderer] Rendering PDF at ${dpi} DPI...`);
|
|
1359
|
+
const result = await spawnAsync(
|
|
1360
|
+
"magick",
|
|
1361
|
+
[
|
|
1362
|
+
"-density",
|
|
1363
|
+
dpi.toString(),
|
|
1364
|
+
pdfPath,
|
|
1365
|
+
"-background",
|
|
1366
|
+
"white",
|
|
1367
|
+
"-alpha",
|
|
1368
|
+
"remove",
|
|
1369
|
+
"-alpha",
|
|
1370
|
+
"off",
|
|
1371
|
+
join3(pagesDir, "page_%d.png")
|
|
1372
|
+
],
|
|
1373
|
+
{ captureStdout: false }
|
|
1374
|
+
);
|
|
1375
|
+
if (result.code !== 0) {
|
|
1376
|
+
throw new Error(
|
|
1377
|
+
`[PageRenderer] Failed to render PDF pages: ${result.stderr || "Unknown error"}`
|
|
1378
|
+
);
|
|
1379
|
+
}
|
|
1189
1380
|
}
|
|
1190
1381
|
const pageFiles = readdirSync2(pagesDir).filter((f) => f.startsWith("page_") && f.endsWith(".png")).sort((a, b) => {
|
|
1191
1382
|
const numA = parseInt(a.replace("page_", "").replace(".png", ""), 10);
|
|
@@ -1201,6 +1392,32 @@ var PageRenderer = class {
|
|
|
1201
1392
|
pageFiles
|
|
1202
1393
|
};
|
|
1203
1394
|
}
|
|
1395
|
+
/**
|
|
1396
|
+
* Log rendering progress at appropriate intervals (every 10%).
|
|
1397
|
+
*/
|
|
1398
|
+
logProgress(current, total) {
|
|
1399
|
+
const percent = Math.floor(current / total * 100);
|
|
1400
|
+
if (percent >= this.lastLoggedPercent + PROGRESS_LOG_PERCENT_STEP || current === total) {
|
|
1401
|
+
this.lastLoggedPercent = percent;
|
|
1402
|
+
this.logger.info(
|
|
1403
|
+
`[PageRenderer] Rendering pages: ${current}/${total} (${percent}%)`
|
|
1404
|
+
);
|
|
1405
|
+
}
|
|
1406
|
+
}
|
|
1407
|
+
/**
|
|
1408
|
+
* Get total page count using pdfinfo.
|
|
1409
|
+
 * Returns 0 on failure; renderPages then renders the whole PDF in a single magick call without progress logging.
|
|
1410
|
+
*/
|
|
1411
|
+
async getPageCount(pdfPath) {
|
|
1412
|
+
try {
|
|
1413
|
+
const result = await spawnAsync("pdfinfo", [pdfPath]);
|
|
1414
|
+
if (result.code !== 0) return 0;
|
|
1415
|
+
const match = result.stdout.match(/^Pages:\s+(\d+)/m);
|
|
1416
|
+
return match ? parseInt(match[1], 10) : 0;
|
|
1417
|
+
} catch {
|
|
1418
|
+
return 0;
|
|
1419
|
+
}
|
|
1420
|
+
}
|
|
1204
1421
|
};
|
|
1205
1422
|
|
|
1206
1423
|
// src/processors/pdf-text-extractor.ts
|
|
@@ -1286,7 +1503,7 @@ var PdfTextExtractor = class {
|
|
|
1286
1503
|
};
|
|
1287
1504
|
|
|
1288
1505
|
// src/processors/vlm-text-corrector.ts
|
|
1289
|
-
import { readFileSync
|
|
1506
|
+
import { readFileSync, writeFileSync as writeFileSync2 } from "fs";
|
|
1290
1507
|
import { join as join4 } from "path";
|
|
1291
1508
|
|
|
1292
1509
|
// src/types/vlm-text-correction-schema.ts
|
|
@@ -1418,7 +1635,7 @@ var VlmTextCorrector = class {
|
|
|
1418
1635
|
async correctAndSave(outputDir, model, options) {
|
|
1419
1636
|
this.logger.info("[VlmTextCorrector] Starting text correction...");
|
|
1420
1637
|
const resultPath = join4(outputDir, "result.json");
|
|
1421
|
-
const doc = JSON.parse(
|
|
1638
|
+
const doc = JSON.parse(readFileSync(resultPath, "utf-8"));
|
|
1422
1639
|
let pageNumbers = this.getPageNumbers(doc);
|
|
1423
1640
|
if (pageNumbers.length === 0) {
|
|
1424
1641
|
this.logger.info("[VlmTextCorrector] No pages to process");
|
|
@@ -1745,7 +1962,7 @@ var VlmTextCorrector = class {
|
|
|
1745
1962
|
*/
|
|
1746
1963
|
readPageImage(outputDir, pageNo) {
|
|
1747
1964
|
const imagePath = join4(outputDir, "pages", `page_${pageNo - 1}.png`);
|
|
1748
|
-
return
|
|
1965
|
+
return readFileSync(imagePath).toString("base64");
|
|
1749
1966
|
}
|
|
1750
1967
|
/**
|
|
1751
1968
|
* Apply VLM corrections to the DoclingDocument.
|
|
@@ -1800,9 +2017,8 @@ var VlmTextCorrector = class {
|
|
|
1800
2017
|
|
|
1801
2018
|
// src/samplers/ocr-strategy-sampler.ts
|
|
1802
2019
|
import { normalizeToBcp47 } from "@heripo/model";
|
|
1803
|
-
import { readFileSync as
|
|
2020
|
+
import { readFileSync as readFileSync2 } from "fs";
|
|
1804
2021
|
import { z as z2 } from "zod/v4";
|
|
1805
|
-
var SAMPLE_DPI = 150;
|
|
1806
2022
|
var EDGE_TRIM_RATIO = 0.1;
|
|
1807
2023
|
var DEFAULT_MAX_SAMPLE_PAGES = 15;
|
|
1808
2024
|
var DEFAULT_MAX_RETRIES2 = 3;
|
|
@@ -1853,7 +2069,7 @@ var OcrStrategySampler = class {
|
|
|
1853
2069
|
const renderResult = await this.pageRenderer.renderPages(
|
|
1854
2070
|
pdfPath,
|
|
1855
2071
|
outputDir,
|
|
1856
|
-
{ dpi: SAMPLE_DPI }
|
|
2072
|
+
{ dpi: PAGE_RENDERING.SAMPLE_DPI }
|
|
1857
2073
|
);
|
|
1858
2074
|
if (renderResult.pageCount === 0) {
|
|
1859
2075
|
this.logger.info("[OcrStrategySampler] No pages found in PDF");
|
|
@@ -2015,7 +2231,7 @@ var OcrStrategySampler = class {
|
|
|
2015
2231
|
this.logger.debug(
|
|
2016
2232
|
`[OcrStrategySampler] Analyzing page ${pageNo} for Korean-Hanja mix and language...`
|
|
2017
2233
|
);
|
|
2018
|
-
const base64Image =
|
|
2234
|
+
const base64Image = readFileSync2(pageFile).toString("base64");
|
|
2019
2235
|
const messages = [
|
|
2020
2236
|
{
|
|
2021
2237
|
role: "user",
|
|
@@ -2063,7 +2279,7 @@ var OcrStrategySampler = class {
|
|
|
2063
2279
|
};
|
|
2064
2280
|
|
|
2065
2281
|
// src/utils/local-file-server.ts
|
|
2066
|
-
import { createReadStream, statSync } from "fs";
|
|
2282
|
+
import { createReadStream as createReadStream2, statSync } from "fs";
|
|
2067
2283
|
import { createServer } from "http";
|
|
2068
2284
|
import { basename } from "path";
|
|
2069
2285
|
var LocalFileServer = class {
|
|
@@ -2085,7 +2301,7 @@ var LocalFileServer = class {
|
|
|
2085
2301
|
"Content-Type": "application/pdf",
|
|
2086
2302
|
"Content-Length": stat.size
|
|
2087
2303
|
});
|
|
2088
|
-
|
|
2304
|
+
createReadStream2(filePath).pipe(res);
|
|
2089
2305
|
} else {
|
|
2090
2306
|
res.writeHead(404);
|
|
2091
2307
|
res.end("Not Found");
|
|
@@ -2370,8 +2586,10 @@ var PDFConverter = class {
|
|
|
2370
2586
|
let pageTexts;
|
|
2371
2587
|
try {
|
|
2372
2588
|
const resultPath2 = join6(outputDir, "result.json");
|
|
2373
|
-
const
|
|
2374
|
-
|
|
2589
|
+
const totalPages = await runJqFileJson(
|
|
2590
|
+
".pages | length",
|
|
2591
|
+
resultPath2
|
|
2592
|
+
);
|
|
2375
2593
|
const textExtractor = new PdfTextExtractor(this.logger);
|
|
2376
2594
|
pageTexts = await textExtractor.extractText(pdfPath, totalPages);
|
|
2377
2595
|
} catch {
|
|
@@ -2703,8 +2921,8 @@ var PDFConverter = class {
|
|
|
2703
2921
|
const zipPath = join6(process.cwd(), "result.zip");
|
|
2704
2922
|
this.logger.info("[PDFConverter] Saving ZIP file to:", zipPath);
|
|
2705
2923
|
if (zipResult.fileStream) {
|
|
2706
|
-
const writeStream =
|
|
2707
|
-
await
|
|
2924
|
+
const writeStream = createWriteStream3(zipPath);
|
|
2925
|
+
await pipeline3(zipResult.fileStream, writeStream);
|
|
2708
2926
|
return;
|
|
2709
2927
|
}
|
|
2710
2928
|
if (zipResult.data) {
|
|
@@ -2736,6 +2954,7 @@ var PDFConverter = class {
|
|
|
2736
2954
|
}
|
|
2737
2955
|
/**
|
|
2738
2956
|
* Render page images from the source PDF using ImageMagick and update result.json.
|
|
2957
|
+
* Uses jq to update the JSON file without loading it into Node.js memory.
|
|
2739
2958
|
* Replaces Docling's generate_page_images which fails on large PDFs
|
|
2740
2959
|
* due to memory limits when embedding all page images as base64.
|
|
2741
2960
|
*/
|
|
@@ -2753,17 +2972,18 @@ var PDFConverter = class {
|
|
|
2753
2972
|
const renderer = new PageRenderer(this.logger);
|
|
2754
2973
|
const renderResult = await renderer.renderPages(pdfPath, outputDir);
|
|
2755
2974
|
const resultPath = join6(outputDir, "result.json");
|
|
2756
|
-
const
|
|
2757
|
-
|
|
2758
|
-
|
|
2759
|
-
|
|
2760
|
-
|
|
2761
|
-
|
|
2762
|
-
|
|
2763
|
-
|
|
2764
|
-
|
|
2765
|
-
|
|
2766
|
-
await
|
|
2975
|
+
const tmpPath = resultPath + ".tmp";
|
|
2976
|
+
const jqProgram = `
|
|
2977
|
+
.pages |= with_entries(
|
|
2978
|
+
if (.value.page_no - 1) >= 0 and (.value.page_no - 1) < ${renderResult.pageCount} then
|
|
2979
|
+
.value.image.uri = "pages/page_\\(.value.page_no - 1).png" |
|
|
2980
|
+
.value.image.mimetype = "image/png" |
|
|
2981
|
+
.value.image.dpi = ${PAGE_RENDERING.DEFAULT_DPI}
|
|
2982
|
+
else . end
|
|
2983
|
+
)
|
|
2984
|
+
`;
|
|
2985
|
+
await runJqFileToFile(jqProgram, resultPath, tmpPath);
|
|
2986
|
+
await rename2(tmpPath, resultPath);
|
|
2767
2987
|
this.logger.info(
|
|
2768
2988
|
`[PDFConverter] Rendered ${renderResult.pageCount} page images`
|
|
2769
2989
|
);
|