@heripo/pdf-parser 0.1.8 → 0.1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.ko.md +19 -9
- package/README.md +19 -9
- package/dist/index.cjs +419 -151
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +4 -1
- package/dist/index.d.ts +4 -1
- package/dist/index.js +388 -126
- package/dist/index.js.map +1 -1
- package/package.json +4 -4
package/dist/index.js
CHANGED
|
@@ -43,11 +43,17 @@ var DOCLING_ENVIRONMENT = {
|
|
|
43
43
|
*/
|
|
44
44
|
STARTUP_DELAY_MS: 2e3
|
|
45
45
|
};
|
|
46
|
+
/**
 * DPI presets shared by the code paths that rasterise PDF pages.
 */
var PAGE_RENDERING = {
  /** Resolution used by default when rendering pages for VLM text recognition. */
  DEFAULT_DPI: 200,

  /** Reduced resolution used when sampling pages to choose an OCR strategy. */
  SAMPLE_DPI: 150,
};
|
|
46
52
|
var IMAGE_PDF_CONVERTER = {
|
|
47
53
|
/**
|
|
48
54
|
* ImageMagick density option (DPI) for PDF to image conversion
|
|
49
55
|
*/
|
|
50
|
-
DENSITY:
|
|
56
|
+
DENSITY: PAGE_RENDERING.DEFAULT_DPI,
|
|
51
57
|
/**
|
|
52
58
|
* ImageMagick quality option (1-100)
|
|
53
59
|
*/
|
|
@@ -837,16 +843,10 @@ var DoclingEnvironment = class _DoclingEnvironment {
|
|
|
837
843
|
|
|
838
844
|
// src/core/pdf-converter.ts
|
|
839
845
|
import { omit } from "es-toolkit";
|
|
840
|
-
import {
|
|
841
|
-
|
|
842
|
-
createWriteStream as createWriteStream2,
|
|
843
|
-
existsSync as existsSync4,
|
|
844
|
-
readFileSync as readFileSync4,
|
|
845
|
-
rmSync as rmSync3
|
|
846
|
-
} from "fs";
|
|
847
|
-
import { writeFile } from "fs/promises";
|
|
846
|
+
import { copyFileSync, createWriteStream as createWriteStream3, existsSync as existsSync4, rmSync as rmSync3 } from "fs";
|
|
847
|
+
import { rename as rename2, writeFile } from "fs/promises";
|
|
848
848
|
import { join as join6 } from "path";
|
|
849
|
-
import { pipeline } from "stream/promises";
|
|
849
|
+
import { pipeline as pipeline3 } from "stream/promises";
|
|
850
850
|
|
|
851
851
|
// src/errors/image-pdf-fallback-error.ts
|
|
852
852
|
var ImagePdfFallbackError = class extends Error {
|
|
@@ -862,19 +862,24 @@ var ImagePdfFallbackError = class extends Error {
|
|
|
862
862
|
|
|
863
863
|
// src/processors/image-extractor.ts
|
|
864
864
|
import {
|
|
865
|
-
|
|
865
|
+
createReadStream,
|
|
866
|
+
createWriteStream as createWriteStream2,
|
|
866
867
|
existsSync,
|
|
867
868
|
mkdirSync,
|
|
868
|
-
readFileSync,
|
|
869
869
|
readdirSync,
|
|
870
870
|
rmSync,
|
|
871
871
|
writeFileSync
|
|
872
872
|
} from "fs";
|
|
873
873
|
import { extname, join as join2 } from "path";
|
|
874
|
+
import { Transform } from "stream";
|
|
875
|
+
import { pipeline as pipeline2 } from "stream/promises";
|
|
874
876
|
import * as yauzl from "yauzl";
|
|
875
877
|
|
|
876
878
|
// src/utils/jq.ts
|
|
877
879
|
import { spawn as spawn3 } from "child_process";
|
|
880
|
+
import { createWriteStream } from "fs";
|
|
881
|
+
import { rename } from "fs/promises";
|
|
882
|
+
import { pipeline } from "stream/promises";
|
|
878
883
|
function getJqPath() {
|
|
879
884
|
const p = process.env.JQ_PATH?.trim();
|
|
880
885
|
return p && p.length > 0 ? p : "jq";
|
|
@@ -926,25 +931,139 @@ function runJqFileJson(program, filePath) {
|
|
|
926
931
|
});
|
|
927
932
|
});
|
|
928
933
|
}
|
|
929
|
-
function
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
|
|
934
|
-
]
|
|
935
|
-
|
|
936
|
-
|
|
934
|
+
/**
 * Run jq on a file and stream its stdout straight into `outputPath`,
 * so the result never has to be held in Node.js memory.
 *
 * The promise settles only once BOTH the stdout -> file pipeline has finished
 * and the child process has closed; a resolved promise therefore means the
 * output was completely written and jq exited with code 0.
 *
 * @param {string} program - jq filter, passed as jq's first argument.
 * @param {string} inputPath - File jq reads.
 * @param {string} outputPath - File that receives jq's stdout.
 * @returns {Promise<void>} Rejects if jq cannot be spawned, if writing the
 *   output fails, or if jq exits with a non-zero code (stderr is included in
 *   the error message).
 */
function runJqFileToFile(program, inputPath, outputPath) {
  return new Promise((resolve, reject) => {
    const child = spawn3(getJqPath(), [program, inputPath], {
      stdio: ["ignore", "pipe", "pipe"],
      env: process.env,
    });
    const ws = createWriteStream(outputPath);

    let stderr = "";
    let exitCode = null;
    let pipelineDone = false;
    let settled = false;

    // Single failure path: release the file handle and make sure jq does not
    // keep running after we have already given up on its output.
    const fail = (err) => {
      if (settled) return;
      settled = true;
      ws.destroy();
      child.kill();
      reject(err);
    };

    // Success/exit-code path: wait for both the pipeline and the exit code.
    const trySettle = () => {
      if (settled || !pipelineDone || exitCode === null) return;
      settled = true;
      if (exitCode === 0) {
        resolve();
        return;
      }
      reject(
        new Error(
          `jq exited with code ${exitCode}. ${stderr ? "Stderr: " + stderr : ""}`
        )
      );
    };

    child.stderr.setEncoding("utf-8");
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
    });

    child.on("error", fail);
    child.on("close", (code) => {
      // A null code (terminated by signal) is reported as a failure.
      exitCode = code ?? 1;
      trySettle();
    });

    pipeline(child.stdout, ws).then(() => {
      pipelineDone = true;
      trySettle();
    }, fail);
  });
}
|
|
985
|
+
/**
 * Run `jq -r` on a file and hand its output to `onLine` one line at a time,
 * so large results never have to be collected in memory.
 *
 * Empty lines are skipped. A trailing line without a final newline is
 * delivered when the process closes.
 *
 * @param {string} program - jq filter.
 * @param {string} filePath - File jq reads.
 * @param {(line: string) => void} onLine - Called synchronously per line. If
 *   it throws, jq is killed and the promise rejects with the thrown error.
 * @returns {Promise<void>} Rejects if jq cannot be spawned, if `onLine`
 *   throws, or if jq exits with a non-zero code (stderr is included in the
 *   error message).
 */
function runJqFileLines(program, filePath, onLine) {
  return new Promise((resolve, reject) => {
    const child = spawn3(getJqPath(), ["-r", program, filePath], {
      stdio: ["ignore", "pipe", "pipe"],
      env: process.env,
    });

    let stderr = "";
    // Text of the current, not yet newline-terminated line.
    let buffer = "";
    let callbackError = false;

    child.stdout.setEncoding("utf-8");
    child.stderr.setEncoding("utf-8");

    function safeOnLine(line) {
      if (callbackError) return;
      try {
        onLine(line);
      } catch (err) {
        callbackError = true;
        child.kill();
        reject(err);
      }
    }

    child.stdout.on("data", (chunk) => {
      // Nothing will be delivered after a callback failure; don't keep buffering.
      if (callbackError) return;
      // Only the new chunk can contain a newline (the buffer never does), so
      // scan just the chunk. Re-scanning the whole buffer on every chunk is
      // quadratic for multi-megabyte lines.
      let start = 0;
      let newlineIdx;
      while ((newlineIdx = chunk.indexOf("\n", start)) !== -1) {
        const line = buffer + chunk.slice(start, newlineIdx);
        buffer = "";
        start = newlineIdx + 1;
        if (line.length > 0) {
          safeOnLine(line);
          if (callbackError) return;
        }
      }
      buffer += chunk.slice(start);
    });

    child.stderr.on("data", (chunk) => {
      stderr += chunk;
    });

    child.on("error", (err) => {
      if (!callbackError) reject(err);
    });

    child.on("close", (code) => {
      if (callbackError) return;
      // Flush a final line that was not newline-terminated.
      if (buffer.length > 0) {
        safeOnLine(buffer);
      }
      if (callbackError) return;
      if (code !== 0) {
        reject(
          new Error(
            `jq exited with code ${code}. ${stderr ? "Stderr: " + stderr : ""}`
          )
        );
      } else {
        resolve();
      }
    });
  });
}
|
|
938
|
-
function
|
|
1043
|
+
/**
 * Stream every base64 PNG data-URI string found anywhere in a JSON file.
 *
 * jq selects all string values that start with `data:image/png;base64`; each
 * one is forwarded to `onImage` together with its zero-based position.
 *
 * @param {string} filePath - JSON file to scan.
 * @param {(dataUri: string, index: number) => void} onImage - Receives each match.
 * @returns {Promise<number>} Number of strings that were forwarded.
 */
async function jqExtractBase64PngStringsStreaming(filePath, onImage) {
  const selectPngStrings =
    '.. | select(type == "string" and startswith("data:image/png;base64"))';
  let emitted = 0;

  const forward = (dataUri) => {
    onImage(dataUri, emitted);
    emitted += 1;
  };

  await runJqFileLines(selectPngStrings, filePath, forward);
  return emitted;
}
|
|
1055
|
+
/**
 * Rewrite a JSON file so that every base64 PNG data-URI string is replaced by
 * a relative file path, writing the result to `outputPath`.
 *
 * The jq program visits each string starting with `data:image/png;base64` and
 * sets it to `<dirName>/<prefix>_<n>.png`, where `n` counts the matches from 0
 * in jq's traversal order. jq's output is streamed to `<outputPath>.tmp` and
 * renamed to `outputPath` only after jq has succeeded.
 *
 * @param {string} inputPath - Source JSON file.
 * @param {string} outputPath - Destination JSON file.
 * @param {string} dirName - Directory part of the replacement paths.
 * @param {string} prefix - File-name prefix of the replacement paths.
 * @returns {Promise<void>}
 */
async function jqReplaceBase64WithPathsToFile(inputPath, outputPath, dirName, prefix) {
  // dirName/prefix land inside a jq string literal. Escape them as JSON string
  // content so quotes, backslashes (including jq's `\(` interpolation) and
  // control characters are taken literally instead of altering the program.
  const toJqStringContent = (value) => JSON.stringify(String(value)).slice(1, -1);
  const safeDir = toJqStringContent(dirName);
  const safePrefix = toJqStringContent(prefix);

  const program = `
    reduce paths(type == "string" and startswith("data:image/png;base64")) as $p (
      {data: ., counter: 0};
      .counter as $idx |
      .data |= setpath($p; "${safeDir}/${safePrefix}_\\($idx).png") |
      .counter += 1
    ) | .data
  `;

  const tmpPath = outputPath + ".tmp";
  await runJqFileToFile(program, inputPath, tmpPath);
  await rename(tmpPath, outputPath);
}
|
|
949
1068
|
|
|
950
1069
|
// src/processors/image-extractor.ts
|
|
@@ -972,7 +1091,7 @@ var ImageExtractor = class _ImageExtractor {
|
|
|
972
1091
|
return;
|
|
973
1092
|
}
|
|
974
1093
|
mkdirSync(join2(entryPath, ".."), { recursive: true });
|
|
975
|
-
const writeStream =
|
|
1094
|
+
const writeStream = createWriteStream2(entryPath);
|
|
976
1095
|
readStream.pipe(writeStream);
|
|
977
1096
|
writeStream.on("finish", () => {
|
|
978
1097
|
zipfile.readEntry();
|
|
@@ -988,26 +1107,6 @@ var ImageExtractor = class _ImageExtractor {
|
|
|
988
1107
|
});
|
|
989
1108
|
});
|
|
990
1109
|
}
|
|
991
|
-
/**
|
|
992
|
-
* Extract base64 images from JSON file using jq (for large files)
|
|
993
|
-
* Returns array of base64 data strings
|
|
994
|
-
*/
|
|
995
|
-
static async extractBase64ImagesFromJsonWithJq(jsonPath) {
|
|
996
|
-
return jqExtractBase64PngStrings(jsonPath);
|
|
997
|
-
}
|
|
998
|
-
/**
|
|
999
|
-
* Replace base64 images with file paths in JSON using jq (for large files)
|
|
1000
|
-
* Uses reduce to maintain counter state while walking the JSON
|
|
1001
|
-
*/
|
|
1002
|
-
static async replaceBase64ImagesInJsonWithJq(jsonPath, outputPath, dirName, prefix) {
|
|
1003
|
-
const { data, count } = await jqReplaceBase64WithPaths(
|
|
1004
|
-
jsonPath,
|
|
1005
|
-
dirName,
|
|
1006
|
-
prefix
|
|
1007
|
-
);
|
|
1008
|
-
writeFileSync(outputPath, JSON.stringify(data, null, 2), "utf-8");
|
|
1009
|
-
return count;
|
|
1010
|
-
}
|
|
1011
1110
|
/**
|
|
1012
1111
|
* Extract a base64-encoded image to a file and return the relative path
|
|
1013
1112
|
*/
|
|
@@ -1021,8 +1120,66 @@ var ImageExtractor = class _ImageExtractor {
|
|
|
1021
1120
|
return `${dirName}/${filename}`;
|
|
1022
1121
|
}
|
|
1023
1122
|
/**
|
|
1024
|
-
*
|
|
1025
|
-
*
|
|
1123
|
+
* Extract base64 images from HTML using streaming.
|
|
1124
|
+
* Reads HTML file as a stream, extracts base64 images from src attributes,
|
|
1125
|
+
* saves them as PNG files, and replaces with file paths in the output HTML.
|
|
1126
|
+
* Returns the number of images extracted.
|
|
1127
|
+
*/
|
|
1128
|
+
static async extractImagesFromHtmlStream(htmlInputPath, htmlOutputPath, imagesDir) {
|
|
1129
|
+
let imageIndex = 0;
|
|
1130
|
+
let pending = "";
|
|
1131
|
+
const MARKER = 'src="data:image/png;base64,';
|
|
1132
|
+
const transform = new Transform({
|
|
1133
|
+
decodeStrings: false,
|
|
1134
|
+
encoding: "utf-8",
|
|
1135
|
+
transform(chunk, _encoding, callback) {
|
|
1136
|
+
pending += chunk;
|
|
1137
|
+
let result = "";
|
|
1138
|
+
while (true) {
|
|
1139
|
+
const markerIdx = pending.indexOf(MARKER);
|
|
1140
|
+
if (markerIdx === -1) {
|
|
1141
|
+
const safeEnd = Math.max(0, pending.length - MARKER.length);
|
|
1142
|
+
result += pending.slice(0, safeEnd);
|
|
1143
|
+
pending = pending.slice(safeEnd);
|
|
1144
|
+
break;
|
|
1145
|
+
}
|
|
1146
|
+
result += pending.slice(0, markerIdx);
|
|
1147
|
+
const dataStart = markerIdx + MARKER.length;
|
|
1148
|
+
const quoteIdx = pending.indexOf('"', dataStart);
|
|
1149
|
+
if (quoteIdx === -1) {
|
|
1150
|
+
pending = pending.slice(markerIdx);
|
|
1151
|
+
break;
|
|
1152
|
+
}
|
|
1153
|
+
const base64Content = pending.slice(dataStart, quoteIdx);
|
|
1154
|
+
const filename = `image_${imageIndex}.png`;
|
|
1155
|
+
const filepath = join2(imagesDir, filename);
|
|
1156
|
+
const buf = Buffer.from(base64Content, "base64");
|
|
1157
|
+
writeFileSync(filepath, buf);
|
|
1158
|
+
const relativePath = `images/${filename}`;
|
|
1159
|
+
result += `src="${relativePath}"`;
|
|
1160
|
+
imageIndex++;
|
|
1161
|
+
pending = pending.slice(quoteIdx + 1);
|
|
1162
|
+
}
|
|
1163
|
+
if (result.length > 0) {
|
|
1164
|
+
this.push(result);
|
|
1165
|
+
}
|
|
1166
|
+
callback();
|
|
1167
|
+
},
|
|
1168
|
+
flush(callback) {
|
|
1169
|
+
if (pending.length > 0) {
|
|
1170
|
+
this.push(pending);
|
|
1171
|
+
}
|
|
1172
|
+
callback();
|
|
1173
|
+
}
|
|
1174
|
+
});
|
|
1175
|
+
const rs = createReadStream(htmlInputPath, { encoding: "utf-8" });
|
|
1176
|
+
const ws = createWriteStream2(htmlOutputPath, { encoding: "utf-8" });
|
|
1177
|
+
await pipeline2(rs, transform, ws);
|
|
1178
|
+
return imageIndex;
|
|
1179
|
+
}
|
|
1180
|
+
/**
|
|
1181
|
+
* Save JSON and HTML documents with base64 images extracted to separate files.
|
|
1182
|
+
* Uses jq for JSON processing and streaming for HTML to handle large files.
|
|
1026
1183
|
*
|
|
1027
1184
|
* This method:
|
|
1028
1185
|
* 1. Extracts base64-encoded images from JSON and HTML content
|
|
@@ -1030,7 +1187,7 @@ var ImageExtractor = class _ImageExtractor {
|
|
|
1030
1187
|
* 3. Replaces base64 data with relative file paths
|
|
1031
1188
|
* 4. Saves the transformed documents to the output directory
|
|
1032
1189
|
*/
|
|
1033
|
-
static async saveDocumentsWithExtractedImages(logger, outputDir, filename, jsonSourcePath,
|
|
1190
|
+
static async saveDocumentsWithExtractedImages(logger, outputDir, filename, jsonSourcePath, htmlSourcePath) {
|
|
1034
1191
|
try {
|
|
1035
1192
|
if (existsSync(outputDir)) {
|
|
1036
1193
|
rmSync(outputDir, { recursive: true, force: true });
|
|
@@ -1042,31 +1199,33 @@ var ImageExtractor = class _ImageExtractor {
|
|
|
1042
1199
|
const baseName = filename.replace(extname(filename), "");
|
|
1043
1200
|
const jsonPath = join2(outputDir, `${baseName}.json`);
|
|
1044
1201
|
try {
|
|
1045
|
-
const
|
|
1046
|
-
if (!existsSync(
|
|
1047
|
-
mkdirSync(
|
|
1048
|
-
}
|
|
1049
|
-
const
|
|
1050
|
-
|
|
1051
|
-
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
|
|
1057
|
-
|
|
1058
|
-
|
|
1202
|
+
const imagesDir = join2(outputDir, "images");
|
|
1203
|
+
if (!existsSync(imagesDir)) {
|
|
1204
|
+
mkdirSync(imagesDir, { recursive: true });
|
|
1205
|
+
}
|
|
1206
|
+
const imageCount = await jqExtractBase64PngStringsStreaming(
|
|
1207
|
+
jsonSourcePath,
|
|
1208
|
+
(base64Data, index) => {
|
|
1209
|
+
_ImageExtractor.extractBase64ImageToFile(
|
|
1210
|
+
base64Data,
|
|
1211
|
+
imagesDir,
|
|
1212
|
+
index,
|
|
1213
|
+
"pic",
|
|
1214
|
+
"images"
|
|
1215
|
+
);
|
|
1216
|
+
}
|
|
1217
|
+
);
|
|
1059
1218
|
logger.info(
|
|
1060
|
-
`[PDFConverter] Extracted ${
|
|
1219
|
+
`[PDFConverter] Extracted ${imageCount} picture images from JSON to ${imagesDir}`
|
|
1061
1220
|
);
|
|
1062
|
-
|
|
1221
|
+
await jqReplaceBase64WithPathsToFile(
|
|
1063
1222
|
jsonSourcePath,
|
|
1064
1223
|
jsonPath,
|
|
1065
|
-
"
|
|
1066
|
-
"
|
|
1224
|
+
"images",
|
|
1225
|
+
"pic"
|
|
1067
1226
|
);
|
|
1068
1227
|
logger.info(
|
|
1069
|
-
`[PDFConverter] Replaced ${
|
|
1228
|
+
`[PDFConverter] Replaced ${imageCount} base64 images with file paths`
|
|
1070
1229
|
);
|
|
1071
1230
|
} catch (e) {
|
|
1072
1231
|
logger.warn(
|
|
@@ -1082,42 +1241,36 @@ var ImageExtractor = class _ImageExtractor {
|
|
|
1082
1241
|
if (!existsSync(imagesDir)) {
|
|
1083
1242
|
mkdirSync(imagesDir, { recursive: true });
|
|
1084
1243
|
}
|
|
1085
|
-
|
|
1086
|
-
|
|
1087
|
-
|
|
1088
|
-
|
|
1089
|
-
const filename2 = `image_${imageIndex}.png`;
|
|
1090
|
-
const filepath = join2(imagesDir, filename2);
|
|
1091
|
-
const buffer = Buffer.from(base64Content, "base64");
|
|
1092
|
-
writeFileSync(filepath, buffer);
|
|
1093
|
-
const relativePath = `images/${filename2}`;
|
|
1094
|
-
imageIndex += 1;
|
|
1095
|
-
return `src="${relativePath}"`;
|
|
1096
|
-
}
|
|
1244
|
+
const htmlImageCount = await _ImageExtractor.extractImagesFromHtmlStream(
|
|
1245
|
+
htmlSourcePath,
|
|
1246
|
+
htmlPath,
|
|
1247
|
+
imagesDir
|
|
1097
1248
|
);
|
|
1098
1249
|
logger.info(
|
|
1099
|
-
`[PDFConverter] Extracted ${
|
|
1250
|
+
`[PDFConverter] Extracted ${htmlImageCount} images from HTML to ${imagesDir}`
|
|
1100
1251
|
);
|
|
1101
|
-
writeFileSync(htmlPath, transformedHtml, "utf-8");
|
|
1102
1252
|
} catch (e) {
|
|
1103
1253
|
logger.warn(
|
|
1104
|
-
"[PDFConverter] Failed to extract images from HTML,
|
|
1254
|
+
"[PDFConverter] Failed to extract images from HTML, copying original. Error:",
|
|
1105
1255
|
e
|
|
1106
1256
|
);
|
|
1107
|
-
|
|
1257
|
+
const rs = createReadStream(htmlSourcePath);
|
|
1258
|
+
const ws = createWriteStream2(htmlPath);
|
|
1259
|
+
await pipeline2(rs, ws);
|
|
1108
1260
|
}
|
|
1109
1261
|
logger.info("[PDFConverter] Saved HTML:", htmlPath);
|
|
1110
1262
|
}
|
|
1111
1263
|
/**
|
|
1112
1264
|
* Extract documents from ZIP and save with extracted images
|
|
1113
|
-
* Uses jq for JSON processing to handle large files
|
|
1265
|
+
* Uses jq for JSON processing and streaming for HTML to handle large files
|
|
1266
|
+
* without loading into Node.js memory
|
|
1114
1267
|
*
|
|
1115
1268
|
* Complete workflow:
|
|
1116
1269
|
* 1. Extract ZIP file to temporary directory
|
|
1117
1270
|
* 2. Find JSON and HTML files from extracted files
|
|
1118
|
-
* 3. Use jq to extract base64 images from JSON and save as separate files
|
|
1119
|
-
* 4. Use jq to replace base64 with file paths in JSON
|
|
1120
|
-
* 5. Process HTML with
|
|
1271
|
+
* 3. Use jq to stream-extract base64 images from JSON and save as separate files
|
|
1272
|
+
* 4. Use jq to replace base64 with file paths in JSON (piped to file)
|
|
1273
|
+
* 5. Process HTML with streaming Transform to extract and replace images
|
|
1121
1274
|
* 6. Save transformed documents to output directory (as result.json and result.html)
|
|
1122
1275
|
*/
|
|
1123
1276
|
static async extractAndSaveDocumentsFromZip(logger, zipPath, extractDir, outputDir) {
|
|
@@ -1133,14 +1286,13 @@ var ImageExtractor = class _ImageExtractor {
|
|
|
1133
1286
|
}
|
|
1134
1287
|
const jsonPath = join2(extractDir, jsonFile);
|
|
1135
1288
|
const htmlPath = join2(extractDir, htmlFile);
|
|
1136
|
-
const htmlContent = readFileSync(htmlPath, "utf-8");
|
|
1137
1289
|
logger.info("[PDFConverter] Saving converted files to output...");
|
|
1138
1290
|
await _ImageExtractor.saveDocumentsWithExtractedImages(
|
|
1139
1291
|
logger,
|
|
1140
1292
|
outputDir,
|
|
1141
1293
|
"result",
|
|
1142
1294
|
jsonPath,
|
|
1143
|
-
|
|
1295
|
+
htmlPath
|
|
1144
1296
|
);
|
|
1145
1297
|
logger.info("[PDFConverter] Files saved to:", outputDir);
|
|
1146
1298
|
}
|
|
@@ -1149,7 +1301,7 @@ var ImageExtractor = class _ImageExtractor {
|
|
|
1149
1301
|
// src/processors/page-renderer.ts
|
|
1150
1302
|
import { existsSync as existsSync2, mkdirSync as mkdirSync2, readdirSync as readdirSync2 } from "fs";
|
|
1151
1303
|
import { join as join3 } from "path";
|
|
1152
|
-
var
|
|
1304
|
+
var PROGRESS_POLL_INTERVAL_MS = 2e3;
|
|
1153
1305
|
var PageRenderer = class {
|
|
1154
1306
|
constructor(logger) {
|
|
1155
1307
|
this.logger = logger;
|
|
@@ -1163,29 +1315,60 @@ var PageRenderer = class {
|
|
|
1163
1315
|
* @returns Render result with page count and file paths
|
|
1164
1316
|
*/
|
|
1165
1317
|
async renderPages(pdfPath, outputDir, options) {
|
|
1166
|
-
const dpi = options?.dpi ?? DEFAULT_DPI;
|
|
1318
|
+
const dpi = options?.dpi ?? PAGE_RENDERING.DEFAULT_DPI;
|
|
1167
1319
|
const pagesDir = join3(outputDir, "pages");
|
|
1168
1320
|
if (!existsSync2(pagesDir)) {
|
|
1169
1321
|
mkdirSync2(pagesDir, { recursive: true });
|
|
1170
1322
|
}
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
|
|
1174
|
-
|
|
1175
|
-
dpi.toString(),
|
|
1176
|
-
pdfPath,
|
|
1177
|
-
"-background",
|
|
1178
|
-
"white",
|
|
1179
|
-
"-alpha",
|
|
1180
|
-
"remove",
|
|
1181
|
-
"-alpha",
|
|
1182
|
-
"off",
|
|
1183
|
-
outputPattern
|
|
1184
|
-
]);
|
|
1185
|
-
if (result.code !== 0) {
|
|
1186
|
-
throw new Error(
|
|
1187
|
-
`[PageRenderer] Failed to render PDF pages: ${result.stderr || "Unknown error"}`
|
|
1323
|
+
const totalPages = await this.getPageCount(pdfPath);
|
|
1324
|
+
if (totalPages > 0) {
|
|
1325
|
+
this.logger.info(
|
|
1326
|
+
`[PageRenderer] Rendering ${totalPages} pages at ${dpi} DPI...`
|
|
1188
1327
|
);
|
|
1328
|
+
} else {
|
|
1329
|
+
this.logger.info(`[PageRenderer] Rendering PDF at ${dpi} DPI...`);
|
|
1330
|
+
}
|
|
1331
|
+
const outputPattern = join3(pagesDir, "page_%d.png");
|
|
1332
|
+
let progressInterval = null;
|
|
1333
|
+
if (totalPages > 0) {
|
|
1334
|
+
let lastLoggedCount = 0;
|
|
1335
|
+
progressInterval = setInterval(() => {
|
|
1336
|
+
try {
|
|
1337
|
+
const rendered = readdirSync2(pagesDir).filter(
|
|
1338
|
+
(f) => f.startsWith("page_") && f.endsWith(".png")
|
|
1339
|
+
).length;
|
|
1340
|
+
if (rendered > 0 && rendered !== lastLoggedCount) {
|
|
1341
|
+
lastLoggedCount = rendered;
|
|
1342
|
+
this.logger.info(
|
|
1343
|
+
`[PageRenderer] Rendering pages: ${rendered}/${totalPages}`
|
|
1344
|
+
);
|
|
1345
|
+
}
|
|
1346
|
+
} catch {
|
|
1347
|
+
}
|
|
1348
|
+
}, PROGRESS_POLL_INTERVAL_MS);
|
|
1349
|
+
}
|
|
1350
|
+
try {
|
|
1351
|
+
const result = await spawnAsync("magick", [
|
|
1352
|
+
"-density",
|
|
1353
|
+
dpi.toString(),
|
|
1354
|
+
pdfPath,
|
|
1355
|
+
"-background",
|
|
1356
|
+
"white",
|
|
1357
|
+
"-alpha",
|
|
1358
|
+
"remove",
|
|
1359
|
+
"-alpha",
|
|
1360
|
+
"off",
|
|
1361
|
+
outputPattern
|
|
1362
|
+
]);
|
|
1363
|
+
if (result.code !== 0) {
|
|
1364
|
+
throw new Error(
|
|
1365
|
+
`[PageRenderer] Failed to render PDF pages: ${result.stderr || "Unknown error"}`
|
|
1366
|
+
);
|
|
1367
|
+
}
|
|
1368
|
+
} finally {
|
|
1369
|
+
if (progressInterval) {
|
|
1370
|
+
clearInterval(progressInterval);
|
|
1371
|
+
}
|
|
1189
1372
|
}
|
|
1190
1373
|
const pageFiles = readdirSync2(pagesDir).filter((f) => f.startsWith("page_") && f.endsWith(".png")).sort((a, b) => {
|
|
1191
1374
|
const numA = parseInt(a.replace("page_", "").replace(".png", ""), 10);
|
|
@@ -1201,6 +1384,20 @@ var PageRenderer = class {
|
|
|
1201
1384
|
pageFiles
|
|
1202
1385
|
};
|
|
1203
1386
|
}
|
|
1387
|
+
/**
|
|
1388
|
+
* Get total page count using pdfinfo.
|
|
1389
|
+
* Returns 0 on failure (progress logging will be skipped).
|
|
1390
|
+
*/
|
|
1391
|
+
async getPageCount(pdfPath) {
|
|
1392
|
+
try {
|
|
1393
|
+
const result = await spawnAsync("pdfinfo", [pdfPath]);
|
|
1394
|
+
if (result.code !== 0) return 0;
|
|
1395
|
+
const match = result.stdout.match(/^Pages:\s+(\d+)/m);
|
|
1396
|
+
return match ? parseInt(match[1], 10) : 0;
|
|
1397
|
+
} catch {
|
|
1398
|
+
return 0;
|
|
1399
|
+
}
|
|
1400
|
+
}
|
|
1204
1401
|
};
|
|
1205
1402
|
|
|
1206
1403
|
// src/processors/pdf-text-extractor.ts
|
|
@@ -1286,7 +1483,7 @@ var PdfTextExtractor = class {
|
|
|
1286
1483
|
};
|
|
1287
1484
|
|
|
1288
1485
|
// src/processors/vlm-text-corrector.ts
|
|
1289
|
-
import { readFileSync
|
|
1486
|
+
import { readFileSync, writeFileSync as writeFileSync2 } from "fs";
|
|
1290
1487
|
import { join as join4 } from "path";
|
|
1291
1488
|
|
|
1292
1489
|
// src/types/vlm-text-correction-schema.ts
|
|
@@ -1418,7 +1615,7 @@ var VlmTextCorrector = class {
|
|
|
1418
1615
|
async correctAndSave(outputDir, model, options) {
|
|
1419
1616
|
this.logger.info("[VlmTextCorrector] Starting text correction...");
|
|
1420
1617
|
const resultPath = join4(outputDir, "result.json");
|
|
1421
|
-
const doc = JSON.parse(
|
|
1618
|
+
const doc = JSON.parse(readFileSync(resultPath, "utf-8"));
|
|
1422
1619
|
let pageNumbers = this.getPageNumbers(doc);
|
|
1423
1620
|
if (pageNumbers.length === 0) {
|
|
1424
1621
|
this.logger.info("[VlmTextCorrector] No pages to process");
|
|
@@ -1745,7 +1942,7 @@ var VlmTextCorrector = class {
|
|
|
1745
1942
|
*/
|
|
1746
1943
|
readPageImage(outputDir, pageNo) {
|
|
1747
1944
|
const imagePath = join4(outputDir, "pages", `page_${pageNo - 1}.png`);
|
|
1748
|
-
return
|
|
1945
|
+
return readFileSync(imagePath).toString("base64");
|
|
1749
1946
|
}
|
|
1750
1947
|
/**
|
|
1751
1948
|
* Apply VLM corrections to the DoclingDocument.
|
|
@@ -1799,9 +1996,9 @@ var VlmTextCorrector = class {
|
|
|
1799
1996
|
};
|
|
1800
1997
|
|
|
1801
1998
|
// src/samplers/ocr-strategy-sampler.ts
|
|
1802
|
-
import {
|
|
1999
|
+
import { normalizeToBcp47 } from "@heripo/model";
|
|
2000
|
+
import { readFileSync as readFileSync2 } from "fs";
|
|
1803
2001
|
import { z as z2 } from "zod/v4";
|
|
1804
|
-
var SAMPLE_DPI = 150;
|
|
1805
2002
|
var EDGE_TRIM_RATIO = 0.1;
|
|
1806
2003
|
var DEFAULT_MAX_SAMPLE_PAGES = 15;
|
|
1807
2004
|
var DEFAULT_MAX_RETRIES2 = 3;
|
|
@@ -1852,7 +2049,7 @@ var OcrStrategySampler = class {
|
|
|
1852
2049
|
const renderResult = await this.pageRenderer.renderPages(
|
|
1853
2050
|
pdfPath,
|
|
1854
2051
|
outputDir,
|
|
1855
|
-
{ dpi: SAMPLE_DPI }
|
|
2052
|
+
{ dpi: PAGE_RENDERING.SAMPLE_DPI }
|
|
1856
2053
|
);
|
|
1857
2054
|
if (renderResult.pageCount === 0) {
|
|
1858
2055
|
this.logger.info("[OcrStrategySampler] No pages found in PDF");
|
|
@@ -1871,7 +2068,7 @@ var OcrStrategySampler = class {
|
|
|
1871
2068
|
`[OcrStrategySampler] Sampling ${sampleIndices.length} of ${renderResult.pageCount} pages: [${sampleIndices.map((i) => i + 1).join(", ")}]`
|
|
1872
2069
|
);
|
|
1873
2070
|
let sampledCount = 0;
|
|
1874
|
-
|
|
2071
|
+
const languageFrequency = /* @__PURE__ */ new Map();
|
|
1875
2072
|
for (const idx of sampleIndices) {
|
|
1876
2073
|
sampledCount++;
|
|
1877
2074
|
const pageFile = renderResult.pageFiles[idx];
|
|
@@ -1881,14 +2078,17 @@ var OcrStrategySampler = class {
|
|
|
1881
2078
|
model,
|
|
1882
2079
|
options
|
|
1883
2080
|
);
|
|
1884
|
-
|
|
2081
|
+
for (const lang of pageAnalysis.detectedLanguages) {
|
|
2082
|
+
languageFrequency.set(lang, (languageFrequency.get(lang) ?? 0) + 1);
|
|
2083
|
+
}
|
|
1885
2084
|
if (pageAnalysis.hasKoreanHanjaMix) {
|
|
1886
2085
|
this.logger.info(
|
|
1887
2086
|
`[OcrStrategySampler] Korean-Hanja mix detected on page ${idx + 1} \u2192 VLM strategy`
|
|
1888
2087
|
);
|
|
2088
|
+
const detectedLanguages2 = this.aggregateLanguages(languageFrequency);
|
|
1889
2089
|
return {
|
|
1890
2090
|
method: "vlm",
|
|
1891
|
-
detectedLanguages,
|
|
2091
|
+
detectedLanguages: detectedLanguages2,
|
|
1892
2092
|
reason: `Korean-Hanja mix detected on page ${idx + 1}`,
|
|
1893
2093
|
sampledPages: sampledCount,
|
|
1894
2094
|
totalPages: renderResult.pageCount
|
|
@@ -1898,6 +2098,7 @@ var OcrStrategySampler = class {
|
|
|
1898
2098
|
this.logger.info(
|
|
1899
2099
|
"[OcrStrategySampler] No Korean-Hanja mix detected \u2192 ocrmac strategy"
|
|
1900
2100
|
);
|
|
2101
|
+
const detectedLanguages = this.aggregateLanguages(languageFrequency);
|
|
1901
2102
|
return {
|
|
1902
2103
|
method: "ocrmac",
|
|
1903
2104
|
detectedLanguages,
|
|
@@ -2002,14 +2203,15 @@ var OcrStrategySampler = class {
|
|
|
2002
2203
|
}
|
|
2003
2204
|
/**
|
|
2004
2205
|
* Analyze a single sample page for Korean-Hanja mixed script and primary language.
|
|
2206
|
+
* Normalizes raw VLM language responses to valid BCP 47 tags, filtering out invalid ones.
|
|
2005
2207
|
*
|
|
2006
|
-
* @returns Object with Korean-Hanja detection result and detected languages
|
|
2208
|
+
* @returns Object with Korean-Hanja detection result and normalized detected languages
|
|
2007
2209
|
*/
|
|
2008
2210
|
async analyzeSamplePage(pageFile, pageNo, model, options) {
|
|
2009
2211
|
this.logger.debug(
|
|
2010
2212
|
`[OcrStrategySampler] Analyzing page ${pageNo} for Korean-Hanja mix and language...`
|
|
2011
2213
|
);
|
|
2012
|
-
const base64Image =
|
|
2214
|
+
const base64Image = readFileSync2(pageFile).toString("base64");
|
|
2013
2215
|
const messages = [
|
|
2014
2216
|
{
|
|
2015
2217
|
role: "user",
|
|
@@ -2037,18 +2239,27 @@ var OcrStrategySampler = class {
|
|
|
2037
2239
|
options.aggregator.track(result.usage);
|
|
2038
2240
|
}
|
|
2039
2241
|
const output = result.output;
|
|
2242
|
+
const normalizedLanguages = output.detectedLanguages.map(normalizeToBcp47).filter((tag) => tag !== null);
|
|
2040
2243
|
this.logger.debug(
|
|
2041
|
-
`[OcrStrategySampler] Page ${pageNo}: hasKoreanHanjaMix=${output.hasKoreanHanjaMix}, detectedLanguages=${
|
|
2244
|
+
`[OcrStrategySampler] Page ${pageNo}: hasKoreanHanjaMix=${output.hasKoreanHanjaMix}, detectedLanguages=${normalizedLanguages.join(",")}`
|
|
2042
2245
|
);
|
|
2043
2246
|
return {
|
|
2044
2247
|
hasKoreanHanjaMix: output.hasKoreanHanjaMix,
|
|
2045
|
-
detectedLanguages:
|
|
2248
|
+
detectedLanguages: normalizedLanguages
|
|
2046
2249
|
};
|
|
2047
2250
|
}
|
|
2251
|
+
/**
|
|
2252
|
+
* Aggregate language frequency map into a sorted array.
|
|
2253
|
+
* Returns languages sorted by frequency (descending), or undefined if empty.
|
|
2254
|
+
*/
|
|
2255
|
+
aggregateLanguages(frequencyMap) {
|
|
2256
|
+
if (frequencyMap.size === 0) return void 0;
|
|
2257
|
+
return [...frequencyMap.entries()].sort((a, b) => b[1] - a[1]).map(([lang]) => lang);
|
|
2258
|
+
}
|
|
2048
2259
|
};
|
|
2049
2260
|
|
|
2050
2261
|
// src/utils/local-file-server.ts
|
|
2051
|
-
import { createReadStream, statSync } from "fs";
|
|
2262
|
+
import { createReadStream as createReadStream2, statSync } from "fs";
|
|
2052
2263
|
import { createServer } from "http";
|
|
2053
2264
|
import { basename } from "path";
|
|
2054
2265
|
var LocalFileServer = class {
|
|
@@ -2070,7 +2281,7 @@ var LocalFileServer = class {
|
|
|
2070
2281
|
"Content-Type": "application/pdf",
|
|
2071
2282
|
"Content-Length": stat.size
|
|
2072
2283
|
});
|
|
2073
|
-
|
|
2284
|
+
createReadStream2(filePath).pipe(res);
|
|
2074
2285
|
} else {
|
|
2075
2286
|
res.writeHead(404);
|
|
2076
2287
|
res.end("Not Found");
|
|
@@ -2355,8 +2566,10 @@ var PDFConverter = class {
|
|
|
2355
2566
|
let pageTexts;
|
|
2356
2567
|
try {
|
|
2357
2568
|
const resultPath2 = join6(outputDir, "result.json");
|
|
2358
|
-
const
|
|
2359
|
-
|
|
2569
|
+
const totalPages = await runJqFileJson(
|
|
2570
|
+
".pages | length",
|
|
2571
|
+
resultPath2
|
|
2572
|
+
);
|
|
2360
2573
|
const textExtractor = new PdfTextExtractor(this.logger);
|
|
2361
2574
|
pageTexts = await textExtractor.extractText(pdfPath, totalPages);
|
|
2362
2575
|
} catch {
|
|
@@ -2513,6 +2726,7 @@ var PDFConverter = class {
|
|
|
2513
2726
|
const outputDir = join6(cwd, "output", reportId);
|
|
2514
2727
|
try {
|
|
2515
2728
|
await this.processConvertedFiles(zipPath, extractDir, outputDir);
|
|
2729
|
+
await this.renderPageImages(url, outputDir);
|
|
2516
2730
|
if (abortSignal?.aborted) {
|
|
2517
2731
|
this.logger.info("[PDFConverter] Conversion aborted before callback");
|
|
2518
2732
|
const error = new Error("PDF conversion was aborted");
|
|
@@ -2568,6 +2782,8 @@ var PDFConverter = class {
|
|
|
2568
2782
|
framework: "livetext"
|
|
2569
2783
|
},
|
|
2570
2784
|
generate_picture_images: true,
|
|
2785
|
+
generate_page_images: false,
|
|
2786
|
+
// Page images are rendered by PageRenderer (ImageMagick) after conversion
|
|
2571
2787
|
images_scale: 2,
|
|
2572
2788
|
/**
|
|
2573
2789
|
* While disabling this option yields the most accurate text extraction for readable PDFs,
|
|
@@ -2685,8 +2901,8 @@ var PDFConverter = class {
|
|
|
2685
2901
|
const zipPath = join6(process.cwd(), "result.zip");
|
|
2686
2902
|
this.logger.info("[PDFConverter] Saving ZIP file to:", zipPath);
|
|
2687
2903
|
if (zipResult.fileStream) {
|
|
2688
|
-
const writeStream =
|
|
2689
|
-
await
|
|
2904
|
+
const writeStream = createWriteStream3(zipPath);
|
|
2905
|
+
await pipeline3(zipResult.fileStream, writeStream);
|
|
2690
2906
|
return;
|
|
2691
2907
|
}
|
|
2692
2908
|
if (zipResult.data) {
|
|
@@ -2716,6 +2932,42 @@ var PDFConverter = class {
|
|
|
2716
2932
|
outputDir
|
|
2717
2933
|
);
|
|
2718
2934
|
}
|
|
2935
|
+
/**
|
|
2936
|
+
* Render page images from the source PDF using ImageMagick and update result.json.
|
|
2937
|
+
* Uses jq to update the JSON file without loading it into Node.js memory.
|
|
2938
|
+
* Replaces Docling's generate_page_images which fails on large PDFs
|
|
2939
|
+
* due to memory limits when embedding all page images as base64.
|
|
2940
|
+
*/
|
|
2941
|
+
async renderPageImages(url, outputDir) {
|
|
2942
|
+
if (!url.startsWith("file://")) {
|
|
2943
|
+
this.logger.warn(
|
|
2944
|
+
"[PDFConverter] Page image rendering skipped: only supported for local files (file:// URLs)"
|
|
2945
|
+
);
|
|
2946
|
+
return;
|
|
2947
|
+
}
|
|
2948
|
+
const pdfPath = url.slice(7);
|
|
2949
|
+
this.logger.info(
|
|
2950
|
+
"[PDFConverter] Rendering page images with ImageMagick..."
|
|
2951
|
+
);
|
|
2952
|
+
const renderer = new PageRenderer(this.logger);
|
|
2953
|
+
const renderResult = await renderer.renderPages(pdfPath, outputDir);
|
|
2954
|
+
const resultPath = join6(outputDir, "result.json");
|
|
2955
|
+
const tmpPath = resultPath + ".tmp";
|
|
2956
|
+
const jqProgram = `
|
|
2957
|
+
.pages |= with_entries(
|
|
2958
|
+
if (.value.page_no - 1) >= 0 and (.value.page_no - 1) < ${renderResult.pageCount} then
|
|
2959
|
+
.value.image.uri = "pages/page_\\(.value.page_no - 1).png" |
|
|
2960
|
+
.value.image.mimetype = "image/png" |
|
|
2961
|
+
.value.image.dpi = ${PAGE_RENDERING.DEFAULT_DPI}
|
|
2962
|
+
else . end
|
|
2963
|
+
)
|
|
2964
|
+
`;
|
|
2965
|
+
await runJqFileToFile(jqProgram, resultPath, tmpPath);
|
|
2966
|
+
await rename2(tmpPath, resultPath);
|
|
2967
|
+
this.logger.info(
|
|
2968
|
+
`[PDFConverter] Rendered ${renderResult.pageCount} page images`
|
|
2969
|
+
);
|
|
2970
|
+
}
|
|
2719
2971
|
};
|
|
2720
2972
|
|
|
2721
2973
|
// src/core/pdf-parser.ts
|
|
@@ -2754,6 +3006,7 @@ var PDFParser = class {
|
|
|
2754
3006
|
this.logger.info("[PDFParser] Initializing...");
|
|
2755
3007
|
this.checkOperatingSystem();
|
|
2756
3008
|
this.checkJqInstalled();
|
|
3009
|
+
this.checkPopplerInstalled();
|
|
2757
3010
|
this.checkMacOSVersion();
|
|
2758
3011
|
if (this.enableImagePdfFallback && !this.baseUrl) {
|
|
2759
3012
|
this.checkImageMagickInstalled();
|
|
@@ -2810,6 +3063,15 @@ var PDFParser = class {
|
|
|
2810
3063
|
);
|
|
2811
3064
|
}
|
|
2812
3065
|
}
|
|
3066
|
+
checkPopplerInstalled() {
|
|
3067
|
+
try {
|
|
3068
|
+
execSync("which pdftotext", { stdio: "ignore" });
|
|
3069
|
+
} catch {
|
|
3070
|
+
throw new Error(
|
|
3071
|
+
"poppler is not installed. Please install poppler using: brew install poppler"
|
|
3072
|
+
);
|
|
3073
|
+
}
|
|
3074
|
+
}
|
|
2813
3075
|
checkMacOSVersion() {
|
|
2814
3076
|
try {
|
|
2815
3077
|
const versionOutput = execSync("sw_vers -productVersion", {
|