@clazic/kordoc 2.4.11 → 2.4.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +25 -0
- package/dist/{chunk-PJSXZBZB.js → chunk-5R37N6KE.js} +19 -4
- package/dist/chunk-5R37N6KE.js.map +1 -0
- package/dist/chunk-I6YC6ZGK.js +219 -0
- package/dist/chunk-I6YC6ZGK.js.map +1 -0
- package/dist/{chunk-JGMLDBW5.js → chunk-KJEZPVEK.js} +680 -301
- package/dist/chunk-KJEZPVEK.js.map +1 -0
- package/dist/cli.js +68 -8
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +1678 -329
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +121 -1
- package/dist/index.d.ts +121 -1
- package/dist/index.js +1656 -310
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +11 -2
- package/dist/mcp.js.map +1 -1
- package/dist/{provider-PYZL2VNN.js → provider-T2D5XRTI.js} +30 -2
- package/dist/provider-T2D5XRTI.js.map +1 -0
- package/dist/{resolve-4I65IGMM.js → resolve-673XFZQ6.js} +18 -1
- package/dist/resolve-673XFZQ6.js.map +1 -0
- package/dist/{utils-HKVOS2O3.js → utils-XLLXVB7V.js} +4 -2
- package/dist/{watch-EYOGF3HY.js → watch-SOMS2KR7.js} +4 -3
- package/dist/{watch-EYOGF3HY.js.map → watch-SOMS2KR7.js.map} +1 -1
- package/package.json +2 -1
- package/dist/chunk-JGMLDBW5.js.map +0 -1
- package/dist/chunk-PJSXZBZB.js.map +0 -1
- package/dist/provider-PYZL2VNN.js.map +0 -1
- package/dist/resolve-4I65IGMM.js.map +0 -1
- /package/dist/{utils-HKVOS2O3.js.map → utils-XLLXVB7V.js.map} +0 -0
|
@@ -6,10 +6,11 @@ import {
|
|
|
6
6
|
KordocError,
|
|
7
7
|
classifyError,
|
|
8
8
|
isPathTraversal,
|
|
9
|
+
normalizeKordocError,
|
|
9
10
|
precheckZipSize,
|
|
10
11
|
sanitizeHref,
|
|
11
12
|
toArrayBuffer
|
|
12
|
-
} from "./chunk-
|
|
13
|
+
} from "./chunk-5R37N6KE.js";
|
|
13
14
|
import {
|
|
14
15
|
parsePageRange
|
|
15
16
|
} from "./chunk-MOL7MDBG.js";
|
|
@@ -19,6 +20,10 @@ import {
|
|
|
19
20
|
import {
|
|
20
21
|
createCliOcrProvider
|
|
21
22
|
} from "./chunk-34WIGIQC.js";
|
|
23
|
+
import {
|
|
24
|
+
createLoggerFromEnv,
|
|
25
|
+
generateRunId
|
|
26
|
+
} from "./chunk-I6YC6ZGK.js";
|
|
22
27
|
import {
|
|
23
28
|
__commonJS,
|
|
24
29
|
__require,
|
|
@@ -344,8 +349,8 @@ var require_cfb = __commonJS({
|
|
|
344
349
|
}
|
|
345
350
|
return L.length - R.length;
|
|
346
351
|
}
|
|
347
|
-
function
|
|
348
|
-
if (p.charAt(p.length - 1) == "/") return p.slice(0, -1).indexOf("/") === -1 ? p :
|
|
352
|
+
/**
 * Return the directory portion of a "/"-separated path, keeping the trailing
 * slash.
 *
 * - A path ending in "/" is treated as a directory: its parent is returned,
 *   unless it has no other "/" in it, in which case it is returned unchanged.
 * - A path with no "/" at all is returned unchanged.
 *
 * @param {string} p - Path using "/" as the separator.
 * @returns {string} The containing directory (ending in "/"), or `p` itself.
 */
function dirname2(p) {
  if (p.endsWith("/")) {
    // Drop the trailing slash and look for a parent directory above it.
    const withoutSlash = p.slice(0, -1);
    return withoutSlash.includes("/") ? dirname2(withoutSlash) : p;
  }
  const lastSlash = p.lastIndexOf("/");
  if (lastSlash === -1) return p;
  return p.slice(0, lastSlash + 1);
}
|
|
@@ -766,10 +771,10 @@ var require_cfb = __commonJS({
|
|
|
766
771
|
data.push([cfb.FullPaths[i2], cfb.FileIndex[i2]]);
|
|
767
772
|
}
|
|
768
773
|
for (i2 = 0; i2 < data.length; ++i2) {
|
|
769
|
-
var dad =
|
|
774
|
+
var dad = dirname2(data[i2][0]);
|
|
770
775
|
s = fullPaths[dad];
|
|
771
776
|
while (!s) {
|
|
772
|
-
while (
|
|
777
|
+
while (dirname2(dad) && !fullPaths[dirname2(dad)]) dad = dirname2(dad);
|
|
773
778
|
data.push([dad, {
|
|
774
779
|
name: filename(dad).replace("/", ""),
|
|
775
780
|
type: 1,
|
|
@@ -779,7 +784,7 @@ var require_cfb = __commonJS({
|
|
|
779
784
|
content: null
|
|
780
785
|
}]);
|
|
781
786
|
fullPaths[dad] = true;
|
|
782
|
-
dad =
|
|
787
|
+
dad = dirname2(data[i2][0]);
|
|
783
788
|
s = fullPaths[dad];
|
|
784
789
|
}
|
|
785
790
|
}
|
|
@@ -805,13 +810,13 @@ var require_cfb = __commonJS({
|
|
|
805
810
|
elt.size = 0;
|
|
806
811
|
elt.type = 5;
|
|
807
812
|
} else if (nm.slice(-1) == "/") {
|
|
808
|
-
for (j = i2 + 1; j < data.length; ++j) if (
|
|
813
|
+
for (j = i2 + 1; j < data.length; ++j) if (dirname2(cfb.FullPaths[j]) == nm) break;
|
|
809
814
|
elt.C = j >= data.length ? -1 : j;
|
|
810
|
-
for (j = i2 + 1; j < data.length; ++j) if (
|
|
815
|
+
for (j = i2 + 1; j < data.length; ++j) if (dirname2(cfb.FullPaths[j]) == dirname2(nm)) break;
|
|
811
816
|
elt.R = j >= data.length ? -1 : j;
|
|
812
817
|
elt.type = 1;
|
|
813
818
|
} else {
|
|
814
|
-
if (
|
|
819
|
+
if (dirname2(cfb.FullPaths[i2 + 1] || "") == dirname2(nm)) elt.R = i2 + 1;
|
|
815
820
|
elt.type = 2;
|
|
816
821
|
}
|
|
817
822
|
}
|
|
@@ -2308,50 +2313,89 @@ function stripDtd(xml) {
|
|
|
2308
2313
|
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
2309
2314
|
}
|
|
2310
2315
|
async function parseHwpxDocument(buffer, options, existingZip) {
|
|
2311
|
-
|
|
2312
|
-
|
|
2316
|
+
const logger = createLoggerFromEnv().child({ component: "hwpx/parser.ts", stage: "detect" });
|
|
2317
|
+
logger.log({ level: "info", event: "start", message: "HWPX \uD30C\uC2F1 \uC2DC\uC791", meta: { size: buffer.byteLength } });
|
|
2318
|
+
let lastParsedSection = 0;
|
|
2313
2319
|
try {
|
|
2314
|
-
|
|
2315
|
-
|
|
2316
|
-
return await extractFromBrokenZip(buffer);
|
|
2317
|
-
}
|
|
2318
|
-
const actualEntryCount = Object.keys(zip.files).length;
|
|
2319
|
-
if (actualEntryCount > MAX_ZIP_ENTRIES) {
|
|
2320
|
-
throw new KordocError("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
2321
|
-
}
|
|
2322
|
-
const decompressed = { total: 0 };
|
|
2323
|
-
const metadata = {};
|
|
2324
|
-
await extractHwpxMetadata(zip, metadata, decompressed);
|
|
2325
|
-
const styleMap = await extractHwpxStyles(zip, decompressed);
|
|
2326
|
-
const warnings = [];
|
|
2327
|
-
const sectionPaths = await resolveSectionPaths(zip);
|
|
2328
|
-
if (sectionPaths.length === 0) throw new KordocError("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
2329
|
-
metadata.pageCount = sectionPaths.length;
|
|
2330
|
-
const pageFilter = options?.pages ? parsePageRange(options.pages, sectionPaths.length) : null;
|
|
2331
|
-
const totalTarget = pageFilter ? pageFilter.size : sectionPaths.length;
|
|
2332
|
-
const blocks = [];
|
|
2333
|
-
let parsedSections = 0;
|
|
2334
|
-
for (let si = 0; si < sectionPaths.length; si++) {
|
|
2335
|
-
if (pageFilter && !pageFilter.has(si + 1)) continue;
|
|
2336
|
-
const file = zip.file(sectionPaths[si]);
|
|
2337
|
-
if (!file) continue;
|
|
2320
|
+
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
|
|
2321
|
+
let zip;
|
|
2338
2322
|
try {
|
|
2339
|
-
|
|
2340
|
-
|
|
2341
|
-
|
|
2342
|
-
|
|
2343
|
-
|
|
2344
|
-
|
|
2345
|
-
|
|
2346
|
-
|
|
2347
|
-
|
|
2348
|
-
}
|
|
2349
|
-
|
|
2350
|
-
|
|
2351
|
-
|
|
2352
|
-
|
|
2353
|
-
|
|
2354
|
-
|
|
2323
|
+
zip = existingZip ?? await JSZip2.loadAsync(buffer);
|
|
2324
|
+
} catch {
|
|
2325
|
+
return await extractFromBrokenZip(buffer);
|
|
2326
|
+
}
|
|
2327
|
+
const actualEntryCount = Object.keys(zip.files).length;
|
|
2328
|
+
if (actualEntryCount > MAX_ZIP_ENTRIES) {
|
|
2329
|
+
throw new KordocError("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
2330
|
+
}
|
|
2331
|
+
const decompressed = { total: 0 };
|
|
2332
|
+
const metadata = {};
|
|
2333
|
+
await extractHwpxMetadata(zip, metadata, decompressed);
|
|
2334
|
+
const styleMap = await extractHwpxStyles(zip, decompressed);
|
|
2335
|
+
const warnings = [];
|
|
2336
|
+
const sectionPaths = await resolveSectionPaths(zip);
|
|
2337
|
+
if (sectionPaths.length === 0) throw new KordocError("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
2338
|
+
metadata.pageCount = sectionPaths.length;
|
|
2339
|
+
logger.log({ level: "debug", stage: "convert", event: "progress", message: "\uC139\uC158 \uACBD\uB85C \uD574\uC11D \uC644\uB8CC", meta: { sections: sectionPaths.length } });
|
|
2340
|
+
const pageFilter = options?.pages ? parsePageRange(options.pages, sectionPaths.length) : null;
|
|
2341
|
+
const totalTarget = pageFilter ? pageFilter.size : sectionPaths.length;
|
|
2342
|
+
const blocks = [];
|
|
2343
|
+
let parsedSections = 0;
|
|
2344
|
+
for (let si = 0; si < sectionPaths.length; si++) {
|
|
2345
|
+
if (pageFilter && !pageFilter.has(si + 1)) continue;
|
|
2346
|
+
const file = zip.file(sectionPaths[si]);
|
|
2347
|
+
if (!file) continue;
|
|
2348
|
+
try {
|
|
2349
|
+
const xml = await file.async("text");
|
|
2350
|
+
decompressed.total += xml.length * 2;
|
|
2351
|
+
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
2352
|
+
blocks.push(...parseSectionXml(xml, styleMap, warnings, si + 1));
|
|
2353
|
+
parsedSections++;
|
|
2354
|
+
options?.onProgress?.(parsedSections, totalTarget);
|
|
2355
|
+
logger.log({
|
|
2356
|
+
level: "debug",
|
|
2357
|
+
stage: "convert",
|
|
2358
|
+
event: "progress",
|
|
2359
|
+
message: "\uC139\uC158 \uD30C\uC2F1 \uC644\uB8CC",
|
|
2360
|
+
meta: { section: si + 1, parsedSections, totalTarget }
|
|
2361
|
+
});
|
|
2362
|
+
lastParsedSection = si + 1;
|
|
2363
|
+
} catch (secErr) {
|
|
2364
|
+
if (secErr instanceof KordocError) throw secErr;
|
|
2365
|
+
warnings.push({ page: si + 1, message: `\uC139\uC158 ${si + 1} \uD30C\uC2F1 \uC2E4\uD328: ${secErr instanceof Error ? secErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
|
|
2366
|
+
logger.log({
|
|
2367
|
+
level: "warn",
|
|
2368
|
+
stage: "convert",
|
|
2369
|
+
event: "progress",
|
|
2370
|
+
message: "\uC139\uC158 \uD30C\uC2F1 \uC2E4\uD328",
|
|
2371
|
+
meta: { section: si + 1 },
|
|
2372
|
+
error: { message: secErr instanceof Error ? secErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: secErr instanceof Error ? secErr.name : "Error" }
|
|
2373
|
+
});
|
|
2374
|
+
}
|
|
2375
|
+
}
|
|
2376
|
+
const images = await extractImagesFromZip(zip, blocks, decompressed, warnings);
|
|
2377
|
+
detectHwpxHeadings(blocks, styleMap);
|
|
2378
|
+
const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
2379
|
+
const markdown = blocksToMarkdown(blocks);
|
|
2380
|
+
logger.log({
|
|
2381
|
+
level: "info",
|
|
2382
|
+
stage: "finalize",
|
|
2383
|
+
event: "done",
|
|
2384
|
+
message: "HWPX \uD30C\uC2F1 \uC644\uB8CC",
|
|
2385
|
+
meta: { blocks: blocks.length, warnings: warnings.length, images: images.length, outline: outline.length }
|
|
2386
|
+
});
|
|
2387
|
+
return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
|
|
2388
|
+
} catch (err) {
|
|
2389
|
+
logger.log({
|
|
2390
|
+
level: "error",
|
|
2391
|
+
stage: "finalize",
|
|
2392
|
+
event: "error",
|
|
2393
|
+
message: "HWPX \uD30C\uC2F1 \uC2E4\uD328",
|
|
2394
|
+
meta: { lastParsedSection },
|
|
2395
|
+
error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error", stack: err instanceof Error ? err.stack : void 0 }
|
|
2396
|
+
});
|
|
2397
|
+
throw err;
|
|
2398
|
+
}
|
|
2355
2399
|
}
|
|
2356
2400
|
function imageExtToMime(ext) {
|
|
2357
2401
|
switch (ext.toLowerCase()) {
|
|
@@ -4084,71 +4128,110 @@ var CFB = __toESM(require_cfb(), 1);
|
|
|
4084
4128
|
var MAX_SECTIONS = 100;
|
|
4085
4129
|
var MAX_TOTAL_DECOMPRESS = 500 * 1024 * 1024;
|
|
4086
4130
|
function parseHwp5Document(buffer, options) {
|
|
4087
|
-
|
|
4088
|
-
|
|
4089
|
-
|
|
4131
|
+
const logger = createLoggerFromEnv().child({ component: "hwp5/parser.ts", stage: "detect" });
|
|
4132
|
+
logger.log({ level: "info", event: "start", message: "HWP5 \uD30C\uC2F1 \uC2DC\uC791", meta: { size: buffer.length } });
|
|
4133
|
+
let lastParsedSection = 0;
|
|
4090
4134
|
try {
|
|
4091
|
-
cfb =
|
|
4092
|
-
|
|
4135
|
+
let cfb = null;
|
|
4136
|
+
let lenientCfb = null;
|
|
4137
|
+
const warnings = [];
|
|
4093
4138
|
try {
|
|
4094
|
-
|
|
4095
|
-
warnings.push({ message: "\uC190\uC0C1\uB41C CFB \uCEE8\uD14C\uC774\uB108 \u2014 lenient \uBAA8\uB4DC\uB85C \uBCF5\uAD6C", code: "LENIENT_CFB_RECOVERY" });
|
|
4139
|
+
cfb = CFB.parse(buffer);
|
|
4096
4140
|
} catch {
|
|
4097
|
-
|
|
4141
|
+
try {
|
|
4142
|
+
lenientCfb = parseLenientCfb(buffer);
|
|
4143
|
+
warnings.push({ message: "\uC190\uC0C1\uB41C CFB \uCEE8\uD14C\uC774\uB108 \u2014 lenient \uBAA8\uB4DC\uB85C \uBCF5\uAD6C", code: "LENIENT_CFB_RECOVERY" });
|
|
4144
|
+
} catch {
|
|
4145
|
+
throw new KordocError("CFB \uCEE8\uD14C\uC774\uB108 \uD30C\uC2F1 \uC2E4\uD328 (strict \uBC0F lenient \uBAA8\uB450)");
|
|
4146
|
+
}
|
|
4098
4147
|
}
|
|
4099
|
-
|
|
4100
|
-
|
|
4101
|
-
|
|
4102
|
-
|
|
4103
|
-
|
|
4148
|
+
const findStream = (path) => {
|
|
4149
|
+
if (cfb) {
|
|
4150
|
+
const entry = CFB.find(cfb, path);
|
|
4151
|
+
return entry?.content ? Buffer.from(entry.content) : null;
|
|
4152
|
+
}
|
|
4153
|
+
return lenientCfb.findStream(path);
|
|
4154
|
+
};
|
|
4155
|
+
const headerData = findStream("/FileHeader");
|
|
4156
|
+
if (!headerData) throw new KordocError("FileHeader \uC2A4\uD2B8\uB9BC \uC5C6\uC74C");
|
|
4157
|
+
const header = parseFileHeader(headerData);
|
|
4158
|
+
if (header.signature !== "HWP Document File") throw new KordocError("HWP \uC2DC\uADF8\uB2C8\uCC98 \uBD88\uC77C\uCE58");
|
|
4159
|
+
if (header.flags & FLAG_ENCRYPTED) throw new KordocError("\uC554\uD638\uD654\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
|
|
4160
|
+
if (header.flags & FLAG_DRM) throw new KordocError("DRM \uBCF4\uD638\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
|
|
4161
|
+
const compressed = (header.flags & FLAG_COMPRESSED) !== 0;
|
|
4162
|
+
const distribution = (header.flags & FLAG_DISTRIBUTION) !== 0;
|
|
4163
|
+
const metadata = {
|
|
4164
|
+
version: `${header.versionMajor}.x`
|
|
4165
|
+
};
|
|
4166
|
+
if (cfb) extractHwp5Metadata(cfb, metadata);
|
|
4167
|
+
const docInfo = cfb ? parseDocInfoStream(cfb, compressed) : parseDocInfoFromStream(findStream("/DocInfo"), compressed);
|
|
4168
|
+
const sections = distribution ? cfb ? findViewTextSections(cfb, compressed) : findViewTextSectionsLenient(lenientCfb, compressed) : cfb ? findSections(cfb) : findSectionsLenient(lenientCfb, compressed);
|
|
4169
|
+
if (sections.length === 0) throw new KordocError("\uC139\uC158 \uC2A4\uD2B8\uB9BC\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
4170
|
+
logger.log({ level: "debug", stage: "convert", event: "progress", message: "\uC139\uC158 \uBAA9\uB85D \uD574\uC11D \uC644\uB8CC", meta: { sections: sections.length, distribution } });
|
|
4171
|
+
metadata.pageCount = sections.length;
|
|
4172
|
+
const pageFilter = options?.pages ? parsePageRange(options.pages, sections.length) : null;
|
|
4173
|
+
const totalTarget = pageFilter ? pageFilter.size : sections.length;
|
|
4174
|
+
const blocks = [];
|
|
4175
|
+
let totalDecompressed = 0;
|
|
4176
|
+
let parsedSections = 0;
|
|
4177
|
+
for (let si = 0; si < sections.length; si++) {
|
|
4178
|
+
if (pageFilter && !pageFilter.has(si + 1)) continue;
|
|
4179
|
+
try {
|
|
4180
|
+
const sectionData = sections[si];
|
|
4181
|
+
const data = !distribution && compressed ? decompressStream(Buffer.from(sectionData)) : Buffer.from(sectionData);
|
|
4182
|
+
totalDecompressed += data.length;
|
|
4183
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
4184
|
+
const records = readRecords(data);
|
|
4185
|
+
const sectionBlocks = parseSection(records, docInfo, warnings, si + 1);
|
|
4186
|
+
blocks.push(...sectionBlocks);
|
|
4187
|
+
parsedSections++;
|
|
4188
|
+
options?.onProgress?.(parsedSections, totalTarget);
|
|
4189
|
+
logger.log({
|
|
4190
|
+
level: "debug",
|
|
4191
|
+
stage: "convert",
|
|
4192
|
+
event: "progress",
|
|
4193
|
+
message: "\uC139\uC158 \uD30C\uC2F1 \uC644\uB8CC",
|
|
4194
|
+
meta: { section: si + 1, parsedSections, totalTarget }
|
|
4195
|
+
});
|
|
4196
|
+
lastParsedSection = si + 1;
|
|
4197
|
+
} catch (secErr) {
|
|
4198
|
+
if (secErr instanceof KordocError) throw secErr;
|
|
4199
|
+
warnings.push({ page: si + 1, message: `\uC139\uC158 ${si + 1} \uD30C\uC2F1 \uC2E4\uD328: ${secErr instanceof Error ? secErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
|
|
4200
|
+
logger.log({
|
|
4201
|
+
level: "warn",
|
|
4202
|
+
stage: "convert",
|
|
4203
|
+
event: "progress",
|
|
4204
|
+
message: "\uC139\uC158 \uD30C\uC2F1 \uC2E4\uD328",
|
|
4205
|
+
meta: { section: si + 1 },
|
|
4206
|
+
error: { message: secErr instanceof Error ? secErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: secErr instanceof Error ? secErr.name : "Error" }
|
|
4207
|
+
});
|
|
4208
|
+
}
|
|
4104
4209
|
}
|
|
4105
|
-
|
|
4106
|
-
|
|
4107
|
-
|
|
4108
|
-
|
|
4109
|
-
|
|
4110
|
-
|
|
4111
|
-
|
|
4112
|
-
|
|
4113
|
-
|
|
4114
|
-
|
|
4115
|
-
|
|
4116
|
-
|
|
4117
|
-
|
|
4118
|
-
|
|
4119
|
-
|
|
4120
|
-
|
|
4121
|
-
|
|
4122
|
-
|
|
4123
|
-
|
|
4124
|
-
|
|
4125
|
-
|
|
4126
|
-
|
|
4127
|
-
|
|
4128
|
-
|
|
4129
|
-
|
|
4130
|
-
try {
|
|
4131
|
-
const sectionData = sections[si];
|
|
4132
|
-
const data = !distribution && compressed ? decompressStream(Buffer.from(sectionData)) : Buffer.from(sectionData);
|
|
4133
|
-
totalDecompressed += data.length;
|
|
4134
|
-
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
4135
|
-
const records = readRecords(data);
|
|
4136
|
-
const sectionBlocks = parseSection(records, docInfo, warnings, si + 1);
|
|
4137
|
-
blocks.push(...sectionBlocks);
|
|
4138
|
-
parsedSections++;
|
|
4139
|
-
options?.onProgress?.(parsedSections, totalTarget);
|
|
4140
|
-
} catch (secErr) {
|
|
4141
|
-
if (secErr instanceof KordocError) throw secErr;
|
|
4142
|
-
warnings.push({ page: si + 1, message: `\uC139\uC158 ${si + 1} \uD30C\uC2F1 \uC2E4\uD328: ${secErr instanceof Error ? secErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
|
|
4143
|
-
}
|
|
4144
|
-
}
|
|
4145
|
-
const images = cfb ? extractHwp5Images(cfb, blocks, compressed, warnings) : extractHwp5ImagesLenient(lenientCfb, blocks, compressed, warnings);
|
|
4146
|
-
if (docInfo) {
|
|
4147
|
-
detectHwp5Headings(blocks, docInfo);
|
|
4148
|
-
}
|
|
4149
|
-
const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
4150
|
-
const markdown = blocksToMarkdown(blocks);
|
|
4151
|
-
return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
|
|
4210
|
+
const images = cfb ? extractHwp5Images(cfb, blocks, compressed, warnings) : extractHwp5ImagesLenient(lenientCfb, blocks, compressed, warnings);
|
|
4211
|
+
if (docInfo) {
|
|
4212
|
+
detectHwp5Headings(blocks, docInfo);
|
|
4213
|
+
}
|
|
4214
|
+
const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
4215
|
+
const markdown = blocksToMarkdown(blocks);
|
|
4216
|
+
logger.log({
|
|
4217
|
+
level: "info",
|
|
4218
|
+
stage: "finalize",
|
|
4219
|
+
event: "done",
|
|
4220
|
+
message: "HWP5 \uD30C\uC2F1 \uC644\uB8CC",
|
|
4221
|
+
meta: { blocks: blocks.length, warnings: warnings.length, images: images.length, outline: outline.length }
|
|
4222
|
+
});
|
|
4223
|
+
return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
|
|
4224
|
+
} catch (err) {
|
|
4225
|
+
logger.log({
|
|
4226
|
+
level: "error",
|
|
4227
|
+
stage: "finalize",
|
|
4228
|
+
event: "error",
|
|
4229
|
+
message: "HWP5 \uD30C\uC2F1 \uC2E4\uD328",
|
|
4230
|
+
meta: { lastParsedSection },
|
|
4231
|
+
error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error", stack: err instanceof Error ? err.stack : void 0 }
|
|
4232
|
+
});
|
|
4233
|
+
throw err;
|
|
4234
|
+
}
|
|
4152
4235
|
}
|
|
4153
4236
|
function parseDocInfoStream(cfb, compressed) {
|
|
4154
4237
|
try {
|
|
@@ -4707,6 +4790,10 @@ function arrangeCells(rows, cols, cells) {
|
|
|
4707
4790
|
return grid.map((row) => row.map((c) => c || { text: "", colSpan: 1, rowSpan: 1 }));
|
|
4708
4791
|
}
|
|
4709
4792
|
|
|
4793
|
+
// src/pdf/parser.ts
|
|
4794
|
+
import { createRequire } from "module";
|
|
4795
|
+
import { dirname, join, resolve } from "path";
|
|
4796
|
+
|
|
4710
4797
|
// src/pdf/line-detector.ts
|
|
4711
4798
|
import { OPS } from "pdfjs-dist/legacy/build/pdf.mjs";
|
|
4712
4799
|
var ORIENTATION_TOL = 2;
|
|
@@ -4893,12 +4980,17 @@ function buildTableGrids(horizontals, verticals) {
|
|
|
4893
4980
|
const rawXs = vLines.map((l) => l.x1);
|
|
4894
4981
|
const colXs = clusterCoordinates(rawXs).sort((a, b) => a - b);
|
|
4895
4982
|
if (rowYs.length < 2 || colXs.length < 2) continue;
|
|
4983
|
+
const rowCount = rowYs.length - 1;
|
|
4984
|
+
const colCount = colXs.length - 1;
|
|
4985
|
+
if (rowCount <= 0 || colCount <= 0) continue;
|
|
4986
|
+
if (rowCount * colCount < 2) continue;
|
|
4896
4987
|
const bbox = {
|
|
4897
4988
|
x1: colXs[0],
|
|
4898
4989
|
y1: rowYs[rowYs.length - 1],
|
|
4899
4990
|
x2: colXs[colXs.length - 1],
|
|
4900
4991
|
y2: rowYs[0]
|
|
4901
4992
|
};
|
|
4993
|
+
if (!hasReliableGridStructure(rowYs, colXs, hLines, vLines, bbox)) continue;
|
|
4902
4994
|
grids.push({ rowYs, colXs, bbox });
|
|
4903
4995
|
}
|
|
4904
4996
|
return mergeAdjacentGrids(grids);
|
|
@@ -4948,6 +5040,35 @@ function clusterCoordinates(values) {
|
|
|
4948
5040
|
}
|
|
4949
5041
|
return clusters.map((c) => c.sum / c.count);
|
|
4950
5042
|
}
|
|
5043
|
+
/**
 * Decide whether a candidate grid is backed by enough drawn lines to be
 * treated as a table.
 *
 * The candidate passes when both of the following hold:
 *  - at least two horizontal and two vertical lines are each at least 70% as
 *    long as the bounding box in their direction;
 *  - at least half of the interior row boundaries and at least half of the
 *    interior column boundaries lie within COORD_MERGE_TOL of a line that
 *    overlaps 55% or more of the bounding box extent.
 * Grids with no interior boundaries on an axis skip the coverage check for
 * that axis.
 *
 * @param {number[]} rowYs - Row boundary coordinates; the first and last entries are excluded as outer edges.
 * @param {number[]} colXs - Column boundary coordinates; the first and last entries are excluded as outer edges.
 * @param {Array<{x1: number, x2: number, y1: number}>} hLines - Horizontal line segments.
 * @param {Array<{x1: number, y1: number, y2: number}>} vLines - Vertical line segments.
 * @param {{x1: number, y1: number, x2: number, y2: number}} bbox - Bounding box of the candidate grid.
 * @returns {boolean} True when the line evidence supports a real table grid.
 */
function hasReliableGridStructure(rowYs, colXs, hLines, vLines, bbox) {
  const MIN_LINE_COVERAGE = 0.55;
  const innerRows = rowYs.slice(1, -1);
  const innerCols = colXs.slice(1, -1);
  // Clamp to 1 so degenerate boxes never produce a zero-length reference.
  const boxWidth = Math.max(1, bbox.x2 - bbox.x1);
  const boxHeight = Math.max(1, bbox.y2 - bbox.y1);

  // True when some horizontal line sits on this row boundary and spans enough of the box.
  const rowIsRuled = (y) => {
    for (const h of hLines) {
      const onBoundary = Math.abs(h.y1 - y) <= COORD_MERGE_TOL;
      if (onBoundary && lineOverlapRatio(h.x1, h.x2, bbox.x1, bbox.x2) >= MIN_LINE_COVERAGE) return true;
    }
    return false;
  };
  // Same test for a column boundary against the vertical lines.
  const colIsRuled = (x) => {
    for (const v of vLines) {
      const onBoundary = Math.abs(v.x1 - x) <= COORD_MERGE_TOL;
      if (onBoundary && lineOverlapRatio(v.y1, v.y2, bbox.y1, bbox.y2) >= MIN_LINE_COVERAGE) return true;
    }
    return false;
  };

  let ruledRows = 0;
  for (const y of innerRows) {
    if (rowIsRuled(y)) ruledRows += 1;
  }
  let ruledCols = 0;
  for (const x of innerCols) {
    if (colIsRuled(x)) ruledCols += 1;
  }

  let spanningHorizontal = 0;
  for (const h of hLines) {
    if (Math.abs(h.x2 - h.x1) >= boxWidth * 0.7) spanningHorizontal += 1;
  }
  let spanningVertical = 0;
  for (const v of vLines) {
    if (Math.abs(v.y2 - v.y1) >= boxHeight * 0.7) spanningVertical += 1;
  }

  // Both axes need at least two long lines.
  if (spanningHorizontal < 2 || spanningVertical < 2) return false;
  if (innerRows.length > 0 && ruledRows / innerRows.length < 0.5) return false;
  if (innerCols.length > 0 && ruledCols / innerCols.length < 0.5) return false;
  return true;
}
|
|
5065
|
+
/**
 * Fraction of the reference interval [b1, b2] that is covered by the segment
 * [a1, a2]. Endpoints may be given in either order.
 *
 * The reference length is clamped to at least 1, so a zero-length reference
 * yields a ratio based on a unit length rather than dividing by zero.
 *
 * @param {number} a1 - One end of the segment.
 * @param {number} a2 - Other end of the segment.
 * @param {number} b1 - One end of the reference interval.
 * @param {number} b2 - Other end of the reference interval.
 * @returns {number} Overlap length divided by the (clamped) reference length; 0 when disjoint.
 */
function lineOverlapRatio(a1, a2, b1, b2) {
  const segmentStart = Math.min(a1, a2);
  const segmentEnd = Math.max(a1, a2);
  const referenceStart = Math.min(b1, b2);
  const referenceEnd = Math.max(b1, b2);
  // Negative when the intervals do not intersect; clamped to 0 below.
  const shared = Math.min(segmentEnd, referenceEnd) - Math.max(segmentStart, referenceStart);
  const referenceLength = Math.max(1, Math.abs(b2 - b1));
  return Math.max(0, shared) / referenceLength;
}
|
|
4951
5072
|
function groupConnectedLines(lines) {
|
|
4952
5073
|
const parent = lines.map((_, i) => i);
|
|
4953
5074
|
function find2(x) {
|
|
@@ -5344,6 +5465,17 @@ g.pdfjsWorker = pdfjsWorker;
|
|
|
5344
5465
|
// src/pdf/parser.ts
|
|
5345
5466
|
import { getDocument, GlobalWorkerOptions } from "pdfjs-dist/legacy/build/pdf.mjs";
|
|
5346
5467
|
GlobalWorkerOptions.workerSrc = "";
|
|
5468
|
+
// CommonJS-style resolver used to locate installed packages from this bundle.
// When `__filename` is not defined (ES module execution), resolution is
// anchored at a placeholder file name inside the current working directory.
var require2 = createRequire(
  typeof __filename !== "undefined" ? __filename : resolve(process.cwd(), "kordoc.require.cjs")
);
/**
 * Locate the `wasm/` directory of the installed pdfjs-dist package, for use as
 * the pdf.js `wasmUrl` option.
 *
 * Resolution order:
 *  1. the directory containing `pdfjs-dist/package.json` as resolved by
 *     `require2`;
 *  2. `node_modules/pdfjs-dist/wasm/` under the current working directory when
 *     that resolution throws.
 *
 * @returns {string} Path to the wasm directory, always ending with a path
 *   separator (pdf.js documents that `wasmUrl` must include the trailing slash).
 */
function resolvePdfjsWasmUrl() {
  try {
    const pdfjsPkg = require2.resolve("pdfjs-dist/package.json");
    return join(dirname(pdfjsPkg), "wasm/");
  } catch {
    // Use join() rather than resolve(): path.resolve() strips trailing
    // separators, which would drop the slash that the primary branch keeps.
    // process.cwd() is already absolute, so the directory itself is unchanged.
    return join(process.cwd(), "node_modules", "pdfjs-dist", "wasm/");
  }
}
|
|
5347
5479
|
var MAX_PAGES = 5e3;
|
|
5348
5480
|
var MAX_TOTAL_TEXT = 500 * 1024 * 1024;
|
|
5349
5481
|
function calcPdfTimeout(bufferSize) {
|
|
@@ -5359,7 +5491,8 @@ async function loadPdfWithTimeout(buffer) {
|
|
|
5359
5491
|
data: new Uint8Array(buffer),
|
|
5360
5492
|
useSystemFonts: true,
|
|
5361
5493
|
disableFontFace: true,
|
|
5362
|
-
isEvalSupported: false
|
|
5494
|
+
isEvalSupported: false,
|
|
5495
|
+
wasmUrl: resolvePdfjsWasmUrl()
|
|
5363
5496
|
});
|
|
5364
5497
|
let timer;
|
|
5365
5498
|
try {
|
|
@@ -5376,7 +5509,47 @@ async function loadPdfWithTimeout(buffer) {
|
|
|
5376
5509
|
if (timer !== void 0) clearTimeout(timer);
|
|
5377
5510
|
}
|
|
5378
5511
|
}
|
|
5512
|
+
/**
 * Estimate from sampled per-page text statistics whether a PDF is image-based
 * (i.e. carries little or no extractable text).
 *
 * A score is accumulated from six threshold rules over the average
 * non-whitespace character count, the average visible item count, and the
 * share of pages that contain text (>= 20 characters or >= 15 items). The
 * score is clamped to [0, 1]; 0.5 or higher means image-based. With no
 * samples at all the document is reported as image-based with score 1.
 *
 * @param {Array<{nonWhitespaceChars: number, visibleItems: number}>} metrics - One entry per sampled page.
 * @returns {{isImageBased: boolean, score: number, reason: string}} Decision, clamped score, and a summary of the inputs.
 */
function estimateImageBasedPdf(metrics) {
  const totalPages = metrics.length;
  if (totalPages === 0) {
    return { isImageBased: true, score: 1, reason: "\uC0D8\uD50C \uD1B5\uACC4 \uC5C6\uC74C" };
  }

  let charSum = 0;
  let itemSum = 0;
  let pagesWithText = 0;
  for (const page of metrics) {
    charSum += page.nonWhitespaceChars;
    itemSum += page.visibleItems;
    if (page.nonWhitespaceChars >= 20 || page.visibleItems >= 15) pagesWithText += 1;
  }
  const avgChars = charSum / totalPages;
  const avgItems = itemSum / totalPages;
  const textPresenceRatio = pagesWithText / totalPages;

  // [rule applies, score adjustment] — applied in this order.
  const adjustments = [
    [avgChars < 10, 0.45],
    [avgItems < 8, 0.35],
    [textPresenceRatio < 0.35, 0.25],
    [avgChars > 40, -0.35],
    [avgItems > 25, -0.35],
    [textPresenceRatio > 0.7, -0.25]
  ];
  let score = 0;
  for (const [applies, weight] of adjustments) {
    if (applies) score += weight;
  }
  score = Math.max(0, Math.min(1, score));

  const reason = [
    `avgChars=${avgChars.toFixed(1)}`,
    `avgItems=${avgItems.toFixed(1)}`,
    `textPresence=${(textPresenceRatio * 100).toFixed(0)}%`,
    `score=${score.toFixed(2)}`
  ].join(", ");
  return { isImageBased: score >= 0.5, score, reason };
}
|
|
5535
|
+
/**
 * Build a one-line summary of the pages that failed to parse.
 *
 * Page numbers are listed in ascending order; only the first ten are shown,
 * followed by a count of the remaining ones. The input array is not modified.
 *
 * @param {number[]} failedPages - Page numbers that failed.
 * @param {number} totalTarget - Number of pages that were targeted for parsing.
 * @returns {string|null} The summary message, or null when nothing failed.
 */
function summarizePartialFailures(failedPages, totalTarget) {
  if (failedPages.length === 0) return null;

  const PREVIEW_LIMIT = 10;
  // Sort a copy so the caller's array keeps its order.
  const ascending = Array.from(failedPages).sort((x, y) => x - y);
  const shown = ascending.slice(0, PREVIEW_LIMIT).join(", ");
  const hiddenCount = ascending.length - PREVIEW_LIMIT;
  let suffix = "";
  if (hiddenCount > 0) {
    suffix = ` \uC678 ${hiddenCount}\uD398\uC774\uC9C0`;
  }
  return `\uBD80\uBD84 \uD30C\uC2F1 \uC2E4\uD328 \uC694\uC57D: ${ascending.length}/${totalTarget}\uD398\uC774\uC9C0 \uC2E4\uD328 (p${shown}${suffix})`;
}
|
|
5542
|
+
/**
 * Apply the optional partial-failure policy: abort when the share of failed
 * pages exceeds the allowed ratio.
 *
 * When `maxPartialFailureRatio` is not a number the policy is disabled and
 * the result never requests an abort. Otherwise the ratio is clamped to
 * [0, 1] and compared strictly (a failure share equal to the threshold is
 * still accepted).
 *
 * @param {number[]} failedPages - Page numbers that failed to parse.
 * @param {number} totalTarget - Number of pages that were targeted for parsing.
 * @param {number} [maxPartialFailureRatio] - Allowed failure share; omit to disable the policy.
 * @returns {{abort: boolean, ratio: number, threshold: number}} Decision plus the values it was based on.
 */
function shouldAbortForPartialFailures(failedPages, totalTarget, maxPartialFailureRatio) {
  if (typeof maxPartialFailureRatio === "number") {
    const threshold = Math.max(0, Math.min(1, maxPartialFailureRatio));
    let ratio = 0;
    // Avoid dividing by zero when no pages were targeted.
    if (totalTarget > 0) {
      ratio = failedPages.length / totalTarget;
    }
    return { abort: ratio > threshold, ratio, threshold };
  }
  return { abort: false, ratio: 0, threshold: 0 };
}
|
|
5379
5550
|
async function parsePdfDocument(buffer, options) {
|
|
5551
|
+
const logger = createLoggerFromEnv().child({ component: "pdf/parser.ts", stage: "detect" });
|
|
5552
|
+
logger.log({ level: "info", event: "start", message: "PDF \uD30C\uC2F1 \uC2DC\uC791", meta: { size: buffer.byteLength } });
|
|
5380
5553
|
const doc = await loadPdfWithTimeout(buffer);
|
|
5381
5554
|
try {
|
|
5382
5555
|
const pageCount = doc.numPages;
|
|
@@ -5385,9 +5558,13 @@ async function parsePdfDocument(buffer, options) {
|
|
|
5385
5558
|
await extractPdfMetadata(doc, metadata);
|
|
5386
5559
|
const blocks = [];
|
|
5387
5560
|
const warnings = [];
|
|
5561
|
+
const failedPages = [];
|
|
5562
|
+
let lastParsedPage2 = 0;
|
|
5563
|
+
const sampleMetricsByPage = /* @__PURE__ */ new Map();
|
|
5388
5564
|
let totalChars = 0;
|
|
5389
5565
|
let totalTextBytes = 0;
|
|
5390
5566
|
const effectivePageCount = Math.min(pageCount, MAX_PAGES);
|
|
5567
|
+
logger.log({ level: "debug", event: "progress", message: "PDF \uB85C\uB529 \uC644\uB8CC", meta: { pageCount, effectivePageCount } });
|
|
5391
5568
|
const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
|
|
5392
5569
|
const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
|
|
5393
5570
|
const fontSizeFreq = /* @__PURE__ */ new Map();
|
|
@@ -5424,11 +5601,17 @@ async function parsePdfDocument(buffer, options) {
|
|
|
5424
5601
|
totalChars += t.replace(/\s/g, "").length;
|
|
5425
5602
|
totalTextBytes += t.length * 2;
|
|
5426
5603
|
}
|
|
5604
|
+
sampleMetricsByPage.set(i, {
|
|
5605
|
+
nonWhitespaceChars: visible.reduce((sum, it) => sum + it.text.replace(/\s/g, "").length, 0),
|
|
5606
|
+
visibleItems: visible.length
|
|
5607
|
+
});
|
|
5608
|
+
lastParsedPage2 = i;
|
|
5427
5609
|
if (totalTextBytes > MAX_TOTAL_TEXT) throw new KordocError("\uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uD06C\uAE30 \uCD08\uACFC");
|
|
5428
5610
|
parsedPages++;
|
|
5429
5611
|
options?.onProgress?.(parsedPages, totalTarget);
|
|
5430
5612
|
} catch (pageErr) {
|
|
5431
5613
|
if (pageErr instanceof KordocError) throw pageErr;
|
|
5614
|
+
if (!failedPages.includes(i)) failedPages.push(i);
|
|
5432
5615
|
warnings.push({ page: i, message: `\uD398\uC774\uC9C0 ${i} \uD30C\uC2F1 \uC2E4\uD328: ${pageErr instanceof Error ? pageErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
|
|
5433
5616
|
}
|
|
5434
5617
|
};
|
|
@@ -5445,8 +5628,21 @@ async function parsePdfDocument(buffer, options) {
|
|
|
5445
5628
|
for (const si of sampledIndices) {
|
|
5446
5629
|
await parseSinglePage(targetPageNums[si]);
|
|
5447
5630
|
}
|
|
5448
|
-
const
|
|
5449
|
-
const
|
|
5631
|
+
const sampledMetrics = [];
|
|
5632
|
+
for (const si of sampledIndices) {
|
|
5633
|
+
const pageNum = targetPageNums[si];
|
|
5634
|
+
const m = sampleMetricsByPage.get(pageNum);
|
|
5635
|
+
if (m) sampledMetrics.push(m);
|
|
5636
|
+
}
|
|
5637
|
+
const imageBasedDecision = estimateImageBasedPdf(sampledMetrics);
|
|
5638
|
+
const isImageBased = imageBasedDecision.isImageBased;
|
|
5639
|
+
logger.log({
|
|
5640
|
+
level: "info",
|
|
5641
|
+
stage: "probe",
|
|
5642
|
+
event: "done",
|
|
5643
|
+
message: "\uC774\uBBF8\uC9C0 \uAE30\uBC18 \uD310\uC815",
|
|
5644
|
+
meta: { isImageBased, reason: imageBasedDecision.reason, sampledPages: sampledMetrics.length }
|
|
5645
|
+
});
|
|
5450
5646
|
if (!isImageBased) {
|
|
5451
5647
|
for (let si = 0; si < targetPageNums.length; si++) {
|
|
5452
5648
|
if (!sampledIndices.has(si)) {
|
|
@@ -5454,20 +5650,52 @@ async function parsePdfDocument(buffer, options) {
|
|
|
5454
5650
|
}
|
|
5455
5651
|
}
|
|
5456
5652
|
}
|
|
5653
|
+
const partialSummary = summarizePartialFailures(failedPages, totalTarget);
|
|
5654
|
+
if (partialSummary) {
|
|
5655
|
+
warnings.push({
|
|
5656
|
+
message: partialSummary,
|
|
5657
|
+
code: "PARTIAL_PARSE"
|
|
5658
|
+
});
|
|
5659
|
+
}
|
|
5660
|
+
if (isImageBased) {
|
|
5661
|
+
warnings.push({
|
|
5662
|
+
message: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 \uD310\uC815: ${imageBasedDecision.reason}`,
|
|
5663
|
+
code: "OCR_FALLBACK"
|
|
5664
|
+
});
|
|
5665
|
+
}
|
|
5666
|
+
const partialPolicy = shouldAbortForPartialFailures(
|
|
5667
|
+
failedPages,
|
|
5668
|
+
totalTarget,
|
|
5669
|
+
options?.maxPartialFailureRatio
|
|
5670
|
+
);
|
|
5671
|
+
if (partialPolicy.abort) {
|
|
5672
|
+
throw new KordocError(
|
|
5673
|
+
`\uBD80\uBD84 \uD30C\uC2F1 \uC2E4\uD328 \uBE44\uC728 \uCD08\uACFC: ${(partialPolicy.ratio * 100).toFixed(1)}% (\uD5C8\uC6A9 ${(partialPolicy.threshold * 100).toFixed(1)}%)`
|
|
5674
|
+
);
|
|
5675
|
+
}
|
|
5457
5676
|
const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
|
|
5458
5677
|
if (isImageBased) {
|
|
5459
5678
|
const ocrMode = options?.ocrMode ?? "auto";
|
|
5460
5679
|
const concurrency = options?.ocrConcurrency ?? 1;
|
|
5461
5680
|
const batchSize = options?.ocrBatchSize;
|
|
5681
|
+
logger.log({
|
|
5682
|
+
level: "info",
|
|
5683
|
+
stage: "ocr",
|
|
5684
|
+
event: "start",
|
|
5685
|
+
message: "\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF OCR \uC2DC\uC791",
|
|
5686
|
+
meta: { ocrMode, concurrency, batchSize, totalTarget }
|
|
5687
|
+
});
|
|
5462
5688
|
if (ocrMode === "off") {
|
|
5463
5689
|
throw Object.assign(new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
|
|
5464
5690
|
}
|
|
5465
|
-
const { resolveOcrProvider } = await import("./resolve-
|
|
5466
|
-
const { ocrPages } = await import("./provider-
|
|
5691
|
+
const { resolveOcrProvider } = await import("./resolve-673XFZQ6.js");
|
|
5692
|
+
const { ocrPages } = await import("./provider-T2D5XRTI.js");
|
|
5467
5693
|
const tryProvider = async (provider, filter) => {
|
|
5468
5694
|
try {
|
|
5695
|
+
logger.log({ level: "debug", stage: "ocr", event: "progress", message: "OCR provider \uC2E4\uD589", meta: { filteredPages: filter?.size } });
|
|
5469
5696
|
return await ocrPages(doc, provider, filter, effectivePageCount, warnings, concurrency, options?.onProgress);
|
|
5470
5697
|
} catch {
|
|
5698
|
+
logger.log({ level: "warn", stage: "ocr", event: "progress", message: "OCR provider \uC2E4\uD589 \uC2E4\uD328(\uBE48 \uACB0\uACFC\uB85C \uCC98\uB9AC)" });
|
|
5471
5699
|
return [];
|
|
5472
5700
|
} finally {
|
|
5473
5701
|
const terminable = provider;
|
|
@@ -5490,6 +5718,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
5490
5718
|
for (const mode of getAutoFallbackChain()) {
|
|
5491
5719
|
if (pendingPages.size === 0) break;
|
|
5492
5720
|
try {
|
|
5721
|
+
logger.log({ level: "info", stage: "ocr", event: "progress", message: "OCR \uC5D4\uC9C4 \uC2DC\uB3C4", meta: { mode, pendingPages: pendingPages.size } });
|
|
5493
5722
|
const modeFilter = pendingPages.size < effectivePageCount ? new Set(pendingPages) : pageFilter;
|
|
5494
5723
|
const provider = await resolveOcrProvider(mode, warnings, concurrency, batchSize);
|
|
5495
5724
|
const blocks2 = await tryProvider(provider, modeFilter);
|
|
@@ -5504,10 +5733,20 @@ async function parsePdfDocument(buffer, options) {
|
|
|
5504
5733
|
code: "OCR_CLI_FALLBACK"
|
|
5505
5734
|
});
|
|
5506
5735
|
}
|
|
5736
|
+
logger.log({ level: "info", stage: "ocr", event: "progress", message: "OCR \uC5D4\uC9C4 \uCC98\uB9AC \uC644\uB8CC", meta: { mode, blocks: blocks2.length, pendingPages: pendingPages.size } });
|
|
5507
5737
|
} else {
|
|
5508
5738
|
warnings.push({ message: `OCR: '${mode}' \uACB0\uACFC \uC5C6\uC74C, \uB2E4\uC74C \uC5D4\uC9C4\uC73C\uB85C \uC2DC\uB3C4`, code: "OCR_CLI_FALLBACK" });
|
|
5739
|
+
logger.log({ level: "warn", stage: "ocr", event: "progress", message: "OCR \uC5D4\uC9C4 \uACB0\uACFC \uC5C6\uC74C", meta: { mode } });
|
|
5509
5740
|
}
|
|
5510
|
-
} catch {
|
|
5741
|
+
} catch (engineErr) {
|
|
5742
|
+
logger.log({
|
|
5743
|
+
level: "warn",
|
|
5744
|
+
stage: "ocr",
|
|
5745
|
+
event: "progress",
|
|
5746
|
+
message: "OCR \uC5D4\uC9C4 \uCD08\uAE30\uD654/\uC2E4\uD589 \uC2E4\uD328",
|
|
5747
|
+
meta: { mode },
|
|
5748
|
+
error: { message: engineErr instanceof Error ? engineErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: engineErr instanceof Error ? engineErr.name : "Error" }
|
|
5749
|
+
});
|
|
5511
5750
|
}
|
|
5512
5751
|
}
|
|
5513
5752
|
allOcrBlocks.sort((a, b) => (a.pageNumber ?? 0) - (b.pageNumber ?? 0));
|
|
@@ -5525,6 +5764,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
5525
5764
|
}
|
|
5526
5765
|
if (ocrBlocks.length > 0) {
|
|
5527
5766
|
const ocrMarkdown = blocksToMarkdown(ocrBlocks);
|
|
5767
|
+
logger.log({ level: "info", stage: "ocr", event: "done", message: "\uC774\uBBF8\uC9C0 \uAE30\uBC18 OCR \uC644\uB8CC", meta: { blocks: ocrBlocks.length } });
|
|
5528
5768
|
return {
|
|
5529
5769
|
markdown: ocrMarkdown,
|
|
5530
5770
|
blocks: ocrBlocks,
|
|
@@ -5550,8 +5790,25 @@ async function parsePdfDocument(buffer, options) {
|
|
|
5550
5790
|
}
|
|
5551
5791
|
detectMarkerHeadings(blocks);
|
|
5552
5792
|
const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
5553
|
-
let markdown = cleanPdfText(blocksToMarkdown(blocks));
|
|
5793
|
+
let markdown = cleanPdfText(blocksToMarkdown(blocks), options?.pdfTextNormalization ?? "default");
|
|
5794
|
+
logger.log({
|
|
5795
|
+
level: "info",
|
|
5796
|
+
stage: "finalize",
|
|
5797
|
+
event: "done",
|
|
5798
|
+
message: "PDF \uD30C\uC2F1 \uC644\uB8CC",
|
|
5799
|
+
meta: { blocks: blocks.length, warnings: warnings.length, outline: outline.length, isImageBased: false }
|
|
5800
|
+
});
|
|
5554
5801
|
return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0 };
|
|
5802
|
+
} catch (err) {
|
|
5803
|
+
logger.log({
|
|
5804
|
+
level: "error",
|
|
5805
|
+
stage: "finalize",
|
|
5806
|
+
event: "error",
|
|
5807
|
+
message: "PDF \uD30C\uC2F1 \uC2E4\uD328",
|
|
5808
|
+
meta: { lastParsedPage },
|
|
5809
|
+
error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error", stack: err instanceof Error ? err.stack : void 0 }
|
|
5810
|
+
});
|
|
5811
|
+
throw err;
|
|
5555
5812
|
} finally {
|
|
5556
5813
|
await doc.destroy().catch(() => {
|
|
5557
5814
|
});
|
|
@@ -5656,6 +5913,17 @@ function shouldDemoteTable(table) {
|
|
|
5656
5913
|
const emptyCells = totalCells - allCells.length;
|
|
5657
5914
|
if (table.rows <= 2 && emptyCells > totalCells * 0.5) return true;
|
|
5658
5915
|
if (table.rows === 1 && !/\d{2,}/.test(allText)) return true;
|
|
5916
|
+
if (table.cols >= 3 && table.rows <= 4) {
|
|
5917
|
+
const markerCells = allCells.filter((t) => /^[□■◆○●▶▷◇◆]/.test(t)).length;
|
|
5918
|
+
const numericCells = allCells.filter((t) => /\d/.test(t)).length;
|
|
5919
|
+
if (markerCells >= Math.max(1, Math.floor(allCells.length * 0.35)) && numericCells <= Math.floor(allCells.length * 0.15)) {
|
|
5920
|
+
return true;
|
|
5921
|
+
}
|
|
5922
|
+
}
|
|
5923
|
+
if (table.cols >= 3 && table.rows >= 2) {
|
|
5924
|
+
const sparseRows = table.cells.filter((row) => row.filter((c) => c.text.trim()).length <= 1).length;
|
|
5925
|
+
if (sparseRows >= Math.ceil(table.rows * 0.7)) return true;
|
|
5926
|
+
}
|
|
5659
5927
|
return false;
|
|
5660
5928
|
}
|
|
5661
5929
|
function demoteTableToText(table) {
|
|
@@ -6211,10 +6479,15 @@ function mergeLineSimple(items) {
|
|
|
6211
6479
|
}
|
|
6212
6480
|
return result;
|
|
6213
6481
|
}
|
|
6214
|
-
function
|
|
6215
|
-
return
|
|
6216
|
-
|
|
6217
|
-
|
|
6482
|
+
/**
 * Removes page-number artifacts that PDF text extraction leaves behind.
 *
 * Four passes are applied in order:
 *   1. lines made of a dash-decorated number such as "- 3 -" or "—12—";
 *   2. lines of the form "N / M" (page N of M);
 *   3. lines consisting solely of 1–4 digits that are followed by another line;
 *   4. a trailing 1–4 digit line at the very end of the text.
 *
 * Passes 1 and 2 blank the line but keep its line break; passes 3 and 4 drop
 * the line together with the line break that precedes it.
 *
 * @param {string} text - Raw text/markdown produced from a PDF.
 * @returns {string} The text with the page-number lines removed.
 */
function stripPdfPageNumberArtifacts(text) {
  // Dash-decorated page numbers ("- 3 -", "–3–", "— 3").
  const dashedPageNumber = /^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm;
  // "current / total" page indicators on a line of their own.
  const pageOfTotal = /^\s*\d+\s*\/\s*\d+\s*$/gm;
  // A bare 1–4 digit line. The following newline is only asserted with a
  // lookahead, not consumed: consuming it made adjacent digit-only lines
  // impossible to match (every second one survived), so stripping depended
  // on what the previous line happened to be.
  const bareNumberLine = /\n\d{1,4}(?=\n)/g;
  // A bare 1–4 digit line that ends the whole text (no `m` flag: `$` is end of input).
  const trailingNumberLine = /\n\d{1,4}$/;

  return text
    .replace(dashedPageNumber, "")
    .replace(pageOfTotal, "")
    .replace(bareNumberLine, "")
    .replace(trailingNumberLine, "");
}
|
|
6485
|
+
/**
 * Post-processes text extracted from a PDF.
 *
 * Page-number artifacts are always stripped first. After that:
 *   - mode "strict-preserve": only runs of 4+ newlines are shortened to 3;
 *   - any other mode: the text goes through `mergeKoreanLines`, every
 *     3–30 character line that does not start with "|" goes through
 *     `collapseEvenSpacing`, and runs of 3+ newlines are shortened to 2.
 * The result is trimmed in both cases.
 *
 * @param {string} text - Text/markdown produced from a PDF.
 * @param {string} [mode="default"] - Normalization mode; "strict-preserve" skips line merging and spacing collapse.
 * @returns {string} The cleaned text.
 */
function cleanPdfText(text, mode = "default") {
  const withoutPageNumbers = stripPdfPageNumberArtifacts(text);
  const preserveLayout = mode === "strict-preserve";

  if (!preserveLayout) {
    const merged = mergeKoreanLines(withoutPageNumbers);
    // Lines starting with "|" are excluded by the negative lookahead
    // (presumably markdown table rows — confirm against blocksToMarkdown output).
    const respaced = merged.replace(/^(?!\|).{3,30}$/gm, (line) => collapseEvenSpacing(line));
    return respaced.replace(/\n{3,}/g, "\n\n").trim();
  }

  return withoutPageNumbers.replace(/\n{4,}/g, "\n\n\n").trim();
}
|
|
6219
6492
|
function startsWithMarker(line) {
|
|
6220
6493
|
const t = line.trimStart();
|
|
@@ -6610,100 +6883,139 @@ function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
|
|
|
6610
6883
|
return blocks;
|
|
6611
6884
|
}
|
|
6612
6885
|
/**
 * Parses an XLSX workbook into markdown, blocks, metadata and warnings.
 *
 * Steps: size pre-check, open the zip (or reuse `existingZip`), read shared
 * strings, the workbook sheet list and the workbook relationships, then
 * convert each selected sheet into blocks. A sheet that is missing or fails
 * to parse is recorded as a "PARTIAL_PARSE" warning and does not abort the run.
 * Core properties (docProps/core.xml) are read on a best-effort basis.
 *
 * @param {ArrayBuffer} buffer - Raw XLSX bytes (its `byteLength` is logged).
 * @param {object} [options] - Optional `pages` range and `onProgress(current, total)` callback.
 * @param {object} [existingZip] - Already-loaded zip to reuse instead of loading `buffer`.
 * @returns {Promise<{markdown: string, blocks: Array, metadata: object, warnings: (Array|undefined)}>}
 * @throws {KordocError} When xl/workbook.xml is missing or the workbook lists no sheets.
 *   Any other error raised outside the per-sheet handling is logged and rethrown.
 */
async function parseXlsxDocument(buffer, options, existingZip) {
  const logger = createLoggerFromEnv().child({ component: "xlsx/parser.ts", stage: "detect" });
  logger.log({ level: "info", event: "start", message: "XLSX \uD30C\uC2F1 \uC2DC\uC791", meta: { size: buffer.byteLength } });

  const messageOf = (e) => (e instanceof Error ? e.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958");
  const nameOf = (e) => (e instanceof Error ? e.name : "Error");

  // Maps a relationship target to a zip entry path; falls back to the
  // conventional sheetN.xml name when the relationship is missing.
  const resolveSheetPath = (target, sheetIdx) => {
    if (!target) return `xl/worksheets/sheet${sheetIdx + 1}.xml`;
    if (target.startsWith("/")) return target.slice(1);
    return target.startsWith("xl/") ? target : `xl/${target}`;
  };

  // Number of the last sheet converted successfully; reported if parsing fails.
  let lastProcessedSheet = 0;

  try {
    precheckZipSize(buffer, MAX_DECOMPRESS_SIZE3);
    const zip = existingZip ?? await JSZip3.loadAsync(buffer);
    const warnings = [];

    const workbookFile = zip.file("xl/workbook.xml");
    if (!workbookFile) {
      throw new KordocError("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 XLSX \uD30C\uC77C: xl/workbook.xml\uC774 \uC5C6\uC2B5\uB2C8\uB2E4");
    }

    const ssFile = zip.file("xl/sharedStrings.xml");
    const sharedStrings = ssFile ? parseSharedStrings(await ssFile.async("text")) : [];

    const sheets = parseWorkbook(await workbookFile.async("text"));
    if (sheets.length === 0) {
      throw new KordocError("XLSX \uD30C\uC77C\uC5D0 \uC2DC\uD2B8\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4");
    }
    logger.log({ level: "debug", event: "progress", message: "\uC2DC\uD2B8 \uBAA9\uB85D \uB85C\uB4DC", meta: { sheets: sheets.length } });

    const relsFile = zip.file("xl/_rels/workbook.xml.rels");
    const relsMap = relsFile ? parseRels(await relsFile.async("text")) : /* @__PURE__ */ new Map();

    let pageFilter = null;
    if (options?.pages) {
      const { parsePageRange: parsePageRange2 } = await import("./page-range-ALIRXAL5.js");
      pageFilter = parsePageRange2(options.pages, sheets.length);
    }

    const blocks = [];
    const processedSheets = Math.min(sheets.length, MAX_SHEETS);
    let totalCells = 0;

    for (let sheetIdx = 0; sheetIdx < processedSheets; sheetIdx++) {
      const sheetNumber = sheetIdx + 1;
      const selected = !pageFilter || pageFilter.has(sheetNumber);
      if (!selected) continue;

      const sheet = sheets[sheetIdx];
      options?.onProgress?.(sheetNumber, processedSheets);

      const sheetPath = resolveSheetPath(relsMap.get(sheet.rId), sheetIdx);
      const sheetFile = zip.file(sheetPath);
      if (!sheetFile) {
        warnings.push({
          page: sheetNumber,
          message: `\uC2DC\uD2B8 "${sheet.name}" \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4: ${sheetPath}`,
          code: "PARTIAL_PARSE"
        });
        continue;
      }

      try {
        const sheetXml = await sheetFile.async("text");
        const { grid, merges, maxRow, maxCol } = parseWorksheet(sheetXml, sharedStrings);

        // The cell budget is cumulative across sheets; once exceeded, this
        // sheet and all remaining ones are dropped.
        totalCells += maxRow * maxCol;
        if (totalCells > MAX_TOTAL_CELLS) {
          warnings.push({ message: `\uCD1D \uC140 \uC218 \uC81C\uD55C \uCD08\uACFC (${totalCells.toLocaleString()}\uC140), \uC774\uD6C4 \uC2DC\uD2B8 \uC0DD\uB7B5`, code: "PARTIAL_PARSE" });
          break;
        }

        blocks.push(...sheetToBlocks(sheet.name, grid, merges, maxRow, maxCol, sheetIdx));
        logger.log({
          level: "debug",
          stage: "convert",
          event: "progress",
          message: "\uC2DC\uD2B8 \uD30C\uC2F1 \uC644\uB8CC",
          meta: { sheet: sheet.name, index: sheetNumber, processedSheets }
        });
        lastProcessedSheet = sheetNumber;
      } catch (err) {
        // A failing sheet is downgraded to a warning so the other sheets still parse.
        warnings.push({
          page: sheetNumber,
          message: `\uC2DC\uD2B8 "${sheet.name}" \uD30C\uC2F1 \uC2E4\uD328: ${messageOf(err)}`,
          code: "PARTIAL_PARSE"
        });
        logger.log({
          level: "warn",
          stage: "convert",
          event: "progress",
          message: "\uC2DC\uD2B8 \uD30C\uC2F1 \uC2E4\uD328",
          meta: { sheet: sheet.name, index: sheetNumber },
          error: { message: messageOf(err), name: nameOf(err) }
        });
      }
    }

    const metadata = { pageCount: processedSheets };
    const coreFile = zip.file("docProps/core.xml");
    if (coreFile) {
      try {
        const coreDoc = parseXml(await coreFile.async("text"));
        const firstText = (tag) => {
          const matches = coreDoc.getElementsByTagName(tag);
          if (matches.length === 0) return void 0;
          return (matches[0].textContent ?? "").trim();
        };
        metadata.title = firstText("dc:title") || firstText("dcterms:title");
        metadata.author = firstText("dc:creator");
        metadata.description = firstText("dc:description");
        const created = firstText("dcterms:created");
        if (created) metadata.createdAt = created;
        const modified = firstText("dcterms:modified");
        if (modified) metadata.modifiedAt = modified;
      } catch {
        // Core properties are optional: an unreadable core.xml is ignored.
      }
    }

    const markdown = blocksToMarkdown(blocks);
    logger.log({
      level: "info",
      stage: "finalize",
      event: "done",
      message: "XLSX \uD30C\uC2F1 \uC644\uB8CC",
      meta: { blocks: blocks.length, warnings: warnings.length, pageCount: processedSheets }
    });
    return { markdown, blocks, metadata, warnings: warnings.length > 0 ? warnings : void 0 };
  } catch (err) {
    logger.log({
      level: "error",
      stage: "finalize",
      event: "error",
      message: "XLSX \uD30C\uC2F1 \uC2E4\uD328",
      meta: { lastProcessedSheet },
      error: { message: messageOf(err), name: nameOf(err), stack: err instanceof Error ? err.stack : void 0 }
    });
    throw err;
  }
}
|
|
6708
7020
|
|
|
6709
7021
|
// src/docx/parser.ts
|
|
@@ -7070,95 +7382,120 @@ async function extractImages(zip, rels, doc) {
|
|
|
7070
7382
|
return { blocks, images };
|
|
7071
7383
|
}
|
|
7072
7384
|
/**
 * Parses a DOCX document into markdown, blocks, metadata, outline and images.
 *
 * Steps: size pre-check, open the zip (or reuse `existingZip`), read the
 * relationships, then styles/numbering/footnotes on a best-effort basis, walk
 * the direct children of the document body converting paragraphs and tables
 * into blocks, extract images and read core properties.
 *
 * @param {ArrayBuffer} buffer - Raw DOCX bytes (its `byteLength` is logged).
 * @param {object} [options] - Accepted for signature parity; not read by this function.
 * @param {object} [existingZip] - Already-loaded zip to reuse instead of loading `buffer`.
 * @returns {Promise<{markdown: string, blocks: Array, metadata: object, outline: (Array|undefined), warnings: (Array|undefined), images: (Array|undefined)}>}
 * @throws {KordocError} When word/document.xml or the body element is missing.
 *   Any other error is logged and rethrown.
 */
async function parseDocxDocument(buffer, options, existingZip) {
  const logger = createLoggerFromEnv().child({ component: "docx/parser.ts", stage: "detect" });
  logger.log({ level: "info", event: "start", message: "DOCX \uD30C\uC2F1 \uC2DC\uC791", meta: { size: buffer.byteLength } });

  // 1-based index of the last body child visited as an element; reported if parsing fails.
  let lastProcessedNode = 0;

  try {
    precheckZipSize(buffer, MAX_DECOMPRESS_SIZE4);
    const zip = existingZip ?? await JSZip4.loadAsync(buffer);
    const warnings = [];

    const docFile = zip.file("word/document.xml");
    if (!docFile) {
      throw new KordocError("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 DOCX \uD30C\uC77C: word/document.xml\uC774 \uC5C6\uC2B5\uB2C8\uB2E4");
    }

    // Relationship parsing is NOT best-effort: a failure here propagates.
    const relsFile = zip.file("word/_rels/document.xml.rels");
    const rels = relsFile ? parseRels2(await relsFile.async("text")) : /* @__PURE__ */ new Map();

    // Optional parts: a missing or unparseable part yields an empty map.
    const loadOptionalPart = async (path, parsePart) => {
      const entry = zip.file(path);
      if (!entry) return /* @__PURE__ */ new Map();
      try {
        return parsePart(await entry.async("text"));
      } catch {
        return /* @__PURE__ */ new Map();
      }
    };
    const styles = await loadOptionalPart("word/styles.xml", parseStyles);
    const numbering = await loadOptionalPart("word/numbering.xml", parseNumbering);
    const footnotes = await loadOptionalPart("word/footnotes.xml", parseFootnotes);

    const doc = parseXml2(await docFile.async("text"));
    const body = findElements(doc, "body");
    if (body.length === 0) {
      throw new KordocError("DOCX \uBCF8\uBB38(w:body)\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
    }

    const blocks = [];
    const children = body[0].childNodes;
    for (let idx = 0; idx < children.length; idx++) {
      const el = children[idx];
      // nodeType 1 is an element node; anything else is skipped.
      if (el.nodeType !== 1) continue;

      const localName = el.localName ?? el.tagName?.split(":").pop();
      let block;
      switch (localName) {
        case "p":
          block = parseParagraph(el, styles, numbering, footnotes, rels);
          break;
        case "tbl":
          block = parseTable(el, styles, numbering, footnotes, rels);
          break;
        default:
          block = void 0;
      }
      if (block) blocks.push(block);
      lastProcessedNode = idx + 1;
    }
    logger.log({ level: "debug", stage: "convert", event: "progress", message: "\uBCF8\uBB38 \uBE14\uB85D \uD30C\uC2F1 \uC644\uB8CC", meta: { blocks: blocks.length } });

    // The image blocks are only counted for the log; the returned `blocks` do not include them.
    const { blocks: imgBlocks, images } = await extractImages(zip, rels, doc);
    logger.log({ level: "debug", stage: "convert", event: "progress", message: "\uC774\uBBF8\uC9C0 \uCD94\uCD9C \uC644\uB8CC", meta: { imageBlocks: imgBlocks.length, images: images.length } });

    const metadata = {};
    const coreFile = zip.file("docProps/core.xml");
    if (coreFile) {
      try {
        const coreDoc = parseXml2(await coreFile.async("text"));
        const firstText = (tag) => {
          const matches = coreDoc.getElementsByTagName(tag);
          if (matches.length === 0) return void 0;
          return (matches[0].textContent ?? "").trim();
        };
        metadata.title = firstText("dc:title") || firstText("dcterms:title");
        metadata.author = firstText("dc:creator");
        metadata.description = firstText("dc:description");
        const created = firstText("dcterms:created");
        if (created) metadata.createdAt = created;
        const modified = firstText("dcterms:modified");
        if (modified) metadata.modifiedAt = modified;
      } catch {
        // Core properties are optional: an unreadable core.xml is ignored.
      }
    }

    // Headings without an explicit level default to 2; missing text becomes "".
    const outline = [];
    for (const candidate of blocks) {
      if (candidate.type !== "heading") continue;
      outline.push({ level: candidate.level ?? 2, text: candidate.text ?? "" });
    }

    const markdown = blocksToMarkdown(blocks);
    logger.log({
      level: "info",
      stage: "finalize",
      event: "done",
      message: "DOCX \uD30C\uC2F1 \uC644\uB8CC",
      meta: { blocks: blocks.length, warnings: warnings.length, outline: outline.length, images: images.length }
    });

    return {
      markdown,
      blocks,
      metadata,
      outline: outline.length > 0 ? outline : void 0,
      warnings: warnings.length > 0 ? warnings : void 0,
      images: images.length > 0 ? images : void 0
    };
  } catch (err) {
    const isError = err instanceof Error;
    logger.log({
      level: "error",
      stage: "finalize",
      event: "error",
      message: "DOCX \uD30C\uC2F1 \uC2E4\uD328",
      meta: { lastProcessedNode },
      error: { message: isError ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: isError ? err.name : "Error", stack: isError ? err.stack : void 0 }
    });
    throw err;
  }
}
|
|
7163
7500
|
|
|
7164
7501
|
// src/form/recognize.ts
|
|
@@ -9487,8 +9824,22 @@ async function markdownToXlsx(markdown, options) {
|
|
|
9487
9824
|
return buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength);
|
|
9488
9825
|
}
|
|
9489
9826
|
|
|
9827
|
+
// src/pipeline/unified-ocr.ts
|
|
9828
|
+
import libre from "libreoffice-convert";
|
|
9829
|
+
// Alias for the `convert` member of the default export of "libreoffice-convert".
var libreConvert = libre.convert;
|
|
9830
|
+
// Korean instruction text for a non-destructive Markdown proofreading pass.
// Line by line it says: proofread the Markdown below non-destructively /
// "Rules:" / do not add, delete or guess facts / do not change numbers, units
// or proper nouns / correct only typos, spacing, line breaks and Markdown
// structure / output only the Markdown body.
// Six lines joined by "\n", with no trailing newline.
var PROOFREAD_PROMPT =
  "\uC544\uB798 Markdown\uC744 \uBE44\uD30C\uAD34 \uAD50\uC815\uB9CC \uC218\uD589\uD558\uC138\uC694." + "\n" +
  "\uADDC\uCE59:" + "\n" +
  "- \uC0AC\uC2E4 \uCD94\uAC00/\uC0AD\uC81C/\uCD94\uCE21 \uAE08\uC9C0" + "\n" +
  "- \uC22B\uC790, \uB2E8\uC704, \uACE0\uC720\uBA85\uC0AC \uBCC0\uACBD \uAE08\uC9C0" + "\n" +
  "- \uC624\uD0C8\uC790, \uB744\uC5B4\uC4F0\uAE30, \uC904\uBC14\uAFC8, Markdown \uAD6C\uC870\uB9CC \uAD50\uC815" + "\n" +
  "- \uACB0\uACFC\uB294 Markdown \uBCF8\uBB38\uB9CC \uCD9C\uB825";
|
|
9838
|
+
|
|
9490
9839
|
// src/index.ts
|
|
9491
9840
|
async function parse2(input, options) {
|
|
9841
|
+
const logger = createLoggerFromEnv().withRun(generateRunId("parse")).child({ component: "index.ts", stage: "detect" });
|
|
9842
|
+
logger.log({ level: "info", event: "start", message: "parse \uD638\uCD9C \uC2DC\uC791" });
|
|
9492
9843
|
let buffer;
|
|
9493
9844
|
if (typeof input === "string") {
|
|
9494
9845
|
try {
|
|
@@ -9496,6 +9847,13 @@ async function parse2(input, options) {
|
|
|
9496
9847
|
buffer = toArrayBuffer(buf);
|
|
9497
9848
|
} catch (err) {
|
|
9498
9849
|
const msg = err instanceof Error && "code" in err && err.code === "ENOENT" ? `\uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4: ${input}` : `\uD30C\uC77C \uC77D\uAE30 \uC2E4\uD328: ${input}`;
|
|
9850
|
+
logger.log({
|
|
9851
|
+
level: "error",
|
|
9852
|
+
stage: "detect",
|
|
9853
|
+
event: "error",
|
|
9854
|
+
message: msg,
|
|
9855
|
+
error: { code: "PARSE_ERROR", message: msg, name: err instanceof Error ? err.name : "Error" }
|
|
9856
|
+
});
|
|
9499
9857
|
return { success: false, fileType: "unknown", error: msg, code: "PARSE_ERROR" };
|
|
9500
9858
|
}
|
|
9501
9859
|
} else if (Buffer.isBuffer(input)) {
|
|
@@ -9504,13 +9862,23 @@ async function parse2(input, options) {
|
|
|
9504
9862
|
buffer = input;
|
|
9505
9863
|
}
|
|
9506
9864
|
if (!buffer || buffer.byteLength === 0) {
|
|
9865
|
+
logger.log({ level: "error", stage: "detect", event: "error", message: "\uBE48 \uC785\uB825 \uBC84\uD37C", error: { code: "EMPTY_INPUT", message: "\uBE48 \uC785\uB825 \uBC84\uD37C", name: "KordocError" } });
|
|
9507
9866
|
return { success: false, fileType: "unknown", error: "\uBE48 \uBC84\uD37C\uC774\uAC70\uB098 \uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uC785\uB825\uC785\uB2C8\uB2E4.", code: "EMPTY_INPUT" };
|
|
9508
9867
|
}
|
|
9509
9868
|
const MAX_FILE_SIZE = 500 * 1024 * 1024;
|
|
9510
9869
|
if (buffer.byteLength > MAX_FILE_SIZE) {
|
|
9870
|
+
logger.log({
|
|
9871
|
+
level: "error",
|
|
9872
|
+
stage: "detect",
|
|
9873
|
+
event: "error",
|
|
9874
|
+
message: "\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC",
|
|
9875
|
+
meta: { size: buffer.byteLength },
|
|
9876
|
+
error: { code: "FILE_TOO_LARGE", message: "\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC", name: "KordocError" }
|
|
9877
|
+
});
|
|
9511
9878
|
return { success: false, fileType: "unknown", error: `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${(buffer.byteLength / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 500MB)`, code: "FILE_TOO_LARGE" };
|
|
9512
9879
|
}
|
|
9513
9880
|
const format = detectFormat(buffer);
|
|
9881
|
+
logger.log({ level: "info", event: "done", message: "\uD3EC\uB9F7 \uAC10\uC9C0 \uC644\uB8CC", meta: { format } });
|
|
9514
9882
|
switch (format) {
|
|
9515
9883
|
case "hwpx": {
|
|
9516
9884
|
const { format: zipFormat, zip } = await detectZipFormat(buffer);
|
|
@@ -9588,7 +9956,8 @@ async function parseHwpx(buffer, options, zip) {
|
|
|
9588
9956
|
const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options, zip);
|
|
9589
9957
|
return { success: true, fileType: "hwpx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
|
|
9590
9958
|
} catch (err) {
|
|
9591
|
-
|
|
9959
|
+
const normalized = normalizeKordocError(err, "HWPX \uD30C\uC2F1 \uC2E4\uD328", "finalize");
|
|
9960
|
+
return { success: false, fileType: "hwpx", error: normalized.message, code: normalized.code ?? classifyError(normalized) };
|
|
9592
9961
|
}
|
|
9593
9962
|
}
|
|
9594
9963
|
async function parseHwp(buffer, options) {
|
|
@@ -9596,7 +9965,8 @@ async function parseHwp(buffer, options) {
|
|
|
9596
9965
|
const { markdown, blocks, metadata, outline, warnings, images } = parseHwp5Document(Buffer.from(buffer), options);
|
|
9597
9966
|
return { success: true, fileType: "hwp", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
|
|
9598
9967
|
} catch (err) {
|
|
9599
|
-
|
|
9968
|
+
const normalized = normalizeKordocError(err, "HWP \uD30C\uC2F1 \uC2E4\uD328", "finalize");
|
|
9969
|
+
return { success: false, fileType: "hwp", error: normalized.message, code: normalized.code ?? classifyError(normalized) };
|
|
9600
9970
|
}
|
|
9601
9971
|
}
|
|
9602
9972
|
async function parsePdf(buffer, options) {
|
|
@@ -9604,8 +9974,15 @@ async function parsePdf(buffer, options) {
|
|
|
9604
9974
|
const { markdown, blocks, metadata, outline, warnings, isImageBased } = await parsePdfDocument(buffer, options);
|
|
9605
9975
|
return { success: true, fileType: "pdf", markdown, blocks, metadata, outline, warnings, isImageBased };
|
|
9606
9976
|
} catch (err) {
|
|
9977
|
+
const normalized = normalizeKordocError(err, "PDF \uD30C\uC2F1 \uC2E4\uD328", "finalize");
|
|
9607
9978
|
const isImageBased = err instanceof Error && "isImageBased" in err ? true : void 0;
|
|
9608
|
-
return {
|
|
9979
|
+
return {
|
|
9980
|
+
success: false,
|
|
9981
|
+
fileType: "pdf",
|
|
9982
|
+
error: normalized.message,
|
|
9983
|
+
code: normalized.code ?? classifyError(normalized),
|
|
9984
|
+
isImageBased
|
|
9985
|
+
};
|
|
9609
9986
|
}
|
|
9610
9987
|
}
|
|
9611
9988
|
async function parseXlsx(buffer, options, zip) {
|
|
@@ -9613,7 +9990,8 @@ async function parseXlsx(buffer, options, zip) {
|
|
|
9613
9990
|
const { markdown, blocks, metadata, warnings } = await parseXlsxDocument(buffer, options, zip);
|
|
9614
9991
|
return { success: true, fileType: "xlsx", markdown, blocks, metadata, warnings };
|
|
9615
9992
|
} catch (err) {
|
|
9616
|
-
|
|
9993
|
+
const normalized = normalizeKordocError(err, "XLSX \uD30C\uC2F1 \uC2E4\uD328", "finalize");
|
|
9994
|
+
return { success: false, fileType: "xlsx", error: normalized.message, code: normalized.code ?? classifyError(normalized) };
|
|
9617
9995
|
}
|
|
9618
9996
|
}
|
|
9619
9997
|
async function parseDocx(buffer, options, zip) {
|
|
@@ -9621,7 +9999,8 @@ async function parseDocx(buffer, options, zip) {
|
|
|
9621
9999
|
const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options, zip);
|
|
9622
10000
|
return { success: true, fileType: "docx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
|
|
9623
10001
|
} catch (err) {
|
|
9624
|
-
|
|
10002
|
+
const normalized = normalizeKordocError(err, "DOCX \uD30C\uC2F1 \uC2E4\uD328", "finalize");
|
|
10003
|
+
return { success: false, fileType: "docx", error: normalized.message, code: normalized.code ?? classifyError(normalized) };
|
|
9625
10004
|
}
|
|
9626
10005
|
}
|
|
9627
10006
|
|
|
@@ -9813,4 +10192,4 @@ export {
|
|
|
9813
10192
|
cfb/cfb.js:
|
|
9814
10193
|
(*! crc32.js (C) 2014-present SheetJS -- http://sheetjs.com *)
|
|
9815
10194
|
*/
|
|
9816
|
-
//# sourceMappingURL=chunk-
|
|
10195
|
+
//# sourceMappingURL=chunk-KJEZPVEK.js.map
|