@clazic/kordoc 2.5.1 → 2.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +29 -1
- package/dist/batch-provider-XRF6F26E.js +234 -0
- package/dist/batch-provider-XRF6F26E.js.map +1 -0
- package/dist/chunk-S7BHLD2V.js +200 -0
- package/dist/{chunk-Y4WFKJ5P.js.map → chunk-S7BHLD2V.js.map} +1 -1
- package/dist/{chunk-IJGNPAK2.js → chunk-TND4YFBV.js} +2 -2
- package/dist/{chunk-QG6BYZMR.js → chunk-TS3F57LY.js} +160 -8
- package/dist/chunk-TS3F57LY.js.map +1 -0
- package/dist/cli.js +53 -6
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +420 -145
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +71 -2
- package/dist/index.d.ts +71 -2
- package/dist/index.js +407 -135
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +44 -3
- package/dist/mcp.js.map +1 -1
- package/dist/{resolve-XWYJYKKH.js → resolve-ZSUEJK3E.js} +4 -4
- package/dist/{utils-RBXHHCLI.js → utils-F66K7PXH.js} +2 -2
- package/dist/{watch-5CCMTZ7F.js → watch-2S5ULHAM.js} +4 -4
- package/package.json +1 -1
- package/dist/batch-provider-5BFJRKAZ.js +0 -190
- package/dist/batch-provider-5BFJRKAZ.js.map +0 -1
- package/dist/chunk-QG6BYZMR.js.map +0 -1
- package/dist/chunk-Y4WFKJ5P.js +0 -167
- /package/dist/{chunk-IJGNPAK2.js.map → chunk-TND4YFBV.js.map} +0 -0
- /package/dist/{resolve-XWYJYKKH.js.map → resolve-ZSUEJK3E.js.map} +0 -0
- /package/dist/{utils-RBXHHCLI.js.map → utils-F66K7PXH.js.map} +0 -0
- /package/dist/{watch-5CCMTZ7F.js.map → watch-2S5ULHAM.js.map} +0 -0
package/dist/index.cjs
CHANGED
|
@@ -33,6 +33,118 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
|
|
|
33
33
|
));
|
|
34
34
|
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
35
35
|
|
|
36
|
+
// src/utils.ts
|
|
37
|
+
var utils_exports = {};
|
|
38
|
+
__export(utils_exports, {
|
|
39
|
+
KordocError: () => KordocError,
|
|
40
|
+
VERSION: () => VERSION,
|
|
41
|
+
classifyError: () => classifyError,
|
|
42
|
+
isPathTraversal: () => isPathTraversal,
|
|
43
|
+
normalizeKordocError: () => normalizeKordocError,
|
|
44
|
+
precheckZipSize: () => precheckZipSize,
|
|
45
|
+
sanitizeError: () => sanitizeError,
|
|
46
|
+
sanitizeHref: () => sanitizeHref,
|
|
47
|
+
toArrayBuffer: () => toArrayBuffer
|
|
48
|
+
});
|
|
49
|
+
/**
 * Return an ArrayBuffer containing exactly the bytes visible through `buf`.
 *
 * When the view starts at offset 0 and spans its whole backing buffer, that
 * backing buffer is returned as-is (no copy). Otherwise the viewed byte range
 * is copied out with `slice`.
 *
 * @param buf - a view exposing `buffer`, `byteOffset` and `byteLength`
 * @returns the backing buffer itself, or a copy of the viewed range
 */
function toArrayBuffer(buf) {
  const { byteOffset: start, byteLength: size } = buf;
  const backing = buf.buffer;
  const coversWholeBuffer = start === 0 && size === backing.byteLength;
  return coversWholeBuffer ? backing : backing.slice(start, start + size);
}
|
|
55
|
+
/**
 * Produce a message that is safe to show to callers.
 *
 * Only messages carried by a KordocError are passed through; every other
 * value is replaced by a fixed generic Korean message ("An error occurred
 * while processing the document").
 *
 * @param err - any thrown value
 * @returns the KordocError message, or the generic fallback text
 */
function sanitizeError(err) {
  const genericMessage = "\uBB38\uC11C \uCC98\uB9AC \uC911 \uC624\uB958\uAC00 \uBC1C\uC0DD\uD588\uC2B5\uB2C8\uB2E4";
  return err instanceof KordocError ? err.message : genericMessage;
}
|
|
59
|
+
/**
 * Decide whether an archive entry name looks like a path-traversal attempt.
 *
 * A name is rejected when it contains a NUL character, contains ".." anywhere,
 * is absolute (leading slash or backslash), or starts with a drive letter
 * followed by a colon.
 *
 * @param name - entry name to inspect
 * @returns true when the name must be treated as unsafe
 */
function isPathTraversal(name) {
  if (name.includes("\0")) {
    return true;
  }
  // Compare on forward slashes so Windows-style separators are covered too.
  const unixStyle = name.replace(/\\/g, "/");
  if (unixStyle.includes("..")) {
    return true;
  }
  if (unixStyle.startsWith("/")) {
    return true;
  }
  return /^[A-Za-z]:/.test(unixStyle);
}
|
|
64
|
+
/**
 * Cheap ZIP-bomb precheck: read the declared sizes from the central directory
 * without inflating anything.
 *
 * The function locates the End Of Central Directory (EOCD) record by scanning
 * backwards from the end of the data, reads the entry count, then walks the
 * central directory summing each entry's declared uncompressed size.
 *
 * `buffer` may be an ArrayBuffer or any ArrayBuffer view (Buffer, Uint8Array,
 * DataView). Previously a view made `new DataView(buffer)` throw a TypeError
 * that the catch-all below swallowed, so the precheck silently reported zeros.
 *
 * @param buffer - ZIP bytes (ArrayBuffer or ArrayBuffer view)
 * @param maxUncompressedSize - limit for the summed declared sizes, in bytes
 * @param maxEntries - limit for the number of entries declared in the EOCD
 * @returns `{ totalUncompressed, entryCount }`; both are 0 when no EOCD is
 *   found or the data cannot be read, and `totalUncompressed` is 0 when the
 *   central directory range lies outside the data
 * @throws KordocError when the entry count or the summed size exceeds its limit
 */
function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
  const EOCD_SIGNATURE = 0x06054b50; // 101010256
  const CENTRAL_ENTRY_SIGNATURE = 0x02014b50; // 33639248
  try {
    // Respect the view's own window so pooled / offset Buffers are read correctly.
    const data = ArrayBuffer.isView(buffer)
      ? new DataView(buffer.buffer, buffer.byteOffset, buffer.byteLength)
      : new DataView(buffer);
    const len = data.byteLength;

    // The EOCD record is 22 bytes plus a comment of at most 65535 bytes,
    // so only the last 65557 bytes need to be scanned.
    let eocdOffset = -1;
    for (let i = len - 22; i >= Math.max(0, len - 65557); i--) {
      if (data.getUint32(i, true) === EOCD_SIGNATURE) {
        eocdOffset = i;
        break;
      }
    }
    if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 };

    const entryCount = data.getUint16(eocdOffset + 10, true);
    if (entryCount > maxEntries) {
      throw new KordocError(`ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC: ${entryCount} (\uCD5C\uB300 ${maxEntries})`);
    }

    const cdSize = data.getUint32(eocdOffset + 12, true);
    const cdOffset = data.getUint32(eocdOffset + 16, true);
    if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount };

    let totalUncompressed = 0;
    let pos = cdOffset;
    for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {
      // Stop at the first record that is not a central directory file header.
      if (data.getUint32(pos, true) !== CENTRAL_ENTRY_SIGNATURE) break;
      totalUncompressed += data.getUint32(pos + 24, true);
      const nameLen = data.getUint16(pos + 28, true);
      const extraLen = data.getUint16(pos + 30, true);
      const commentLen = data.getUint16(pos + 32, true);
      // Fixed 46-byte header followed by the three variable-length fields.
      pos += 46 + nameLen + extraLen + commentLen;
    }

    if (totalUncompressed > maxUncompressedSize) {
      throw new KordocError(`ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 ${maxUncompressedSize / 1024 / 1024}MB)`);
    }
    return { totalUncompressed, entryCount };
  } catch (err) {
    // Limit violations must reach the caller; anything else (e.g. out-of-range
    // reads on malformed data) is treated as "could not precheck".
    if (err instanceof KordocError) throw err;
    return { totalUncompressed: 0, entryCount: 0 };
  }
}
|
|
102
|
+
/**
 * Validate a link target against the SAFE_HREF_RE allow-list.
 *
 * The value is trimmed first; an empty result or one that does not match
 * SAFE_HREF_RE is rejected.
 *
 * @param href - raw link target
 * @returns the trimmed href when allowed, otherwise null
 */
function sanitizeHref(href) {
  const candidate = href.trim();
  const allowed = candidate !== "" && SAFE_HREF_RE.test(candidate);
  return allowed ? candidate : null;
}
|
|
107
|
+
/**
 * Map an error to a coarse error code by looking for known substrings in its
 * message. Rules are evaluated in order and the first match wins.
 *
 * @param err - any thrown value
 * @returns one of "ENCRYPTED", "DRM_PROTECTED", "ZIP_BOMB", "DECOMPRESSION_BOMB",
 *   "IMAGE_BASED_PDF", "NO_SECTIONS", "CORRUPTED" or "PARSE_ERROR" (the default,
 *   also used for values that are not Error instances)
 */
function classifyError(err) {
  if (!(err instanceof Error)) return "PARSE_ERROR";
  const text = err.message;
  const hasAny = (...needles) => needles.some((needle) => text.includes(needle));

  // Order matters: the ZIP-specific phrases must be tested before the more
  // general "bomb" / "size exceeded" phrases of DECOMPRESSION_BOMB.
  const rules = [
    // "encryption"
    ["ENCRYPTED", () => hasAny("\uC554\uD638\uD654")],
    ["DRM_PROTECTED", () => hasAny("DRM")],
    // "ZIP uncompressed size exceeded" / "ZIP entry count exceeded"
    ["ZIP_BOMB", () => hasAny("ZIP bomb", "ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC", "ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")],
    // "size exceeded" / "decompression"
    ["DECOMPRESSION_BOMB", () => hasAny("bomb", "\uD06C\uAE30 \uCD08\uACFC", "\uC555\uCD95 \uD574\uC81C")],
    // "image-based"
    ["IMAGE_BASED_PDF", () => hasAny("\uC774\uBBF8\uC9C0 \uAE30\uBC18")],
    // "section" together with "cannot find" or "none"
    ["NO_SECTIONS", () => hasAny("\uC139\uC158") && hasAny("\uCC3E\uC744 \uC218 \uC5C6", "\uC5C6\uC74C")],
    // "signature" / "cannot recover"
    ["CORRUPTED", () => hasAny("\uC2DC\uADF8\uB2C8\uCC98", "\uBCF5\uAD6C\uD560 \uC218 \uC5C6")],
  ];

  const hit = rules.find(([, matches]) => matches());
  return hit ? hit[0] : "PARSE_ERROR";
}
|
|
119
|
+
/**
 * Coerce any thrown value into a KordocError carrying a `code` and a `stage`.
 *
 * - An existing KordocError is returned as the same object; missing (falsy)
 *   `stage` / `code` fields are filled in from the arguments.
 * - Any other Error is wrapped in a new KordocError that keeps its message
 *   (or `fallbackMessage` when the message is empty) and takes its code from
 *   classifyError.
 * - Non-Error values become a KordocError with `fallbackMessage` and
 *   `fallbackCode`.
 *
 * @param err - any thrown value
 * @param fallbackMessage - message used when `err` provides none
 * @param stage - stage label to attach (default "unknown")
 * @param fallbackCode - code used when none can be derived (default "PARSE_ERROR")
 * @returns a KordocError with `code` and `stage` set
 */
function normalizeKordocError(err, fallbackMessage, stage = "unknown", fallbackCode = "PARSE_ERROR") {
  if (err instanceof KordocError) {
    // Fill gaps only; never overwrite values already present on the error.
    err.stage ||= stage;
    err.code ||= fallbackCode;
    return err;
  }

  const isError = err instanceof Error;
  const message = (isError ? err.message : fallbackMessage) || fallbackMessage;
  const code = isError ? classifyError(err) : fallbackCode;
  return new KordocError(message, { code, stage });
}
|
|
129
|
+
// Bindings for src/utils.ts. They stay undefined until init_utils() has run.
var VERSION, KordocError, SAFE_HREF_RE;

// Lazy initialiser for src/utils.ts, wrapped by the bundler's __esm helper.
var init_utils = __esm({
  "src/utils.ts"() {
    "use strict";
    // Version string baked in at build time; the "0.0.0-dev" branch is dead code
    // left over from the build-time substitution.
    VERSION = true ? "2.5.2" : "0.0.0-dev";

    // Library error type: a regular Error plus optional `code` and `stage` fields.
    KordocError = class extends Error {
      code;
      stage;
      /**
       * @param message - error message
       * @param opts - optional `{ code, stage }` copied onto the instance
       */
      constructor(message, { code, stage } = {}) {
        super(message);
        this.name = "KordocError";
        this.code = code;
        this.stage = stage;
      }
    };

    // Allow-list of link targets: http(s):, mailto:, tel: and in-page "#" anchors
    // (case-insensitive).
    SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
  }
});
|
|
147
|
+
|
|
36
148
|
// src/page-range.ts
|
|
37
149
|
var page_range_exports = {};
|
|
38
150
|
__export(page_range_exports, {
|
|
@@ -2394,15 +2506,48 @@ var init_cli_provider = __esm({
|
|
|
2394
2506
|
import_fs2 = require("fs");
|
|
2395
2507
|
import_path2 = require("path");
|
|
2396
2508
|
import_os = require("os");
|
|
2397
|
-
OCR_PROMPT = `\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \
|
|
2398
|
-
|
|
2399
|
-
|
|
2400
|
-
- \
|
|
2401
|
-
- \uD5E4\uB529\uC740 \
|
|
2402
|
-
- \uB9AC\uC2A4\uD2B8\uB294
|
|
2403
|
-
- \uC774\uBBF8\uC9C0
|
|
2404
|
-
- \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\
|
|
2405
|
-
|
|
2509
|
+
OCR_PROMPT = `\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD45C\uB97C \uCD94\uCD9C\uD558\uC5EC \uC21C\uC218 Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uACE0, \uBA85\uBC31\uD55C OCR \uC624\uC778\uC2DD\uACFC \uD45C \uAD6C\uC870 \uAE68\uC9D0\uB9CC \uAD50\uC815\uD558\uC5EC \uCD5C\uC885 \uACB0\uACFC\uB97C \uCD9C\uB825\uD558\uC138\uC694.
|
|
2510
|
+
|
|
2511
|
+
[\uAE30\uBCF8 \uCD94\uCD9C \uADDC\uCE59]
|
|
2512
|
+
- \uBCF8\uBB38, \uD45C, \uC81C\uBAA9, \uB9AC\uC2A4\uD2B8, \uCEA1\uC158\uC744 \uC6D0\uBB38 \uAD6C\uC870 \uADF8\uB300\uB85C Markdown\uC73C\uB85C \uBCC0\uD658
|
|
2513
|
+
- \uD5E4\uB529\uC740 \uC2DC\uAC01\uC801 \uD06C\uAE30\xB7\uAD75\uAE30\uC5D0 \uB530\uB77C # ~ ###### \uC0AC\uC6A9
|
|
2514
|
+
- \uB9AC\uC2A4\uD2B8\uB294 -, 1. \uC0AC\uC6A9 (\uC6D0\uBB38 \uBC88\uD638 \uCCB4\uACC4\uAC00 \u2460, \uAC00., 1) \uB4F1\uC774\uBA74 \uADF8 \uD45C\uAE30 \uC720\uC9C0)
|
|
2515
|
+
- \uC774\uBBF8\uC9C0\xB7\uB3C4\uD615\xB7\uB85C\uACE0\xB7\uD398\uC774\uC9C0\uBC88\uD638\xB7\uBA38\uB9AC\uAE00/\uBC14\uB2E5\uAE00\uC740 \uBB34\uC2DC
|
|
2516
|
+
- \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C(\uC88C\u2192\uC6B0, \uC704\u2192\uC544\uB798, \uB2E4\uB2E8\uC774\uBA74 \uB2E8\uBCC4\uB85C)\uB97C \uC720\uC9C0
|
|
2517
|
+
|
|
2518
|
+
[\uD45C \uCD94\uCD9C \uADDC\uCE59 \u2014 \uAC00\uC7A5 \uC911\uC694]
|
|
2519
|
+
- \uD45C\uB294 \uBC18\uB4DC\uC2DC Markdown \uD30C\uC774\uD504 \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9: \uD5E4\uB354 \uD589 + |---| \uAD6C\uBD84\uC120 + \uB370\uC774\uD130 \uD589
|
|
2520
|
+
- \uC2DC\uAC01\uC801\uC73C\uB85C \uAD75\uAC70\uB098 \uC74C\uC601(\uD68C\uC0C9/\uC0C9\uC0C1) \uBC30\uACBD\uC774 \uC788\uB294 \uD589\uC744 \uD5E4\uB354\uB85C \uC2DD\uBCC4. \uD5E4\uB354\uAC00 \uC5C6\uB294 \uD45C\uB77C\uB3C4 \uCCAB \uD589\uC744 \uD5E4\uB354\uB85C \uB450\uACE0 |---| \uAD6C\uBD84\uC120 \uCD94\uAC00
|
|
2521
|
+
- \uBAA8\uB4E0 \uD589\uC758 \uD30C\uC774\uD504(|) \uAC1C\uC218\uB97C \uD5E4\uB354\uC640 \uB3D9\uC77C\uD558\uAC8C \uB9DE\uCD9C \uAC83 \u2014 \uBE48 \uC140\uC740 \uACF5\uBC31\uC73C\uB85C \uCC44\uC6B0\uACE0 \uC808\uB300 \uC140\uC744 \uC0DD\uB7B5\uD558\uC9C0 \uB9D0 \uAC83
|
|
2522
|
+
- \uC140 \uC548\uC758 \uC904\uBC14\uAFC8\uC740 <br>\uB85C \uD45C\uAE30 (\uC2E4\uC81C \uAC1C\uD589 \uBB38\uC790 \uC0AC\uC6A9 \uAE08\uC9C0 \u2014 \uD45C\uAC00 \uAE68\uC9D0)
|
|
2523
|
+
- \uBCD1\uD569 \uC140 \uCC98\uB9AC:
|
|
2524
|
+
- \uAC00\uB85C \uBCD1\uD569(colspan): \uBCD1\uD569\uB41C \uC140 \uB0B4\uC6A9\uC744 \uCCAB \uCE78\uC5D0 \uC4F0\uACE0 \uB098\uBA38\uC9C0 \uCE78\uC740 \uBE48 \uCE78\uC73C\uB85C \uB458 \uAC83
|
|
2525
|
+
- \uC138\uB85C \uBCD1\uD569(rowspan): \uBCD1\uD569\uB41C \uBAA8\uB4E0 \uD589\uC758 \uD574\uB2F9 \uCE78\uC5D0 \uB3D9\uC77C\uD55C \uB0B4\uC6A9\uC744 \uBC18\uBCF5 \uAE30\uC7AC\uD560 \uAC83
|
|
2526
|
+
- 2\uB2E8 \uD5E4\uB354(\uD5E4\uB354\uAC00 \uB450 \uC904\uC778 \uACBD\uC6B0): \uC0C1\uC704/\uD558\uC704 \uD5E4\uB354\uB97C " / "\uB85C \uD569\uCCD0 \uB2E8\uC77C \uD5E4\uB354 \uD589\uC73C\uB85C \uB9CC\uB4E4 \uAC83 (\uC608: "2024 / 1\uBD84\uAE30")
|
|
2527
|
+
- \uD45C \uC704\xB7\uC544\uB798\uC758 \uCEA1\uC158(\uC608: "<\uD45C 1> ...", "[\uD45C 2-1]")\uC740 \uD45C \uBC14\uB85C \uC704\uC5D0 \uBCC4\uB3C4 \uC904\uB85C \uBCF4\uC874
|
|
2528
|
+
- \uD55C \uD398\uC774\uC9C0\uC5D0\uC11C \uD45C\uAC00 \uC911\uAC04\uC5D0 \uB04A\uC5B4\uC838 \uBCF4\uC774\uB354\uB77C\uB3C4, \uD5E4\uB354\uAC00 \uB3D9\uC77C\uD558\uBA74 \uD558\uB098\uC758 \uD45C\uB85C \uC774\uC5B4\uBD99\uC77C \uAC83
|
|
2529
|
+
- \uC88C\uCE21 \uCCAB \uCEEC\uB7FC\uC774 \uC138\uB85C\uC4F0\uAE30 \uB77C\uBCA8(\uAD6C\uBD84/\uBD84\uB958/\uC5F0\uB3C4 \uB4F1)\uC774\uB77C\uB3C4 \uC77C\uBC18 \uC140\uB85C \uBCC0\uD658\uD558\uC5EC \uB204\uB77D \uAE08\uC9C0
|
|
2530
|
+
|
|
2531
|
+
[OCR \uC624\uC778\uC2DD\xB7\uAE68\uC9D0 \uAD50\uC815 \u2014 \uD5C8\uC6A9 \uBC94\uC704\uB9CC]
|
|
2532
|
+
- \uC22B\uC790 \uCE78\uC5D0\uC11C 'O'\u2192'0', 'l/I'\u2192'1' \uB4F1 \uB9E5\uB77D\uC0C1 \uBA85\uBC31\uD55C \uC624\uC778\uC2DD\uB9CC \uAD50\uC815
|
|
2533
|
+
- \uB2E8\uC5B4 \uC911\uAC04\uC5D0 \uC798\uBABB \uC0BD\uC785\uB41C \uACF5\uBC31 \uC81C\uAC70 (\uC608: "\uC8FC\uAC70 \uC885 \uD569 \uACC4\uD68D" \u2192 "\uC8FC\uAC70\uC885\uD569\uACC4\uD68D")
|
|
2534
|
+
- \uC904\uBC14\uAFC8 \uC624\uB958\uB85C \uB04A\uAE34 \uBB38\uC7A5\uC740 \uC758\uBBF8 \uB2E8\uC704\uB85C \uBCD1\uD569 (\uB2E8, \uD45C \uC140 \uB0B4\uBD80\uB294 <br> \uC720\uC9C0)
|
|
2535
|
+
- \uD45C\uC5D0\uC11C \uD589 \uC5B4\uAE0B\uB0A8\uC774 \uC758\uC2EC\uB418\uBA74(\uCEEC\uB7FC \uC218 \uBD88\uC77C\uCE58) \uBE48 \uC140\uB85C \uD328\uB529\uD558\uC5EC \uCEEC\uB7FC \uC815\uD569\uC131\uC744 \uC6B0\uC120 \uD655\uBCF4
|
|
2536
|
+
|
|
2537
|
+
[\uC808\uB300 \uAE08\uC9C0 \uC0AC\uD56D]
|
|
2538
|
+
- \uBB38\uC7A5\xB7\uB2E8\uB77D\xB7\uD56D\uBAA9\xB7\uD45C \uD589/\uC5F4\uC744 \uCD94\uAC00\uD558\uAC70\uB098 \uC0AD\uC81C\uD558\uC9C0 \uB9D0 \uAC83
|
|
2539
|
+
- \uC22B\uC790, \uD37C\uC13C\uD2B8, \uB0A0\uC9DC, \uB2E8\uC704, \uAE08\uC561\uC744 \uC808\uB300 \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83
|
|
2540
|
+
- \uACE0\uC720\uBA85\uC0AC, \uAE30\uAD00\uBA85, \uBC95\uB839\uBA85, \uC9C0\uBA85, \uC778\uBA85\uC744 \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83
|
|
2541
|
+
- \uD45C\uC758 \uD5E4\uB354 \uD14D\uC2A4\uD2B8, \uCEA1\uC158, \uD589/\uC5F4 \uAC1C\uC218\uB97C \uC784\uC758\uB85C \uBC14\uAFB8\uC9C0 \uB9D0 \uAC83
|
|
2542
|
+
- \uCCB4\uD06C\uBC15\uC2A4(\u2611/\u2610/\u25A0/\u25A1), \uD2B9\uC218\uBB38\uC790(\u203B, \u2460\u2461\u2462, \u3260, \uAC00., \uB098.)\uB97C \uC784\uC758\uB85C \uBC14\uAFB8\uC9C0 \uB9D0 \uAC83
|
|
2543
|
+
- \uC6D0\uBB38\uC5D0 \uC5C6\uB294 \uB0B4\uC6A9\uC744 \uC694\uC57D\xB7\uBCF4\uC644\xB7\uCD94\uB860\uD558\uC9C0 \uB9D0 \uAC83
|
|
2544
|
+
- \`\`\`, \`\`\`markdown \uAC19\uC740 \uCF54\uB4DC\uD39C\uC2A4\uB85C \uAC10\uC2F8\uC9C0 \uB9D0 \uAC83
|
|
2545
|
+
- \uC124\uBA85\xB7\uC8FC\uC11D\xB7\uBA54\uD0C0 \uBB38\uC7A5 \uCD94\uAC00 \uAE08\uC9C0
|
|
2546
|
+
|
|
2547
|
+
[\uBD88\uD655\uC2E4\uD560 \uB54C]
|
|
2548
|
+
- \uAE00\uC790\uAC00 \uD750\uB9BF\uD558\uAC70\uB098 \uD310\uB3C5 \uBD88\uAC00\uD558\uBA74 \uBCF4\uC774\uB294 \uADF8\uB300\uB85C \uB450\uAC70\uB098, \uBD88\uAC00\uD53C\uD558\uBA74 \uD55C \uAE00\uC790\uB9CC ?\uB85C \uD45C\uC2DC
|
|
2549
|
+
- \uD45C \uAD6C\uC870\uAC00 \uBAA8\uD638\uD558\uBA74 \uCEEC\uB7FC \uC218 \uC77C\uCE58 + \uBE48 \uC140 \uD328\uB529 \uD615\uD0DC\uB85C \uCD9C\uB825
|
|
2550
|
+
- \uCD94\uCE21\uBCF4\uB2E4 \uC6D0\uBB38 \uBCF4\uC874\uC744 \uD56D\uC0C1 \uC6B0\uC120`;
|
|
2406
2551
|
_tempDir = null;
|
|
2407
2552
|
}
|
|
2408
2553
|
});
|
|
@@ -2591,7 +2736,51 @@ var init_batch_provider = __esm({
|
|
|
2591
2736
|
import_fs3 = require("fs");
|
|
2592
2737
|
import_path3 = require("path");
|
|
2593
2738
|
import_os2 = require("os");
|
|
2594
|
-
BATCH_OCR_PROMPT =
|
|
2739
|
+
BATCH_OCR_PROMPT = `\uB2E4\uC74C \uBB38\uC11C \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uB4E4\uC744 OCR\uD558\uC5EC \uC21C\uC218 Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uACE0, \uBA85\uBC31\uD55C OCR \uC624\uC778\uC2DD\uACFC \uD45C \uAD6C\uC870 \uAE68\uC9D0\uB9CC \uAD50\uC815\uD558\uC5EC \uCD5C\uC885 \uACB0\uACFC\uB97C \uCD9C\uB825\uD558\uC138\uC694.
|
|
2740
|
+
|
|
2741
|
+
[\uD398\uC774\uC9C0 \uAD6C\uBD84 \u2014 \uD544\uC218]
|
|
2742
|
+
- \uAC01 \uD398\uC774\uC9C0 \uACB0\uACFC \uC0AC\uC774\uC5D0 \uBC18\uB4DC\uC2DC \uC774 \uAD6C\uBD84\uC790\uB97C \uC0BD\uC785: <!-- PAGE_BREAK -->
|
|
2743
|
+
|
|
2744
|
+
[\uAE30\uBCF8 \uCD94\uCD9C \uADDC\uCE59]
|
|
2745
|
+
- \uBCF8\uBB38, \uD45C, \uC81C\uBAA9, \uB9AC\uC2A4\uD2B8, \uCEA1\uC158\uC744 \uC6D0\uBB38 \uAD6C\uC870 \uADF8\uB300\uB85C Markdown\uC73C\uB85C \uBCC0\uD658
|
|
2746
|
+
- \uD5E4\uB529\uC740 \uC2DC\uAC01\uC801 \uD06C\uAE30\xB7\uAD75\uAE30\uC5D0 \uB530\uB77C # ~ ###### \uC0AC\uC6A9
|
|
2747
|
+
- \uB9AC\uC2A4\uD2B8\uB294 -, 1. \uC0AC\uC6A9 (\uC6D0\uBB38 \uBC88\uD638 \uCCB4\uACC4\uAC00 \u2460, \uAC00., 1) \uB4F1\uC774\uBA74 \uADF8 \uD45C\uAE30 \uC720\uC9C0)
|
|
2748
|
+
- \uC774\uBBF8\uC9C0\xB7\uB3C4\uD615\xB7\uB85C\uACE0\xB7\uD398\uC774\uC9C0\uBC88\uD638\xB7\uBA38\uB9AC\uAE00/\uBC14\uB2E5\uAE00\uC740 \uBB34\uC2DC
|
|
2749
|
+
- \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C(\uC88C\u2192\uC6B0, \uC704\u2192\uC544\uB798, \uB2E4\uB2E8\uC774\uBA74 \uB2E8\uBCC4\uB85C)\uB97C \uC720\uC9C0
|
|
2750
|
+
|
|
2751
|
+
[\uD45C \uCD94\uCD9C \uADDC\uCE59 \u2014 \uAC00\uC7A5 \uC911\uC694]
|
|
2752
|
+
- \uD45C\uB294 \uBC18\uB4DC\uC2DC Markdown \uD30C\uC774\uD504 \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9: \uD5E4\uB354 \uD589 + |---| \uAD6C\uBD84\uC120 + \uB370\uC774\uD130 \uD589
|
|
2753
|
+
- \uC2DC\uAC01\uC801\uC73C\uB85C \uAD75\uAC70\uB098 \uC74C\uC601(\uD68C\uC0C9/\uC0C9\uC0C1) \uBC30\uACBD\uC774 \uC788\uB294 \uD589\uC744 \uD5E4\uB354\uB85C \uC2DD\uBCC4. \uD5E4\uB354\uAC00 \uC5C6\uB294 \uD45C\uB77C\uB3C4 \uCCAB \uD589\uC744 \uD5E4\uB354\uB85C \uB450\uACE0 |---| \uAD6C\uBD84\uC120 \uCD94\uAC00
|
|
2754
|
+
- \uBAA8\uB4E0 \uD589\uC758 \uD30C\uC774\uD504(|) \uAC1C\uC218\uB97C \uD5E4\uB354\uC640 \uB3D9\uC77C\uD558\uAC8C \uB9DE\uCD9C \uAC83 \u2014 \uBE48 \uC140\uC740 \uACF5\uBC31\uC73C\uB85C \uCC44\uC6B0\uACE0 \uC808\uB300 \uC140\uC744 \uC0DD\uB7B5\uD558\uC9C0 \uB9D0 \uAC83
|
|
2755
|
+
- \uC140 \uC548\uC758 \uC904\uBC14\uAFC8\uC740 <br>\uB85C \uD45C\uAE30 (\uC2E4\uC81C \uAC1C\uD589 \uBB38\uC790 \uC0AC\uC6A9 \uAE08\uC9C0 \u2014 \uD45C\uAC00 \uAE68\uC9D0)
|
|
2756
|
+
- \uBCD1\uD569 \uC140 \uCC98\uB9AC:
|
|
2757
|
+
- \uAC00\uB85C \uBCD1\uD569(colspan): \uBCD1\uD569\uB41C \uC140 \uB0B4\uC6A9\uC744 \uCCAB \uCE78\uC5D0 \uC4F0\uACE0 \uB098\uBA38\uC9C0 \uCE78\uC740 \uBE48 \uCE78\uC73C\uB85C \uB458 \uAC83
|
|
2758
|
+
- \uC138\uB85C \uBCD1\uD569(rowspan): \uBCD1\uD569\uB41C \uBAA8\uB4E0 \uD589\uC758 \uD574\uB2F9 \uCE78\uC5D0 \uB3D9\uC77C\uD55C \uB0B4\uC6A9\uC744 \uBC18\uBCF5 \uAE30\uC7AC\uD560 \uAC83
|
|
2759
|
+
- 2\uB2E8 \uD5E4\uB354(\uD5E4\uB354\uAC00 \uB450 \uC904\uC778 \uACBD\uC6B0): \uC0C1\uC704/\uD558\uC704 \uD5E4\uB354\uB97C " / "\uB85C \uD569\uCCD0 \uB2E8\uC77C \uD5E4\uB354 \uD589\uC73C\uB85C \uB9CC\uB4E4 \uAC83 (\uC608: "2024 / 1\uBD84\uAE30")
|
|
2760
|
+
- \uD45C \uC704\xB7\uC544\uB798\uC758 \uCEA1\uC158(\uC608: "<\uD45C 1> ...", "[\uD45C 2-1]")\uC740 \uD45C \uBC14\uB85C \uC704\uC5D0 \uBCC4\uB3C4 \uC904\uB85C \uBCF4\uC874
|
|
2761
|
+
- \uD55C \uD398\uC774\uC9C0\uC5D0\uC11C \uD45C\uAC00 \uC911\uAC04\uC5D0 \uB04A\uC5B4\uC838 \uBCF4\uC774\uB354\uB77C\uB3C4, \uD5E4\uB354\uAC00 \uB3D9\uC77C\uD558\uBA74 \uD558\uB098\uC758 \uD45C\uB85C \uC774\uC5B4\uBD99\uC77C \uAC83
|
|
2762
|
+
- \uC88C\uCE21 \uCCAB \uCEEC\uB7FC\uC774 \uC138\uB85C\uC4F0\uAE30 \uB77C\uBCA8(\uAD6C\uBD84/\uBD84\uB958/\uC5F0\uB3C4 \uB4F1)\uC774\uB77C\uB3C4 \uC77C\uBC18 \uC140\uB85C \uBCC0\uD658\uD558\uC5EC \uB204\uB77D \uAE08\uC9C0
|
|
2763
|
+
|
|
2764
|
+
[OCR \uC624\uC778\uC2DD\xB7\uAE68\uC9D0 \uAD50\uC815 \u2014 \uD5C8\uC6A9 \uBC94\uC704\uB9CC]
|
|
2765
|
+
- \uC22B\uC790 \uCE78\uC5D0\uC11C 'O'\u2192'0', 'l/I'\u2192'1' \uB4F1 \uB9E5\uB77D\uC0C1 \uBA85\uBC31\uD55C \uC624\uC778\uC2DD\uB9CC \uAD50\uC815
|
|
2766
|
+
- \uB2E8\uC5B4 \uC911\uAC04\uC5D0 \uC798\uBABB \uC0BD\uC785\uB41C \uACF5\uBC31 \uC81C\uAC70 (\uC608: "\uC8FC\uAC70 \uC885 \uD569 \uACC4\uD68D" \u2192 "\uC8FC\uAC70\uC885\uD569\uACC4\uD68D")
|
|
2767
|
+
- \uC904\uBC14\uAFC8 \uC624\uB958\uB85C \uB04A\uAE34 \uBB38\uC7A5\uC740 \uC758\uBBF8 \uB2E8\uC704\uB85C \uBCD1\uD569 (\uB2E8, \uD45C \uC140 \uB0B4\uBD80\uB294 <br> \uC720\uC9C0)
|
|
2768
|
+
- \uD45C\uC5D0\uC11C \uD589 \uC5B4\uAE0B\uB0A8\uC774 \uC758\uC2EC\uB418\uBA74(\uCEEC\uB7FC \uC218 \uBD88\uC77C\uCE58) \uBE48 \uC140\uB85C \uD328\uB529\uD558\uC5EC \uCEEC\uB7FC \uC815\uD569\uC131\uC744 \uC6B0\uC120 \uD655\uBCF4
|
|
2769
|
+
|
|
2770
|
+
[\uC808\uB300 \uAE08\uC9C0 \uC0AC\uD56D]
|
|
2771
|
+
- \uBB38\uC7A5\xB7\uB2E8\uB77D\xB7\uD56D\uBAA9\xB7\uD45C \uD589/\uC5F4\uC744 \uCD94\uAC00\uD558\uAC70\uB098 \uC0AD\uC81C\uD558\uC9C0 \uB9D0 \uAC83
|
|
2772
|
+
- \uC22B\uC790, \uD37C\uC13C\uD2B8, \uB0A0\uC9DC, \uB2E8\uC704, \uAE08\uC561\uC744 \uC808\uB300 \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83
|
|
2773
|
+
- \uACE0\uC720\uBA85\uC0AC, \uAE30\uAD00\uBA85, \uBC95\uB839\uBA85, \uC9C0\uBA85, \uC778\uBA85\uC744 \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83
|
|
2774
|
+
- \uD45C\uC758 \uD5E4\uB354 \uD14D\uC2A4\uD2B8, \uCEA1\uC158, \uD589/\uC5F4 \uAC1C\uC218\uB97C \uC784\uC758\uB85C \uBC14\uAFB8\uC9C0 \uB9D0 \uAC83
|
|
2775
|
+
- \uCCB4\uD06C\uBC15\uC2A4(\u2611/\u2610/\u25A0/\u25A1), \uD2B9\uC218\uBB38\uC790(\u203B, \u2460\u2461\u2462, \u3260, \uAC00., \uB098.)\uB97C \uC784\uC758\uB85C \uBC14\uAFB8\uC9C0 \uB9D0 \uAC83
|
|
2776
|
+
- \uC6D0\uBB38\uC5D0 \uC5C6\uB294 \uB0B4\uC6A9\uC744 \uC694\uC57D\xB7\uBCF4\uC644\xB7\uCD94\uB860\uD558\uC9C0 \uB9D0 \uAC83
|
|
2777
|
+
- \`\`\`, \`\`\`markdown \uAC19\uC740 \uCF54\uB4DC\uD39C\uC2A4\uB85C \uAC10\uC2F8\uC9C0 \uB9D0 \uAC83
|
|
2778
|
+
- \uC124\uBA85\xB7\uC8FC\uC11D\xB7\uBA54\uD0C0 \uBB38\uC7A5 \uCD94\uAC00 \uAE08\uC9C0
|
|
2779
|
+
|
|
2780
|
+
[\uBD88\uD655\uC2E4\uD560 \uB54C]
|
|
2781
|
+
- \uAE00\uC790\uAC00 \uD750\uB9BF\uD558\uAC70\uB098 \uD310\uB3C5 \uBD88\uAC00\uD558\uBA74 \uBCF4\uC774\uB294 \uADF8\uB300\uB85C \uB450\uAC70\uB098, \uBD88\uAC00\uD53C\uD558\uBA74 \uD55C \uAE00\uC790\uB9CC ?\uB85C \uD45C\uC2DC
|
|
2782
|
+
- \uD45C \uAD6C\uC870\uAC00 \uBAA8\uD638\uD558\uBA74 \uCEEC\uB7FC \uC218 \uC77C\uCE58 + \uBE48 \uC140 \uD328\uB529 \uD615\uD0DC\uB85C \uCD9C\uB825
|
|
2783
|
+
- \uCD94\uCE21\uBCF4\uB2E4 \uC6D0\uBB38 \uBCF4\uC874\uC744 \uD56D\uC0C1 \uC6B0\uC120`;
|
|
2595
2784
|
DEFAULT_BATCH_SIZES = {
|
|
2596
2785
|
gemini: 5,
|
|
2597
2786
|
claude: 5,
|
|
@@ -2985,6 +3174,9 @@ __export(index_exports, {
|
|
|
2985
3174
|
VERSION: () => VERSION,
|
|
2986
3175
|
blocksToMarkdown: () => blocksToMarkdown,
|
|
2987
3176
|
compare: () => compare,
|
|
3177
|
+
convertHwpToPdf: () => convertHwpToPdf,
|
|
3178
|
+
convertHwpxToPdf: () => convertHwpxToPdf,
|
|
3179
|
+
convertToPdf: () => convertToPdf,
|
|
2988
3180
|
detectFormat: () => detectFormat,
|
|
2989
3181
|
detectZipFormat: () => detectZipFormat,
|
|
2990
3182
|
diffBlocks: () => diffBlocks,
|
|
@@ -3004,7 +3196,7 @@ __export(index_exports, {
|
|
|
3004
3196
|
runUnifiedOcrPipeline: () => runUnifiedOcrPipeline
|
|
3005
3197
|
});
|
|
3006
3198
|
module.exports = __toCommonJS(index_exports);
|
|
3007
|
-
var
|
|
3199
|
+
var import_promises4 = require("fs/promises");
|
|
3008
3200
|
|
|
3009
3201
|
// src/detect.ts
|
|
3010
3202
|
var import_jszip = __toESM(require("jszip"), 1);
|
|
@@ -3056,97 +3248,8 @@ async function detectZipFormat(buffer) {
|
|
|
3056
3248
|
var import_jszip2 = __toESM(require("jszip"), 1);
|
|
3057
3249
|
var import_xmldom = require("@xmldom/xmldom");
|
|
3058
3250
|
|
|
3059
|
-
// src/utils.ts
|
|
3060
|
-
var VERSION = true ? "2.5.0" : "0.0.0-dev";
|
|
3061
|
-
function toArrayBuffer(buf) {
|
|
3062
|
-
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
3063
|
-
return buf.buffer;
|
|
3064
|
-
}
|
|
3065
|
-
return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
|
|
3066
|
-
}
|
|
3067
|
-
var KordocError = class extends Error {
|
|
3068
|
-
code;
|
|
3069
|
-
stage;
|
|
3070
|
-
constructor(message, opts = {}) {
|
|
3071
|
-
super(message);
|
|
3072
|
-
this.name = "KordocError";
|
|
3073
|
-
this.code = opts.code;
|
|
3074
|
-
this.stage = opts.stage;
|
|
3075
|
-
}
|
|
3076
|
-
};
|
|
3077
|
-
function isPathTraversal(name) {
|
|
3078
|
-
if (name.includes("\0")) return true;
|
|
3079
|
-
const normalized = name.replace(/\\/g, "/");
|
|
3080
|
-
return normalized.includes("..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
|
|
3081
|
-
}
|
|
3082
|
-
function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
|
|
3083
|
-
try {
|
|
3084
|
-
const data = new DataView(buffer);
|
|
3085
|
-
const len = buffer.byteLength;
|
|
3086
|
-
let eocdOffset = -1;
|
|
3087
|
-
for (let i = len - 22; i >= Math.max(0, len - 65557); i--) {
|
|
3088
|
-
if (data.getUint32(i, true) === 101010256) {
|
|
3089
|
-
eocdOffset = i;
|
|
3090
|
-
break;
|
|
3091
|
-
}
|
|
3092
|
-
}
|
|
3093
|
-
if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 };
|
|
3094
|
-
const entryCount = data.getUint16(eocdOffset + 10, true);
|
|
3095
|
-
if (entryCount > maxEntries) {
|
|
3096
|
-
throw new KordocError(`ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC: ${entryCount} (\uCD5C\uB300 ${maxEntries})`);
|
|
3097
|
-
}
|
|
3098
|
-
const cdSize = data.getUint32(eocdOffset + 12, true);
|
|
3099
|
-
const cdOffset = data.getUint32(eocdOffset + 16, true);
|
|
3100
|
-
if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount };
|
|
3101
|
-
let totalUncompressed = 0;
|
|
3102
|
-
let pos = cdOffset;
|
|
3103
|
-
for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {
|
|
3104
|
-
if (data.getUint32(pos, true) !== 33639248) break;
|
|
3105
|
-
totalUncompressed += data.getUint32(pos + 24, true);
|
|
3106
|
-
const nameLen = data.getUint16(pos + 28, true);
|
|
3107
|
-
const extraLen = data.getUint16(pos + 30, true);
|
|
3108
|
-
const commentLen = data.getUint16(pos + 32, true);
|
|
3109
|
-
pos += 46 + nameLen + extraLen + commentLen;
|
|
3110
|
-
}
|
|
3111
|
-
if (totalUncompressed > maxUncompressedSize) {
|
|
3112
|
-
throw new KordocError(`ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 ${maxUncompressedSize / 1024 / 1024}MB)`);
|
|
3113
|
-
}
|
|
3114
|
-
return { totalUncompressed, entryCount };
|
|
3115
|
-
} catch (err) {
|
|
3116
|
-
if (err instanceof KordocError) throw err;
|
|
3117
|
-
return { totalUncompressed: 0, entryCount: 0 };
|
|
3118
|
-
}
|
|
3119
|
-
}
|
|
3120
|
-
var SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
|
|
3121
|
-
function sanitizeHref(href) {
|
|
3122
|
-
const trimmed = href.trim();
|
|
3123
|
-
if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
|
|
3124
|
-
return trimmed;
|
|
3125
|
-
}
|
|
3126
|
-
function classifyError(err) {
|
|
3127
|
-
if (!(err instanceof Error)) return "PARSE_ERROR";
|
|
3128
|
-
const msg = err.message;
|
|
3129
|
-
if (msg.includes("\uC554\uD638\uD654")) return "ENCRYPTED";
|
|
3130
|
-
if (msg.includes("DRM")) return "DRM_PROTECTED";
|
|
3131
|
-
if (msg.includes("ZIP bomb") || msg.includes("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || msg.includes("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")) return "ZIP_BOMB";
|
|
3132
|
-
if (msg.includes("bomb") || msg.includes("\uD06C\uAE30 \uCD08\uACFC") || msg.includes("\uC555\uCD95 \uD574\uC81C")) return "DECOMPRESSION_BOMB";
|
|
3133
|
-
if (msg.includes("\uC774\uBBF8\uC9C0 \uAE30\uBC18")) return "IMAGE_BASED_PDF";
|
|
3134
|
-
if (msg.includes("\uC139\uC158") && (msg.includes("\uCC3E\uC744 \uC218 \uC5C6") || msg.includes("\uC5C6\uC74C"))) return "NO_SECTIONS";
|
|
3135
|
-
if (msg.includes("\uC2DC\uADF8\uB2C8\uCC98") || msg.includes("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")) return "CORRUPTED";
|
|
3136
|
-
return "PARSE_ERROR";
|
|
3137
|
-
}
|
|
3138
|
-
function normalizeKordocError(err, fallbackMessage, stage = "unknown", fallbackCode = "PARSE_ERROR") {
|
|
3139
|
-
if (err instanceof KordocError) {
|
|
3140
|
-
if (!err.stage) err.stage = stage;
|
|
3141
|
-
if (!err.code) err.code = fallbackCode;
|
|
3142
|
-
return err;
|
|
3143
|
-
}
|
|
3144
|
-
const message = err instanceof Error ? err.message : fallbackMessage;
|
|
3145
|
-
const code = err instanceof Error ? classifyError(err) : fallbackCode;
|
|
3146
|
-
return new KordocError(message || fallbackMessage, { code, stage });
|
|
3147
|
-
}
|
|
3148
|
-
|
|
3149
3251
|
// src/table/builder.ts
|
|
3252
|
+
init_utils();
|
|
3150
3253
|
var MAX_COLS = 200;
|
|
3151
3254
|
var MAX_ROWS = 1e4;
|
|
3152
3255
|
function buildTable(rows) {
|
|
@@ -3406,6 +3509,8 @@ var HEADING_RATIO_H2 = 1.3;
|
|
|
3406
3509
|
var HEADING_RATIO_H3 = 1.15;
|
|
3407
3510
|
|
|
3408
3511
|
// src/hwpx/parser.ts
|
|
3512
|
+
init_utils();
|
|
3513
|
+
init_utils();
|
|
3409
3514
|
init_page_range();
|
|
3410
3515
|
init_logger();
|
|
3411
3516
|
var MAX_DECOMPRESS_SIZE = 500 * 1024 * 1024;
|
|
@@ -4248,6 +4353,7 @@ function extractTextFromNode(node) {
|
|
|
4248
4353
|
|
|
4249
4354
|
// src/hwp5/record.ts
|
|
4250
4355
|
var import_zlib = require("zlib");
|
|
4356
|
+
init_utils();
|
|
4251
4357
|
var TAG_PARA_HEADER = 66;
|
|
4252
4358
|
var TAG_PARA_TEXT = 67;
|
|
4253
4359
|
var TAG_CHAR_SHAPE = 68;
|
|
@@ -5297,6 +5403,7 @@ function parseLenientCfb(data) {
|
|
|
5297
5403
|
}
|
|
5298
5404
|
|
|
5299
5405
|
// src/hwp5/parser.ts
|
|
5406
|
+
init_utils();
|
|
5300
5407
|
init_page_range();
|
|
5301
5408
|
init_logger();
|
|
5302
5409
|
var CFB = __toESM(require_cfb(), 1);
|
|
@@ -5952,6 +6059,7 @@ function arrangeCells(rows, cols, cells) {
|
|
|
5952
6059
|
}
|
|
5953
6060
|
|
|
5954
6061
|
// src/pdf/parser.ts
|
|
6062
|
+
init_utils();
|
|
5955
6063
|
init_page_range();
|
|
5956
6064
|
var import_module = require("module");
|
|
5957
6065
|
var import_path4 = require("path");
|
|
@@ -7845,6 +7953,7 @@ function mergeKoreanLines(text) {
|
|
|
7845
7953
|
// src/xlsx/parser.ts
|
|
7846
7954
|
var import_jszip3 = __toESM(require("jszip"), 1);
|
|
7847
7955
|
var import_xmldom2 = require("@xmldom/xmldom");
|
|
7956
|
+
init_utils();
|
|
7848
7957
|
init_logger();
|
|
7849
7958
|
var MAX_SHEETS = 100;
|
|
7850
7959
|
var MAX_DECOMPRESS_SIZE3 = 500 * 1024 * 1024;
|
|
@@ -8173,6 +8282,7 @@ async function parseXlsxDocument(buffer, options, existingZip) {
|
|
|
8173
8282
|
// src/docx/parser.ts
|
|
8174
8283
|
var import_jszip4 = __toESM(require("jszip"), 1);
|
|
8175
8284
|
var import_xmldom3 = require("@xmldom/xmldom");
|
|
8285
|
+
init_utils();
|
|
8176
8286
|
init_logger();
|
|
8177
8287
|
var MAX_DECOMPRESS_SIZE4 = 500 * 1024 * 1024;
|
|
8178
8288
|
function getChildElements(parent, localName) {
|
|
@@ -8652,6 +8762,7 @@ async function parseDocxDocument(buffer, options, existingZip) {
|
|
|
8652
8762
|
}
|
|
8653
8763
|
|
|
8654
8764
|
// src/index.ts
|
|
8765
|
+
init_utils();
|
|
8655
8766
|
init_cli_provider();
|
|
8656
8767
|
init_markdown_to_blocks();
|
|
8657
8768
|
init_logger();
|
|
@@ -11153,6 +11264,187 @@ async function markdownToXlsx(markdown, options) {
|
|
|
11153
11264
|
return buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength);
|
|
11154
11265
|
}
|
|
11155
11266
|
|
|
11267
|
+
// src/convert/index.ts
|
|
11268
|
+
var import_promises2 = require("fs/promises");
|
|
11269
|
+
init_utils();
|
|
11270
|
+
|
|
11271
|
+
// src/convert/libreoffice.ts
|
|
11272
|
+
var import_libreoffice_convert = __toESM(require("libreoffice-convert"), 1);
|
|
11273
|
+
|
|
11274
|
+
// src/convert/error.ts
|
|
11275
|
+
/**
 * Error raised by the document conversion helpers.
 * Carries a machine-readable `code` next to the human-readable message.
 */
var ConvertError = class extends Error {
  /**
   * @param code - machine-readable error code stored on the instance
   * @param message - human-readable description
   */
  constructor(code, message) {
    super(message);
    Object.assign(this, { code, name: "ConvertError" });
  }
};
|
|
11282
|
+
|
|
11283
|
+
// src/convert/libreoffice.ts
|
|
11284
|
+
var libreConvert = import_libreoffice_convert.default.convert;
|
|
11285
|
+
/**
 * Verify that the LibreOffice `soffice` executable can be launched.
 *
 * Runs `soffice --version` and resolves when the command exits successfully.
 *
 * The previous implementation destructured `runCommand` from the utils module,
 * which does not export such a function. The resulting TypeError was swallowed
 * by the bare `catch`, so the check always reported SOFFICE_NOT_FOUND. The
 * binary is now probed directly through `child_process.execFile`.
 *
 * @returns a promise that resolves with no value when soffice is available
 * @throws ConvertError with code "SOFFICE_NOT_FOUND" when the probe fails;
 *   the underlying failure is attached as `cause`
 */
async function assertSofficeAvailable() {
  const { execFile } = await import("child_process");
  try {
    await new Promise((resolve, reject) => {
      // execFile avoids a shell; the timeout keeps a wedged soffice from
      // blocking the caller indefinitely.
      execFile("soffice", ["--version"], { timeout: 3e4, windowsHide: true }, (err) => {
        if (err) {
          reject(err);
          return;
        }
        resolve();
      });
    });
  } catch (cause) {
    const error = new ConvertError(
      "SOFFICE_NOT_FOUND",
      "soffice\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4. LibreOffice\uB97C \uC124\uCE58\uD574 \uC8FC\uC138\uC694."
    );
    // Keep the real reason (ENOENT, non-zero exit, timeout) for diagnostics.
    error.cause = cause;
    throw error;
  }
}
|
|
11296
|
+
/**
 * Convert a document buffer with LibreOffice, enforcing a time limit.
 *
 * @param buffer - source document bytes handed to `libreConvert`
 * @param targetExt - target format/extension handed to `libreConvert`
 * @param timeoutMs - milliseconds to wait before rejecting (default 60000)
 * @returns a promise resolving to the converted output passed to the callback
 * @throws ConvertError "TIMEOUT" when the callback does not fire within `timeoutMs`
 * @throws ConvertError "CONVERT_FAILED" when the callback reports an error or no output
 *
 * If `libreConvert` throws synchronously, the promise rejects with that error
 * (as before) and the pending timer is now cleared, so it no longer keeps the
 * event loop alive for the full `timeoutMs`.
 */
async function convertBuffer(buffer, targetExt, timeoutMs = 6e4) {
  return new Promise((resolve4, reject) => {
    const timer = setTimeout(() => {
      reject(
        new ConvertError("TIMEOUT", `\uBCC0\uD658 \uD0C0\uC784\uC544\uC6C3 (${timeoutMs}ms \uCD08\uACFC)`)
      );
    }, timeoutMs);

    try {
      libreConvert(buffer, targetExt, void 0, (err, done) => {
        clearTimeout(timer);
        if (err || !done) {
          reject(
            new ConvertError(
              "CONVERT_FAILED",
              err?.message ?? "LibreOffice \uBCC0\uD658 \uC2E4\uD328"
            )
          );
          return;
        }
        resolve4(done);
      });
    } catch (err) {
      // The callback will never run, so the timer has to be released here.
      clearTimeout(timer);
      reject(err);
    }
  });
}
|
|
11318
|
+
|
|
11319
|
+
// src/convert/index.ts
|
|
11320
|
+
// True while some caller holds the conversion lock.
var isConverting = false;
// FIFO of callbacks, one per caller waiting for the lock; each grants the lock when invoked.
var queue = [];

/**
 * Acquire the module-wide conversion lock.
 *
 * If the lock is free it is taken immediately; otherwise the caller is queued
 * and resumed, in arrival order, when the current holder releases.
 *
 * @returns {Promise<() => void>} A release function. Calling it hands the lock
 *   to the next queued waiter (if any). Calling it more than once is a no-op,
 *   so an accidental double release cannot admit two holders at the same time.
 */
async function acquireConvertLock() {
  // Builds a one-shot release callback for the current holder.
  const createRelease = () => {
    let released = false;
    return () => {
      if (released) {
        return;
      }
      released = true;
      isConverting = false;
      const next = queue.shift();
      next?.();
    };
  };

  if (!isConverting) {
    isConverting = true;
    return createRelease();
  }
  return new Promise((resolve4) => {
    queue.push(() => {
      isConverting = true;
      resolve4(createRelease());
    });
  });
}
|
|
11342
|
+
/**
 * Convert an HWP or HWPX document to PDF.
 *
 * Steps, in order: load the input into a Buffer, enforce the 500MB size cap,
 * detect the format (only "hwp" and "hwpx" are accepted), check that `soffice`
 * is available, wait on `acquireConvertLock()`, then run `convertBuffer`.
 * Failures are reported through the returned object rather than thrown, except
 * that a non-ConvertError from the `soffice` check is rethrown.
 *
 * @param {string|Buffer|ArrayBuffer|Uint8Array} input - A file path (read with
 *   `readFile`), a Buffer, or anything `Buffer.from` accepts.
 * @param {{ timeoutMs?: number, onProgress?: (percent: number, stage: string) => void }} [options]
 *   `timeoutMs` is forwarded to `convertBuffer`; `onProgress` is called with
 *   (10, "convert") before and (100, "done") after the conversion.
 * @returns {Promise<object>} `{ success: true, pdf, sourceFormat }` on success,
 *   otherwise `{ success: false, code, error, stage }`.
 */
async function convertToPdf(input, options) {
  // All failure results share one shape; build them in one place.
  const failure = (code, error, stage) => ({ success: false, code, error, stage });

  let buffer;
  try {
    if (typeof input === "string") {
      buffer = await (0, import_promises2.readFile)(input);
    } else {
      buffer = Buffer.isBuffer(input) ? input : Buffer.from(input);
    }
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    return failure("PARSE_ERROR", `\uC785\uB825 \uC77D\uAE30 \uC2E4\uD328: ${reason}`, "detect");
  }

  const MAX_FILE_SIZE = 500 * 1024 * 1024;
  if (buffer.length > MAX_FILE_SIZE) {
    const sizeMb = (buffer.length / 1024 / 1024).toFixed(1);
    return failure(
      "FILE_TOO_LARGE",
      `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${sizeMb}MB (\uCD5C\uB300 500MB)`,
      "detect"
    );
  }

  const format = detectFormat(toArrayBuffer(buffer));
  const isHangulFormat = format === "hwp" || format === "hwpx";
  if (!isHangulFormat) {
    return failure(
      "UNSUPPORTED_FORMAT",
      `\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD3EC\uB9F7\uC785\uB2C8\uB2E4: ${format}`,
      "detect"
    );
  }

  try {
    await assertSofficeAvailable();
  } catch (err) {
    if (!(err instanceof ConvertError)) {
      throw err;
    }
    return failure(err.code, err.message, "validate");
  }

  // The lock is always released in `finally`, whatever the conversion outcome.
  const releaseLock = await acquireConvertLock();
  try {
    options?.onProgress?.(10, "convert");
    const pdf = await convertBuffer(buffer, ".pdf", options?.timeoutMs);
    options?.onProgress?.(100, "done");
    return { success: true, pdf: new Uint8Array(pdf), sourceFormat: format };
  } catch (err) {
    if (err instanceof ConvertError) {
      return failure(err.code, err.message, "convert");
    }
    return failure(
      classifyError(err),
      err instanceof Error ? err.message : "\uBCC0\uD658 \uC2E4\uD328",
      "convert"
    );
  } finally {
    releaseLock();
  }
}
|
|
11420
|
+
/**
 * Convert a document to PDF, accepting the result only if the source was HWP.
 *
 * Delegates to `convertToPdf` and passes any failed result through unchanged.
 * Note that the format check happens after `convertToPdf` has returned, so a
 * successfully converted non-HWP input is still reported as unsupported.
 *
 * @param {*} input - Forwarded to `convertToPdf`.
 * @param {object} [options] - Forwarded to `convertToPdf`.
 * @returns {Promise<object>} The `convertToPdf` result, or an
 *   "UNSUPPORTED_FORMAT" failure when the detected format is not "hwp".
 */
async function convertHwpToPdf(input, options) {
  const outcome = await convertToPdf(input, options);
  const wrongFormat = outcome.success && outcome.sourceFormat !== "hwp";
  if (!wrongFormat) {
    return outcome;
  }
  return {
    success: false,
    code: "UNSUPPORTED_FORMAT",
    error: `HWP 5.x \uD3EC\uB9F7\uC774 \uC544\uB2D9\uB2C8\uB2E4: ${outcome.sourceFormat}`,
    stage: "detect"
  };
}
|
|
11432
|
+
/**
 * Convert a document to PDF, accepting the result only if the source was HWPX.
 *
 * Delegates to `convertToPdf` and passes any failed result through unchanged.
 * Note that the format check happens after `convertToPdf` has returned, so a
 * successfully converted non-HWPX input is still reported as unsupported.
 *
 * @param {*} input - Forwarded to `convertToPdf`.
 * @param {object} [options] - Forwarded to `convertToPdf`.
 * @returns {Promise<object>} The `convertToPdf` result, or an
 *   "UNSUPPORTED_FORMAT" failure when the detected format is not "hwpx".
 */
async function convertHwpxToPdf(input, options) {
  const outcome = await convertToPdf(input, options);
  const wrongFormat = outcome.success && outcome.sourceFormat !== "hwpx";
  if (!wrongFormat) {
    return outcome;
  }
  return {
    success: false,
    code: "UNSUPPORTED_FORMAT",
    error: `HWPX \uD3EC\uB9F7\uC774 \uC544\uB2D9\uB2C8\uB2E4: ${outcome.sourceFormat}`,
    stage: "detect"
  };
}
|
|
11444
|
+
|
|
11445
|
+
// src/index.ts
|
|
11446
|
+
init_utils();
|
|
11447
|
+
|
|
11156
11448
|
// src/ocr/api-key-rotation.ts
|
|
11157
11449
|
var AllKeysCoolingDownError = class extends Error {
|
|
11158
11450
|
waitMs;
|
|
@@ -11247,11 +11539,10 @@ var ApiKeyRotationPool = class _ApiKeyRotationPool {
|
|
|
11247
11539
|
};
|
|
11248
11540
|
|
|
11249
11541
|
// src/pipeline/unified-ocr.ts
|
|
11250
|
-
var
|
|
11542
|
+
var import_promises3 = require("fs/promises");
|
|
11251
11543
|
var import_path5 = require("path");
|
|
11252
11544
|
var import_child_process4 = require("child_process");
|
|
11253
11545
|
var import_node_perf_hooks = require("perf_hooks");
|
|
11254
|
-
var import_libreoffice_convert = __toESM(require("libreoffice-convert"), 1);
|
|
11255
11546
|
init_logger();
|
|
11256
11547
|
|
|
11257
11548
|
// src/pipeline/bounded-queue.ts
|
|
@@ -11313,7 +11604,6 @@ var BoundedQueue = class {
|
|
|
11313
11604
|
};
|
|
11314
11605
|
|
|
11315
11606
|
// src/pipeline/unified-ocr.ts
|
|
11316
|
-
var libreConvert = import_libreoffice_convert.default.convert;
|
|
11317
11607
|
var UnifiedOcrError = class extends Error {
|
|
11318
11608
|
code;
|
|
11319
11609
|
stage;
|
|
@@ -11407,9 +11697,9 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11407
11697
|
const keyPool = ApiKeyRotationPool.fromEnv();
|
|
11408
11698
|
const runId = options.runId ?? generateRunId("ocr");
|
|
11409
11699
|
const logger = (options.logger ?? createLoggerFromEnv()).withRun(runId).child({ component: "pipeline/unified-ocr.ts" });
|
|
11410
|
-
await (0,
|
|
11411
|
-
await (0,
|
|
11412
|
-
await (0,
|
|
11700
|
+
await (0, import_promises3.mkdir)(imagesDir, { recursive: true });
|
|
11701
|
+
await (0, import_promises3.mkdir)(rawDir, { recursive: true });
|
|
11702
|
+
await (0, import_promises3.mkdir)(diffDir, { recursive: true });
|
|
11413
11703
|
const timingsMs = {};
|
|
11414
11704
|
const markStageStart = (stage, message) => emitProgress(options.onEvent, stage, 0, stageWeights, { message, type: "stage_start" });
|
|
11415
11705
|
const markStageProgress = (stage, stagePercent, current, total, message, model) => emitProgress(options.onEvent, stage, stagePercent, stageWeights, { type: "stage_progress", current, total, message, model });
|
|
@@ -11428,9 +11718,9 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11428
11718
|
if ((0, import_path5.extname)(absInput).toLowerCase() !== ".pdf") {
|
|
11429
11719
|
await assertSofficeAvailable();
|
|
11430
11720
|
workingPdfPath = (0, import_path5.join)(workspaceDir, `${stem}.pdf`);
|
|
11431
|
-
const inputBuffer = await (0,
|
|
11432
|
-
const out = await
|
|
11433
|
-
await (0,
|
|
11721
|
+
const inputBuffer = await (0, import_promises3.readFile)(absInput);
|
|
11722
|
+
const out = await convertBuffer(inputBuffer, ".pdf");
|
|
11723
|
+
await (0, import_promises3.writeFile)(workingPdfPath, out);
|
|
11434
11724
|
}
|
|
11435
11725
|
timingsMs.convert = elapsedMs(convertStart);
|
|
11436
11726
|
markStageDone("convert", "PDF \uBCC0\uD658 \uC644\uB8CC");
|
|
@@ -11442,7 +11732,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11442
11732
|
markStageStart("render", "PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC911");
|
|
11443
11733
|
logStage("info", "render", "start", "PDF \uD398\uC774\uC9C0 \uB80C\uB354\uB9C1 \uC2DC\uC791", { pdf: workingPdfPath, dpi, totalPages });
|
|
11444
11734
|
await runCommand("pdftoppm", ["-png", "-r", String(dpi), "-f", "1", "-l", "1", workingPdfPath, (0, import_path5.join)(imagesDir, "page")]);
|
|
11445
|
-
const firstFiles = (await (0,
|
|
11735
|
+
const firstFiles = (await (0, import_promises3.readdir)(imagesDir)).filter((f) => f.endsWith(".png")).sort((a, b) => naturalPageSort(a, b));
|
|
11446
11736
|
if (firstFiles.length === 0) throw new UnifiedOcrError("RENDER_FAILED", "render", "\uCCAB \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC2E4\uD328");
|
|
11447
11737
|
const probeImage = (0, import_path5.join)(imagesDir, firstFiles[0]);
|
|
11448
11738
|
markStageProgress("render", Math.round(1 / totalPages * 100), 1, totalPages, `\uD398\uC774\uC9C0 1/${totalPages} \uB80C\uB354\uB9C1`);
|
|
@@ -11480,7 +11770,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11480
11770
|
const keyCount = keyPool.snapshot().length;
|
|
11481
11771
|
const workerCount = Math.max(1, keyCount * concurrencyPerKey);
|
|
11482
11772
|
const queueCapacity = workerCount * 2;
|
|
11483
|
-
const
|
|
11773
|
+
const queue2 = new BoundedQueue(queueCapacity);
|
|
11484
11774
|
const ocrStart = import_node_perf_hooks.performance.now();
|
|
11485
11775
|
currentStage = "ocr";
|
|
11486
11776
|
markStageStart("ocr", `OCR \uC9C4\uD589 \uC911 (\uC6CC\uCEE4 ${workerCount}\uAC1C)`);
|
|
@@ -11488,17 +11778,17 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11488
11778
|
let renderDone = 1;
|
|
11489
11779
|
const renderProducer = (async () => {
|
|
11490
11780
|
try {
|
|
11491
|
-
await
|
|
11781
|
+
await queue2.enqueue({ pageNumber: 1, imagePath: probeImage });
|
|
11492
11782
|
if (totalPages > 1) {
|
|
11493
11783
|
for await (const item of renderPdfToPngStream(workingPdfPath, (0, import_path5.join)(imagesDir, "page"), dpi, totalPages, 2)) {
|
|
11494
|
-
await
|
|
11784
|
+
await queue2.enqueue(item);
|
|
11495
11785
|
renderDone++;
|
|
11496
11786
|
markStageProgress("render", Math.round(renderDone / totalPages * 100), renderDone, totalPages, `\uD398\uC774\uC9C0 ${renderDone}/${totalPages} \uB80C\uB354\uB9C1`);
|
|
11497
11787
|
logStage("debug", "render", "progress", "\uD398\uC774\uC9C0 \uB80C\uB354 \uC644\uB8CC", { page: item.pageNumber });
|
|
11498
11788
|
}
|
|
11499
11789
|
}
|
|
11500
11790
|
} finally {
|
|
11501
|
-
|
|
11791
|
+
queue2.close();
|
|
11502
11792
|
timingsMs.render = elapsedMs(renderStart);
|
|
11503
11793
|
markStageDone("render", "\uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC");
|
|
11504
11794
|
logStage("info", "render", "done", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC", { pages: renderDone, elapsedMs: timingsMs.render });
|
|
@@ -11507,7 +11797,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11507
11797
|
const [, pageResultsMap] = await Promise.all([
|
|
11508
11798
|
renderProducer,
|
|
11509
11799
|
ocrWorkerPool({
|
|
11510
|
-
queue,
|
|
11800
|
+
queue: queue2,
|
|
11511
11801
|
workerCount,
|
|
11512
11802
|
totalPages,
|
|
11513
11803
|
ocrInput: {
|
|
@@ -11541,7 +11831,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11541
11831
|
const rawPagePaths = [];
|
|
11542
11832
|
for (const [pageNum, markdown] of sortedEntries) {
|
|
11543
11833
|
const pagePath = (0, import_path5.join)(rawDir, `page_${String(pageNum).padStart(4, "0")}.md`);
|
|
11544
|
-
await (0,
|
|
11834
|
+
await (0, import_promises3.writeFile)(pagePath, markdown, "utf-8");
|
|
11545
11835
|
rawPagePaths.push(pagePath);
|
|
11546
11836
|
}
|
|
11547
11837
|
const mergeStart = import_node_perf_hooks.performance.now();
|
|
@@ -11549,7 +11839,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11549
11839
|
markStageStart("merge", "\uCD5C\uC885 Markdown \uBCD1\uD569 \uC911");
|
|
11550
11840
|
logStage("info", "merge", "start", "\uCD5C\uC885 \uBCD1\uD569 \uC2DC\uC791", { pages: rawPagePaths.length });
|
|
11551
11841
|
const merged = await mergeMarkdownPages(rawPagePaths);
|
|
11552
|
-
await (0,
|
|
11842
|
+
await (0, import_promises3.writeFile)(outputPath, merged, "utf-8");
|
|
11553
11843
|
timingsMs.merge = elapsedMs(mergeStart);
|
|
11554
11844
|
markStageDone("merge", "\uBCD1\uD569 \uC644\uB8CC");
|
|
11555
11845
|
logStage("info", "merge", "done", "\uCD5C\uC885 \uBCD1\uD569 \uC644\uB8CC", { outputPath, elapsedMs: timingsMs.merge });
|
|
@@ -11565,7 +11855,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11565
11855
|
timingsMs,
|
|
11566
11856
|
modelCachePath
|
|
11567
11857
|
};
|
|
11568
|
-
await (0,
|
|
11858
|
+
await (0, import_promises3.writeFile)(reportPath, JSON.stringify(report, null, 2), "utf-8");
|
|
11569
11859
|
logStage("info", "finalize", "done", "run-report \uC800\uC7A5 \uC644\uB8CC", { reportPath });
|
|
11570
11860
|
return { outputPath, reportPath, selectedModel };
|
|
11571
11861
|
} catch (err) {
|
|
@@ -11627,17 +11917,6 @@ function emitProgress(cb, stage, stagePercent, weights, extra) {
|
|
|
11627
11917
|
model: extra.model
|
|
11628
11918
|
});
|
|
11629
11919
|
}
|
|
11630
|
-
async function convertWithLibreOffice(buffer, ext) {
|
|
11631
|
-
return await new Promise((resolvePromise, reject) => {
|
|
11632
|
-
libreConvert(buffer, ext, void 0, (err, done) => {
|
|
11633
|
-
if (err || !done) {
|
|
11634
|
-
reject(new UnifiedOcrError("CONVERT_FAILED", "convert", err?.message ?? "LibreOffice \uBCC0\uD658 \uC2E4\uD328"));
|
|
11635
|
-
return;
|
|
11636
|
-
}
|
|
11637
|
-
resolvePromise(done);
|
|
11638
|
-
});
|
|
11639
|
-
});
|
|
11640
|
-
}
|
|
11641
11920
|
async function getPdfPageCount(pdfPath) {
|
|
11642
11921
|
const stdout = await runCommandWithStdout("pdfinfo", [pdfPath]);
|
|
11643
11922
|
const m = stdout.match(/^\s*Pages:\s*(\d+)\s*$/mi);
|
|
@@ -11665,7 +11944,7 @@ async function* renderPdfToPngStream(pdfPath, prefixPath, dpi, totalPages, start
|
|
|
11665
11944
|
pdfPath,
|
|
11666
11945
|
prefixPath
|
|
11667
11946
|
]);
|
|
11668
|
-
const files = await (0,
|
|
11947
|
+
const files = await (0, import_promises3.readdir)(imagesDir);
|
|
11669
11948
|
const pageFiles = files.filter((f) => f.endsWith(".png")).sort((a, b) => naturalPageSort(a, b));
|
|
11670
11949
|
const imagePath = (0, import_path5.join)(imagesDir, pageFiles[pageFiles.length - 1]);
|
|
11671
11950
|
yield { pageNumber: page, imagePath };
|
|
@@ -11710,13 +11989,6 @@ async function runCommandWithStdout(cmd, args) {
|
|
|
11710
11989
|
});
|
|
11711
11990
|
});
|
|
11712
11991
|
}
|
|
11713
|
-
async function assertSofficeAvailable() {
|
|
11714
|
-
try {
|
|
11715
|
-
await runCommand("soffice", ["--version"]);
|
|
11716
|
-
} catch {
|
|
11717
|
-
throw new UnifiedOcrError("SOFFICE_NOT_FOUND", "convert", "soffice\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4. LibreOffice\uB97C \uC124\uCE58\uD574 \uC8FC\uC138\uC694.");
|
|
11718
|
-
}
|
|
11719
|
-
}
|
|
11720
11992
|
function naturalPageSort(a, b) {
|
|
11721
11993
|
const na = Number((a.match(/\d+/g) || ["0"]).at(-1) || 0);
|
|
11722
11994
|
const nb = Number((b.match(/\d+/g) || ["0"]).at(-1) || 0);
|
|
@@ -11790,7 +12062,7 @@ function startParallelProbeRuns(input) {
|
|
|
11790
12062
|
}
|
|
11791
12063
|
async function loadModelCache(path) {
|
|
11792
12064
|
try {
|
|
11793
|
-
const raw = await (0,
|
|
12065
|
+
const raw = await (0, import_promises3.readFile)(path, "utf-8");
|
|
11794
12066
|
return JSON.parse(raw);
|
|
11795
12067
|
} catch {
|
|
11796
12068
|
return null;
|
|
@@ -11821,15 +12093,15 @@ async function updateModelCache(path, probes) {
|
|
|
11821
12093
|
}
|
|
11822
12094
|
}
|
|
11823
12095
|
current.updatedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
11824
|
-
await (0,
|
|
12096
|
+
await (0, import_promises3.writeFile)(path, JSON.stringify(current, null, 2), "utf-8");
|
|
11825
12097
|
}
|
|
11826
12098
|
async function ocrWorkerPool(input) {
|
|
11827
|
-
const { queue, workerCount, ocrInput, onPageDone } = input;
|
|
12099
|
+
const { queue: queue2, workerCount, ocrInput, onPageDone } = input;
|
|
11828
12100
|
const results = /* @__PURE__ */ new Map();
|
|
11829
12101
|
let completedCount = 0;
|
|
11830
12102
|
async function worker() {
|
|
11831
12103
|
while (true) {
|
|
11832
|
-
const item = await
|
|
12104
|
+
const item = await queue2.dequeue();
|
|
11833
12105
|
if (item === QUEUE_DONE) break;
|
|
11834
12106
|
const { pageNumber, imagePath, error } = item;
|
|
11835
12107
|
if (imagePath === null) {
|
|
@@ -11881,7 +12153,7 @@ async function ocrImageWithFallback(input) {
|
|
|
11881
12153
|
async function mergeMarkdownPages(paths) {
|
|
11882
12154
|
const out = [];
|
|
11883
12155
|
for (let i = 0; i < paths.length; i++) {
|
|
11884
|
-
const txt = (await (0,
|
|
12156
|
+
const txt = (await (0, import_promises3.readFile)(paths[i], "utf-8")).trim();
|
|
11885
12157
|
if (!txt) continue;
|
|
11886
12158
|
out.push(txt);
|
|
11887
12159
|
}
|
|
@@ -11997,7 +12269,7 @@ async function ocrImageViaNim(input) {
|
|
|
11997
12269
|
throw new UnifiedOcrError("OCR_FAILED", "ocr", `OCR \uC7AC\uC2DC\uB3C4 \uCD08\uACFC: ${lastErr}`);
|
|
11998
12270
|
}
|
|
11999
12271
|
/**
 * Read a file and return its contents as a base64 string.
 *
 * @param {string} path - Path handed to `readFile`.
 * @returns {Promise<string>} Base64 encoding of the file's bytes.
 */
async function encodeBase64(path) {
  const bytes = await (0, import_promises3.readFile)(path);
  const encoded = bytes.toString("base64");
  return encoded;
}
|
|
12003
12275
|
function stripCodeFence3(text) {
|
|
@@ -12036,7 +12308,7 @@ async function parse2(input, options) {
|
|
|
12036
12308
|
let buffer;
|
|
12037
12309
|
if (typeof input === "string") {
|
|
12038
12310
|
try {
|
|
12039
|
-
const buf = await (0,
|
|
12311
|
+
const buf = await (0, import_promises4.readFile)(input);
|
|
12040
12312
|
buffer = toArrayBuffer(buf);
|
|
12041
12313
|
} catch (err) {
|
|
12042
12314
|
const msg = err instanceof Error && "code" in err && err.code === "ENOENT" ? `\uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4: ${input}` : `\uD30C\uC77C \uC77D\uAE30 \uC2E4\uD328: ${input}`;
|
|
@@ -12196,6 +12468,9 @@ async function parseDocx(buffer, options, zip) {
|
|
|
12196
12468
|
VERSION,
|
|
12197
12469
|
blocksToMarkdown,
|
|
12198
12470
|
compare,
|
|
12471
|
+
convertHwpToPdf,
|
|
12472
|
+
convertHwpxToPdf,
|
|
12473
|
+
convertToPdf,
|
|
12199
12474
|
detectFormat,
|
|
12200
12475
|
detectZipFormat,
|
|
12201
12476
|
diffBlocks,
|