@clazic/kordoc 2.5.1 → 2.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +29 -1
- package/dist/batch-provider-XRF6F26E.js +234 -0
- package/dist/batch-provider-XRF6F26E.js.map +1 -0
- package/dist/chunk-S7BHLD2V.js +200 -0
- package/dist/{chunk-Y4WFKJ5P.js.map → chunk-S7BHLD2V.js.map} +1 -1
- package/dist/{chunk-IJGNPAK2.js → chunk-TND4YFBV.js} +2 -2
- package/dist/{chunk-QG6BYZMR.js → chunk-TS3F57LY.js} +160 -8
- package/dist/chunk-TS3F57LY.js.map +1 -0
- package/dist/cli.js +53 -6
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +420 -145
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +71 -2
- package/dist/index.d.ts +71 -2
- package/dist/index.js +407 -135
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +44 -3
- package/dist/mcp.js.map +1 -1
- package/dist/{resolve-XWYJYKKH.js → resolve-ZSUEJK3E.js} +4 -4
- package/dist/{utils-RBXHHCLI.js → utils-F66K7PXH.js} +2 -2
- package/dist/{watch-5CCMTZ7F.js → watch-2S5ULHAM.js} +4 -4
- package/package.json +1 -1
- package/dist/batch-provider-5BFJRKAZ.js +0 -190
- package/dist/batch-provider-5BFJRKAZ.js.map +0 -1
- package/dist/chunk-QG6BYZMR.js.map +0 -1
- package/dist/chunk-Y4WFKJ5P.js +0 -167
- /package/dist/{chunk-IJGNPAK2.js.map → chunk-TND4YFBV.js.map} +0 -0
- /package/dist/{resolve-XWYJYKKH.js.map → resolve-ZSUEJK3E.js.map} +0 -0
- /package/dist/{utils-RBXHHCLI.js.map → utils-F66K7PXH.js.map} +0 -0
- /package/dist/{watch-5CCMTZ7F.js.map → watch-2S5ULHAM.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -37,6 +37,118 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
|
|
|
37
37
|
mod
|
|
38
38
|
));
|
|
39
39
|
|
|
40
|
+
// src/utils.ts
|
|
41
|
+
// Namespace object for src/utils.ts. Each entry passed to __export below is a
// zero-argument thunk that returns the current value of the matching module
// binding, so the value is read when the thunk is invoked rather than when
// this statement runs.
// Only the nine names listed here are exposed; code that destructures any
// other name from utils_exports will receive undefined.
// NOTE(review): __export is defined outside this view — presumably it installs
// the thunks as getters on the target object; confirm against the bundler helper.
var utils_exports = {};
|
|
42
|
+
__export(utils_exports, {
|
|
43
|
+
KordocError: () => KordocError,
|
|
44
|
+
VERSION: () => VERSION,
|
|
45
|
+
classifyError: () => classifyError,
|
|
46
|
+
isPathTraversal: () => isPathTraversal,
|
|
47
|
+
normalizeKordocError: () => normalizeKordocError,
|
|
48
|
+
precheckZipSize: () => precheckZipSize,
|
|
49
|
+
sanitizeError: () => sanitizeError,
|
|
50
|
+
sanitizeHref: () => sanitizeHref,
|
|
51
|
+
toArrayBuffer: () => toArrayBuffer
|
|
52
|
+
});
|
|
53
|
+
/**
 * Produce an ArrayBuffer containing exactly the bytes visible through `buf`.
 *
 * When the view starts at offset 0 and spans its entire backing buffer, that
 * backing buffer is returned directly (no copy). Otherwise the viewed window
 * is copied out with `slice`.
 *
 * @param buf - view object exposing `buffer`, `byteOffset` and `byteLength`.
 * @returns the backing buffer itself, or a copy of the viewed region.
 */
function toArrayBuffer(buf) {
  const { buffer: backing, byteOffset: start, byteLength: size } = buf;
  const coversWholeBacking = start === 0 && size === backing.byteLength;
  return coversWholeBacking ? backing : backing.slice(start, start + size);
}
|
|
59
|
+
/**
 * Turn an arbitrary thrown value into a message that is safe to surface.
 *
 * Messages of KordocError instances are passed through unchanged; every other
 * value is replaced by a fixed generic message so that its own text is never
 * returned.
 *
 * @param err - the caught value.
 * @returns the KordocError message, or the generic fallback string.
 */
function sanitizeError(err) {
  const isKnownError = err instanceof KordocError;
  return isKnownError ? err.message : "\uBB38\uC11C \uCC98\uB9AC \uC911 \uC624\uB958\uAC00 \uBC1C\uC0DD\uD588\uC2B5\uB2C8\uB2E4";
}
|
|
63
|
+
/**
 * Decide whether an archive entry name should be treated as a path-traversal
 * attempt.
 *
 * A name is rejected when it contains a NUL character, or when — after
 * converting backslashes to forward slashes — it contains "..", starts with
 * "/", or starts with a drive-letter prefix such as "C:".
 *
 * @param name - entry name to inspect.
 * @returns true when the name is considered unsafe.
 */
function isPathTraversal(name) {
  if (name.includes("\0")) {
    return true;
  }
  // Treat Windows separators like POSIX ones before checking.
  const unixStyle = name.split("\\").join("/");
  if (unixStyle.includes("..")) {
    return true;
  }
  if (unixStyle.startsWith("/")) {
    return true;
  }
  return /^[A-Za-z]:/.test(unixStyle);
}
|
|
68
|
+
/**
 * Best-effort ZIP-bomb pre-check that reads only the archive's central
 * directory, without decompressing anything.
 *
 * The End Of Central Directory (EOCD) record is searched backwards from the
 * end of the buffer; the declared entry count and the per-entry uncompressed
 * sizes recorded in the central directory are then compared with the limits.
 *
 * @param buffer - ArrayBuffer holding the whole archive.
 * @param maxUncompressedSize - maximum allowed sum of declared uncompressed sizes, in bytes.
 * @param maxEntries - maximum allowed declared entry count.
 * @returns `{ totalUncompressed, entryCount }`. Both are 0 when no EOCD record
 *   is found or when reading fails; `totalUncompressed` is 0 when the central
 *   directory range lies outside the buffer.
 * @throws KordocError with `code: "ZIP_BOMB"` when either limit is exceeded.
 */
function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
  const EOCD_SIGNATURE = 0x06054b50; // "PK\x05\x06" read little-endian (101010256)
  const CD_ENTRY_SIGNATURE = 0x02014b50; // "PK\x01\x02" read little-endian (33639248)
  const EOCD_MIN_SIZE = 22; // EOCD record without a trailing comment
  const EOCD_MAX_SIZE = 65557; // 22 bytes + the largest possible 16-bit comment length
  const CD_ENTRY_FIXED_SIZE = 46; // fixed part of one central directory header
  try {
    const data = new DataView(buffer);
    const len = buffer.byteLength;

    // Scan backwards: the EOCD record sits at the very end, optionally followed by a comment.
    let eocdOffset = -1;
    const lowestCandidate = Math.max(0, len - EOCD_MAX_SIZE);
    for (let i = len - EOCD_MIN_SIZE; i >= lowestCandidate; i--) {
      if (data.getUint32(i, true) === EOCD_SIGNATURE) {
        eocdOffset = i;
        break;
      }
    }
    if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 };

    const entryCount = data.getUint16(eocdOffset + 10, true);
    if (entryCount > maxEntries) {
      // Carry the code explicitly so later normalization does not downgrade it to the generic fallback.
      throw new KordocError(`ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC: ${entryCount} (\uCD5C\uB300 ${maxEntries})`, { code: "ZIP_BOMB" });
    }

    const cdSize = data.getUint32(eocdOffset + 12, true);
    const cdOffset = data.getUint32(eocdOffset + 16, true);
    // A central directory that claims to extend past the buffer cannot be walked.
    if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount };

    const cdEnd = cdOffset + cdSize;
    let totalUncompressed = 0;
    let pos = cdOffset;
    for (let i = 0; i < entryCount && pos + CD_ENTRY_FIXED_SIZE <= cdEnd; i++) {
      if (data.getUint32(pos, true) !== CD_ENTRY_SIGNATURE) break;
      totalUncompressed += data.getUint32(pos + 24, true);
      const nameLen = data.getUint16(pos + 28, true);
      const extraLen = data.getUint16(pos + 30, true);
      const commentLen = data.getUint16(pos + 32, true);
      pos += CD_ENTRY_FIXED_SIZE + nameLen + extraLen + commentLen;
    }

    if (totalUncompressed > maxUncompressedSize) {
      throw new KordocError(`ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 ${maxUncompressedSize / 1024 / 1024}MB)`, { code: "ZIP_BOMB" });
    }
    return { totalUncompressed, entryCount };
  } catch (err) {
    // Limit violations must reach the caller; any other failure (for example an
    // out-of-range read on a malformed archive) is deliberately treated as
    // "nothing measured" because this check is advisory.
    if (err instanceof KordocError) throw err;
    return { totalUncompressed: 0, entryCount: 0 };
  }
}
|
|
106
|
+
/**
 * Validate a link target against the SAFE_HREF_RE allow-list.
 *
 * The value is trimmed first; an empty result or one that does not match the
 * allow-list yields null, otherwise the trimmed value is returned.
 *
 * @param href - raw link target.
 * @returns the trimmed href when allowed, otherwise null.
 */
function sanitizeHref(href) {
  const candidate = href.trim();
  const isAllowed = candidate !== "" && SAFE_HREF_RE.test(candidate);
  return isAllowed ? candidate : null;
}
|
|
111
|
+
/**
 * Map an error to a coarse error code by looking for known substrings in its
 * message.
 *
 * Rules are evaluated in the order listed and the first match wins, so the
 * more specific ZIP rule is tested before the generic decompression-bomb rule.
 * Values that are not Error instances, and messages matching no rule, yield
 * "PARSE_ERROR".
 *
 * @param err - the caught value.
 * @returns one of "ENCRYPTED", "DRM_PROTECTED", "ZIP_BOMB",
 *   "DECOMPRESSION_BOMB", "IMAGE_BASED_PDF", "NO_SECTIONS", "CORRUPTED",
 *   "PARSE_ERROR".
 */
function classifyError(err) {
  if (!(err instanceof Error)) {
    return "PARSE_ERROR";
  }
  const text = err.message;
  const has = (needle) => text.includes(needle);
  // Ordered [code, predicate] pairs; order is significant.
  const rules = [
    ["ENCRYPTED", () => has("\uC554\uD638\uD654")],
    ["DRM_PROTECTED", () => has("DRM")],
    ["ZIP_BOMB", () => has("ZIP bomb") || has("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || has("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")],
    ["DECOMPRESSION_BOMB", () => has("bomb") || has("\uD06C\uAE30 \uCD08\uACFC") || has("\uC555\uCD95 \uD574\uC81C")],
    ["IMAGE_BASED_PDF", () => has("\uC774\uBBF8\uC9C0 \uAE30\uBC18")],
    ["NO_SECTIONS", () => has("\uC139\uC158") && (has("\uCC3E\uC744 \uC218 \uC5C6") || has("\uC5C6\uC74C"))],
    ["CORRUPTED", () => has("\uC2DC\uADF8\uB2C8\uCC98") || has("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")],
  ];
  const matched = rules.find(([, applies]) => applies());
  return matched ? matched[0] : "PARSE_ERROR";
}
|
|
123
|
+
/**
 * Coerce any thrown value into a KordocError that carries `code` and `stage`.
 *
 * - A KordocError is returned as the same object; `stage` and `code` are
 *   filled in from the arguments only when they are currently falsy.
 * - Any other Error becomes a new KordocError that reuses its message (or
 *   `fallbackMessage` when that message is empty) with a code from
 *   classifyError.
 * - Non-Error values become a new KordocError built from `fallbackMessage`
 *   and `fallbackCode`.
 *
 * @param err - the caught value.
 * @param fallbackMessage - message used when `err` supplies none.
 * @param stage - pipeline stage label to record (default "unknown").
 * @param fallbackCode - code used when none can be derived (default "PARSE_ERROR").
 * @returns a KordocError with `code` and `stage` populated.
 */
function normalizeKordocError(err, fallbackMessage, stage = "unknown", fallbackCode = "PARSE_ERROR") {
  if (err instanceof KordocError) {
    // Existing metadata wins; only gaps are filled.
    err.stage ||= stage;
    err.code ||= fallbackCode;
    return err;
  }
  const isError = err instanceof Error;
  const text = (isError ? err.message : fallbackMessage) || fallbackMessage;
  const code = isError ? classifyError(err) : fallbackCode;
  return new KordocError(text, { code, stage });
}
|
|
133
|
+
// Module-level bindings of src/utils.ts. They are declared here and only
// receive their values when init_utils() executes the callback below, so code
// that reads them must call init_utils() first.
// NOTE(review): __esm is defined outside this view — presumably it makes the
// callback run at most once; confirm against the bundler helper.
var VERSION, KordocError, SAFE_HREF_RE;
|
|
134
|
+
var init_utils = __esm({
|
|
135
|
+
"src/utils.ts"() {
|
|
136
|
+
"use strict";
|
|
137
|
+
// NOTE(review): the literal reads "2.5.2" although the diff header describes
// this build as package version 2.6.0 — confirm the value injected at build time.
VERSION = true ? "2.5.2" : "0.0.0-dev";
|
|
138
|
+
// Error type used by the library; carries an optional machine-readable
// `code` and the `stage` in which the failure occurred, both taken from `opts`.
KordocError = class extends Error {
|
|
139
|
+
code;
|
|
140
|
+
stage;
|
|
141
|
+
constructor(message, opts = {}) {
|
|
142
|
+
super(message);
|
|
143
|
+
this.name = "KordocError";
|
|
144
|
+
this.code = opts.code;
|
|
145
|
+
this.stage = opts.stage;
|
|
146
|
+
}
|
|
147
|
+
};
|
|
148
|
+
// Allow-list tested by sanitizeHref: values starting with http:, https:,
// mailto:, tel: or "#", matched case-insensitively.
SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
|
|
149
|
+
}
|
|
150
|
+
});
|
|
151
|
+
|
|
40
152
|
// src/page-range.ts
|
|
41
153
|
var page_range_exports = {};
|
|
42
154
|
__export(page_range_exports, {
|
|
@@ -2398,15 +2510,48 @@ var OCR_PROMPT, _tempDir;
|
|
|
2398
2510
|
var init_cli_provider = __esm({
|
|
2399
2511
|
"src/ocr/cli-provider.ts"() {
|
|
2400
2512
|
"use strict";
|
|
2401
|
-
OCR_PROMPT = `\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \
|
|
2402
|
-
|
|
2403
|
-
|
|
2404
|
-
- \
|
|
2405
|
-
- \uD5E4\uB529\uC740 \
|
|
2406
|
-
- \uB9AC\uC2A4\uD2B8\uB294
|
|
2407
|
-
- \uC774\uBBF8\uC9C0
|
|
2408
|
-
- \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\
|
|
2409
|
-
|
|
2513
|
+
OCR_PROMPT = `\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD45C\uB97C \uCD94\uCD9C\uD558\uC5EC \uC21C\uC218 Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uACE0, \uBA85\uBC31\uD55C OCR \uC624\uC778\uC2DD\uACFC \uD45C \uAD6C\uC870 \uAE68\uC9D0\uB9CC \uAD50\uC815\uD558\uC5EC \uCD5C\uC885 \uACB0\uACFC\uB97C \uCD9C\uB825\uD558\uC138\uC694.
|
|
2514
|
+
|
|
2515
|
+
[\uAE30\uBCF8 \uCD94\uCD9C \uADDC\uCE59]
|
|
2516
|
+
- \uBCF8\uBB38, \uD45C, \uC81C\uBAA9, \uB9AC\uC2A4\uD2B8, \uCEA1\uC158\uC744 \uC6D0\uBB38 \uAD6C\uC870 \uADF8\uB300\uB85C Markdown\uC73C\uB85C \uBCC0\uD658
|
|
2517
|
+
- \uD5E4\uB529\uC740 \uC2DC\uAC01\uC801 \uD06C\uAE30\xB7\uAD75\uAE30\uC5D0 \uB530\uB77C # ~ ###### \uC0AC\uC6A9
|
|
2518
|
+
- \uB9AC\uC2A4\uD2B8\uB294 -, 1. \uC0AC\uC6A9 (\uC6D0\uBB38 \uBC88\uD638 \uCCB4\uACC4\uAC00 \u2460, \uAC00., 1) \uB4F1\uC774\uBA74 \uADF8 \uD45C\uAE30 \uC720\uC9C0)
|
|
2519
|
+
- \uC774\uBBF8\uC9C0\xB7\uB3C4\uD615\xB7\uB85C\uACE0\xB7\uD398\uC774\uC9C0\uBC88\uD638\xB7\uBA38\uB9AC\uAE00/\uBC14\uB2E5\uAE00\uC740 \uBB34\uC2DC
|
|
2520
|
+
- \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C(\uC88C\u2192\uC6B0, \uC704\u2192\uC544\uB798, \uB2E4\uB2E8\uC774\uBA74 \uB2E8\uBCC4\uB85C)\uB97C \uC720\uC9C0
|
|
2521
|
+
|
|
2522
|
+
[\uD45C \uCD94\uCD9C \uADDC\uCE59 \u2014 \uAC00\uC7A5 \uC911\uC694]
|
|
2523
|
+
- \uD45C\uB294 \uBC18\uB4DC\uC2DC Markdown \uD30C\uC774\uD504 \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9: \uD5E4\uB354 \uD589 + |---| \uAD6C\uBD84\uC120 + \uB370\uC774\uD130 \uD589
|
|
2524
|
+
- \uC2DC\uAC01\uC801\uC73C\uB85C \uAD75\uAC70\uB098 \uC74C\uC601(\uD68C\uC0C9/\uC0C9\uC0C1) \uBC30\uACBD\uC774 \uC788\uB294 \uD589\uC744 \uD5E4\uB354\uB85C \uC2DD\uBCC4. \uD5E4\uB354\uAC00 \uC5C6\uB294 \uD45C\uB77C\uB3C4 \uCCAB \uD589\uC744 \uD5E4\uB354\uB85C \uB450\uACE0 |---| \uAD6C\uBD84\uC120 \uCD94\uAC00
|
|
2525
|
+
- \uBAA8\uB4E0 \uD589\uC758 \uD30C\uC774\uD504(|) \uAC1C\uC218\uB97C \uD5E4\uB354\uC640 \uB3D9\uC77C\uD558\uAC8C \uB9DE\uCD9C \uAC83 \u2014 \uBE48 \uC140\uC740 \uACF5\uBC31\uC73C\uB85C \uCC44\uC6B0\uACE0 \uC808\uB300 \uC140\uC744 \uC0DD\uB7B5\uD558\uC9C0 \uB9D0 \uAC83
|
|
2526
|
+
- \uC140 \uC548\uC758 \uC904\uBC14\uAFC8\uC740 <br>\uB85C \uD45C\uAE30 (\uC2E4\uC81C \uAC1C\uD589 \uBB38\uC790 \uC0AC\uC6A9 \uAE08\uC9C0 \u2014 \uD45C\uAC00 \uAE68\uC9D0)
|
|
2527
|
+
- \uBCD1\uD569 \uC140 \uCC98\uB9AC:
|
|
2528
|
+
- \uAC00\uB85C \uBCD1\uD569(colspan): \uBCD1\uD569\uB41C \uC140 \uB0B4\uC6A9\uC744 \uCCAB \uCE78\uC5D0 \uC4F0\uACE0 \uB098\uBA38\uC9C0 \uCE78\uC740 \uBE48 \uCE78\uC73C\uB85C \uB458 \uAC83
|
|
2529
|
+
- \uC138\uB85C \uBCD1\uD569(rowspan): \uBCD1\uD569\uB41C \uBAA8\uB4E0 \uD589\uC758 \uD574\uB2F9 \uCE78\uC5D0 \uB3D9\uC77C\uD55C \uB0B4\uC6A9\uC744 \uBC18\uBCF5 \uAE30\uC7AC\uD560 \uAC83
|
|
2530
|
+
- 2\uB2E8 \uD5E4\uB354(\uD5E4\uB354\uAC00 \uB450 \uC904\uC778 \uACBD\uC6B0): \uC0C1\uC704/\uD558\uC704 \uD5E4\uB354\uB97C " / "\uB85C \uD569\uCCD0 \uB2E8\uC77C \uD5E4\uB354 \uD589\uC73C\uB85C \uB9CC\uB4E4 \uAC83 (\uC608: "2024 / 1\uBD84\uAE30")
|
|
2531
|
+
- \uD45C \uC704\xB7\uC544\uB798\uC758 \uCEA1\uC158(\uC608: "<\uD45C 1> ...", "[\uD45C 2-1]")\uC740 \uD45C \uBC14\uB85C \uC704\uC5D0 \uBCC4\uB3C4 \uC904\uB85C \uBCF4\uC874
|
|
2532
|
+
- \uD55C \uD398\uC774\uC9C0\uC5D0\uC11C \uD45C\uAC00 \uC911\uAC04\uC5D0 \uB04A\uC5B4\uC838 \uBCF4\uC774\uB354\uB77C\uB3C4, \uD5E4\uB354\uAC00 \uB3D9\uC77C\uD558\uBA74 \uD558\uB098\uC758 \uD45C\uB85C \uC774\uC5B4\uBD99\uC77C \uAC83
|
|
2533
|
+
- \uC88C\uCE21 \uCCAB \uCEEC\uB7FC\uC774 \uC138\uB85C\uC4F0\uAE30 \uB77C\uBCA8(\uAD6C\uBD84/\uBD84\uB958/\uC5F0\uB3C4 \uB4F1)\uC774\uB77C\uB3C4 \uC77C\uBC18 \uC140\uB85C \uBCC0\uD658\uD558\uC5EC \uB204\uB77D \uAE08\uC9C0
|
|
2534
|
+
|
|
2535
|
+
[OCR \uC624\uC778\uC2DD\xB7\uAE68\uC9D0 \uAD50\uC815 \u2014 \uD5C8\uC6A9 \uBC94\uC704\uB9CC]
|
|
2536
|
+
- \uC22B\uC790 \uCE78\uC5D0\uC11C 'O'\u2192'0', 'l/I'\u2192'1' \uB4F1 \uB9E5\uB77D\uC0C1 \uBA85\uBC31\uD55C \uC624\uC778\uC2DD\uB9CC \uAD50\uC815
|
|
2537
|
+
- \uB2E8\uC5B4 \uC911\uAC04\uC5D0 \uC798\uBABB \uC0BD\uC785\uB41C \uACF5\uBC31 \uC81C\uAC70 (\uC608: "\uC8FC\uAC70 \uC885 \uD569 \uACC4\uD68D" \u2192 "\uC8FC\uAC70\uC885\uD569\uACC4\uD68D")
|
|
2538
|
+
- \uC904\uBC14\uAFC8 \uC624\uB958\uB85C \uB04A\uAE34 \uBB38\uC7A5\uC740 \uC758\uBBF8 \uB2E8\uC704\uB85C \uBCD1\uD569 (\uB2E8, \uD45C \uC140 \uB0B4\uBD80\uB294 <br> \uC720\uC9C0)
|
|
2539
|
+
- \uD45C\uC5D0\uC11C \uD589 \uC5B4\uAE0B\uB0A8\uC774 \uC758\uC2EC\uB418\uBA74(\uCEEC\uB7FC \uC218 \uBD88\uC77C\uCE58) \uBE48 \uC140\uB85C \uD328\uB529\uD558\uC5EC \uCEEC\uB7FC \uC815\uD569\uC131\uC744 \uC6B0\uC120 \uD655\uBCF4
|
|
2540
|
+
|
|
2541
|
+
[\uC808\uB300 \uAE08\uC9C0 \uC0AC\uD56D]
|
|
2542
|
+
- \uBB38\uC7A5\xB7\uB2E8\uB77D\xB7\uD56D\uBAA9\xB7\uD45C \uD589/\uC5F4\uC744 \uCD94\uAC00\uD558\uAC70\uB098 \uC0AD\uC81C\uD558\uC9C0 \uB9D0 \uAC83
|
|
2543
|
+
- \uC22B\uC790, \uD37C\uC13C\uD2B8, \uB0A0\uC9DC, \uB2E8\uC704, \uAE08\uC561\uC744 \uC808\uB300 \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83
|
|
2544
|
+
- \uACE0\uC720\uBA85\uC0AC, \uAE30\uAD00\uBA85, \uBC95\uB839\uBA85, \uC9C0\uBA85, \uC778\uBA85\uC744 \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83
|
|
2545
|
+
- \uD45C\uC758 \uD5E4\uB354 \uD14D\uC2A4\uD2B8, \uCEA1\uC158, \uD589/\uC5F4 \uAC1C\uC218\uB97C \uC784\uC758\uB85C \uBC14\uAFB8\uC9C0 \uB9D0 \uAC83
|
|
2546
|
+
- \uCCB4\uD06C\uBC15\uC2A4(\u2611/\u2610/\u25A0/\u25A1), \uD2B9\uC218\uBB38\uC790(\u203B, \u2460\u2461\u2462, \u3260, \uAC00., \uB098.)\uB97C \uC784\uC758\uB85C \uBC14\uAFB8\uC9C0 \uB9D0 \uAC83
|
|
2547
|
+
- \uC6D0\uBB38\uC5D0 \uC5C6\uB294 \uB0B4\uC6A9\uC744 \uC694\uC57D\xB7\uBCF4\uC644\xB7\uCD94\uB860\uD558\uC9C0 \uB9D0 \uAC83
|
|
2548
|
+
- \`\`\`, \`\`\`markdown \uAC19\uC740 \uCF54\uB4DC\uD39C\uC2A4\uB85C \uAC10\uC2F8\uC9C0 \uB9D0 \uAC83
|
|
2549
|
+
- \uC124\uBA85\xB7\uC8FC\uC11D\xB7\uBA54\uD0C0 \uBB38\uC7A5 \uCD94\uAC00 \uAE08\uC9C0
|
|
2550
|
+
|
|
2551
|
+
[\uBD88\uD655\uC2E4\uD560 \uB54C]
|
|
2552
|
+
- \uAE00\uC790\uAC00 \uD750\uB9BF\uD558\uAC70\uB098 \uD310\uB3C5 \uBD88\uAC00\uD558\uBA74 \uBCF4\uC774\uB294 \uADF8\uB300\uB85C \uB450\uAC70\uB098, \uBD88\uAC00\uD53C\uD558\uBA74 \uD55C \uAE00\uC790\uB9CC ?\uB85C \uD45C\uC2DC
|
|
2553
|
+
- \uD45C \uAD6C\uC870\uAC00 \uBAA8\uD638\uD558\uBA74 \uCEEC\uB7FC \uC218 \uC77C\uCE58 + \uBE48 \uC140 \uD328\uB529 \uD615\uD0DC\uB85C \uCD9C\uB825
|
|
2554
|
+
- \uCD94\uCE21\uBCF4\uB2E4 \uC6D0\uBB38 \uBCF4\uC874\uC744 \uD56D\uC0C1 \uC6B0\uC120`;
|
|
2410
2555
|
_tempDir = null;
|
|
2411
2556
|
}
|
|
2412
2557
|
});
|
|
@@ -2595,7 +2740,51 @@ var BATCH_OCR_PROMPT, DEFAULT_BATCH_SIZES, _batchTempDir;
|
|
|
2595
2740
|
var init_batch_provider = __esm({
|
|
2596
2741
|
"src/ocr/batch-provider.ts"() {
|
|
2597
2742
|
"use strict";
|
|
2598
|
-
BATCH_OCR_PROMPT =
|
|
2743
|
+
BATCH_OCR_PROMPT = `\uB2E4\uC74C \uBB38\uC11C \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uB4E4\uC744 OCR\uD558\uC5EC \uC21C\uC218 Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uACE0, \uBA85\uBC31\uD55C OCR \uC624\uC778\uC2DD\uACFC \uD45C \uAD6C\uC870 \uAE68\uC9D0\uB9CC \uAD50\uC815\uD558\uC5EC \uCD5C\uC885 \uACB0\uACFC\uB97C \uCD9C\uB825\uD558\uC138\uC694.
|
|
2744
|
+
|
|
2745
|
+
[\uD398\uC774\uC9C0 \uAD6C\uBD84 \u2014 \uD544\uC218]
|
|
2746
|
+
- \uAC01 \uD398\uC774\uC9C0 \uACB0\uACFC \uC0AC\uC774\uC5D0 \uBC18\uB4DC\uC2DC \uC774 \uAD6C\uBD84\uC790\uB97C \uC0BD\uC785: <!-- PAGE_BREAK -->
|
|
2747
|
+
|
|
2748
|
+
[\uAE30\uBCF8 \uCD94\uCD9C \uADDC\uCE59]
|
|
2749
|
+
- \uBCF8\uBB38, \uD45C, \uC81C\uBAA9, \uB9AC\uC2A4\uD2B8, \uCEA1\uC158\uC744 \uC6D0\uBB38 \uAD6C\uC870 \uADF8\uB300\uB85C Markdown\uC73C\uB85C \uBCC0\uD658
|
|
2750
|
+
- \uD5E4\uB529\uC740 \uC2DC\uAC01\uC801 \uD06C\uAE30\xB7\uAD75\uAE30\uC5D0 \uB530\uB77C # ~ ###### \uC0AC\uC6A9
|
|
2751
|
+
- \uB9AC\uC2A4\uD2B8\uB294 -, 1. \uC0AC\uC6A9 (\uC6D0\uBB38 \uBC88\uD638 \uCCB4\uACC4\uAC00 \u2460, \uAC00., 1) \uB4F1\uC774\uBA74 \uADF8 \uD45C\uAE30 \uC720\uC9C0)
|
|
2752
|
+
- \uC774\uBBF8\uC9C0\xB7\uB3C4\uD615\xB7\uB85C\uACE0\xB7\uD398\uC774\uC9C0\uBC88\uD638\xB7\uBA38\uB9AC\uAE00/\uBC14\uB2E5\uAE00\uC740 \uBB34\uC2DC
|
|
2753
|
+
- \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C(\uC88C\u2192\uC6B0, \uC704\u2192\uC544\uB798, \uB2E4\uB2E8\uC774\uBA74 \uB2E8\uBCC4\uB85C)\uB97C \uC720\uC9C0
|
|
2754
|
+
|
|
2755
|
+
[\uD45C \uCD94\uCD9C \uADDC\uCE59 \u2014 \uAC00\uC7A5 \uC911\uC694]
|
|
2756
|
+
- \uD45C\uB294 \uBC18\uB4DC\uC2DC Markdown \uD30C\uC774\uD504 \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9: \uD5E4\uB354 \uD589 + |---| \uAD6C\uBD84\uC120 + \uB370\uC774\uD130 \uD589
|
|
2757
|
+
- \uC2DC\uAC01\uC801\uC73C\uB85C \uAD75\uAC70\uB098 \uC74C\uC601(\uD68C\uC0C9/\uC0C9\uC0C1) \uBC30\uACBD\uC774 \uC788\uB294 \uD589\uC744 \uD5E4\uB354\uB85C \uC2DD\uBCC4. \uD5E4\uB354\uAC00 \uC5C6\uB294 \uD45C\uB77C\uB3C4 \uCCAB \uD589\uC744 \uD5E4\uB354\uB85C \uB450\uACE0 |---| \uAD6C\uBD84\uC120 \uCD94\uAC00
|
|
2758
|
+
- \uBAA8\uB4E0 \uD589\uC758 \uD30C\uC774\uD504(|) \uAC1C\uC218\uB97C \uD5E4\uB354\uC640 \uB3D9\uC77C\uD558\uAC8C \uB9DE\uCD9C \uAC83 \u2014 \uBE48 \uC140\uC740 \uACF5\uBC31\uC73C\uB85C \uCC44\uC6B0\uACE0 \uC808\uB300 \uC140\uC744 \uC0DD\uB7B5\uD558\uC9C0 \uB9D0 \uAC83
|
|
2759
|
+
- \uC140 \uC548\uC758 \uC904\uBC14\uAFC8\uC740 <br>\uB85C \uD45C\uAE30 (\uC2E4\uC81C \uAC1C\uD589 \uBB38\uC790 \uC0AC\uC6A9 \uAE08\uC9C0 \u2014 \uD45C\uAC00 \uAE68\uC9D0)
|
|
2760
|
+
- \uBCD1\uD569 \uC140 \uCC98\uB9AC:
|
|
2761
|
+
- \uAC00\uB85C \uBCD1\uD569(colspan): \uBCD1\uD569\uB41C \uC140 \uB0B4\uC6A9\uC744 \uCCAB \uCE78\uC5D0 \uC4F0\uACE0 \uB098\uBA38\uC9C0 \uCE78\uC740 \uBE48 \uCE78\uC73C\uB85C \uB458 \uAC83
|
|
2762
|
+
- \uC138\uB85C \uBCD1\uD569(rowspan): \uBCD1\uD569\uB41C \uBAA8\uB4E0 \uD589\uC758 \uD574\uB2F9 \uCE78\uC5D0 \uB3D9\uC77C\uD55C \uB0B4\uC6A9\uC744 \uBC18\uBCF5 \uAE30\uC7AC\uD560 \uAC83
|
|
2763
|
+
- 2\uB2E8 \uD5E4\uB354(\uD5E4\uB354\uAC00 \uB450 \uC904\uC778 \uACBD\uC6B0): \uC0C1\uC704/\uD558\uC704 \uD5E4\uB354\uB97C " / "\uB85C \uD569\uCCD0 \uB2E8\uC77C \uD5E4\uB354 \uD589\uC73C\uB85C \uB9CC\uB4E4 \uAC83 (\uC608: "2024 / 1\uBD84\uAE30")
|
|
2764
|
+
- \uD45C \uC704\xB7\uC544\uB798\uC758 \uCEA1\uC158(\uC608: "<\uD45C 1> ...", "[\uD45C 2-1]")\uC740 \uD45C \uBC14\uB85C \uC704\uC5D0 \uBCC4\uB3C4 \uC904\uB85C \uBCF4\uC874
|
|
2765
|
+
- \uD55C \uD398\uC774\uC9C0\uC5D0\uC11C \uD45C\uAC00 \uC911\uAC04\uC5D0 \uB04A\uC5B4\uC838 \uBCF4\uC774\uB354\uB77C\uB3C4, \uD5E4\uB354\uAC00 \uB3D9\uC77C\uD558\uBA74 \uD558\uB098\uC758 \uD45C\uB85C \uC774\uC5B4\uBD99\uC77C \uAC83
|
|
2766
|
+
- \uC88C\uCE21 \uCCAB \uCEEC\uB7FC\uC774 \uC138\uB85C\uC4F0\uAE30 \uB77C\uBCA8(\uAD6C\uBD84/\uBD84\uB958/\uC5F0\uB3C4 \uB4F1)\uC774\uB77C\uB3C4 \uC77C\uBC18 \uC140\uB85C \uBCC0\uD658\uD558\uC5EC \uB204\uB77D \uAE08\uC9C0
|
|
2767
|
+
|
|
2768
|
+
[OCR \uC624\uC778\uC2DD\xB7\uAE68\uC9D0 \uAD50\uC815 \u2014 \uD5C8\uC6A9 \uBC94\uC704\uB9CC]
|
|
2769
|
+
- \uC22B\uC790 \uCE78\uC5D0\uC11C 'O'\u2192'0', 'l/I'\u2192'1' \uB4F1 \uB9E5\uB77D\uC0C1 \uBA85\uBC31\uD55C \uC624\uC778\uC2DD\uB9CC \uAD50\uC815
|
|
2770
|
+
- \uB2E8\uC5B4 \uC911\uAC04\uC5D0 \uC798\uBABB \uC0BD\uC785\uB41C \uACF5\uBC31 \uC81C\uAC70 (\uC608: "\uC8FC\uAC70 \uC885 \uD569 \uACC4\uD68D" \u2192 "\uC8FC\uAC70\uC885\uD569\uACC4\uD68D")
|
|
2771
|
+
- \uC904\uBC14\uAFC8 \uC624\uB958\uB85C \uB04A\uAE34 \uBB38\uC7A5\uC740 \uC758\uBBF8 \uB2E8\uC704\uB85C \uBCD1\uD569 (\uB2E8, \uD45C \uC140 \uB0B4\uBD80\uB294 <br> \uC720\uC9C0)
|
|
2772
|
+
- \uD45C\uC5D0\uC11C \uD589 \uC5B4\uAE0B\uB0A8\uC774 \uC758\uC2EC\uB418\uBA74(\uCEEC\uB7FC \uC218 \uBD88\uC77C\uCE58) \uBE48 \uC140\uB85C \uD328\uB529\uD558\uC5EC \uCEEC\uB7FC \uC815\uD569\uC131\uC744 \uC6B0\uC120 \uD655\uBCF4
|
|
2773
|
+
|
|
2774
|
+
[\uC808\uB300 \uAE08\uC9C0 \uC0AC\uD56D]
|
|
2775
|
+
- \uBB38\uC7A5\xB7\uB2E8\uB77D\xB7\uD56D\uBAA9\xB7\uD45C \uD589/\uC5F4\uC744 \uCD94\uAC00\uD558\uAC70\uB098 \uC0AD\uC81C\uD558\uC9C0 \uB9D0 \uAC83
|
|
2776
|
+
- \uC22B\uC790, \uD37C\uC13C\uD2B8, \uB0A0\uC9DC, \uB2E8\uC704, \uAE08\uC561\uC744 \uC808\uB300 \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83
|
|
2777
|
+
- \uACE0\uC720\uBA85\uC0AC, \uAE30\uAD00\uBA85, \uBC95\uB839\uBA85, \uC9C0\uBA85, \uC778\uBA85\uC744 \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83
|
|
2778
|
+
- \uD45C\uC758 \uD5E4\uB354 \uD14D\uC2A4\uD2B8, \uCEA1\uC158, \uD589/\uC5F4 \uAC1C\uC218\uB97C \uC784\uC758\uB85C \uBC14\uAFB8\uC9C0 \uB9D0 \uAC83
|
|
2779
|
+
- \uCCB4\uD06C\uBC15\uC2A4(\u2611/\u2610/\u25A0/\u25A1), \uD2B9\uC218\uBB38\uC790(\u203B, \u2460\u2461\u2462, \u3260, \uAC00., \uB098.)\uB97C \uC784\uC758\uB85C \uBC14\uAFB8\uC9C0 \uB9D0 \uAC83
|
|
2780
|
+
- \uC6D0\uBB38\uC5D0 \uC5C6\uB294 \uB0B4\uC6A9\uC744 \uC694\uC57D\xB7\uBCF4\uC644\xB7\uCD94\uB860\uD558\uC9C0 \uB9D0 \uAC83
|
|
2781
|
+
- \`\`\`, \`\`\`markdown \uAC19\uC740 \uCF54\uB4DC\uD39C\uC2A4\uB85C \uAC10\uC2F8\uC9C0 \uB9D0 \uAC83
|
|
2782
|
+
- \uC124\uBA85\xB7\uC8FC\uC11D\xB7\uBA54\uD0C0 \uBB38\uC7A5 \uCD94\uAC00 \uAE08\uC9C0
|
|
2783
|
+
|
|
2784
|
+
[\uBD88\uD655\uC2E4\uD560 \uB54C]
|
|
2785
|
+
- \uAE00\uC790\uAC00 \uD750\uB9BF\uD558\uAC70\uB098 \uD310\uB3C5 \uBD88\uAC00\uD558\uBA74 \uBCF4\uC774\uB294 \uADF8\uB300\uB85C \uB450\uAC70\uB098, \uBD88\uAC00\uD53C\uD558\uBA74 \uD55C \uAE00\uC790\uB9CC ?\uB85C \uD45C\uC2DC
|
|
2786
|
+
- \uD45C \uAD6C\uC870\uAC00 \uBAA8\uD638\uD558\uBA74 \uCEEC\uB7FC \uC218 \uC77C\uCE58 + \uBE48 \uC140 \uD328\uB529 \uD615\uD0DC\uB85C \uCD9C\uB825
|
|
2787
|
+
- \uCD94\uCE21\uBCF4\uB2E4 \uC6D0\uBB38 \uBCF4\uC874\uC744 \uD56D\uC0C1 \uC6B0\uC120`;
|
|
2599
2788
|
DEFAULT_BATCH_SIZES = {
|
|
2600
2789
|
gemini: 5,
|
|
2601
2790
|
claude: 5,
|
|
@@ -2982,7 +3171,7 @@ var init_provider = __esm({
|
|
|
2982
3171
|
});
|
|
2983
3172
|
|
|
2984
3173
|
// src/index.ts
|
|
2985
|
-
import { readFile as
|
|
3174
|
+
import { readFile as readFile3 } from "fs/promises";
|
|
2986
3175
|
|
|
2987
3176
|
// src/detect.ts
|
|
2988
3177
|
import JSZip from "jszip";
|
|
@@ -3034,97 +3223,8 @@ async function detectZipFormat(buffer) {
|
|
|
3034
3223
|
import JSZip2 from "jszip";
|
|
3035
3224
|
import { DOMParser } from "@xmldom/xmldom";
|
|
3036
3225
|
|
|
3037
|
-
// src/utils.ts
|
|
3038
|
-
var VERSION = true ? "2.5.0" : "0.0.0-dev";
|
|
3039
|
-
function toArrayBuffer(buf) {
|
|
3040
|
-
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
3041
|
-
return buf.buffer;
|
|
3042
|
-
}
|
|
3043
|
-
return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
|
|
3044
|
-
}
|
|
3045
|
-
var KordocError = class extends Error {
|
|
3046
|
-
code;
|
|
3047
|
-
stage;
|
|
3048
|
-
constructor(message, opts = {}) {
|
|
3049
|
-
super(message);
|
|
3050
|
-
this.name = "KordocError";
|
|
3051
|
-
this.code = opts.code;
|
|
3052
|
-
this.stage = opts.stage;
|
|
3053
|
-
}
|
|
3054
|
-
};
|
|
3055
|
-
function isPathTraversal(name) {
|
|
3056
|
-
if (name.includes("\0")) return true;
|
|
3057
|
-
const normalized = name.replace(/\\/g, "/");
|
|
3058
|
-
return normalized.includes("..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
|
|
3059
|
-
}
|
|
3060
|
-
function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
|
|
3061
|
-
try {
|
|
3062
|
-
const data = new DataView(buffer);
|
|
3063
|
-
const len = buffer.byteLength;
|
|
3064
|
-
let eocdOffset = -1;
|
|
3065
|
-
for (let i = len - 22; i >= Math.max(0, len - 65557); i--) {
|
|
3066
|
-
if (data.getUint32(i, true) === 101010256) {
|
|
3067
|
-
eocdOffset = i;
|
|
3068
|
-
break;
|
|
3069
|
-
}
|
|
3070
|
-
}
|
|
3071
|
-
if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 };
|
|
3072
|
-
const entryCount = data.getUint16(eocdOffset + 10, true);
|
|
3073
|
-
if (entryCount > maxEntries) {
|
|
3074
|
-
throw new KordocError(`ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC: ${entryCount} (\uCD5C\uB300 ${maxEntries})`);
|
|
3075
|
-
}
|
|
3076
|
-
const cdSize = data.getUint32(eocdOffset + 12, true);
|
|
3077
|
-
const cdOffset = data.getUint32(eocdOffset + 16, true);
|
|
3078
|
-
if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount };
|
|
3079
|
-
let totalUncompressed = 0;
|
|
3080
|
-
let pos = cdOffset;
|
|
3081
|
-
for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {
|
|
3082
|
-
if (data.getUint32(pos, true) !== 33639248) break;
|
|
3083
|
-
totalUncompressed += data.getUint32(pos + 24, true);
|
|
3084
|
-
const nameLen = data.getUint16(pos + 28, true);
|
|
3085
|
-
const extraLen = data.getUint16(pos + 30, true);
|
|
3086
|
-
const commentLen = data.getUint16(pos + 32, true);
|
|
3087
|
-
pos += 46 + nameLen + extraLen + commentLen;
|
|
3088
|
-
}
|
|
3089
|
-
if (totalUncompressed > maxUncompressedSize) {
|
|
3090
|
-
throw new KordocError(`ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 ${maxUncompressedSize / 1024 / 1024}MB)`);
|
|
3091
|
-
}
|
|
3092
|
-
return { totalUncompressed, entryCount };
|
|
3093
|
-
} catch (err) {
|
|
3094
|
-
if (err instanceof KordocError) throw err;
|
|
3095
|
-
return { totalUncompressed: 0, entryCount: 0 };
|
|
3096
|
-
}
|
|
3097
|
-
}
|
|
3098
|
-
var SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
|
|
3099
|
-
function sanitizeHref(href) {
|
|
3100
|
-
const trimmed = href.trim();
|
|
3101
|
-
if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
|
|
3102
|
-
return trimmed;
|
|
3103
|
-
}
|
|
3104
|
-
function classifyError(err) {
|
|
3105
|
-
if (!(err instanceof Error)) return "PARSE_ERROR";
|
|
3106
|
-
const msg = err.message;
|
|
3107
|
-
if (msg.includes("\uC554\uD638\uD654")) return "ENCRYPTED";
|
|
3108
|
-
if (msg.includes("DRM")) return "DRM_PROTECTED";
|
|
3109
|
-
if (msg.includes("ZIP bomb") || msg.includes("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || msg.includes("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")) return "ZIP_BOMB";
|
|
3110
|
-
if (msg.includes("bomb") || msg.includes("\uD06C\uAE30 \uCD08\uACFC") || msg.includes("\uC555\uCD95 \uD574\uC81C")) return "DECOMPRESSION_BOMB";
|
|
3111
|
-
if (msg.includes("\uC774\uBBF8\uC9C0 \uAE30\uBC18")) return "IMAGE_BASED_PDF";
|
|
3112
|
-
if (msg.includes("\uC139\uC158") && (msg.includes("\uCC3E\uC744 \uC218 \uC5C6") || msg.includes("\uC5C6\uC74C"))) return "NO_SECTIONS";
|
|
3113
|
-
if (msg.includes("\uC2DC\uADF8\uB2C8\uCC98") || msg.includes("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")) return "CORRUPTED";
|
|
3114
|
-
return "PARSE_ERROR";
|
|
3115
|
-
}
|
|
3116
|
-
function normalizeKordocError(err, fallbackMessage, stage = "unknown", fallbackCode = "PARSE_ERROR") {
|
|
3117
|
-
if (err instanceof KordocError) {
|
|
3118
|
-
if (!err.stage) err.stage = stage;
|
|
3119
|
-
if (!err.code) err.code = fallbackCode;
|
|
3120
|
-
return err;
|
|
3121
|
-
}
|
|
3122
|
-
const message = err instanceof Error ? err.message : fallbackMessage;
|
|
3123
|
-
const code = err instanceof Error ? classifyError(err) : fallbackCode;
|
|
3124
|
-
return new KordocError(message || fallbackMessage, { code, stage });
|
|
3125
|
-
}
|
|
3126
|
-
|
|
3127
3226
|
// src/table/builder.ts
|
|
3227
|
+
init_utils();
|
|
3128
3228
|
var MAX_COLS = 200;
|
|
3129
3229
|
var MAX_ROWS = 1e4;
|
|
3130
3230
|
function buildTable(rows) {
|
|
@@ -3384,6 +3484,8 @@ var HEADING_RATIO_H2 = 1.3;
|
|
|
3384
3484
|
var HEADING_RATIO_H3 = 1.15;
|
|
3385
3485
|
|
|
3386
3486
|
// src/hwpx/parser.ts
|
|
3487
|
+
init_utils();
|
|
3488
|
+
init_utils();
|
|
3387
3489
|
init_page_range();
|
|
3388
3490
|
init_logger();
|
|
3389
3491
|
var MAX_DECOMPRESS_SIZE = 500 * 1024 * 1024;
|
|
@@ -4225,6 +4327,7 @@ function extractTextFromNode(node) {
|
|
|
4225
4327
|
}
|
|
4226
4328
|
|
|
4227
4329
|
// src/hwp5/record.ts
|
|
4330
|
+
init_utils();
|
|
4228
4331
|
import { inflateRawSync, inflateSync } from "zlib";
|
|
4229
4332
|
var TAG_PARA_HEADER = 66;
|
|
4230
4333
|
var TAG_PARA_TEXT = 67;
|
|
@@ -5275,6 +5378,7 @@ function parseLenientCfb(data) {
|
|
|
5275
5378
|
}
|
|
5276
5379
|
|
|
5277
5380
|
// src/hwp5/parser.ts
|
|
5381
|
+
init_utils();
|
|
5278
5382
|
init_page_range();
|
|
5279
5383
|
init_logger();
|
|
5280
5384
|
var CFB = __toESM(require_cfb(), 1);
|
|
@@ -5930,6 +6034,7 @@ function arrangeCells(rows, cols, cells) {
|
|
|
5930
6034
|
}
|
|
5931
6035
|
|
|
5932
6036
|
// src/pdf/parser.ts
|
|
6037
|
+
init_utils();
|
|
5933
6038
|
init_page_range();
|
|
5934
6039
|
import { createRequire } from "module";
|
|
5935
6040
|
import { dirname as dirname2, join as join3, resolve as resolve2 } from "path";
|
|
@@ -7821,6 +7926,7 @@ function mergeKoreanLines(text) {
|
|
|
7821
7926
|
}
|
|
7822
7927
|
|
|
7823
7928
|
// src/xlsx/parser.ts
|
|
7929
|
+
init_utils();
|
|
7824
7930
|
import JSZip3 from "jszip";
|
|
7825
7931
|
import { DOMParser as DOMParser2 } from "@xmldom/xmldom";
|
|
7826
7932
|
init_logger();
|
|
@@ -8149,6 +8255,7 @@ async function parseXlsxDocument(buffer, options, existingZip) {
|
|
|
8149
8255
|
}
|
|
8150
8256
|
|
|
8151
8257
|
// src/docx/parser.ts
|
|
8258
|
+
init_utils();
|
|
8152
8259
|
import JSZip4 from "jszip";
|
|
8153
8260
|
import { DOMParser as DOMParser3 } from "@xmldom/xmldom";
|
|
8154
8261
|
init_logger();
|
|
@@ -8630,6 +8737,7 @@ async function parseDocxDocument(buffer, options, existingZip) {
|
|
|
8630
8737
|
}
|
|
8631
8738
|
|
|
8632
8739
|
// src/index.ts
|
|
8740
|
+
init_utils();
|
|
8633
8741
|
init_cli_provider();
|
|
8634
8742
|
init_markdown_to_blocks();
|
|
8635
8743
|
init_logger();
|
|
@@ -11131,6 +11239,187 @@ async function markdownToXlsx(markdown, options) {
|
|
|
11131
11239
|
return buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength);
|
|
11132
11240
|
}
|
|
11133
11241
|
|
|
11242
|
+
// src/convert/index.ts
|
|
11243
|
+
import { readFile } from "fs/promises";
|
|
11244
|
+
init_utils();
|
|
11245
|
+
|
|
11246
|
+
// src/convert/libreoffice.ts
|
|
11247
|
+
import libre from "libreoffice-convert";
|
|
11248
|
+
|
|
11249
|
+
// src/convert/error.ts
|
|
11250
|
+
/**
 * Error raised by the document-conversion helpers.
 *
 * `code` is a machine-readable identifier supplied by the caller (this file
 * uses "SOFFICE_NOT_FOUND", "TIMEOUT" and "CONVERT_FAILED"); `message` is the
 * human-readable description.
 */
var ConvertError = class extends Error {
  constructor(code, message) {
    super(message);
    // Same assignment order as before: code first, then name.
    Object.assign(this, { code, name: "ConvertError" });
  }
};
|
|
11257
|
+
|
|
11258
|
+
// src/convert/libreoffice.ts
|
|
11259
|
+
var libreConvert = libre.convert;
|
|
11260
|
+
/**
 * Verify that the `soffice` executable can be launched, by running
 * `soffice --version`.
 *
 * The previous implementation destructured `runCommand` from `utils_exports`,
 * but that namespace exports no such name, so the call always failed and the
 * function reported SOFFICE_NOT_FOUND even when LibreOffice was installed.
 * The probe now uses Node's own `child_process.execFile`.
 *
 * @returns resolves with no value when the command exits successfully.
 * @throws ConvertError with code "SOFFICE_NOT_FOUND" when the command cannot
 *   be started or exits with a failure.
 */
async function assertSofficeAvailable() {
  // Loaded lazily so the modules are only pulled in when a conversion is requested.
  const { execFile } = await import("child_process");
  const { promisify } = await import("util");
  const execFileAsync = promisify(execFile);
  try {
    // execFile runs the binary directly (no shell), so no quoting is involved.
    await execFileAsync("soffice", ["--version"]);
  } catch {
    throw new ConvertError(
      "SOFFICE_NOT_FOUND",
      "soffice\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4. LibreOffice\uB97C \uC124\uCE58\uD574 \uC8FC\uC138\uC694."
    );
  }
}
|
|
11271
|
+
/**
 * Convert a document buffer to another format through `libreConvert`, with a
 * timeout.
 *
 * @param buffer - input document bytes.
 * @param targetExt - target extension passed through to `libreConvert`.
 * @param timeoutMs - milliseconds to wait before rejecting (default 60000).
 * @returns resolves with the converted buffer handed to the callback.
 * @throws ConvertError "TIMEOUT" when the callback has not fired within
 *   `timeoutMs`; ConvertError "CONVERT_FAILED" when the callback reports an
 *   error or no output. An exception thrown synchronously by `libreConvert`
 *   is propagated unchanged.
 */
async function convertBuffer(buffer, targetExt, timeoutMs = 6e4) {
  return new Promise((resolve4, reject) => {
    const timer = setTimeout(() => {
      reject(
        new ConvertError("TIMEOUT", `\uBCC0\uD658 \uD0C0\uC784\uC544\uC6C3 (${timeoutMs}ms \uCD08\uACFC)`)
      );
    }, timeoutMs);
    try {
      libreConvert(buffer, targetExt, void 0, (err, done) => {
        clearTimeout(timer);
        if (err || !done) {
          reject(
            new ConvertError(
              "CONVERT_FAILED",
              err?.message ?? "LibreOffice \uBCC0\uD658 \uC2E4\uD328"
            )
          );
          return;
        }
        resolve4(done);
      });
    } catch (err) {
      // A synchronous throw already rejects the promise; without clearing the
      // timer here it would stay armed and keep the event loop alive for up to
      // timeoutMs.
      clearTimeout(timer);
      reject(err);
    }
  });
}
|
|
11293
|
+
|
|
11294
|
+
// src/convert/index.ts
|
|
11295
|
+
// Single-slot lock state: `isConverting` is true while a holder owns the lock;
// `queue` holds wake-up callbacks for waiters in arrival order.
var isConverting = false;
var queue = [];

/**
 * Build the release callback for one lock acquisition.
 *
 * The callback acts only once. A repeated call is ignored, so a holder that
 * releases twice can neither clear the flag while another holder owns the
 * lock nor wake a second waiter.
 */
function createConvertLockRelease() {
  let released = false;
  return () => {
    if (released) return;
    released = true;
    isConverting = false;
    const next = queue.shift();
    next?.();
  };
}

/**
 * Acquire the conversion lock.
 *
 * When the lock is free it is taken immediately; otherwise the caller is
 * queued and resumed in FIFO order once the current holder releases.
 *
 * @returns resolves with a release function that the holder must call when
 *   done; calling it more than once has no further effect.
 */
async function acquireConvertLock() {
  if (!isConverting) {
    isConverting = true;
    return createConvertLockRelease();
  }
  return new Promise((resolve4) => {
    queue.push(() => {
      // Ownership is handed over before the waiter resumes, so a new caller
      // arriving in between still sees the lock as taken.
      isConverting = true;
      resolve4(createConvertLockRelease());
    });
  });
}
|
|
11317
|
+
/**
 * Builds the failure result shape shared by the PDF conversion functions.
 *
 * @param {string} code - Machine-readable failure code.
 * @param {string} error - Human-readable failure message.
 * @param {string} stage - Stage at which the failure occurred.
 * @returns {{success: false, code: string, error: string, stage: string}}
 */
function convertFailure(code, error, stage) {
  return { success: false, code, error, stage };
}

/**
 * Shared implementation behind convertToPdf, convertHwpToPdf and
 * convertHwpxToPdf.
 *
 * Steps: load the input into a Buffer, enforce the 500 MB size cap, detect
 * the format, optionally enforce a required format, probe for soffice, then
 * convert while holding the module-wide conversion lock.
 *
 * @param {string|Buffer|ArrayBuffer|Uint8Array} input - File path (string) or
 *   in-memory bytes; non-Buffer values are passed to `Buffer.from`.
 * @param {object} [options] - Optional `onProgress(percent, stage)` callback
 *   and `timeoutMs` forwarded to convertBuffer.
 * @param {{format: string, label: string}} [required] - When given, inputs
 *   detected as a different format are rejected before any conversion work.
 * @returns {Promise<object>} `{ success: true, pdf, sourceFormat }` on
 *   success, otherwise `{ success: false, code, error, stage }`.
 * @throws Rethrows any non-ConvertError raised by the soffice probe.
 */
async function convertDetectedToPdf(input, options, required) {
  let buffer;
  try {
    if (typeof input === "string") {
      buffer = await readFile(input);
    } else if (Buffer.isBuffer(input)) {
      buffer = input;
    } else {
      buffer = Buffer.from(input);
    }
  } catch (err) {
    return convertFailure(
      "PARSE_ERROR",
      `\uC785\uB825 \uC77D\uAE30 \uC2E4\uD328: ${err instanceof Error ? err.message : String(err)}`,
      "detect"
    );
  }

  const MAX_FILE_SIZE = 500 * 1024 * 1024;
  if (buffer.length > MAX_FILE_SIZE) {
    return convertFailure(
      "FILE_TOO_LARGE",
      `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${(buffer.length / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 500MB)`,
      "detect"
    );
  }

  const format = detectFormat(toArrayBuffer(buffer));
  if (format !== "hwp" && format !== "hwpx") {
    return convertFailure(
      "UNSUPPORTED_FORMAT",
      `\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD3EC\uB9F7\uC785\uB2C8\uB2E4: ${format}`,
      "detect"
    );
  }
  // Reject a format mismatch here, before the soffice probe and the
  // LibreOffice run, so a wrong-format input never occupies the conversion
  // lock only to have its output discarded.
  if (required && format !== required.format) {
    return convertFailure(
      "UNSUPPORTED_FORMAT",
      `${required.label} \uD3EC\uB9F7\uC774 \uC544\uB2D9\uB2C8\uB2E4: ${format}`,
      "detect"
    );
  }

  try {
    await assertSofficeAvailable();
  } catch (err) {
    if (err instanceof ConvertError) {
      return convertFailure(err.code, err.message, "validate");
    }
    throw err;
  }

  const releaseLock = await acquireConvertLock();
  try {
    options?.onProgress?.(10, "convert");
    const pdf = await convertBuffer(buffer, ".pdf", options?.timeoutMs);
    options?.onProgress?.(100, "done");
    return {
      success: true,
      pdf: new Uint8Array(pdf),
      sourceFormat: format
    };
  } catch (err) {
    if (err instanceof ConvertError) {
      return convertFailure(err.code, err.message, "convert");
    }
    return convertFailure(
      classifyError(err),
      err instanceof Error ? err.message : "\uBCC0\uD658 \uC2E4\uD328",
      "convert"
    );
  } finally {
    // Always hand the lock on, whether the conversion succeeded or failed.
    releaseLock();
  }
}

/**
 * Converts an HWP or HWPX document to PDF via LibreOffice.
 *
 * @param {string|Buffer|ArrayBuffer|Uint8Array} input - File path or bytes.
 * @param {object} [options] - Optional `onProgress` callback and `timeoutMs`.
 * @returns {Promise<object>} Success result with `pdf` (Uint8Array) and
 *   `sourceFormat`, or a failure result with `code`, `error` and `stage`.
 */
async function convertToPdf(input, options) {
  return convertDetectedToPdf(input, options);
}

/**
 * Converts an HWP 5.x document to PDF; any other detected format yields an
 * "UNSUPPORTED_FORMAT" failure without running a conversion.
 *
 * @param {string|Buffer|ArrayBuffer|Uint8Array} input - File path or bytes.
 * @param {object} [options] - Optional `onProgress` callback and `timeoutMs`.
 * @returns {Promise<object>} Same result shape as convertToPdf.
 */
async function convertHwpToPdf(input, options) {
  return convertDetectedToPdf(input, options, { format: "hwp", label: "HWP 5.x" });
}

/**
 * Converts an HWPX document to PDF; any other detected format yields an
 * "UNSUPPORTED_FORMAT" failure without running a conversion.
 *
 * @param {string|Buffer|ArrayBuffer|Uint8Array} input - File path or bytes.
 * @param {object} [options] - Optional `onProgress` callback and `timeoutMs`.
 * @returns {Promise<object>} Same result shape as convertToPdf.
 */
async function convertHwpxToPdf(input, options) {
  return convertDetectedToPdf(input, options, { format: "hwpx", label: "HWPX" });
}
|
|
11419
|
+
|
|
11420
|
+
// src/index.ts
|
|
11421
|
+
init_utils();
|
|
11422
|
+
|
|
11134
11423
|
// src/ocr/api-key-rotation.ts
|
|
11135
11424
|
var AllKeysCoolingDownError = class extends Error {
|
|
11136
11425
|
waitMs;
|
|
@@ -11225,11 +11514,10 @@ var ApiKeyRotationPool = class _ApiKeyRotationPool {
|
|
|
11225
11514
|
};
|
|
11226
11515
|
|
|
11227
11516
|
// src/pipeline/unified-ocr.ts
|
|
11228
|
-
import { mkdir, readdir, readFile, stat, writeFile } from "fs/promises";
|
|
11517
|
+
import { mkdir, readdir, readFile as readFile2, stat, writeFile } from "fs/promises";
|
|
11229
11518
|
import { basename as basename2, dirname as dirname3, extname, join as join4, resolve as resolve3 } from "path";
|
|
11230
11519
|
import { spawn as spawn2 } from "child_process";
|
|
11231
11520
|
import { performance } from "perf_hooks";
|
|
11232
|
-
import libre from "libreoffice-convert";
|
|
11233
11521
|
init_logger();
|
|
11234
11522
|
|
|
11235
11523
|
// src/pipeline/bounded-queue.ts
|
|
@@ -11291,7 +11579,6 @@ var BoundedQueue = class {
|
|
|
11291
11579
|
};
|
|
11292
11580
|
|
|
11293
11581
|
// src/pipeline/unified-ocr.ts
|
|
11294
|
-
var libreConvert = libre.convert;
|
|
11295
11582
|
var UnifiedOcrError = class extends Error {
|
|
11296
11583
|
code;
|
|
11297
11584
|
stage;
|
|
@@ -11406,8 +11693,8 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11406
11693
|
if (extname(absInput).toLowerCase() !== ".pdf") {
|
|
11407
11694
|
await assertSofficeAvailable();
|
|
11408
11695
|
workingPdfPath = join4(workspaceDir, `${stem}.pdf`);
|
|
11409
|
-
const inputBuffer = await
|
|
11410
|
-
const out = await
|
|
11696
|
+
const inputBuffer = await readFile2(absInput);
|
|
11697
|
+
const out = await convertBuffer(inputBuffer, ".pdf");
|
|
11411
11698
|
await writeFile(workingPdfPath, out);
|
|
11412
11699
|
}
|
|
11413
11700
|
timingsMs.convert = elapsedMs(convertStart);
|
|
@@ -11458,7 +11745,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11458
11745
|
const keyCount = keyPool.snapshot().length;
|
|
11459
11746
|
const workerCount = Math.max(1, keyCount * concurrencyPerKey);
|
|
11460
11747
|
const queueCapacity = workerCount * 2;
|
|
11461
|
-
const
|
|
11748
|
+
const queue2 = new BoundedQueue(queueCapacity);
|
|
11462
11749
|
const ocrStart = performance.now();
|
|
11463
11750
|
currentStage = "ocr";
|
|
11464
11751
|
markStageStart("ocr", `OCR \uC9C4\uD589 \uC911 (\uC6CC\uCEE4 ${workerCount}\uAC1C)`);
|
|
@@ -11466,17 +11753,17 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11466
11753
|
let renderDone = 1;
|
|
11467
11754
|
const renderProducer = (async () => {
|
|
11468
11755
|
try {
|
|
11469
|
-
await
|
|
11756
|
+
await queue2.enqueue({ pageNumber: 1, imagePath: probeImage });
|
|
11470
11757
|
if (totalPages > 1) {
|
|
11471
11758
|
for await (const item of renderPdfToPngStream(workingPdfPath, join4(imagesDir, "page"), dpi, totalPages, 2)) {
|
|
11472
|
-
await
|
|
11759
|
+
await queue2.enqueue(item);
|
|
11473
11760
|
renderDone++;
|
|
11474
11761
|
markStageProgress("render", Math.round(renderDone / totalPages * 100), renderDone, totalPages, `\uD398\uC774\uC9C0 ${renderDone}/${totalPages} \uB80C\uB354\uB9C1`);
|
|
11475
11762
|
logStage("debug", "render", "progress", "\uD398\uC774\uC9C0 \uB80C\uB354 \uC644\uB8CC", { page: item.pageNumber });
|
|
11476
11763
|
}
|
|
11477
11764
|
}
|
|
11478
11765
|
} finally {
|
|
11479
|
-
|
|
11766
|
+
queue2.close();
|
|
11480
11767
|
timingsMs.render = elapsedMs(renderStart);
|
|
11481
11768
|
markStageDone("render", "\uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC");
|
|
11482
11769
|
logStage("info", "render", "done", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC", { pages: renderDone, elapsedMs: timingsMs.render });
|
|
@@ -11485,7 +11772,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11485
11772
|
const [, pageResultsMap] = await Promise.all([
|
|
11486
11773
|
renderProducer,
|
|
11487
11774
|
ocrWorkerPool({
|
|
11488
|
-
queue,
|
|
11775
|
+
queue: queue2,
|
|
11489
11776
|
workerCount,
|
|
11490
11777
|
totalPages,
|
|
11491
11778
|
ocrInput: {
|
|
@@ -11605,17 +11892,6 @@ function emitProgress(cb, stage, stagePercent, weights, extra) {
|
|
|
11605
11892
|
model: extra.model
|
|
11606
11893
|
});
|
|
11607
11894
|
}
|
|
11608
|
-
async function convertWithLibreOffice(buffer, ext) {
|
|
11609
|
-
return await new Promise((resolvePromise, reject) => {
|
|
11610
|
-
libreConvert(buffer, ext, void 0, (err, done) => {
|
|
11611
|
-
if (err || !done) {
|
|
11612
|
-
reject(new UnifiedOcrError("CONVERT_FAILED", "convert", err?.message ?? "LibreOffice \uBCC0\uD658 \uC2E4\uD328"));
|
|
11613
|
-
return;
|
|
11614
|
-
}
|
|
11615
|
-
resolvePromise(done);
|
|
11616
|
-
});
|
|
11617
|
-
});
|
|
11618
|
-
}
|
|
11619
11895
|
async function getPdfPageCount(pdfPath) {
|
|
11620
11896
|
const stdout = await runCommandWithStdout("pdfinfo", [pdfPath]);
|
|
11621
11897
|
const m = stdout.match(/^\s*Pages:\s*(\d+)\s*$/mi);
|
|
@@ -11688,13 +11964,6 @@ async function runCommandWithStdout(cmd, args) {
|
|
|
11688
11964
|
});
|
|
11689
11965
|
});
|
|
11690
11966
|
}
|
|
11691
|
-
async function assertSofficeAvailable() {
|
|
11692
|
-
try {
|
|
11693
|
-
await runCommand("soffice", ["--version"]);
|
|
11694
|
-
} catch {
|
|
11695
|
-
throw new UnifiedOcrError("SOFFICE_NOT_FOUND", "convert", "soffice\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4. LibreOffice\uB97C \uC124\uCE58\uD574 \uC8FC\uC138\uC694.");
|
|
11696
|
-
}
|
|
11697
|
-
}
|
|
11698
11967
|
function naturalPageSort(a, b) {
|
|
11699
11968
|
const na = Number((a.match(/\d+/g) || ["0"]).at(-1) || 0);
|
|
11700
11969
|
const nb = Number((b.match(/\d+/g) || ["0"]).at(-1) || 0);
|
|
@@ -11768,7 +12037,7 @@ function startParallelProbeRuns(input) {
|
|
|
11768
12037
|
}
|
|
11769
12038
|
async function loadModelCache(path) {
|
|
11770
12039
|
try {
|
|
11771
|
-
const raw = await
|
|
12040
|
+
const raw = await readFile2(path, "utf-8");
|
|
11772
12041
|
return JSON.parse(raw);
|
|
11773
12042
|
} catch {
|
|
11774
12043
|
return null;
|
|
@@ -11802,12 +12071,12 @@ async function updateModelCache(path, probes) {
|
|
|
11802
12071
|
await writeFile(path, JSON.stringify(current, null, 2), "utf-8");
|
|
11803
12072
|
}
|
|
11804
12073
|
async function ocrWorkerPool(input) {
|
|
11805
|
-
const { queue, workerCount, ocrInput, onPageDone } = input;
|
|
12074
|
+
const { queue: queue2, workerCount, ocrInput, onPageDone } = input;
|
|
11806
12075
|
const results = /* @__PURE__ */ new Map();
|
|
11807
12076
|
let completedCount = 0;
|
|
11808
12077
|
async function worker() {
|
|
11809
12078
|
while (true) {
|
|
11810
|
-
const item = await
|
|
12079
|
+
const item = await queue2.dequeue();
|
|
11811
12080
|
if (item === QUEUE_DONE) break;
|
|
11812
12081
|
const { pageNumber, imagePath, error } = item;
|
|
11813
12082
|
if (imagePath === null) {
|
|
@@ -11859,7 +12128,7 @@ async function ocrImageWithFallback(input) {
|
|
|
11859
12128
|
async function mergeMarkdownPages(paths) {
|
|
11860
12129
|
const out = [];
|
|
11861
12130
|
for (let i = 0; i < paths.length; i++) {
|
|
11862
|
-
const txt = (await
|
|
12131
|
+
const txt = (await readFile2(paths[i], "utf-8")).trim();
|
|
11863
12132
|
if (!txt) continue;
|
|
11864
12133
|
out.push(txt);
|
|
11865
12134
|
}
|
|
@@ -11975,7 +12244,7 @@ async function ocrImageViaNim(input) {
|
|
|
11975
12244
|
throw new UnifiedOcrError("OCR_FAILED", "ocr", `OCR \uC7AC\uC2DC\uB3C4 \uCD08\uACFC: ${lastErr}`);
|
|
11976
12245
|
}
|
|
11977
12246
|
async function encodeBase64(path) {
|
|
11978
|
-
const b = await
|
|
12247
|
+
const b = await readFile2(path);
|
|
11979
12248
|
return b.toString("base64");
|
|
11980
12249
|
}
|
|
11981
12250
|
function stripCodeFence3(text) {
|
|
@@ -12014,7 +12283,7 @@ async function parse2(input, options) {
|
|
|
12014
12283
|
let buffer;
|
|
12015
12284
|
if (typeof input === "string") {
|
|
12016
12285
|
try {
|
|
12017
|
-
const buf = await
|
|
12286
|
+
const buf = await readFile3(input);
|
|
12018
12287
|
buffer = toArrayBuffer(buf);
|
|
12019
12288
|
} catch (err) {
|
|
12020
12289
|
const msg = err instanceof Error && "code" in err && err.code === "ENOENT" ? `\uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4: ${input}` : `\uD30C\uC77C \uC77D\uAE30 \uC2E4\uD328: ${input}`;
|
|
@@ -12173,6 +12442,9 @@ export {
|
|
|
12173
12442
|
VERSION,
|
|
12174
12443
|
blocksToMarkdown,
|
|
12175
12444
|
compare,
|
|
12445
|
+
convertHwpToPdf,
|
|
12446
|
+
convertHwpxToPdf,
|
|
12447
|
+
convertToPdf,
|
|
12176
12448
|
detectFormat,
|
|
12177
12449
|
detectZipFormat,
|
|
12178
12450
|
diffBlocks,
|