@clazic/kordoc 2.6.1 → 2.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-BZPZXI66.js → chunk-3FTA6V7S.js} +68 -24
- package/dist/chunk-3FTA6V7S.js.map +1 -0
- package/dist/{chunk-4X5JCZFZ.js → chunk-USE7IDLV.js} +2 -2
- package/dist/cli.js +6 -6
- package/dist/index.cjs +185 -159
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +8 -2
- package/dist/index.d.ts +8 -2
- package/dist/index.js +186 -160
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +2 -2
- package/dist/{utils-56QT5C33.js → utils-XYBJBWM2.js} +2 -2
- package/dist/{watch-HRNMJWSE.js → watch-CJRS6OYE.js} +3 -3
- package/package.json +1 -1
- package/dist/chunk-BZPZXI66.js.map +0 -1
- /package/dist/{chunk-4X5JCZFZ.js.map → chunk-USE7IDLV.js.map} +0 -0
- /package/dist/{utils-56QT5C33.js.map → utils-XYBJBWM2.js.map} +0 -0
- /package/dist/{watch-HRNMJWSE.js.map → watch-CJRS6OYE.js.map} +0 -0
package/dist/index.d.cts
CHANGED
|
@@ -549,12 +549,14 @@ interface UnifiedOcrProgressEvent {
|
|
|
549
549
|
pageNumber?: number;
|
|
550
550
|
workerCount?: number;
|
|
551
551
|
}
|
|
552
|
+
/** unified-ocr에서 onEvent로 전달되는 보조 이벤트 — LibreOffice 설치/검증/에러 */
|
|
553
|
+
type UnifiedOcrAuxEvent = ConvertInstallEvent | ConvertValidateEvent | ConvertErrorEvent;
|
|
552
554
|
interface UnifiedOcrOptions {
|
|
553
555
|
workspaceDir?: string;
|
|
554
556
|
outputPath?: string;
|
|
555
557
|
dpi?: number;
|
|
556
558
|
baseUrl?: string;
|
|
557
|
-
onEvent?: (event: UnifiedOcrProgressEvent) => void;
|
|
559
|
+
onEvent?: (event: UnifiedOcrProgressEvent | UnifiedOcrAuxEvent) => void;
|
|
558
560
|
modelCandidates?: string[];
|
|
559
561
|
modelMaxTokens?: Record<string, number>;
|
|
560
562
|
stageWeights?: Partial<Record<UnifiedStage, number>>;
|
|
@@ -564,6 +566,10 @@ interface UnifiedOcrOptions {
|
|
|
564
566
|
logger?: Logger;
|
|
565
567
|
runId?: string;
|
|
566
568
|
concurrencyPerKey?: number;
|
|
569
|
+
/** LibreOffice 자동 설치 허용 (기본 false) — convert 단계에서 install/validate 이벤트가 onEvent로 전달됨 */
|
|
570
|
+
autoInstallLibreOffice?: boolean;
|
|
571
|
+
/** LibreOffice 바이너리 직접 지정 (선택) — 지정 시 자동 탐색/설치 건너뜀 */
|
|
572
|
+
sofficePath?: string;
|
|
567
573
|
}
|
|
568
574
|
interface UnifiedOcrResult {
|
|
569
575
|
outputPath: string;
|
|
@@ -602,4 +608,4 @@ declare function parseXlsx(buffer: ArrayBuffer, options?: ParseOptions, zip?: JS
|
|
|
602
608
|
/** DOCX 파일을 Markdown으로 변환 */
|
|
603
609
|
declare function parseDocx(buffer: ArrayBuffer, options?: ParseOptions, zip?: JSZip): Promise<ParseResult>;
|
|
604
610
|
|
|
605
|
-
export { type AcquireResult, AllKeysCoolingDownError, type ApiKeyRotationOptions, ApiKeyRotationPool, type BlockDiff, type BoundingBox, type CellContext, type CellDiff, type ConvertToPdfFailure, type ConvertToPdfOptions, type ConvertToPdfResult, type ConvertToPdfSuccess, type DiffChangeType, type DiffResult, type DocumentMetadata, type ErrorCode, type ExtractedImage, type FileType, type FormField, type FormResult, type IRBlock, type IRBlockType, type IRCell, type IRTable, type ImageData, type InlineStyle, type KeyHealthSnapshot, type MarkdownToXlsxOptions, type OcrMode, type OcrProvider, type OutlineItem, type ParseFailure, type ParseOptions, type ParseResult, type ParseSuccess, type ParseWarning, type StructuredOcrResult, type UnifiedOcrOptions, type UnifiedOcrProgressEvent, type UnifiedOcrResult, type UnifiedStage, VERSION, type WarningCode, type WatchOptions, blocksToMarkdown, compare, convertHwpToPdf, convertHwpxToPdf, convertToPdf, detectFormat, detectZipFormat, diffBlocks, extractFormFields, isHwpxFile, isOldHwpFile, isPdfFile, isZipFile, markdownToHwpx, markdownToXlsx, parse, parseDocx, parseHwp, parseHwpx, parsePdf, parseXlsx, runUnifiedOcrPipeline };
|
|
611
|
+
export { type AcquireResult, AllKeysCoolingDownError, type ApiKeyRotationOptions, ApiKeyRotationPool, type BlockDiff, type BoundingBox, type CellContext, type CellDiff, type ConvertErrorEvent, type ConvertInstallEvent, type ConvertToPdfFailure, type ConvertToPdfOptions, type ConvertToPdfResult, type ConvertToPdfSuccess, type ConvertValidateEvent, type DiffChangeType, type DiffResult, type DocumentMetadata, type ErrorCode, type ExtractedImage, type FileType, type FormField, type FormResult, type IRBlock, type IRBlockType, type IRCell, type IRTable, type ImageData, type InlineStyle, type KeyHealthSnapshot, type MarkdownToXlsxOptions, type OcrMode, type OcrProvider, type OutlineItem, type ParseFailure, type ParseOptions, type ParseResult, type ParseSuccess, type ParseWarning, type StructuredOcrResult, type UnifiedOcrAuxEvent, type UnifiedOcrOptions, type UnifiedOcrProgressEvent, type UnifiedOcrResult, type UnifiedStage, VERSION, type WarningCode, type WatchOptions, blocksToMarkdown, compare, convertHwpToPdf, convertHwpxToPdf, convertToPdf, detectFormat, detectZipFormat, diffBlocks, extractFormFields, isHwpxFile, isOldHwpFile, isPdfFile, isZipFile, markdownToHwpx, markdownToXlsx, parse, parseDocx, parseHwp, parseHwpx, parsePdf, parseXlsx, runUnifiedOcrPipeline };
|
package/dist/index.d.ts
CHANGED
|
@@ -549,12 +549,14 @@ interface UnifiedOcrProgressEvent {
|
|
|
549
549
|
pageNumber?: number;
|
|
550
550
|
workerCount?: number;
|
|
551
551
|
}
|
|
552
|
+
/** unified-ocr에서 onEvent로 전달되는 보조 이벤트 — LibreOffice 설치/검증/에러 */
|
|
553
|
+
type UnifiedOcrAuxEvent = ConvertInstallEvent | ConvertValidateEvent | ConvertErrorEvent;
|
|
552
554
|
interface UnifiedOcrOptions {
|
|
553
555
|
workspaceDir?: string;
|
|
554
556
|
outputPath?: string;
|
|
555
557
|
dpi?: number;
|
|
556
558
|
baseUrl?: string;
|
|
557
|
-
onEvent?: (event: UnifiedOcrProgressEvent) => void;
|
|
559
|
+
onEvent?: (event: UnifiedOcrProgressEvent | UnifiedOcrAuxEvent) => void;
|
|
558
560
|
modelCandidates?: string[];
|
|
559
561
|
modelMaxTokens?: Record<string, number>;
|
|
560
562
|
stageWeights?: Partial<Record<UnifiedStage, number>>;
|
|
@@ -564,6 +566,10 @@ interface UnifiedOcrOptions {
|
|
|
564
566
|
logger?: Logger;
|
|
565
567
|
runId?: string;
|
|
566
568
|
concurrencyPerKey?: number;
|
|
569
|
+
/** LibreOffice 자동 설치 허용 (기본 false) — convert 단계에서 install/validate 이벤트가 onEvent로 전달됨 */
|
|
570
|
+
autoInstallLibreOffice?: boolean;
|
|
571
|
+
/** LibreOffice 바이너리 직접 지정 (선택) — 지정 시 자동 탐색/설치 건너뜀 */
|
|
572
|
+
sofficePath?: string;
|
|
567
573
|
}
|
|
568
574
|
interface UnifiedOcrResult {
|
|
569
575
|
outputPath: string;
|
|
@@ -602,4 +608,4 @@ declare function parseXlsx(buffer: ArrayBuffer, options?: ParseOptions, zip?: JS
|
|
|
602
608
|
/** DOCX 파일을 Markdown으로 변환 */
|
|
603
609
|
declare function parseDocx(buffer: ArrayBuffer, options?: ParseOptions, zip?: JSZip): Promise<ParseResult>;
|
|
604
610
|
|
|
605
|
-
export { type AcquireResult, AllKeysCoolingDownError, type ApiKeyRotationOptions, ApiKeyRotationPool, type BlockDiff, type BoundingBox, type CellContext, type CellDiff, type ConvertToPdfFailure, type ConvertToPdfOptions, type ConvertToPdfResult, type ConvertToPdfSuccess, type DiffChangeType, type DiffResult, type DocumentMetadata, type ErrorCode, type ExtractedImage, type FileType, type FormField, type FormResult, type IRBlock, type IRBlockType, type IRCell, type IRTable, type ImageData, type InlineStyle, type KeyHealthSnapshot, type MarkdownToXlsxOptions, type OcrMode, type OcrProvider, type OutlineItem, type ParseFailure, type ParseOptions, type ParseResult, type ParseSuccess, type ParseWarning, type StructuredOcrResult, type UnifiedOcrOptions, type UnifiedOcrProgressEvent, type UnifiedOcrResult, type UnifiedStage, VERSION, type WarningCode, type WatchOptions, blocksToMarkdown, compare, convertHwpToPdf, convertHwpxToPdf, convertToPdf, detectFormat, detectZipFormat, diffBlocks, extractFormFields, isHwpxFile, isOldHwpFile, isPdfFile, isZipFile, markdownToHwpx, markdownToXlsx, parse, parseDocx, parseHwp, parseHwpx, parsePdf, parseXlsx, runUnifiedOcrPipeline };
|
|
611
|
+
export { type AcquireResult, AllKeysCoolingDownError, type ApiKeyRotationOptions, ApiKeyRotationPool, type BlockDiff, type BoundingBox, type CellContext, type CellDiff, type ConvertErrorEvent, type ConvertInstallEvent, type ConvertToPdfFailure, type ConvertToPdfOptions, type ConvertToPdfResult, type ConvertToPdfSuccess, type ConvertValidateEvent, type DiffChangeType, type DiffResult, type DocumentMetadata, type ErrorCode, type ExtractedImage, type FileType, type FormField, type FormResult, type IRBlock, type IRBlockType, type IRCell, type IRTable, type ImageData, type InlineStyle, type KeyHealthSnapshot, type MarkdownToXlsxOptions, type OcrMode, type OcrProvider, type OutlineItem, type ParseFailure, type ParseOptions, type ParseResult, type ParseSuccess, type ParseWarning, type StructuredOcrResult, type UnifiedOcrAuxEvent, type UnifiedOcrOptions, type UnifiedOcrProgressEvent, type UnifiedOcrResult, type UnifiedStage, VERSION, type WarningCode, type WatchOptions, blocksToMarkdown, compare, convertHwpToPdf, convertHwpxToPdf, convertToPdf, detectFormat, detectZipFormat, diffBlocks, extractFormFields, isHwpxFile, isOldHwpFile, isPdfFile, isZipFile, markdownToHwpx, markdownToXlsx, parse, parseDocx, parseHwp, parseHwpx, parsePdf, parseXlsx, runUnifiedOcrPipeline };
|
package/dist/index.js
CHANGED
|
@@ -37,118 +37,6 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
|
|
|
37
37
|
mod
|
|
38
38
|
));
|
|
39
39
|
|
|
40
|
-
// src/utils.ts
|
|
41
|
-
var utils_exports = {};
|
|
42
|
-
__export(utils_exports, {
|
|
43
|
-
KordocError: () => KordocError,
|
|
44
|
-
VERSION: () => VERSION,
|
|
45
|
-
classifyError: () => classifyError,
|
|
46
|
-
isPathTraversal: () => isPathTraversal,
|
|
47
|
-
normalizeKordocError: () => normalizeKordocError,
|
|
48
|
-
precheckZipSize: () => precheckZipSize,
|
|
49
|
-
sanitizeError: () => sanitizeError,
|
|
50
|
-
sanitizeHref: () => sanitizeHref,
|
|
51
|
-
toArrayBuffer: () => toArrayBuffer
|
|
52
|
-
});
|
|
53
|
-
function toArrayBuffer(buf) {
|
|
54
|
-
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
55
|
-
return buf.buffer;
|
|
56
|
-
}
|
|
57
|
-
return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
|
|
58
|
-
}
|
|
59
|
-
function sanitizeError(err) {
|
|
60
|
-
if (err instanceof KordocError) return err.message;
|
|
61
|
-
return "\uBB38\uC11C \uCC98\uB9AC \uC911 \uC624\uB958\uAC00 \uBC1C\uC0DD\uD588\uC2B5\uB2C8\uB2E4";
|
|
62
|
-
}
|
|
63
|
-
function isPathTraversal(name) {
|
|
64
|
-
if (name.includes("\0")) return true;
|
|
65
|
-
const normalized = name.replace(/\\/g, "/");
|
|
66
|
-
return normalized.includes("..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
|
|
67
|
-
}
|
|
68
|
-
function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
|
|
69
|
-
try {
|
|
70
|
-
const data = new DataView(buffer);
|
|
71
|
-
const len = buffer.byteLength;
|
|
72
|
-
let eocdOffset = -1;
|
|
73
|
-
for (let i = len - 22; i >= Math.max(0, len - 65557); i--) {
|
|
74
|
-
if (data.getUint32(i, true) === 101010256) {
|
|
75
|
-
eocdOffset = i;
|
|
76
|
-
break;
|
|
77
|
-
}
|
|
78
|
-
}
|
|
79
|
-
if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 };
|
|
80
|
-
const entryCount = data.getUint16(eocdOffset + 10, true);
|
|
81
|
-
if (entryCount > maxEntries) {
|
|
82
|
-
throw new KordocError(`ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC: ${entryCount} (\uCD5C\uB300 ${maxEntries})`);
|
|
83
|
-
}
|
|
84
|
-
const cdSize = data.getUint32(eocdOffset + 12, true);
|
|
85
|
-
const cdOffset = data.getUint32(eocdOffset + 16, true);
|
|
86
|
-
if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount };
|
|
87
|
-
let totalUncompressed = 0;
|
|
88
|
-
let pos = cdOffset;
|
|
89
|
-
for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {
|
|
90
|
-
if (data.getUint32(pos, true) !== 33639248) break;
|
|
91
|
-
totalUncompressed += data.getUint32(pos + 24, true);
|
|
92
|
-
const nameLen = data.getUint16(pos + 28, true);
|
|
93
|
-
const extraLen = data.getUint16(pos + 30, true);
|
|
94
|
-
const commentLen = data.getUint16(pos + 32, true);
|
|
95
|
-
pos += 46 + nameLen + extraLen + commentLen;
|
|
96
|
-
}
|
|
97
|
-
if (totalUncompressed > maxUncompressedSize) {
|
|
98
|
-
throw new KordocError(`ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 ${maxUncompressedSize / 1024 / 1024}MB)`);
|
|
99
|
-
}
|
|
100
|
-
return { totalUncompressed, entryCount };
|
|
101
|
-
} catch (err) {
|
|
102
|
-
if (err instanceof KordocError) throw err;
|
|
103
|
-
return { totalUncompressed: 0, entryCount: 0 };
|
|
104
|
-
}
|
|
105
|
-
}
|
|
106
|
-
function sanitizeHref(href) {
|
|
107
|
-
const trimmed = href.trim();
|
|
108
|
-
if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
|
|
109
|
-
return trimmed;
|
|
110
|
-
}
|
|
111
|
-
function classifyError(err) {
|
|
112
|
-
if (!(err instanceof Error)) return "PARSE_ERROR";
|
|
113
|
-
const msg = err.message;
|
|
114
|
-
if (msg.includes("\uC554\uD638\uD654")) return "ENCRYPTED";
|
|
115
|
-
if (msg.includes("DRM")) return "DRM_PROTECTED";
|
|
116
|
-
if (msg.includes("ZIP bomb") || msg.includes("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || msg.includes("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")) return "ZIP_BOMB";
|
|
117
|
-
if (msg.includes("bomb") || msg.includes("\uD06C\uAE30 \uCD08\uACFC") || msg.includes("\uC555\uCD95 \uD574\uC81C")) return "DECOMPRESSION_BOMB";
|
|
118
|
-
if (msg.includes("\uC774\uBBF8\uC9C0 \uAE30\uBC18")) return "IMAGE_BASED_PDF";
|
|
119
|
-
if (msg.includes("\uC139\uC158") && (msg.includes("\uCC3E\uC744 \uC218 \uC5C6") || msg.includes("\uC5C6\uC74C"))) return "NO_SECTIONS";
|
|
120
|
-
if (msg.includes("\uC2DC\uADF8\uB2C8\uCC98") || msg.includes("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")) return "CORRUPTED";
|
|
121
|
-
return "PARSE_ERROR";
|
|
122
|
-
}
|
|
123
|
-
function normalizeKordocError(err, fallbackMessage, stage = "unknown", fallbackCode = "PARSE_ERROR") {
|
|
124
|
-
if (err instanceof KordocError) {
|
|
125
|
-
if (!err.stage) err.stage = stage;
|
|
126
|
-
if (!err.code) err.code = fallbackCode;
|
|
127
|
-
return err;
|
|
128
|
-
}
|
|
129
|
-
const message = err instanceof Error ? err.message : fallbackMessage;
|
|
130
|
-
const code = err instanceof Error ? classifyError(err) : fallbackCode;
|
|
131
|
-
return new KordocError(message || fallbackMessage, { code, stage });
|
|
132
|
-
}
|
|
133
|
-
var VERSION, KordocError, SAFE_HREF_RE;
|
|
134
|
-
var init_utils = __esm({
|
|
135
|
-
"src/utils.ts"() {
|
|
136
|
-
"use strict";
|
|
137
|
-
VERSION = true ? "2.6.0" : "0.0.0-dev";
|
|
138
|
-
KordocError = class extends Error {
|
|
139
|
-
code;
|
|
140
|
-
stage;
|
|
141
|
-
constructor(message, opts = {}) {
|
|
142
|
-
super(message);
|
|
143
|
-
this.name = "KordocError";
|
|
144
|
-
this.code = opts.code;
|
|
145
|
-
this.stage = opts.stage;
|
|
146
|
-
}
|
|
147
|
-
};
|
|
148
|
-
SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
|
|
149
|
-
}
|
|
150
|
-
});
|
|
151
|
-
|
|
152
40
|
// src/page-range.ts
|
|
153
41
|
var page_range_exports = {};
|
|
154
42
|
__export(page_range_exports, {
|
|
@@ -3223,8 +3111,97 @@ async function detectZipFormat(buffer) {
|
|
|
3223
3111
|
import JSZip2 from "jszip";
|
|
3224
3112
|
import { DOMParser } from "@xmldom/xmldom";
|
|
3225
3113
|
|
|
3114
|
+
// src/utils.ts
|
|
3115
|
+
var VERSION = true ? "2.6.1" : "0.0.0-dev";
|
|
3116
|
+
function toArrayBuffer(buf) {
|
|
3117
|
+
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
3118
|
+
return buf.buffer;
|
|
3119
|
+
}
|
|
3120
|
+
return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
|
|
3121
|
+
}
|
|
3122
|
+
var KordocError = class extends Error {
|
|
3123
|
+
code;
|
|
3124
|
+
stage;
|
|
3125
|
+
constructor(message, opts = {}) {
|
|
3126
|
+
super(message);
|
|
3127
|
+
this.name = "KordocError";
|
|
3128
|
+
this.code = opts.code;
|
|
3129
|
+
this.stage = opts.stage;
|
|
3130
|
+
}
|
|
3131
|
+
};
|
|
3132
|
+
function isPathTraversal(name) {
|
|
3133
|
+
if (name.includes("\0")) return true;
|
|
3134
|
+
const normalized = name.replace(/\\/g, "/");
|
|
3135
|
+
return normalized.includes("..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
|
|
3136
|
+
}
|
|
3137
|
+
function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
|
|
3138
|
+
try {
|
|
3139
|
+
const data = new DataView(buffer);
|
|
3140
|
+
const len = buffer.byteLength;
|
|
3141
|
+
let eocdOffset = -1;
|
|
3142
|
+
for (let i = len - 22; i >= Math.max(0, len - 65557); i--) {
|
|
3143
|
+
if (data.getUint32(i, true) === 101010256) {
|
|
3144
|
+
eocdOffset = i;
|
|
3145
|
+
break;
|
|
3146
|
+
}
|
|
3147
|
+
}
|
|
3148
|
+
if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 };
|
|
3149
|
+
const entryCount = data.getUint16(eocdOffset + 10, true);
|
|
3150
|
+
if (entryCount > maxEntries) {
|
|
3151
|
+
throw new KordocError(`ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC: ${entryCount} (\uCD5C\uB300 ${maxEntries})`);
|
|
3152
|
+
}
|
|
3153
|
+
const cdSize = data.getUint32(eocdOffset + 12, true);
|
|
3154
|
+
const cdOffset = data.getUint32(eocdOffset + 16, true);
|
|
3155
|
+
if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount };
|
|
3156
|
+
let totalUncompressed = 0;
|
|
3157
|
+
let pos = cdOffset;
|
|
3158
|
+
for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {
|
|
3159
|
+
if (data.getUint32(pos, true) !== 33639248) break;
|
|
3160
|
+
totalUncompressed += data.getUint32(pos + 24, true);
|
|
3161
|
+
const nameLen = data.getUint16(pos + 28, true);
|
|
3162
|
+
const extraLen = data.getUint16(pos + 30, true);
|
|
3163
|
+
const commentLen = data.getUint16(pos + 32, true);
|
|
3164
|
+
pos += 46 + nameLen + extraLen + commentLen;
|
|
3165
|
+
}
|
|
3166
|
+
if (totalUncompressed > maxUncompressedSize) {
|
|
3167
|
+
throw new KordocError(`ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 ${maxUncompressedSize / 1024 / 1024}MB)`);
|
|
3168
|
+
}
|
|
3169
|
+
return { totalUncompressed, entryCount };
|
|
3170
|
+
} catch (err) {
|
|
3171
|
+
if (err instanceof KordocError) throw err;
|
|
3172
|
+
return { totalUncompressed: 0, entryCount: 0 };
|
|
3173
|
+
}
|
|
3174
|
+
}
|
|
3175
|
+
var SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
|
|
3176
|
+
function sanitizeHref(href) {
|
|
3177
|
+
const trimmed = href.trim();
|
|
3178
|
+
if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
|
|
3179
|
+
return trimmed;
|
|
3180
|
+
}
|
|
3181
|
+
function classifyError(err) {
|
|
3182
|
+
if (!(err instanceof Error)) return "PARSE_ERROR";
|
|
3183
|
+
const msg = err.message;
|
|
3184
|
+
if (msg.includes("\uC554\uD638\uD654")) return "ENCRYPTED";
|
|
3185
|
+
if (msg.includes("DRM")) return "DRM_PROTECTED";
|
|
3186
|
+
if (msg.includes("ZIP bomb") || msg.includes("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || msg.includes("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")) return "ZIP_BOMB";
|
|
3187
|
+
if (msg.includes("bomb") || msg.includes("\uD06C\uAE30 \uCD08\uACFC") || msg.includes("\uC555\uCD95 \uD574\uC81C")) return "DECOMPRESSION_BOMB";
|
|
3188
|
+
if (msg.includes("\uC774\uBBF8\uC9C0 \uAE30\uBC18")) return "IMAGE_BASED_PDF";
|
|
3189
|
+
if (msg.includes("\uC139\uC158") && (msg.includes("\uCC3E\uC744 \uC218 \uC5C6") || msg.includes("\uC5C6\uC74C"))) return "NO_SECTIONS";
|
|
3190
|
+
if (msg.includes("\uC2DC\uADF8\uB2C8\uCC98") || msg.includes("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")) return "CORRUPTED";
|
|
3191
|
+
return "PARSE_ERROR";
|
|
3192
|
+
}
|
|
3193
|
+
function normalizeKordocError(err, fallbackMessage, stage = "unknown", fallbackCode = "PARSE_ERROR") {
|
|
3194
|
+
if (err instanceof KordocError) {
|
|
3195
|
+
if (!err.stage) err.stage = stage;
|
|
3196
|
+
if (!err.code) err.code = fallbackCode;
|
|
3197
|
+
return err;
|
|
3198
|
+
}
|
|
3199
|
+
const message = err instanceof Error ? err.message : fallbackMessage;
|
|
3200
|
+
const code = err instanceof Error ? classifyError(err) : fallbackCode;
|
|
3201
|
+
return new KordocError(message || fallbackMessage, { code, stage });
|
|
3202
|
+
}
|
|
3203
|
+
|
|
3226
3204
|
// src/table/builder.ts
|
|
3227
|
-
init_utils();
|
|
3228
3205
|
var MAX_COLS = 200;
|
|
3229
3206
|
var MAX_ROWS = 1e4;
|
|
3230
3207
|
function buildTable(rows) {
|
|
@@ -3484,8 +3461,6 @@ var HEADING_RATIO_H2 = 1.3;
|
|
|
3484
3461
|
var HEADING_RATIO_H3 = 1.15;
|
|
3485
3462
|
|
|
3486
3463
|
// src/hwpx/parser.ts
|
|
3487
|
-
init_utils();
|
|
3488
|
-
init_utils();
|
|
3489
3464
|
init_page_range();
|
|
3490
3465
|
init_logger();
|
|
3491
3466
|
var MAX_DECOMPRESS_SIZE = 500 * 1024 * 1024;
|
|
@@ -4327,7 +4302,6 @@ function extractTextFromNode(node) {
|
|
|
4327
4302
|
}
|
|
4328
4303
|
|
|
4329
4304
|
// src/hwp5/record.ts
|
|
4330
|
-
init_utils();
|
|
4331
4305
|
import { inflateRawSync, inflateSync } from "zlib";
|
|
4332
4306
|
var TAG_PARA_HEADER = 66;
|
|
4333
4307
|
var TAG_PARA_TEXT = 67;
|
|
@@ -5378,7 +5352,6 @@ function parseLenientCfb(data) {
|
|
|
5378
5352
|
}
|
|
5379
5353
|
|
|
5380
5354
|
// src/hwp5/parser.ts
|
|
5381
|
-
init_utils();
|
|
5382
5355
|
init_page_range();
|
|
5383
5356
|
init_logger();
|
|
5384
5357
|
var CFB = __toESM(require_cfb(), 1);
|
|
@@ -6034,7 +6007,6 @@ function arrangeCells(rows, cols, cells) {
|
|
|
6034
6007
|
}
|
|
6035
6008
|
|
|
6036
6009
|
// src/pdf/parser.ts
|
|
6037
|
-
init_utils();
|
|
6038
6010
|
init_page_range();
|
|
6039
6011
|
import { createRequire } from "module";
|
|
6040
6012
|
import { dirname as dirname2, join as join3, resolve as resolve2 } from "path";
|
|
@@ -7926,7 +7898,6 @@ function mergeKoreanLines(text) {
|
|
|
7926
7898
|
}
|
|
7927
7899
|
|
|
7928
7900
|
// src/xlsx/parser.ts
|
|
7929
|
-
init_utils();
|
|
7930
7901
|
import JSZip3 from "jszip";
|
|
7931
7902
|
import { DOMParser as DOMParser2 } from "@xmldom/xmldom";
|
|
7932
7903
|
init_logger();
|
|
@@ -8255,7 +8226,6 @@ async function parseXlsxDocument(buffer, options, existingZip) {
|
|
|
8255
8226
|
}
|
|
8256
8227
|
|
|
8257
8228
|
// src/docx/parser.ts
|
|
8258
|
-
init_utils();
|
|
8259
8229
|
import JSZip4 from "jszip";
|
|
8260
8230
|
import { DOMParser as DOMParser3 } from "@xmldom/xmldom";
|
|
8261
8231
|
init_logger();
|
|
@@ -8737,7 +8707,6 @@ async function parseDocxDocument(buffer, options, existingZip) {
|
|
|
8737
8707
|
}
|
|
8738
8708
|
|
|
8739
8709
|
// src/index.ts
|
|
8740
|
-
init_utils();
|
|
8741
8710
|
init_cli_provider();
|
|
8742
8711
|
init_markdown_to_blocks();
|
|
8743
8712
|
init_logger();
|
|
@@ -11241,7 +11210,6 @@ async function markdownToXlsx(markdown, options) {
|
|
|
11241
11210
|
|
|
11242
11211
|
// src/convert/index.ts
|
|
11243
11212
|
import { readFile } from "fs/promises";
|
|
11244
|
-
init_utils();
|
|
11245
11213
|
|
|
11246
11214
|
// src/convert/libreoffice.ts
|
|
11247
11215
|
import libre from "libreoffice-convert";
|
|
@@ -11261,6 +11229,7 @@ import { join as join4, delimiter } from "path";
|
|
|
11261
11229
|
import { mkdir, access, symlink, rm } from "fs/promises";
|
|
11262
11230
|
import { createWriteStream } from "fs";
|
|
11263
11231
|
import { spawn as spawn2 } from "child_process";
|
|
11232
|
+
var installInFlight = null;
|
|
11264
11233
|
var CACHE_DIR = join4(homedir(), ".cache", "kordoc", "libreoffice");
|
|
11265
11234
|
var VERSION_FILE = join4(CACHE_DIR, "version");
|
|
11266
11235
|
var PACKAGES = {
|
|
@@ -11281,13 +11250,11 @@ var PACKAGES = {
|
|
|
11281
11250
|
}
|
|
11282
11251
|
};
|
|
11283
11252
|
async function findInPath() {
|
|
11284
|
-
|
|
11285
|
-
const
|
|
11286
|
-
|
|
11287
|
-
|
|
11288
|
-
}
|
|
11289
|
-
return null;
|
|
11290
|
-
}
|
|
11253
|
+
return new Promise((resolve4) => {
|
|
11254
|
+
const child = spawn2("soffice", ["--version"], { stdio: "ignore" });
|
|
11255
|
+
child.on("close", (code) => resolve4(code === 0 ? "soffice" : null));
|
|
11256
|
+
child.on("error", () => resolve4(null));
|
|
11257
|
+
});
|
|
11291
11258
|
}
|
|
11292
11259
|
async function findInCache() {
|
|
11293
11260
|
const cachedBin = join4(CACHE_DIR, "bin", "soffice");
|
|
@@ -11298,6 +11265,38 @@ async function findInCache() {
|
|
|
11298
11265
|
return null;
|
|
11299
11266
|
}
|
|
11300
11267
|
}
|
|
11268
|
+
async function findInDefaultPaths() {
|
|
11269
|
+
const platform = process.platform;
|
|
11270
|
+
const paths = [];
|
|
11271
|
+
if (platform === "darwin") {
|
|
11272
|
+
paths.push(
|
|
11273
|
+
"/Applications/LibreOffice.app/Contents/MacOS/soffice",
|
|
11274
|
+
"/opt/homebrew/bin/soffice",
|
|
11275
|
+
"/usr/local/bin/soffice"
|
|
11276
|
+
);
|
|
11277
|
+
} else if (platform === "linux") {
|
|
11278
|
+
paths.push(
|
|
11279
|
+
"/usr/bin/soffice",
|
|
11280
|
+
"/usr/lib/libreoffice/program/soffice"
|
|
11281
|
+
);
|
|
11282
|
+
} else if (platform === "win32") {
|
|
11283
|
+
const pf = process.env["ProgramFiles"] ?? "C:\\Program Files";
|
|
11284
|
+
const pf86 = process.env["ProgramFiles(x86)"] ?? "C:\\Program Files (x86)";
|
|
11285
|
+
paths.push(
|
|
11286
|
+
join4(pf, "LibreOffice", "program", "soffice.exe"),
|
|
11287
|
+
join4(pf86, "LibreOffice", "program", "soffice.exe")
|
|
11288
|
+
);
|
|
11289
|
+
}
|
|
11290
|
+
for (const p of paths) {
|
|
11291
|
+
try {
|
|
11292
|
+
await access(p);
|
|
11293
|
+
return p;
|
|
11294
|
+
} catch {
|
|
11295
|
+
continue;
|
|
11296
|
+
}
|
|
11297
|
+
}
|
|
11298
|
+
return null;
|
|
11299
|
+
}
|
|
11301
11300
|
async function downloadWithProgress(url, dest, totalBytes, onProgress) {
|
|
11302
11301
|
const response = await fetch(url);
|
|
11303
11302
|
if (!response.body) throw new Error("\uB2E4\uC6B4\uB85C\uB4DC \uC2E4\uD328: response body \uC5C6\uC74C");
|
|
@@ -11425,6 +11424,11 @@ async function resolveSoffice(emitter, autoInstall = true) {
|
|
|
11425
11424
|
emitter.validate("soffice_found", "\uCE90\uC2DC\uB41C LibreOffice \uBC1C\uACAC", { sofficePath: inCache });
|
|
11426
11425
|
return inCache;
|
|
11427
11426
|
}
|
|
11427
|
+
const inDefault = await findInDefaultPaths();
|
|
11428
|
+
if (inDefault) {
|
|
11429
|
+
emitter.validate("soffice_found", "\uAE30\uBCF8 \uACBD\uB85C\uC5D0\uC11C LibreOffice \uBC1C\uACAC", { sofficePath: inDefault });
|
|
11430
|
+
return inDefault;
|
|
11431
|
+
}
|
|
11428
11432
|
if (!autoInstall) {
|
|
11429
11433
|
emitter.error(
|
|
11430
11434
|
"validate",
|
|
@@ -11434,38 +11438,35 @@ async function resolveSoffice(emitter, autoInstall = true) {
|
|
|
11434
11438
|
);
|
|
11435
11439
|
throw new ConvertError("SOFFICE_NOT_FOUND", "LibreOffice\uAC00 \uC124\uCE58\uB418\uC9C0 \uC54A\uC558\uC2B5\uB2C8\uB2E4");
|
|
11436
11440
|
}
|
|
11441
|
+
if (installInFlight) {
|
|
11442
|
+
return installInFlight;
|
|
11443
|
+
}
|
|
11437
11444
|
emitter.install("install_start", "LibreOffice \uC790\uB3D9 \uC124\uCE58\uB97C \uC2DC\uC791\uD569\uB2C8\uB2E4...");
|
|
11438
|
-
|
|
11439
|
-
|
|
11440
|
-
const
|
|
11441
|
-
|
|
11442
|
-
percent
|
|
11443
|
-
|
|
11444
|
-
|
|
11445
|
+
installInFlight = (async () => {
|
|
11446
|
+
try {
|
|
11447
|
+
const installed = await installLibreOffice((downloaded, total) => {
|
|
11448
|
+
const percent = Math.round(downloaded / total * 100);
|
|
11449
|
+
emitter.install("download_progress", `\uB2E4\uC6B4\uB85C\uB4DC \uC911... ${percent}%`, {
|
|
11450
|
+
percent,
|
|
11451
|
+
downloadedBytes: downloaded,
|
|
11452
|
+
totalBytes: total
|
|
11453
|
+
});
|
|
11445
11454
|
});
|
|
11446
|
-
|
|
11447
|
-
|
|
11448
|
-
|
|
11449
|
-
|
|
11450
|
-
|
|
11451
|
-
|
|
11452
|
-
|
|
11453
|
-
|
|
11455
|
+
emitter.install("install_complete", "\uC124\uCE58 \uC644\uB8CC", { installedPath: installed });
|
|
11456
|
+
return installed;
|
|
11457
|
+
} catch (err) {
|
|
11458
|
+
const errorMsg = err instanceof Error ? err.message : String(err);
|
|
11459
|
+
emitter.install("install_failed", "\uC124\uCE58 \uC2E4\uD328", { error: errorMsg });
|
|
11460
|
+
throw err;
|
|
11461
|
+
} finally {
|
|
11462
|
+
installInFlight = null;
|
|
11463
|
+
}
|
|
11464
|
+
})();
|
|
11465
|
+
return installInFlight;
|
|
11454
11466
|
}
|
|
11455
11467
|
|
|
11456
11468
|
// src/convert/libreoffice.ts
|
|
11457
11469
|
var libreConvert = libre.convert;
|
|
11458
|
-
async function assertSofficeAvailable() {
|
|
11459
|
-
const { runCommand: runCommand2 } = await Promise.resolve().then(() => (init_utils(), utils_exports));
|
|
11460
|
-
try {
|
|
11461
|
-
await runCommand2("soffice", ["--version"]);
|
|
11462
|
-
} catch {
|
|
11463
|
-
throw new ConvertError(
|
|
11464
|
-
"SOFFICE_NOT_FOUND",
|
|
11465
|
-
"soffice\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4. LibreOffice\uB97C \uC124\uCE58\uD574 \uC8FC\uC138\uC694."
|
|
11466
|
-
);
|
|
11467
|
-
}
|
|
11468
|
-
}
|
|
11469
11470
|
async function convertBuffer(buffer, targetExt, timeoutMs = 6e4) {
|
|
11470
11471
|
return new Promise((resolve4, reject) => {
|
|
11471
11472
|
const timer = setTimeout(() => {
|
|
@@ -11711,9 +11712,6 @@ async function convertHwpxToPdf(input, options) {
|
|
|
11711
11712
|
return result;
|
|
11712
11713
|
}
|
|
11713
11714
|
|
|
11714
|
-
// src/index.ts
|
|
11715
|
-
init_utils();
|
|
11716
|
-
|
|
11717
11715
|
// src/ocr/api-key-rotation.ts
|
|
11718
11716
|
var AllKeysCoolingDownError = class extends Error {
|
|
11719
11717
|
waitMs;
|
|
@@ -11809,7 +11807,7 @@ var ApiKeyRotationPool = class _ApiKeyRotationPool {
|
|
|
11809
11807
|
|
|
11810
11808
|
// src/pipeline/unified-ocr.ts
|
|
11811
11809
|
import { mkdir as mkdir2, readdir, readFile as readFile2, stat, writeFile as writeFile2 } from "fs/promises";
|
|
11812
|
-
import { basename as basename2, dirname as dirname3, extname, join as join5, resolve as resolve3 } from "path";
|
|
11810
|
+
import { basename as basename2, delimiter as delimiter2, dirname as dirname3, extname, join as join5, resolve as resolve3 } from "path";
|
|
11813
11811
|
import { spawn as spawn3 } from "child_process";
|
|
11814
11812
|
import { performance } from "perf_hooks";
|
|
11815
11813
|
init_logger();
|
|
@@ -11985,7 +11983,25 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11985
11983
|
markStageStart("convert", "\uBB38\uC11C\uB97C PDF\uB85C \uBCC0\uD658 \uC911");
|
|
11986
11984
|
logStage("info", "convert", "start", "\uBB38\uC11C\uB97C PDF\uB85C \uBCC0\uD658 \uC2DC\uC791", { input: absInput });
|
|
11987
11985
|
if (extname(absInput).toLowerCase() !== ".pdf") {
|
|
11988
|
-
|
|
11986
|
+
const convertEmitter = new ConvertEventEmitter();
|
|
11987
|
+
if (options.onEvent) {
|
|
11988
|
+
convertEmitter.setListener((evt) => {
|
|
11989
|
+
if (evt.type === "install" || evt.type === "validate" || evt.type === "error") {
|
|
11990
|
+
try {
|
|
11991
|
+
;
|
|
11992
|
+
options.onEvent(evt);
|
|
11993
|
+
} catch {
|
|
11994
|
+
}
|
|
11995
|
+
}
|
|
11996
|
+
});
|
|
11997
|
+
}
|
|
11998
|
+
if (options.sofficePath) {
|
|
11999
|
+
const sofficeDir = dirname3(options.sofficePath);
|
|
12000
|
+
process.env.PATH = `${sofficeDir}${delimiter2}${process.env.PATH ?? ""}`;
|
|
12001
|
+
convertEmitter.validate("soffice_found", "\uC9C1\uC811 \uC9C0\uC815\uB41C LibreOffice \uACBD\uB85C \uC0AC\uC6A9", { sofficePath: options.sofficePath });
|
|
12002
|
+
} else {
|
|
12003
|
+
await resolveSoffice(convertEmitter, options.autoInstallLibreOffice ?? false);
|
|
12004
|
+
}
|
|
11989
12005
|
workingPdfPath = join5(workspaceDir, `${stem}.pdf`);
|
|
11990
12006
|
const inputBuffer = await readFile2(absInput);
|
|
11991
12007
|
const out = await convertBuffer(inputBuffer, ".pdf");
|
|
@@ -12558,6 +12574,16 @@ function ensureSupportedInput(path) {
|
|
|
12558
12574
|
}
|
|
12559
12575
|
function normalizePipelineError(err, stage) {
|
|
12560
12576
|
if (err instanceof UnifiedOcrError) return err;
|
|
12577
|
+
if (err instanceof ConvertError) {
|
|
12578
|
+
const codeMap = {
|
|
12579
|
+
SOFFICE_NOT_FOUND: "SOFFICE_NOT_FOUND",
|
|
12580
|
+
CONVERT_FAILED: "CONVERT_FAILED",
|
|
12581
|
+
TIMEOUT: "CONVERT_FAILED",
|
|
12582
|
+
UNSUPPORTED_PLATFORM: "CONVERT_FAILED",
|
|
12583
|
+
UNSUPPORTED_FORMAT: "UNSUPPORTED_INPUT"
|
|
12584
|
+
};
|
|
12585
|
+
return new UnifiedOcrError(codeMap[err.code] ?? "CONVERT_FAILED", stage, err.message);
|
|
12586
|
+
}
|
|
12561
12587
|
const message = err instanceof Error ? err.message : String(err);
|
|
12562
12588
|
const codeByStage = {
|
|
12563
12589
|
convert: "CONVERT_FAILED",
|