@clazic/kordoc 2.6.1 → 2.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-BZPZXI66.js → chunk-FZJLIDFL.js} +74 -26
- package/dist/chunk-FZJLIDFL.js.map +1 -0
- package/dist/{chunk-4X5JCZFZ.js → chunk-YIJCHZLO.js} +2 -2
- package/dist/cli.js +6 -6
- package/dist/index.cjs +191 -161
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +8 -2
- package/dist/index.d.ts +8 -2
- package/dist/index.js +192 -162
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +2 -2
- package/dist/{utils-56QT5C33.js → utils-MAETCW66.js} +2 -2
- package/dist/{watch-HRNMJWSE.js → watch-6HVRALTX.js} +3 -3
- package/package.json +1 -1
- package/dist/chunk-BZPZXI66.js.map +0 -1
- /package/dist/{chunk-4X5JCZFZ.js.map → chunk-YIJCHZLO.js.map} +0 -0
- /package/dist/{utils-56QT5C33.js.map → utils-MAETCW66.js.map} +0 -0
- /package/dist/{watch-HRNMJWSE.js.map → watch-6HVRALTX.js.map} +0 -0
package/dist/index.d.cts
CHANGED
|
@@ -549,12 +549,14 @@ interface UnifiedOcrProgressEvent {
|
|
|
549
549
|
pageNumber?: number;
|
|
550
550
|
workerCount?: number;
|
|
551
551
|
}
|
|
552
|
+
/** unified-ocr에서 onEvent로 전달되는 보조 이벤트 — LibreOffice 설치/검증/에러 */
|
|
553
|
+
type UnifiedOcrAuxEvent = ConvertInstallEvent | ConvertValidateEvent | ConvertErrorEvent;
|
|
552
554
|
interface UnifiedOcrOptions {
|
|
553
555
|
workspaceDir?: string;
|
|
554
556
|
outputPath?: string;
|
|
555
557
|
dpi?: number;
|
|
556
558
|
baseUrl?: string;
|
|
557
|
-
onEvent?: (event: UnifiedOcrProgressEvent) => void;
|
|
559
|
+
onEvent?: (event: UnifiedOcrProgressEvent | UnifiedOcrAuxEvent) => void;
|
|
558
560
|
modelCandidates?: string[];
|
|
559
561
|
modelMaxTokens?: Record<string, number>;
|
|
560
562
|
stageWeights?: Partial<Record<UnifiedStage, number>>;
|
|
@@ -564,6 +566,10 @@ interface UnifiedOcrOptions {
|
|
|
564
566
|
logger?: Logger;
|
|
565
567
|
runId?: string;
|
|
566
568
|
concurrencyPerKey?: number;
|
|
569
|
+
/** LibreOffice 자동 설치 허용 (기본 false) — convert 단계에서 install/validate 이벤트가 onEvent로 전달됨 */
|
|
570
|
+
autoInstallLibreOffice?: boolean;
|
|
571
|
+
/** LibreOffice 바이너리 직접 지정 (선택) — 지정 시 자동 탐색/설치 건너뜀 */
|
|
572
|
+
sofficePath?: string;
|
|
567
573
|
}
|
|
568
574
|
interface UnifiedOcrResult {
|
|
569
575
|
outputPath: string;
|
|
@@ -602,4 +608,4 @@ declare function parseXlsx(buffer: ArrayBuffer, options?: ParseOptions, zip?: JS
|
|
|
602
608
|
/** DOCX 파일을 Markdown으로 변환 */
|
|
603
609
|
declare function parseDocx(buffer: ArrayBuffer, options?: ParseOptions, zip?: JSZip): Promise<ParseResult>;
|
|
604
610
|
|
|
605
|
-
export { type AcquireResult, AllKeysCoolingDownError, type ApiKeyRotationOptions, ApiKeyRotationPool, type BlockDiff, type BoundingBox, type CellContext, type CellDiff, type ConvertToPdfFailure, type ConvertToPdfOptions, type ConvertToPdfResult, type ConvertToPdfSuccess, type DiffChangeType, type DiffResult, type DocumentMetadata, type ErrorCode, type ExtractedImage, type FileType, type FormField, type FormResult, type IRBlock, type IRBlockType, type IRCell, type IRTable, type ImageData, type InlineStyle, type KeyHealthSnapshot, type MarkdownToXlsxOptions, type OcrMode, type OcrProvider, type OutlineItem, type ParseFailure, type ParseOptions, type ParseResult, type ParseSuccess, type ParseWarning, type StructuredOcrResult, type UnifiedOcrOptions, type UnifiedOcrProgressEvent, type UnifiedOcrResult, type UnifiedStage, VERSION, type WarningCode, type WatchOptions, blocksToMarkdown, compare, convertHwpToPdf, convertHwpxToPdf, convertToPdf, detectFormat, detectZipFormat, diffBlocks, extractFormFields, isHwpxFile, isOldHwpFile, isPdfFile, isZipFile, markdownToHwpx, markdownToXlsx, parse, parseDocx, parseHwp, parseHwpx, parsePdf, parseXlsx, runUnifiedOcrPipeline };
|
|
611
|
+
export { type AcquireResult, AllKeysCoolingDownError, type ApiKeyRotationOptions, ApiKeyRotationPool, type BlockDiff, type BoundingBox, type CellContext, type CellDiff, type ConvertErrorEvent, type ConvertInstallEvent, type ConvertToPdfFailure, type ConvertToPdfOptions, type ConvertToPdfResult, type ConvertToPdfSuccess, type ConvertValidateEvent, type DiffChangeType, type DiffResult, type DocumentMetadata, type ErrorCode, type ExtractedImage, type FileType, type FormField, type FormResult, type IRBlock, type IRBlockType, type IRCell, type IRTable, type ImageData, type InlineStyle, type KeyHealthSnapshot, type MarkdownToXlsxOptions, type OcrMode, type OcrProvider, type OutlineItem, type ParseFailure, type ParseOptions, type ParseResult, type ParseSuccess, type ParseWarning, type StructuredOcrResult, type UnifiedOcrAuxEvent, type UnifiedOcrOptions, type UnifiedOcrProgressEvent, type UnifiedOcrResult, type UnifiedStage, VERSION, type WarningCode, type WatchOptions, blocksToMarkdown, compare, convertHwpToPdf, convertHwpxToPdf, convertToPdf, detectFormat, detectZipFormat, diffBlocks, extractFormFields, isHwpxFile, isOldHwpFile, isPdfFile, isZipFile, markdownToHwpx, markdownToXlsx, parse, parseDocx, parseHwp, parseHwpx, parsePdf, parseXlsx, runUnifiedOcrPipeline };
|
package/dist/index.d.ts
CHANGED
|
@@ -549,12 +549,14 @@ interface UnifiedOcrProgressEvent {
|
|
|
549
549
|
pageNumber?: number;
|
|
550
550
|
workerCount?: number;
|
|
551
551
|
}
|
|
552
|
+
/** unified-ocr에서 onEvent로 전달되는 보조 이벤트 — LibreOffice 설치/검증/에러 */
|
|
553
|
+
type UnifiedOcrAuxEvent = ConvertInstallEvent | ConvertValidateEvent | ConvertErrorEvent;
|
|
552
554
|
interface UnifiedOcrOptions {
|
|
553
555
|
workspaceDir?: string;
|
|
554
556
|
outputPath?: string;
|
|
555
557
|
dpi?: number;
|
|
556
558
|
baseUrl?: string;
|
|
557
|
-
onEvent?: (event: UnifiedOcrProgressEvent) => void;
|
|
559
|
+
onEvent?: (event: UnifiedOcrProgressEvent | UnifiedOcrAuxEvent) => void;
|
|
558
560
|
modelCandidates?: string[];
|
|
559
561
|
modelMaxTokens?: Record<string, number>;
|
|
560
562
|
stageWeights?: Partial<Record<UnifiedStage, number>>;
|
|
@@ -564,6 +566,10 @@ interface UnifiedOcrOptions {
|
|
|
564
566
|
logger?: Logger;
|
|
565
567
|
runId?: string;
|
|
566
568
|
concurrencyPerKey?: number;
|
|
569
|
+
/** LibreOffice 자동 설치 허용 (기본 false) — convert 단계에서 install/validate 이벤트가 onEvent로 전달됨 */
|
|
570
|
+
autoInstallLibreOffice?: boolean;
|
|
571
|
+
/** LibreOffice 바이너리 직접 지정 (선택) — 지정 시 자동 탐색/설치 건너뜀 */
|
|
572
|
+
sofficePath?: string;
|
|
567
573
|
}
|
|
568
574
|
interface UnifiedOcrResult {
|
|
569
575
|
outputPath: string;
|
|
@@ -602,4 +608,4 @@ declare function parseXlsx(buffer: ArrayBuffer, options?: ParseOptions, zip?: JS
|
|
|
602
608
|
/** DOCX 파일을 Markdown으로 변환 */
|
|
603
609
|
declare function parseDocx(buffer: ArrayBuffer, options?: ParseOptions, zip?: JSZip): Promise<ParseResult>;
|
|
604
610
|
|
|
605
|
-
export { type AcquireResult, AllKeysCoolingDownError, type ApiKeyRotationOptions, ApiKeyRotationPool, type BlockDiff, type BoundingBox, type CellContext, type CellDiff, type ConvertToPdfFailure, type ConvertToPdfOptions, type ConvertToPdfResult, type ConvertToPdfSuccess, type DiffChangeType, type DiffResult, type DocumentMetadata, type ErrorCode, type ExtractedImage, type FileType, type FormField, type FormResult, type IRBlock, type IRBlockType, type IRCell, type IRTable, type ImageData, type InlineStyle, type KeyHealthSnapshot, type MarkdownToXlsxOptions, type OcrMode, type OcrProvider, type OutlineItem, type ParseFailure, type ParseOptions, type ParseResult, type ParseSuccess, type ParseWarning, type StructuredOcrResult, type UnifiedOcrOptions, type UnifiedOcrProgressEvent, type UnifiedOcrResult, type UnifiedStage, VERSION, type WarningCode, type WatchOptions, blocksToMarkdown, compare, convertHwpToPdf, convertHwpxToPdf, convertToPdf, detectFormat, detectZipFormat, diffBlocks, extractFormFields, isHwpxFile, isOldHwpFile, isPdfFile, isZipFile, markdownToHwpx, markdownToXlsx, parse, parseDocx, parseHwp, parseHwpx, parsePdf, parseXlsx, runUnifiedOcrPipeline };
|
|
611
|
+
export { type AcquireResult, AllKeysCoolingDownError, type ApiKeyRotationOptions, ApiKeyRotationPool, type BlockDiff, type BoundingBox, type CellContext, type CellDiff, type ConvertErrorEvent, type ConvertInstallEvent, type ConvertToPdfFailure, type ConvertToPdfOptions, type ConvertToPdfResult, type ConvertToPdfSuccess, type ConvertValidateEvent, type DiffChangeType, type DiffResult, type DocumentMetadata, type ErrorCode, type ExtractedImage, type FileType, type FormField, type FormResult, type IRBlock, type IRBlockType, type IRCell, type IRTable, type ImageData, type InlineStyle, type KeyHealthSnapshot, type MarkdownToXlsxOptions, type OcrMode, type OcrProvider, type OutlineItem, type ParseFailure, type ParseOptions, type ParseResult, type ParseSuccess, type ParseWarning, type StructuredOcrResult, type UnifiedOcrAuxEvent, type UnifiedOcrOptions, type UnifiedOcrProgressEvent, type UnifiedOcrResult, type UnifiedStage, VERSION, type WarningCode, type WatchOptions, blocksToMarkdown, compare, convertHwpToPdf, convertHwpxToPdf, convertToPdf, detectFormat, detectZipFormat, diffBlocks, extractFormFields, isHwpxFile, isOldHwpFile, isPdfFile, isZipFile, markdownToHwpx, markdownToXlsx, parse, parseDocx, parseHwp, parseHwpx, parsePdf, parseXlsx, runUnifiedOcrPipeline };
|
package/dist/index.js
CHANGED
|
@@ -37,118 +37,6 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
|
|
|
37
37
|
mod
|
|
38
38
|
));
|
|
39
39
|
|
|
40
|
-
// src/utils.ts
|
|
41
|
-
var utils_exports = {};
|
|
42
|
-
__export(utils_exports, {
|
|
43
|
-
KordocError: () => KordocError,
|
|
44
|
-
VERSION: () => VERSION,
|
|
45
|
-
classifyError: () => classifyError,
|
|
46
|
-
isPathTraversal: () => isPathTraversal,
|
|
47
|
-
normalizeKordocError: () => normalizeKordocError,
|
|
48
|
-
precheckZipSize: () => precheckZipSize,
|
|
49
|
-
sanitizeError: () => sanitizeError,
|
|
50
|
-
sanitizeHref: () => sanitizeHref,
|
|
51
|
-
toArrayBuffer: () => toArrayBuffer
|
|
52
|
-
});
|
|
53
|
-
function toArrayBuffer(buf) {
|
|
54
|
-
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
55
|
-
return buf.buffer;
|
|
56
|
-
}
|
|
57
|
-
return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
|
|
58
|
-
}
|
|
59
|
-
function sanitizeError(err) {
|
|
60
|
-
if (err instanceof KordocError) return err.message;
|
|
61
|
-
return "\uBB38\uC11C \uCC98\uB9AC \uC911 \uC624\uB958\uAC00 \uBC1C\uC0DD\uD588\uC2B5\uB2C8\uB2E4";
|
|
62
|
-
}
|
|
63
|
-
function isPathTraversal(name) {
|
|
64
|
-
if (name.includes("\0")) return true;
|
|
65
|
-
const normalized = name.replace(/\\/g, "/");
|
|
66
|
-
return normalized.includes("..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
|
|
67
|
-
}
|
|
68
|
-
function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
|
|
69
|
-
try {
|
|
70
|
-
const data = new DataView(buffer);
|
|
71
|
-
const len = buffer.byteLength;
|
|
72
|
-
let eocdOffset = -1;
|
|
73
|
-
for (let i = len - 22; i >= Math.max(0, len - 65557); i--) {
|
|
74
|
-
if (data.getUint32(i, true) === 101010256) {
|
|
75
|
-
eocdOffset = i;
|
|
76
|
-
break;
|
|
77
|
-
}
|
|
78
|
-
}
|
|
79
|
-
if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 };
|
|
80
|
-
const entryCount = data.getUint16(eocdOffset + 10, true);
|
|
81
|
-
if (entryCount > maxEntries) {
|
|
82
|
-
throw new KordocError(`ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC: ${entryCount} (\uCD5C\uB300 ${maxEntries})`);
|
|
83
|
-
}
|
|
84
|
-
const cdSize = data.getUint32(eocdOffset + 12, true);
|
|
85
|
-
const cdOffset = data.getUint32(eocdOffset + 16, true);
|
|
86
|
-
if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount };
|
|
87
|
-
let totalUncompressed = 0;
|
|
88
|
-
let pos = cdOffset;
|
|
89
|
-
for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {
|
|
90
|
-
if (data.getUint32(pos, true) !== 33639248) break;
|
|
91
|
-
totalUncompressed += data.getUint32(pos + 24, true);
|
|
92
|
-
const nameLen = data.getUint16(pos + 28, true);
|
|
93
|
-
const extraLen = data.getUint16(pos + 30, true);
|
|
94
|
-
const commentLen = data.getUint16(pos + 32, true);
|
|
95
|
-
pos += 46 + nameLen + extraLen + commentLen;
|
|
96
|
-
}
|
|
97
|
-
if (totalUncompressed > maxUncompressedSize) {
|
|
98
|
-
throw new KordocError(`ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 ${maxUncompressedSize / 1024 / 1024}MB)`);
|
|
99
|
-
}
|
|
100
|
-
return { totalUncompressed, entryCount };
|
|
101
|
-
} catch (err) {
|
|
102
|
-
if (err instanceof KordocError) throw err;
|
|
103
|
-
return { totalUncompressed: 0, entryCount: 0 };
|
|
104
|
-
}
|
|
105
|
-
}
|
|
106
|
-
function sanitizeHref(href) {
|
|
107
|
-
const trimmed = href.trim();
|
|
108
|
-
if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
|
|
109
|
-
return trimmed;
|
|
110
|
-
}
|
|
111
|
-
function classifyError(err) {
|
|
112
|
-
if (!(err instanceof Error)) return "PARSE_ERROR";
|
|
113
|
-
const msg = err.message;
|
|
114
|
-
if (msg.includes("\uC554\uD638\uD654")) return "ENCRYPTED";
|
|
115
|
-
if (msg.includes("DRM")) return "DRM_PROTECTED";
|
|
116
|
-
if (msg.includes("ZIP bomb") || msg.includes("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || msg.includes("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")) return "ZIP_BOMB";
|
|
117
|
-
if (msg.includes("bomb") || msg.includes("\uD06C\uAE30 \uCD08\uACFC") || msg.includes("\uC555\uCD95 \uD574\uC81C")) return "DECOMPRESSION_BOMB";
|
|
118
|
-
if (msg.includes("\uC774\uBBF8\uC9C0 \uAE30\uBC18")) return "IMAGE_BASED_PDF";
|
|
119
|
-
if (msg.includes("\uC139\uC158") && (msg.includes("\uCC3E\uC744 \uC218 \uC5C6") || msg.includes("\uC5C6\uC74C"))) return "NO_SECTIONS";
|
|
120
|
-
if (msg.includes("\uC2DC\uADF8\uB2C8\uCC98") || msg.includes("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")) return "CORRUPTED";
|
|
121
|
-
return "PARSE_ERROR";
|
|
122
|
-
}
|
|
123
|
-
function normalizeKordocError(err, fallbackMessage, stage = "unknown", fallbackCode = "PARSE_ERROR") {
|
|
124
|
-
if (err instanceof KordocError) {
|
|
125
|
-
if (!err.stage) err.stage = stage;
|
|
126
|
-
if (!err.code) err.code = fallbackCode;
|
|
127
|
-
return err;
|
|
128
|
-
}
|
|
129
|
-
const message = err instanceof Error ? err.message : fallbackMessage;
|
|
130
|
-
const code = err instanceof Error ? classifyError(err) : fallbackCode;
|
|
131
|
-
return new KordocError(message || fallbackMessage, { code, stage });
|
|
132
|
-
}
|
|
133
|
-
var VERSION, KordocError, SAFE_HREF_RE;
|
|
134
|
-
var init_utils = __esm({
|
|
135
|
-
"src/utils.ts"() {
|
|
136
|
-
"use strict";
|
|
137
|
-
VERSION = true ? "2.6.0" : "0.0.0-dev";
|
|
138
|
-
KordocError = class extends Error {
|
|
139
|
-
code;
|
|
140
|
-
stage;
|
|
141
|
-
constructor(message, opts = {}) {
|
|
142
|
-
super(message);
|
|
143
|
-
this.name = "KordocError";
|
|
144
|
-
this.code = opts.code;
|
|
145
|
-
this.stage = opts.stage;
|
|
146
|
-
}
|
|
147
|
-
};
|
|
148
|
-
SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
|
|
149
|
-
}
|
|
150
|
-
});
|
|
151
|
-
|
|
152
40
|
// src/page-range.ts
|
|
153
41
|
var page_range_exports = {};
|
|
154
42
|
__export(page_range_exports, {
|
|
@@ -3223,8 +3111,97 @@ async function detectZipFormat(buffer) {
|
|
|
3223
3111
|
import JSZip2 from "jszip";
|
|
3224
3112
|
import { DOMParser } from "@xmldom/xmldom";
|
|
3225
3113
|
|
|
3114
|
+
// src/utils.ts
|
|
3115
|
+
var VERSION = true ? "2.7.1" : "0.0.0-dev";
|
|
3116
|
+
function toArrayBuffer(buf) {
|
|
3117
|
+
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
3118
|
+
return buf.buffer;
|
|
3119
|
+
}
|
|
3120
|
+
return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
|
|
3121
|
+
}
|
|
3122
|
+
var KordocError = class extends Error {
|
|
3123
|
+
code;
|
|
3124
|
+
stage;
|
|
3125
|
+
constructor(message, opts = {}) {
|
|
3126
|
+
super(message);
|
|
3127
|
+
this.name = "KordocError";
|
|
3128
|
+
this.code = opts.code;
|
|
3129
|
+
this.stage = opts.stage;
|
|
3130
|
+
}
|
|
3131
|
+
};
|
|
3132
|
+
function isPathTraversal(name) {
|
|
3133
|
+
if (name.includes("\0")) return true;
|
|
3134
|
+
const normalized = name.replace(/\\/g, "/");
|
|
3135
|
+
return normalized.includes("..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
|
|
3136
|
+
}
|
|
3137
|
+
function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
|
|
3138
|
+
try {
|
|
3139
|
+
const data = new DataView(buffer);
|
|
3140
|
+
const len = buffer.byteLength;
|
|
3141
|
+
let eocdOffset = -1;
|
|
3142
|
+
for (let i = len - 22; i >= Math.max(0, len - 65557); i--) {
|
|
3143
|
+
if (data.getUint32(i, true) === 101010256) {
|
|
3144
|
+
eocdOffset = i;
|
|
3145
|
+
break;
|
|
3146
|
+
}
|
|
3147
|
+
}
|
|
3148
|
+
if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 };
|
|
3149
|
+
const entryCount = data.getUint16(eocdOffset + 10, true);
|
|
3150
|
+
if (entryCount > maxEntries) {
|
|
3151
|
+
throw new KordocError(`ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC: ${entryCount} (\uCD5C\uB300 ${maxEntries})`);
|
|
3152
|
+
}
|
|
3153
|
+
const cdSize = data.getUint32(eocdOffset + 12, true);
|
|
3154
|
+
const cdOffset = data.getUint32(eocdOffset + 16, true);
|
|
3155
|
+
if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount };
|
|
3156
|
+
let totalUncompressed = 0;
|
|
3157
|
+
let pos = cdOffset;
|
|
3158
|
+
for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {
|
|
3159
|
+
if (data.getUint32(pos, true) !== 33639248) break;
|
|
3160
|
+
totalUncompressed += data.getUint32(pos + 24, true);
|
|
3161
|
+
const nameLen = data.getUint16(pos + 28, true);
|
|
3162
|
+
const extraLen = data.getUint16(pos + 30, true);
|
|
3163
|
+
const commentLen = data.getUint16(pos + 32, true);
|
|
3164
|
+
pos += 46 + nameLen + extraLen + commentLen;
|
|
3165
|
+
}
|
|
3166
|
+
if (totalUncompressed > maxUncompressedSize) {
|
|
3167
|
+
throw new KordocError(`ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 ${maxUncompressedSize / 1024 / 1024}MB)`);
|
|
3168
|
+
}
|
|
3169
|
+
return { totalUncompressed, entryCount };
|
|
3170
|
+
} catch (err) {
|
|
3171
|
+
if (err instanceof KordocError) throw err;
|
|
3172
|
+
return { totalUncompressed: 0, entryCount: 0 };
|
|
3173
|
+
}
|
|
3174
|
+
}
|
|
3175
|
+
var SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
|
|
3176
|
+
function sanitizeHref(href) {
|
|
3177
|
+
const trimmed = href.trim();
|
|
3178
|
+
if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
|
|
3179
|
+
return trimmed;
|
|
3180
|
+
}
|
|
3181
|
+
function classifyError(err) {
|
|
3182
|
+
if (!(err instanceof Error)) return "PARSE_ERROR";
|
|
3183
|
+
const msg = err.message;
|
|
3184
|
+
if (msg.includes("\uC554\uD638\uD654")) return "ENCRYPTED";
|
|
3185
|
+
if (msg.includes("DRM")) return "DRM_PROTECTED";
|
|
3186
|
+
if (msg.includes("ZIP bomb") || msg.includes("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || msg.includes("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")) return "ZIP_BOMB";
|
|
3187
|
+
if (msg.includes("bomb") || msg.includes("\uD06C\uAE30 \uCD08\uACFC") || msg.includes("\uC555\uCD95 \uD574\uC81C")) return "DECOMPRESSION_BOMB";
|
|
3188
|
+
if (msg.includes("\uC774\uBBF8\uC9C0 \uAE30\uBC18")) return "IMAGE_BASED_PDF";
|
|
3189
|
+
if (msg.includes("\uC139\uC158") && (msg.includes("\uCC3E\uC744 \uC218 \uC5C6") || msg.includes("\uC5C6\uC74C"))) return "NO_SECTIONS";
|
|
3190
|
+
if (msg.includes("\uC2DC\uADF8\uB2C8\uCC98") || msg.includes("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")) return "CORRUPTED";
|
|
3191
|
+
return "PARSE_ERROR";
|
|
3192
|
+
}
|
|
3193
|
+
function normalizeKordocError(err, fallbackMessage, stage = "unknown", fallbackCode = "PARSE_ERROR") {
|
|
3194
|
+
if (err instanceof KordocError) {
|
|
3195
|
+
if (!err.stage) err.stage = stage;
|
|
3196
|
+
if (!err.code) err.code = fallbackCode;
|
|
3197
|
+
return err;
|
|
3198
|
+
}
|
|
3199
|
+
const message = err instanceof Error ? err.message : fallbackMessage;
|
|
3200
|
+
const code = err instanceof Error ? classifyError(err) : fallbackCode;
|
|
3201
|
+
return new KordocError(message || fallbackMessage, { code, stage });
|
|
3202
|
+
}
|
|
3203
|
+
|
|
3226
3204
|
// src/table/builder.ts
|
|
3227
|
-
init_utils();
|
|
3228
3205
|
var MAX_COLS = 200;
|
|
3229
3206
|
var MAX_ROWS = 1e4;
|
|
3230
3207
|
function buildTable(rows) {
|
|
@@ -3484,8 +3461,6 @@ var HEADING_RATIO_H2 = 1.3;
|
|
|
3484
3461
|
var HEADING_RATIO_H3 = 1.15;
|
|
3485
3462
|
|
|
3486
3463
|
// src/hwpx/parser.ts
|
|
3487
|
-
init_utils();
|
|
3488
|
-
init_utils();
|
|
3489
3464
|
init_page_range();
|
|
3490
3465
|
init_logger();
|
|
3491
3466
|
var MAX_DECOMPRESS_SIZE = 500 * 1024 * 1024;
|
|
@@ -4327,7 +4302,6 @@ function extractTextFromNode(node) {
|
|
|
4327
4302
|
}
|
|
4328
4303
|
|
|
4329
4304
|
// src/hwp5/record.ts
|
|
4330
|
-
init_utils();
|
|
4331
4305
|
import { inflateRawSync, inflateSync } from "zlib";
|
|
4332
4306
|
var TAG_PARA_HEADER = 66;
|
|
4333
4307
|
var TAG_PARA_TEXT = 67;
|
|
@@ -5378,7 +5352,6 @@ function parseLenientCfb(data) {
|
|
|
5378
5352
|
}
|
|
5379
5353
|
|
|
5380
5354
|
// src/hwp5/parser.ts
|
|
5381
|
-
init_utils();
|
|
5382
5355
|
init_page_range();
|
|
5383
5356
|
init_logger();
|
|
5384
5357
|
var CFB = __toESM(require_cfb(), 1);
|
|
@@ -6034,7 +6007,6 @@ function arrangeCells(rows, cols, cells) {
|
|
|
6034
6007
|
}
|
|
6035
6008
|
|
|
6036
6009
|
// src/pdf/parser.ts
|
|
6037
|
-
init_utils();
|
|
6038
6010
|
init_page_range();
|
|
6039
6011
|
import { createRequire } from "module";
|
|
6040
6012
|
import { dirname as dirname2, join as join3, resolve as resolve2 } from "path";
|
|
@@ -7926,7 +7898,6 @@ function mergeKoreanLines(text) {
|
|
|
7926
7898
|
}
|
|
7927
7899
|
|
|
7928
7900
|
// src/xlsx/parser.ts
|
|
7929
|
-
init_utils();
|
|
7930
7901
|
import JSZip3 from "jszip";
|
|
7931
7902
|
import { DOMParser as DOMParser2 } from "@xmldom/xmldom";
|
|
7932
7903
|
init_logger();
|
|
@@ -8255,7 +8226,6 @@ async function parseXlsxDocument(buffer, options, existingZip) {
|
|
|
8255
8226
|
}
|
|
8256
8227
|
|
|
8257
8228
|
// src/docx/parser.ts
|
|
8258
|
-
init_utils();
|
|
8259
8229
|
import JSZip4 from "jszip";
|
|
8260
8230
|
import { DOMParser as DOMParser3 } from "@xmldom/xmldom";
|
|
8261
8231
|
init_logger();
|
|
@@ -8737,7 +8707,6 @@ async function parseDocxDocument(buffer, options, existingZip) {
|
|
|
8737
8707
|
}
|
|
8738
8708
|
|
|
8739
8709
|
// src/index.ts
|
|
8740
|
-
init_utils();
|
|
8741
8710
|
init_cli_provider();
|
|
8742
8711
|
init_markdown_to_blocks();
|
|
8743
8712
|
init_logger();
|
|
@@ -11241,7 +11210,6 @@ async function markdownToXlsx(markdown, options) {
|
|
|
11241
11210
|
|
|
11242
11211
|
// src/convert/index.ts
|
|
11243
11212
|
import { readFile } from "fs/promises";
|
|
11244
|
-
init_utils();
|
|
11245
11213
|
|
|
11246
11214
|
// src/convert/libreoffice.ts
|
|
11247
11215
|
import libre from "libreoffice-convert";
|
|
@@ -11261,6 +11229,7 @@ import { join as join4, delimiter } from "path";
|
|
|
11261
11229
|
import { mkdir, access, symlink, rm } from "fs/promises";
|
|
11262
11230
|
import { createWriteStream } from "fs";
|
|
11263
11231
|
import { spawn as spawn2 } from "child_process";
|
|
11232
|
+
var installInFlight = null;
|
|
11264
11233
|
var CACHE_DIR = join4(homedir(), ".cache", "kordoc", "libreoffice");
|
|
11265
11234
|
var VERSION_FILE = join4(CACHE_DIR, "version");
|
|
11266
11235
|
var PACKAGES = {
|
|
@@ -11281,13 +11250,11 @@ var PACKAGES = {
|
|
|
11281
11250
|
}
|
|
11282
11251
|
};
|
|
11283
11252
|
async function findInPath() {
|
|
11284
|
-
|
|
11285
|
-
const
|
|
11286
|
-
|
|
11287
|
-
|
|
11288
|
-
}
|
|
11289
|
-
return null;
|
|
11290
|
-
}
|
|
11253
|
+
return new Promise((resolve4) => {
|
|
11254
|
+
const child = spawn2("soffice", ["--version"], { stdio: "ignore" });
|
|
11255
|
+
child.on("close", (code) => resolve4(code === 0 ? "soffice" : null));
|
|
11256
|
+
child.on("error", () => resolve4(null));
|
|
11257
|
+
});
|
|
11291
11258
|
}
|
|
11292
11259
|
async function findInCache() {
|
|
11293
11260
|
const cachedBin = join4(CACHE_DIR, "bin", "soffice");
|
|
@@ -11298,6 +11265,38 @@ async function findInCache() {
|
|
|
11298
11265
|
return null;
|
|
11299
11266
|
}
|
|
11300
11267
|
}
|
|
11268
|
+
async function findInDefaultPaths() {
|
|
11269
|
+
const platform = process.platform;
|
|
11270
|
+
const paths = [];
|
|
11271
|
+
if (platform === "darwin") {
|
|
11272
|
+
paths.push(
|
|
11273
|
+
"/Applications/LibreOffice.app/Contents/MacOS/soffice",
|
|
11274
|
+
"/opt/homebrew/bin/soffice",
|
|
11275
|
+
"/usr/local/bin/soffice"
|
|
11276
|
+
);
|
|
11277
|
+
} else if (platform === "linux") {
|
|
11278
|
+
paths.push(
|
|
11279
|
+
"/usr/bin/soffice",
|
|
11280
|
+
"/usr/lib/libreoffice/program/soffice"
|
|
11281
|
+
);
|
|
11282
|
+
} else if (platform === "win32") {
|
|
11283
|
+
const pf = process.env["ProgramFiles"] ?? "C:\\Program Files";
|
|
11284
|
+
const pf86 = process.env["ProgramFiles(x86)"] ?? "C:\\Program Files (x86)";
|
|
11285
|
+
paths.push(
|
|
11286
|
+
join4(pf, "LibreOffice", "program", "soffice.exe"),
|
|
11287
|
+
join4(pf86, "LibreOffice", "program", "soffice.exe")
|
|
11288
|
+
);
|
|
11289
|
+
}
|
|
11290
|
+
for (const p of paths) {
|
|
11291
|
+
try {
|
|
11292
|
+
await access(p);
|
|
11293
|
+
return p;
|
|
11294
|
+
} catch {
|
|
11295
|
+
continue;
|
|
11296
|
+
}
|
|
11297
|
+
}
|
|
11298
|
+
return null;
|
|
11299
|
+
}
|
|
11301
11300
|
async function downloadWithProgress(url, dest, totalBytes, onProgress) {
|
|
11302
11301
|
const response = await fetch(url);
|
|
11303
11302
|
if (!response.body) throw new Error("\uB2E4\uC6B4\uB85C\uB4DC \uC2E4\uD328: response body \uC5C6\uC74C");
|
|
@@ -11308,13 +11307,17 @@ async function downloadWithProgress(url, dest, totalBytes, onProgress) {
|
|
|
11308
11307
|
while (true) {
|
|
11309
11308
|
const { done, value } = await reader.read();
|
|
11310
11309
|
if (done) break;
|
|
11311
|
-
file.write(value)
|
|
11310
|
+
if (!file.write(value)) {
|
|
11311
|
+
await new Promise((resolve4) => file.once("drain", resolve4));
|
|
11312
|
+
}
|
|
11312
11313
|
downloaded += value.length;
|
|
11313
11314
|
onProgress?.(downloaded, totalBytes);
|
|
11314
11315
|
}
|
|
11315
11316
|
} finally {
|
|
11316
|
-
file.end();
|
|
11317
11317
|
reader.releaseLock();
|
|
11318
|
+
await new Promise((resolve4, reject) => {
|
|
11319
|
+
file.end((err) => err ? reject(err) : resolve4());
|
|
11320
|
+
});
|
|
11318
11321
|
}
|
|
11319
11322
|
}
|
|
11320
11323
|
async function installForPlatform(pkg, onProgress) {
|
|
@@ -11425,6 +11428,11 @@ async function resolveSoffice(emitter, autoInstall = true) {
|
|
|
11425
11428
|
emitter.validate("soffice_found", "\uCE90\uC2DC\uB41C LibreOffice \uBC1C\uACAC", { sofficePath: inCache });
|
|
11426
11429
|
return inCache;
|
|
11427
11430
|
}
|
|
11431
|
+
const inDefault = await findInDefaultPaths();
|
|
11432
|
+
if (inDefault) {
|
|
11433
|
+
emitter.validate("soffice_found", "\uAE30\uBCF8 \uACBD\uB85C\uC5D0\uC11C LibreOffice \uBC1C\uACAC", { sofficePath: inDefault });
|
|
11434
|
+
return inDefault;
|
|
11435
|
+
}
|
|
11428
11436
|
if (!autoInstall) {
|
|
11429
11437
|
emitter.error(
|
|
11430
11438
|
"validate",
|
|
@@ -11434,38 +11442,35 @@ async function resolveSoffice(emitter, autoInstall = true) {
|
|
|
11434
11442
|
);
|
|
11435
11443
|
throw new ConvertError("SOFFICE_NOT_FOUND", "LibreOffice\uAC00 \uC124\uCE58\uB418\uC9C0 \uC54A\uC558\uC2B5\uB2C8\uB2E4");
|
|
11436
11444
|
}
|
|
11445
|
+
if (installInFlight) {
|
|
11446
|
+
return installInFlight;
|
|
11447
|
+
}
|
|
11437
11448
|
emitter.install("install_start", "LibreOffice \uC790\uB3D9 \uC124\uCE58\uB97C \uC2DC\uC791\uD569\uB2C8\uB2E4...");
|
|
11438
|
-
|
|
11439
|
-
|
|
11440
|
-
const
|
|
11441
|
-
|
|
11442
|
-
percent
|
|
11443
|
-
|
|
11444
|
-
|
|
11449
|
+
installInFlight = (async () => {
|
|
11450
|
+
try {
|
|
11451
|
+
const installed = await installLibreOffice((downloaded, total) => {
|
|
11452
|
+
const percent = Math.round(downloaded / total * 100);
|
|
11453
|
+
emitter.install("download_progress", `\uB2E4\uC6B4\uB85C\uB4DC \uC911... ${percent}%`, {
|
|
11454
|
+
percent,
|
|
11455
|
+
downloadedBytes: downloaded,
|
|
11456
|
+
totalBytes: total
|
|
11457
|
+
});
|
|
11445
11458
|
});
|
|
11446
|
-
|
|
11447
|
-
|
|
11448
|
-
|
|
11449
|
-
|
|
11450
|
-
|
|
11451
|
-
|
|
11452
|
-
|
|
11453
|
-
|
|
11459
|
+
emitter.install("install_complete", "\uC124\uCE58 \uC644\uB8CC", { installedPath: installed });
|
|
11460
|
+
return installed;
|
|
11461
|
+
} catch (err) {
|
|
11462
|
+
const errorMsg = err instanceof Error ? err.message : String(err);
|
|
11463
|
+
emitter.install("install_failed", "\uC124\uCE58 \uC2E4\uD328", { error: errorMsg });
|
|
11464
|
+
throw err;
|
|
11465
|
+
} finally {
|
|
11466
|
+
installInFlight = null;
|
|
11467
|
+
}
|
|
11468
|
+
})();
|
|
11469
|
+
return installInFlight;
|
|
11454
11470
|
}
|
|
11455
11471
|
|
|
11456
11472
|
// src/convert/libreoffice.ts
|
|
11457
11473
|
var libreConvert = libre.convert;
|
|
11458
|
-
async function assertSofficeAvailable() {
|
|
11459
|
-
const { runCommand: runCommand2 } = await Promise.resolve().then(() => (init_utils(), utils_exports));
|
|
11460
|
-
try {
|
|
11461
|
-
await runCommand2("soffice", ["--version"]);
|
|
11462
|
-
} catch {
|
|
11463
|
-
throw new ConvertError(
|
|
11464
|
-
"SOFFICE_NOT_FOUND",
|
|
11465
|
-
"soffice\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4. LibreOffice\uB97C \uC124\uCE58\uD574 \uC8FC\uC138\uC694."
|
|
11466
|
-
);
|
|
11467
|
-
}
|
|
11468
|
-
}
|
|
11469
11474
|
async function convertBuffer(buffer, targetExt, timeoutMs = 6e4) {
|
|
11470
11475
|
return new Promise((resolve4, reject) => {
|
|
11471
11476
|
const timer = setTimeout(() => {
|
|
@@ -11711,9 +11716,6 @@ async function convertHwpxToPdf(input, options) {
|
|
|
11711
11716
|
return result;
|
|
11712
11717
|
}
|
|
11713
11718
|
|
|
11714
|
-
// src/index.ts
|
|
11715
|
-
init_utils();
|
|
11716
|
-
|
|
11717
11719
|
// src/ocr/api-key-rotation.ts
|
|
11718
11720
|
var AllKeysCoolingDownError = class extends Error {
|
|
11719
11721
|
waitMs;
|
|
@@ -11809,7 +11811,7 @@ var ApiKeyRotationPool = class _ApiKeyRotationPool {
|
|
|
11809
11811
|
|
|
11810
11812
|
// src/pipeline/unified-ocr.ts
|
|
11811
11813
|
import { mkdir as mkdir2, readdir, readFile as readFile2, stat, writeFile as writeFile2 } from "fs/promises";
|
|
11812
|
-
import { basename as basename2, dirname as dirname3, extname, join as join5, resolve as resolve3 } from "path";
|
|
11814
|
+
import { basename as basename2, delimiter as delimiter2, dirname as dirname3, extname, join as join5, resolve as resolve3 } from "path";
|
|
11813
11815
|
import { spawn as spawn3 } from "child_process";
|
|
11814
11816
|
import { performance } from "perf_hooks";
|
|
11815
11817
|
init_logger();
|
|
@@ -11985,7 +11987,25 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11985
11987
|
markStageStart("convert", "\uBB38\uC11C\uB97C PDF\uB85C \uBCC0\uD658 \uC911");
|
|
11986
11988
|
logStage("info", "convert", "start", "\uBB38\uC11C\uB97C PDF\uB85C \uBCC0\uD658 \uC2DC\uC791", { input: absInput });
|
|
11987
11989
|
if (extname(absInput).toLowerCase() !== ".pdf") {
|
|
11988
|
-
|
|
11990
|
+
const convertEmitter = new ConvertEventEmitter();
|
|
11991
|
+
if (options.onEvent) {
|
|
11992
|
+
convertEmitter.setListener((evt) => {
|
|
11993
|
+
if (evt.type === "install" || evt.type === "validate" || evt.type === "error") {
|
|
11994
|
+
try {
|
|
11995
|
+
;
|
|
11996
|
+
options.onEvent(evt);
|
|
11997
|
+
} catch {
|
|
11998
|
+
}
|
|
11999
|
+
}
|
|
12000
|
+
});
|
|
12001
|
+
}
|
|
12002
|
+
if (options.sofficePath) {
|
|
12003
|
+
const sofficeDir = dirname3(options.sofficePath);
|
|
12004
|
+
process.env.PATH = `${sofficeDir}${delimiter2}${process.env.PATH ?? ""}`;
|
|
12005
|
+
convertEmitter.validate("soffice_found", "\uC9C1\uC811 \uC9C0\uC815\uB41C LibreOffice \uACBD\uB85C \uC0AC\uC6A9", { sofficePath: options.sofficePath });
|
|
12006
|
+
} else {
|
|
12007
|
+
await resolveSoffice(convertEmitter, options.autoInstallLibreOffice ?? false);
|
|
12008
|
+
}
|
|
11989
12009
|
workingPdfPath = join5(workspaceDir, `${stem}.pdf`);
|
|
11990
12010
|
const inputBuffer = await readFile2(absInput);
|
|
11991
12011
|
const out = await convertBuffer(inputBuffer, ".pdf");
|
|
@@ -12558,6 +12578,16 @@ function ensureSupportedInput(path) {
|
|
|
12558
12578
|
}
|
|
12559
12579
|
function normalizePipelineError(err, stage) {
|
|
12560
12580
|
if (err instanceof UnifiedOcrError) return err;
|
|
12581
|
+
if (err instanceof ConvertError) {
|
|
12582
|
+
const codeMap = {
|
|
12583
|
+
SOFFICE_NOT_FOUND: "SOFFICE_NOT_FOUND",
|
|
12584
|
+
CONVERT_FAILED: "CONVERT_FAILED",
|
|
12585
|
+
TIMEOUT: "CONVERT_FAILED",
|
|
12586
|
+
UNSUPPORTED_PLATFORM: "CONVERT_FAILED",
|
|
12587
|
+
UNSUPPORTED_FORMAT: "UNSUPPORTED_INPUT"
|
|
12588
|
+
};
|
|
12589
|
+
return new UnifiedOcrError(codeMap[err.code] ?? "CONVERT_FAILED", stage, err.message);
|
|
12590
|
+
}
|
|
12561
12591
|
const message = err instanceof Error ? err.message : String(err);
|
|
12562
12592
|
const codeByStage = {
|
|
12563
12593
|
convert: "CONVERT_FAILED",
|