@clazic/kordoc 2.6.1 → 2.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -549,12 +549,14 @@ interface UnifiedOcrProgressEvent {
549
549
  pageNumber?: number;
550
550
  workerCount?: number;
551
551
  }
552
+ /** unified-ocr에서 onEvent로 전달되는 보조 이벤트 — LibreOffice 설치/검증/에러 */
553
+ type UnifiedOcrAuxEvent = ConvertInstallEvent | ConvertValidateEvent | ConvertErrorEvent;
552
554
  interface UnifiedOcrOptions {
553
555
  workspaceDir?: string;
554
556
  outputPath?: string;
555
557
  dpi?: number;
556
558
  baseUrl?: string;
557
- onEvent?: (event: UnifiedOcrProgressEvent) => void;
559
+ onEvent?: (event: UnifiedOcrProgressEvent | UnifiedOcrAuxEvent) => void;
558
560
  modelCandidates?: string[];
559
561
  modelMaxTokens?: Record<string, number>;
560
562
  stageWeights?: Partial<Record<UnifiedStage, number>>;
@@ -564,6 +566,10 @@ interface UnifiedOcrOptions {
564
566
  logger?: Logger;
565
567
  runId?: string;
566
568
  concurrencyPerKey?: number;
569
+ /** LibreOffice 자동 설치 허용 (기본 false) — convert 단계에서 install/validate 이벤트가 onEvent로 전달됨 */
570
+ autoInstallLibreOffice?: boolean;
571
+ /** LibreOffice 바이너리 직접 지정 (선택) — 지정 시 자동 탐색/설치 건너뜀 */
572
+ sofficePath?: string;
567
573
  }
568
574
  interface UnifiedOcrResult {
569
575
  outputPath: string;
@@ -602,4 +608,4 @@ declare function parseXlsx(buffer: ArrayBuffer, options?: ParseOptions, zip?: JS
602
608
  /** DOCX 파일을 Markdown으로 변환 */
603
609
  declare function parseDocx(buffer: ArrayBuffer, options?: ParseOptions, zip?: JSZip): Promise<ParseResult>;
604
610
 
605
- export { type AcquireResult, AllKeysCoolingDownError, type ApiKeyRotationOptions, ApiKeyRotationPool, type BlockDiff, type BoundingBox, type CellContext, type CellDiff, type ConvertToPdfFailure, type ConvertToPdfOptions, type ConvertToPdfResult, type ConvertToPdfSuccess, type DiffChangeType, type DiffResult, type DocumentMetadata, type ErrorCode, type ExtractedImage, type FileType, type FormField, type FormResult, type IRBlock, type IRBlockType, type IRCell, type IRTable, type ImageData, type InlineStyle, type KeyHealthSnapshot, type MarkdownToXlsxOptions, type OcrMode, type OcrProvider, type OutlineItem, type ParseFailure, type ParseOptions, type ParseResult, type ParseSuccess, type ParseWarning, type StructuredOcrResult, type UnifiedOcrOptions, type UnifiedOcrProgressEvent, type UnifiedOcrResult, type UnifiedStage, VERSION, type WarningCode, type WatchOptions, blocksToMarkdown, compare, convertHwpToPdf, convertHwpxToPdf, convertToPdf, detectFormat, detectZipFormat, diffBlocks, extractFormFields, isHwpxFile, isOldHwpFile, isPdfFile, isZipFile, markdownToHwpx, markdownToXlsx, parse, parseDocx, parseHwp, parseHwpx, parsePdf, parseXlsx, runUnifiedOcrPipeline };
611
+ export { type AcquireResult, AllKeysCoolingDownError, type ApiKeyRotationOptions, ApiKeyRotationPool, type BlockDiff, type BoundingBox, type CellContext, type CellDiff, type ConvertErrorEvent, type ConvertInstallEvent, type ConvertToPdfFailure, type ConvertToPdfOptions, type ConvertToPdfResult, type ConvertToPdfSuccess, type ConvertValidateEvent, type DiffChangeType, type DiffResult, type DocumentMetadata, type ErrorCode, type ExtractedImage, type FileType, type FormField, type FormResult, type IRBlock, type IRBlockType, type IRCell, type IRTable, type ImageData, type InlineStyle, type KeyHealthSnapshot, type MarkdownToXlsxOptions, type OcrMode, type OcrProvider, type OutlineItem, type ParseFailure, type ParseOptions, type ParseResult, type ParseSuccess, type ParseWarning, type StructuredOcrResult, type UnifiedOcrAuxEvent, type UnifiedOcrOptions, type UnifiedOcrProgressEvent, type UnifiedOcrResult, type UnifiedStage, VERSION, type WarningCode, type WatchOptions, blocksToMarkdown, compare, convertHwpToPdf, convertHwpxToPdf, convertToPdf, detectFormat, detectZipFormat, diffBlocks, extractFormFields, isHwpxFile, isOldHwpFile, isPdfFile, isZipFile, markdownToHwpx, markdownToXlsx, parse, parseDocx, parseHwp, parseHwpx, parsePdf, parseXlsx, runUnifiedOcrPipeline };
package/dist/index.d.ts CHANGED
@@ -549,12 +549,14 @@ interface UnifiedOcrProgressEvent {
549
549
  pageNumber?: number;
550
550
  workerCount?: number;
551
551
  }
552
+ /** unified-ocr에서 onEvent로 전달되는 보조 이벤트 — LibreOffice 설치/검증/에러 */
553
+ type UnifiedOcrAuxEvent = ConvertInstallEvent | ConvertValidateEvent | ConvertErrorEvent;
552
554
  interface UnifiedOcrOptions {
553
555
  workspaceDir?: string;
554
556
  outputPath?: string;
555
557
  dpi?: number;
556
558
  baseUrl?: string;
557
- onEvent?: (event: UnifiedOcrProgressEvent) => void;
559
+ onEvent?: (event: UnifiedOcrProgressEvent | UnifiedOcrAuxEvent) => void;
558
560
  modelCandidates?: string[];
559
561
  modelMaxTokens?: Record<string, number>;
560
562
  stageWeights?: Partial<Record<UnifiedStage, number>>;
@@ -564,6 +566,10 @@ interface UnifiedOcrOptions {
564
566
  logger?: Logger;
565
567
  runId?: string;
566
568
  concurrencyPerKey?: number;
569
+ /** LibreOffice 자동 설치 허용 (기본 false) — convert 단계에서 install/validate 이벤트가 onEvent로 전달됨 */
570
+ autoInstallLibreOffice?: boolean;
571
+ /** LibreOffice 바이너리 직접 지정 (선택) — 지정 시 자동 탐색/설치 건너뜀 */
572
+ sofficePath?: string;
567
573
  }
568
574
  interface UnifiedOcrResult {
569
575
  outputPath: string;
@@ -602,4 +608,4 @@ declare function parseXlsx(buffer: ArrayBuffer, options?: ParseOptions, zip?: JS
602
608
  /** DOCX 파일을 Markdown으로 변환 */
603
609
  declare function parseDocx(buffer: ArrayBuffer, options?: ParseOptions, zip?: JSZip): Promise<ParseResult>;
604
610
 
605
- export { type AcquireResult, AllKeysCoolingDownError, type ApiKeyRotationOptions, ApiKeyRotationPool, type BlockDiff, type BoundingBox, type CellContext, type CellDiff, type ConvertToPdfFailure, type ConvertToPdfOptions, type ConvertToPdfResult, type ConvertToPdfSuccess, type DiffChangeType, type DiffResult, type DocumentMetadata, type ErrorCode, type ExtractedImage, type FileType, type FormField, type FormResult, type IRBlock, type IRBlockType, type IRCell, type IRTable, type ImageData, type InlineStyle, type KeyHealthSnapshot, type MarkdownToXlsxOptions, type OcrMode, type OcrProvider, type OutlineItem, type ParseFailure, type ParseOptions, type ParseResult, type ParseSuccess, type ParseWarning, type StructuredOcrResult, type UnifiedOcrOptions, type UnifiedOcrProgressEvent, type UnifiedOcrResult, type UnifiedStage, VERSION, type WarningCode, type WatchOptions, blocksToMarkdown, compare, convertHwpToPdf, convertHwpxToPdf, convertToPdf, detectFormat, detectZipFormat, diffBlocks, extractFormFields, isHwpxFile, isOldHwpFile, isPdfFile, isZipFile, markdownToHwpx, markdownToXlsx, parse, parseDocx, parseHwp, parseHwpx, parsePdf, parseXlsx, runUnifiedOcrPipeline };
611
+ export { type AcquireResult, AllKeysCoolingDownError, type ApiKeyRotationOptions, ApiKeyRotationPool, type BlockDiff, type BoundingBox, type CellContext, type CellDiff, type ConvertErrorEvent, type ConvertInstallEvent, type ConvertToPdfFailure, type ConvertToPdfOptions, type ConvertToPdfResult, type ConvertToPdfSuccess, type ConvertValidateEvent, type DiffChangeType, type DiffResult, type DocumentMetadata, type ErrorCode, type ExtractedImage, type FileType, type FormField, type FormResult, type IRBlock, type IRBlockType, type IRCell, type IRTable, type ImageData, type InlineStyle, type KeyHealthSnapshot, type MarkdownToXlsxOptions, type OcrMode, type OcrProvider, type OutlineItem, type ParseFailure, type ParseOptions, type ParseResult, type ParseSuccess, type ParseWarning, type StructuredOcrResult, type UnifiedOcrAuxEvent, type UnifiedOcrOptions, type UnifiedOcrProgressEvent, type UnifiedOcrResult, type UnifiedStage, VERSION, type WarningCode, type WatchOptions, blocksToMarkdown, compare, convertHwpToPdf, convertHwpxToPdf, convertToPdf, detectFormat, detectZipFormat, diffBlocks, extractFormFields, isHwpxFile, isOldHwpFile, isPdfFile, isZipFile, markdownToHwpx, markdownToXlsx, parse, parseDocx, parseHwp, parseHwpx, parsePdf, parseXlsx, runUnifiedOcrPipeline };
package/dist/index.js CHANGED
@@ -37,118 +37,6 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
37
37
  mod
38
38
  ));
39
39
 
40
- // src/utils.ts
41
- var utils_exports = {};
42
- __export(utils_exports, {
43
- KordocError: () => KordocError,
44
- VERSION: () => VERSION,
45
- classifyError: () => classifyError,
46
- isPathTraversal: () => isPathTraversal,
47
- normalizeKordocError: () => normalizeKordocError,
48
- precheckZipSize: () => precheckZipSize,
49
- sanitizeError: () => sanitizeError,
50
- sanitizeHref: () => sanitizeHref,
51
- toArrayBuffer: () => toArrayBuffer
52
- });
53
- function toArrayBuffer(buf) {
54
- if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
55
- return buf.buffer;
56
- }
57
- return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
58
- }
59
- function sanitizeError(err) {
60
- if (err instanceof KordocError) return err.message;
61
- return "\uBB38\uC11C \uCC98\uB9AC \uC911 \uC624\uB958\uAC00 \uBC1C\uC0DD\uD588\uC2B5\uB2C8\uB2E4";
62
- }
63
- function isPathTraversal(name) {
64
- if (name.includes("\0")) return true;
65
- const normalized = name.replace(/\\/g, "/");
66
- return normalized.includes("..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
67
- }
68
- function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
69
- try {
70
- const data = new DataView(buffer);
71
- const len = buffer.byteLength;
72
- let eocdOffset = -1;
73
- for (let i = len - 22; i >= Math.max(0, len - 65557); i--) {
74
- if (data.getUint32(i, true) === 101010256) {
75
- eocdOffset = i;
76
- break;
77
- }
78
- }
79
- if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 };
80
- const entryCount = data.getUint16(eocdOffset + 10, true);
81
- if (entryCount > maxEntries) {
82
- throw new KordocError(`ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC: ${entryCount} (\uCD5C\uB300 ${maxEntries})`);
83
- }
84
- const cdSize = data.getUint32(eocdOffset + 12, true);
85
- const cdOffset = data.getUint32(eocdOffset + 16, true);
86
- if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount };
87
- let totalUncompressed = 0;
88
- let pos = cdOffset;
89
- for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {
90
- if (data.getUint32(pos, true) !== 33639248) break;
91
- totalUncompressed += data.getUint32(pos + 24, true);
92
- const nameLen = data.getUint16(pos + 28, true);
93
- const extraLen = data.getUint16(pos + 30, true);
94
- const commentLen = data.getUint16(pos + 32, true);
95
- pos += 46 + nameLen + extraLen + commentLen;
96
- }
97
- if (totalUncompressed > maxUncompressedSize) {
98
- throw new KordocError(`ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 ${maxUncompressedSize / 1024 / 1024}MB)`);
99
- }
100
- return { totalUncompressed, entryCount };
101
- } catch (err) {
102
- if (err instanceof KordocError) throw err;
103
- return { totalUncompressed: 0, entryCount: 0 };
104
- }
105
- }
106
- function sanitizeHref(href) {
107
- const trimmed = href.trim();
108
- if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
109
- return trimmed;
110
- }
111
- function classifyError(err) {
112
- if (!(err instanceof Error)) return "PARSE_ERROR";
113
- const msg = err.message;
114
- if (msg.includes("\uC554\uD638\uD654")) return "ENCRYPTED";
115
- if (msg.includes("DRM")) return "DRM_PROTECTED";
116
- if (msg.includes("ZIP bomb") || msg.includes("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || msg.includes("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")) return "ZIP_BOMB";
117
- if (msg.includes("bomb") || msg.includes("\uD06C\uAE30 \uCD08\uACFC") || msg.includes("\uC555\uCD95 \uD574\uC81C")) return "DECOMPRESSION_BOMB";
118
- if (msg.includes("\uC774\uBBF8\uC9C0 \uAE30\uBC18")) return "IMAGE_BASED_PDF";
119
- if (msg.includes("\uC139\uC158") && (msg.includes("\uCC3E\uC744 \uC218 \uC5C6") || msg.includes("\uC5C6\uC74C"))) return "NO_SECTIONS";
120
- if (msg.includes("\uC2DC\uADF8\uB2C8\uCC98") || msg.includes("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")) return "CORRUPTED";
121
- return "PARSE_ERROR";
122
- }
123
- function normalizeKordocError(err, fallbackMessage, stage = "unknown", fallbackCode = "PARSE_ERROR") {
124
- if (err instanceof KordocError) {
125
- if (!err.stage) err.stage = stage;
126
- if (!err.code) err.code = fallbackCode;
127
- return err;
128
- }
129
- const message = err instanceof Error ? err.message : fallbackMessage;
130
- const code = err instanceof Error ? classifyError(err) : fallbackCode;
131
- return new KordocError(message || fallbackMessage, { code, stage });
132
- }
133
- var VERSION, KordocError, SAFE_HREF_RE;
134
- var init_utils = __esm({
135
- "src/utils.ts"() {
136
- "use strict";
137
- VERSION = true ? "2.6.0" : "0.0.0-dev";
138
- KordocError = class extends Error {
139
- code;
140
- stage;
141
- constructor(message, opts = {}) {
142
- super(message);
143
- this.name = "KordocError";
144
- this.code = opts.code;
145
- this.stage = opts.stage;
146
- }
147
- };
148
- SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
149
- }
150
- });
151
-
152
40
  // src/page-range.ts
153
41
  var page_range_exports = {};
154
42
  __export(page_range_exports, {
@@ -3223,8 +3111,97 @@ async function detectZipFormat(buffer) {
3223
3111
  import JSZip2 from "jszip";
3224
3112
  import { DOMParser } from "@xmldom/xmldom";
3225
3113
 
3114
+ // src/utils.ts
3115
+ var VERSION = true ? "2.6.1" : "0.0.0-dev";
3116
+ function toArrayBuffer(buf) {
3117
+ if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
3118
+ return buf.buffer;
3119
+ }
3120
+ return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
3121
+ }
3122
+ var KordocError = class extends Error {
3123
+ code;
3124
+ stage;
3125
+ constructor(message, opts = {}) {
3126
+ super(message);
3127
+ this.name = "KordocError";
3128
+ this.code = opts.code;
3129
+ this.stage = opts.stage;
3130
+ }
3131
+ };
3132
+ function isPathTraversal(name) {
3133
+ if (name.includes("\0")) return true;
3134
+ const normalized = name.replace(/\\/g, "/");
3135
+ return normalized.includes("..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
3136
+ }
3137
+ function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
3138
+ try {
3139
+ const data = new DataView(buffer);
3140
+ const len = buffer.byteLength;
3141
+ let eocdOffset = -1;
3142
+ for (let i = len - 22; i >= Math.max(0, len - 65557); i--) {
3143
+ if (data.getUint32(i, true) === 101010256) {
3144
+ eocdOffset = i;
3145
+ break;
3146
+ }
3147
+ }
3148
+ if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 };
3149
+ const entryCount = data.getUint16(eocdOffset + 10, true);
3150
+ if (entryCount > maxEntries) {
3151
+ throw new KordocError(`ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC: ${entryCount} (\uCD5C\uB300 ${maxEntries})`);
3152
+ }
3153
+ const cdSize = data.getUint32(eocdOffset + 12, true);
3154
+ const cdOffset = data.getUint32(eocdOffset + 16, true);
3155
+ if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount };
3156
+ let totalUncompressed = 0;
3157
+ let pos = cdOffset;
3158
+ for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {
3159
+ if (data.getUint32(pos, true) !== 33639248) break;
3160
+ totalUncompressed += data.getUint32(pos + 24, true);
3161
+ const nameLen = data.getUint16(pos + 28, true);
3162
+ const extraLen = data.getUint16(pos + 30, true);
3163
+ const commentLen = data.getUint16(pos + 32, true);
3164
+ pos += 46 + nameLen + extraLen + commentLen;
3165
+ }
3166
+ if (totalUncompressed > maxUncompressedSize) {
3167
+ throw new KordocError(`ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 ${maxUncompressedSize / 1024 / 1024}MB)`);
3168
+ }
3169
+ return { totalUncompressed, entryCount };
3170
+ } catch (err) {
3171
+ if (err instanceof KordocError) throw err;
3172
+ return { totalUncompressed: 0, entryCount: 0 };
3173
+ }
3174
+ }
3175
+ var SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
3176
+ function sanitizeHref(href) {
3177
+ const trimmed = href.trim();
3178
+ if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
3179
+ return trimmed;
3180
+ }
3181
+ function classifyError(err) {
3182
+ if (!(err instanceof Error)) return "PARSE_ERROR";
3183
+ const msg = err.message;
3184
+ if (msg.includes("\uC554\uD638\uD654")) return "ENCRYPTED";
3185
+ if (msg.includes("DRM")) return "DRM_PROTECTED";
3186
+ if (msg.includes("ZIP bomb") || msg.includes("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || msg.includes("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")) return "ZIP_BOMB";
3187
+ if (msg.includes("bomb") || msg.includes("\uD06C\uAE30 \uCD08\uACFC") || msg.includes("\uC555\uCD95 \uD574\uC81C")) return "DECOMPRESSION_BOMB";
3188
+ if (msg.includes("\uC774\uBBF8\uC9C0 \uAE30\uBC18")) return "IMAGE_BASED_PDF";
3189
+ if (msg.includes("\uC139\uC158") && (msg.includes("\uCC3E\uC744 \uC218 \uC5C6") || msg.includes("\uC5C6\uC74C"))) return "NO_SECTIONS";
3190
+ if (msg.includes("\uC2DC\uADF8\uB2C8\uCC98") || msg.includes("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")) return "CORRUPTED";
3191
+ return "PARSE_ERROR";
3192
+ }
3193
+ function normalizeKordocError(err, fallbackMessage, stage = "unknown", fallbackCode = "PARSE_ERROR") {
3194
+ if (err instanceof KordocError) {
3195
+ if (!err.stage) err.stage = stage;
3196
+ if (!err.code) err.code = fallbackCode;
3197
+ return err;
3198
+ }
3199
+ const message = err instanceof Error ? err.message : fallbackMessage;
3200
+ const code = err instanceof Error ? classifyError(err) : fallbackCode;
3201
+ return new KordocError(message || fallbackMessage, { code, stage });
3202
+ }
3203
+
3226
3204
  // src/table/builder.ts
3227
- init_utils();
3228
3205
  var MAX_COLS = 200;
3229
3206
  var MAX_ROWS = 1e4;
3230
3207
  function buildTable(rows) {
@@ -3484,8 +3461,6 @@ var HEADING_RATIO_H2 = 1.3;
3484
3461
  var HEADING_RATIO_H3 = 1.15;
3485
3462
 
3486
3463
  // src/hwpx/parser.ts
3487
- init_utils();
3488
- init_utils();
3489
3464
  init_page_range();
3490
3465
  init_logger();
3491
3466
  var MAX_DECOMPRESS_SIZE = 500 * 1024 * 1024;
@@ -4327,7 +4302,6 @@ function extractTextFromNode(node) {
4327
4302
  }
4328
4303
 
4329
4304
  // src/hwp5/record.ts
4330
- init_utils();
4331
4305
  import { inflateRawSync, inflateSync } from "zlib";
4332
4306
  var TAG_PARA_HEADER = 66;
4333
4307
  var TAG_PARA_TEXT = 67;
@@ -5378,7 +5352,6 @@ function parseLenientCfb(data) {
5378
5352
  }
5379
5353
 
5380
5354
  // src/hwp5/parser.ts
5381
- init_utils();
5382
5355
  init_page_range();
5383
5356
  init_logger();
5384
5357
  var CFB = __toESM(require_cfb(), 1);
@@ -6034,7 +6007,6 @@ function arrangeCells(rows, cols, cells) {
6034
6007
  }
6035
6008
 
6036
6009
  // src/pdf/parser.ts
6037
- init_utils();
6038
6010
  init_page_range();
6039
6011
  import { createRequire } from "module";
6040
6012
  import { dirname as dirname2, join as join3, resolve as resolve2 } from "path";
@@ -7926,7 +7898,6 @@ function mergeKoreanLines(text) {
7926
7898
  }
7927
7899
 
7928
7900
  // src/xlsx/parser.ts
7929
- init_utils();
7930
7901
  import JSZip3 from "jszip";
7931
7902
  import { DOMParser as DOMParser2 } from "@xmldom/xmldom";
7932
7903
  init_logger();
@@ -8255,7 +8226,6 @@ async function parseXlsxDocument(buffer, options, existingZip) {
8255
8226
  }
8256
8227
 
8257
8228
  // src/docx/parser.ts
8258
- init_utils();
8259
8229
  import JSZip4 from "jszip";
8260
8230
  import { DOMParser as DOMParser3 } from "@xmldom/xmldom";
8261
8231
  init_logger();
@@ -8737,7 +8707,6 @@ async function parseDocxDocument(buffer, options, existingZip) {
8737
8707
  }
8738
8708
 
8739
8709
  // src/index.ts
8740
- init_utils();
8741
8710
  init_cli_provider();
8742
8711
  init_markdown_to_blocks();
8743
8712
  init_logger();
@@ -11241,7 +11210,6 @@ async function markdownToXlsx(markdown, options) {
11241
11210
 
11242
11211
  // src/convert/index.ts
11243
11212
  import { readFile } from "fs/promises";
11244
- init_utils();
11245
11213
 
11246
11214
  // src/convert/libreoffice.ts
11247
11215
  import libre from "libreoffice-convert";
@@ -11261,6 +11229,7 @@ import { join as join4, delimiter } from "path";
11261
11229
  import { mkdir, access, symlink, rm } from "fs/promises";
11262
11230
  import { createWriteStream } from "fs";
11263
11231
  import { spawn as spawn2 } from "child_process";
11232
+ var installInFlight = null;
11264
11233
  var CACHE_DIR = join4(homedir(), ".cache", "kordoc", "libreoffice");
11265
11234
  var VERSION_FILE = join4(CACHE_DIR, "version");
11266
11235
  var PACKAGES = {
@@ -11281,13 +11250,11 @@ var PACKAGES = {
11281
11250
  }
11282
11251
  };
11283
11252
  async function findInPath() {
11284
- try {
11285
- const { runCommand: runCommand2 } = await Promise.resolve().then(() => (init_utils(), utils_exports));
11286
- await runCommand2("soffice", ["--version"]);
11287
- return "soffice";
11288
- } catch {
11289
- return null;
11290
- }
11253
+ return new Promise((resolve4) => {
11254
+ const child = spawn2("soffice", ["--version"], { stdio: "ignore" });
11255
+ child.on("close", (code) => resolve4(code === 0 ? "soffice" : null));
11256
+ child.on("error", () => resolve4(null));
11257
+ });
11291
11258
  }
11292
11259
  async function findInCache() {
11293
11260
  const cachedBin = join4(CACHE_DIR, "bin", "soffice");
@@ -11298,6 +11265,38 @@ async function findInCache() {
11298
11265
  return null;
11299
11266
  }
11300
11267
  }
11268
+ async function findInDefaultPaths() {
11269
+ const platform = process.platform;
11270
+ const paths = [];
11271
+ if (platform === "darwin") {
11272
+ paths.push(
11273
+ "/Applications/LibreOffice.app/Contents/MacOS/soffice",
11274
+ "/opt/homebrew/bin/soffice",
11275
+ "/usr/local/bin/soffice"
11276
+ );
11277
+ } else if (platform === "linux") {
11278
+ paths.push(
11279
+ "/usr/bin/soffice",
11280
+ "/usr/lib/libreoffice/program/soffice"
11281
+ );
11282
+ } else if (platform === "win32") {
11283
+ const pf = process.env["ProgramFiles"] ?? "C:\\Program Files";
11284
+ const pf86 = process.env["ProgramFiles(x86)"] ?? "C:\\Program Files (x86)";
11285
+ paths.push(
11286
+ join4(pf, "LibreOffice", "program", "soffice.exe"),
11287
+ join4(pf86, "LibreOffice", "program", "soffice.exe")
11288
+ );
11289
+ }
11290
+ for (const p of paths) {
11291
+ try {
11292
+ await access(p);
11293
+ return p;
11294
+ } catch {
11295
+ continue;
11296
+ }
11297
+ }
11298
+ return null;
11299
+ }
11301
11300
  async function downloadWithProgress(url, dest, totalBytes, onProgress) {
11302
11301
  const response = await fetch(url);
11303
11302
  if (!response.body) throw new Error("\uB2E4\uC6B4\uB85C\uB4DC \uC2E4\uD328: response body \uC5C6\uC74C");
@@ -11425,6 +11424,11 @@ async function resolveSoffice(emitter, autoInstall = true) {
11425
11424
  emitter.validate("soffice_found", "\uCE90\uC2DC\uB41C LibreOffice \uBC1C\uACAC", { sofficePath: inCache });
11426
11425
  return inCache;
11427
11426
  }
11427
+ const inDefault = await findInDefaultPaths();
11428
+ if (inDefault) {
11429
+ emitter.validate("soffice_found", "\uAE30\uBCF8 \uACBD\uB85C\uC5D0\uC11C LibreOffice \uBC1C\uACAC", { sofficePath: inDefault });
11430
+ return inDefault;
11431
+ }
11428
11432
  if (!autoInstall) {
11429
11433
  emitter.error(
11430
11434
  "validate",
@@ -11434,38 +11438,35 @@ async function resolveSoffice(emitter, autoInstall = true) {
11434
11438
  );
11435
11439
  throw new ConvertError("SOFFICE_NOT_FOUND", "LibreOffice\uAC00 \uC124\uCE58\uB418\uC9C0 \uC54A\uC558\uC2B5\uB2C8\uB2E4");
11436
11440
  }
11441
+ if (installInFlight) {
11442
+ return installInFlight;
11443
+ }
11437
11444
  emitter.install("install_start", "LibreOffice \uC790\uB3D9 \uC124\uCE58\uB97C \uC2DC\uC791\uD569\uB2C8\uB2E4...");
11438
- try {
11439
- const installed = await installLibreOffice((downloaded, total) => {
11440
- const percent = Math.round(downloaded / total * 100);
11441
- emitter.install("download_progress", `\uB2E4\uC6B4\uB85C\uB4DC \uC911... ${percent}%`, {
11442
- percent,
11443
- downloadedBytes: downloaded,
11444
- totalBytes: total
11445
+ installInFlight = (async () => {
11446
+ try {
11447
+ const installed = await installLibreOffice((downloaded, total) => {
11448
+ const percent = Math.round(downloaded / total * 100);
11449
+ emitter.install("download_progress", `\uB2E4\uC6B4\uB85C\uB4DC \uC911... ${percent}%`, {
11450
+ percent,
11451
+ downloadedBytes: downloaded,
11452
+ totalBytes: total
11453
+ });
11445
11454
  });
11446
- });
11447
- emitter.install("install_complete", "\uC124\uCE58 \uC644\uB8CC", { installedPath: installed });
11448
- return installed;
11449
- } catch (err) {
11450
- const errorMsg = err instanceof Error ? err.message : String(err);
11451
- emitter.install("install_failed", "\uC124\uCE58 \uC2E4\uD328", { error: errorMsg });
11452
- throw err;
11453
- }
11455
+ emitter.install("install_complete", "\uC124\uCE58 \uC644\uB8CC", { installedPath: installed });
11456
+ return installed;
11457
+ } catch (err) {
11458
+ const errorMsg = err instanceof Error ? err.message : String(err);
11459
+ emitter.install("install_failed", "\uC124\uCE58 \uC2E4\uD328", { error: errorMsg });
11460
+ throw err;
11461
+ } finally {
11462
+ installInFlight = null;
11463
+ }
11464
+ })();
11465
+ return installInFlight;
11454
11466
  }
11455
11467
 
11456
11468
  // src/convert/libreoffice.ts
11457
11469
  var libreConvert = libre.convert;
11458
- async function assertSofficeAvailable() {
11459
- const { runCommand: runCommand2 } = await Promise.resolve().then(() => (init_utils(), utils_exports));
11460
- try {
11461
- await runCommand2("soffice", ["--version"]);
11462
- } catch {
11463
- throw new ConvertError(
11464
- "SOFFICE_NOT_FOUND",
11465
- "soffice\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4. LibreOffice\uB97C \uC124\uCE58\uD574 \uC8FC\uC138\uC694."
11466
- );
11467
- }
11468
- }
11469
11470
  async function convertBuffer(buffer, targetExt, timeoutMs = 6e4) {
11470
11471
  return new Promise((resolve4, reject) => {
11471
11472
  const timer = setTimeout(() => {
@@ -11711,9 +11712,6 @@ async function convertHwpxToPdf(input, options) {
11711
11712
  return result;
11712
11713
  }
11713
11714
 
11714
- // src/index.ts
11715
- init_utils();
11716
-
11717
11715
  // src/ocr/api-key-rotation.ts
11718
11716
  var AllKeysCoolingDownError = class extends Error {
11719
11717
  waitMs;
@@ -11809,7 +11807,7 @@ var ApiKeyRotationPool = class _ApiKeyRotationPool {
11809
11807
 
11810
11808
  // src/pipeline/unified-ocr.ts
11811
11809
  import { mkdir as mkdir2, readdir, readFile as readFile2, stat, writeFile as writeFile2 } from "fs/promises";
11812
- import { basename as basename2, dirname as dirname3, extname, join as join5, resolve as resolve3 } from "path";
11810
+ import { basename as basename2, delimiter as delimiter2, dirname as dirname3, extname, join as join5, resolve as resolve3 } from "path";
11813
11811
  import { spawn as spawn3 } from "child_process";
11814
11812
  import { performance } from "perf_hooks";
11815
11813
  init_logger();
@@ -11985,7 +11983,25 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11985
11983
  markStageStart("convert", "\uBB38\uC11C\uB97C PDF\uB85C \uBCC0\uD658 \uC911");
11986
11984
  logStage("info", "convert", "start", "\uBB38\uC11C\uB97C PDF\uB85C \uBCC0\uD658 \uC2DC\uC791", { input: absInput });
11987
11985
  if (extname(absInput).toLowerCase() !== ".pdf") {
11988
- await assertSofficeAvailable();
11986
+ const convertEmitter = new ConvertEventEmitter();
11987
+ if (options.onEvent) {
11988
+ convertEmitter.setListener((evt) => {
11989
+ if (evt.type === "install" || evt.type === "validate" || evt.type === "error") {
11990
+ try {
11991
+ ;
11992
+ options.onEvent(evt);
11993
+ } catch {
11994
+ }
11995
+ }
11996
+ });
11997
+ }
11998
+ if (options.sofficePath) {
11999
+ const sofficeDir = dirname3(options.sofficePath);
12000
+ process.env.PATH = `${sofficeDir}${delimiter2}${process.env.PATH ?? ""}`;
12001
+ convertEmitter.validate("soffice_found", "\uC9C1\uC811 \uC9C0\uC815\uB41C LibreOffice \uACBD\uB85C \uC0AC\uC6A9", { sofficePath: options.sofficePath });
12002
+ } else {
12003
+ await resolveSoffice(convertEmitter, options.autoInstallLibreOffice ?? false);
12004
+ }
11989
12005
  workingPdfPath = join5(workspaceDir, `${stem}.pdf`);
11990
12006
  const inputBuffer = await readFile2(absInput);
11991
12007
  const out = await convertBuffer(inputBuffer, ".pdf");
@@ -12558,6 +12574,16 @@ function ensureSupportedInput(path) {
12558
12574
  }
12559
12575
  function normalizePipelineError(err, stage) {
12560
12576
  if (err instanceof UnifiedOcrError) return err;
12577
+ if (err instanceof ConvertError) {
12578
+ const codeMap = {
12579
+ SOFFICE_NOT_FOUND: "SOFFICE_NOT_FOUND",
12580
+ CONVERT_FAILED: "CONVERT_FAILED",
12581
+ TIMEOUT: "CONVERT_FAILED",
12582
+ UNSUPPORTED_PLATFORM: "CONVERT_FAILED",
12583
+ UNSUPPORTED_FORMAT: "UNSUPPORTED_INPUT"
12584
+ };
12585
+ return new UnifiedOcrError(codeMap[err.code] ?? "CONVERT_FAILED", stage, err.message);
12586
+ }
12561
12587
  const message = err instanceof Error ? err.message : String(err);
12562
12588
  const codeByStage = {
12563
12589
  convert: "CONVERT_FAILED",