@clazic/kordoc 2.3.1 → 2.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{batch-provider-PCT4I4LK.js → batch-provider-PNDCSGQW.js} +3 -4
- package/dist/batch-provider-PNDCSGQW.js.map +1 -0
- package/dist/chunk-4PP34NVQ.js +121 -0
- package/dist/chunk-4PP34NVQ.js.map +1 -0
- package/dist/{tesseract-provider-WCVJWBUT.js → chunk-7FMKAV4P.js} +4 -4
- package/dist/{tesseract-provider-WCVJWBUT.js.map → chunk-7FMKAV4P.js.map} +1 -1
- package/dist/chunk-JOGAFNIL.js +153 -0
- package/dist/chunk-JOGAFNIL.js.map +1 -0
- package/dist/{chunk-W5KUC23B.js → chunk-NU3KFVVZ.js} +2 -2
- package/dist/{chunk-ZOEUKD77.js → chunk-UDFKY7CH.js} +204 -49
- package/dist/chunk-UDFKY7CH.js.map +1 -0
- package/dist/cli.js +8 -5
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +230 -72
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +11 -6
- package/dist/index.d.ts +11 -6
- package/dist/index.js +230 -72
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +5 -2
- package/dist/mcp.js.map +1 -1
- package/dist/{provider-WYHC4NHI.js → provider-HE727F7Z.js} +19 -131
- package/dist/provider-HE727F7Z.js.map +1 -0
- package/dist/resolve-UOAOPQ4H.js +111 -0
- package/dist/resolve-UOAOPQ4H.js.map +1 -0
- package/dist/tesseract-provider-MNMZPSGF.js +11 -0
- package/dist/{utils-HSF5HI5T.js → utils-STJT6CFC.js} +2 -2
- package/dist/utils-STJT6CFC.js.map +1 -0
- package/dist/{watch-R2JHXDGF.js → watch-PRQGLOW3.js} +6 -3
- package/dist/{watch-R2JHXDGF.js.map → watch-PRQGLOW3.js.map} +1 -1
- package/package.json +8 -8
- package/dist/batch-provider-PCT4I4LK.js.map +0 -1
- package/dist/chunk-ZOEUKD77.js.map +0 -1
- package/dist/provider-WYHC4NHI.js.map +0 -1
- package/dist/resolve-4FSAQF2S.js +0 -247
- package/dist/resolve-4FSAQF2S.js.map +0 -1
- /package/dist/{chunk-W5KUC23B.js.map → chunk-NU3KFVVZ.js.map} +0 -0
- /package/dist/{utils-HSF5HI5T.js.map → tesseract-provider-MNMZPSGF.js.map} +0 -0
|
@@ -6,10 +6,19 @@ import {
|
|
|
6
6
|
precheckZipSize,
|
|
7
7
|
sanitizeHref,
|
|
8
8
|
toArrayBuffer
|
|
9
|
-
} from "./chunk-
|
|
9
|
+
} from "./chunk-NU3KFVVZ.js";
|
|
10
10
|
import {
|
|
11
11
|
parsePageRange
|
|
12
12
|
} from "./chunk-MOL7MDBG.js";
|
|
13
|
+
import {
|
|
14
|
+
createTesseractProvider
|
|
15
|
+
} from "./chunk-7FMKAV4P.js";
|
|
16
|
+
import {
|
|
17
|
+
createCliOcrProvider
|
|
18
|
+
} from "./chunk-JOGAFNIL.js";
|
|
19
|
+
import {
|
|
20
|
+
markdownToBlocks
|
|
21
|
+
} from "./chunk-4PP34NVQ.js";
|
|
13
22
|
import {
|
|
14
23
|
__commonJS,
|
|
15
24
|
__require,
|
|
@@ -1918,24 +1927,29 @@ function isPdfFile(buffer) {
|
|
|
1918
1927
|
const b = magicBytes(buffer);
|
|
1919
1928
|
return b[0] === 37 && b[1] === 80 && b[2] === 68 && b[3] === 70;
|
|
1920
1929
|
}
|
|
1930
|
+
function isPngFile(buffer) {
|
|
1931
|
+
const b = magicBytes(buffer);
|
|
1932
|
+
return b[0] === 137 && b[1] === 80 && b[2] === 78 && b[3] === 71;
|
|
1933
|
+
}
|
|
1921
1934
|
function detectFormat(buffer) {
|
|
1922
1935
|
if (buffer.byteLength < 4) return "unknown";
|
|
1923
1936
|
if (isZipFile(buffer)) return "hwpx";
|
|
1924
1937
|
if (isOldHwpFile(buffer)) return "hwp";
|
|
1925
1938
|
if (isPdfFile(buffer)) return "pdf";
|
|
1939
|
+
if (isPngFile(buffer)) return "image";
|
|
1926
1940
|
return "unknown";
|
|
1927
1941
|
}
|
|
1928
1942
|
async function detectZipFormat(buffer) {
|
|
1929
1943
|
try {
|
|
1930
1944
|
const zip = await JSZip.loadAsync(buffer);
|
|
1931
|
-
if (zip.file("xl/workbook.xml")) return "xlsx";
|
|
1932
|
-
if (zip.file("word/document.xml")) return "docx";
|
|
1933
|
-
if (zip.file("Contents/content.hpf") || zip.file("mimetype")) return "hwpx";
|
|
1945
|
+
if (zip.file("xl/workbook.xml")) return { format: "xlsx", zip };
|
|
1946
|
+
if (zip.file("word/document.xml")) return { format: "docx", zip };
|
|
1947
|
+
if (zip.file("Contents/content.hpf") || zip.file("mimetype")) return { format: "hwpx", zip };
|
|
1934
1948
|
const hasSection = Object.keys(zip.files).some((f) => f.startsWith("Contents/"));
|
|
1935
|
-
if (hasSection) return "hwpx";
|
|
1936
|
-
return "unknown";
|
|
1949
|
+
if (hasSection) return { format: "hwpx", zip };
|
|
1950
|
+
return { format: "unknown", zip: null };
|
|
1937
1951
|
} catch {
|
|
1938
|
-
return "unknown";
|
|
1952
|
+
return { format: "unknown", zip: null };
|
|
1939
1953
|
}
|
|
1940
1954
|
}
|
|
1941
1955
|
|
|
@@ -2024,12 +2038,16 @@ function buildTableDirect(rows, numRows) {
|
|
|
2024
2038
|
return trimAndReturn(grid, numRows, maxCols);
|
|
2025
2039
|
}
|
|
2026
2040
|
function trimAndReturn(grid, numRows, maxCols) {
|
|
2027
|
-
let effectiveCols =
|
|
2028
|
-
|
|
2029
|
-
|
|
2030
|
-
|
|
2031
|
-
|
|
2041
|
+
let effectiveCols = 0;
|
|
2042
|
+
for (const row of grid) {
|
|
2043
|
+
for (let c = row.length - 1; c >= effectiveCols; c--) {
|
|
2044
|
+
if (row[c]?.text?.trim()) {
|
|
2045
|
+
effectiveCols = c + 1;
|
|
2046
|
+
break;
|
|
2047
|
+
}
|
|
2048
|
+
}
|
|
2032
2049
|
}
|
|
2050
|
+
if (effectiveCols === 0) effectiveCols = maxCols;
|
|
2033
2051
|
if (effectiveCols < maxCols && effectiveCols > 0) {
|
|
2034
2052
|
const trimmed = grid.map((row) => row.slice(0, effectiveCols));
|
|
2035
2053
|
return { rows: numRows, cols: effectiveCols, cells: trimmed, hasHeader: numRows > 1 };
|
|
@@ -2289,11 +2307,11 @@ function parseStyleElements(doc, map) {
|
|
|
2289
2307
|
function stripDtd(xml) {
|
|
2290
2308
|
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
2291
2309
|
}
|
|
2292
|
-
async function parseHwpxDocument(buffer, options) {
|
|
2310
|
+
async function parseHwpxDocument(buffer, options, existingZip) {
|
|
2293
2311
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
|
|
2294
2312
|
let zip;
|
|
2295
2313
|
try {
|
|
2296
|
-
zip = await JSZip2.loadAsync(buffer);
|
|
2314
|
+
zip = existingZip ?? await JSZip2.loadAsync(buffer);
|
|
2297
2315
|
} catch {
|
|
2298
2316
|
return await extractFromBrokenZip(buffer);
|
|
2299
2317
|
}
|
|
@@ -5328,8 +5346,15 @@ import { getDocument, GlobalWorkerOptions } from "pdfjs-dist/legacy/build/pdf.mj
|
|
|
5328
5346
|
GlobalWorkerOptions.workerSrc = "";
|
|
5329
5347
|
var MAX_PAGES = 5e3;
|
|
5330
5348
|
var MAX_TOTAL_TEXT = 500 * 1024 * 1024;
|
|
5331
|
-
|
|
5349
|
+
/**
 * Computes the PDF loading timeout, in milliseconds, from the input size.
 *
 * The timeout is a 30 000 ms base plus 500 ms per MiB of input, capped at
 * 300 000 ms.
 *
 * @param {number} bufferSize - Size of the PDF input in bytes.
 * @returns {number} Timeout in milliseconds, always within [30000, 300000].
 */
function calcPdfTimeout(bufferSize) {
  const base = 3e4;
  const perMb = 500;
  const max = 3e5;
  const size = Number(bufferSize);
  // NaN, zero and negative sizes would otherwise yield a NaN or sub-base
  // delay, which setTimeout treats as "fire almost immediately".
  if (!(size > 0)) return base;
  const mb = size / (1024 * 1024);
  return Math.min(base + Math.ceil(mb * perMb), max);
}
|
|
5332
5355
|
async function loadPdfWithTimeout(buffer) {
|
|
5356
|
+
const timeoutMs = calcPdfTimeout(buffer.byteLength);
|
|
5357
|
+
const timeoutSec = Math.round(timeoutMs / 1e3);
|
|
5333
5358
|
const loadingTask = getDocument({
|
|
5334
5359
|
data: new Uint8Array(buffer),
|
|
5335
5360
|
useSystemFonts: true,
|
|
@@ -5343,8 +5368,8 @@ async function loadPdfWithTimeout(buffer) {
|
|
|
5343
5368
|
new Promise((_, reject) => {
|
|
5344
5369
|
timer = setTimeout(() => {
|
|
5345
5370
|
loadingTask.destroy();
|
|
5346
|
-
reject(new KordocError(
|
|
5347
|
-
},
|
|
5371
|
+
reject(new KordocError(`PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (${timeoutSec}\uCD08 \uCD08\uACFC)`));
|
|
5372
|
+
}, timeoutMs);
|
|
5348
5373
|
})
|
|
5349
5374
|
]);
|
|
5350
5375
|
} finally {
|
|
@@ -5365,11 +5390,15 @@ async function parsePdfDocument(buffer, options) {
|
|
|
5365
5390
|
const effectivePageCount = Math.min(pageCount, MAX_PAGES);
|
|
5366
5391
|
const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
|
|
5367
5392
|
const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
|
|
5368
|
-
const
|
|
5393
|
+
const fontSizeFreq = /* @__PURE__ */ new Map();
|
|
5369
5394
|
const pageHeights = /* @__PURE__ */ new Map();
|
|
5370
|
-
|
|
5395
|
+
const targetPageNums = [];
|
|
5371
5396
|
for (let i = 1; i <= effectivePageCount; i++) {
|
|
5372
5397
|
if (pageFilter && !pageFilter.has(i)) continue;
|
|
5398
|
+
targetPageNums.push(i);
|
|
5399
|
+
}
|
|
5400
|
+
let parsedPages = 0;
|
|
5401
|
+
const parseSinglePage = async (i) => {
|
|
5373
5402
|
try {
|
|
5374
5403
|
const page = await doc.getPage(i);
|
|
5375
5404
|
const tc = await page.getTextContent();
|
|
@@ -5382,7 +5411,10 @@ async function parsePdfDocument(buffer, options) {
|
|
|
5382
5411
|
warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
|
|
5383
5412
|
}
|
|
5384
5413
|
for (const item of visible) {
|
|
5385
|
-
if (item.fontSize > 0)
|
|
5414
|
+
if (item.fontSize > 0) {
|
|
5415
|
+
const rounded = Math.round(item.fontSize * 10) / 10;
|
|
5416
|
+
fontSizeFreq.set(rounded, (fontSizeFreq.get(rounded) || 0) + 1);
|
|
5417
|
+
}
|
|
5386
5418
|
}
|
|
5387
5419
|
const opList = await page.getOperatorList();
|
|
5388
5420
|
const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
|
|
@@ -5399,14 +5431,36 @@ async function parsePdfDocument(buffer, options) {
|
|
|
5399
5431
|
if (pageErr instanceof KordocError) throw pageErr;
|
|
5400
5432
|
warnings.push({ page: i, message: `\uD398\uC774\uC9C0 ${i} \uD30C\uC2F1 \uC2E4\uD328: ${pageErr instanceof Error ? pageErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
|
|
5401
5433
|
}
|
|
5434
|
+
};
|
|
5435
|
+
const SAMPLE_SIZE = Math.min(10, targetPageNums.length);
|
|
5436
|
+
const sampledIndices = /* @__PURE__ */ new Set();
|
|
5437
|
+
if (targetPageNums.length <= SAMPLE_SIZE) {
|
|
5438
|
+
for (let i = 0; i < targetPageNums.length; i++) sampledIndices.add(i);
|
|
5439
|
+
} else {
|
|
5440
|
+
for (let i = 0; i < SAMPLE_SIZE; i++) {
|
|
5441
|
+
const idx = Math.round(i * (targetPageNums.length - 1) / (SAMPLE_SIZE - 1));
|
|
5442
|
+
sampledIndices.add(idx);
|
|
5443
|
+
}
|
|
5444
|
+
}
|
|
5445
|
+
for (const si of sampledIndices) {
|
|
5446
|
+
await parseSinglePage(targetPageNums[si]);
|
|
5447
|
+
}
|
|
5448
|
+
const sampleParsed = parsedPages || sampledIndices.size;
|
|
5449
|
+
const isImageBased = totalChars / Math.max(sampleParsed, 1) < 10;
|
|
5450
|
+
if (!isImageBased) {
|
|
5451
|
+
for (let si = 0; si < targetPageNums.length; si++) {
|
|
5452
|
+
if (!sampledIndices.has(si)) {
|
|
5453
|
+
await parseSinglePage(targetPageNums[si]);
|
|
5454
|
+
}
|
|
5455
|
+
}
|
|
5402
5456
|
}
|
|
5403
5457
|
const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
|
|
5404
|
-
if (
|
|
5458
|
+
if (isImageBased) {
|
|
5405
5459
|
let ocrProvider = options?.ocr ?? null;
|
|
5406
|
-
const ocrMode = options?.ocrMode;
|
|
5407
|
-
if (!ocrProvider && ocrMode
|
|
5460
|
+
const ocrMode = options?.ocrMode ?? "auto";
|
|
5461
|
+
if (!ocrProvider && ocrMode !== "off") {
|
|
5408
5462
|
try {
|
|
5409
|
-
const { resolveOcrProvider } = await import("./resolve-
|
|
5463
|
+
const { resolveOcrProvider } = await import("./resolve-UOAOPQ4H.js");
|
|
5410
5464
|
const concurrency = options?.ocrConcurrency ?? 1;
|
|
5411
5465
|
const batchSize = options?.ocrBatchSize;
|
|
5412
5466
|
ocrProvider = await resolveOcrProvider(ocrMode, warnings, concurrency, batchSize);
|
|
@@ -5422,7 +5476,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
5422
5476
|
if (ocrProvider) {
|
|
5423
5477
|
let ocrBlocks = [];
|
|
5424
5478
|
try {
|
|
5425
|
-
const { ocrPages } = await import("./provider-
|
|
5479
|
+
const { ocrPages } = await import("./provider-HE727F7Z.js");
|
|
5426
5480
|
const concurrency = options?.ocrConcurrency ?? 1;
|
|
5427
5481
|
ocrBlocks = await ocrPages(doc, ocrProvider, pageFilter, effectivePageCount, warnings, concurrency, options?.onProgress);
|
|
5428
5482
|
} catch {
|
|
@@ -5456,7 +5510,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
5456
5510
|
blocks.splice(removed[ri], 1);
|
|
5457
5511
|
}
|
|
5458
5512
|
}
|
|
5459
|
-
const medianFontSize =
|
|
5513
|
+
const medianFontSize = computeMedianFromFreq(fontSizeFreq);
|
|
5460
5514
|
if (medianFontSize > 0) {
|
|
5461
5515
|
detectHeadings(blocks, medianFontSize);
|
|
5462
5516
|
}
|
|
@@ -5520,11 +5574,18 @@ function filterHiddenText(items, pageWidth, pageHeight) {
|
|
|
5520
5574
|
}
|
|
5521
5575
|
return { visible, hiddenCount };
|
|
5522
5576
|
}
|
|
5523
|
-
function
|
|
5524
|
-
if (
|
|
5525
|
-
const
|
|
5526
|
-
|
|
5527
|
-
|
|
5577
|
+
function computeMedianFromFreq(freq) {
|
|
5578
|
+
if (freq.size === 0) return 0;
|
|
5579
|
+
const entries = [...freq.entries()].sort((a, b) => a[0] - b[0]);
|
|
5580
|
+
let total = 0;
|
|
5581
|
+
for (const [, count] of entries) total += count;
|
|
5582
|
+
const mid = total / 2;
|
|
5583
|
+
let cumulative = 0;
|
|
5584
|
+
for (const [size, count] of entries) {
|
|
5585
|
+
cumulative += count;
|
|
5586
|
+
if (cumulative >= mid) return size;
|
|
5587
|
+
}
|
|
5588
|
+
return 0;
|
|
5528
5589
|
}
|
|
5529
5590
|
function detectHeadings(blocks, medianFontSize) {
|
|
5530
5591
|
for (const block of blocks) {
|
|
@@ -6330,6 +6391,7 @@ var MAX_SHEETS = 100;
|
|
|
6330
6391
|
var MAX_DECOMPRESS_SIZE3 = 500 * 1024 * 1024;
|
|
6331
6392
|
var MAX_ROWS2 = 1e4;
|
|
6332
6393
|
var MAX_COLS2 = 200;
|
|
6394
|
+
var MAX_TOTAL_CELLS = 2e6;
|
|
6333
6395
|
function cleanNumericValue(raw) {
|
|
6334
6396
|
if (!/^-?\d+\.\d+$/.test(raw)) return raw;
|
|
6335
6397
|
const num = parseFloat(raw);
|
|
@@ -6513,9 +6575,9 @@ function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
|
|
|
6513
6575
|
}
|
|
6514
6576
|
return blocks;
|
|
6515
6577
|
}
|
|
6516
|
-
async function parseXlsxDocument(buffer, options) {
|
|
6578
|
+
async function parseXlsxDocument(buffer, options, existingZip) {
|
|
6517
6579
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE3);
|
|
6518
|
-
const zip = await JSZip3.loadAsync(buffer);
|
|
6580
|
+
const zip = existingZip ?? await JSZip3.loadAsync(buffer);
|
|
6519
6581
|
const warnings = [];
|
|
6520
6582
|
const workbookFile = zip.file("xl/workbook.xml");
|
|
6521
6583
|
if (!workbookFile) {
|
|
@@ -6542,6 +6604,7 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
6542
6604
|
}
|
|
6543
6605
|
const blocks = [];
|
|
6544
6606
|
const processedSheets = Math.min(sheets.length, MAX_SHEETS);
|
|
6607
|
+
let totalCells = 0;
|
|
6545
6608
|
for (let i = 0; i < processedSheets; i++) {
|
|
6546
6609
|
if (pageFilter && !pageFilter.has(i + 1)) continue;
|
|
6547
6610
|
const sheet = sheets[i];
|
|
@@ -6568,6 +6631,11 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
6568
6631
|
try {
|
|
6569
6632
|
const sheetXml = await sheetFile.async("text");
|
|
6570
6633
|
const { grid, merges, maxRow, maxCol } = parseWorksheet(sheetXml, sharedStrings);
|
|
6634
|
+
totalCells += maxRow * maxCol;
|
|
6635
|
+
if (totalCells > MAX_TOTAL_CELLS) {
|
|
6636
|
+
warnings.push({ message: `\uCD1D \uC140 \uC218 \uC81C\uD55C \uCD08\uACFC (${totalCells.toLocaleString()}\uC140), \uC774\uD6C4 \uC2DC\uD2B8 \uC0DD\uB7B5`, code: "PARTIAL_PARSE" });
|
|
6637
|
+
break;
|
|
6638
|
+
}
|
|
6571
6639
|
const sheetBlocks = sheetToBlocks(sheet.name, grid, merges, maxRow, maxCol, i);
|
|
6572
6640
|
blocks.push(...sheetBlocks);
|
|
6573
6641
|
} catch (err) {
|
|
@@ -6651,10 +6719,35 @@ function getAttr(el, localName) {
|
|
|
6651
6719
|
function parseXml2(text) {
|
|
6652
6720
|
return new DOMParser3().parseFromString(text, "text/xml");
|
|
6653
6721
|
}
|
|
6722
|
+
/**
 * Builds a lookup table from element local name to every descendant element
 * of `root` that carries that name, in document (pre-order) order.
 *
 * The traversal uses an explicit stack rather than recursion so that deeply
 * nested XML cannot exhaust the call stack.
 *
 * @param {object} root - Node whose descendants are indexed; `root` itself is
 *   never added to the index.
 * @returns {Map<string, object[]>} Local name mapped to the elements with that name.
 */
function buildElementIndex(root) {
  const index = /* @__PURE__ */ new Map();
  const pending = [];
  // Push element children in reverse so they are popped in document order.
  const pushElementChildren = (node) => {
    const children = node.childNodes;
    if (!children) return;
    for (let i = children.length - 1; i >= 0; i--) {
      const child = children[i];
      if (child.nodeType === 1) pending.push(child);
    }
  };
  pushElementChildren(root);
  while (pending.length > 0) {
    const el = pending.pop();
    const name = el.localName ?? "";
    // Elements without a local name are not indexed, but their subtree still is.
    if (name) {
      let list = index.get(name);
      if (!list) {
        list = [];
        index.set(name, list);
      }
      list.push(el);
    }
    pushElementChildren(el);
  }
  return index;
}
|
|
6654
6746
|
function parseStyles(xml) {
|
|
6655
6747
|
const doc = parseXml2(xml);
|
|
6656
6748
|
const styles = /* @__PURE__ */ new Map();
|
|
6657
|
-
const
|
|
6749
|
+
const idx = buildElementIndex(doc);
|
|
6750
|
+
const styleElements = idx.get("style") ?? [];
|
|
6658
6751
|
for (const el of styleElements) {
|
|
6659
6752
|
const styleId = getAttr(el, "styleId");
|
|
6660
6753
|
if (!styleId) continue;
|
|
@@ -6682,7 +6775,8 @@ function parseStyles(xml) {
|
|
|
6682
6775
|
function parseNumbering(xml) {
|
|
6683
6776
|
const doc = parseXml2(xml);
|
|
6684
6777
|
const abstractNums = /* @__PURE__ */ new Map();
|
|
6685
|
-
const
|
|
6778
|
+
const idx = buildElementIndex(doc);
|
|
6779
|
+
const abstractElements = idx.get("abstractNum") ?? [];
|
|
6686
6780
|
for (const el of abstractElements) {
|
|
6687
6781
|
const abstractNumId = getAttr(el, "abstractNumId");
|
|
6688
6782
|
if (!abstractNumId) continue;
|
|
@@ -6697,7 +6791,7 @@ function parseNumbering(xml) {
|
|
|
6697
6791
|
abstractNums.set(abstractNumId, levels);
|
|
6698
6792
|
}
|
|
6699
6793
|
const nums = /* @__PURE__ */ new Map();
|
|
6700
|
-
const numElements =
|
|
6794
|
+
const numElements = idx.get("num") ?? [];
|
|
6701
6795
|
for (const el of numElements) {
|
|
6702
6796
|
const numId = getAttr(el, "numId");
|
|
6703
6797
|
if (!numId) continue;
|
|
@@ -6941,9 +7035,9 @@ async function extractImages(zip, rels, doc) {
|
|
|
6941
7035
|
}
|
|
6942
7036
|
return { blocks, images };
|
|
6943
7037
|
}
|
|
6944
|
-
async function parseDocxDocument(buffer, options) {
|
|
7038
|
+
async function parseDocxDocument(buffer, options, existingZip) {
|
|
6945
7039
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE4);
|
|
6946
|
-
const zip = await JSZip4.loadAsync(buffer);
|
|
7040
|
+
const zip = existingZip ?? await JSZip4.loadAsync(buffer);
|
|
6947
7041
|
const warnings = [];
|
|
6948
7042
|
const docFile = zip.file("word/document.xml");
|
|
6949
7043
|
if (!docFile) {
|
|
@@ -9378,25 +9472,86 @@ async function parse2(input, options) {
|
|
|
9378
9472
|
if (!buffer || buffer.byteLength === 0) {
|
|
9379
9473
|
return { success: false, fileType: "unknown", error: "\uBE48 \uBC84\uD37C\uC774\uAC70\uB098 \uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uC785\uB825\uC785\uB2C8\uB2E4.", code: "EMPTY_INPUT" };
|
|
9380
9474
|
}
|
|
9475
|
+
const MAX_FILE_SIZE = 500 * 1024 * 1024;
|
|
9476
|
+
if (buffer.byteLength > MAX_FILE_SIZE) {
|
|
9477
|
+
return { success: false, fileType: "unknown", error: `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${(buffer.byteLength / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 500MB)`, code: "FILE_TOO_LARGE" };
|
|
9478
|
+
}
|
|
9381
9479
|
const format = detectFormat(buffer);
|
|
9382
9480
|
switch (format) {
|
|
9383
9481
|
case "hwpx": {
|
|
9384
|
-
const zipFormat = await detectZipFormat(buffer);
|
|
9385
|
-
if (zipFormat === "xlsx") return parseXlsx(buffer, options);
|
|
9386
|
-
if (zipFormat === "docx") return parseDocx(buffer, options);
|
|
9387
|
-
return parseHwpx(buffer, options);
|
|
9482
|
+
const { format: zipFormat, zip } = await detectZipFormat(buffer);
|
|
9483
|
+
if (zipFormat === "xlsx") return parseXlsx(buffer, options, zip ?? void 0);
|
|
9484
|
+
if (zipFormat === "docx") return parseDocx(buffer, options, zip ?? void 0);
|
|
9485
|
+
return parseHwpx(buffer, options, zip ?? void 0);
|
|
9388
9486
|
}
|
|
9389
9487
|
case "hwp":
|
|
9390
9488
|
return parseHwp(buffer, options);
|
|
9391
9489
|
case "pdf":
|
|
9392
9490
|
return parsePdf(buffer, options);
|
|
9491
|
+
case "image":
|
|
9492
|
+
return parseImage(buffer, options);
|
|
9393
9493
|
default:
|
|
9394
9494
|
return { success: false, fileType: "unknown", error: "\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD30C\uC77C \uD615\uC2DD\uC785\uB2C8\uB2E4.", code: "UNSUPPORTED_FORMAT" };
|
|
9395
9495
|
}
|
|
9396
9496
|
}
|
|
9397
|
-
async function
|
|
9497
|
+
/**
 * Runs OCR over an image buffer and returns a parse result object.
 *
 * Provider selection follows `options.ocrMode` (defaults to "auto"):
 * - "off": returns a failure result without running OCR.
 * - "gemini" / "claude" / "codex" / "ollama": uses the CLI OCR provider for that mode.
 * - "tesseract": uses the Tesseract provider.
 * - "auto": tries the CLI providers in order and falls back to Tesseract when
 *   every `createCliOcrProvider` call throws.
 *
 * Failures are reported through the returned object (`success: false`); the
 * function is written so that it does not reject.
 *
 * @param {ArrayBuffer} buffer - Raw image bytes; passed to the provider as "image/png".
 * @param {object} [options] - Parse options; only `ocrMode` is read here.
 * @returns {Promise<object>} Result with `success`, `fileType: "image"` and either
 *   `markdown`/`blocks`/`warnings` or `error`/`code`.
 */
async function parseImage(buffer, options) {
  const ocrMode = options?.ocrMode || "auto";
  if (ocrMode === "off") {
    return { success: false, fileType: "image", error: "OCR\uC774 \uBE44\uD65C\uC131\uD654\uB418\uC5B4 \uC774\uBBF8\uC9C0 \uD30C\uC77C\uC744 \uCC98\uB9AC\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "IMAGE_BASED_PDF" };
  }
  let ocrProvider;
  let actualOcrMode = "auto";
  try {
    if (ocrMode === "gemini" || ocrMode === "claude" || ocrMode === "codex" || ocrMode === "ollama") {
      ocrProvider = createCliOcrProvider(ocrMode);
      actualOcrMode = ocrMode;
    } else if (ocrMode === "tesseract") {
      ocrProvider = await createTesseractProvider();
      actualOcrMode = ocrMode;
    } else if (ocrMode === "auto") {
      const modesToTry = ["gemini", "claude", "codex", "ollama"];
      for (const mode of modesToTry) {
        try {
          ocrProvider = createCliOcrProvider(mode);
          actualOcrMode = mode;
          break;
        } catch (e) {
          console.warn(`[kordoc] OCR auto-detection: ${mode} CLI not available or failed. Trying next.`, e);
        }
      }
      if (!ocrProvider) {
        ocrProvider = await createTesseractProvider();
        actualOcrMode = "tesseract";
      }
    }
    if (!ocrProvider) {
      return { success: false, fileType: "image", error: "\uC0AC\uC6A9 \uAC00\uB2A5\uD55C OCR \uD504\uB85C\uBC14\uC774\uB354\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "PARSE_ERROR" };
    }
    const imageUint8Array = new Uint8Array(buffer);
    const ocrResult = await ocrProvider(imageUint8Array, 1, "image/png");
    // Providers may return either a plain markdown string or an object carrying it.
    const markdown = typeof ocrResult === "string" ? ocrResult : ocrResult.markdown;
    const blocks = markdownToBlocks(markdown, 1);
    return {
      success: true,
      fileType: "image",
      markdown,
      blocks,
      isImageBased: true,
      warnings: [{ message: `OCR \uCC98\uB9AC\uB428 (${actualOcrMode})`, code: "OCR_FALLBACK" }]
    };
  } catch (err) {
    return { success: false, fileType: "image", error: err instanceof Error ? err.message : "\uC774\uBBF8\uC9C0 OCR \uC2E4\uD328", code: classifyError(err) };
  } finally {
    // Release the provider exactly once on every path. A failing terminate()
    // must not replace the result computed above or make this function reject.
    if (ocrProvider?.terminate) {
      try {
        await ocrProvider.terminate();
      } catch (e) {
        console.warn("[kordoc] OCR provider terminate failed.", e);
      }
    }
  }
}
|
|
9552
|
+
async function parseHwpx(buffer, options, zip) {
|
|
9398
9553
|
try {
|
|
9399
|
-
const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options);
|
|
9554
|
+
const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options, zip);
|
|
9400
9555
|
return { success: true, fileType: "hwpx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
|
|
9401
9556
|
} catch (err) {
|
|
9402
9557
|
return { success: false, fileType: "hwpx", error: err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
|
|
@@ -9419,17 +9574,17 @@ async function parsePdf(buffer, options) {
|
|
|
9419
9574
|
return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err), isImageBased };
|
|
9420
9575
|
}
|
|
9421
9576
|
}
|
|
9422
|
-
async function parseXlsx(buffer, options) {
|
|
9577
|
+
async function parseXlsx(buffer, options, zip) {
|
|
9423
9578
|
try {
|
|
9424
|
-
const { markdown, blocks, metadata, warnings } = await parseXlsxDocument(buffer, options);
|
|
9579
|
+
const { markdown, blocks, metadata, warnings } = await parseXlsxDocument(buffer, options, zip);
|
|
9425
9580
|
return { success: true, fileType: "xlsx", markdown, blocks, metadata, warnings };
|
|
9426
9581
|
} catch (err) {
|
|
9427
9582
|
return { success: false, fileType: "xlsx", error: err instanceof Error ? err.message : "XLSX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
|
|
9428
9583
|
}
|
|
9429
9584
|
}
|
|
9430
|
-
async function parseDocx(buffer, options) {
|
|
9585
|
+
async function parseDocx(buffer, options, zip) {
|
|
9431
9586
|
try {
|
|
9432
|
-
const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options);
|
|
9587
|
+
const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options, zip);
|
|
9433
9588
|
return { success: true, fileType: "docx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
|
|
9434
9589
|
} catch (err) {
|
|
9435
9590
|
return { success: false, fileType: "docx", error: err instanceof Error ? err.message : "DOCX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
|
|
@@ -9624,4 +9779,4 @@ export {
|
|
|
9624
9779
|
cfb/cfb.js:
|
|
9625
9780
|
(*! crc32.js (C) 2014-present SheetJS -- http://sheetjs.com *)
|
|
9626
9781
|
*/
|
|
9627
|
-
//# sourceMappingURL=chunk-
|
|
9782
|
+
//# sourceMappingURL=chunk-UDFKY7CH.js.map
|