kordoc 2.2.6 → 2.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +22 -3
- package/dist/{chunk-RF6UJXR3.js → chunk-KSBPABBQ.js} +482 -78
- package/dist/chunk-KSBPABBQ.js.map +1 -0
- package/dist/{chunk-5Y2Q3BRW.js → chunk-M3E3C5GS.js} +8 -1
- package/dist/chunk-M3E3C5GS.js.map +1 -0
- package/dist/{chunk-FCQEF2ZM.js → chunk-VJPDY4YT.js} +2 -2
- package/dist/{chunk-NL5XLN5R.js.map → chunk-VJPDY4YT.js.map} +1 -1
- package/dist/{chunk-HXUCZ2IL.cjs → chunk-VLSATRNQ.cjs} +2 -2
- package/dist/{chunk-HXUCZ2IL.cjs.map → chunk-VLSATRNQ.cjs.map} +1 -1
- package/dist/{chunk-NL5XLN5R.js → chunk-XG5CQUSC.js} +2 -2
- package/dist/{chunk-FCQEF2ZM.js.map → chunk-XG5CQUSC.js.map} +1 -1
- package/dist/cli.js +5 -5
- package/dist/cli.js.map +1 -1
- package/dist/{detect-GYK3HKD5.js → detect-I7YIS4Q6.js} +4 -2
- package/dist/index.cjs +608 -197
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +6 -2
- package/dist/index.d.ts +6 -2
- package/dist/index.js +500 -89
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +5 -5
- package/dist/{parser-AMP7MAOH.js → parser-4275GJRB.js} +45 -42
- package/dist/{parser-AMP7MAOH.js.map → parser-4275GJRB.js.map} +1 -1
- package/dist/{parser-KOWPTDJU.cjs → parser-STAOZMUC.cjs} +61 -58
- package/dist/{parser-KOWPTDJU.cjs.map → parser-STAOZMUC.cjs.map} +1 -1
- package/dist/{parser-43IAQ5KE.js → parser-XRUZEFZT.js} +45 -42
- package/dist/{parser-43IAQ5KE.js.map → parser-XRUZEFZT.js.map} +1 -1
- package/dist/{watch-IUQXOXW3.js → watch-BFLNFJBE.js} +4 -4
- package/package.json +2 -2
- package/dist/chunk-5Y2Q3BRW.js.map +0 -1
- package/dist/chunk-RF6UJXR3.js.map +0 -1
- /package/dist/{detect-GYK3HKD5.js.map → detect-I7YIS4Q6.js.map} +0 -0
- /package/dist/{watch-IUQXOXW3.js.map → watch-BFLNFJBE.js.map} +0 -0
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
import {
|
|
3
3
|
detectFormat,
|
|
4
4
|
detectZipFormat
|
|
5
|
-
} from "./chunk-
|
|
5
|
+
} from "./chunk-M3E3C5GS.js";
|
|
6
6
|
import {
|
|
7
7
|
HEADING_RATIO_H1,
|
|
8
8
|
HEADING_RATIO_H2,
|
|
@@ -20,7 +20,7 @@ import {
|
|
|
20
20
|
sanitizeHref,
|
|
21
21
|
stripDtd,
|
|
22
22
|
toArrayBuffer
|
|
23
|
-
} from "./chunk-
|
|
23
|
+
} from "./chunk-VJPDY4YT.js";
|
|
24
24
|
import {
|
|
25
25
|
parsePageRange
|
|
26
26
|
} from "./chunk-MOL7MDBG.js";
|
|
@@ -29,6 +29,100 @@ import {
|
|
|
29
29
|
import JSZip from "jszip";
|
|
30
30
|
import { inflateRawSync } from "zlib";
|
|
31
31
|
import { DOMParser } from "@xmldom/xmldom";
|
|
32
|
+
|
|
33
|
+
// src/hwpx/com-fallback.ts
|
|
34
|
+
import { execFileSync } from "child_process";
|
|
35
|
+
import { platform } from "os";
|
|
36
|
+
/**
 * Reports whether the COM-based text extraction fallback may be attempted
 * on this host. The only condition checked is that the operating system
 * reported by `os.platform()` is Windows.
 *
 * @returns {boolean} `true` when running on Windows ("win32"), else `false`.
 */
function isComFallbackAvailable() {
  const hostPlatform = platform();
  return hostPlatform === "win32";
}
|
|
39
|
+
/**
 * Tells whether a manifest XML string marks the package as encrypted.
 * The check is a plain substring search for "encryption-data"; the XML is
 * not parsed.
 *
 * @param {string} manifestXml - Text content of the manifest file.
 * @returns {boolean} `true` when the marker substring is present.
 */
function isEncryptedHwpx(manifestXml) {
  const ENCRYPTION_MARKER = "encryption-data";
  return manifestXml.indexOf(ENCRYPTION_MARKER) !== -1;
}
|
|
42
|
+
/**
 * Extracts per-page plain text from a document by driving the
 * `HWPFrame.HwpObject` COM automation object from a PowerShell child process.
 *
 * The PowerShell script prints a single JSON object on stdout: either
 * `{ pageCount, pages }` on success or `{ error }` on failure.
 *
 * @param {string} filePath - Path of the document to open through COM.
 * @returns {{ pages: Array, pageCount: number, warnings: Array<{message: string, code: string}> }}
 *   The page texts as reported by `GetPageText`, the page count, and a
 *   `COM_EMPTY` warning when no pages were returned.
 * @throws {Error} When not running on Windows, when stdout carries no JSON,
 *   or when the script reports an error. Errors raised by `execFileSync`
 *   (timeout, non-zero exit, buffer overflow) and by `JSON.parse` propagate
 *   unchanged.
 */
function extractTextViaCom(filePath) {
  if (!isComFallbackAvailable()) {
    throw new Error("COM fallback\uC740 Windows\uC5D0\uC11C\uB9CC \uC0AC\uC6A9 \uAC00\uB2A5\uD569\uB2C8\uB2E4");
  }
  // The path is handed over through an environment variable instead of being
  // spliced into the script text. PowerShell accepts typographic quotes
  // (U+2018..U+201B) as single-quote delimiters, so doubling only ASCII "'"
  // does not reliably keep a hostile path inside a quoted literal.
  //
  // Cleanup lives in `finally` so the COM object is released even when Open or
  // GetPageText fails. Cleanup output is discarded so that nothing is written
  // to stdout after the JSON payload.
  const ps1 = `
[Console]::OutputEncoding = [System.Text.Encoding]::UTF8
$ErrorActionPreference = 'Stop'
$hwp = $null
try {
  $hwp = New-Object -ComObject HWPFrame.HwpObject
  $hwp.RegisterModule('FilePathCheckerModule', 'FilePathCheckerModuleExample') | Out-Null
  $hwp.Open($env:KORDOC_COM_FILE_PATH, '', '') | Out-Null
  $pc = $hwp.PageCount
  $result = @{ pageCount = $pc; pages = @() }
  for ($p = 1; $p -le $pc; $p++) {
    $t = $hwp.GetPageText($p, 0)
    $result.pages += @($t)
  }
  $result | ConvertTo-Json -Depth 3 -Compress
} catch {
  @{ error = $_.Exception.Message } | ConvertTo-Json -Compress
} finally {
  if ($null -ne $hwp) {
    try { $hwp.Clear(1) | Out-Null } catch { }
    try { [System.Runtime.InteropServices.Marshal]::ReleaseComObject($hwp) | Out-Null } catch { }
  }
}
`;
  const stdout = execFileSync("powershell", [
    "-NoProfile",
    "-NonInteractive",
    "-ExecutionPolicy",
    "Bypass",
    "-Command",
    ps1
  ], {
    encoding: "utf-8",
    // Inherit the parent environment and add the document path for the script.
    env: { ...process.env, KORDOC_COM_FILE_PATH: filePath },
    timeout: 12e4,
    // 2-minute timeout
    windowsHide: true,
    maxBuffer: 50 * 1024 * 1024
    // 50MB
  });
  const trimmed = stdout.trim();
  // Skip anything printed before the JSON payload.
  const jsonStart = trimmed.indexOf("{");
  if (jsonStart < 0) throw new Error(`COM \uCD9C\uB825\uC5D0 JSON\uC774 \uC5C6\uC2B5\uB2C8\uB2E4: ${trimmed.slice(0, 200)}`);
  const json = JSON.parse(trimmed.slice(jsonStart));
  if (json.error) {
    throw new Error(`COM \uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uC2E4\uD328: ${json.error}`);
  }
  const warnings = [];
  const pages = Array.isArray(json.pages) ? json.pages : [];
  const pageCount = json.pageCount ?? pages.length;
  if (pages.length === 0) {
    warnings.push({ message: "COM\uC73C\uB85C \uD14D\uC2A4\uD2B8\uB97C \uCD94\uCD9C\uD558\uC9C0 \uBABB\uD588\uC2B5\uB2C8\uB2E4", code: "COM_EMPTY" });
  }
  return { pages, pageCount, warnings };
}
|
|
97
|
+
/**
 * Converts the page texts obtained through the COM fallback into a parse
 * result: one paragraph block per non-empty line, tagged with its 1-based
 * page index, and a markdown string joining those lines with blank lines.
 *
 * A `DRM_COM_FALLBACK` warning is always appended to the returned warnings.
 * The caller's `warnings` array is copied, never mutated.
 *
 * @param {Array} pages - Page texts; entries that are not strings are skipped.
 * @param {number} pageCount - Page count stored in `metadata.pageCount`.
 * @param {Array<{message: string, code: string}>} [warnings] - Warnings
 *   gathered so far; treated as empty when omitted.
 * @returns {{ markdown: string, blocks: Array, metadata: { pageCount: number }, warnings: Array }}
 */
function comResultToParseResult(pages, pageCount, warnings) {
  const blocks = [];
  pages.forEach((page, pageIndex) => {
    // COM may hand back null (or another non-string) for a page; skip it
    // instead of failing on `.trim()`.
    if (typeof page !== "string") return;
    for (const rawLine of page.split(/\n/)) {
      // trim() also removes the "\r" left over from CRLF line endings.
      const line = rawLine.trim();
      if (line) blocks.push({ type: "paragraph", text: line, pageNumber: pageIndex + 1 });
    }
  });
  const markdown = blocks.map((block) => block.text).join("\n\n");
  const metadata = { pageCount };
  const allWarnings = [
    ...(warnings ?? []),
    {
      message: "DRM \uBB38\uC11C: \uD55C\uCEF4 COM API\uB85C \uD14D\uC2A4\uD2B8 \uCD94\uCD9C (\uC11C\uC2DD/\uD45C \uC815\uBCF4 \uC81C\uD55C\uC801)",
      code: "DRM_COM_FALLBACK"
    }
  ];
  return {
    markdown,
    blocks,
    metadata,
    warnings: allWarnings
  };
}
|
|
124
|
+
|
|
125
|
+
// src/hwpx/parser.ts
|
|
32
126
|
var MAX_DECOMPRESS_SIZE = 100 * 1024 * 1024;
|
|
33
127
|
var MAX_ZIP_ENTRIES = 500;
|
|
34
128
|
function clampSpan(val, max) {
|
|
@@ -133,6 +227,19 @@ async function parseHwpxDocument(buffer, options) {
|
|
|
133
227
|
if (actualEntryCount > MAX_ZIP_ENTRIES) {
|
|
134
228
|
throw new KordocError("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
135
229
|
}
|
|
230
|
+
const manifestFile = zip.file("META-INF/manifest.xml");
|
|
231
|
+
if (manifestFile) {
|
|
232
|
+
const manifestXml = await manifestFile.async("text");
|
|
233
|
+
if (isEncryptedHwpx(manifestXml)) {
|
|
234
|
+
if (isComFallbackAvailable() && options?.filePath) {
|
|
235
|
+
const { pages, pageCount, warnings: warnings2 } = extractTextViaCom(options.filePath);
|
|
236
|
+
if (pages.some((p) => p && p.trim().length > 0)) {
|
|
237
|
+
return comResultToParseResult(pages, pageCount, warnings2);
|
|
238
|
+
}
|
|
239
|
+
}
|
|
240
|
+
throw new KordocError("DRM \uC554\uD638\uD654\uB41C HWPX \uD30C\uC77C\uC785\uB2C8\uB2E4. Windows + \uD55C\uCEF4 \uC624\uD53C\uC2A4 \uC124\uCE58 \uC2DC \uC790\uB3D9 \uCD94\uCD9C\uB429\uB2C8\uB2E4.");
|
|
241
|
+
}
|
|
242
|
+
}
|
|
136
243
|
const decompressed = { total: 0 };
|
|
137
244
|
const metadata = {};
|
|
138
245
|
await extractHwpxMetadata(zip, metadata, decompressed);
|
|
@@ -144,6 +251,7 @@ async function parseHwpxDocument(buffer, options) {
|
|
|
144
251
|
const pageFilter = options?.pages ? parsePageRange(options.pages, sectionPaths.length) : null;
|
|
145
252
|
const totalTarget = pageFilter ? pageFilter.size : sectionPaths.length;
|
|
146
253
|
const blocks = [];
|
|
254
|
+
const nestedTableCounter = { count: 0 };
|
|
147
255
|
let parsedSections = 0;
|
|
148
256
|
for (let si = 0; si < sectionPaths.length; si++) {
|
|
149
257
|
if (pageFilter && !pageFilter.has(si + 1)) continue;
|
|
@@ -153,7 +261,7 @@ async function parseHwpxDocument(buffer, options) {
|
|
|
153
261
|
const xml = await file.async("text");
|
|
154
262
|
decompressed.total += xml.length * 2;
|
|
155
263
|
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
156
|
-
blocks.push(...parseSectionXml(xml, styleMap, warnings, si + 1));
|
|
264
|
+
blocks.push(...parseSectionXml(xml, styleMap, warnings, si + 1, nestedTableCounter));
|
|
157
265
|
parsedSections++;
|
|
158
266
|
options?.onProgress?.(parsedSections, totalTarget);
|
|
159
267
|
} catch (secErr) {
|
|
@@ -214,8 +322,20 @@ async function extractImagesFromZip(zip, blocks, decompressed, warnings) {
|
|
|
214
322
|
ref
|
|
215
323
|
// 절대 경로일 수도 있음
|
|
216
324
|
];
|
|
325
|
+
let resolvedPath = null;
|
|
326
|
+
if (!ref.includes(".")) {
|
|
327
|
+
const prefixes = [`BinData/${ref}`, `Contents/BinData/${ref}`];
|
|
328
|
+
for (const prefix of prefixes) {
|
|
329
|
+
const match = zip.file(new RegExp(`^${prefix.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}\\.[a-zA-Z0-9]+$`));
|
|
330
|
+
if (match.length > 0) {
|
|
331
|
+
resolvedPath = match[0].name;
|
|
332
|
+
break;
|
|
333
|
+
}
|
|
334
|
+
}
|
|
335
|
+
}
|
|
217
336
|
let found = false;
|
|
218
|
-
|
|
337
|
+
const allCandidates = resolvedPath ? [resolvedPath, ...candidates] : candidates;
|
|
338
|
+
for (const path of allCandidates) {
|
|
219
339
|
if (isPathTraversal(path)) continue;
|
|
220
340
|
const file = zip.file(path);
|
|
221
341
|
if (!file) continue;
|
|
@@ -223,7 +343,8 @@ async function extractImagesFromZip(zip, blocks, decompressed, warnings) {
|
|
|
223
343
|
const data = await file.async("uint8array");
|
|
224
344
|
decompressed.total += data.length;
|
|
225
345
|
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
226
|
-
const
|
|
346
|
+
const actualPath = path;
|
|
347
|
+
const ext = actualPath.includes(".") ? actualPath.split(".").pop() || "png" : "png";
|
|
227
348
|
const mimeType = imageExtToMime(ext);
|
|
228
349
|
imageIndex++;
|
|
229
350
|
const filename = `image_${String(imageIndex).padStart(3, "0")}.${mimeToExt(mimeType)}`;
|
|
@@ -309,6 +430,7 @@ function extractFromBrokenZip(buffer) {
|
|
|
309
430
|
let totalDecompressed = 0;
|
|
310
431
|
let entryCount = 0;
|
|
311
432
|
let sectionNum = 0;
|
|
433
|
+
const nestedTableCounter = { count: 0 };
|
|
312
434
|
while (pos < data.length - 30) {
|
|
313
435
|
if (data[pos] !== 80 || data[pos + 1] !== 75 || data[pos + 2] !== 3 || data[pos + 3] !== 4) {
|
|
314
436
|
pos++;
|
|
@@ -355,7 +477,7 @@ function extractFromBrokenZip(buffer) {
|
|
|
355
477
|
totalDecompressed += content.length * 2;
|
|
356
478
|
if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new KordocError("\uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC");
|
|
357
479
|
sectionNum++;
|
|
358
|
-
blocks.push(...parseSectionXml(content, void 0, warnings, sectionNum));
|
|
480
|
+
blocks.push(...parseSectionXml(content, void 0, warnings, sectionNum, nestedTableCounter));
|
|
359
481
|
} catch {
|
|
360
482
|
continue;
|
|
361
483
|
}
|
|
@@ -440,12 +562,40 @@ function detectHwpxHeadings(blocks, styleMap) {
|
|
|
440
562
|
}
|
|
441
563
|
}
|
|
442
564
|
}
|
|
443
|
-
function
|
|
565
|
+
/**
 * Builds the placeholder text that stands in for a nested table.
 *
 * Increments `counter.count` and uses the new value as the marker number.
 * The non-empty cell texts of the first row (trimmed, newlines replaced by
 * spaces) are joined with " | " as a hint, cut to 60 code points followed by
 * an ellipsis when longer.
 *
 * @param {{ count: number }} counter - Running counter; mutated.
 * @param {Array<Array<{ text: string }>>} rows - Rows of the nested table.
 * @returns {string} The marker, with the hint appended when there is one.
 */
function makeNestedTableMarker(counter, rows) {
  counter.count++;
  const markerNumber = counter.count;
  const headerCells = rows[0] ?? [];
  const pieces = [];
  for (const cell of headerCells) {
    const piece = cell.text.trim().replace(/\n/g, " ");
    if (piece) pieces.push(piece);
  }
  const hint = pieces.join(" | ");
  // Count by code point so a surrogate pair is never split by the cut.
  const codePoints = Array.from(hint);
  let preview = hint;
  if (codePoints.length > 60) {
    preview = codePoints.slice(0, 60).join("") + "\u2026";
  }
  if (!preview) {
    return `[\uC911\uCCA9 \uD14C\uC774\uBE14 #${markerNumber}]`;
  }
  return `[\uC911\uCCA9 \uD14C\uC774\uBE14 #${markerNumber}: ${preview}]`;
}
|
|
573
|
+
/**
 * Finishes a table that was found inside another table and restores the
 * enclosing table as the current one.
 *
 * The enclosing table context is popped from `tableStack`. A nested table
 * with at least 3 rows and at least 2 columns in its widest row is pushed to
 * `blocks` as a table block of its own, and only a marker is appended to the
 * enclosing cell. Any smaller nested table is flattened with
 * `convertTableToText` and appended to the enclosing cell after the marker.
 * Nothing is appended (and no marker is generated) when the enclosing table
 * has no open cell.
 *
 * @param {{ rows: Array<Array> }} newTable - The nested table just walked.
 * @param {Array} tableStack - Stack of enclosing table contexts; popped once.
 * @param {Array} blocks - Output block list; may receive a table block.
 * @param {{ sectionNum: *, counter?: { count: number } }} ctx - Walk context.
 * @returns {*} The enclosing table context taken from the stack.
 */
function handleNestedTable(newTable, tableStack, blocks, ctx) {
  const enclosing = tableStack.pop();
  const nestedRows = newTable.rows;
  const widestRow = nestedRows.reduce(
    (widest, row) => (row.length > widest ? row.length : widest),
    0
  );
  // Numbered marker when a counter is shared across the document, plain otherwise.
  const nextMarker = () => ctx.counter ? makeNestedTableMarker(ctx.counter, nestedRows) : "[\uC911\uCCA9 \uD14C\uC774\uBE14]";
  const standsAlone = nestedRows.length >= 3 && widestRow >= 2;

  if (standsAlone) {
    blocks.push({ type: "table", table: buildTable(nestedRows), pageNumber: ctx.sectionNum });
    if (enclosing.cell) {
      const marker = nextMarker();
      const separator = enclosing.cell.text ? "\n" : "";
      enclosing.cell.text += separator + marker;
    }
    return enclosing;
  }

  const flattened = convertTableToText(nestedRows);
  if (enclosing.cell) {
    const marker = nextMarker();
    const separator = enclosing.cell.text ? "\n" : "";
    enclosing.cell.text += separator + marker + "\n" + flattened;
  }
  return enclosing;
}
|
|
592
|
+
function parseSectionXml(xml, styleMap, warnings, sectionNum, counter) {
|
|
444
593
|
const parser = createXmlParser(warnings);
|
|
445
594
|
const doc = parser.parseFromString(stripDtd(xml), "text/xml");
|
|
446
595
|
if (!doc.documentElement) return [];
|
|
447
596
|
const blocks = [];
|
|
448
|
-
|
|
597
|
+
const ctx = { styleMap, warnings, sectionNum, counter };
|
|
598
|
+
walkSection(doc.documentElement, blocks, null, [], ctx);
|
|
449
599
|
return blocks;
|
|
450
600
|
}
|
|
451
601
|
function extractImageRef(el) {
|
|
@@ -466,7 +616,7 @@ function extractImageRef(el) {
|
|
|
466
616
|
if (directRef) return directRef;
|
|
467
617
|
return null;
|
|
468
618
|
}
|
|
469
|
-
function walkSection(node, blocks, tableCtx, tableStack,
|
|
619
|
+
function walkSection(node, blocks, tableCtx, tableStack, ctx, depth = 0) {
|
|
470
620
|
if (depth > MAX_XML_DEPTH) return;
|
|
471
621
|
const children = node.childNodes;
|
|
472
622
|
if (!children) return;
|
|
@@ -479,23 +629,12 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
479
629
|
case "tbl": {
|
|
480
630
|
if (tableCtx) tableStack.push(tableCtx);
|
|
481
631
|
const newTable = { rows: [], currentRow: [], cell: null };
|
|
482
|
-
walkSection(el, blocks, newTable, tableStack,
|
|
632
|
+
walkSection(el, blocks, newTable, tableStack, ctx, depth + 1);
|
|
483
633
|
if (newTable.rows.length > 0) {
|
|
484
634
|
if (tableStack.length > 0) {
|
|
485
|
-
|
|
486
|
-
let nestedCols = 0;
|
|
487
|
-
for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
|
|
488
|
-
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
489
|
-
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
490
|
-
} else {
|
|
491
|
-
const nestedText = convertTableToText(newTable.rows);
|
|
492
|
-
if (parentTable.cell) {
|
|
493
|
-
parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
|
|
494
|
-
}
|
|
495
|
-
}
|
|
496
|
-
tableCtx = parentTable;
|
|
635
|
+
tableCtx = handleNestedTable(newTable, tableStack, blocks, ctx);
|
|
497
636
|
} else {
|
|
498
|
-
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
637
|
+
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: ctx.sectionNum });
|
|
499
638
|
tableCtx = null;
|
|
500
639
|
}
|
|
501
640
|
} else {
|
|
@@ -506,7 +645,7 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
506
645
|
case "tr":
|
|
507
646
|
if (tableCtx) {
|
|
508
647
|
tableCtx.currentRow = [];
|
|
509
|
-
walkSection(el, blocks, tableCtx, tableStack,
|
|
648
|
+
walkSection(el, blocks, tableCtx, tableStack, ctx, depth + 1);
|
|
510
649
|
if (tableCtx.currentRow.length > 0) tableCtx.rows.push(tableCtx.currentRow);
|
|
511
650
|
tableCtx.currentRow = [];
|
|
512
651
|
}
|
|
@@ -514,7 +653,7 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
514
653
|
case "tc":
|
|
515
654
|
if (tableCtx) {
|
|
516
655
|
tableCtx.cell = { text: "", colSpan: 1, rowSpan: 1 };
|
|
517
|
-
walkSection(el, blocks, tableCtx, tableStack,
|
|
656
|
+
walkSection(el, blocks, tableCtx, tableStack, ctx, depth + 1);
|
|
518
657
|
if (tableCtx.cell) {
|
|
519
658
|
tableCtx.currentRow.push(tableCtx.cell);
|
|
520
659
|
tableCtx.cell = null;
|
|
@@ -540,19 +679,19 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
540
679
|
}
|
|
541
680
|
break;
|
|
542
681
|
case "p": {
|
|
543
|
-
const { text, href, footnote, style } = extractParagraphInfo(el, styleMap);
|
|
682
|
+
const { text, href, footnote, style } = extractParagraphInfo(el, ctx.styleMap);
|
|
544
683
|
if (text) {
|
|
545
684
|
if (tableCtx?.cell) {
|
|
546
685
|
tableCtx.cell.text += (tableCtx.cell.text ? "\n" : "") + text;
|
|
547
686
|
} else if (!tableCtx) {
|
|
548
|
-
const block = { type: "paragraph", text, pageNumber: sectionNum };
|
|
687
|
+
const block = { type: "paragraph", text, pageNumber: ctx.sectionNum };
|
|
549
688
|
if (style) block.style = style;
|
|
550
689
|
if (href) block.href = href;
|
|
551
690
|
if (footnote) block.footnoteText = footnote;
|
|
552
691
|
blocks.push(block);
|
|
553
692
|
}
|
|
554
693
|
}
|
|
555
|
-
tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack,
|
|
694
|
+
tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, ctx, depth + 1);
|
|
556
695
|
break;
|
|
557
696
|
}
|
|
558
697
|
// 이미지/그림 — 경로 추출 또는 경고
|
|
@@ -561,19 +700,19 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
561
700
|
case "drawingObject": {
|
|
562
701
|
const imgRef = extractImageRef(el);
|
|
563
702
|
if (imgRef) {
|
|
564
|
-
blocks.push({ type: "image", text: imgRef, pageNumber: sectionNum });
|
|
565
|
-
} else if (warnings && sectionNum) {
|
|
566
|
-
warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
|
|
703
|
+
blocks.push({ type: "image", text: imgRef, pageNumber: ctx.sectionNum });
|
|
704
|
+
} else if (ctx.warnings && ctx.sectionNum) {
|
|
705
|
+
ctx.warnings.push({ page: ctx.sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
|
|
567
706
|
}
|
|
568
707
|
break;
|
|
569
708
|
}
|
|
570
709
|
default:
|
|
571
|
-
walkSection(el, blocks, tableCtx, tableStack,
|
|
710
|
+
walkSection(el, blocks, tableCtx, tableStack, ctx, depth + 1);
|
|
572
711
|
break;
|
|
573
712
|
}
|
|
574
713
|
}
|
|
575
714
|
}
|
|
576
|
-
function walkParagraphChildren(node, blocks, tableCtx, tableStack,
|
|
715
|
+
function walkParagraphChildren(node, blocks, tableCtx, tableStack, ctx, depth = 0) {
|
|
577
716
|
if (depth > MAX_XML_DEPTH) return tableCtx;
|
|
578
717
|
const children = node.childNodes;
|
|
579
718
|
if (!children) return tableCtx;
|
|
@@ -589,23 +728,12 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
589
728
|
if (localTag === "tbl") {
|
|
590
729
|
if (tableCtx) tableStack.push(tableCtx);
|
|
591
730
|
const newTable = { rows: [], currentRow: [], cell: null };
|
|
592
|
-
walkSection(el, blocks, newTable, tableStack,
|
|
731
|
+
walkSection(el, blocks, newTable, tableStack, ctx, d + 1);
|
|
593
732
|
if (newTable.rows.length > 0) {
|
|
594
733
|
if (tableStack.length > 0) {
|
|
595
|
-
|
|
596
|
-
let nestedCols = 0;
|
|
597
|
-
for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
|
|
598
|
-
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
599
|
-
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
600
|
-
} else {
|
|
601
|
-
const nestedText = convertTableToText(newTable.rows);
|
|
602
|
-
if (parentTable.cell) {
|
|
603
|
-
parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
|
|
604
|
-
}
|
|
605
|
-
}
|
|
606
|
-
tableCtx = parentTable;
|
|
734
|
+
tableCtx = handleNestedTable(newTable, tableStack, blocks, ctx);
|
|
607
735
|
} else {
|
|
608
|
-
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
736
|
+
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: ctx.sectionNum });
|
|
609
737
|
tableCtx = null;
|
|
610
738
|
}
|
|
611
739
|
} else {
|
|
@@ -614,21 +742,21 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
614
742
|
} else if (localTag === "pic" || localTag === "shape" || localTag === "drawingObject") {
|
|
615
743
|
const drawTextChild = findDescendant(el, "drawText");
|
|
616
744
|
if (drawTextChild) {
|
|
617
|
-
extractDrawTextBlocks(drawTextChild, blocks, styleMap, sectionNum);
|
|
745
|
+
extractDrawTextBlocks(drawTextChild, blocks, ctx.styleMap, ctx.sectionNum);
|
|
618
746
|
} else {
|
|
619
747
|
const imgRef = extractImageRef(el);
|
|
620
748
|
if (imgRef) {
|
|
621
|
-
blocks.push({ type: "image", text: imgRef, pageNumber: sectionNum });
|
|
622
|
-
} else if (warnings && sectionNum) {
|
|
623
|
-
warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
|
|
749
|
+
blocks.push({ type: "image", text: imgRef, pageNumber: ctx.sectionNum });
|
|
750
|
+
} else if (ctx.warnings && ctx.sectionNum) {
|
|
751
|
+
ctx.warnings.push({ page: ctx.sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
|
|
624
752
|
}
|
|
625
753
|
}
|
|
626
754
|
} else if (localTag === "drawText") {
|
|
627
|
-
extractDrawTextBlocks(el, blocks, styleMap, sectionNum);
|
|
755
|
+
extractDrawTextBlocks(el, blocks, ctx.styleMap, ctx.sectionNum);
|
|
628
756
|
} else if (localTag === "r" || localTag === "run" || localTag === "ctrl" || localTag === "rect" || localTag === "ellipse" || localTag === "polygon" || localTag === "line" || localTag === "arc" || localTag === "curve" || localTag === "connectLine" || localTag === "container") {
|
|
629
757
|
walkChildren(el, d + 1);
|
|
630
758
|
} else if (localTag === "run") {
|
|
631
|
-
tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack,
|
|
759
|
+
tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, ctx, depth + 1);
|
|
632
760
|
}
|
|
633
761
|
}
|
|
634
762
|
};
|
|
@@ -1901,6 +2029,7 @@ function parseHwp5Document(buffer, options) {
|
|
|
1901
2029
|
const pageFilter = options?.pages ? parsePageRange(options.pages, sections.length) : null;
|
|
1902
2030
|
const totalTarget = pageFilter ? pageFilter.size : sections.length;
|
|
1903
2031
|
const blocks = [];
|
|
2032
|
+
const nestedTableCounter = { count: 0 };
|
|
1904
2033
|
let totalDecompressed = 0;
|
|
1905
2034
|
let parsedSections = 0;
|
|
1906
2035
|
for (let si = 0; si < sections.length; si++) {
|
|
@@ -1911,7 +2040,7 @@ function parseHwp5Document(buffer, options) {
|
|
|
1911
2040
|
totalDecompressed += data.length;
|
|
1912
2041
|
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
1913
2042
|
const records = readRecords(data);
|
|
1914
|
-
const sectionBlocks = parseSection(records, docInfo, warnings, si + 1);
|
|
2043
|
+
const sectionBlocks = parseSection(records, docInfo, warnings, si + 1, nestedTableCounter);
|
|
1915
2044
|
blocks.push(...sectionBlocks);
|
|
1916
2045
|
parsedSections++;
|
|
1917
2046
|
options?.onProgress?.(parsedSections, totalTarget);
|
|
@@ -2245,13 +2374,13 @@ function extractHwp5ImagesLenient(lcfb, blocks, compressed, warnings) {
|
|
|
2245
2374
|
}
|
|
2246
2375
|
return images;
|
|
2247
2376
|
}
|
|
2248
|
-
function parseSection(records, docInfo, warnings, sectionNum) {
|
|
2377
|
+
function parseSection(records, docInfo, warnings, sectionNum, counter) {
|
|
2249
2378
|
const blocks = [];
|
|
2250
2379
|
let i = 0;
|
|
2251
2380
|
while (i < records.length) {
|
|
2252
2381
|
const rec = records[i];
|
|
2253
2382
|
if (rec.tagId === TAG_PARA_HEADER && rec.level === 0) {
|
|
2254
|
-
const { paragraph, tables, nextIdx, charShapeIds, paraShapeId } = parseParagraphWithTables(records, i);
|
|
2383
|
+
const { paragraph, tables, nextIdx, charShapeIds, paraShapeId } = parseParagraphWithTables(records, i, counter);
|
|
2255
2384
|
if (paragraph) {
|
|
2256
2385
|
const block = { type: "paragraph", text: paragraph, pageNumber: sectionNum };
|
|
2257
2386
|
if (docInfo && charShapeIds.length > 0) {
|
|
@@ -2274,7 +2403,7 @@ function parseSection(records, docInfo, warnings, sectionNum) {
|
|
|
2274
2403
|
if (rec.tagId === TAG_CTRL_HEADER && rec.level <= 1 && rec.data.length >= 4) {
|
|
2275
2404
|
const ctrlId = rec.data.subarray(0, 4).toString("ascii");
|
|
2276
2405
|
if (ctrlId === " lbt" || ctrlId === "tbl ") {
|
|
2277
|
-
const { table, nextIdx } = parseTableBlock(records, i);
|
|
2406
|
+
const { table, nextIdx } = parseTableBlock(records, i, counter);
|
|
2278
2407
|
if (table) blocks.push({ type: "table", table, pageNumber: sectionNum });
|
|
2279
2408
|
i = nextIdx;
|
|
2280
2409
|
continue;
|
|
@@ -2379,7 +2508,7 @@ function resolveCharStyle(charShapeIds, docInfo) {
|
|
|
2379
2508
|
if (cs.attrFlags & 2) style.bold = true;
|
|
2380
2509
|
return style.fontSize || style.bold || style.italic ? style : void 0;
|
|
2381
2510
|
}
|
|
2382
|
-
function parseParagraphWithTables(records, startIdx) {
|
|
2511
|
+
function parseParagraphWithTables(records, startIdx, counter) {
|
|
2383
2512
|
const startLevel = records[startIdx].level;
|
|
2384
2513
|
let text = "";
|
|
2385
2514
|
const tables = [];
|
|
@@ -2401,7 +2530,7 @@ function parseParagraphWithTables(records, startIdx) {
|
|
|
2401
2530
|
if (rec.tagId === TAG_CTRL_HEADER && rec.data.length >= 4) {
|
|
2402
2531
|
const ctrlId = rec.data.subarray(0, 4).toString("ascii");
|
|
2403
2532
|
if (ctrlId === " lbt" || ctrlId === "tbl ") {
|
|
2404
|
-
const { table, nextIdx } = parseTableBlock(records, i);
|
|
2533
|
+
const { table, nextIdx } = parseTableBlock(records, i, counter);
|
|
2405
2534
|
if (table) tables.push(table);
|
|
2406
2535
|
i = nextIdx;
|
|
2407
2536
|
continue;
|
|
@@ -2412,7 +2541,7 @@ function parseParagraphWithTables(records, startIdx) {
|
|
|
2412
2541
|
const trimmed = text.trim();
|
|
2413
2542
|
return { paragraph: trimmed || null, tables, nextIdx: i, charShapeIds, paraShapeId };
|
|
2414
2543
|
}
|
|
2415
|
-
function parseTableBlock(records, startIdx) {
|
|
2544
|
+
function parseTableBlock(records, startIdx, counter) {
|
|
2416
2545
|
const tableLevel = records[startIdx].level;
|
|
2417
2546
|
let i = startIdx + 1;
|
|
2418
2547
|
let rows = 0, cols = 0;
|
|
@@ -2426,7 +2555,7 @@ function parseTableBlock(records, startIdx) {
|
|
|
2426
2555
|
cols = Math.min(rec.data.readUInt16LE(6), MAX_COLS);
|
|
2427
2556
|
}
|
|
2428
2557
|
if (rec.tagId === TAG_LIST_HEADER) {
|
|
2429
|
-
const { cell, nextIdx } = parseCellBlock(records, i, tableLevel);
|
|
2558
|
+
const { cell, nextIdx } = parseCellBlock(records, i, tableLevel, counter);
|
|
2430
2559
|
if (cell) cells.push(cell);
|
|
2431
2560
|
i = nextIdx;
|
|
2432
2561
|
continue;
|
|
@@ -2447,7 +2576,7 @@ function parseTableBlock(records, startIdx) {
|
|
|
2447
2576
|
const cellRows = arrangeCells(rows, cols, cells);
|
|
2448
2577
|
return { table: buildTable(cellRows), nextIdx: i };
|
|
2449
2578
|
}
|
|
2450
|
-
function parseCellBlock(records, startIdx, tableLevel) {
|
|
2579
|
+
function parseCellBlock(records, startIdx, tableLevel, counter) {
|
|
2451
2580
|
const rec = records[startIdx];
|
|
2452
2581
|
const cellLevel = rec.level;
|
|
2453
2582
|
const texts = [];
|
|
@@ -2472,6 +2601,17 @@ function parseCellBlock(records, startIdx, tableLevel) {
|
|
|
2472
2601
|
const t = extractText(r.data).trim();
|
|
2473
2602
|
if (t) texts.push(t);
|
|
2474
2603
|
}
|
|
2604
|
+
if (r.tagId === TAG_CTRL_HEADER && r.data.length >= 4) {
|
|
2605
|
+
const ctrlId = r.data.subarray(0, 4).toString("ascii");
|
|
2606
|
+
if (ctrlId === " lbt" || ctrlId === "tbl ") {
|
|
2607
|
+
if (counter) {
|
|
2608
|
+
counter.count++;
|
|
2609
|
+
texts.push(`[\uC911\uCCA9 \uD14C\uC774\uBE14 #${counter.count}]`);
|
|
2610
|
+
} else {
|
|
2611
|
+
texts.push("[\uC911\uCCA9 \uD14C\uC774\uBE14]");
|
|
2612
|
+
}
|
|
2613
|
+
}
|
|
2614
|
+
}
|
|
2475
2615
|
i++;
|
|
2476
2616
|
}
|
|
2477
2617
|
return { cell: { text: texts.join("\n"), colSpan, rowSpan, colAddr, rowAddr }, nextIdx: i };
|
|
@@ -3829,21 +3969,21 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
3829
3969
|
import JSZip5 from "jszip";
|
|
3830
3970
|
import { DOMParser as DOMParser4 } from "@xmldom/xmldom";
|
|
3831
3971
|
var MAX_DECOMPRESS_SIZE4 = 100 * 1024 * 1024;
|
|
3832
|
-
function getChildElements(parent,
|
|
3972
|
+
function getChildElements(parent, localName3) {
|
|
3833
3973
|
const result = [];
|
|
3834
3974
|
const children = parent.childNodes;
|
|
3835
3975
|
for (let i = 0; i < children.length; i++) {
|
|
3836
3976
|
const node = children[i];
|
|
3837
3977
|
if (node.nodeType === 1) {
|
|
3838
3978
|
const el = node;
|
|
3839
|
-
if (el.localName ===
|
|
3979
|
+
if (el.localName === localName3 || el.tagName?.endsWith(`:${localName3}`)) {
|
|
3840
3980
|
result.push(el);
|
|
3841
3981
|
}
|
|
3842
3982
|
}
|
|
3843
3983
|
}
|
|
3844
3984
|
return result;
|
|
3845
3985
|
}
|
|
3846
|
-
function findElements(parent,
|
|
3986
|
+
function findElements(parent, localName3) {
|
|
3847
3987
|
const result = [];
|
|
3848
3988
|
const walk = (node) => {
|
|
3849
3989
|
const children = node.childNodes;
|
|
@@ -3851,7 +3991,7 @@ function findElements(parent, localName2) {
|
|
|
3851
3991
|
const child = children[i];
|
|
3852
3992
|
if (child.nodeType === 1) {
|
|
3853
3993
|
const el = child;
|
|
3854
|
-
if (el.localName ===
|
|
3994
|
+
if (el.localName === localName3 || el.tagName?.endsWith(`:${localName3}`)) {
|
|
3855
3995
|
result.push(el);
|
|
3856
3996
|
}
|
|
3857
3997
|
walk(el);
|
|
@@ -3861,11 +4001,11 @@ function findElements(parent, localName2) {
|
|
|
3861
4001
|
walk(parent);
|
|
3862
4002
|
return result;
|
|
3863
4003
|
}
|
|
3864
|
-
function getAttr(el,
|
|
4004
|
+
function getAttr(el, localName3) {
|
|
3865
4005
|
const attrs = el.attributes;
|
|
3866
4006
|
for (let i = 0; i < attrs.length; i++) {
|
|
3867
4007
|
const attr = attrs[i];
|
|
3868
|
-
if (attr.localName ===
|
|
4008
|
+
if (attr.localName === localName3 || attr.name === localName3) return attr.value;
|
|
3869
4009
|
}
|
|
3870
4010
|
return null;
|
|
3871
4011
|
}
|
|
@@ -4212,11 +4352,11 @@ async function parseDocxDocument(buffer, options) {
|
|
|
4212
4352
|
const node = children[i];
|
|
4213
4353
|
if (node.nodeType !== 1) continue;
|
|
4214
4354
|
const el = node;
|
|
4215
|
-
const
|
|
4216
|
-
if (
|
|
4355
|
+
const localName3 = el.localName ?? el.tagName?.split(":").pop();
|
|
4356
|
+
if (localName3 === "p") {
|
|
4217
4357
|
const block = parseParagraph(el, styles, numbering, footnotes, rels);
|
|
4218
4358
|
if (block) blocks.push(block);
|
|
4219
|
-
} else if (
|
|
4359
|
+
} else if (localName3 === "tbl") {
|
|
4220
4360
|
const block = parseTable(el, styles, numbering, footnotes, rels);
|
|
4221
4361
|
if (block) blocks.push(block);
|
|
4222
4362
|
}
|
|
@@ -4254,9 +4394,263 @@ async function parseDocxDocument(buffer, options) {
|
|
|
4254
4394
|
};
|
|
4255
4395
|
}
|
|
4256
4396
|
|
|
4397
|
+
// src/hwpml/parser.ts
|
|
4398
|
+
import { DOMParser as DOMParser5 } from "@xmldom/xmldom";
|
|
4399
|
+
// Limits applied while parsing HWPML input.

// Deepest nesting level the recursive element walkers will follow.
var MAX_XML_DEPTH2 = 200;
// Largest declared table dimensions that are still processed; also used to cap cell spans.
var MAX_TABLE_ROWS = 5000;
var MAX_TABLE_COLS = 500;
// Largest accepted input size in bytes (50 MiB).
var MAX_HWPML_BYTES = 1024 * 1024 * 50;
|
|
4403
|
+
/**
 * Parse an HWPML (XML) document held in memory into blocks and Markdown.
 *
 * Top-level SECTION elements under BODY are numbered from 1; that number is
 * stored as the `pageNumber` of every block the section produces, and
 * `options.pages` is filtered against the same numbering.
 *
 * @param {ArrayBuffer|Uint8Array} buffer - Raw file bytes; decoded as UTF-8.
 * @param {{ pages?: * }} [options] - Optional settings. `pages`, when set, is
 *   handed to `parsePageRange` together with the section count.
 * @returns {{ markdown: string, blocks: Array, metadata?: object, outline?: Array, warnings?: Array }}
 *   Parsed result. `metadata`, `outline` and `warnings` are `undefined` when
 *   empty on the normal path.
 * @throws {Error} When `buffer.byteLength` exceeds `MAX_HWPML_BYTES`.
 */
function parseHwpmlDocument(buffer, options) {
  if (buffer.byteLength > MAX_HWPML_BYTES) {
    throw new Error(`HWPML \uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC (${(buffer.byteLength / 1024 / 1024).toFixed(1)}MB > 50MB)`);
  }
  // Decode and drop a leading byte-order mark so the XML parser sees the prolog first.
  const text = new TextDecoder("utf-8").decode(buffer).replace(/^\uFEFF/, "");
  // `&nbsp;` is not one of XML's predefined entities, so it is swapped for a
  // plain space before parsing.
  // NOTE(review): the pattern previously read as a literal space (a no-op
  // space-to-space replace); `&nbsp;` is presumed to be the intent - confirm.
  const normalized = text.replace(/&nbsp;/g, " ");
  // stripDtd is defined elsewhere; presumably it removes the DOCTYPE declaration - verify.
  const xml = stripDtd(normalized);
  const warnings = [];
  const parser = new DOMParser5({
    // Parser diagnostics are collected as warnings instead of being printed.
    onError: (_level, msg) => {
      warnings.push({ message: `HWPML XML \uD30C\uC2F1 \uACBD\uACE0: ${msg}`, code: "MALFORMED_XML" });
    }
  });
  const doc = parser.parseFromString(xml, "text/xml");
  if (!doc.documentElement) {
    return { markdown: "", blocks: [], warnings };
  }
  const root = doc.documentElement;

  // Document summary -> metadata.
  const metadata = {};
  const docSummary = findChild(root, "DOCSUMMARY");
  if (docSummary) {
    const title = findChild(docSummary, "TITLE");
    const author = findChild(docSummary, "AUTHOR");
    const date = findChild(docSummary, "DATE");
    if (title) metadata.title = textContent(title).trim();
    if (author) metadata.author = textContent(author).trim();
    if (date) {
      // Only set the key when there is a value: an own `createdAt: undefined`
      // would make an otherwise empty metadata object count as non-empty below.
      const createdAt = textContent(date).trim();
      if (createdAt) metadata.createdAt = createdAt;
    }
  }

  const paraShapeMap = buildParaShapeMap(root);
  const body = findChild(root, "BODY");
  if (!body) {
    return { markdown: "", blocks: [], metadata, warnings };
  }

  const blocks = [];
  const pageFilter = options?.pages ? parsePageRange(options.pages, countSections(body)) : null;
  let sectionIdx = 0;
  const children = body.childNodes;
  for (let i = 0; i < children.length; i++) {
    const el = children[i];
    if (el.nodeType !== 1) continue;
    if (localName2(el) !== "SECTION") continue;
    // Count every section, including filtered-out ones, so numbering stays stable.
    sectionIdx++;
    if (pageFilter && !pageFilter.has(sectionIdx)) continue;
    parseSection2(el, blocks, paraShapeMap, sectionIdx, warnings);
  }

  // Headings with non-empty text form the outline; a missing level defaults to 1.
  const outline = blocks
    .filter((b) => b.type === "heading" && b.text)
    .map((b) => ({ level: b.level ?? 1, text: b.text, pageNumber: b.pageNumber }));
  const markdown = blocksToMarkdown(blocks);
  return {
    markdown,
    blocks,
    metadata: Object.keys(metadata).length > 0 ? metadata : void 0,
    outline: outline.length > 0 ? outline : void 0,
    warnings: warnings.length > 0 ? warnings : void 0
  };
}
|
|
4458
|
+
/**
 * Build a lookup from paragraph-shape id to heading information.
 *
 * Reads HEAD > MAPPINGTABLE > PARASHAPELIST below `root` and records one entry
 * per PARASHAPE child, keyed by its `Id` attribute (empty string when absent).
 * `headingLevel` is `null` unless `HeadingType` is "Outline"; for outline
 * shapes it is `Level + 1`, with a negative or unparsable level treated as 0
 * and the result capped at 6.
 *
 * @param {Element} root - Document element of the parsed HWPML file.
 * @returns {Map<string, { headingLevel: number|null }>} Shape id -> heading info;
 *   empty when any element along the path is missing.
 */
function buildParaShapeMap(root) {
  const shapes = new Map();

  // Descend HEAD -> MAPPINGTABLE -> PARASHAPELIST, giving up at the first gap.
  let scope = root;
  for (const tag of ["HEAD", "MAPPINGTABLE", "PARASHAPELIST"]) {
    scope = findChild(scope, tag);
    if (!scope) return shapes;
  }

  const nodes = scope.childNodes;
  for (let idx = 0; idx < nodes.length; idx++) {
    const shape = nodes[idx];
    const isParaShape = shape.nodeType === 1 && localName2(shape) === "PARASHAPE";
    if (!isParaShape) continue;

    const shapeId = shape.getAttribute("Id") ?? "";
    const headingType = shape.getAttribute("HeadingType") ?? "None";
    const rawLevel = parseInt(shape.getAttribute("Level") ?? "0", 10);

    let headingLevel = null;
    if (headingType === "Outline") {
      const baseLevel = Number.isNaN(rawLevel) ? 0 : Math.max(0, rawLevel);
      headingLevel = Math.min(baseLevel + 1, 6);
    }
    shapes.set(shapeId, { headingLevel });
  }
  return shapes;
}
|
|
4482
|
+
/**
 * Parse one SECTION element, appending the blocks it yields to `blocks`.
 *
 * Delegates to `walkContent`, starting outside any header/footer at depth 0.
 *
 * @param {Element} section - SECTION element to process.
 * @param {Array} blocks - Output list, appended to in place.
 * @param {Map} paraShapeMap - Paragraph-shape lookup passed through to the walker.
 * @param {number} sectionNum - 1-based section number recorded as `pageNumber`.
 * @param {Array} warnings - Warning list, appended to in place.
 */
function parseSection2(section, blocks, paraShapeMap, sectionNum, warnings) {
  const inHeaderFooter = false;
  const startDepth = 0;
  walkContent(section, blocks, paraShapeMap, sectionNum, warnings, inHeaderFooter, startDepth);
}
|
|
4485
|
+
/**
 * Recursively walk `node`'s element children and emit paragraph/table blocks.
 *
 * - HEADER and FOOTER subtrees are skipped entirely.
 * - P elements become paragraph/heading blocks via `parseParagraph2`. Tables
 *   nested inside the paragraph are then parsed as well: the paragraph text
 *   extractor deliberately ignores TABLE elements, so without this step such
 *   tables would never be emitted.
 * - TABLE elements are handed to `parseTable2` and not descended into.
 * - Every other element is treated as a container and walked recursively.
 *
 * @param {Node} node - Node whose children are visited.
 * @param {Array} blocks - Output list, appended to in place.
 * @param {Map} paraShapeMap - Paragraph-shape lookup.
 * @param {number} sectionNum - Section number stored as `pageNumber` on blocks.
 * @param {Array} warnings - Warning list, appended to in place.
 * @param {boolean} inHeaderFooter - When true, P and TABLE elements are skipped.
 * @param {number} [depth=0] - Current recursion depth; recursion stops past MAX_XML_DEPTH2.
 */
function walkContent(node, blocks, paraShapeMap, sectionNum, warnings, inHeaderFooter, depth = 0) {
  if (depth > MAX_XML_DEPTH2) return;
  const children = node.childNodes;
  for (let i = 0; i < children.length; i++) {
    const el = children[i];
    if (el.nodeType !== 1) continue;
    const tag = localName2(el);
    if (tag === "HEADER" || tag === "FOOTER") continue;
    if (tag === "P") {
      if (!inHeaderFooter) {
        parseParagraph2(el, blocks, paraShapeMap, sectionNum);
        parseTablesInParagraph(el, blocks, paraShapeMap, sectionNum, warnings, depth + 1);
      }
      continue;
    }
    if (tag === "TABLE") {
      if (!inHeaderFooter) {
        parseTable2(el, blocks, paraShapeMap, sectionNum, warnings);
      }
      continue;
    }
    // Any other element (PARALIST, SECTION, COLDEF, ...) is just a container.
    walkContent(el, blocks, paraShapeMap, sectionNum, warnings, inHeaderFooter, depth + 1);
  }
}

/**
 * Find TABLE elements nested anywhere inside a paragraph and parse them.
 *
 * Only tables are emitted here; text was already collected for the enclosing
 * paragraph. A TABLE is not descended into, and HEADER/FOOTER subtrees are
 * skipped to match `walkContent`.
 */
function parseTablesInParagraph(node, blocks, paraShapeMap, sectionNum, warnings, depth) {
  if (depth > MAX_XML_DEPTH2) return;
  const children = node.childNodes;
  for (let i = 0; i < children.length; i++) {
    const el = children[i];
    if (el.nodeType !== 1) continue;
    const tag = localName2(el);
    if (tag === "HEADER" || tag === "FOOTER") continue;
    if (tag === "TABLE") {
      parseTable2(el, blocks, paraShapeMap, sectionNum, warnings);
      continue;
    }
    parseTablesInParagraph(el, blocks, paraShapeMap, sectionNum, warnings, depth + 1);
  }
}
|
|
4514
|
+
/**
 * Convert a P element into a heading or paragraph block.
 *
 * The element's `ParaShape` attribute selects an entry in `paraShapeMap`; when
 * that entry has a non-null `headingLevel` the block is a heading at that
 * level, otherwise a plain paragraph. Paragraphs with no text emit nothing.
 *
 * @param {Element} el - P element.
 * @param {Array} blocks - Output list, appended to in place.
 * @param {Map} paraShapeMap - Paragraph-shape lookup keyed by shape id.
 * @param {number} sectionNum - Stored as the block's `pageNumber`.
 */
function parseParagraph2(el, blocks, paraShapeMap, sectionNum) {
  const shapeKey = el.getAttribute("ParaShape") ?? "";
  const headingLevel = paraShapeMap.get(shapeKey)?.headingLevel;
  const text = extractParagraphText(el);
  if (!text) return;

  const block = headingLevel != null
    ? { type: "heading", text, level: headingLevel, pageNumber: sectionNum }
    : { type: "paragraph", text, pageNumber: sectionNum };
  blocks.push(block);
}
|
|
4525
|
+
/**
 * Return the trimmed text of a paragraph.
 *
 * Text fragments are gathered by `collectCharText` and concatenated without a
 * separator.
 *
 * @param {Element} p - Paragraph element.
 * @returns {string} Concatenated, trimmed text; empty string when there is none.
 */
function extractParagraphText(p) {
  const fragments = [];
  collectCharText(p, fragments);
  const joined = fragments.join("");
  return joined.trim();
}
|
|
4530
|
+
/**
 * Gather the text of every CHAR element below `node` into `parts`.
 *
 * TABLE, PICTURE, SHAPEOBJECT and AUTONUM subtrees contribute nothing and are
 * not descended into; any other element is searched recursively.
 *
 * @param {Node} node - Node whose element children are visited.
 * @param {string[]} parts - Output list, appended to in document order.
 * @param {number} [depth=0] - Current recursion depth; stops past MAX_XML_DEPTH2.
 */
function collectCharText(node, parts, depth = 0) {
  if (depth > MAX_XML_DEPTH2) return;
  const kids = node.childNodes;
  for (let k = 0; k < kids.length; k++) {
    const child = kids[k];
    if (child.nodeType !== 1) continue;
    switch (localName2(child)) {
      case "CHAR": {
        const chunk = textContent(child);
        if (chunk) parts.push(chunk);
        break;
      }
      case "TABLE":
      case "PICTURE":
      case "SHAPEOBJECT":
      case "AUTONUM":
        // Intentionally ignored: no text is taken from these subtrees.
        break;
      default:
        collectCharText(child, parts, depth + 1);
    }
  }
}
|
|
4547
|
+
/**
 * Convert a TABLE element into a table block.
 *
 * The grid size comes from the `RowCount`/`ColCount` attributes. Each CELL in
 * each ROW is placed at its `RowAddr`/`ColAddr`; slots covered by a cell's
 * row/column span are filled with empty placeholder cells, and slots that no
 * cell claims are filled with empty cells before the grid goes to `buildTable`.
 *
 * Nothing is emitted when the declared size is missing, unparsable, zero or
 * negative, or when the table contains no cells. Tables larger than
 * MAX_TABLE_ROWS x MAX_TABLE_COLS are skipped with a TRUNCATED_TABLE warning.
 * Cells whose address is unparsable or outside the grid are dropped.
 *
 * @param {Element} el - TABLE element.
 * @param {Array} blocks - Output list, appended to in place.
 * @param {Map} paraShapeMap - Unused here; kept for a uniform parser signature.
 * @param {number} sectionNum - Stored as the block's `pageNumber`.
 * @param {Array} warnings - Warning list, appended to in place.
 */
function parseTable2(el, blocks, paraShapeMap, sectionNum, warnings) {
  const rowCount = parseInt(el.getAttribute("RowCount") ?? "0", 10);
  const colCount = parseInt(el.getAttribute("ColCount") ?? "0", 10);
  // `!(n > 0)` rejects NaN, zero and negative counts in one test.
  if (!(rowCount > 0) || !(colCount > 0)) return;
  if (rowCount > MAX_TABLE_ROWS || colCount > MAX_TABLE_COLS) {
    warnings.push({ message: `\uD14C\uC774\uBE14 \uD06C\uAE30 \uCD08\uACFC (${rowCount}x${colCount}) \u2014 \uC2A4\uD0B5`, code: "TRUNCATED_TABLE" });
    return;
  }

  // Pass 1: collect every CELL with its address and (capped) spans.
  const cells = [];
  const children = el.childNodes;
  for (let i = 0; i < children.length; i++) {
    const rowEl = children[i];
    if (rowEl.nodeType !== 1 || localName2(rowEl) !== "ROW") continue;
    const rowCells = rowEl.childNodes;
    for (let j = 0; j < rowCells.length; j++) {
      const cellEl = rowCells[j];
      if (cellEl.nodeType !== 1 || localName2(cellEl) !== "CELL") continue;
      const colAddr = parseInt(cellEl.getAttribute("ColAddr") ?? "0", 10);
      const rowAddr = parseInt(cellEl.getAttribute("RowAddr") ?? "0", 10);
      // `|| 1` turns NaN/0 into 1; spans are then kept within [1, MAX].
      const colSpan = Math.min(Math.max(1, parseInt(cellEl.getAttribute("ColSpan") ?? "1", 10) || 1), MAX_TABLE_COLS);
      const rowSpan = Math.min(Math.max(1, parseInt(cellEl.getAttribute("RowSpan") ?? "1", 10) || 1), MAX_TABLE_ROWS);
      const cellText = extractCellText2(cellEl);
      cells.push({ text: cellText, colSpan, rowSpan, colAddr, rowAddr });
    }
  }
  if (cells.length === 0) return;

  // Pass 2: place cells on a rowCount x colCount grid.
  const grid = Array.from({ length: rowCount }, () => Array(colCount).fill(null));
  for (const cell of cells) {
    const r = cell.rowAddr;
    const c = cell.colAddr;
    // Comparisons are false for NaN, so unparsable addresses are dropped too.
    // Negative addresses must be rejected: `grid[-1]` is undefined and
    // assigning into it would throw.
    const insideGrid = r >= 0 && r < rowCount && c >= 0 && c < colCount;
    if (!insideGrid) continue;
    grid[r][c] = cell;
    // Clamp the span to the grid so an oversized span cannot cause a long
    // loop that writes nothing.
    const rowEnd = Math.min(r + cell.rowSpan, rowCount);
    const colEnd = Math.min(c + cell.colSpan, colCount);
    for (let y = r; y < rowEnd; y++) {
      for (let x = c; x < colEnd; x++) {
        if (y === r && x === c) continue;
        grid[y][x] = { text: "", colSpan: 1, rowSpan: 1 };
      }
    }
  }

  // Slots that nothing claimed become empty cells.
  const cellRows = grid.map(
    (row) => row.map((cell) => cell ?? { text: "", colSpan: 1, rowSpan: 1 })
  );
  const table = buildTable(cellRows);
  blocks.push({ type: "table", table, pageNumber: sectionNum });
}
|
|
4594
|
+
/**
 * Return the text of a table cell.
 *
 * Fragments gathered by `collectCellText` are joined with newlines after empty
 * entries are removed, and the result is trimmed.
 *
 * @param {Element} cellEl - CELL element.
 * @returns {string} Cell text; empty string when the cell has none.
 */
function extractCellText2(cellEl) {
  const collected = [];
  collectCellText(cellEl, collected, 0);
  const nonEmpty = collected.filter((fragment) => Boolean(fragment));
  return nonEmpty.join("\n").trim();
}
|
|
4599
|
+
/**
 * Gather the text fragments of a table cell into `parts`.
 *
 * Each P element contributes its paragraph text (when non-empty), each TABLE
 * element contributes a fixed nested-table placeholder string, and any other
 * element is searched recursively. Recursion stops once `depth` exceeds 20.
 *
 * @param {Node} node - Node whose element children are visited.
 * @param {string[]} parts - Output list, appended to in document order.
 * @param {number} depth - Current recursion depth.
 */
function collectCellText(node, parts, depth) {
  if (depth > 20) return;
  const kids = node.childNodes;
  for (let k = 0; k < kids.length; k++) {
    const child = kids[k];
    if (child.nodeType !== 1) continue;
    switch (localName2(child)) {
      case "P": {
        const paragraphText = extractParagraphText(child);
        if (paragraphText) parts.push(paragraphText);
        break;
      }
      case "TABLE":
        // Nested tables are not expanded; a placeholder marks their position.
        parts.push("[\uC911\uCCA9 \uD14C\uC774\uBE14]");
        break;
      default:
        collectCellText(child, parts, depth + 1);
    }
  }
}
|
|
4616
|
+
/**
 * Return an element's name with one leading namespace prefix removed.
 *
 * Uses `tagName`, falling back to `localName`, then to an empty string. Only
 * the first `prefix:` is stripped, and only when the prefix is non-empty.
 *
 * @param {{ tagName?: string, localName?: string }} el - Element-like object.
 * @returns {string} Name without its leading prefix.
 */
function localName2(el) {
  const qualified = el.tagName || el.localName || "";
  const colonAt = qualified.indexOf(":");
  // colonAt === 0 means an empty prefix, which is left untouched.
  return colonAt > 0 ? qualified.slice(colonAt + 1) : qualified;
}
|
|
4619
|
+
/**
 * Find the first direct child element with the given prefix-less name.
 *
 * @param {Node} parent - Node whose direct children are searched.
 * @param {string} tag - Name to match against `localName2(child)`.
 * @returns {Element|null} First matching child element, or `null`.
 */
function findChild(parent, tag) {
  const nodes = parent.childNodes;
  let idx = 0;
  while (idx < nodes.length) {
    const candidate = nodes[idx];
    const isMatch = candidate.nodeType === 1 && localName2(candidate) === tag;
    if (isMatch) {
      return candidate;
    }
    idx += 1;
  }
  return null;
}
|
|
4627
|
+
/**
 * Concatenate all character data below `el`, in document order.
 *
 * Text nodes (nodeType 3) and CDATA sections (nodeType 4) contribute their
 * `nodeValue`; child elements (nodeType 1) are descended into recursively.
 * Other node kinds (comments, processing instructions, ...) are ignored.
 *
 * @param {Node} el - Node whose children are read.
 * @returns {string} Concatenated text; empty string when there is none.
 */
function textContent(el) {
  const children = el.childNodes;
  const parts = [];
  for (let i = 0; i < children.length; i++) {
    const node = children[i];
    // CDATA sections carry character data just like text nodes; leaving them
    // out would silently drop content wrapped in <![CDATA[ ... ]]>.
    if (node.nodeType === 3 || node.nodeType === 4) {
      parts.push(node.nodeValue || "");
    } else if (node.nodeType === 1) {
      parts.push(textContent(node));
    }
  }
  return parts.join("");
}
|
|
4640
|
+
/**
 * Count the SECTION elements that are direct children of `body`.
 *
 * @param {Node} body - Node whose direct children are inspected.
 * @returns {number} Number of direct child elements named SECTION.
 */
function countSections(body) {
  const nodes = body.childNodes;
  let total = 0;
  for (let idx = 0; idx < nodes.length; idx++) {
    const node = nodes[idx];
    if (node.nodeType !== 1) continue;
    if (localName2(node) !== "SECTION") continue;
    total += 1;
  }
  return total;
}
|
|
4649
|
+
|
|
4257
4650
|
// src/index.ts
|
|
4258
4651
|
async function parse(input, options) {
|
|
4259
4652
|
let buffer;
|
|
4653
|
+
const opts = typeof input === "string" && !options?.filePath ? { ...options, filePath: input } : options;
|
|
4260
4654
|
if (typeof input === "string") {
|
|
4261
4655
|
try {
|
|
4262
4656
|
const buf = await readFile(input);
|
|
@@ -4277,14 +4671,16 @@ async function parse(input, options) {
|
|
|
4277
4671
|
switch (format) {
|
|
4278
4672
|
case "hwpx": {
|
|
4279
4673
|
const zipFormat = await detectZipFormat(buffer);
|
|
4280
|
-
if (zipFormat === "xlsx") return parseXlsx(buffer,
|
|
4281
|
-
if (zipFormat === "docx") return parseDocx(buffer,
|
|
4282
|
-
return parseHwpx(buffer,
|
|
4674
|
+
if (zipFormat === "xlsx") return parseXlsx(buffer, opts);
|
|
4675
|
+
if (zipFormat === "docx") return parseDocx(buffer, opts);
|
|
4676
|
+
return parseHwpx(buffer, opts);
|
|
4283
4677
|
}
|
|
4284
4678
|
case "hwp":
|
|
4285
|
-
return parseHwp(buffer,
|
|
4679
|
+
return parseHwp(buffer, opts);
|
|
4680
|
+
case "hwpml":
|
|
4681
|
+
return parseHwpml(buffer, opts);
|
|
4286
4682
|
case "pdf":
|
|
4287
|
-
return parsePdf(buffer,
|
|
4683
|
+
return parsePdf(buffer, opts);
|
|
4288
4684
|
default:
|
|
4289
4685
|
return { success: false, fileType: "unknown", error: "\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD30C\uC77C \uD615\uC2DD\uC785\uB2C8\uB2E4.", code: "UNSUPPORTED_FORMAT" };
|
|
4290
4686
|
}
|
|
@@ -4308,7 +4704,7 @@ async function parseHwp(buffer, options) {
|
|
|
4308
4704
|
async function parsePdf(buffer, options) {
|
|
4309
4705
|
let parsePdfDocument;
|
|
4310
4706
|
try {
|
|
4311
|
-
const mod = await import("./parser-
|
|
4707
|
+
const mod = await import("./parser-4275GJRB.js");
|
|
4312
4708
|
parsePdfDocument = mod.parsePdfDocument;
|
|
4313
4709
|
} catch {
|
|
4314
4710
|
return {
|
|
@@ -4342,6 +4738,14 @@ async function parseDocx(buffer, options) {
|
|
|
4342
4738
|
return { success: false, fileType: "docx", error: err instanceof Error ? err.message : "DOCX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
|
|
4343
4739
|
}
|
|
4344
4740
|
}
|
|
4741
|
+
/**
 * Parse an HWPML buffer and wrap the outcome in a result object.
 *
 * Never rejects for parser errors: anything thrown by `parseHwpmlDocument` is
 * converted into a `{ success: false }` result whose `code` comes from
 * `classifyError`.
 *
 * @param {ArrayBuffer|Uint8Array} buffer - Raw file bytes.
 * @param {object} [options] - Passed through to `parseHwpmlDocument`.
 * @returns {Promise<object>} Success result with `markdown`, `blocks`,
 *   `metadata`, `outline` and `warnings`, or a failure result with `error` and `code`.
 */
async function parseHwpml(buffer, options) {
  try {
    const parsed = parseHwpmlDocument(buffer, options);
    return {
      success: true,
      fileType: "hwpml",
      markdown: parsed.markdown,
      blocks: parsed.blocks,
      metadata: parsed.metadata,
      outline: parsed.outline,
      warnings: parsed.warnings
    };
  } catch (err) {
    // Non-Error throwables get a generic message.
    const message = err instanceof Error ? err.message : "HWPML \uD30C\uC2F1 \uC2E4\uD328";
    return { success: false, fileType: "hwpml", error: message, code: classifyError(err) };
  }
}
|
|
4345
4749
|
|
|
4346
4750
|
// src/diff/text-diff.ts
|
|
4347
4751
|
function similarity(a, b) {
|
|
@@ -4530,4 +4934,4 @@ export {
|
|
|
4530
4934
|
compare,
|
|
4531
4935
|
parse
|
|
4532
4936
|
};
|
|
4533
|
-
//# sourceMappingURL=chunk-
|
|
4937
|
+
//# sourceMappingURL=chunk-KSBPABBQ.js.map
|