kordoc 2.2.6 → 2.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +22 -3
- package/dist/{chunk-RF6UJXR3.js → chunk-KSBPABBQ.js} +482 -78
- package/dist/chunk-KSBPABBQ.js.map +1 -0
- package/dist/{chunk-5Y2Q3BRW.js → chunk-M3E3C5GS.js} +8 -1
- package/dist/chunk-M3E3C5GS.js.map +1 -0
- package/dist/{chunk-FCQEF2ZM.js → chunk-VJPDY4YT.js} +2 -2
- package/dist/{chunk-NL5XLN5R.js.map → chunk-VJPDY4YT.js.map} +1 -1
- package/dist/{chunk-HXUCZ2IL.cjs → chunk-VLSATRNQ.cjs} +2 -2
- package/dist/{chunk-HXUCZ2IL.cjs.map → chunk-VLSATRNQ.cjs.map} +1 -1
- package/dist/{chunk-NL5XLN5R.js → chunk-XG5CQUSC.js} +2 -2
- package/dist/{chunk-FCQEF2ZM.js.map → chunk-XG5CQUSC.js.map} +1 -1
- package/dist/cli.js +5 -5
- package/dist/cli.js.map +1 -1
- package/dist/{detect-GYK3HKD5.js → detect-I7YIS4Q6.js} +4 -2
- package/dist/index.cjs +608 -197
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +6 -2
- package/dist/index.d.ts +6 -2
- package/dist/index.js +500 -89
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +5 -5
- package/dist/{parser-AMP7MAOH.js → parser-4275GJRB.js} +45 -42
- package/dist/{parser-AMP7MAOH.js.map → parser-4275GJRB.js.map} +1 -1
- package/dist/{parser-KOWPTDJU.cjs → parser-STAOZMUC.cjs} +61 -58
- package/dist/{parser-KOWPTDJU.cjs.map → parser-STAOZMUC.cjs.map} +1 -1
- package/dist/{parser-43IAQ5KE.js → parser-XRUZEFZT.js} +45 -42
- package/dist/{parser-43IAQ5KE.js.map → parser-XRUZEFZT.js.map} +1 -1
- package/dist/{watch-IUQXOXW3.js → watch-BFLNFJBE.js} +4 -4
- package/package.json +2 -2
- package/dist/chunk-5Y2Q3BRW.js.map +0 -1
- package/dist/chunk-RF6UJXR3.js.map +0 -1
- package/dist/{detect-GYK3HKD5.js.map → detect-I7YIS4Q6.js.map} +0 -0
- package/dist/{watch-IUQXOXW3.js.map → watch-BFLNFJBE.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -16,7 +16,7 @@ import {
|
|
|
16
16
|
sanitizeHref,
|
|
17
17
|
stripDtd,
|
|
18
18
|
toArrayBuffer
|
|
19
|
-
} from "./chunk-
|
|
19
|
+
} from "./chunk-XG5CQUSC.js";
|
|
20
20
|
import {
|
|
21
21
|
parsePageRange
|
|
22
22
|
} from "./chunk-SBVRCJFH.js";
|
|
@@ -44,11 +44,17 @@ function isPdfFile(buffer) {
|
|
|
44
44
|
const b = magicBytes(buffer);
|
|
45
45
|
return b[0] === 37 && b[1] === 80 && b[2] === 68 && b[3] === 70;
|
|
46
46
|
}
|
|
47
|
+
/**
 * Sniffs whether the given bytes look like an HWPML (XML flavour of HWP) document.
 *
 * Only the first 512 bytes are inspected: after an optional BOM and leading
 * whitespace they must begin with an XML declaration, and the `<HWPML` root
 * tag must appear inside that same window.
 *
 * @param {ArrayBuffer|ArrayBufferView} buffer - Raw file contents.
 * @returns {boolean} `true` when the head of the data matches the HWPML signature.
 */
function isHwpmlFile(buffer) {
  const HEAD_BYTES = 512;
  const windowLength = Math.min(HEAD_BYTES, buffer.byteLength);
  // When the first constructor argument is itself a typed array (or a Node
  // Buffer), `new Uint8Array(src, offset, length)` ignores offset/length and
  // copies everything. Views are therefore windowed over their backing buffer.
  const bytes = ArrayBuffer.isView(buffer)
    ? new Uint8Array(buffer.buffer, buffer.byteOffset, windowLength)
    : new Uint8Array(buffer, 0, windowLength);
  // Non-fatal decoding tolerates a multi-byte character cut at the window edge.
  const head = new TextDecoder("utf-8", { fatal: false }).decode(bytes).replace(/^\uFEFF/, "");
  return head.trimStart().startsWith("<?xml") && head.includes("<HWPML");
}
|
|
47
52
|
/**
 * Identifies the document format from the leading bytes of a buffer.
 *
 * Detectors run in a fixed order and the first one that matches wins.
 * Buffers shorter than four bytes are reported as "unknown" without probing.
 *
 * @param {ArrayBuffer} buffer - Raw file contents.
 * @returns {"hwpx"|"hwp"|"pdf"|"hwpml"|"unknown"} The detected format label.
 */
function detectFormat(buffer) {
  if (buffer.byteLength < 4) return "unknown";
  // Ordered probe table: [detector, label].
  const probes = [
    [isZipFile, "hwpx"],
    [isOldHwpFile, "hwp"],
    [isPdfFile, "pdf"],
    [isHwpmlFile, "hwpml"],
  ];
  for (const [matches, format] of probes) {
    if (matches(buffer)) return format;
  }
  return "unknown";
}
|
|
54
60
|
async function detectZipFormat(buffer) {
|
|
@@ -69,6 +75,100 @@ async function detectZipFormat(buffer) {
|
|
|
69
75
|
import JSZip2 from "jszip";
|
|
70
76
|
import { inflateRawSync } from "zlib";
|
|
71
77
|
import { DOMParser } from "@xmldom/xmldom";
|
|
78
|
+
|
|
79
|
+
// src/hwpx/com-fallback.ts
|
|
80
|
+
import { execFileSync } from "child_process";
|
|
81
|
+
import { platform } from "os";
|
|
82
|
+
/**
 * Reports whether the COM-based extraction fallback can be attempted.
 *
 * @returns {boolean} `true` only when the current OS platform is "win32".
 */
function isComFallbackAvailable() {
  const currentPlatform = platform();
  return currentPlatform === "win32";
}
|
|
85
|
+
/**
 * Checks a manifest for the marker that this parser treats as a sign of encryption.
 *
 * @param {string} manifestXml - Text of META-INF/manifest.xml.
 * @returns {boolean} `true` when the text contains "encryption-data".
 */
function isEncryptedHwpx(manifestXml) {
  const encryptionMarker = "encryption-data";
  const hasMarker = manifestXml.includes(encryptionMarker);
  return hasMarker;
}
|
|
88
|
+
/**
 * Extracts per-page text from a document by driving the Hancom HWP COM
 * automation object (`HWPFrame.HwpObject`) through a PowerShell child process.
 *
 * The file path is handed to PowerShell through an environment variable
 * instead of being spliced into the script text. Doubling ASCII `'` is not a
 * complete escape, because PowerShell also accepts typographic quote characters
 * as string delimiters, so a crafted file name could otherwise break out of
 * the literal. The COM object is released in a `finally` block so a failing
 * `Open`/`GetPageText` does not leave the automation object behind.
 *
 * @param {string} filePath - Path of the document to open.
 * @returns {{ pages: string[], pageCount: number, warnings: Array<{message: string, code: string}> }}
 *   Page texts, the reported page count, and any warnings collected.
 * @throws {Error} When not running on Windows, when the child process fails or
 *   times out, when no JSON is found in its output, or when the script reports an error.
 */
function extractTextViaCom(filePath) {
  if (!isComFallbackAvailable()) {
    throw new Error("COM fallback\uC740 Windows\uC5D0\uC11C\uB9CC \uC0AC\uC6A9 \uAC00\uB2A5\uD569\uB2C8\uB2E4");
  }
  const ps1 = `
[Console]::OutputEncoding = [System.Text.Encoding]::UTF8
$ErrorActionPreference = 'Stop'
$hwp = $null
try {
  $hwp = New-Object -ComObject HWPFrame.HwpObject
  $hwp.RegisterModule('FilePathCheckerModule', 'FilePathCheckerModuleExample') | Out-Null
  $hwp.Open($env:KORDOC_COM_FILE_PATH, '', '') | Out-Null
  $pc = $hwp.PageCount
  $result = @{ pageCount = $pc; pages = @() }
  for ($p = 1; $p -le $pc; $p++) {
    $t = $hwp.GetPageText($p, 0)
    $result.pages += @($t)
  }
  $result | ConvertTo-Json -Depth 3 -Compress
} catch {
  @{ error = $_.Exception.Message } | ConvertTo-Json -Compress
} finally {
  if ($null -ne $hwp) {
    try { $hwp.Clear(1) } catch { }
    [System.Runtime.InteropServices.Marshal]::ReleaseComObject($hwp) | Out-Null
  }
}
`;
  const stdout = execFileSync("powershell", [
    "-NoProfile",
    "-NonInteractive",
    "-ExecutionPolicy",
    "Bypass",
    "-Command",
    ps1
  ], {
    encoding: "utf-8",
    // The path travels as data in the environment, never as script source.
    env: { ...process.env, KORDOC_COM_FILE_PATH: filePath },
    timeout: 12e4,
    // 2-minute timeout
    windowsHide: true,
    maxBuffer: 50 * 1024 * 1024
    // 50MB
  });
  const trimmed = stdout.trim();
  // Skip anything printed before the JSON payload.
  const jsonStart = trimmed.indexOf("{");
  if (jsonStart < 0) throw new Error(`COM \uCD9C\uB825\uC5D0 JSON\uC774 \uC5C6\uC2B5\uB2C8\uB2E4: ${trimmed.slice(0, 200)}`);
  const json = JSON.parse(trimmed.slice(jsonStart));
  if (json.error) {
    throw new Error(`COM \uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uC2E4\uD328: ${json.error}`);
  }
  const warnings = [];
  const pages = Array.isArray(json.pages) ? json.pages : [];
  const pageCount = json.pageCount ?? pages.length;
  if (pages.length === 0) {
    warnings.push({ message: "COM\uC73C\uB85C \uD14D\uC2A4\uD2B8\uB97C \uCD94\uCD9C\uD558\uC9C0 \uBABB\uD588\uC2B5\uB2C8\uB2E4", code: "COM_EMPTY" });
  }
  return { pages, pageCount, warnings };
}
|
|
143
|
+
/**
 * Converts page texts obtained through the COM fallback into a parse result.
 *
 * Each page is split on newlines; every non-blank line becomes a paragraph
 * block tagged with its 1-based page number, and the same lines joined by
 * blank lines form the markdown. A "DRM_COM_FALLBACK" warning is always
 * appended to the returned warnings.
 *
 * The caller's `warnings` array is copied, not mutated. Page entries that are
 * `null`/`undefined` are treated as empty, and other non-string values are
 * stringified instead of raising a TypeError.
 *
 * @param {Array<string|null|undefined>} pages - Text of each page, in order.
 * @param {number} pageCount - Page count to report in the metadata.
 * @param {Array<{message: string, code: string}>} [warnings] - Warnings collected so far.
 * @returns {{ markdown: string, blocks: Array<object>, metadata: { pageCount: number }, warnings: Array<object> }}
 */
function comResultToParseResult(pages, pageCount, warnings) {
  const blocks = [];
  const lines = [];
  const pageList = Array.isArray(pages) ? pages : [];
  pageList.forEach((page, index) => {
    const pageText = page == null ? "" : String(page);
    for (const rawLine of pageText.split(/\n/)) {
      // trim() also removes the "\r" left behind by CRLF line endings.
      const line = rawLine.trim();
      if (!line) continue;
      blocks.push({ type: "paragraph", text: line, pageNumber: index + 1 });
      lines.push(line);
    }
  });
  const allWarnings = [
    ...(warnings ?? []),
    {
      message: "DRM \uBB38\uC11C: \uD55C\uCEF4 COM API\uB85C \uD14D\uC2A4\uD2B8 \uCD94\uCD9C (\uC11C\uC2DD/\uD45C \uC815\uBCF4 \uC81C\uD55C\uC801)",
      code: "DRM_COM_FALLBACK"
    }
  ];
  return {
    markdown: lines.join("\n\n"),
    blocks,
    metadata: { pageCount },
    warnings: allWarnings
  };
}
|
|
170
|
+
|
|
171
|
+
// src/hwpx/parser.ts
|
|
72
172
|
var MAX_DECOMPRESS_SIZE = 100 * 1024 * 1024;
|
|
73
173
|
var MAX_ZIP_ENTRIES = 500;
|
|
74
174
|
function clampSpan(val, max) {
|
|
@@ -173,6 +273,19 @@ async function parseHwpxDocument(buffer, options) {
|
|
|
173
273
|
if (actualEntryCount > MAX_ZIP_ENTRIES) {
|
|
174
274
|
throw new KordocError("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
175
275
|
}
|
|
276
|
+
const manifestFile = zip.file("META-INF/manifest.xml");
|
|
277
|
+
if (manifestFile) {
|
|
278
|
+
const manifestXml = await manifestFile.async("text");
|
|
279
|
+
if (isEncryptedHwpx(manifestXml)) {
|
|
280
|
+
if (isComFallbackAvailable() && options?.filePath) {
|
|
281
|
+
const { pages, pageCount, warnings: warnings2 } = extractTextViaCom(options.filePath);
|
|
282
|
+
if (pages.some((p) => p && p.trim().length > 0)) {
|
|
283
|
+
return comResultToParseResult(pages, pageCount, warnings2);
|
|
284
|
+
}
|
|
285
|
+
}
|
|
286
|
+
throw new KordocError("DRM \uC554\uD638\uD654\uB41C HWPX \uD30C\uC77C\uC785\uB2C8\uB2E4. Windows + \uD55C\uCEF4 \uC624\uD53C\uC2A4 \uC124\uCE58 \uC2DC \uC790\uB3D9 \uCD94\uCD9C\uB429\uB2C8\uB2E4.");
|
|
287
|
+
}
|
|
288
|
+
}
|
|
176
289
|
const decompressed = { total: 0 };
|
|
177
290
|
const metadata = {};
|
|
178
291
|
await extractHwpxMetadata(zip, metadata, decompressed);
|
|
@@ -184,6 +297,7 @@ async function parseHwpxDocument(buffer, options) {
|
|
|
184
297
|
const pageFilter = options?.pages ? parsePageRange(options.pages, sectionPaths.length) : null;
|
|
185
298
|
const totalTarget = pageFilter ? pageFilter.size : sectionPaths.length;
|
|
186
299
|
const blocks = [];
|
|
300
|
+
const nestedTableCounter = { count: 0 };
|
|
187
301
|
let parsedSections = 0;
|
|
188
302
|
for (let si = 0; si < sectionPaths.length; si++) {
|
|
189
303
|
if (pageFilter && !pageFilter.has(si + 1)) continue;
|
|
@@ -193,7 +307,7 @@ async function parseHwpxDocument(buffer, options) {
|
|
|
193
307
|
const xml = await file.async("text");
|
|
194
308
|
decompressed.total += xml.length * 2;
|
|
195
309
|
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
196
|
-
blocks.push(...parseSectionXml(xml, styleMap, warnings, si + 1));
|
|
310
|
+
blocks.push(...parseSectionXml(xml, styleMap, warnings, si + 1, nestedTableCounter));
|
|
197
311
|
parsedSections++;
|
|
198
312
|
options?.onProgress?.(parsedSections, totalTarget);
|
|
199
313
|
} catch (secErr) {
|
|
@@ -254,8 +368,20 @@ async function extractImagesFromZip(zip, blocks, decompressed, warnings) {
|
|
|
254
368
|
ref
|
|
255
369
|
// 절대 경로일 수도 있음
|
|
256
370
|
];
|
|
371
|
+
let resolvedPath = null;
|
|
372
|
+
if (!ref.includes(".")) {
|
|
373
|
+
const prefixes = [`BinData/${ref}`, `Contents/BinData/${ref}`];
|
|
374
|
+
for (const prefix of prefixes) {
|
|
375
|
+
const match = zip.file(new RegExp(`^${prefix.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}\\.[a-zA-Z0-9]+$`));
|
|
376
|
+
if (match.length > 0) {
|
|
377
|
+
resolvedPath = match[0].name;
|
|
378
|
+
break;
|
|
379
|
+
}
|
|
380
|
+
}
|
|
381
|
+
}
|
|
257
382
|
let found = false;
|
|
258
|
-
|
|
383
|
+
const allCandidates = resolvedPath ? [resolvedPath, ...candidates] : candidates;
|
|
384
|
+
for (const path of allCandidates) {
|
|
259
385
|
if (isPathTraversal(path)) continue;
|
|
260
386
|
const file = zip.file(path);
|
|
261
387
|
if (!file) continue;
|
|
@@ -263,7 +389,8 @@ async function extractImagesFromZip(zip, blocks, decompressed, warnings) {
|
|
|
263
389
|
const data = await file.async("uint8array");
|
|
264
390
|
decompressed.total += data.length;
|
|
265
391
|
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
266
|
-
const
|
|
392
|
+
const actualPath = path;
|
|
393
|
+
const ext = actualPath.includes(".") ? actualPath.split(".").pop() || "png" : "png";
|
|
267
394
|
const mimeType = imageExtToMime(ext);
|
|
268
395
|
imageIndex++;
|
|
269
396
|
const filename = `image_${String(imageIndex).padStart(3, "0")}.${mimeToExt(mimeType)}`;
|
|
@@ -336,6 +463,7 @@ function extractFromBrokenZip(buffer) {
|
|
|
336
463
|
let totalDecompressed = 0;
|
|
337
464
|
let entryCount = 0;
|
|
338
465
|
let sectionNum = 0;
|
|
466
|
+
const nestedTableCounter = { count: 0 };
|
|
339
467
|
while (pos < data.length - 30) {
|
|
340
468
|
if (data[pos] !== 80 || data[pos + 1] !== 75 || data[pos + 2] !== 3 || data[pos + 3] !== 4) {
|
|
341
469
|
pos++;
|
|
@@ -382,7 +510,7 @@ function extractFromBrokenZip(buffer) {
|
|
|
382
510
|
totalDecompressed += content.length * 2;
|
|
383
511
|
if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new KordocError("\uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC");
|
|
384
512
|
sectionNum++;
|
|
385
|
-
blocks.push(...parseSectionXml(content, void 0, warnings, sectionNum));
|
|
513
|
+
blocks.push(...parseSectionXml(content, void 0, warnings, sectionNum, nestedTableCounter));
|
|
386
514
|
} catch {
|
|
387
515
|
continue;
|
|
388
516
|
}
|
|
@@ -467,12 +595,40 @@ function detectHwpxHeadings(blocks, styleMap) {
|
|
|
467
595
|
}
|
|
468
596
|
}
|
|
469
597
|
}
|
|
470
|
-
function
|
|
598
|
+
/**
 * Produces the placeholder text that stands in for a nested table.
 *
 * Increments `counter.count` and uses the new value as the marker number.
 * The non-blank cell texts of the first row (newlines flattened to spaces)
 * are joined with " | " as a hint, cut to 60 code points plus an ellipsis.
 *
 * @param {{ count: number }} counter - Shared running counter; mutated.
 * @param {Array<Array<{ text: string }>>} rows - Rows of the nested table.
 * @returns {string} The marker, with the hint when the first row yields one.
 */
function makeNestedTableMarker(counter, rows) {
  counter.count += 1;
  const headerCells = rows[0] ?? [];
  const hintParts = [];
  for (const cell of headerCells) {
    const flattened = cell.text.trim().replace(/\n/g, " ");
    if (flattened) hintParts.push(flattened);
  }
  const hint = hintParts.join(" | ");
  // Count by code point so surrogate pairs are never split by the cut.
  const codePoints = Array.from(hint);
  let truncated = hint;
  if (codePoints.length > 60) {
    truncated = codePoints.slice(0, 60).join("") + "\u2026";
  }
  if (!truncated) {
    return `[\uC911\uCCA9 \uD14C\uC774\uBE14 #${counter.count}]`;
  }
  return `[\uC911\uCCA9 \uD14C\uC774\uBE14 #${counter.count}: ${truncated}]`;
}
|
|
606
|
+
/**
 * Finishes a table that was parsed while another table was still open.
 *
 * Pops the enclosing table context off `tableStack`. A nested table with at
 * least 3 rows and at least 2 columns is emitted as its own table block and
 * only a marker is appended to the parent's current cell. A smaller one is
 * flattened to text and appended to that cell after the marker. Nothing is
 * appended when the parent has no current cell.
 *
 * @param {{ rows: Array<Array<object>> }} newTable - The nested table just parsed.
 * @param {Array<object>} tableStack - Stack of enclosing table contexts; popped.
 * @param {Array<object>} blocks - Output block list; may receive a table block.
 * @param {{ sectionNum: number, counter?: { count: number } }} ctx - Walk context.
 * @returns {object} The parent table context, to become current again.
 */
function handleNestedTable(newTable, tableStack, blocks, ctx) {
  const parentTable = tableStack.pop();
  const nestedCols = newTable.rows.reduce(
    (widest, row) => (row.length > widest ? row.length : widest),
    0
  );
  // Numbered marker when a counter is available, plain marker otherwise.
  const nextMarker = () => (
    ctx.counter ? makeNestedTableMarker(ctx.counter, newTable.rows) : "[\uC911\uCCA9 \uD14C\uC774\uBE14]"
  );
  const appendToParentCell = (addition) => {
    const cell = parentTable.cell;
    cell.text += (cell.text ? "\n" : "") + addition;
  };
  const isStandalone = newTable.rows.length >= 3 && nestedCols >= 2;
  if (isStandalone) {
    blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: ctx.sectionNum });
    if (parentTable.cell) appendToParentCell(nextMarker());
    return parentTable;
  }
  const nestedText = convertTableToText(newTable.rows);
  if (parentTable.cell) appendToParentCell(nextMarker() + "\n" + nestedText);
  return parentTable;
}
|
|
625
|
+
/**
 * Parses one section's XML into a list of content blocks.
 *
 * The DTD is stripped before parsing. Walk state (style map, warnings,
 * section number, nested-table counter) is bundled into one context object
 * and passed to the section walker.
 *
 * @param {string} xml - Raw section XML.
 * @param {object} [styleMap] - Style lookup forwarded to the walker.
 * @param {Array<object>} [warnings] - Warning sink shared with the XML parser.
 * @param {number} [sectionNum] - 1-based section number used as the page number.
 * @param {{ count: number }} [counter] - Nested-table counter shared across sections.
 * @returns {Array<object>} Blocks found in the section; empty when there is no root element.
 */
function parseSectionXml(xml, styleMap, warnings, sectionNum, counter) {
  const xmlParser = createXmlParser(warnings);
  const root = xmlParser.parseFromString(stripDtd(xml), "text/xml").documentElement;
  const blocks = [];
  if (root) {
    walkSection(root, blocks, null, [], { styleMap, warnings, sectionNum, counter });
  }
  return blocks;
}
|
|
478
634
|
function extractImageRef(el) {
|
|
@@ -493,7 +649,7 @@ function extractImageRef(el) {
|
|
|
493
649
|
if (directRef) return directRef;
|
|
494
650
|
return null;
|
|
495
651
|
}
|
|
496
|
-
function walkSection(node, blocks, tableCtx, tableStack,
|
|
652
|
+
function walkSection(node, blocks, tableCtx, tableStack, ctx, depth = 0) {
|
|
497
653
|
if (depth > MAX_XML_DEPTH) return;
|
|
498
654
|
const children = node.childNodes;
|
|
499
655
|
if (!children) return;
|
|
@@ -506,23 +662,12 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
506
662
|
case "tbl": {
|
|
507
663
|
if (tableCtx) tableStack.push(tableCtx);
|
|
508
664
|
const newTable = { rows: [], currentRow: [], cell: null };
|
|
509
|
-
walkSection(el, blocks, newTable, tableStack,
|
|
665
|
+
walkSection(el, blocks, newTable, tableStack, ctx, depth + 1);
|
|
510
666
|
if (newTable.rows.length > 0) {
|
|
511
667
|
if (tableStack.length > 0) {
|
|
512
|
-
|
|
513
|
-
let nestedCols = 0;
|
|
514
|
-
for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
|
|
515
|
-
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
516
|
-
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
517
|
-
} else {
|
|
518
|
-
const nestedText = convertTableToText(newTable.rows);
|
|
519
|
-
if (parentTable.cell) {
|
|
520
|
-
parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
|
|
521
|
-
}
|
|
522
|
-
}
|
|
523
|
-
tableCtx = parentTable;
|
|
668
|
+
tableCtx = handleNestedTable(newTable, tableStack, blocks, ctx);
|
|
524
669
|
} else {
|
|
525
|
-
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
670
|
+
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: ctx.sectionNum });
|
|
526
671
|
tableCtx = null;
|
|
527
672
|
}
|
|
528
673
|
} else {
|
|
@@ -533,7 +678,7 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
533
678
|
case "tr":
|
|
534
679
|
if (tableCtx) {
|
|
535
680
|
tableCtx.currentRow = [];
|
|
536
|
-
walkSection(el, blocks, tableCtx, tableStack,
|
|
681
|
+
walkSection(el, blocks, tableCtx, tableStack, ctx, depth + 1);
|
|
537
682
|
if (tableCtx.currentRow.length > 0) tableCtx.rows.push(tableCtx.currentRow);
|
|
538
683
|
tableCtx.currentRow = [];
|
|
539
684
|
}
|
|
@@ -541,7 +686,7 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
541
686
|
case "tc":
|
|
542
687
|
if (tableCtx) {
|
|
543
688
|
tableCtx.cell = { text: "", colSpan: 1, rowSpan: 1 };
|
|
544
|
-
walkSection(el, blocks, tableCtx, tableStack,
|
|
689
|
+
walkSection(el, blocks, tableCtx, tableStack, ctx, depth + 1);
|
|
545
690
|
if (tableCtx.cell) {
|
|
546
691
|
tableCtx.currentRow.push(tableCtx.cell);
|
|
547
692
|
tableCtx.cell = null;
|
|
@@ -567,19 +712,19 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
567
712
|
}
|
|
568
713
|
break;
|
|
569
714
|
case "p": {
|
|
570
|
-
const { text, href, footnote, style } = extractParagraphInfo(el, styleMap);
|
|
715
|
+
const { text, href, footnote, style } = extractParagraphInfo(el, ctx.styleMap);
|
|
571
716
|
if (text) {
|
|
572
717
|
if (tableCtx?.cell) {
|
|
573
718
|
tableCtx.cell.text += (tableCtx.cell.text ? "\n" : "") + text;
|
|
574
719
|
} else if (!tableCtx) {
|
|
575
|
-
const block = { type: "paragraph", text, pageNumber: sectionNum };
|
|
720
|
+
const block = { type: "paragraph", text, pageNumber: ctx.sectionNum };
|
|
576
721
|
if (style) block.style = style;
|
|
577
722
|
if (href) block.href = href;
|
|
578
723
|
if (footnote) block.footnoteText = footnote;
|
|
579
724
|
blocks.push(block);
|
|
580
725
|
}
|
|
581
726
|
}
|
|
582
|
-
tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack,
|
|
727
|
+
tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, ctx, depth + 1);
|
|
583
728
|
break;
|
|
584
729
|
}
|
|
585
730
|
// 이미지/그림 — 경로 추출 또는 경고
|
|
@@ -588,19 +733,19 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
588
733
|
case "drawingObject": {
|
|
589
734
|
const imgRef = extractImageRef(el);
|
|
590
735
|
if (imgRef) {
|
|
591
|
-
blocks.push({ type: "image", text: imgRef, pageNumber: sectionNum });
|
|
592
|
-
} else if (warnings && sectionNum) {
|
|
593
|
-
warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
|
|
736
|
+
blocks.push({ type: "image", text: imgRef, pageNumber: ctx.sectionNum });
|
|
737
|
+
} else if (ctx.warnings && ctx.sectionNum) {
|
|
738
|
+
ctx.warnings.push({ page: ctx.sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
|
|
594
739
|
}
|
|
595
740
|
break;
|
|
596
741
|
}
|
|
597
742
|
default:
|
|
598
|
-
walkSection(el, blocks, tableCtx, tableStack,
|
|
743
|
+
walkSection(el, blocks, tableCtx, tableStack, ctx, depth + 1);
|
|
599
744
|
break;
|
|
600
745
|
}
|
|
601
746
|
}
|
|
602
747
|
}
|
|
603
|
-
function walkParagraphChildren(node, blocks, tableCtx, tableStack,
|
|
748
|
+
function walkParagraphChildren(node, blocks, tableCtx, tableStack, ctx, depth = 0) {
|
|
604
749
|
if (depth > MAX_XML_DEPTH) return tableCtx;
|
|
605
750
|
const children = node.childNodes;
|
|
606
751
|
if (!children) return tableCtx;
|
|
@@ -616,23 +761,12 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
616
761
|
if (localTag === "tbl") {
|
|
617
762
|
if (tableCtx) tableStack.push(tableCtx);
|
|
618
763
|
const newTable = { rows: [], currentRow: [], cell: null };
|
|
619
|
-
walkSection(el, blocks, newTable, tableStack,
|
|
764
|
+
walkSection(el, blocks, newTable, tableStack, ctx, d + 1);
|
|
620
765
|
if (newTable.rows.length > 0) {
|
|
621
766
|
if (tableStack.length > 0) {
|
|
622
|
-
|
|
623
|
-
let nestedCols = 0;
|
|
624
|
-
for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
|
|
625
|
-
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
626
|
-
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
627
|
-
} else {
|
|
628
|
-
const nestedText = convertTableToText(newTable.rows);
|
|
629
|
-
if (parentTable.cell) {
|
|
630
|
-
parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
|
|
631
|
-
}
|
|
632
|
-
}
|
|
633
|
-
tableCtx = parentTable;
|
|
767
|
+
tableCtx = handleNestedTable(newTable, tableStack, blocks, ctx);
|
|
634
768
|
} else {
|
|
635
|
-
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
769
|
+
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: ctx.sectionNum });
|
|
636
770
|
tableCtx = null;
|
|
637
771
|
}
|
|
638
772
|
} else {
|
|
@@ -641,21 +775,21 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
641
775
|
} else if (localTag === "pic" || localTag === "shape" || localTag === "drawingObject") {
|
|
642
776
|
const drawTextChild = findDescendant(el, "drawText");
|
|
643
777
|
if (drawTextChild) {
|
|
644
|
-
extractDrawTextBlocks(drawTextChild, blocks, styleMap, sectionNum);
|
|
778
|
+
extractDrawTextBlocks(drawTextChild, blocks, ctx.styleMap, ctx.sectionNum);
|
|
645
779
|
} else {
|
|
646
780
|
const imgRef = extractImageRef(el);
|
|
647
781
|
if (imgRef) {
|
|
648
|
-
blocks.push({ type: "image", text: imgRef, pageNumber: sectionNum });
|
|
649
|
-
} else if (warnings && sectionNum) {
|
|
650
|
-
warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
|
|
782
|
+
blocks.push({ type: "image", text: imgRef, pageNumber: ctx.sectionNum });
|
|
783
|
+
} else if (ctx.warnings && ctx.sectionNum) {
|
|
784
|
+
ctx.warnings.push({ page: ctx.sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
|
|
651
785
|
}
|
|
652
786
|
}
|
|
653
787
|
} else if (localTag === "drawText") {
|
|
654
|
-
extractDrawTextBlocks(el, blocks, styleMap, sectionNum);
|
|
788
|
+
extractDrawTextBlocks(el, blocks, ctx.styleMap, ctx.sectionNum);
|
|
655
789
|
} else if (localTag === "r" || localTag === "run" || localTag === "ctrl" || localTag === "rect" || localTag === "ellipse" || localTag === "polygon" || localTag === "line" || localTag === "arc" || localTag === "curve" || localTag === "connectLine" || localTag === "container") {
|
|
656
790
|
walkChildren(el, d + 1);
|
|
657
791
|
} else if (localTag === "run") {
|
|
658
|
-
tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack,
|
|
792
|
+
tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, ctx, depth + 1);
|
|
659
793
|
}
|
|
660
794
|
}
|
|
661
795
|
};
|
|
@@ -1928,6 +2062,7 @@ function parseHwp5Document(buffer, options) {
|
|
|
1928
2062
|
const pageFilter = options?.pages ? parsePageRange(options.pages, sections.length) : null;
|
|
1929
2063
|
const totalTarget = pageFilter ? pageFilter.size : sections.length;
|
|
1930
2064
|
const blocks = [];
|
|
2065
|
+
const nestedTableCounter = { count: 0 };
|
|
1931
2066
|
let totalDecompressed = 0;
|
|
1932
2067
|
let parsedSections = 0;
|
|
1933
2068
|
for (let si = 0; si < sections.length; si++) {
|
|
@@ -1938,7 +2073,7 @@ function parseHwp5Document(buffer, options) {
|
|
|
1938
2073
|
totalDecompressed += data.length;
|
|
1939
2074
|
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
1940
2075
|
const records = readRecords(data);
|
|
1941
|
-
const sectionBlocks = parseSection(records, docInfo, warnings, si + 1);
|
|
2076
|
+
const sectionBlocks = parseSection(records, docInfo, warnings, si + 1, nestedTableCounter);
|
|
1942
2077
|
blocks.push(...sectionBlocks);
|
|
1943
2078
|
parsedSections++;
|
|
1944
2079
|
options?.onProgress?.(parsedSections, totalTarget);
|
|
@@ -2258,13 +2393,13 @@ function extractHwp5ImagesLenient(lcfb, blocks, compressed, warnings) {
|
|
|
2258
2393
|
}
|
|
2259
2394
|
return images;
|
|
2260
2395
|
}
|
|
2261
|
-
function parseSection(records, docInfo, warnings, sectionNum) {
|
|
2396
|
+
function parseSection(records, docInfo, warnings, sectionNum, counter) {
|
|
2262
2397
|
const blocks = [];
|
|
2263
2398
|
let i = 0;
|
|
2264
2399
|
while (i < records.length) {
|
|
2265
2400
|
const rec = records[i];
|
|
2266
2401
|
if (rec.tagId === TAG_PARA_HEADER && rec.level === 0) {
|
|
2267
|
-
const { paragraph, tables, nextIdx, charShapeIds, paraShapeId } = parseParagraphWithTables(records, i);
|
|
2402
|
+
const { paragraph, tables, nextIdx, charShapeIds, paraShapeId } = parseParagraphWithTables(records, i, counter);
|
|
2268
2403
|
if (paragraph) {
|
|
2269
2404
|
const block = { type: "paragraph", text: paragraph, pageNumber: sectionNum };
|
|
2270
2405
|
if (docInfo && charShapeIds.length > 0) {
|
|
@@ -2287,7 +2422,7 @@ function parseSection(records, docInfo, warnings, sectionNum) {
|
|
|
2287
2422
|
if (rec.tagId === TAG_CTRL_HEADER && rec.level <= 1 && rec.data.length >= 4) {
|
|
2288
2423
|
const ctrlId = rec.data.subarray(0, 4).toString("ascii");
|
|
2289
2424
|
if (ctrlId === " lbt" || ctrlId === "tbl ") {
|
|
2290
|
-
const { table, nextIdx } = parseTableBlock(records, i);
|
|
2425
|
+
const { table, nextIdx } = parseTableBlock(records, i, counter);
|
|
2291
2426
|
if (table) blocks.push({ type: "table", table, pageNumber: sectionNum });
|
|
2292
2427
|
i = nextIdx;
|
|
2293
2428
|
continue;
|
|
@@ -2392,7 +2527,7 @@ function resolveCharStyle(charShapeIds, docInfo) {
|
|
|
2392
2527
|
if (cs.attrFlags & 2) style.bold = true;
|
|
2393
2528
|
return style.fontSize || style.bold || style.italic ? style : void 0;
|
|
2394
2529
|
}
|
|
2395
|
-
function parseParagraphWithTables(records, startIdx) {
|
|
2530
|
+
function parseParagraphWithTables(records, startIdx, counter) {
|
|
2396
2531
|
const startLevel = records[startIdx].level;
|
|
2397
2532
|
let text = "";
|
|
2398
2533
|
const tables = [];
|
|
@@ -2414,7 +2549,7 @@ function parseParagraphWithTables(records, startIdx) {
|
|
|
2414
2549
|
if (rec.tagId === TAG_CTRL_HEADER && rec.data.length >= 4) {
|
|
2415
2550
|
const ctrlId = rec.data.subarray(0, 4).toString("ascii");
|
|
2416
2551
|
if (ctrlId === " lbt" || ctrlId === "tbl ") {
|
|
2417
|
-
const { table, nextIdx } = parseTableBlock(records, i);
|
|
2552
|
+
const { table, nextIdx } = parseTableBlock(records, i, counter);
|
|
2418
2553
|
if (table) tables.push(table);
|
|
2419
2554
|
i = nextIdx;
|
|
2420
2555
|
continue;
|
|
@@ -2425,7 +2560,7 @@ function parseParagraphWithTables(records, startIdx) {
|
|
|
2425
2560
|
const trimmed = text.trim();
|
|
2426
2561
|
return { paragraph: trimmed || null, tables, nextIdx: i, charShapeIds, paraShapeId };
|
|
2427
2562
|
}
|
|
2428
|
-
function parseTableBlock(records, startIdx) {
|
|
2563
|
+
function parseTableBlock(records, startIdx, counter) {
|
|
2429
2564
|
const tableLevel = records[startIdx].level;
|
|
2430
2565
|
let i = startIdx + 1;
|
|
2431
2566
|
let rows = 0, cols = 0;
|
|
@@ -2439,7 +2574,7 @@ function parseTableBlock(records, startIdx) {
|
|
|
2439
2574
|
cols = Math.min(rec.data.readUInt16LE(6), MAX_COLS);
|
|
2440
2575
|
}
|
|
2441
2576
|
if (rec.tagId === TAG_LIST_HEADER) {
|
|
2442
|
-
const { cell, nextIdx } = parseCellBlock(records, i, tableLevel);
|
|
2577
|
+
const { cell, nextIdx } = parseCellBlock(records, i, tableLevel, counter);
|
|
2443
2578
|
if (cell) cells.push(cell);
|
|
2444
2579
|
i = nextIdx;
|
|
2445
2580
|
continue;
|
|
@@ -2460,7 +2595,7 @@ function parseTableBlock(records, startIdx) {
|
|
|
2460
2595
|
const cellRows = arrangeCells(rows, cols, cells);
|
|
2461
2596
|
return { table: buildTable(cellRows), nextIdx: i };
|
|
2462
2597
|
}
|
|
2463
|
-
function parseCellBlock(records, startIdx, tableLevel) {
|
|
2598
|
+
function parseCellBlock(records, startIdx, tableLevel, counter) {
|
|
2464
2599
|
const rec = records[startIdx];
|
|
2465
2600
|
const cellLevel = rec.level;
|
|
2466
2601
|
const texts = [];
|
|
@@ -2485,6 +2620,17 @@ function parseCellBlock(records, startIdx, tableLevel) {
|
|
|
2485
2620
|
const t = extractText(r.data).trim();
|
|
2486
2621
|
if (t) texts.push(t);
|
|
2487
2622
|
}
|
|
2623
|
+
if (r.tagId === TAG_CTRL_HEADER && r.data.length >= 4) {
|
|
2624
|
+
const ctrlId = r.data.subarray(0, 4).toString("ascii");
|
|
2625
|
+
if (ctrlId === " lbt" || ctrlId === "tbl ") {
|
|
2626
|
+
if (counter) {
|
|
2627
|
+
counter.count++;
|
|
2628
|
+
texts.push(`[\uC911\uCCA9 \uD14C\uC774\uBE14 #${counter.count}]`);
|
|
2629
|
+
} else {
|
|
2630
|
+
texts.push("[\uC911\uCCA9 \uD14C\uC774\uBE14]");
|
|
2631
|
+
}
|
|
2632
|
+
}
|
|
2633
|
+
}
|
|
2488
2634
|
i++;
|
|
2489
2635
|
}
|
|
2490
2636
|
return { cell: { text: texts.join("\n"), colSpan, rowSpan, colAddr, rowAddr }, nextIdx: i };
|
|
@@ -2811,21 +2957,21 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
2811
2957
|
import JSZip4 from "jszip";
|
|
2812
2958
|
import { DOMParser as DOMParser3 } from "@xmldom/xmldom";
|
|
2813
2959
|
var MAX_DECOMPRESS_SIZE4 = 100 * 1024 * 1024;
|
|
2814
|
-
function getChildElements(parent,
|
|
2960
|
+
/**
 * Collects the direct element children of `parent` that carry the given name.
 *
 * A child matches when its `localName` equals the name, or when its
 * `tagName` ends with ":" followed by the name.
 *
 * @param {{ childNodes: ArrayLike<object> }} parent - Node whose children are scanned.
 * @param {string} localName3 - Unprefixed element name to look for.
 * @returns {Array<object>} Matching child elements in document order.
 */
function getChildElements(parent, localName3) {
  const prefixedSuffix = `:${localName3}`;
  const matches = [];
  const { childNodes } = parent;
  for (let idx = 0; idx < childNodes.length; idx += 1) {
    const candidate = childNodes[idx];
    // nodeType 1 is an element node; every other node type is ignored.
    if (candidate.nodeType !== 1) continue;
    const nameMatches = candidate.localName === localName3 || candidate.tagName?.endsWith(prefixedSuffix);
    if (nameMatches) matches.push(candidate);
  }
  return matches;
}
|
|
2828
|
-
function findElements(parent,
|
|
2974
|
+
function findElements(parent, localName3) {
|
|
2829
2975
|
const result = [];
|
|
2830
2976
|
const walk = (node) => {
|
|
2831
2977
|
const children = node.childNodes;
|
|
@@ -2833,7 +2979,7 @@ function findElements(parent, localName2) {
|
|
|
2833
2979
|
const child = children[i];
|
|
2834
2980
|
if (child.nodeType === 1) {
|
|
2835
2981
|
const el = child;
|
|
2836
|
-
if (el.localName ===
|
|
2982
|
+
if (el.localName === localName3 || el.tagName?.endsWith(`:${localName3}`)) {
|
|
2837
2983
|
result.push(el);
|
|
2838
2984
|
}
|
|
2839
2985
|
walk(el);
|
|
@@ -2843,11 +2989,11 @@ function findElements(parent, localName2) {
|
|
|
2843
2989
|
walk(parent);
|
|
2844
2990
|
return result;
|
|
2845
2991
|
}
|
|
2846
|
-
function getAttr(el,
|
|
2992
|
+
/**
 * Reads an attribute value from an element by name.
 *
 * Attributes are scanned in order; the first one whose `localName` or full
 * `name` equals the requested name supplies the result.
 *
 * @param {{ attributes: ArrayLike<{ localName?: string, name?: string, value: string }> }} el - Element to inspect.
 * @param {string} localName3 - Attribute name to look for.
 * @returns {string|null} The attribute value, or `null` when nothing matches.
 */
function getAttr(el, localName3) {
  // The attribute collection is array-like, so Array.prototype.find is borrowed.
  const hit = Array.prototype.find.call(
    el.attributes,
    (candidate) => candidate.localName === localName3 || candidate.name === localName3
  );
  return hit ? hit.value : null;
}
|
|
@@ -3194,11 +3340,11 @@ async function parseDocxDocument(buffer, options) {
|
|
|
3194
3340
|
const node = children[i];
|
|
3195
3341
|
if (node.nodeType !== 1) continue;
|
|
3196
3342
|
const el = node;
|
|
3197
|
-
const
|
|
3198
|
-
if (
|
|
3343
|
+
const localName3 = el.localName ?? el.tagName?.split(":").pop();
|
|
3344
|
+
if (localName3 === "p") {
|
|
3199
3345
|
const block = parseParagraph(el, styles, numbering, footnotes, rels);
|
|
3200
3346
|
if (block) blocks.push(block);
|
|
3201
|
-
} else if (
|
|
3347
|
+
} else if (localName3 === "tbl") {
|
|
3202
3348
|
const block = parseTable(el, styles, numbering, footnotes, rels);
|
|
3203
3349
|
if (block) blocks.push(block);
|
|
3204
3350
|
}
|
|
@@ -3236,6 +3382,259 @@ async function parseDocxDocument(buffer, options) {
|
|
|
3236
3382
|
};
|
|
3237
3383
|
}
|
|
3238
3384
|
|
|
3385
|
+
// src/hwpml/parser.ts
|
|
3386
|
+
import { DOMParser as DOMParser4 } from "@xmldom/xmldom";
|
|
3387
|
+
// Limits applied while parsing HWPML input.

// Deepest element nesting the recursive content/text walkers will descend into.
var MAX_XML_DEPTH2 = 200;

// Largest declared table dimensions accepted; bigger tables are skipped with a warning.
var MAX_TABLE_ROWS = 5000;
var MAX_TABLE_COLS = 500;

// Largest input buffer parseHwpmlDocument accepts: 50 MiB.
var MAX_HWPML_BYTES = 50 * (1024 * 1024);
|
|
3391
|
+
/**
 * Parses an HWPML (XML) document into markdown plus structured blocks.
 *
 * Steps, in order:
 *   1. Reject buffers larger than MAX_HWPML_BYTES.
 *   2. Decode as UTF-8, drop a leading BOM, normalise `&nbsp;`, strip the DTD.
 *   3. Parse with xmldom; parser diagnostics are collected as warnings rather
 *      than thrown.
 *   4. Read TITLE / AUTHOR / DATE from the root's DOCSUMMARY child.
 *   5. Walk each SECTION child of BODY, emitting heading / paragraph / table
 *      blocks. Sections are numbered from 1 and that number is used as the
 *      "page" for `options.pages` filtering and for each block's `pageNumber`.
 *
 * @param {ArrayBuffer|ArrayBufferView} buffer - Raw file contents.
 * @param {{ pages?: unknown }} [options] - Optional settings; only `pages` is
 *   read here and is handed to `parsePageRange` together with the section count.
 * @returns {{ markdown: string, blocks: object[], metadata?: object, outline?: object[], warnings?: object[] }}
 *   On the normal path `metadata`, `outline` and `warnings` are `undefined`
 *   when empty. The two early returns (no document element / no BODY) return
 *   the raw `warnings` array, possibly empty.
 * @throws {Error} When the buffer exceeds MAX_HWPML_BYTES.
 */
function parseHwpmlDocument(buffer, options) {
  if (buffer.byteLength > MAX_HWPML_BYTES) {
    throw new Error(`HWPML \uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC (${(buffer.byteLength / 1024 / 1024).toFixed(1)}MB > 50MB)`);
  }
  const text = new TextDecoder("utf-8").decode(buffer).replace(/^\uFEFF/, "");
  // `&nbsp;` is an HTML entity that XML does not predefine, so it is rewritten
  // to the equivalent numeric character reference before parsing.
  // NOTE(review): the line as previously rendered replaced a whitespace
  // character with a whitespace character (a no-op); this entity mapping is the
  // presumed intent — confirm against src/hwpml/parser.ts.
  const normalized = text.replace(/&nbsp;/g, "&#160;");
  const xml = stripDtd(normalized);

  const warnings = [];
  const parser = new DOMParser4({
    // Malformed XML is reported as a warning so that whatever did parse can
    // still be returned.
    onError: (_level, msg) => {
      warnings.push({ message: `HWPML XML \uD30C\uC2F1 \uACBD\uACE0: ${msg}`, code: "MALFORMED_XML" });
    }
  });
  const doc = parser.parseFromString(xml, "text/xml");
  if (!doc.documentElement) {
    return { markdown: "", blocks: [], warnings };
  }
  const root = doc.documentElement;

  // Document summary -> metadata.
  const metadata = {};
  const docSummary = findChild(root, "DOCSUMMARY");
  if (docSummary) {
    const title = findChild(docSummary, "TITLE");
    const author = findChild(docSummary, "AUTHOR");
    const date = findChild(docSummary, "DATE");
    if (title) metadata.title = textContent(title).trim();
    if (author) metadata.author = textContent(author).trim();
    // An empty DATE element yields `undefined` rather than "".
    if (date) metadata.createdAt = textContent(date).trim() || void 0;
  }

  const paraShapeMap = buildParaShapeMap(root);
  const body = findChild(root, "BODY");
  if (!body) {
    return { markdown: "", blocks: [], metadata, warnings };
  }

  const blocks = [];
  const pageFilter = options?.pages ? parsePageRange(options.pages, countSections(body)) : null;
  let sectionIdx = 0;
  const children = body.childNodes;
  for (let i = 0; i < children.length; i++) {
    const el = children[i];
    if (el.nodeType !== 1) continue;
    if (localName(el) !== "SECTION") continue;
    // Count every section, including filtered-out ones, so numbering stays
    // stable regardless of the requested page range.
    sectionIdx++;
    if (pageFilter && !pageFilter.has(sectionIdx)) continue;
    parseSection2(el, blocks, paraShapeMap, sectionIdx, warnings);
  }

  const outline = blocks
    .filter((b) => b.type === "heading" && b.text)
    .map((b) => ({ level: b.level ?? 1, text: b.text, pageNumber: b.pageNumber }));
  const markdown = blocksToMarkdown(blocks);
  return {
    markdown,
    blocks,
    metadata: Object.keys(metadata).length > 0 ? metadata : void 0,
    outline: outline.length > 0 ? outline : void 0,
    warnings: warnings.length > 0 ? warnings : void 0
  };
}
|
|
3446
|
+
/**
 * Builds a lookup from paragraph-shape id to heading information.
 *
 * Follows root -> HEAD -> MAPPINGTABLE -> PARASHAPELIST and records one entry
 * per PARASHAPE child, keyed by its `Id` attribute ("" when absent). A shape
 * whose `HeadingType` is "Outline" gets `headingLevel = Level + 1`, clamped to
 * 1..6 (a missing, non-numeric or negative `Level` counts as 0); every other
 * shape gets `headingLevel: null`.
 *
 * @param {object} root - Document element of the parsed HWPML file.
 * @returns {Map<string, { headingLevel: number|null }>} Empty when any link of
 *   the HEAD/MAPPINGTABLE/PARASHAPELIST chain is missing.
 */
function buildParaShapeMap(root) {
  const shapes = new Map();

  // Descend the fixed container path, bailing out at the first missing link.
  let container = root;
  for (const tag of ["HEAD", "MAPPINGTABLE", "PARASHAPELIST"]) {
    container = findChild(container, tag);
    if (!container) return shapes;
  }

  const entries = container.childNodes;
  for (let idx = 0; idx < entries.length; idx++) {
    const shape = entries[idx];
    if (shape.nodeType !== 1 || localName(shape) !== "PARASHAPE") continue;

    const id = shape.getAttribute("Id") ?? "";
    const headingType = shape.getAttribute("HeadingType") ?? "None";
    const level = parseInt(shape.getAttribute("Level") ?? "0", 10);

    const isOutline = headingType === "Outline";
    const headingLevel = isOutline
      ? Math.min((isNaN(level) ? 0 : Math.max(0, level)) + 1, 6)
      : null;
    shapes.set(id, { headingLevel });
  }
  return shapes;
}
|
|
3470
|
+
/**
 * Parses one SECTION element by walking its content outside any header/footer.
 *
 * @param {object} section - The SECTION element.
 * @param {object[]} blocks - Output array that parsed blocks are appended to.
 * @param {Map<string, object>} paraShapeMap - Paragraph-shape lookup passed through to the walker.
 * @param {number} sectionNum - Section number passed through to the walker.
 * @param {object[]} warnings - Output array for parse warnings.
 * @returns {void}
 */
function parseSection2(section, blocks, paraShapeMap, sectionNum, warnings) {
  const inHeaderFooter = false;
  walkContent(section, blocks, paraShapeMap, sectionNum, warnings, inHeaderFooter);
}
|
|
3473
|
+
/**
 * Depth-first walk over `node`'s element children, dispatching on tag name.
 *
 *   - HEADER / FOOTER: skipped together with their whole subtree.
 *   - P: handed to parseParagraph2 (unless `inHeaderFooter`); not descended into.
 *   - TABLE: handed to parseTable2 (unless `inHeaderFooter`); not descended into.
 *   - anything else (PARALIST, SECTION, COLDEF, ...): descended into.
 *
 * @param {object} node - Element whose children are visited.
 * @param {object[]} blocks - Output array for parsed blocks.
 * @param {Map<string, object>} paraShapeMap - Paragraph-shape lookup.
 * @param {number} sectionNum - Section number forwarded to the block parsers.
 * @param {object[]} warnings - Output array for warnings (used by table parsing).
 * @param {boolean} inHeaderFooter - When true, P and TABLE elements are ignored.
 * @param {number} [depth=0] - Current recursion depth; the walk stops silently
 *   once it exceeds MAX_XML_DEPTH2.
 * @returns {void}
 */
function walkContent(node, blocks, paraShapeMap, sectionNum, warnings, inHeaderFooter, depth = 0) {
  if (depth > MAX_XML_DEPTH2) return;
  const kids = node.childNodes;
  for (let idx = 0; idx < kids.length; idx++) {
    const child = kids[idx];
    if (child.nodeType !== 1) continue;
    switch (localName(child)) {
      case "HEADER":
      case "FOOTER":
        // Running headers/footers are not part of the body text.
        break;
      case "P":
        if (!inHeaderFooter) parseParagraph2(child, blocks, paraShapeMap, sectionNum);
        break;
      case "TABLE":
        if (!inHeaderFooter) parseTable2(child, blocks, paraShapeMap, sectionNum, warnings);
        break;
      default:
        // Containers and unknown wrappers alike: keep looking inside.
        walkContent(child, blocks, paraShapeMap, sectionNum, warnings, inHeaderFooter, depth + 1);
    }
  }
}
|
|
3502
|
+
/**
 * Converts a P element into a heading or paragraph block.
 *
 * The element's `ParaShape` attribute selects an entry in `paraShapeMap`; when
 * that entry carries a non-null `headingLevel` the block is a heading at that
 * level, otherwise a plain paragraph. Paragraphs with no text are dropped.
 *
 * @param {object} el - The P element.
 * @param {object[]} blocks - Output array; at most one block is appended.
 * @param {Map<string, { headingLevel: number|null }>} paraShapeMap - Shape lookup.
 * @param {number} sectionNum - Stored on the block as `pageNumber`.
 * @returns {void}
 */
function parseParagraph2(el, blocks, paraShapeMap, sectionNum) {
  const shapeId = el.getAttribute("ParaShape") ?? "";
  const shape = paraShapeMap.get(shapeId);
  const text = extractParagraphText(el);
  if (!text) return;

  const level = shape?.headingLevel;
  const block = level != null
    ? { type: "heading", text, level, pageNumber: sectionNum }
    : { type: "paragraph", text, pageNumber: sectionNum };
  blocks.push(block);
}
|
|
3513
|
+
/**
 * Returns the visible text of a paragraph element.
 *
 * Text fragments gathered by collectCharText are concatenated without a
 * separator and the result is trimmed.
 *
 * @param {object} p - The paragraph element.
 * @returns {string} Trimmed paragraph text; "" when the paragraph has none.
 */
function extractParagraphText(p) {
  const fragments = [];
  collectCharText(p, fragments);
  const joined = fragments.join("");
  return joined.trim();
}
|
|
3518
|
+
/**
 * Appends the text of every CHAR element beneath `node` to `parts`.
 *
 * TABLE, PICTURE, SHAPEOBJECT and AUTONUM subtrees are skipped entirely, so
 * their text never leaks into the surrounding paragraph. Any other element is
 * searched recursively.
 *
 * @param {object} node - Element whose descendants are scanned.
 * @param {string[]} parts - Output array; non-empty CHAR texts are pushed in document order.
 * @param {number} [depth=0] - Current recursion depth; scanning stops silently
 *   once it exceeds MAX_XML_DEPTH2.
 * @returns {void}
 */
function collectCharText(node, parts, depth = 0) {
  if (depth > MAX_XML_DEPTH2) return;
  const kids = node.childNodes;
  for (let idx = 0; idx < kids.length; idx++) {
    const child = kids[idx];
    if (child.nodeType !== 1) continue;
    switch (localName(child)) {
      case "CHAR": {
        const chunk = textContent(child);
        if (chunk) parts.push(chunk);
        break;
      }
      case "TABLE":
      case "PICTURE":
      case "SHAPEOBJECT":
      case "AUTONUM":
        // Embedded objects and auto-numbering contribute no paragraph text.
        break;
      default:
        collectCharText(child, parts, depth + 1);
    }
  }
}
|
|
3535
|
+
/**
 * Converts a TABLE element into a table block.
 *
 * The table's declared `RowCount` x `ColCount` defines a grid. Each
 * ROW > CELL child is placed at its (`RowAddr`, `ColAddr`) position; the extra
 * positions covered by its `ColSpan` / `RowSpan` are filled with empty
 * placeholder cells, and any position left unfilled becomes an empty cell.
 * The finished grid is passed to `buildTable`.
 *
 * Nothing is emitted when the declared size is missing, non-numeric or not
 * positive, or when the table has no CELL elements. Tables larger than
 * MAX_TABLE_ROWS x MAX_TABLE_COLS are skipped with a TRUNCATED_TABLE warning.
 * Cells whose address is non-numeric or falls outside the grid (including
 * negative addresses) are ignored.
 *
 * @param {object} el - The TABLE element.
 * @param {object[]} blocks - Output array; at most one `{ type: "table" }` block is appended.
 * @param {Map<string, object>} paraShapeMap - Unused here; kept for a uniform block-parser signature.
 * @param {number} sectionNum - Stored on the block as `pageNumber`.
 * @param {object[]} warnings - Output array for the oversize warning.
 * @returns {void}
 */
function parseTable2(el, blocks, paraShapeMap, sectionNum, warnings) {
  const rowCount = parseInt(el.getAttribute("RowCount") ?? "0", 10);
  const colCount = parseInt(el.getAttribute("ColCount") ?? "0", 10);
  // `!(x > 0)` rejects NaN, zero AND negative sizes. A negative size used to
  // slip through and produce an empty table block.
  if (!(rowCount > 0) || !(colCount > 0)) return;
  if (rowCount > MAX_TABLE_ROWS || colCount > MAX_TABLE_COLS) {
    warnings.push({ message: `\uD14C\uC774\uBE14 \uD06C\uAE30 \uCD08\uACFC (${rowCount}x${colCount}) \u2014 \uC2A4\uD0B5`, code: "TRUNCATED_TABLE" });
    return;
  }

  // Pass 1: read every ROW > CELL with its address and (clamped) spans.
  const cells = [];
  const children = el.childNodes;
  for (let i = 0; i < children.length; i++) {
    const rowEl = children[i];
    if (rowEl.nodeType !== 1 || localName(rowEl) !== "ROW") continue;
    const rowCells = rowEl.childNodes;
    for (let j = 0; j < rowCells.length; j++) {
      const cellEl = rowCells[j];
      if (cellEl.nodeType !== 1 || localName(cellEl) !== "CELL") continue;
      const colAddr = parseInt(cellEl.getAttribute("ColAddr") ?? "0", 10);
      const rowAddr = parseInt(cellEl.getAttribute("RowAddr") ?? "0", 10);
      // Spans are forced into [1, MAX]; a missing, zero or non-numeric span is 1.
      const colSpan = Math.min(Math.max(1, parseInt(cellEl.getAttribute("ColSpan") ?? "1", 10) || 1), MAX_TABLE_COLS);
      const rowSpan = Math.min(Math.max(1, parseInt(cellEl.getAttribute("RowSpan") ?? "1", 10) || 1), MAX_TABLE_ROWS);
      const cellText = extractCellText(cellEl);
      cells.push({ text: cellText, colSpan, rowSpan, colAddr, rowAddr });
    }
  }
  if (cells.length === 0) return;

  // Pass 2: place the cells on a rowCount x colCount grid.
  const grid = Array.from({ length: rowCount }, () => Array(colCount).fill(null));
  for (const cell of cells) {
    const r = cell.rowAddr;
    const c = cell.colAddr;
    // Written as a positive range test so NaN and negative addresses are
    // rejected too. A negative row address previously indexed `grid[-1]`
    // (undefined) and threw a TypeError that aborted the whole parse.
    const inGrid = r >= 0 && r < rowCount && c >= 0 && c < colCount;
    if (!inGrid) continue;
    grid[r][c] = cell;
    for (let dr = 0; dr < cell.rowSpan; dr++) {
      for (let dc = 0; dc < cell.colSpan; dc++) {
        if (dr === 0 && dc === 0) continue;
        // Spans that run past the declared grid are clipped.
        if (r + dr < rowCount && c + dc < colCount) {
          grid[r + dr][c + dc] = { text: "", colSpan: 1, rowSpan: 1 };
        }
      }
    }
  }

  const cellRows = grid.map(
    (row) => row.map((cell) => cell ?? { text: "", colSpan: 1, rowSpan: 1 })
  );
  const table = buildTable(cellRows);
  blocks.push({ type: "table", table, pageNumber: sectionNum });
}
|
|
3582
|
+
/**
 * Returns the text of a table cell, one line per non-empty paragraph.
 *
 * Fragments gathered by collectCellText are filtered for truthiness, joined
 * with "\n", and the result is trimmed.
 *
 * @param {object} cellEl - The CELL element.
 * @returns {string} Cell text; "" when the cell has no text.
 */
function extractCellText(cellEl) {
  const collected = [];
  collectCellText(cellEl, collected, 0);
  const nonEmpty = collected.filter(Boolean);
  return nonEmpty.join("\n").trim();
}
|
|
3587
|
+
/**
 * Gathers the text of a table cell, paragraph by paragraph.
 *
 * Each P descendant contributes its extracted text (when non-empty). A nested
 * TABLE is not expanded; a fixed placeholder string is pushed in its place.
 * Any other element is searched recursively.
 *
 * @param {object} node - Element whose descendants are scanned.
 * @param {string[]} parts - Output array of text fragments in document order.
 * @param {number} depth - Current recursion depth; scanning stops silently past 20.
 * @returns {void}
 */
function collectCellText(node, parts, depth) {
  if (depth > 20) return;
  const kids = node.childNodes;
  for (let idx = 0; idx < kids.length; idx++) {
    const child = kids[idx];
    if (child.nodeType !== 1) continue;
    switch (localName(child)) {
      case "P": {
        const paragraph = extractParagraphText(child);
        if (paragraph) parts.push(paragraph);
        break;
      }
      case "TABLE":
        parts.push("[\uC911\uCCA9 \uD14C\uC774\uBE14]");
        break;
      default:
        collectCellText(child, parts, depth + 1);
    }
  }
}
|
|
3604
|
+
/**
 * Returns an element's name with any namespace prefix removed.
 *
 * Uses `tagName`, falling back to `localName`, then to "". Only the first
 * `prefix:` segment is stripped.
 *
 * @param {{ tagName?: string, localName?: string }} el - Element to name.
 * @returns {string} Unprefixed name, or "" when the element has no name.
 */
function localName(el) {
  const qualified = el.tagName || el.localName || "";
  return qualified.replace(/^[^:]+:/, "");
}
|
|
3607
|
+
/**
 * Finds the first direct element child of `parent` with the given name.
 *
 * @param {{ childNodes: ArrayLike<any> }} parent - Node whose children are scanned (not its descendants).
 * @param {string} tag - Unprefixed element name, compared case-sensitively.
 * @returns {object|null} The first matching child element, or `null` when there is none.
 */
function findChild(parent, tag) {
  const kids = parent.childNodes;
  for (let idx = 0; idx < kids.length; idx++) {
    const candidate = kids[idx];
    // nodeType 1 === ELEMENT_NODE.
    if (candidate.nodeType !== 1) continue;
    if (localName(candidate) === tag) {
      return candidate;
    }
  }
  return null;
}
|
|
3615
|
+
/**
 * Concatenates the text of `el` and all of its element descendants.
 *
 * Only text nodes (nodeType 3) and elements (nodeType 1) contribute; every
 * other node kind is ignored.
 *
 * @param {{ childNodes: ArrayLike<any> }} el - Element to read.
 * @returns {string} The combined text in document order; "" when there is none.
 */
function textContent(el) {
  let combined = "";
  const kids = el.childNodes;
  for (let idx = 0; idx < kids.length; idx++) {
    const child = kids[idx];
    switch (child.nodeType) {
      case 3:
        combined += child.nodeValue || "";
        break;
      case 1:
        combined += textContent(child);
        break;
    }
  }
  return combined;
}
|
|
3628
|
+
/**
 * Counts the SECTION elements that are direct children of `body`.
 *
 * @param {{ childNodes: ArrayLike<any> }} body - The BODY element.
 * @returns {number} Number of direct SECTION children (0 when there are none).
 */
function countSections(body) {
  const kids = body.childNodes;
  let total = 0;
  for (let idx = 0; idx < kids.length; idx++) {
    const candidate = kids[idx];
    const isSection = candidate.nodeType === 1 && localName(candidate) === "SECTION";
    total += isSection ? 1 : 0;
  }
  return total;
}
|
|
3637
|
+
|
|
3239
3638
|
// src/form/recognize.ts
|
|
3240
3639
|
var LABEL_KEYWORDS = /* @__PURE__ */ new Set([
|
|
3241
3640
|
"\uC131\uBA85",
|
|
@@ -3570,7 +3969,7 @@ function fillInlineFields(text, values, filled, matchedLabels) {
|
|
|
3570
3969
|
|
|
3571
3970
|
// src/form/filler-hwpx.ts
|
|
3572
3971
|
import JSZip5 from "jszip";
|
|
3573
|
-
import { DOMParser as
|
|
3972
|
+
import { DOMParser as DOMParser5, XMLSerializer } from "@xmldom/xmldom";
|
|
3574
3973
|
async function fillHwpx(hwpxBuffer, values) {
|
|
3575
3974
|
const zip = await JSZip5.loadAsync(hwpxBuffer);
|
|
3576
3975
|
const filled = [];
|
|
@@ -3580,7 +3979,7 @@ async function fillHwpx(hwpxBuffer, values) {
|
|
|
3580
3979
|
if (sectionFiles.length === 0) {
|
|
3581
3980
|
throw new KordocError("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
3582
3981
|
}
|
|
3583
|
-
const xmlParser = new
|
|
3982
|
+
const xmlParser = new DOMParser5();
|
|
3584
3983
|
const xmlSerializer = new XMLSerializer();
|
|
3585
3984
|
for (const sectionPath of sectionFiles) {
|
|
3586
3985
|
const zipEntry = zip.file(sectionPath);
|
|
@@ -3612,10 +4011,10 @@ async function fillHwpx(hwpxBuffer, values) {
|
|
|
3612
4011
|
const trEl = rows[rowIdx];
|
|
3613
4012
|
const cells = findDirectChildren(trEl, "tc");
|
|
3614
4013
|
for (let colIdx = 0; colIdx < cells.length - 1; colIdx++) {
|
|
3615
|
-
const labelText =
|
|
4014
|
+
const labelText = extractCellText2(cells[colIdx]);
|
|
3616
4015
|
if (!isLabelCell(labelText)) continue;
|
|
3617
4016
|
const valueCell = cells[colIdx + 1];
|
|
3618
|
-
const valueText =
|
|
4017
|
+
const valueText = extractCellText2(valueCell);
|
|
3619
4018
|
if (isKeywordLabel(valueText)) continue;
|
|
3620
4019
|
const normalizedCellLabel = normalizeLabel(labelText);
|
|
3621
4020
|
if (!normalizedCellLabel) continue;
|
|
@@ -3640,14 +4039,14 @@ async function fillHwpx(hwpxBuffer, values) {
|
|
|
3640
4039
|
if (rows.length >= 2) {
|
|
3641
4040
|
const headerCells = findDirectChildren(rows[0], "tc");
|
|
3642
4041
|
const allLabels = headerCells.every((cell) => {
|
|
3643
|
-
const t =
|
|
4042
|
+
const t = extractCellText2(cell).trim();
|
|
3644
4043
|
return t.length > 0 && t.length <= 20 && isLabelCell(t);
|
|
3645
4044
|
});
|
|
3646
4045
|
if (allLabels) {
|
|
3647
4046
|
for (let rowIdx = 1; rowIdx < rows.length; rowIdx++) {
|
|
3648
4047
|
const dataCells = findDirectChildren(rows[rowIdx], "tc");
|
|
3649
4048
|
for (let colIdx = 0; colIdx < Math.min(headerCells.length, dataCells.length); colIdx++) {
|
|
3650
|
-
const headerLabel = normalizeLabel(
|
|
4049
|
+
const headerLabel = normalizeLabel(extractCellText2(headerCells[colIdx]));
|
|
3651
4050
|
const matchKey = findMatchingKey(headerLabel, normalizedValues);
|
|
3652
4051
|
if (matchKey === void 0) continue;
|
|
3653
4052
|
if (matchedLabels.has(matchKey)) continue;
|
|
@@ -3655,7 +4054,7 @@ async function fillHwpx(hwpxBuffer, values) {
|
|
|
3655
4054
|
replaceCellText(dataCells[colIdx], newValue);
|
|
3656
4055
|
matchedLabels.add(matchKey);
|
|
3657
4056
|
filled.push({
|
|
3658
|
-
label:
|
|
4057
|
+
label: extractCellText2(headerCells[colIdx]).trim(),
|
|
3659
4058
|
value: newValue,
|
|
3660
4059
|
row: rowIdx,
|
|
3661
4060
|
col: colIdx
|
|
@@ -3697,7 +4096,7 @@ async function fillHwpx(hwpxBuffer, values) {
|
|
|
3697
4096
|
const buffer = await zip.generateAsync({ type: "arraybuffer" });
|
|
3698
4097
|
return { buffer, filled, unmatched };
|
|
3699
4098
|
}
|
|
3700
|
-
function
|
|
4099
|
+
/**
 * Returns an element's name with any namespace prefix removed.
 *
 * Uses `tagName`, falling back to `localName`, then to "". Only the first
 * `prefix:` segment is stripped.
 *
 * @param {{ tagName?: string, localName?: string }} el - Element to name.
 * @returns {string} Unprefixed name, or "" when the element has no name.
 */
function localName2(el) {
  const qualified = el.tagName || el.localName || "";
  return qualified.replace(/^[^:]+:/, "");
}
|
|
3703
4102
|
function findAllElements(node, tagLocalName) {
|
|
@@ -3708,7 +4107,7 @@ function findAllElements(node, tagLocalName) {
|
|
|
3708
4107
|
for (let i = 0; i < children.length; i++) {
|
|
3709
4108
|
const child = children[i];
|
|
3710
4109
|
if (child.nodeType !== 1) continue;
|
|
3711
|
-
if (
|
|
4110
|
+
if (localName2(child) === tagLocalName) result.push(child);
|
|
3712
4111
|
walk(child);
|
|
3713
4112
|
}
|
|
3714
4113
|
};
|
|
@@ -3721,7 +4120,7 @@ function findDirectChildren(parent, tagLocalName) {
|
|
|
3721
4120
|
if (!children) return result;
|
|
3722
4121
|
for (let i = 0; i < children.length; i++) {
|
|
3723
4122
|
const child = children[i];
|
|
3724
|
-
if (child.nodeType === 1 &&
|
|
4123
|
+
if (child.nodeType === 1 && localName2(child) === tagLocalName) {
|
|
3725
4124
|
result.push(child);
|
|
3726
4125
|
}
|
|
3727
4126
|
}
|
|
@@ -3730,12 +4129,12 @@ function findDirectChildren(parent, tagLocalName) {
|
|
|
3730
4129
|
/**
 * Reports whether `el` has a `tbl` element among its ancestors.
 *
 * The element itself is not tested, only its `parentNode` chain.
 *
 * @param {{ parentNode?: object|null }} el - Node to test.
 * @returns {boolean} `true` when some ancestor element is named `tbl` (prefix ignored).
 */
function isInsideTable(el) {
  for (let ancestor = el.parentNode; ancestor; ancestor = ancestor.parentNode) {
    const isTable = ancestor.nodeType === 1 && localName2(ancestor) === "tbl";
    if (isTable) return true;
  }
  return false;
}
|
|
3738
|
-
function
|
|
4137
|
+
function extractCellText2(tcEl) {
|
|
3739
4138
|
const parts = [];
|
|
3740
4139
|
const walk = (node) => {
|
|
3741
4140
|
const children = node.childNodes;
|
|
@@ -3745,7 +4144,7 @@ function extractCellText(tcEl) {
|
|
|
3745
4144
|
if (child.nodeType === 3) {
|
|
3746
4145
|
parts.push(child.textContent || "");
|
|
3747
4146
|
} else if (child.nodeType === 1) {
|
|
3748
|
-
const tag =
|
|
4147
|
+
const tag = localName2(child);
|
|
3749
4148
|
if (tag === "t") walk(child);
|
|
3750
4149
|
else if (tag === "run" || tag === "r" || tag === "p" || tag === "subList") walk(child);
|
|
3751
4150
|
else if (tag === "tab") parts.push(" ");
|
|
@@ -4444,6 +4843,7 @@ function diffTableCells(a, b) {
|
|
|
4444
4843
|
// src/index.ts
|
|
4445
4844
|
async function parse(input, options) {
|
|
4446
4845
|
let buffer;
|
|
4846
|
+
const opts = typeof input === "string" && !options?.filePath ? { ...options, filePath: input } : options;
|
|
4447
4847
|
if (typeof input === "string") {
|
|
4448
4848
|
try {
|
|
4449
4849
|
const buf = await readFile(input);
|
|
@@ -4464,14 +4864,16 @@ async function parse(input, options) {
|
|
|
4464
4864
|
switch (format) {
|
|
4465
4865
|
case "hwpx": {
|
|
4466
4866
|
const zipFormat = await detectZipFormat(buffer);
|
|
4467
|
-
if (zipFormat === "xlsx") return parseXlsx(buffer,
|
|
4468
|
-
if (zipFormat === "docx") return parseDocx(buffer,
|
|
4469
|
-
return parseHwpx(buffer,
|
|
4867
|
+
if (zipFormat === "xlsx") return parseXlsx(buffer, opts);
|
|
4868
|
+
if (zipFormat === "docx") return parseDocx(buffer, opts);
|
|
4869
|
+
return parseHwpx(buffer, opts);
|
|
4470
4870
|
}
|
|
4471
4871
|
case "hwp":
|
|
4472
|
-
return parseHwp(buffer,
|
|
4872
|
+
return parseHwp(buffer, opts);
|
|
4873
|
+
case "hwpml":
|
|
4874
|
+
return parseHwpml(buffer, opts);
|
|
4473
4875
|
case "pdf":
|
|
4474
|
-
return parsePdf(buffer,
|
|
4876
|
+
return parsePdf(buffer, opts);
|
|
4475
4877
|
default:
|
|
4476
4878
|
return { success: false, fileType: "unknown", error: "\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD30C\uC77C \uD615\uC2DD\uC785\uB2C8\uB2E4.", code: "UNSUPPORTED_FORMAT" };
|
|
4477
4879
|
}
|
|
@@ -4495,7 +4897,7 @@ async function parseHwp(buffer, options) {
|
|
|
4495
4897
|
async function parsePdf(buffer, options) {
|
|
4496
4898
|
let parsePdfDocument;
|
|
4497
4899
|
try {
|
|
4498
|
-
const mod = await import("./parser-
|
|
4900
|
+
const mod = await import("./parser-XRUZEFZT.js");
|
|
4499
4901
|
parsePdfDocument = mod.parsePdfDocument;
|
|
4500
4902
|
} catch {
|
|
4501
4903
|
return {
|
|
@@ -4529,6 +4931,14 @@ async function parseDocx(buffer, options) {
|
|
|
4529
4931
|
return { success: false, fileType: "docx", error: err instanceof Error ? err.message : "DOCX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
|
|
4530
4932
|
}
|
|
4531
4933
|
}
|
|
4934
|
+
/**
 * Parses an HWPML buffer and wraps the outcome in a result object.
 *
 * Never rejects for parser failures: any exception thrown while parsing is
 * converted into a `{ success: false }` result whose `code` comes from
 * `classifyError`.
 *
 * @param {ArrayBuffer|ArrayBufferView} buffer - Raw file contents.
 * @param {object} [options] - Forwarded unchanged to parseHwpmlDocument.
 * @returns {Promise<object>} `{ success: true, fileType: "hwpml", markdown, blocks, metadata, outline, warnings }`
 *   on success, or `{ success: false, fileType: "hwpml", error, code }` on failure.
 */
async function parseHwpml(buffer, options) {
  try {
    const parsed = parseHwpmlDocument(buffer, options);
    return {
      success: true,
      fileType: "hwpml",
      markdown: parsed.markdown,
      blocks: parsed.blocks,
      metadata: parsed.metadata,
      outline: parsed.outline,
      warnings: parsed.warnings
    };
  } catch (err) {
    // Non-Error throwables carry no message, so a generic one is used instead.
    const error = err instanceof Error ? err.message : "HWPML \uD30C\uC2F1 \uC2E4\uD328";
    return { success: false, fileType: "hwpml", error, code: classifyError(err) };
  }
}
|
|
4532
4942
|
async function fillForm(input, values, outputFormat = "markdown") {
|
|
4533
4943
|
let buffer;
|
|
4534
4944
|
if (typeof input === "string") {
|
|
@@ -4588,6 +4998,7 @@ export {
|
|
|
4588
4998
|
parse,
|
|
4589
4999
|
parseDocx,
|
|
4590
5000
|
parseHwp,
|
|
5001
|
+
parseHwpml,
|
|
4591
5002
|
parseHwpx,
|
|
4592
5003
|
parsePdf,
|
|
4593
5004
|
parseXlsx
|