kordoc 2.2.6 → 2.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +22 -3
- package/dist/{chunk-RF6UJXR3.js → chunk-KSBPABBQ.js} +482 -78
- package/dist/chunk-KSBPABBQ.js.map +1 -0
- package/dist/{chunk-5Y2Q3BRW.js → chunk-M3E3C5GS.js} +8 -1
- package/dist/chunk-M3E3C5GS.js.map +1 -0
- package/dist/{chunk-FCQEF2ZM.js → chunk-VJPDY4YT.js} +2 -2
- package/dist/{chunk-NL5XLN5R.js.map → chunk-VJPDY4YT.js.map} +1 -1
- package/dist/{chunk-HXUCZ2IL.cjs → chunk-VLSATRNQ.cjs} +2 -2
- package/dist/{chunk-HXUCZ2IL.cjs.map → chunk-VLSATRNQ.cjs.map} +1 -1
- package/dist/{chunk-NL5XLN5R.js → chunk-XG5CQUSC.js} +2 -2
- package/dist/{chunk-FCQEF2ZM.js.map → chunk-XG5CQUSC.js.map} +1 -1
- package/dist/cli.js +5 -5
- package/dist/cli.js.map +1 -1
- package/dist/{detect-GYK3HKD5.js → detect-I7YIS4Q6.js} +4 -2
- package/dist/index.cjs +608 -197
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +6 -2
- package/dist/index.d.ts +6 -2
- package/dist/index.js +500 -89
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +5 -5
- package/dist/{parser-AMP7MAOH.js → parser-4275GJRB.js} +45 -42
- package/dist/{parser-AMP7MAOH.js.map → parser-4275GJRB.js.map} +1 -1
- package/dist/{parser-KOWPTDJU.cjs → parser-STAOZMUC.cjs} +61 -58
- package/dist/{parser-KOWPTDJU.cjs.map → parser-STAOZMUC.cjs.map} +1 -1
- package/dist/{parser-43IAQ5KE.js → parser-XRUZEFZT.js} +45 -42
- package/dist/{parser-43IAQ5KE.js.map → parser-XRUZEFZT.js.map} +1 -1
- package/dist/{watch-IUQXOXW3.js → watch-BFLNFJBE.js} +4 -4
- package/package.json +2 -2
- package/dist/chunk-5Y2Q3BRW.js.map +0 -1
- package/dist/chunk-RF6UJXR3.js.map +0 -1
- /package/dist/{detect-GYK3HKD5.js.map → detect-I7YIS4Q6.js.map} +0 -0
- /package/dist/{watch-IUQXOXW3.js.map → watch-BFLNFJBE.js.map} +0 -0
package/dist/index.cjs
CHANGED
|
@@ -16,7 +16,7 @@
|
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
|
|
19
|
-
var
|
|
19
|
+
var _chunkVLSATRNQcjs = require('./chunk-VLSATRNQ.cjs');
|
|
20
20
|
|
|
21
21
|
|
|
22
22
|
var _chunkMUOQXDZ4cjs = require('./chunk-MUOQXDZ4.cjs');
|
|
@@ -44,11 +44,17 @@ function isPdfFile(buffer) {
|
|
|
44
44
|
const b = magicBytes(buffer);
|
|
45
45
|
return b[0] === 37 && b[1] === 80 && b[2] === 68 && b[3] === 70;
|
|
46
46
|
}
|
|
47
|
+
/**
 * Sniffs the start of a file to decide whether it looks like an HWPML document.
 *
 * At most the first 512 bytes are decoded as UTF-8 (non-fatal). The result is
 * true when that head, ignoring a leading BOM and leading whitespace, starts
 * with an XML declaration and also contains the text "<HWPML".
 *
 * @param {ArrayBuffer|ArrayBufferView} buffer Raw file contents.
 * @returns {boolean} Whether the head of the buffer looks like HWPML.
 */
function isHwpmlFile(buffer) {
  const sniffLength = Math.min(512, buffer.byteLength);
  // `new Uint8Array(view, 0, n)` ignores the offset/length arguments and copies
  // the entire view, so typed arrays and Node Buffers must be re-windowed over
  // their backing ArrayBuffer to really limit the sniff to the first bytes.
  const bytes = ArrayBuffer.isView(buffer)
    ? new Uint8Array(buffer.buffer, buffer.byteOffset, sniffLength)
    : new Uint8Array(buffer, 0, sniffLength);
  const head = new TextDecoder("utf-8", { fatal: false }).decode(bytes).replace(/^\uFEFF/, "");
  return head.trimStart().startsWith("<?xml") && head.includes("<HWPML");
}
|
|
47
52
|
/**
 * Classifies a file buffer by running the signature checks in a fixed order.
 *
 * Buffers shorter than four bytes are reported as "unknown" without probing.
 * Otherwise the first matching probe wins; the order is ZIP ("hwpx"),
 * old HWP ("hwp"), PDF ("pdf"), then HWPML ("hwpml").
 *
 * @param {ArrayBuffer} buffer Raw file contents.
 * @returns {string} One of "hwpx", "hwp", "pdf", "hwpml" or "unknown".
 */
function detectFormat(buffer) {
  const tooShort = buffer.byteLength < 4;
  if (tooShort) {
    return "unknown";
  }
  // Order matters: probes are tried top to bottom and the first hit is returned.
  const probes = [
    [isZipFile, "hwpx"],
    [isOldHwpFile, "hwp"],
    [isPdfFile, "pdf"],
    [isHwpmlFile, "hwpml"]
  ];
  for (const [matches, format] of probes) {
    if (matches(buffer)) {
      return format;
    }
  }
  return "unknown";
}
|
|
54
60
|
async function detectZipFormat(buffer) {
|
|
@@ -69,6 +75,100 @@ async function detectZipFormat(buffer) {
|
|
|
69
75
|
|
|
70
76
|
var _zlib = require('zlib');
|
|
71
77
|
var _xmldom = require('@xmldom/xmldom');
|
|
78
|
+
|
|
79
|
+
// src/hwpx/com-fallback.ts
|
|
80
|
+
var _child_process = require('child_process');
|
|
81
|
+
var _os = require('os');
|
|
82
|
+
/**
 * Tells whether the COM-based fallback can be attempted on this machine.
 *
 * @returns {boolean} true only when the OS platform reported by `os.platform()` is "win32".
 */
function isComFallbackAvailable() {
  const currentPlatform = (0, _os.platform)();
  return currentPlatform === "win32";
}
|
|
85
|
+
/**
 * Checks the manifest text for the "encryption-data" marker.
 *
 * @param {string} manifestXml Text of the manifest to inspect.
 * @returns {boolean} true when the marker substring occurs anywhere in the text.
 */
function isEncryptedHwpx(manifestXml) {
  const markerIndex = manifestXml.indexOf("encryption-data");
  return markerIndex !== -1;
}
|
|
88
|
+
/**
 * Extracts per-page text from a document by driving the `HWPFrame.HwpObject`
 * COM object through a PowerShell child process.
 *
 * The file path is handed to PowerShell through an environment variable rather
 * than being spliced into the script text: PowerShell accepts the typographic
 * quotes U+2018..U+201B as single-quote delimiters, so escaping only U+0027
 * would let a crafted file name break out of the string literal.
 *
 * @param {string} filePath Path of the document to open.
 * @returns {{pages: Array, pageCount: number, warnings: Array<{message: string, code: string}>}}
 *   Page texts as reported by the script, the page count, and any warnings.
 * @throws {Error} When not running on Windows, when the script prints no JSON,
 *   or when the script reports an error. Errors from `execFileSync` (non-zero
 *   exit, timeout, buffer overflow) propagate unchanged.
 */
function extractTextViaCom(filePath) {
  if (!isComFallbackAvailable()) {
    throw new Error("COM fallback\uC740 Windows\uC5D0\uC11C\uB9CC \uC0AC\uC6A9 \uAC00\uB2A5\uD569\uB2C8\uB2E4");
  }
  // The script is a constant: no caller-controlled text is ever parsed as PowerShell.
  // The COM object is released in `finally` so a failed Open/GetPageText does not
  // leave it alive, and the JSON is emitted last so cleanup cannot trail output after it.
  const ps1 = `
[Console]::OutputEncoding = [System.Text.Encoding]::UTF8
$ErrorActionPreference = 'Stop'
$hwp = $null
$out = $null
try {
  $hwp = New-Object -ComObject HWPFrame.HwpObject
  $hwp.RegisterModule('FilePathCheckerModule', 'FilePathCheckerModuleExample') | Out-Null
  $hwp.Open($env:KORDOC_COM_FILE_PATH, '', '') | Out-Null
  $pc = $hwp.PageCount
  $result = @{ pageCount = $pc; pages = @() }
  for ($p = 1; $p -le $pc; $p++) {
    $t = $hwp.GetPageText($p, 0)
    $result.pages += @($t)
  }
  $out = $result | ConvertTo-Json -Depth 3 -Compress
} catch {
  $out = @{ error = $_.Exception.Message } | ConvertTo-Json -Compress
} finally {
  if ($null -ne $hwp) {
    try { $hwp.Clear(1) | Out-Null } catch { }
    try { [System.Runtime.InteropServices.Marshal]::ReleaseComObject($hwp) | Out-Null } catch { }
  }
}
$out
`;
  const stdout = _child_process.execFileSync.call(void 0, "powershell", [
    "-NoProfile",
    "-NonInteractive",
    "-ExecutionPolicy",
    "Bypass",
    "-Command",
    ps1
  ], {
    encoding: "utf-8",
    // Inherit the current environment and add the path the script reads.
    env: Object.assign({}, process.env, { KORDOC_COM_FILE_PATH: String(filePath) }),
    timeout: 12e4,
    // 2-minute timeout
    windowsHide: true,
    maxBuffer: 50 * 1024 * 1024
    // 50MB
  });
  const trimmed = stdout.trim();
  // Skip anything printed before the JSON payload.
  const jsonStart = trimmed.indexOf("{");
  if (jsonStart < 0) throw new Error(`COM \uCD9C\uB825\uC5D0 JSON\uC774 \uC5C6\uC2B5\uB2C8\uB2E4: ${trimmed.slice(0, 200)}`);
  const json = JSON.parse(trimmed.slice(jsonStart));
  if (json.error) {
    throw new Error(`COM \uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uC2E4\uD328: ${json.error}`);
  }
  const warnings = [];
  const pages = Array.isArray(json.pages) ? json.pages : [];
  const pageCount = _nullishCoalesce(json.pageCount, () => ( pages.length));
  if (pages.length === 0) {
    warnings.push({ message: "COM\uC73C\uB85C \uD14D\uC2A4\uD2B8\uB97C \uCD94\uCD9C\uD558\uC9C0 \uBABB\uD588\uC2B5\uB2C8\uB2E4", code: "COM_EMPTY" });
  }
  return { pages, pageCount, warnings };
}
|
|
143
|
+
/**
 * Converts page texts obtained through the COM fallback into a parse result.
 *
 * Each page is split on newlines; every non-blank line becomes one paragraph
 * block tagged with its 1-based page number, and the markdown is those lines
 * joined by blank lines. A "DRM_COM_FALLBACK" warning is always appended.
 *
 * The caller's `warnings` array is copied, never mutated. Page entries that
 * are not strings (for example `null`) are skipped instead of throwing.
 *
 * @param {Array} pages Page texts, index 0 being page 1.
 * @param {number} pageCount Page count to record in the metadata.
 * @param {Array<{message: string, code: string}>} [warnings] Warnings collected so far.
 * @returns {{markdown: string, blocks: Array, metadata: {pageCount: number}, warnings: Array}}
 */
function comResultToParseResult(pages, pageCount, warnings) {
  const blocks = [];
  const lines = [];
  for (let i = 0; i < pages.length; i++) {
    const pageText = pages[i];
    if (typeof pageText !== "string") continue;
    for (const para of pageText.split(/\n/)) {
      // trim() also drops the "\r" left behind by CRLF line endings
      const trimmed = para.trim();
      if (!trimmed) continue;
      blocks.push({ type: "paragraph", text: trimmed, pageNumber: i + 1 });
      lines.push(trimmed);
    }
  }
  const markdown = lines.join("\n\n");
  const metadata = { pageCount };
  // Work on a copy so the caller's array is left untouched.
  const allWarnings = Array.isArray(warnings) ? warnings.slice() : [];
  allWarnings.push({
    message: "DRM \uBB38\uC11C: \uD55C\uCEF4 COM API\uB85C \uD14D\uC2A4\uD2B8 \uCD94\uCD9C (\uC11C\uC2DD/\uD45C \uC815\uBCF4 \uC81C\uD55C\uC801)",
    code: "DRM_COM_FALLBACK"
  });
  return {
    markdown,
    blocks,
    metadata,
    // Never empty: the fallback warning above is always present.
    warnings: allWarnings
  };
}
|
|
170
|
+
|
|
171
|
+
// src/hwpx/parser.ts
|
|
72
172
|
var MAX_DECOMPRESS_SIZE = 100 * 1024 * 1024;
|
|
73
173
|
var MAX_ZIP_ENTRIES = 500;
|
|
74
174
|
function clampSpan(val, max) {
|
|
@@ -78,7 +178,7 @@ var MAX_XML_DEPTH = 200;
|
|
|
78
178
|
function createXmlParser(warnings) {
|
|
79
179
|
return new (0, _xmldom.DOMParser)({
|
|
80
180
|
onError(level, msg) {
|
|
81
|
-
if (level === "fatalError") throw new (0,
|
|
181
|
+
if (level === "fatalError") throw new (0, _chunkVLSATRNQcjs.KordocError)(`XML \uD30C\uC2F1 \uC2E4\uD328: ${msg}`);
|
|
82
182
|
_optionalChain([warnings, 'optionalAccess', _2 => _2.push, 'call', _3 => _3({ code: "MALFORMED_XML", message: `XML ${level === "warn" ? "\uACBD\uACE0" : "\uC624\uB958"}: ${msg}` })]);
|
|
83
183
|
}
|
|
84
184
|
});
|
|
@@ -97,10 +197,10 @@ async function extractHwpxStyles(zip, decompressed) {
|
|
|
97
197
|
const xml = await file.async("text");
|
|
98
198
|
if (decompressed) {
|
|
99
199
|
decompressed.total += xml.length * 2;
|
|
100
|
-
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new (0,
|
|
200
|
+
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new (0, _chunkVLSATRNQcjs.KordocError)("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
101
201
|
}
|
|
102
202
|
const parser = createXmlParser();
|
|
103
|
-
const doc = parser.parseFromString(
|
|
203
|
+
const doc = parser.parseFromString(_chunkVLSATRNQcjs.stripDtd.call(void 0, xml), "text/xml");
|
|
104
204
|
if (!doc.documentElement) continue;
|
|
105
205
|
parseCharProperties(doc, result.charProperties);
|
|
106
206
|
parseStyleElements(doc, result.styles);
|
|
@@ -162,7 +262,7 @@ function parseStyleElements(doc, map) {
|
|
|
162
262
|
}
|
|
163
263
|
}
|
|
164
264
|
async function parseHwpxDocument(buffer, options) {
|
|
165
|
-
|
|
265
|
+
_chunkVLSATRNQcjs.precheckZipSize.call(void 0, buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
|
|
166
266
|
let zip;
|
|
167
267
|
try {
|
|
168
268
|
zip = await _jszip2.default.loadAsync(buffer);
|
|
@@ -171,7 +271,20 @@ async function parseHwpxDocument(buffer, options) {
|
|
|
171
271
|
}
|
|
172
272
|
const actualEntryCount = Object.keys(zip.files).length;
|
|
173
273
|
if (actualEntryCount > MAX_ZIP_ENTRIES) {
|
|
174
|
-
throw new (0,
|
|
274
|
+
throw new (0, _chunkVLSATRNQcjs.KordocError)("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
275
|
+
}
|
|
276
|
+
const manifestFile = zip.file("META-INF/manifest.xml");
|
|
277
|
+
if (manifestFile) {
|
|
278
|
+
const manifestXml = await manifestFile.async("text");
|
|
279
|
+
if (isEncryptedHwpx(manifestXml)) {
|
|
280
|
+
if (isComFallbackAvailable() && _optionalChain([options, 'optionalAccess', _4 => _4.filePath])) {
|
|
281
|
+
const { pages, pageCount, warnings: warnings2 } = extractTextViaCom(options.filePath);
|
|
282
|
+
if (pages.some((p) => p && p.trim().length > 0)) {
|
|
283
|
+
return comResultToParseResult(pages, pageCount, warnings2);
|
|
284
|
+
}
|
|
285
|
+
}
|
|
286
|
+
throw new (0, _chunkVLSATRNQcjs.KordocError)("DRM \uC554\uD638\uD654\uB41C HWPX \uD30C\uC77C\uC785\uB2C8\uB2E4. Windows + \uD55C\uCEF4 \uC624\uD53C\uC2A4 \uC124\uCE58 \uC2DC \uC790\uB3D9 \uCD94\uCD9C\uB429\uB2C8\uB2E4.");
|
|
287
|
+
}
|
|
175
288
|
}
|
|
176
289
|
const decompressed = { total: 0 };
|
|
177
290
|
const metadata = {};
|
|
@@ -179,11 +292,12 @@ async function parseHwpxDocument(buffer, options) {
|
|
|
179
292
|
const styleMap = await extractHwpxStyles(zip, decompressed);
|
|
180
293
|
const warnings = [];
|
|
181
294
|
const sectionPaths = await resolveSectionPaths(zip);
|
|
182
|
-
if (sectionPaths.length === 0) throw new (0,
|
|
295
|
+
if (sectionPaths.length === 0) throw new (0, _chunkVLSATRNQcjs.KordocError)("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
183
296
|
metadata.pageCount = sectionPaths.length;
|
|
184
|
-
const pageFilter = _optionalChain([options, 'optionalAccess',
|
|
297
|
+
const pageFilter = _optionalChain([options, 'optionalAccess', _5 => _5.pages]) ? _chunkMUOQXDZ4cjs.parsePageRange.call(void 0, options.pages, sectionPaths.length) : null;
|
|
185
298
|
const totalTarget = pageFilter ? pageFilter.size : sectionPaths.length;
|
|
186
299
|
const blocks = [];
|
|
300
|
+
const nestedTableCounter = { count: 0 };
|
|
187
301
|
let parsedSections = 0;
|
|
188
302
|
for (let si = 0; si < sectionPaths.length; si++) {
|
|
189
303
|
if (pageFilter && !pageFilter.has(si + 1)) continue;
|
|
@@ -192,19 +306,19 @@ async function parseHwpxDocument(buffer, options) {
|
|
|
192
306
|
try {
|
|
193
307
|
const xml = await file.async("text");
|
|
194
308
|
decompressed.total += xml.length * 2;
|
|
195
|
-
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new (0,
|
|
196
|
-
blocks.push(...parseSectionXml(xml, styleMap, warnings, si + 1));
|
|
309
|
+
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new (0, _chunkVLSATRNQcjs.KordocError)("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
310
|
+
blocks.push(...parseSectionXml(xml, styleMap, warnings, si + 1, nestedTableCounter));
|
|
197
311
|
parsedSections++;
|
|
198
|
-
_optionalChain([options, 'optionalAccess',
|
|
312
|
+
_optionalChain([options, 'optionalAccess', _6 => _6.onProgress, 'optionalCall', _7 => _7(parsedSections, totalTarget)]);
|
|
199
313
|
} catch (secErr) {
|
|
200
|
-
if (secErr instanceof
|
|
314
|
+
if (secErr instanceof _chunkVLSATRNQcjs.KordocError) throw secErr;
|
|
201
315
|
warnings.push({ page: si + 1, message: `\uC139\uC158 ${si + 1} \uD30C\uC2F1 \uC2E4\uD328: ${secErr instanceof Error ? secErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
|
|
202
316
|
}
|
|
203
317
|
}
|
|
204
318
|
const images = await extractImagesFromZip(zip, blocks, decompressed, warnings);
|
|
205
319
|
detectHwpxHeadings(blocks, styleMap);
|
|
206
320
|
const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
207
|
-
const markdown =
|
|
321
|
+
const markdown = _chunkVLSATRNQcjs.blocksToMarkdown.call(void 0, blocks);
|
|
208
322
|
return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
|
|
209
323
|
}
|
|
210
324
|
function imageExtToMime(ext) {
|
|
@@ -254,16 +368,29 @@ async function extractImagesFromZip(zip, blocks, decompressed, warnings) {
|
|
|
254
368
|
ref
|
|
255
369
|
// 절대 경로일 수도 있음
|
|
256
370
|
];
|
|
371
|
+
let resolvedPath = null;
|
|
372
|
+
if (!ref.includes(".")) {
|
|
373
|
+
const prefixes = [`BinData/${ref}`, `Contents/BinData/${ref}`];
|
|
374
|
+
for (const prefix of prefixes) {
|
|
375
|
+
const match = zip.file(new RegExp(`^${prefix.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}\\.[a-zA-Z0-9]+$`));
|
|
376
|
+
if (match.length > 0) {
|
|
377
|
+
resolvedPath = match[0].name;
|
|
378
|
+
break;
|
|
379
|
+
}
|
|
380
|
+
}
|
|
381
|
+
}
|
|
257
382
|
let found = false;
|
|
258
|
-
|
|
259
|
-
|
|
383
|
+
const allCandidates = resolvedPath ? [resolvedPath, ...candidates] : candidates;
|
|
384
|
+
for (const path of allCandidates) {
|
|
385
|
+
if (_chunkVLSATRNQcjs.isPathTraversal.call(void 0, path)) continue;
|
|
260
386
|
const file = zip.file(path);
|
|
261
387
|
if (!file) continue;
|
|
262
388
|
try {
|
|
263
389
|
const data = await file.async("uint8array");
|
|
264
390
|
decompressed.total += data.length;
|
|
265
|
-
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new (0,
|
|
266
|
-
const
|
|
391
|
+
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new (0, _chunkVLSATRNQcjs.KordocError)("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
392
|
+
const actualPath = path;
|
|
393
|
+
const ext = actualPath.includes(".") ? actualPath.split(".").pop() || "png" : "png";
|
|
267
394
|
const mimeType = imageExtToMime(ext);
|
|
268
395
|
imageIndex++;
|
|
269
396
|
const filename = `image_${String(imageIndex).padStart(3, "0")}.${mimeToExt(mimeType)}`;
|
|
@@ -273,11 +400,11 @@ async function extractImagesFromZip(zip, blocks, decompressed, warnings) {
|
|
|
273
400
|
found = true;
|
|
274
401
|
break;
|
|
275
402
|
} catch (err) {
|
|
276
|
-
if (err instanceof
|
|
403
|
+
if (err instanceof _chunkVLSATRNQcjs.KordocError) throw err;
|
|
277
404
|
}
|
|
278
405
|
}
|
|
279
406
|
if (!found) {
|
|
280
|
-
_optionalChain([warnings, 'optionalAccess',
|
|
407
|
+
_optionalChain([warnings, 'optionalAccess', _8 => _8.push, 'call', _9 => _9({ page: block.pageNumber, message: `\uC774\uBBF8\uC9C0 \uD30C\uC77C \uC5C6\uC74C: ${ref}`, code: "SKIPPED_IMAGE" })]);
|
|
281
408
|
block.type = "paragraph";
|
|
282
409
|
block.text = `[\uC774\uBBF8\uC9C0: ${ref}]`;
|
|
283
410
|
}
|
|
@@ -293,7 +420,7 @@ async function extractHwpxMetadata(zip, metadata, decompressed) {
|
|
|
293
420
|
const xml = await file.async("text");
|
|
294
421
|
if (decompressed) {
|
|
295
422
|
decompressed.total += xml.length * 2;
|
|
296
|
-
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new (0,
|
|
423
|
+
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new (0, _chunkVLSATRNQcjs.KordocError)("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
297
424
|
}
|
|
298
425
|
parseDublinCoreMetadata(xml, metadata);
|
|
299
426
|
if (metadata.title || metadata.author) return;
|
|
@@ -303,13 +430,13 @@ async function extractHwpxMetadata(zip, metadata, decompressed) {
|
|
|
303
430
|
}
|
|
304
431
|
function parseDublinCoreMetadata(xml, metadata) {
|
|
305
432
|
const parser = createXmlParser();
|
|
306
|
-
const doc = parser.parseFromString(
|
|
433
|
+
const doc = parser.parseFromString(_chunkVLSATRNQcjs.stripDtd.call(void 0, xml), "text/xml");
|
|
307
434
|
if (!doc.documentElement) return;
|
|
308
435
|
const getText = (tagNames) => {
|
|
309
436
|
for (const tag of tagNames) {
|
|
310
437
|
const els = doc.getElementsByTagName(tag);
|
|
311
438
|
if (els.length > 0) {
|
|
312
|
-
const text = _optionalChain([els, 'access',
|
|
439
|
+
const text = _optionalChain([els, 'access', _10 => _10[0], 'access', _11 => _11.textContent, 'optionalAccess', _12 => _12.trim, 'call', _13 => _13()]);
|
|
313
440
|
if (text) return text;
|
|
314
441
|
}
|
|
315
442
|
}
|
|
@@ -336,6 +463,7 @@ function extractFromBrokenZip(buffer) {
|
|
|
336
463
|
let totalDecompressed = 0;
|
|
337
464
|
let entryCount = 0;
|
|
338
465
|
let sectionNum = 0;
|
|
466
|
+
const nestedTableCounter = { count: 0 };
|
|
339
467
|
while (pos < data.length - 30) {
|
|
340
468
|
if (data[pos] !== 80 || data[pos + 1] !== 75 || data[pos + 2] !== 3 || data[pos + 3] !== 4) {
|
|
341
469
|
pos++;
|
|
@@ -362,7 +490,7 @@ function extractFromBrokenZip(buffer) {
|
|
|
362
490
|
}
|
|
363
491
|
const nameBytes = data.slice(pos + 30, pos + 30 + nameLen);
|
|
364
492
|
const name = new TextDecoder().decode(nameBytes);
|
|
365
|
-
if (
|
|
493
|
+
if (_chunkVLSATRNQcjs.isPathTraversal.call(void 0, name)) {
|
|
366
494
|
pos = fileStart + compSize;
|
|
367
495
|
continue;
|
|
368
496
|
}
|
|
@@ -380,15 +508,15 @@ function extractFromBrokenZip(buffer) {
|
|
|
380
508
|
continue;
|
|
381
509
|
}
|
|
382
510
|
totalDecompressed += content.length * 2;
|
|
383
|
-
if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new (0,
|
|
511
|
+
if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new (0, _chunkVLSATRNQcjs.KordocError)("\uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC");
|
|
384
512
|
sectionNum++;
|
|
385
|
-
blocks.push(...parseSectionXml(content, void 0, warnings, sectionNum));
|
|
513
|
+
blocks.push(...parseSectionXml(content, void 0, warnings, sectionNum, nestedTableCounter));
|
|
386
514
|
} catch (e6) {
|
|
387
515
|
continue;
|
|
388
516
|
}
|
|
389
517
|
}
|
|
390
|
-
if (blocks.length === 0) throw new (0,
|
|
391
|
-
const markdown =
|
|
518
|
+
if (blocks.length === 0) throw new (0, _chunkVLSATRNQcjs.KordocError)("\uC190\uC0C1\uB41C HWPX\uC5D0\uC11C \uC139\uC158 \uB370\uC774\uD130\uB97C \uBCF5\uAD6C\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
519
|
+
const markdown = _chunkVLSATRNQcjs.blocksToMarkdown.call(void 0, blocks);
|
|
392
520
|
return { markdown, blocks, warnings: warnings.length > 0 ? warnings : void 0 };
|
|
393
521
|
}
|
|
394
522
|
async function resolveSectionPaths(zip) {
|
|
@@ -406,7 +534,7 @@ async function resolveSectionPaths(zip) {
|
|
|
406
534
|
}
|
|
407
535
|
function parseSectionPathsFromManifest(xml) {
|
|
408
536
|
const parser = createXmlParser();
|
|
409
|
-
const doc = parser.parseFromString(
|
|
537
|
+
const doc = parser.parseFromString(_chunkVLSATRNQcjs.stripDtd.call(void 0, xml), "text/xml");
|
|
410
538
|
const items = doc.getElementsByTagName("opf:item");
|
|
411
539
|
const spine = doc.getElementsByTagName("opf:itemref");
|
|
412
540
|
const isSectionId = (id) => /^s/i.test(id) || id.toLowerCase().includes("section");
|
|
@@ -435,7 +563,7 @@ function detectHwpxHeadings(blocks, styleMap) {
|
|
|
435
563
|
let baseFontSize = 0;
|
|
436
564
|
const sizeFreq = /* @__PURE__ */ new Map();
|
|
437
565
|
for (const b of blocks) {
|
|
438
|
-
if (_optionalChain([b, 'access',
|
|
566
|
+
if (_optionalChain([b, 'access', _14 => _14.style, 'optionalAccess', _15 => _15.fontSize])) {
|
|
439
567
|
sizeFreq.set(b.style.fontSize, (sizeFreq.get(b.style.fontSize) || 0) + 1);
|
|
440
568
|
}
|
|
441
569
|
}
|
|
@@ -451,11 +579,11 @@ function detectHwpxHeadings(blocks, styleMap) {
|
|
|
451
579
|
const text = block.text.trim();
|
|
452
580
|
if (text.length === 0 || text.length > 200 || /^\d+$/.test(text)) continue;
|
|
453
581
|
let level = 0;
|
|
454
|
-
if (baseFontSize > 0 && _optionalChain([block, 'access',
|
|
582
|
+
if (baseFontSize > 0 && _optionalChain([block, 'access', _16 => _16.style, 'optionalAccess', _17 => _17.fontSize])) {
|
|
455
583
|
const ratio = block.style.fontSize / baseFontSize;
|
|
456
|
-
if (ratio >=
|
|
457
|
-
else if (ratio >=
|
|
458
|
-
else if (ratio >=
|
|
584
|
+
if (ratio >= _chunkVLSATRNQcjs.HEADING_RATIO_H1) level = 1;
|
|
585
|
+
else if (ratio >= _chunkVLSATRNQcjs.HEADING_RATIO_H2) level = 2;
|
|
586
|
+
else if (ratio >= _chunkVLSATRNQcjs.HEADING_RATIO_H3) level = 3;
|
|
459
587
|
}
|
|
460
588
|
const compactText = text.replace(/\s+/g, "");
|
|
461
589
|
if (/^제\d+[조장절편]/.test(compactText) && text.length <= 50) {
|
|
@@ -467,12 +595,40 @@ function detectHwpxHeadings(blocks, styleMap) {
|
|
|
467
595
|
}
|
|
468
596
|
}
|
|
469
597
|
}
|
|
470
|
-
function
|
|
598
|
+
/**
 * Builds the placeholder text that stands in for a nested table.
 *
 * Increments `counter.count` and uses the new value as the marker number.
 * The marker also carries a hint made from the first row: each cell's text is
 * trimmed, newlines become spaces, empty cells are dropped and the rest are
 * joined with " | ". Hints longer than 60 code points are cut to 60 and
 * suffixed with an ellipsis.
 *
 * @param {{count: number}} counter Shared running counter (mutated).
 * @param {Array<Array<{text: string}>>} rows Rows of the nested table.
 * @returns {string} The marker, with or without a hint.
 */
function makeNestedTableMarker(counter, rows) {
  counter.count++;
  const headerCells = _nullishCoalesce(rows[0], () => ( []));
  const cellHints = headerCells.map((cell) => cell.text.trim().replace(/\n/g, " "));
  const hint = cellHints.filter((piece) => piece !== "").join(" | ");
  // Spread by code point so truncation never splits a surrogate pair.
  const codePoints = [...hint];
  let shown = hint;
  if (codePoints.length > 60) {
    shown = codePoints.slice(0, 60).join("") + "\u2026";
  }
  if (!shown) {
    return `[\uC911\uCCA9 \uD14C\uC774\uBE14 #${counter.count}]`;
  }
  return `[\uC911\uCCA9 \uD14C\uC774\uBE14 #${counter.count}: ${shown}]`;
}
|
|
606
|
+
/**
 * Finishes a table that was parsed while another table was still open.
 *
 * Pops the enclosing table off `tableStack`. A nested table with at least
 * three rows and at least two columns is pushed to `blocks` as its own table
 * block; anything smaller is converted to text instead. When the enclosing
 * table has a current cell, a marker is appended to that cell's text,
 * followed by the converted text in the small-table case.
 *
 * @param {{rows: Array<Array>}} newTable The nested table that was just parsed.
 * @param {Array} tableStack Stack of enclosing table contexts (mutated by pop).
 * @param {Array} blocks Output block list (may receive a table block).
 * @param {{sectionNum: number, counter?: {count: number}}} ctx Walk context.
 * @returns {object} The enclosing table context that was popped.
 */
function handleNestedTable(newTable, tableStack, blocks, ctx) {
  const parentTable = tableStack.pop();
  let widest = 0;
  for (const row of newTable.rows) {
    if (row.length > widest) {
      widest = row.length;
    }
  }
  const standsAlone = newTable.rows.length >= 3 && widest >= 2;
  let inlineText = null;
  if (standsAlone) {
    blocks.push({ type: "table", table: (0, _chunkVLSATRNQcjs.buildTable)(newTable.rows), pageNumber: ctx.sectionNum });
  } else {
    inlineText = (0, _chunkVLSATRNQcjs.convertTableToText)(newTable.rows);
  }
  if (parentTable.cell) {
    // The counter only advances when there is a cell to receive the marker.
    const marker = ctx.counter ? makeNestedTableMarker(ctx.counter, newTable.rows) : "[\uC911\uCCA9 \uD14C\uC774\uBE14]";
    const separator = parentTable.cell.text ? "\n" : "";
    const addition = standsAlone ? marker : marker + "\n" + inlineText;
    parentTable.cell.text += separator + addition;
  }
  return parentTable;
}
|
|
625
|
+
function parseSectionXml(xml, styleMap, warnings, sectionNum, counter) {
|
|
471
626
|
const parser = createXmlParser(warnings);
|
|
472
|
-
const doc = parser.parseFromString(
|
|
627
|
+
const doc = parser.parseFromString(_chunkVLSATRNQcjs.stripDtd.call(void 0, xml), "text/xml");
|
|
473
628
|
if (!doc.documentElement) return [];
|
|
474
629
|
const blocks = [];
|
|
475
|
-
|
|
630
|
+
const ctx = { styleMap, warnings, sectionNum, counter };
|
|
631
|
+
walkSection(doc.documentElement, blocks, null, [], ctx);
|
|
476
632
|
return blocks;
|
|
477
633
|
}
|
|
478
634
|
function extractImageRef(el) {
|
|
@@ -493,7 +649,7 @@ function extractImageRef(el) {
|
|
|
493
649
|
if (directRef) return directRef;
|
|
494
650
|
return null;
|
|
495
651
|
}
|
|
496
|
-
function walkSection(node, blocks, tableCtx, tableStack,
|
|
652
|
+
function walkSection(node, blocks, tableCtx, tableStack, ctx, depth = 0) {
|
|
497
653
|
if (depth > MAX_XML_DEPTH) return;
|
|
498
654
|
const children = node.childNodes;
|
|
499
655
|
if (!children) return;
|
|
@@ -506,23 +662,12 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
506
662
|
case "tbl": {
|
|
507
663
|
if (tableCtx) tableStack.push(tableCtx);
|
|
508
664
|
const newTable = { rows: [], currentRow: [], cell: null };
|
|
509
|
-
walkSection(el, blocks, newTable, tableStack,
|
|
665
|
+
walkSection(el, blocks, newTable, tableStack, ctx, depth + 1);
|
|
510
666
|
if (newTable.rows.length > 0) {
|
|
511
667
|
if (tableStack.length > 0) {
|
|
512
|
-
|
|
513
|
-
let nestedCols = 0;
|
|
514
|
-
for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
|
|
515
|
-
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
516
|
-
blocks.push({ type: "table", table: _chunkHXUCZ2ILcjs.buildTable.call(void 0, newTable.rows), pageNumber: sectionNum });
|
|
517
|
-
} else {
|
|
518
|
-
const nestedText = _chunkHXUCZ2ILcjs.convertTableToText.call(void 0, newTable.rows);
|
|
519
|
-
if (parentTable.cell) {
|
|
520
|
-
parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
|
|
521
|
-
}
|
|
522
|
-
}
|
|
523
|
-
tableCtx = parentTable;
|
|
668
|
+
tableCtx = handleNestedTable(newTable, tableStack, blocks, ctx);
|
|
524
669
|
} else {
|
|
525
|
-
blocks.push({ type: "table", table:
|
|
670
|
+
blocks.push({ type: "table", table: _chunkVLSATRNQcjs.buildTable.call(void 0, newTable.rows), pageNumber: ctx.sectionNum });
|
|
526
671
|
tableCtx = null;
|
|
527
672
|
}
|
|
528
673
|
} else {
|
|
@@ -533,7 +678,7 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
533
678
|
case "tr":
|
|
534
679
|
if (tableCtx) {
|
|
535
680
|
tableCtx.currentRow = [];
|
|
536
|
-
walkSection(el, blocks, tableCtx, tableStack,
|
|
681
|
+
walkSection(el, blocks, tableCtx, tableStack, ctx, depth + 1);
|
|
537
682
|
if (tableCtx.currentRow.length > 0) tableCtx.rows.push(tableCtx.currentRow);
|
|
538
683
|
tableCtx.currentRow = [];
|
|
539
684
|
}
|
|
@@ -541,7 +686,7 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
541
686
|
case "tc":
|
|
542
687
|
if (tableCtx) {
|
|
543
688
|
tableCtx.cell = { text: "", colSpan: 1, rowSpan: 1 };
|
|
544
|
-
walkSection(el, blocks, tableCtx, tableStack,
|
|
689
|
+
walkSection(el, blocks, tableCtx, tableStack, ctx, depth + 1);
|
|
545
690
|
if (tableCtx.cell) {
|
|
546
691
|
tableCtx.currentRow.push(tableCtx.cell);
|
|
547
692
|
tableCtx.cell = null;
|
|
@@ -549,7 +694,7 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
549
694
|
}
|
|
550
695
|
break;
|
|
551
696
|
case "cellAddr":
|
|
552
|
-
if (_optionalChain([tableCtx, 'optionalAccess',
|
|
697
|
+
if (_optionalChain([tableCtx, 'optionalAccess', _18 => _18.cell])) {
|
|
553
698
|
const ca = parseInt(el.getAttribute("colAddr") || "", 10);
|
|
554
699
|
const ra = parseInt(el.getAttribute("rowAddr") || "", 10);
|
|
555
700
|
if (!isNaN(ca)) tableCtx.cell.colAddr = ca;
|
|
@@ -557,29 +702,29 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
557
702
|
}
|
|
558
703
|
break;
|
|
559
704
|
case "cellSpan":
|
|
560
|
-
if (_optionalChain([tableCtx, 'optionalAccess',
|
|
705
|
+
if (_optionalChain([tableCtx, 'optionalAccess', _19 => _19.cell])) {
|
|
561
706
|
const rawCs = parseInt(el.getAttribute("colSpan") || "1", 10);
|
|
562
707
|
const cs = isNaN(rawCs) ? 1 : rawCs;
|
|
563
708
|
const rawRs = parseInt(el.getAttribute("rowSpan") || "1", 10);
|
|
564
709
|
const rs = isNaN(rawRs) ? 1 : rawRs;
|
|
565
|
-
tableCtx.cell.colSpan = clampSpan(cs,
|
|
566
|
-
tableCtx.cell.rowSpan = clampSpan(rs,
|
|
710
|
+
tableCtx.cell.colSpan = clampSpan(cs, _chunkVLSATRNQcjs.MAX_COLS);
|
|
711
|
+
tableCtx.cell.rowSpan = clampSpan(rs, _chunkVLSATRNQcjs.MAX_ROWS);
|
|
567
712
|
}
|
|
568
713
|
break;
|
|
569
714
|
case "p": {
|
|
570
|
-
const { text, href, footnote, style } = extractParagraphInfo(el, styleMap);
|
|
715
|
+
const { text, href, footnote, style } = extractParagraphInfo(el, ctx.styleMap);
|
|
571
716
|
if (text) {
|
|
572
|
-
if (_optionalChain([tableCtx, 'optionalAccess',
|
|
717
|
+
if (_optionalChain([tableCtx, 'optionalAccess', _20 => _20.cell])) {
|
|
573
718
|
tableCtx.cell.text += (tableCtx.cell.text ? "\n" : "") + text;
|
|
574
719
|
} else if (!tableCtx) {
|
|
575
|
-
const block = { type: "paragraph", text, pageNumber: sectionNum };
|
|
720
|
+
const block = { type: "paragraph", text, pageNumber: ctx.sectionNum };
|
|
576
721
|
if (style) block.style = style;
|
|
577
722
|
if (href) block.href = href;
|
|
578
723
|
if (footnote) block.footnoteText = footnote;
|
|
579
724
|
blocks.push(block);
|
|
580
725
|
}
|
|
581
726
|
}
|
|
582
|
-
tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack,
|
|
727
|
+
tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, ctx, depth + 1);
|
|
583
728
|
break;
|
|
584
729
|
}
|
|
585
730
|
// 이미지/그림 — 경로 추출 또는 경고
|
|
@@ -588,19 +733,19 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
588
733
|
case "drawingObject": {
|
|
589
734
|
const imgRef = extractImageRef(el);
|
|
590
735
|
if (imgRef) {
|
|
591
|
-
blocks.push({ type: "image", text: imgRef, pageNumber: sectionNum });
|
|
592
|
-
} else if (warnings && sectionNum) {
|
|
593
|
-
warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
|
|
736
|
+
blocks.push({ type: "image", text: imgRef, pageNumber: ctx.sectionNum });
|
|
737
|
+
} else if (ctx.warnings && ctx.sectionNum) {
|
|
738
|
+
ctx.warnings.push({ page: ctx.sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
|
|
594
739
|
}
|
|
595
740
|
break;
|
|
596
741
|
}
|
|
597
742
|
default:
|
|
598
|
-
walkSection(el, blocks, tableCtx, tableStack,
|
|
743
|
+
walkSection(el, blocks, tableCtx, tableStack, ctx, depth + 1);
|
|
599
744
|
break;
|
|
600
745
|
}
|
|
601
746
|
}
|
|
602
747
|
}
|
|
603
|
-
function walkParagraphChildren(node, blocks, tableCtx, tableStack,
|
|
748
|
+
function walkParagraphChildren(node, blocks, tableCtx, tableStack, ctx, depth = 0) {
|
|
604
749
|
if (depth > MAX_XML_DEPTH) return tableCtx;
|
|
605
750
|
const children = node.childNodes;
|
|
606
751
|
if (!children) return tableCtx;
|
|
@@ -616,23 +761,12 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
616
761
|
if (localTag === "tbl") {
|
|
617
762
|
if (tableCtx) tableStack.push(tableCtx);
|
|
618
763
|
const newTable = { rows: [], currentRow: [], cell: null };
|
|
619
|
-
walkSection(el, blocks, newTable, tableStack,
|
|
764
|
+
walkSection(el, blocks, newTable, tableStack, ctx, d + 1);
|
|
620
765
|
if (newTable.rows.length > 0) {
|
|
621
766
|
if (tableStack.length > 0) {
|
|
622
|
-
|
|
623
|
-
let nestedCols = 0;
|
|
624
|
-
for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
|
|
625
|
-
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
626
|
-
blocks.push({ type: "table", table: _chunkHXUCZ2ILcjs.buildTable.call(void 0, newTable.rows), pageNumber: sectionNum });
|
|
627
|
-
} else {
|
|
628
|
-
const nestedText = _chunkHXUCZ2ILcjs.convertTableToText.call(void 0, newTable.rows);
|
|
629
|
-
if (parentTable.cell) {
|
|
630
|
-
parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
|
|
631
|
-
}
|
|
632
|
-
}
|
|
633
|
-
tableCtx = parentTable;
|
|
767
|
+
tableCtx = handleNestedTable(newTable, tableStack, blocks, ctx);
|
|
634
768
|
} else {
|
|
635
|
-
blocks.push({ type: "table", table:
|
|
769
|
+
blocks.push({ type: "table", table: _chunkVLSATRNQcjs.buildTable.call(void 0, newTable.rows), pageNumber: ctx.sectionNum });
|
|
636
770
|
tableCtx = null;
|
|
637
771
|
}
|
|
638
772
|
} else {
|
|
@@ -641,21 +775,21 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
641
775
|
} else if (localTag === "pic" || localTag === "shape" || localTag === "drawingObject") {
|
|
642
776
|
const drawTextChild = findDescendant(el, "drawText");
|
|
643
777
|
if (drawTextChild) {
|
|
644
|
-
extractDrawTextBlocks(drawTextChild, blocks, styleMap, sectionNum);
|
|
778
|
+
extractDrawTextBlocks(drawTextChild, blocks, ctx.styleMap, ctx.sectionNum);
|
|
645
779
|
} else {
|
|
646
780
|
const imgRef = extractImageRef(el);
|
|
647
781
|
if (imgRef) {
|
|
648
|
-
blocks.push({ type: "image", text: imgRef, pageNumber: sectionNum });
|
|
649
|
-
} else if (warnings && sectionNum) {
|
|
650
|
-
warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
|
|
782
|
+
blocks.push({ type: "image", text: imgRef, pageNumber: ctx.sectionNum });
|
|
783
|
+
} else if (ctx.warnings && ctx.sectionNum) {
|
|
784
|
+
ctx.warnings.push({ page: ctx.sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
|
|
651
785
|
}
|
|
652
786
|
}
|
|
653
787
|
} else if (localTag === "drawText") {
|
|
654
|
-
extractDrawTextBlocks(el, blocks, styleMap, sectionNum);
|
|
788
|
+
extractDrawTextBlocks(el, blocks, ctx.styleMap, ctx.sectionNum);
|
|
655
789
|
} else if (localTag === "r" || localTag === "run" || localTag === "ctrl" || localTag === "rect" || localTag === "ellipse" || localTag === "polygon" || localTag === "line" || localTag === "arc" || localTag === "curve" || localTag === "connectLine" || localTag === "container") {
|
|
656
790
|
walkChildren(el, d + 1);
|
|
657
791
|
} else if (localTag === "run") {
|
|
658
|
-
tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack,
|
|
792
|
+
tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, ctx, depth + 1);
|
|
659
793
|
}
|
|
660
794
|
}
|
|
661
795
|
};
|
|
@@ -740,7 +874,7 @@ function extractParagraphInfo(para, styleMap) {
|
|
|
740
874
|
case "hyperlink": {
|
|
741
875
|
const url = child.getAttribute("url") || child.getAttribute("href") || "";
|
|
742
876
|
if (url) {
|
|
743
|
-
const safe =
|
|
877
|
+
const safe = _chunkVLSATRNQcjs.sanitizeHref.call(void 0, url);
|
|
744
878
|
if (safe) href = safe;
|
|
745
879
|
}
|
|
746
880
|
walk(child);
|
|
@@ -880,7 +1014,7 @@ function decompressStream(data) {
|
|
|
880
1014
|
return _zlib.inflateRawSync.call(void 0, data, opts);
|
|
881
1015
|
}
|
|
882
1016
|
function parseFileHeader(data) {
|
|
883
|
-
if (data.length < 40) throw new (0,
|
|
1017
|
+
if (data.length < 40) throw new (0, _chunkVLSATRNQcjs.KordocError)("FileHeader\uAC00 \uB108\uBB34 \uC9E7\uC2B5\uB2C8\uB2E4 (\uCD5C\uC18C 40\uBC14\uC774\uD2B8)");
|
|
884
1018
|
const sig = data.subarray(0, 32).toString("utf8").replace(/\0+$/, "");
|
|
885
1019
|
return {
|
|
886
1020
|
signature: sig,
|
|
@@ -1899,22 +2033,22 @@ function parseHwp5Document(buffer, options) {
|
|
|
1899
2033
|
lenientCfb = parseLenientCfb(buffer);
|
|
1900
2034
|
warnings.push({ message: "\uC190\uC0C1\uB41C CFB \uCEE8\uD14C\uC774\uB108 \u2014 lenient \uBAA8\uB4DC\uB85C \uBCF5\uAD6C", code: "LENIENT_CFB_RECOVERY" });
|
|
1901
2035
|
} catch (e11) {
|
|
1902
|
-
throw new (0,
|
|
2036
|
+
throw new (0, _chunkVLSATRNQcjs.KordocError)("CFB \uCEE8\uD14C\uC774\uB108 \uD30C\uC2F1 \uC2E4\uD328 (strict \uBC0F lenient \uBAA8\uB450)");
|
|
1903
2037
|
}
|
|
1904
2038
|
}
|
|
1905
2039
|
const findStream = (path) => {
|
|
1906
2040
|
if (cfb) {
|
|
1907
2041
|
const entry = CFB.find(cfb, path);
|
|
1908
|
-
return _optionalChain([entry, 'optionalAccess',
|
|
2042
|
+
return _optionalChain([entry, 'optionalAccess', _21 => _21.content]) ? Buffer.from(entry.content) : null;
|
|
1909
2043
|
}
|
|
1910
2044
|
return lenientCfb.findStream(path);
|
|
1911
2045
|
};
|
|
1912
2046
|
const headerData = findStream("/FileHeader");
|
|
1913
|
-
if (!headerData) throw new (0,
|
|
2047
|
+
if (!headerData) throw new (0, _chunkVLSATRNQcjs.KordocError)("FileHeader \uC2A4\uD2B8\uB9BC \uC5C6\uC74C");
|
|
1914
2048
|
const header = parseFileHeader(headerData);
|
|
1915
|
-
if (header.signature !== "HWP Document File") throw new (0,
|
|
1916
|
-
if (header.flags & FLAG_ENCRYPTED) throw new (0,
|
|
1917
|
-
if (header.flags & FLAG_DRM) throw new (0,
|
|
2049
|
+
if (header.signature !== "HWP Document File") throw new (0, _chunkVLSATRNQcjs.KordocError)("HWP \uC2DC\uADF8\uB2C8\uCC98 \uBD88\uC77C\uCE58");
|
|
2050
|
+
if (header.flags & FLAG_ENCRYPTED) throw new (0, _chunkVLSATRNQcjs.KordocError)("\uC554\uD638\uD654\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
|
|
2051
|
+
if (header.flags & FLAG_DRM) throw new (0, _chunkVLSATRNQcjs.KordocError)("DRM \uBCF4\uD638\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
|
|
1918
2052
|
const compressed = (header.flags & FLAG_COMPRESSED) !== 0;
|
|
1919
2053
|
const distribution = (header.flags & FLAG_DISTRIBUTION) !== 0;
|
|
1920
2054
|
const metadata = {
|
|
@@ -1923,11 +2057,12 @@ function parseHwp5Document(buffer, options) {
|
|
|
1923
2057
|
if (cfb) extractHwp5Metadata(cfb, metadata);
|
|
1924
2058
|
const docInfo = cfb ? parseDocInfoStream(cfb, compressed) : parseDocInfoFromStream(findStream("/DocInfo"), compressed);
|
|
1925
2059
|
const sections = distribution ? cfb ? findViewTextSections(cfb, compressed) : findViewTextSectionsLenient(lenientCfb, compressed) : cfb ? findSections(cfb) : findSectionsLenient(lenientCfb, compressed);
|
|
1926
|
-
if (sections.length === 0) throw new (0,
|
|
2060
|
+
if (sections.length === 0) throw new (0, _chunkVLSATRNQcjs.KordocError)("\uC139\uC158 \uC2A4\uD2B8\uB9BC\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
1927
2061
|
metadata.pageCount = sections.length;
|
|
1928
|
-
const pageFilter = _optionalChain([options, 'optionalAccess',
|
|
2062
|
+
const pageFilter = _optionalChain([options, 'optionalAccess', _22 => _22.pages]) ? _chunkMUOQXDZ4cjs.parsePageRange.call(void 0, options.pages, sections.length) : null;
|
|
1929
2063
|
const totalTarget = pageFilter ? pageFilter.size : sections.length;
|
|
1930
2064
|
const blocks = [];
|
|
2065
|
+
const nestedTableCounter = { count: 0 };
|
|
1931
2066
|
let totalDecompressed = 0;
|
|
1932
2067
|
let parsedSections = 0;
|
|
1933
2068
|
for (let si = 0; si < sections.length; si++) {
|
|
@@ -1936,30 +2071,30 @@ function parseHwp5Document(buffer, options) {
|
|
|
1936
2071
|
const sectionData = sections[si];
|
|
1937
2072
|
const data = !distribution && compressed ? decompressStream(Buffer.from(sectionData)) : Buffer.from(sectionData);
|
|
1938
2073
|
totalDecompressed += data.length;
|
|
1939
|
-
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new (0,
|
|
2074
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new (0, _chunkVLSATRNQcjs.KordocError)("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
1940
2075
|
const records = readRecords(data);
|
|
1941
|
-
const sectionBlocks = parseSection(records, docInfo, warnings, si + 1);
|
|
2076
|
+
const sectionBlocks = parseSection(records, docInfo, warnings, si + 1, nestedTableCounter);
|
|
1942
2077
|
blocks.push(...sectionBlocks);
|
|
1943
2078
|
parsedSections++;
|
|
1944
|
-
_optionalChain([options, 'optionalAccess',
|
|
2079
|
+
_optionalChain([options, 'optionalAccess', _23 => _23.onProgress, 'optionalCall', _24 => _24(parsedSections, totalTarget)]);
|
|
1945
2080
|
} catch (secErr) {
|
|
1946
|
-
if (secErr instanceof
|
|
2081
|
+
if (secErr instanceof _chunkVLSATRNQcjs.KordocError) throw secErr;
|
|
1947
2082
|
warnings.push({ page: si + 1, message: `\uC139\uC158 ${si + 1} \uD30C\uC2F1 \uC2E4\uD328: ${secErr instanceof Error ? secErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
|
|
1948
2083
|
}
|
|
1949
2084
|
}
|
|
1950
2085
|
const images = cfb ? extractHwp5Images(cfb, blocks, compressed, warnings) : extractHwp5ImagesLenient(lenientCfb, blocks, compressed, warnings);
|
|
1951
|
-
const flatBlocks =
|
|
2086
|
+
const flatBlocks = _chunkVLSATRNQcjs.flattenLayoutTables.call(void 0, blocks);
|
|
1952
2087
|
if (docInfo) {
|
|
1953
2088
|
detectHwp5Headings(flatBlocks, docInfo);
|
|
1954
2089
|
}
|
|
1955
2090
|
const outline = flatBlocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
1956
|
-
const markdown =
|
|
2091
|
+
const markdown = _chunkVLSATRNQcjs.blocksToMarkdown.call(void 0, flatBlocks);
|
|
1957
2092
|
return { markdown, blocks: flatBlocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
|
|
1958
2093
|
}
|
|
1959
2094
|
function parseDocInfoStream(cfb, compressed) {
|
|
1960
2095
|
try {
|
|
1961
2096
|
const entry = CFB.find(cfb, "/DocInfo");
|
|
1962
|
-
if (!_optionalChain([entry, 'optionalAccess',
|
|
2097
|
+
if (!_optionalChain([entry, 'optionalAccess', _25 => _25.content])) return null;
|
|
1963
2098
|
const data = compressed ? decompressStream(Buffer.from(entry.content)) : Buffer.from(entry.content);
|
|
1964
2099
|
const records = readRecords(data);
|
|
1965
2100
|
return parseDocInfo(records);
|
|
@@ -1982,7 +2117,7 @@ function detectHwp5Headings(blocks, docInfo) {
|
|
|
1982
2117
|
const name = (style.nameKo || style.name).toLowerCase();
|
|
1983
2118
|
if (name.includes("\uBC14\uD0D5") || name.includes("\uBCF8\uBB38") || name === "normal" || name === "body") {
|
|
1984
2119
|
const cs = docInfo.charShapes[style.charShapeId];
|
|
1985
|
-
if (_optionalChain([cs, 'optionalAccess',
|
|
2120
|
+
if (_optionalChain([cs, 'optionalAccess', _26 => _26.fontSize]) > 0) {
|
|
1986
2121
|
baseFontSize = cs.fontSize / 10;
|
|
1987
2122
|
break;
|
|
1988
2123
|
}
|
|
@@ -1991,7 +2126,7 @@ function detectHwp5Headings(blocks, docInfo) {
|
|
|
1991
2126
|
if (baseFontSize === 0) {
|
|
1992
2127
|
const sizeFreq = /* @__PURE__ */ new Map();
|
|
1993
2128
|
for (const b of blocks) {
|
|
1994
|
-
if (_optionalChain([b, 'access',
|
|
2129
|
+
if (_optionalChain([b, 'access', _27 => _27.style, 'optionalAccess', _28 => _28.fontSize])) {
|
|
1995
2130
|
sizeFreq.set(b.style.fontSize, (sizeFreq.get(b.style.fontSize) || 0) + 1);
|
|
1996
2131
|
}
|
|
1997
2132
|
}
|
|
@@ -2011,11 +2146,11 @@ function detectHwp5Headings(blocks, docInfo) {
|
|
|
2011
2146
|
if (text.length === 0 || text.length > 200) continue;
|
|
2012
2147
|
if (/^\d+$/.test(text)) continue;
|
|
2013
2148
|
let level = 0;
|
|
2014
|
-
if (_optionalChain([block, 'access',
|
|
2149
|
+
if (_optionalChain([block, 'access', _29 => _29.style, 'optionalAccess', _30 => _30.fontSize]) && baseFontSize > 0) {
|
|
2015
2150
|
const ratio = block.style.fontSize / baseFontSize;
|
|
2016
|
-
if (ratio >=
|
|
2017
|
-
else if (ratio >=
|
|
2018
|
-
else if (ratio >=
|
|
2151
|
+
if (ratio >= _chunkVLSATRNQcjs.HEADING_RATIO_H1) level = 1;
|
|
2152
|
+
else if (ratio >= _chunkVLSATRNQcjs.HEADING_RATIO_H2) level = 2;
|
|
2153
|
+
else if (ratio >= _chunkVLSATRNQcjs.HEADING_RATIO_H3) level = 3;
|
|
2019
2154
|
}
|
|
2020
2155
|
if (/^제\d+[장절편]\s/.test(text) && text.length <= 50) {
|
|
2021
2156
|
if (level === 0) level = 2;
|
|
@@ -2031,7 +2166,7 @@ function detectHwp5Headings(blocks, docInfo) {
|
|
|
2031
2166
|
function extractHwp5Metadata(cfb, metadata) {
|
|
2032
2167
|
try {
|
|
2033
2168
|
const summaryEntry = CFB.find(cfb, "/HwpSummaryInformation") || CFB.find(cfb, "/SummaryInformation");
|
|
2034
|
-
if (!_optionalChain([summaryEntry, 'optionalAccess',
|
|
2169
|
+
if (!_optionalChain([summaryEntry, 'optionalAccess', _31 => _31.content])) return;
|
|
2035
2170
|
const data = Buffer.from(summaryEntry.content);
|
|
2036
2171
|
if (data.length < 48) return;
|
|
2037
2172
|
const numSets = data.readUInt32LE(24);
|
|
@@ -2064,7 +2199,7 @@ function findViewTextSections(cfb, compressed) {
|
|
|
2064
2199
|
const sections = [];
|
|
2065
2200
|
for (let i = 0; i < MAX_SECTIONS; i++) {
|
|
2066
2201
|
const entry = CFB.find(cfb, `/ViewText/Section${i}`);
|
|
2067
|
-
if (!_optionalChain([entry, 'optionalAccess',
|
|
2202
|
+
if (!_optionalChain([entry, 'optionalAccess', _32 => _32.content])) break;
|
|
2068
2203
|
try {
|
|
2069
2204
|
const decrypted = decryptViewText(Buffer.from(entry.content), compressed);
|
|
2070
2205
|
sections.push({ idx: i, content: decrypted });
|
|
@@ -2078,13 +2213,13 @@ function findSections(cfb) {
|
|
|
2078
2213
|
const sections = [];
|
|
2079
2214
|
for (let i = 0; i < MAX_SECTIONS; i++) {
|
|
2080
2215
|
const entry = CFB.find(cfb, `/BodyText/Section${i}`);
|
|
2081
|
-
if (!_optionalChain([entry, 'optionalAccess',
|
|
2216
|
+
if (!_optionalChain([entry, 'optionalAccess', _33 => _33.content])) break;
|
|
2082
2217
|
sections.push({ idx: i, content: Buffer.from(entry.content) });
|
|
2083
2218
|
}
|
|
2084
2219
|
if (sections.length === 0 && cfb.FileIndex) {
|
|
2085
2220
|
for (const entry of cfb.FileIndex) {
|
|
2086
2221
|
if (sections.length >= MAX_SECTIONS) break;
|
|
2087
|
-
if (_optionalChain([entry, 'access',
|
|
2222
|
+
if (_optionalChain([entry, 'access', _34 => _34.name, 'optionalAccess', _35 => _35.startsWith, 'call', _36 => _36("Section")]) && entry.content) {
|
|
2088
2223
|
const idx = parseInt(entry.name.replace("Section", ""), 10) || 0;
|
|
2089
2224
|
sections.push({ idx, content: Buffer.from(entry.content) });
|
|
2090
2225
|
}
|
|
@@ -2100,7 +2235,7 @@ function findSectionsLenient(lcfb, compressed) {
|
|
|
2100
2235
|
if (!raw) break;
|
|
2101
2236
|
const content = compressed ? decompressStream(raw) : raw;
|
|
2102
2237
|
totalDecompressed += content.length;
|
|
2103
|
-
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new (0,
|
|
2238
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new (0, _chunkVLSATRNQcjs.KordocError)("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2104
2239
|
sections.push({ idx: i, content });
|
|
2105
2240
|
}
|
|
2106
2241
|
if (sections.length === 0) {
|
|
@@ -2112,7 +2247,7 @@ function findSectionsLenient(lcfb, compressed) {
|
|
|
2112
2247
|
if (raw) {
|
|
2113
2248
|
const content = compressed ? decompressStream(raw) : raw;
|
|
2114
2249
|
totalDecompressed += content.length;
|
|
2115
|
-
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new (0,
|
|
2250
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new (0, _chunkVLSATRNQcjs.KordocError)("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2116
2251
|
sections.push({ idx, content });
|
|
2117
2252
|
}
|
|
2118
2253
|
}
|
|
@@ -2129,7 +2264,7 @@ function findViewTextSectionsLenient(lcfb, compressed) {
|
|
|
2129
2264
|
try {
|
|
2130
2265
|
const content = decryptViewText(raw, compressed);
|
|
2131
2266
|
totalDecompressed += content.length;
|
|
2132
|
-
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new (0,
|
|
2267
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new (0, _chunkVLSATRNQcjs.KordocError)("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2133
2268
|
sections.push({ idx: i, content });
|
|
2134
2269
|
} catch (e16) {
|
|
2135
2270
|
break;
|
|
@@ -2167,7 +2302,7 @@ function extractHwp5Images(cfb, blocks, compressed, warnings) {
|
|
|
2167
2302
|
const binDataRe = /\/BinData\/[Bb][Ii][Nn](\d{4})$/;
|
|
2168
2303
|
if (cfb.FileIndex) {
|
|
2169
2304
|
for (const entry of cfb.FileIndex) {
|
|
2170
|
-
if (!_optionalChain([entry, 'optionalAccess',
|
|
2305
|
+
if (!_optionalChain([entry, 'optionalAccess', _37 => _37.name]) || !entry.content) continue;
|
|
2171
2306
|
const match = entry.name.match(binDataRe);
|
|
2172
2307
|
if (!match) continue;
|
|
2173
2308
|
const idx = parseInt(match[1], 10);
|
|
@@ -2258,13 +2393,13 @@ function extractHwp5ImagesLenient(lcfb, blocks, compressed, warnings) {
|
|
|
2258
2393
|
}
|
|
2259
2394
|
return images;
|
|
2260
2395
|
}
|
|
2261
|
-
function parseSection(records, docInfo, warnings, sectionNum) {
|
|
2396
|
+
function parseSection(records, docInfo, warnings, sectionNum, counter) {
|
|
2262
2397
|
const blocks = [];
|
|
2263
2398
|
let i = 0;
|
|
2264
2399
|
while (i < records.length) {
|
|
2265
2400
|
const rec = records[i];
|
|
2266
2401
|
if (rec.tagId === TAG_PARA_HEADER && rec.level === 0) {
|
|
2267
|
-
const { paragraph, tables, nextIdx, charShapeIds, paraShapeId } = parseParagraphWithTables(records, i);
|
|
2402
|
+
const { paragraph, tables, nextIdx, charShapeIds, paraShapeId } = parseParagraphWithTables(records, i, counter);
|
|
2268
2403
|
if (paragraph) {
|
|
2269
2404
|
const block = { type: "paragraph", text: paragraph, pageNumber: sectionNum };
|
|
2270
2405
|
if (docInfo && charShapeIds.length > 0) {
|
|
@@ -2287,7 +2422,7 @@ function parseSection(records, docInfo, warnings, sectionNum) {
|
|
|
2287
2422
|
if (rec.tagId === TAG_CTRL_HEADER && rec.level <= 1 && rec.data.length >= 4) {
|
|
2288
2423
|
const ctrlId = rec.data.subarray(0, 4).toString("ascii");
|
|
2289
2424
|
if (ctrlId === " lbt" || ctrlId === "tbl ") {
|
|
2290
|
-
const { table, nextIdx } = parseTableBlock(records, i);
|
|
2425
|
+
const { table, nextIdx } = parseTableBlock(records, i, counter);
|
|
2291
2426
|
if (table) blocks.push({ type: "table", table, pageNumber: sectionNum });
|
|
2292
2427
|
i = nextIdx;
|
|
2293
2428
|
continue;
|
|
@@ -2317,7 +2452,7 @@ function parseSection(records, docInfo, warnings, sectionNum) {
|
|
|
2317
2452
|
if (url && blocks.length > 0) {
|
|
2318
2453
|
const lastBlock = blocks[blocks.length - 1];
|
|
2319
2454
|
if (lastBlock.type === "paragraph" && !lastBlock.href) {
|
|
2320
|
-
lastBlock.href = _nullishCoalesce(
|
|
2455
|
+
lastBlock.href = _nullishCoalesce(_chunkVLSATRNQcjs.sanitizeHref.call(void 0, url), () => ( void 0));
|
|
2321
2456
|
}
|
|
2322
2457
|
}
|
|
2323
2458
|
}
|
|
@@ -2392,7 +2527,7 @@ function resolveCharStyle(charShapeIds, docInfo) {
|
|
|
2392
2527
|
if (cs.attrFlags & 2) style.bold = true;
|
|
2393
2528
|
return style.fontSize || style.bold || style.italic ? style : void 0;
|
|
2394
2529
|
}
|
|
2395
|
-
function parseParagraphWithTables(records, startIdx) {
|
|
2530
|
+
function parseParagraphWithTables(records, startIdx, counter) {
|
|
2396
2531
|
const startLevel = records[startIdx].level;
|
|
2397
2532
|
let text = "";
|
|
2398
2533
|
const tables = [];
|
|
@@ -2414,7 +2549,7 @@ function parseParagraphWithTables(records, startIdx) {
|
|
|
2414
2549
|
if (rec.tagId === TAG_CTRL_HEADER && rec.data.length >= 4) {
|
|
2415
2550
|
const ctrlId = rec.data.subarray(0, 4).toString("ascii");
|
|
2416
2551
|
if (ctrlId === " lbt" || ctrlId === "tbl ") {
|
|
2417
|
-
const { table, nextIdx } = parseTableBlock(records, i);
|
|
2552
|
+
const { table, nextIdx } = parseTableBlock(records, i, counter);
|
|
2418
2553
|
if (table) tables.push(table);
|
|
2419
2554
|
i = nextIdx;
|
|
2420
2555
|
continue;
|
|
@@ -2425,7 +2560,7 @@ function parseParagraphWithTables(records, startIdx) {
|
|
|
2425
2560
|
const trimmed = text.trim();
|
|
2426
2561
|
return { paragraph: trimmed || null, tables, nextIdx: i, charShapeIds, paraShapeId };
|
|
2427
2562
|
}
|
|
2428
|
-
function parseTableBlock(records, startIdx) {
|
|
2563
|
+
function parseTableBlock(records, startIdx, counter) {
|
|
2429
2564
|
const tableLevel = records[startIdx].level;
|
|
2430
2565
|
let i = startIdx + 1;
|
|
2431
2566
|
let rows = 0, cols = 0;
|
|
@@ -2435,11 +2570,11 @@ function parseTableBlock(records, startIdx) {
|
|
|
2435
2570
|
if (rec.tagId === TAG_PARA_HEADER && rec.level <= tableLevel) break;
|
|
2436
2571
|
if (rec.tagId === TAG_CTRL_HEADER && rec.level <= tableLevel) break;
|
|
2437
2572
|
if (rec.tagId === TAG_TABLE && rec.data.length >= 8) {
|
|
2438
|
-
rows = Math.min(rec.data.readUInt16LE(4),
|
|
2439
|
-
cols = Math.min(rec.data.readUInt16LE(6),
|
|
2573
|
+
rows = Math.min(rec.data.readUInt16LE(4), _chunkVLSATRNQcjs.MAX_ROWS);
|
|
2574
|
+
cols = Math.min(rec.data.readUInt16LE(6), _chunkVLSATRNQcjs.MAX_COLS);
|
|
2440
2575
|
}
|
|
2441
2576
|
if (rec.tagId === TAG_LIST_HEADER) {
|
|
2442
|
-
const { cell, nextIdx } = parseCellBlock(records, i, tableLevel);
|
|
2577
|
+
const { cell, nextIdx } = parseCellBlock(records, i, tableLevel, counter);
|
|
2443
2578
|
if (cell) cells.push(cell);
|
|
2444
2579
|
i = nextIdx;
|
|
2445
2580
|
continue;
|
|
@@ -2458,9 +2593,9 @@ function parseTableBlock(records, startIdx) {
|
|
|
2458
2593
|
return { table: { rows, cols, cells: irCells, hasHeader: rows > 1 }, nextIdx: i };
|
|
2459
2594
|
}
|
|
2460
2595
|
const cellRows = arrangeCells(rows, cols, cells);
|
|
2461
|
-
return { table:
|
|
2596
|
+
return { table: _chunkVLSATRNQcjs.buildTable.call(void 0, cellRows), nextIdx: i };
|
|
2462
2597
|
}
|
|
2463
|
-
function parseCellBlock(records, startIdx, tableLevel) {
|
|
2598
|
+
function parseCellBlock(records, startIdx, tableLevel, counter) {
|
|
2464
2599
|
const rec = records[startIdx];
|
|
2465
2600
|
const cellLevel = rec.level;
|
|
2466
2601
|
const texts = [];
|
|
@@ -2473,8 +2608,8 @@ function parseCellBlock(records, startIdx, tableLevel) {
|
|
|
2473
2608
|
rowAddr = rec.data.readUInt16LE(10);
|
|
2474
2609
|
const cs = rec.data.readUInt16LE(12);
|
|
2475
2610
|
const rs = rec.data.readUInt16LE(14);
|
|
2476
|
-
if (cs > 0) colSpan = Math.min(cs,
|
|
2477
|
-
if (rs > 0) rowSpan = Math.min(rs,
|
|
2611
|
+
if (cs > 0) colSpan = Math.min(cs, _chunkVLSATRNQcjs.MAX_COLS);
|
|
2612
|
+
if (rs > 0) rowSpan = Math.min(rs, _chunkVLSATRNQcjs.MAX_ROWS);
|
|
2478
2613
|
}
|
|
2479
2614
|
let i = startIdx + 1;
|
|
2480
2615
|
while (i < records.length) {
|
|
@@ -2485,6 +2620,17 @@ function parseCellBlock(records, startIdx, tableLevel) {
|
|
|
2485
2620
|
const t = extractText(r.data).trim();
|
|
2486
2621
|
if (t) texts.push(t);
|
|
2487
2622
|
}
|
|
2623
|
+
if (r.tagId === TAG_CTRL_HEADER && r.data.length >= 4) {
|
|
2624
|
+
const ctrlId = r.data.subarray(0, 4).toString("ascii");
|
|
2625
|
+
if (ctrlId === " lbt" || ctrlId === "tbl ") {
|
|
2626
|
+
if (counter) {
|
|
2627
|
+
counter.count++;
|
|
2628
|
+
texts.push(`[\uC911\uCCA9 \uD14C\uC774\uBE14 #${counter.count}]`);
|
|
2629
|
+
} else {
|
|
2630
|
+
texts.push("[\uC911\uCCA9 \uD14C\uC774\uBE14]");
|
|
2631
|
+
}
|
|
2632
|
+
}
|
|
2633
|
+
}
|
|
2488
2634
|
i++;
|
|
2489
2635
|
}
|
|
2490
2636
|
return { cell: { text: texts.join("\n"), colSpan, rowSpan, colAddr, rowAddr }, nextIdx: i };
|
|
@@ -2562,10 +2708,10 @@ function getElements(parent, tagName) {
|
|
|
2562
2708
|
return result;
|
|
2563
2709
|
}
|
|
2564
2710
|
function getTextContent(el) {
|
|
2565
|
-
return _nullishCoalesce(_optionalChain([el, 'access',
|
|
2711
|
+
return _nullishCoalesce(_optionalChain([el, 'access', _38 => _38.textContent, 'optionalAccess', _39 => _39.trim, 'call', _40 => _40()]), () => ( ""));
|
|
2566
2712
|
}
|
|
2567
2713
|
function parseXml(text) {
|
|
2568
|
-
return new (0, _xmldom.DOMParser)().parseFromString(
|
|
2714
|
+
return new (0, _xmldom.DOMParser)().parseFromString(_chunkVLSATRNQcjs.stripDtd.call(void 0, text), "text/xml");
|
|
2569
2715
|
}
|
|
2570
2716
|
function parseSharedStrings(xml) {
|
|
2571
2717
|
const doc = parseXml(xml);
|
|
@@ -2702,14 +2848,14 @@ function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
|
|
|
2702
2848
|
const merge = mergeMap.get(key);
|
|
2703
2849
|
row.push({
|
|
2704
2850
|
text,
|
|
2705
|
-
colSpan: _nullishCoalesce(_optionalChain([merge, 'optionalAccess',
|
|
2706
|
-
rowSpan: _nullishCoalesce(_optionalChain([merge, 'optionalAccess',
|
|
2851
|
+
colSpan: _nullishCoalesce(_optionalChain([merge, 'optionalAccess', _41 => _41.colSpan]), () => ( 1)),
|
|
2852
|
+
rowSpan: _nullishCoalesce(_optionalChain([merge, 'optionalAccess', _42 => _42.rowSpan]), () => ( 1))
|
|
2707
2853
|
});
|
|
2708
2854
|
}
|
|
2709
2855
|
cellRows.push(row);
|
|
2710
2856
|
}
|
|
2711
2857
|
if (cellRows.length > 0) {
|
|
2712
|
-
const table =
|
|
2858
|
+
const table = _chunkVLSATRNQcjs.buildTable.call(void 0, cellRows);
|
|
2713
2859
|
if (table.rows > 0) {
|
|
2714
2860
|
blocks.push({ type: "table", table, pageNumber: sheetIndex + 1 });
|
|
2715
2861
|
}
|
|
@@ -2717,12 +2863,12 @@ function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
|
|
|
2717
2863
|
return blocks;
|
|
2718
2864
|
}
|
|
2719
2865
|
async function parseXlsxDocument(buffer, options) {
|
|
2720
|
-
|
|
2866
|
+
_chunkVLSATRNQcjs.precheckZipSize.call(void 0, buffer, MAX_DECOMPRESS_SIZE3);
|
|
2721
2867
|
const zip = await _jszip2.default.loadAsync(buffer);
|
|
2722
2868
|
const warnings = [];
|
|
2723
2869
|
const workbookFile = zip.file("xl/workbook.xml");
|
|
2724
2870
|
if (!workbookFile) {
|
|
2725
|
-
throw new (0,
|
|
2871
|
+
throw new (0, _chunkVLSATRNQcjs.KordocError)("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 XLSX \uD30C\uC77C: xl/workbook.xml\uC774 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
2726
2872
|
}
|
|
2727
2873
|
let sharedStrings = [];
|
|
2728
2874
|
const ssFile = zip.file("xl/sharedStrings.xml");
|
|
@@ -2731,7 +2877,7 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
2731
2877
|
}
|
|
2732
2878
|
const sheets = parseWorkbook(await workbookFile.async("text"));
|
|
2733
2879
|
if (sheets.length === 0) {
|
|
2734
|
-
throw new (0,
|
|
2880
|
+
throw new (0, _chunkVLSATRNQcjs.KordocError)("XLSX \uD30C\uC77C\uC5D0 \uC2DC\uD2B8\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
2735
2881
|
}
|
|
2736
2882
|
let relsMap = /* @__PURE__ */ new Map();
|
|
2737
2883
|
const relsFile = zip.file("xl/_rels/workbook.xml.rels");
|
|
@@ -2739,7 +2885,7 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
2739
2885
|
relsMap = parseRels(await relsFile.async("text"));
|
|
2740
2886
|
}
|
|
2741
2887
|
let pageFilter = null;
|
|
2742
|
-
if (_optionalChain([options, 'optionalAccess',
|
|
2888
|
+
if (_optionalChain([options, 'optionalAccess', _43 => _43.pages])) {
|
|
2743
2889
|
const { parsePageRange: parsePageRange2 } = await Promise.resolve().then(() => _interopRequireWildcard(require("./page-range-3C7UGGEK.cjs")));
|
|
2744
2890
|
pageFilter = parsePageRange2(options.pages, sheets.length);
|
|
2745
2891
|
}
|
|
@@ -2748,7 +2894,7 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
2748
2894
|
for (let i = 0; i < processedSheets; i++) {
|
|
2749
2895
|
if (pageFilter && !pageFilter.has(i + 1)) continue;
|
|
2750
2896
|
const sheet = sheets[i];
|
|
2751
|
-
_optionalChain([options, 'optionalAccess',
|
|
2897
|
+
_optionalChain([options, 'optionalAccess', _44 => _44.onProgress, 'optionalCall', _45 => _45(i + 1, processedSheets)]);
|
|
2752
2898
|
let sheetPath = relsMap.get(sheet.rId);
|
|
2753
2899
|
if (sheetPath) {
|
|
2754
2900
|
if (!sheetPath.startsWith("xl/") && !sheetPath.startsWith("/")) {
|
|
@@ -2803,7 +2949,7 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
2803
2949
|
} catch (e20) {
|
|
2804
2950
|
}
|
|
2805
2951
|
}
|
|
2806
|
-
const markdown =
|
|
2952
|
+
const markdown = _chunkVLSATRNQcjs.blocksToMarkdown.call(void 0, blocks);
|
|
2807
2953
|
return { markdown, blocks, metadata, warnings: warnings.length > 0 ? warnings : void 0 };
|
|
2808
2954
|
}
|
|
2809
2955
|
|
|
@@ -2811,21 +2957,21 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
2811
2957
|
|
|
2812
2958
|
|
|
2813
2959
|
var MAX_DECOMPRESS_SIZE4 = 100 * 1024 * 1024;
|
|
2814
|
-
function getChildElements(parent,
|
|
2960
|
+
function getChildElements(parent, localName3) {
|
|
2815
2961
|
const result = [];
|
|
2816
2962
|
const children = parent.childNodes;
|
|
2817
2963
|
for (let i = 0; i < children.length; i++) {
|
|
2818
2964
|
const node = children[i];
|
|
2819
2965
|
if (node.nodeType === 1) {
|
|
2820
2966
|
const el = node;
|
|
2821
|
-
if (el.localName ===
|
|
2967
|
+
if (el.localName === localName3 || _optionalChain([el, 'access', _46 => _46.tagName, 'optionalAccess', _47 => _47.endsWith, 'call', _48 => _48(`:${localName3}`)])) {
|
|
2822
2968
|
result.push(el);
|
|
2823
2969
|
}
|
|
2824
2970
|
}
|
|
2825
2971
|
}
|
|
2826
2972
|
return result;
|
|
2827
2973
|
}
|
|
2828
|
-
function findElements(parent,
|
|
2974
|
+
function findElements(parent, localName3) {
|
|
2829
2975
|
const result = [];
|
|
2830
2976
|
const walk = (node) => {
|
|
2831
2977
|
const children = node.childNodes;
|
|
@@ -2833,7 +2979,7 @@ function findElements(parent, localName2) {
|
|
|
2833
2979
|
const child = children[i];
|
|
2834
2980
|
if (child.nodeType === 1) {
|
|
2835
2981
|
const el = child;
|
|
2836
|
-
if (el.localName ===
|
|
2982
|
+
if (el.localName === localName3 || _optionalChain([el, 'access', _49 => _49.tagName, 'optionalAccess', _50 => _50.endsWith, 'call', _51 => _51(`:${localName3}`)])) {
|
|
2837
2983
|
result.push(el);
|
|
2838
2984
|
}
|
|
2839
2985
|
walk(el);
|
|
@@ -2843,16 +2989,16 @@ function findElements(parent, localName2) {
|
|
|
2843
2989
|
walk(parent);
|
|
2844
2990
|
return result;
|
|
2845
2991
|
}
|
|
2846
|
-
function getAttr(el,
|
|
2992
|
+
function getAttr(el, localName3) {
|
|
2847
2993
|
const attrs = el.attributes;
|
|
2848
2994
|
for (let i = 0; i < attrs.length; i++) {
|
|
2849
2995
|
const attr = attrs[i];
|
|
2850
|
-
if (attr.localName ===
|
|
2996
|
+
if (attr.localName === localName3 || attr.name === localName3) return attr.value;
|
|
2851
2997
|
}
|
|
2852
2998
|
return null;
|
|
2853
2999
|
}
|
|
2854
3000
|
function parseXml2(text) {
|
|
2855
|
-
return new (0, _xmldom.DOMParser)().parseFromString(
|
|
3001
|
+
return new (0, _xmldom.DOMParser)().parseFromString(_chunkVLSATRNQcjs.stripDtd.call(void 0, text), "text/xml");
|
|
2856
3002
|
}
|
|
2857
3003
|
function parseStyles(xml) {
|
|
2858
3004
|
const doc = parseXml2(xml);
|
|
@@ -3017,7 +3163,7 @@ function parseParagraph(p, styles, numbering, footnotes, rels) {
|
|
|
3017
3163
|
const text = parts.join("").trim();
|
|
3018
3164
|
if (!text) return null;
|
|
3019
3165
|
const style = styles.get(styleId);
|
|
3020
|
-
if (_optionalChain([style, 'optionalAccess',
|
|
3166
|
+
if (_optionalChain([style, 'optionalAccess', _52 => _52.outlineLevel]) !== void 0 && style.outlineLevel >= 0 && style.outlineLevel <= 5) {
|
|
3021
3167
|
return {
|
|
3022
3168
|
type: "heading",
|
|
3023
3169
|
text,
|
|
@@ -3026,8 +3172,8 @@ function parseParagraph(p, styles, numbering, footnotes, rels) {
|
|
|
3026
3172
|
}
|
|
3027
3173
|
if (numId && numId !== "0") {
|
|
3028
3174
|
const numDef = numbering.get(numId);
|
|
3029
|
-
const levelInfo = _optionalChain([numDef, 'optionalAccess',
|
|
3030
|
-
const listType = _optionalChain([levelInfo, 'optionalAccess',
|
|
3175
|
+
const levelInfo = _optionalChain([numDef, 'optionalAccess', _53 => _53.get, 'call', _54 => _54(ilvl)]);
|
|
3176
|
+
const listType = _optionalChain([levelInfo, 'optionalAccess', _55 => _55.numFmt]) === "bullet" ? "unordered" : "ordered";
|
|
3031
3177
|
return { type: "list", text, listType };
|
|
3032
3178
|
}
|
|
3033
3179
|
const block = { type: "paragraph", text };
|
|
@@ -3068,7 +3214,7 @@ function parseTable(tbl, styles, numbering, footnotes, rels) {
|
|
|
3068
3214
|
const pElements = getChildElements(tc, "p");
|
|
3069
3215
|
for (const p of pElements) {
|
|
3070
3216
|
const block = parseParagraph(p, styles, numbering, footnotes, rels);
|
|
3071
|
-
if (_optionalChain([block, 'optionalAccess',
|
|
3217
|
+
if (_optionalChain([block, 'optionalAccess', _56 => _56.text])) cellTexts.push(block.text);
|
|
3072
3218
|
}
|
|
3073
3219
|
row.push({ text: cellTexts.join("\n"), colSpan, rowSpan });
|
|
3074
3220
|
}
|
|
@@ -3081,7 +3227,7 @@ function parseTable(tbl, styles, numbering, footnotes, rels) {
|
|
|
3081
3227
|
if (!cell || cell.rowSpan === 0) continue;
|
|
3082
3228
|
let span = 1;
|
|
3083
3229
|
for (let nr = r + 1; nr < rows.length; nr++) {
|
|
3084
|
-
if (_optionalChain([rows, 'access',
|
|
3230
|
+
if (_optionalChain([rows, 'access', _57 => _57[nr], 'access', _58 => _58[c], 'optionalAccess', _59 => _59.rowSpan]) === 0) span++;
|
|
3085
3231
|
else break;
|
|
3086
3232
|
}
|
|
3087
3233
|
cell.rowSpan = span;
|
|
@@ -3125,7 +3271,7 @@ async function extractImages(zip, rels, doc) {
|
|
|
3125
3271
|
try {
|
|
3126
3272
|
const data = await imgFile.async("uint8array");
|
|
3127
3273
|
imgIdx++;
|
|
3128
|
-
const ext = _nullishCoalesce(_optionalChain([imgPath, 'access',
|
|
3274
|
+
const ext = _nullishCoalesce(_optionalChain([imgPath, 'access', _60 => _60.split, 'call', _61 => _61("."), 'access', _62 => _62.pop, 'call', _63 => _63(), 'optionalAccess', _64 => _64.toLowerCase, 'call', _65 => _65()]), () => ( "png"));
|
|
3129
3275
|
const mimeMap = {
|
|
3130
3276
|
png: "image/png",
|
|
3131
3277
|
jpg: "image/jpeg",
|
|
@@ -3145,12 +3291,12 @@ async function extractImages(zip, rels, doc) {
|
|
|
3145
3291
|
return { blocks, images };
|
|
3146
3292
|
}
|
|
3147
3293
|
async function parseDocxDocument(buffer, options) {
|
|
3148
|
-
|
|
3294
|
+
_chunkVLSATRNQcjs.precheckZipSize.call(void 0, buffer, MAX_DECOMPRESS_SIZE4);
|
|
3149
3295
|
const zip = await _jszip2.default.loadAsync(buffer);
|
|
3150
3296
|
const warnings = [];
|
|
3151
3297
|
const docFile = zip.file("word/document.xml");
|
|
3152
3298
|
if (!docFile) {
|
|
3153
|
-
throw new (0,
|
|
3299
|
+
throw new (0, _chunkVLSATRNQcjs.KordocError)("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 DOCX \uD30C\uC77C: word/document.xml\uC774 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
3154
3300
|
}
|
|
3155
3301
|
let rels = /* @__PURE__ */ new Map();
|
|
3156
3302
|
const relsFile = zip.file("word/_rels/document.xml.rels");
|
|
@@ -3185,7 +3331,7 @@ async function parseDocxDocument(buffer, options) {
|
|
|
3185
3331
|
const doc = parseXml2(docXml);
|
|
3186
3332
|
const body = findElements(doc, "body");
|
|
3187
3333
|
if (body.length === 0) {
|
|
3188
|
-
throw new (0,
|
|
3334
|
+
throw new (0, _chunkVLSATRNQcjs.KordocError)("DOCX \uBCF8\uBB38(w:body)\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
3189
3335
|
}
|
|
3190
3336
|
const blocks = [];
|
|
3191
3337
|
const bodyEl = body[0];
|
|
@@ -3194,11 +3340,11 @@ async function parseDocxDocument(buffer, options) {
|
|
|
3194
3340
|
const node = children[i];
|
|
3195
3341
|
if (node.nodeType !== 1) continue;
|
|
3196
3342
|
const el = node;
|
|
3197
|
-
const
|
|
3198
|
-
if (
|
|
3343
|
+
const localName3 = _nullishCoalesce(el.localName, () => ( _optionalChain([el, 'access', _66 => _66.tagName, 'optionalAccess', _67 => _67.split, 'call', _68 => _68(":"), 'access', _69 => _69.pop, 'call', _70 => _70()])));
|
|
3344
|
+
if (localName3 === "p") {
|
|
3199
3345
|
const block = parseParagraph(el, styles, numbering, footnotes, rels);
|
|
3200
3346
|
if (block) blocks.push(block);
|
|
3201
|
-
} else if (
|
|
3347
|
+
} else if (localName3 === "tbl") {
|
|
3202
3348
|
const block = parseTable(el, styles, numbering, footnotes, rels);
|
|
3203
3349
|
if (block) blocks.push(block);
|
|
3204
3350
|
}
|
|
@@ -3225,7 +3371,7 @@ async function parseDocxDocument(buffer, options) {
|
|
|
3225
3371
|
}
|
|
3226
3372
|
}
|
|
3227
3373
|
const outline = blocks.filter((b) => b.type === "heading").map((b) => ({ level: _nullishCoalesce(b.level, () => ( 2)), text: _nullishCoalesce(b.text, () => ( "")) }));
|
|
3228
|
-
const markdown =
|
|
3374
|
+
const markdown = _chunkVLSATRNQcjs.blocksToMarkdown.call(void 0, blocks);
|
|
3229
3375
|
return {
|
|
3230
3376
|
markdown,
|
|
3231
3377
|
blocks,
|
|
@@ -3236,6 +3382,259 @@ async function parseDocxDocument(buffer, options) {
|
|
|
3236
3382
|
};
|
|
3237
3383
|
}
|
|
3238
3384
|
|
|
3385
|
+
// src/hwpml/parser.ts
|
|
3386
|
+
|
|
3387
|
+
// Safety limits applied while reading HWPML input.
// Depth beyond which walkContent / collectCharText stop descending.
var MAX_XML_DEPTH2 = 200;
// parseTable2 skips (with a warning) tables declaring more rows than this;
// it is also the upper clamp for a cell's RowSpan.
var MAX_TABLE_ROWS = 5000;
// Same as above, for declared columns and ColSpan.
var MAX_TABLE_COLS = 500;
// parseHwpmlDocument rejects buffers larger than this many bytes (50 MiB).
var MAX_HWPML_BYTES = 50 * 1024 * 1024;
|
|
3391
|
+
/**
 * Parse an HWPML (XML) document into markdown plus structured blocks.
 *
 * @param {ArrayBuffer|ArrayBufferView} buffer Raw file bytes, decoded as UTF-8.
 * @param {{pages?: *}} [options] When `options.pages` is set, only the
 *   matching SECTION elements (numbered from 1) are parsed.
 * @returns {{markdown: string, blocks: Array, metadata?: object,
 *   outline?: Array, warnings?: Array}}
 * @throws {Error} When the buffer is larger than MAX_HWPML_BYTES.
 */
function parseHwpmlDocument(buffer, options) {
  if (buffer.byteLength > MAX_HWPML_BYTES) {
    throw new Error(`HWPML \uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC (${(buffer.byteLength / 1024 / 1024).toFixed(1)}MB > 50MB)`);
  }

  // Decode and drop a leading byte-order mark.
  const text = new TextDecoder("utf-8").decode(buffer).replace(/^\uFEFF/, "");
  // NOTE(review): as written this replaces a space with a space (a no-op).
  // The pattern presumably targeted an entity or non-breaking space that was
  // lost in rendering; confirm against the published bundle.
  const normalized = text.replace(/ /g, " ");
  const xml = _chunkVLSATRNQcjs.stripDtd.call(void 0, normalized);

  // Parser diagnostics are collected as warnings instead of aborting.
  const warnings = [];
  const parser = new (0, _xmldom.DOMParser)({
    onError: (_level, msg) => {
      warnings.push({ message: `HWPML XML \uD30C\uC2F1 \uACBD\uACE0: ${msg}`, code: "MALFORMED_XML" });
    }
  });
  const doc = parser.parseFromString(xml, "text/xml");
  if (!doc.documentElement) {
    return { markdown: "", blocks: [], warnings };
  }
  const root = doc.documentElement;

  // Document summary -> metadata.
  const metadata = {};
  const docSummary = findChild(root, "DOCSUMMARY");
  if (docSummary) {
    const title = findChild(docSummary, "TITLE");
    const author = findChild(docSummary, "AUTHOR");
    const date = findChild(docSummary, "DATE");
    if (title) metadata.title = textContent(title).trim();
    if (author) metadata.author = textContent(author).trim();
    if (date) {
      // Only record a non-empty date. Storing `createdAt: undefined` would
      // still count as a key and defeat the "omit empty metadata" check below.
      const createdAt = textContent(date).trim();
      if (createdAt) metadata.createdAt = createdAt;
    }
  }

  const paraShapeMap = buildParaShapeMap(root);
  const body = findChild(root, "BODY");
  if (!body) {
    return { markdown: "", blocks: [], metadata, warnings };
  }

  const blocks = [];
  // pageFilter is only consulted through `.has(sectionIdx)`.
  const pageFilter = _optionalChain([options, 'optionalAccess', _71 => _71.pages]) ? _chunkMUOQXDZ4cjs.parsePageRange.call(void 0, options.pages, countSections(body)) : null;
  let sectionIdx = 0;
  const children = body.childNodes;
  for (let i = 0; i < children.length; i++) {
    const el = children[i];
    if (el.nodeType !== 1) continue;
    if (localName(el) !== "SECTION") continue;
    // Sections are numbered from 1; filtered-out sections still advance it.
    sectionIdx++;
    if (pageFilter && !pageFilter.has(sectionIdx)) continue;
    parseSection2(el, blocks, paraShapeMap, sectionIdx, warnings);
  }

  const outline = blocks.filter((b) => b.type === "heading" && b.text).map((b) => ({ level: _nullishCoalesce(b.level, () => ( 1)), text: b.text, pageNumber: b.pageNumber }));
  const markdown = _chunkVLSATRNQcjs.blocksToMarkdown.call(void 0, blocks);
  return {
    markdown,
    blocks,
    metadata: Object.keys(metadata).length > 0 ? metadata : void 0,
    outline: outline.length > 0 ? outline : void 0,
    warnings: warnings.length > 0 ? warnings : void 0
  };
}
|
|
3446
|
+
/**
 * Build a lookup from PARASHAPE `Id` to `{ headingLevel }`.
 *
 * Shapes are read from HEAD > MAPPINGTABLE > PARASHAPELIST; if any of those
 * elements is missing the returned map is empty. `headingLevel` is null
 * unless `HeadingType` is "Outline", in which case it is `Level + 1`
 * clamped to the range 1..6.
 *
 * @param {Element} root Document element of the parsed HWPML.
 * @returns {Map<string, {headingLevel: (number|null)}>}
 */
function buildParaShapeMap(root) {
  const shapes = new Map();

  let container = root;
  for (const tag of ["HEAD", "MAPPINGTABLE", "PARASHAPELIST"]) {
    container = findChild(container, tag);
    if (!container) return shapes;
  }

  const nodes = container.childNodes;
  for (let idx = 0; idx < nodes.length; idx++) {
    const shape = nodes[idx];
    if (shape.nodeType !== 1 || localName(shape) !== "PARASHAPE") continue;

    const idAttr = shape.getAttribute("Id");
    const typeAttr = shape.getAttribute("HeadingType");
    const levelAttr = shape.getAttribute("Level");

    let headingLevel = null;
    if ((typeAttr == null ? "None" : typeAttr) === "Outline") {
      const rawLevel = parseInt(levelAttr == null ? "0" : levelAttr, 10);
      // Unparseable or negative levels are treated as 0.
      const baseLevel = isNaN(rawLevel) ? 0 : Math.max(0, rawLevel);
      headingLevel = Math.min(baseLevel + 1, 6);
    }
    shapes.set(idAttr == null ? "" : idAttr, { headingLevel });
  }
  return shapes;
}
|
|
3470
|
+
/**
 * Collect the blocks of one SECTION element by delegating to walkContent,
 * starting outside any header/footer.
 *
 * @param {Element} section SECTION element.
 * @param {Array} blocks Output array that blocks are appended to.
 * @param {Map} paraShapeMap Paragraph-shape lookup.
 * @param {number} sectionNum Section number recorded on emitted blocks.
 * @param {Array} warnings Output array for warnings.
 */
function parseSection2(section, blocks, paraShapeMap, sectionNum, warnings) {
  const inHeaderFooter = false;
  walkContent(section, blocks, paraShapeMap, sectionNum, warnings, inHeaderFooter);
}
|
|
3473
|
+
/**
 * Walk the element children of `node` and emit paragraph / table blocks.
 *
 * HEADER and FOOTER subtrees are skipped entirely. P and TABLE elements are
 * handed to their parsers (unless `inHeaderFooter` is set) and are not
 * descended into here. Every other element, including PARALIST, SECTION and
 * COLDEF, is walked recursively. Recursion stops silently once `depth`
 * exceeds MAX_XML_DEPTH2.
 *
 * @param {Node} node Node whose children are visited.
 * @param {Array} blocks Output array that blocks are appended to.
 * @param {Map} paraShapeMap Paragraph-shape lookup.
 * @param {number} sectionNum Section number recorded on emitted blocks.
 * @param {Array} warnings Output array for warnings.
 * @param {boolean} inHeaderFooter When true, P/TABLE elements are ignored.
 * @param {number} [depth=0] Current recursion depth.
 */
function walkContent(node, blocks, paraShapeMap, sectionNum, warnings, inHeaderFooter, depth = 0) {
  if (depth > MAX_XML_DEPTH2) return;
  const kids = node.childNodes;
  for (let idx = 0; idx < kids.length; idx++) {
    const child = kids[idx];
    if (child.nodeType !== 1) continue;
    switch (localName(child)) {
      case "HEADER":
      case "FOOTER":
        // Page furniture: not part of the body text.
        break;
      case "P":
        if (!inHeaderFooter) parseParagraph2(child, blocks, paraShapeMap, sectionNum);
        break;
      case "TABLE":
        if (!inHeaderFooter) parseTable2(child, blocks, paraShapeMap, sectionNum, warnings);
        break;
      default:
        // Containers (PARALIST, SECTION, COLDEF) and any unknown wrapper.
        walkContent(child, blocks, paraShapeMap, sectionNum, warnings, inHeaderFooter, depth + 1);
    }
  }
}
|
|
3502
|
+
/**
 * Turn one P element into a heading or paragraph block.
 *
 * Nothing is emitted when the paragraph has no text. The paragraph is a
 * heading when the shape referenced by its `ParaShape` attribute has a
 * non-null `headingLevel`; otherwise it is a plain paragraph.
 *
 * @param {Element} el P element.
 * @param {Array} blocks Output array that the block is appended to.
 * @param {Map} paraShapeMap Paragraph-shape lookup.
 * @param {number} sectionNum Stored as the block's `pageNumber`.
 */
function parseParagraph2(el, blocks, paraShapeMap, sectionNum) {
  const text = extractParagraphText(el);
  if (!text) return;

  const shapeKey = el.getAttribute("ParaShape");
  const shape = paraShapeMap.get(shapeKey == null ? "" : shapeKey);
  const headingLevel = shape == null ? void 0 : shape.headingLevel;

  const block = headingLevel != null
    ? { type: "heading", text, level: headingLevel, pageNumber: sectionNum }
    : { type: "paragraph", text, pageNumber: sectionNum };
  blocks.push(block);
}
|
|
3513
|
+
/**
 * Return the trimmed text of a paragraph, concatenating the pieces gathered
 * by collectCharText with no separator.
 *
 * @param {Element} p P element.
 * @returns {string}
 */
function extractParagraphText(p) {
  const pieces = [];
  collectCharText(p, pieces);
  const joined = pieces.join("");
  return joined.trim();
}
|
|
3518
|
+
/**
 * Append the text of every CHAR element beneath `node` to `parts`.
 *
 * TABLE, PICTURE, SHAPEOBJECT and AUTONUM elements contribute nothing and
 * are not descended into. Any other element is searched recursively, up to
 * MAX_XML_DEPTH2 levels.
 *
 * @param {Node} node Node whose children are visited.
 * @param {string[]} parts Output array of text fragments.
 * @param {number} [depth=0] Current recursion depth.
 */
function collectCharText(node, parts, depth = 0) {
  if (depth > MAX_XML_DEPTH2) return;
  const kids = node.childNodes;
  for (let idx = 0; idx < kids.length; idx++) {
    const child = kids[idx];
    if (child.nodeType !== 1) continue;
    switch (localName(child)) {
      case "CHAR": {
        const chunk = textContent(child);
        if (chunk) parts.push(chunk);
        break;
      }
      case "TABLE":
      case "PICTURE":
      case "SHAPEOBJECT":
      case "AUTONUM":
        // Deliberately ignored.
        break;
      default:
        collectCharText(child, parts, depth + 1);
    }
  }
}
|
|
3535
|
+
/**
 * Convert a TABLE element into a table block and append it to `blocks`.
 *
 * The grid size comes from the `RowCount` / `ColCount` attributes. Tables
 * with missing, non-numeric or non-positive dimensions are ignored; tables
 * above MAX_TABLE_ROWS x MAX_TABLE_COLS are skipped with a TRUNCATED_TABLE
 * warning. Each ROW > CELL is placed at its `RowAddr` / `ColAddr`; cells
 * whose address is NaN, negative or outside the grid are dropped. Positions
 * covered by a span are filled with empty 1x1 cells, as are positions no
 * cell claims.
 *
 * @param {Element} el TABLE element.
 * @param {Array} blocks Output array that the table block is appended to.
 * @param {Map} paraShapeMap Unused here; kept for a uniform parser signature.
 * @param {number} sectionNum Stored as the block's `pageNumber`.
 * @param {Array} warnings Output array for warnings.
 */
function parseTable2(el, blocks, paraShapeMap, sectionNum, warnings) {
  const rowCount = parseInt(_nullishCoalesce(el.getAttribute("RowCount"), () => ( "0")), 10);
  const colCount = parseInt(_nullishCoalesce(el.getAttribute("ColCount"), () => ( "0")), 10);
  // `<= 0` rather than `=== 0`: a negative count must not reach Array.from.
  if (isNaN(rowCount) || isNaN(colCount) || rowCount <= 0 || colCount <= 0) return;
  if (rowCount > MAX_TABLE_ROWS || colCount > MAX_TABLE_COLS) {
    warnings.push({ message: `\uD14C\uC774\uBE14 \uD06C\uAE30 \uCD08\uACFC (${rowCount}x${colCount}) \u2014 \uC2A4\uD0B5`, code: "TRUNCATED_TABLE" });
    return;
  }

  // Pass 1: read every CELL with its address and clamped spans.
  const cells = [];
  const children = el.childNodes;
  for (let i = 0; i < children.length; i++) {
    const rowEl = children[i];
    if (rowEl.nodeType !== 1 || localName(rowEl) !== "ROW") continue;
    const rowCells = rowEl.childNodes;
    for (let j = 0; j < rowCells.length; j++) {
      const cellEl = rowCells[j];
      if (cellEl.nodeType !== 1 || localName(cellEl) !== "CELL") continue;
      const colAddr = parseInt(_nullishCoalesce(cellEl.getAttribute("ColAddr"), () => ( "0")), 10);
      const rowAddr = parseInt(_nullishCoalesce(cellEl.getAttribute("RowAddr"), () => ( "0")), 10);
      const colSpan = Math.min(Math.max(1, parseInt(_nullishCoalesce(cellEl.getAttribute("ColSpan"), () => ( "1")), 10) || 1), MAX_TABLE_COLS);
      const rowSpan = Math.min(Math.max(1, parseInt(_nullishCoalesce(cellEl.getAttribute("RowSpan"), () => ( "1")), 10) || 1), MAX_TABLE_ROWS);
      const cellText = extractCellText(cellEl);
      cells.push({ text: cellText, colSpan, rowSpan, colAddr, rowAddr });
    }
  }
  if (cells.length === 0) return;

  // Pass 2: place cells on a rowCount x colCount grid.
  const grid = Array.from({ length: rowCount }, () => Array(colCount).fill(null));
  for (const cell of cells) {
    const r = cell.rowAddr;
    const c = cell.colAddr;
    // Reject NaN, negative and out-of-range addresses. A negative row would
    // make grid[r] undefined and throw; a negative column would write a
    // stray property and let the span fill overwrite column 0.
    if (isNaN(r) || isNaN(c) || r < 0 || c < 0 || r >= rowCount || c >= colCount) continue;
    grid[r][c] = cell;
    for (let dr = 0; dr < cell.rowSpan; dr++) {
      for (let dc = 0; dc < cell.colSpan; dc++) {
        if (dr === 0 && dc === 0) continue;
        if (r + dr < rowCount && c + dc < colCount) {
          grid[r + dr][c + dc] = { text: "", colSpan: 1, rowSpan: 1 };
        }
      }
    }
  }

  // Any position still unclaimed becomes an empty cell.
  const cellRows = grid.map(
    (row) => row.map((cell) => _nullishCoalesce(cell, () => ( { text: "", colSpan: 1, rowSpan: 1 })))
  );
  const table = _chunkVLSATRNQcjs.buildTable.call(void 0, cellRows);
  blocks.push({ type: "table", table, pageNumber: sectionNum });
}
|
|
3582
|
+
/**
 * Return the text of a table cell: the non-empty fragments gathered by
 * collectCellText, joined with newlines and trimmed.
 *
 * @param {Element} cellEl CELL element.
 * @returns {string}
 */
function extractCellText(cellEl) {
  const fragments = [];
  collectCellText(cellEl, fragments, 0);
  const nonEmpty = fragments.filter(Boolean);
  return nonEmpty.join("\n").trim();
}
|
|
3587
|
+
/**
 * Gather the text fragments of a cell into `parts`.
 *
 * Each P element contributes its paragraph text (when non-empty). A nested
 * TABLE contributes a fixed placeholder string and is not descended into.
 * Other elements are searched recursively; recursion stops once `depth`
 * exceeds 20.
 *
 * @param {Node} node Node whose children are visited.
 * @param {string[]} parts Output array of text fragments.
 * @param {number} depth Current recursion depth.
 */
function collectCellText(node, parts, depth) {
  if (depth > 20) return;
  const kids = node.childNodes;
  for (let idx = 0; idx < kids.length; idx++) {
    const child = kids[idx];
    if (child.nodeType !== 1) continue;
    switch (localName(child)) {
      case "P": {
        const paragraph = extractParagraphText(child);
        if (paragraph) parts.push(paragraph);
        break;
      }
      case "TABLE":
        parts.push("[\uC911\uCCA9 \uD14C\uC774\uBE14]");
        break;
      default:
        collectCellText(child, parts, depth + 1);
    }
  }
}
|
|
3604
|
+
/**
 * Return an element's name with its first `prefix:` removed.
 *
 * Uses `tagName`, falling back to `localName`, then to the empty string.
 *
 * @param {{tagName?: string, localName?: string}} el
 * @returns {string}
 */
function localName(el) {
  const qualified = el.tagName || el.localName || "";
  return qualified.replace(/^[^:]+:/, "");
}
|
|
3607
|
+
/**
 * Find the first direct element child of `parent` whose prefix-stripped name
 * equals `tag`.
 *
 * @param {Node} parent Node whose direct children are searched.
 * @param {string} tag Name to match, without namespace prefix.
 * @returns {Element|null} The matching child, or null when there is none.
 */
function findChild(parent, tag) {
  const kids = parent.childNodes;
  let idx = 0;
  while (idx < kids.length) {
    const candidate = kids[idx];
    const isElement = candidate.nodeType === 1;
    if (isElement && localName(candidate) === tag) {
      return candidate;
    }
    idx += 1;
  }
  return null;
}
|
|
3615
|
+
/**
 * Concatenate, in document order, the values of all text nodes beneath `el`.
 *
 * Only text nodes (nodeType 3) contribute and only element nodes
 * (nodeType 1) are descended into; other node types are ignored. The
 * traversal uses an explicit stack instead of recursion, so deeply nested
 * input cannot overflow the call stack.
 *
 * @param {Node} el Node whose descendants are read.
 * @returns {string}
 */
function textContent(el) {
  const parts = [];
  const pending = [];

  // Push children last-to-first so they pop in document order.
  const pushChildren = (parent) => {
    const kids = parent.childNodes;
    for (let i = kids.length - 1; i >= 0; i--) {
      pending.push(kids[i]);
    }
  };

  pushChildren(el);
  while (pending.length > 0) {
    const node = pending.pop();
    if (node.nodeType === 3) {
      parts.push(node.nodeValue || "");
    } else if (node.nodeType === 1) {
      pushChildren(node);
    }
  }
  return parts.join("");
}
|
|
3628
|
+
/**
 * Count the direct SECTION element children of `body`.
 *
 * @param {Node} body BODY element.
 * @returns {number}
 */
function countSections(body) {
  const kids = body.childNodes;
  let total = 0;
  for (let idx = 0; idx < kids.length; idx++) {
    const child = kids[idx];
    if (child.nodeType !== 1) continue;
    if (localName(child) === "SECTION") total += 1;
  }
  return total;
}
|
|
3637
|
+
|
|
3239
3638
|
// src/form/recognize.ts
|
|
3240
3639
|
var LABEL_KEYWORDS = /* @__PURE__ */ new Set([
|
|
3241
3640
|
"\uC131\uBA85",
|
|
@@ -3469,7 +3868,7 @@ function fillFormFields(blocks, values) {
|
|
|
3469
3868
|
if (block.type !== "table" || !block.table) continue;
|
|
3470
3869
|
for (let r = 0; r < block.table.rows; r++) {
|
|
3471
3870
|
for (let c = 0; c < block.table.cols; c++) {
|
|
3472
|
-
const cell = _optionalChain([block, 'access',
|
|
3871
|
+
const cell = _optionalChain([block, 'access', _73 => _73.table, 'access', _74 => _74.cells, 'access', _75 => _75[r], 'optionalAccess', _76 => _76[c]]);
|
|
3473
3872
|
if (!cell) continue;
|
|
3474
3873
|
const result = fillInCellPatterns(cell.text, normalizedValues, matchedLabels);
|
|
3475
3874
|
if (result) {
|
|
@@ -3508,7 +3907,7 @@ function fillTable(table, values, filled, matchedLabels, patternFilledCells) {
|
|
|
3508
3907
|
const matchKey = findMatchingKey(normalizedCellLabel, values);
|
|
3509
3908
|
if (matchKey === void 0) continue;
|
|
3510
3909
|
const newValue = values.get(matchKey);
|
|
3511
|
-
if (_optionalChain([patternFilledCells, 'optionalAccess',
|
|
3910
|
+
if (_optionalChain([patternFilledCells, 'optionalAccess', _77 => _77.has, 'call', _78 => _78(`${r},${c + 1}`)])) {
|
|
3512
3911
|
valueCell.text = newValue + " " + valueCell.text;
|
|
3513
3912
|
} else {
|
|
3514
3913
|
valueCell.text = newValue;
|
|
@@ -3578,7 +3977,7 @@ async function fillHwpx(hwpxBuffer, values) {
|
|
|
3578
3977
|
const normalizedValues = normalizeValues(values);
|
|
3579
3978
|
const sectionFiles = Object.keys(zip.files).filter((name) => /[Ss]ection\d+\.xml$/i.test(name)).sort();
|
|
3580
3979
|
if (sectionFiles.length === 0) {
|
|
3581
|
-
throw new (0,
|
|
3980
|
+
throw new (0, _chunkVLSATRNQcjs.KordocError)("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
3582
3981
|
}
|
|
3583
3982
|
const xmlParser = new (0, _xmldom.DOMParser)();
|
|
3584
3983
|
const xmlSerializer = new (0, _xmldom.XMLSerializer)();
|
|
@@ -3586,7 +3985,7 @@ async function fillHwpx(hwpxBuffer, values) {
|
|
|
3586
3985
|
const zipEntry = zip.file(sectionPath);
|
|
3587
3986
|
if (!zipEntry) continue;
|
|
3588
3987
|
const rawXml = await zipEntry.async("text");
|
|
3589
|
-
const doc = xmlParser.parseFromString(
|
|
3988
|
+
const doc = xmlParser.parseFromString(_chunkVLSATRNQcjs.stripDtd.call(void 0, rawXml), "text/xml");
|
|
3590
3989
|
if (!doc.documentElement) continue;
|
|
3591
3990
|
let modified = false;
|
|
3592
3991
|
const tables = findAllElements(doc.documentElement, "tbl");
|
|
@@ -3612,10 +4011,10 @@ async function fillHwpx(hwpxBuffer, values) {
|
|
|
3612
4011
|
const trEl = rows[rowIdx];
|
|
3613
4012
|
const cells = findDirectChildren(trEl, "tc");
|
|
3614
4013
|
for (let colIdx = 0; colIdx < cells.length - 1; colIdx++) {
|
|
3615
|
-
const labelText =
|
|
4014
|
+
const labelText = extractCellText2(cells[colIdx]);
|
|
3616
4015
|
if (!isLabelCell(labelText)) continue;
|
|
3617
4016
|
const valueCell = cells[colIdx + 1];
|
|
3618
|
-
const valueText =
|
|
4017
|
+
const valueText = extractCellText2(valueCell);
|
|
3619
4018
|
if (isKeywordLabel(valueText)) continue;
|
|
3620
4019
|
const normalizedCellLabel = normalizeLabel(labelText);
|
|
3621
4020
|
if (!normalizedCellLabel) continue;
|
|
@@ -3640,14 +4039,14 @@ async function fillHwpx(hwpxBuffer, values) {
|
|
|
3640
4039
|
if (rows.length >= 2) {
|
|
3641
4040
|
const headerCells = findDirectChildren(rows[0], "tc");
|
|
3642
4041
|
const allLabels = headerCells.every((cell) => {
|
|
3643
|
-
const t =
|
|
4042
|
+
const t = extractCellText2(cell).trim();
|
|
3644
4043
|
return t.length > 0 && t.length <= 20 && isLabelCell(t);
|
|
3645
4044
|
});
|
|
3646
4045
|
if (allLabels) {
|
|
3647
4046
|
for (let rowIdx = 1; rowIdx < rows.length; rowIdx++) {
|
|
3648
4047
|
const dataCells = findDirectChildren(rows[rowIdx], "tc");
|
|
3649
4048
|
for (let colIdx = 0; colIdx < Math.min(headerCells.length, dataCells.length); colIdx++) {
|
|
3650
|
-
const headerLabel = normalizeLabel(
|
|
4049
|
+
const headerLabel = normalizeLabel(extractCellText2(headerCells[colIdx]));
|
|
3651
4050
|
const matchKey = findMatchingKey(headerLabel, normalizedValues);
|
|
3652
4051
|
if (matchKey === void 0) continue;
|
|
3653
4052
|
if (matchedLabels.has(matchKey)) continue;
|
|
@@ -3655,7 +4054,7 @@ async function fillHwpx(hwpxBuffer, values) {
|
|
|
3655
4054
|
replaceCellText(dataCells[colIdx], newValue);
|
|
3656
4055
|
matchedLabels.add(matchKey);
|
|
3657
4056
|
filled.push({
|
|
3658
|
-
label:
|
|
4057
|
+
label: extractCellText2(headerCells[colIdx]).trim(),
|
|
3659
4058
|
value: newValue,
|
|
3660
4059
|
row: rowIdx,
|
|
3661
4060
|
col: colIdx
|
|
@@ -3697,7 +4096,7 @@ async function fillHwpx(hwpxBuffer, values) {
|
|
|
3697
4096
|
const buffer = await zip.generateAsync({ type: "arraybuffer" });
|
|
3698
4097
|
return { buffer, filled, unmatched };
|
|
3699
4098
|
}
|
|
3700
|
-
function
|
|
4099
|
+
/**
 * Return an element's name with its first `prefix:` removed.
 *
 * Uses `tagName`, falling back to `localName`, then to the empty string.
 *
 * @param {{tagName?: string, localName?: string}} el
 * @returns {string}
 */
function localName2(el) {
  const qualified = el.tagName || el.localName || "";
  return qualified.replace(/^[^:]+:/, "");
}
|
|
3703
4102
|
function findAllElements(node, tagLocalName) {
|
|
@@ -3708,7 +4107,7 @@ function findAllElements(node, tagLocalName) {
|
|
|
3708
4107
|
for (let i = 0; i < children.length; i++) {
|
|
3709
4108
|
const child = children[i];
|
|
3710
4109
|
if (child.nodeType !== 1) continue;
|
|
3711
|
-
if (
|
|
4110
|
+
if (localName2(child) === tagLocalName) result.push(child);
|
|
3712
4111
|
walk(child);
|
|
3713
4112
|
}
|
|
3714
4113
|
};
|
|
@@ -3721,7 +4120,7 @@ function findDirectChildren(parent, tagLocalName) {
|
|
|
3721
4120
|
if (!children) return result;
|
|
3722
4121
|
for (let i = 0; i < children.length; i++) {
|
|
3723
4122
|
const child = children[i];
|
|
3724
|
-
if (child.nodeType === 1 &&
|
|
4123
|
+
if (child.nodeType === 1 && localName2(child) === tagLocalName) {
|
|
3725
4124
|
result.push(child);
|
|
3726
4125
|
}
|
|
3727
4126
|
}
|
|
@@ -3730,12 +4129,12 @@ function findDirectChildren(parent, tagLocalName) {
|
|
|
3730
4129
|
function isInsideTable(el) {
|
|
3731
4130
|
let parent = el.parentNode;
|
|
3732
4131
|
while (parent) {
|
|
3733
|
-
if (parent.nodeType === 1 &&
|
|
4132
|
+
if (parent.nodeType === 1 && localName2(parent) === "tbl") return true;
|
|
3734
4133
|
parent = parent.parentNode;
|
|
3735
4134
|
}
|
|
3736
4135
|
return false;
|
|
3737
4136
|
}
|
|
3738
|
-
function
|
|
4137
|
+
function extractCellText2(tcEl) {
|
|
3739
4138
|
const parts = [];
|
|
3740
4139
|
const walk = (node) => {
|
|
3741
4140
|
const children = node.childNodes;
|
|
@@ -3745,7 +4144,7 @@ function extractCellText(tcEl) {
|
|
|
3745
4144
|
if (child.nodeType === 3) {
|
|
3746
4145
|
parts.push(child.textContent || "");
|
|
3747
4146
|
} else if (child.nodeType === 1) {
|
|
3748
|
-
const tag =
|
|
4147
|
+
const tag = localName2(child);
|
|
3749
4148
|
if (tag === "t") walk(child);
|
|
3750
4149
|
else if (tag === "run" || tag === "r" || tag === "p" || tag === "subList") walk(child);
|
|
3751
4150
|
else if (tag === "tab") parts.push(" ");
|
|
@@ -4444,16 +4843,17 @@ function diffTableCells(a, b) {
|
|
|
4444
4843
|
// src/index.ts
|
|
4445
4844
|
async function parse(input, options) {
|
|
4446
4845
|
let buffer;
|
|
4846
|
+
const opts = typeof input === "string" && !_optionalChain([options, 'optionalAccess', _79 => _79.filePath]) ? { ...options, filePath: input } : options;
|
|
4447
4847
|
if (typeof input === "string") {
|
|
4448
4848
|
try {
|
|
4449
4849
|
const buf = await _promises.readFile.call(void 0, input);
|
|
4450
|
-
buffer =
|
|
4850
|
+
buffer = _chunkVLSATRNQcjs.toArrayBuffer.call(void 0, buf);
|
|
4451
4851
|
} catch (err) {
|
|
4452
4852
|
const msg = err instanceof Error && "code" in err && err.code === "ENOENT" ? `\uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4: ${input}` : `\uD30C\uC77C \uC77D\uAE30 \uC2E4\uD328: ${input}`;
|
|
4453
4853
|
return { success: false, fileType: "unknown", error: msg, code: "PARSE_ERROR" };
|
|
4454
4854
|
}
|
|
4455
4855
|
} else if (Buffer.isBuffer(input)) {
|
|
4456
|
-
buffer =
|
|
4856
|
+
buffer = _chunkVLSATRNQcjs.toArrayBuffer.call(void 0, input);
|
|
4457
4857
|
} else {
|
|
4458
4858
|
buffer = input;
|
|
4459
4859
|
}
|
|
@@ -4464,14 +4864,16 @@ async function parse(input, options) {
|
|
|
4464
4864
|
switch (format) {
|
|
4465
4865
|
case "hwpx": {
|
|
4466
4866
|
const zipFormat = await detectZipFormat(buffer);
|
|
4467
|
-
if (zipFormat === "xlsx") return parseXlsx(buffer,
|
|
4468
|
-
if (zipFormat === "docx") return parseDocx(buffer,
|
|
4469
|
-
return parseHwpx(buffer,
|
|
4867
|
+
if (zipFormat === "xlsx") return parseXlsx(buffer, opts);
|
|
4868
|
+
if (zipFormat === "docx") return parseDocx(buffer, opts);
|
|
4869
|
+
return parseHwpx(buffer, opts);
|
|
4470
4870
|
}
|
|
4471
4871
|
case "hwp":
|
|
4472
|
-
return parseHwp(buffer,
|
|
4872
|
+
return parseHwp(buffer, opts);
|
|
4873
|
+
case "hwpml":
|
|
4874
|
+
return parseHwpml(buffer, opts);
|
|
4473
4875
|
case "pdf":
|
|
4474
|
-
return parsePdf(buffer,
|
|
4876
|
+
return parsePdf(buffer, opts);
|
|
4475
4877
|
default:
|
|
4476
4878
|
return { success: false, fileType: "unknown", error: "\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD30C\uC77C \uD615\uC2DD\uC785\uB2C8\uB2E4.", code: "UNSUPPORTED_FORMAT" };
|
|
4477
4879
|
}
|
|
@@ -4479,23 +4881,23 @@ async function parse(input, options) {
|
|
|
4479
4881
|
async function parseHwpx(buffer, options) {
|
|
4480
4882
|
try {
|
|
4481
4883
|
const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options);
|
|
4482
|
-
return { success: true, fileType: "hwpx", markdown, blocks, metadata, outline, warnings, images: _optionalChain([images, 'optionalAccess',
|
|
4884
|
+
return { success: true, fileType: "hwpx", markdown, blocks, metadata, outline, warnings, images: _optionalChain([images, 'optionalAccess', _80 => _80.length]) ? images : void 0 };
|
|
4483
4885
|
} catch (err) {
|
|
4484
|
-
return { success: false, fileType: "hwpx", error: err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328", code:
|
|
4886
|
+
return { success: false, fileType: "hwpx", error: err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328", code: _chunkVLSATRNQcjs.classifyError.call(void 0, err) };
|
|
4485
4887
|
}
|
|
4486
4888
|
}
|
|
4487
4889
|
async function parseHwp(buffer, options) {
|
|
4488
4890
|
try {
|
|
4489
4891
|
const { markdown, blocks, metadata, outline, warnings, images } = parseHwp5Document(Buffer.from(buffer), options);
|
|
4490
|
-
return { success: true, fileType: "hwp", markdown, blocks, metadata, outline, warnings, images: _optionalChain([images, 'optionalAccess',
|
|
4892
|
+
return { success: true, fileType: "hwp", markdown, blocks, metadata, outline, warnings, images: _optionalChain([images, 'optionalAccess', _81 => _81.length]) ? images : void 0 };
|
|
4491
4893
|
} catch (err) {
|
|
4492
|
-
return { success: false, fileType: "hwp", error: err instanceof Error ? err.message : "HWP \uD30C\uC2F1 \uC2E4\uD328", code:
|
|
4894
|
+
return { success: false, fileType: "hwp", error: err instanceof Error ? err.message : "HWP \uD30C\uC2F1 \uC2E4\uD328", code: _chunkVLSATRNQcjs.classifyError.call(void 0, err) };
|
|
4493
4895
|
}
|
|
4494
4896
|
}
|
|
4495
4897
|
async function parsePdf(buffer, options) {
|
|
4496
4898
|
let parsePdfDocument;
|
|
4497
4899
|
try {
|
|
4498
|
-
const mod = await Promise.resolve().then(() => _interopRequireWildcard(require("./parser-
|
|
4900
|
+
const mod = await Promise.resolve().then(() => _interopRequireWildcard(require("./parser-STAOZMUC.cjs")));
|
|
4499
4901
|
parsePdfDocument = mod.parsePdfDocument;
|
|
4500
4902
|
} catch (e26) {
|
|
4501
4903
|
return {
|
|
@@ -4510,7 +4912,7 @@ async function parsePdf(buffer, options) {
|
|
|
4510
4912
|
return { success: true, fileType: "pdf", markdown, blocks, metadata, outline, warnings, isImageBased };
|
|
4511
4913
|
} catch (err) {
|
|
4512
4914
|
const isImageBased = err instanceof Error && "isImageBased" in err ? true : void 0;
|
|
4513
|
-
return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328", code:
|
|
4915
|
+
return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328", code: _chunkVLSATRNQcjs.classifyError.call(void 0, err), isImageBased };
|
|
4514
4916
|
}
|
|
4515
4917
|
}
|
|
4516
4918
|
async function parseXlsx(buffer, options) {
|
|
@@ -4518,24 +4920,32 @@ async function parseXlsx(buffer, options) {
|
|
|
4518
4920
|
const { markdown, blocks, metadata, warnings } = await parseXlsxDocument(buffer, options);
|
|
4519
4921
|
return { success: true, fileType: "xlsx", markdown, blocks, metadata, warnings };
|
|
4520
4922
|
} catch (err) {
|
|
4521
|
-
return { success: false, fileType: "xlsx", error: err instanceof Error ? err.message : "XLSX \uD30C\uC2F1 \uC2E4\uD328", code:
|
|
4923
|
+
return { success: false, fileType: "xlsx", error: err instanceof Error ? err.message : "XLSX \uD30C\uC2F1 \uC2E4\uD328", code: _chunkVLSATRNQcjs.classifyError.call(void 0, err) };
|
|
4522
4924
|
}
|
|
4523
4925
|
}
|
|
4524
4926
|
async function parseDocx(buffer, options) {
|
|
4525
4927
|
try {
|
|
4526
4928
|
const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options);
|
|
4527
|
-
return { success: true, fileType: "docx", markdown, blocks, metadata, outline, warnings, images: _optionalChain([images, 'optionalAccess',
|
|
4929
|
+
return { success: true, fileType: "docx", markdown, blocks, metadata, outline, warnings, images: _optionalChain([images, 'optionalAccess', _82 => _82.length]) ? images : void 0 };
|
|
4528
4930
|
} catch (err) {
|
|
4529
|
-
return { success: false, fileType: "docx", error: err instanceof Error ? err.message : "DOCX \uD30C\uC2F1 \uC2E4\uD328", code:
|
|
4931
|
+
return { success: false, fileType: "docx", error: err instanceof Error ? err.message : "DOCX \uD30C\uC2F1 \uC2E4\uD328", code: _chunkVLSATRNQcjs.classifyError.call(void 0, err) };
|
|
4932
|
+
}
|
|
4933
|
+
}
|
|
4934
|
+
/**
 * Parse an HWPML buffer and wrap the outcome in a result object.
 *
 * Never rejects for parser failures: any error thrown while parsing is
 * converted into `{ success: false, ... }` with the error message and the
 * code produced by classifyError.
 *
 * @param {ArrayBuffer} buffer Raw file bytes.
 * @param {object} [options] Forwarded unchanged to parseHwpmlDocument.
 * @returns {Promise<object>} Result with `fileType: "hwpml"`.
 */
async function parseHwpml(buffer, options) {
  try {
    const parsed = parseHwpmlDocument(buffer, options);
    return {
      success: true,
      fileType: "hwpml",
      markdown: parsed.markdown,
      blocks: parsed.blocks,
      metadata: parsed.metadata,
      outline: parsed.outline,
      warnings: parsed.warnings
    };
  } catch (err) {
    const error = err instanceof Error ? err.message : "HWPML \uD30C\uC2F1 \uC2E4\uD328";
    return { success: false, fileType: "hwpml", error, code: _chunkVLSATRNQcjs.classifyError.call(void 0, err) };
  }
}
|
|
4532
4942
|
async function fillForm(input, values, outputFormat = "markdown") {
|
|
4533
4943
|
let buffer;
|
|
4534
4944
|
if (typeof input === "string") {
|
|
4535
4945
|
const buf = await _promises.readFile.call(void 0, input);
|
|
4536
|
-
buffer =
|
|
4946
|
+
buffer = _chunkVLSATRNQcjs.toArrayBuffer.call(void 0, buf);
|
|
4537
4947
|
} else if (Buffer.isBuffer(input)) {
|
|
4538
|
-
buffer =
|
|
4948
|
+
buffer = _chunkVLSATRNQcjs.toArrayBuffer.call(void 0, input);
|
|
4539
4949
|
} else {
|
|
4540
4950
|
buffer = input;
|
|
4541
4951
|
}
|
|
@@ -4561,7 +4971,7 @@ async function fillForm(input, values, outputFormat = "markdown") {
|
|
|
4561
4971
|
throw new Error(`\uC11C\uC2DD \uD30C\uC2F1 \uC2E4\uD328: ${parsed.error}`);
|
|
4562
4972
|
}
|
|
4563
4973
|
const fill = fillFormFields(parsed.blocks, values);
|
|
4564
|
-
const markdown =
|
|
4974
|
+
const markdown = _chunkVLSATRNQcjs.blocksToMarkdown.call(void 0, fill.blocks);
|
|
4565
4975
|
if (outputFormat === "hwpx") {
|
|
4566
4976
|
const hwpxBuffer = await markdownToHwpx(markdown);
|
|
4567
4977
|
return { output: hwpxBuffer, format: "hwpx", fill };
|
|
@@ -4591,5 +5001,6 @@ async function fillForm(input, values, outputFormat = "markdown") {
|
|
|
4591
5001
|
|
|
4592
5002
|
|
|
4593
5003
|
|
|
4594
|
-
|
|
5004
|
+
|
|
5005
|
+
exports.VERSION = _chunkVLSATRNQcjs.VERSION; exports.blocksToMarkdown = _chunkVLSATRNQcjs.blocksToMarkdown; exports.compare = compare; exports.detectFormat = detectFormat; exports.detectZipFormat = detectZipFormat; exports.diffBlocks = diffBlocks; exports.extractFormFields = extractFormFields; exports.fillForm = fillForm; exports.fillFormFields = fillFormFields; exports.fillHwpx = fillHwpx; exports.isHwpxFile = isHwpxFile; exports.isLabelCell = isLabelCell; exports.isOldHwpFile = isOldHwpFile; exports.isPdfFile = isPdfFile; exports.isZipFile = isZipFile; exports.markdownToHwpx = markdownToHwpx; exports.parse = parse; exports.parseDocx = parseDocx; exports.parseHwp = parseHwp; exports.parseHwpml = parseHwpml; exports.parseHwpx = parseHwpx; exports.parsePdf = parsePdf; exports.parseXlsx = parseXlsx;
|
|
4595
5006
|
//# sourceMappingURL=index.cjs.map
|