kordoc 1.3.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +111 -118
- package/dist/{chunk-KCGDEP7Q.js → chunk-BWZW234S.js} +575 -63
- package/dist/chunk-BWZW234S.js.map +1 -0
- package/dist/cli.js +15 -3
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +645 -35
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +163 -6
- package/dist/index.d.ts +163 -6
- package/dist/index.js +647 -35
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +216 -13
- package/dist/mcp.js.map +1 -1
- package/dist/provider-JB7SY74K.js +38 -0
- package/dist/provider-JB7SY74K.js.map +1 -0
- package/dist/watch-LIGKH3QS.js +90 -0
- package/dist/watch-LIGKH3QS.js.map +1 -0
- package/package.json +1 -1
- package/dist/chunk-KCGDEP7Q.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -5,6 +5,9 @@ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
|
5
5
|
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
6
6
|
var __getProtoOf = Object.getPrototypeOf;
|
|
7
7
|
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
8
|
+
var __esm = (fn, res) => function __init() {
|
|
9
|
+
return fn && (res = (0, fn[__getOwnPropNames(fn)[0]])(fn = 0)), res;
|
|
10
|
+
};
|
|
8
11
|
var __export = (target, all) => {
|
|
9
12
|
for (var name in all)
|
|
10
13
|
__defProp(target, name, { get: all[name], enumerable: true });
|
|
@@ -27,14 +30,61 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
|
|
|
27
30
|
));
|
|
28
31
|
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
29
32
|
|
|
33
|
+
// src/ocr/provider.ts
|
|
34
|
+
var provider_exports = {};
|
|
35
|
+
__export(provider_exports, {
|
|
36
|
+
ocrPages: () => ocrPages
|
|
37
|
+
});
|
|
38
|
+
/**
 * Runs the caller-supplied OCR provider over the selected pages of a document
 * and collects the recognized text as paragraph blocks.
 *
 * Pages are visited sequentially in ascending order. OCR is best-effort: any
 * failure while loading, rendering or recognizing a single page (including a
 * provider that does not return a string) skips that page only, so the
 * remaining pages are still processed.
 *
 * @param {object} doc - Document exposing `getPage(pageNumber)` (1-based).
 * @param {Function} provider - Invoked as `provider(pngBytes, pageNumber, "image/png")`;
 *   expected to resolve to the recognized text.
 * @param {Set<number>|null|undefined} pageFilter - When set, only the page
 *   numbers it contains are processed.
 * @param {number} effectivePageCount - Highest page number to visit.
 * @returns {Promise<Array<{type: string, text: string}>>} One paragraph block
 *   per page that produced non-blank text.
 */
async function ocrPages(doc, provider, pageFilter, effectivePageCount) {
  const blocks = [];
  for (let pageNumber = 1; pageNumber <= effectivePageCount; pageNumber++) {
    if (pageFilter && !pageFilter.has(pageNumber)) continue;
    try {
      // Loading the page is inside the try as well: an unreadable page must
      // not abort OCR for every other page.
      const page = await doc.getPage(pageNumber);
      const imageData = await renderPageToPng(page);
      const text = await provider(imageData, pageNumber, "image/png");
      const trimmed = text.trim();
      if (trimmed) {
        blocks.push({ type: "paragraph", text: trimmed });
      }
    } catch {
      // Deliberately ignored: a failed page is skipped, not fatal.
    }
  }
  return blocks;
}
|
|
54
|
+
/**
 * Rasterizes a single page to PNG bytes using the optional `canvas` package.
 *
 * The page is rendered at a fixed scale factor of 2 onto a canvas whose
 * dimensions are the viewport size rounded down.
 *
 * @param {object} page - Page exposing `getViewport({ scale })` and
 *   `render({ canvasContext, viewport }).promise`.
 * @returns {Promise<Uint8Array>} PNG-encoded image data.
 * @throws {Error} If the `canvas` package cannot be loaded.
 */
async function renderPageToPng(page) {
  // `canvas` is loaded lazily so it is only required when OCR is actually used.
  const canvasModule = await import("canvas").catch(() => {
    throw new Error("OCR\uC744 \uC0AC\uC6A9\uD558\uB824\uBA74 'canvas' \uD328\uD0A4\uC9C0\uB97C \uC124\uCE58\uD558\uC138\uC694: npm install canvas");
  });
  const { createCanvas } = canvasModule;
  const viewport = page.getViewport({ scale: 2 });
  const width = Math.floor(viewport.width);
  const height = Math.floor(viewport.height);
  const canvas = createCanvas(width, height);
  const canvasContext = canvas.getContext("2d");
  await page.render({ canvasContext, viewport }).promise;
  const pngBuffer = canvas.toBuffer("image/png");
  return new Uint8Array(pngBuffer);
}
|
|
69
|
+
var init_provider = __esm({
|
|
70
|
+
"src/ocr/provider.ts"() {
|
|
71
|
+
"use strict";
|
|
72
|
+
}
|
|
73
|
+
});
|
|
74
|
+
|
|
30
75
|
// src/index.ts
|
|
31
76
|
var index_exports = {};
|
|
32
77
|
__export(index_exports, {
|
|
33
78
|
VERSION: () => VERSION,
|
|
79
|
+
blocksToMarkdown: () => blocksToMarkdown,
|
|
80
|
+
compare: () => compare,
|
|
34
81
|
detectFormat: () => detectFormat,
|
|
82
|
+
diffBlocks: () => diffBlocks,
|
|
83
|
+
extractFormFields: () => extractFormFields,
|
|
35
84
|
isHwpxFile: () => isHwpxFile,
|
|
36
85
|
isOldHwpFile: () => isOldHwpFile,
|
|
37
86
|
isPdfFile: () => isPdfFile,
|
|
87
|
+
markdownToHwpx: () => markdownToHwpx,
|
|
38
88
|
parse: () => parse,
|
|
39
89
|
parseHwp: () => parseHwp,
|
|
40
90
|
parseHwpx: () => parseHwpx,
|
|
@@ -201,7 +251,7 @@ function tableToMarkdown(table) {
|
|
|
201
251
|
}
|
|
202
252
|
|
|
203
253
|
// src/utils.ts
|
|
204
|
-
var VERSION = true ? "1.
|
|
254
|
+
var VERSION = true ? "1.4.0" : "0.0.0-dev";
|
|
205
255
|
var KordocError = class extends Error {
|
|
206
256
|
constructor(message) {
|
|
207
257
|
super(message);
|
|
@@ -212,6 +262,47 @@ function isPathTraversal(name) {
|
|
|
212
262
|
const normalized = name.replace(/\\/g, "/");
|
|
213
263
|
return normalized.includes("..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
|
|
214
264
|
}
|
|
265
|
+
/**
 * Maps a thrown value to an error-code string by looking for known marker
 * substrings in its message.
 *
 * Rules are evaluated in order and the first match wins, so the more specific
 * ZIP-bomb markers are checked before the generic "bomb" / size markers.
 *
 * @param {unknown} err - The caught value.
 * @returns {string} One of "ENCRYPTED", "DRM_PROTECTED", "ZIP_BOMB",
 *   "DECOMPRESSION_BOMB", "IMAGE_BASED_PDF", "NO_SECTIONS", "CORRUPTED" or
 *   "PARSE_ERROR" (also used for anything that is not an `Error`).
 */
function classifyError(err) {
  if (!(err instanceof Error)) return "PARSE_ERROR";
  const message = err.message;
  const hasAny = (...markers) => markers.some((marker) => message.includes(marker));
  const rules = [
    ["ENCRYPTED", () => hasAny("\uC554\uD638\uD654")],
    ["DRM_PROTECTED", () => hasAny("DRM")],
    ["ZIP_BOMB", () => hasAny("ZIP bomb", "ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC", "ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")],
    ["DECOMPRESSION_BOMB", () => hasAny("bomb", "\uD06C\uAE30 \uCD08\uACFC", "\uC555\uCD95 \uD574\uC81C")],
    ["IMAGE_BASED_PDF", () => hasAny("\uC774\uBBF8\uC9C0 \uAE30\uBC18")],
    ["NO_SECTIONS", () => hasAny("\uC139\uC158") && hasAny("\uCC3E\uC744 \uC218 \uC5C6", "\uC5C6\uC74C")],
    ["CORRUPTED", () => hasAny("\uC2DC\uADF8\uB2C8\uCC98", "\uBCF5\uAD6C\uD560 \uC218 \uC5C6")],
  ];
  for (const [code, matches] of rules) {
    if (matches()) return code;
  }
  return "PARSE_ERROR";
}
|
|
277
|
+
|
|
278
|
+
// src/page-range.ts
|
|
279
|
+
/**
 * Converts a page selection into a set of 1-based page numbers.
 *
 * Accepted forms of `spec`:
 *  - an array of numbers (each rounded to the nearest integer), or
 *  - a comma-separated string of page numbers and `start-end` ranges,
 *    e.g. "1, 3-5".
 *
 * Numbers outside `1..maxPages` are dropped and ranges are clamped to that
 * interval. String tokens that are not purely digits or a digit range
 * (e.g. "1-", "2x", "3.7") are ignored rather than being partially parsed.
 *
 * @param {string|number[]|unknown} spec - Requested pages.
 * @param {number} maxPages - Number of pages available.
 * @returns {Set<number>} Selected page numbers; empty when nothing is valid.
 */
function parsePageRange(spec, maxPages) {
  const result = /* @__PURE__ */ new Set();
  if (maxPages <= 0) return result;

  if (Array.isArray(spec)) {
    for (const n of spec) {
      const page = Math.round(n);
      // NaN fails both comparisons, so non-numeric entries are dropped here.
      if (page >= 1 && page <= maxPages) result.add(page);
    }
    return result;
  }

  if (typeof spec !== "string" || spec.trim() === "") return result;

  for (const part of spec.split(",")) {
    const token = part.trim();
    if (!token) continue;

    const rangeMatch = /^(\d+)\s*-\s*(\d+)$/.exec(token);
    if (rangeMatch) {
      const start = Math.max(1, Number.parseInt(rangeMatch[1], 10));
      const end = Math.min(maxPages, Number.parseInt(rangeMatch[2], 10));
      for (let page = start; page <= end; page++) result.add(page);
      continue;
    }

    // Require the whole token to be digits: parseInt alone would accept
    // "1-" or "2x" by silently stopping at the first non-digit.
    if (/^\d+$/.test(token)) {
      const page = Number.parseInt(token, 10);
      if (page >= 1 && page <= maxPages) result.add(page);
    }
  }
  return result;
}
|
|
215
306
|
|
|
216
307
|
// src/hwpx/parser.ts
|
|
217
308
|
var MAX_DECOMPRESS_SIZE = 100 * 1024 * 1024;
|
|
@@ -222,7 +313,7 @@ function clampSpan(val, max) {
|
|
|
222
313
|
function stripDtd(xml) {
|
|
223
314
|
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
224
315
|
}
|
|
225
|
-
async function parseHwpxDocument(buffer) {
|
|
316
|
+
async function parseHwpxDocument(buffer, options) {
|
|
226
317
|
const precheck = precheckZipSize(buffer);
|
|
227
318
|
if (precheck.totalUncompressed > MAX_DECOMPRESS_SIZE) {
|
|
228
319
|
throw new KordocError("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
@@ -240,19 +331,62 @@ async function parseHwpxDocument(buffer) {
|
|
|
240
331
|
if (actualEntryCount > MAX_ZIP_ENTRIES) {
|
|
241
332
|
throw new KordocError("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
242
333
|
}
|
|
334
|
+
const metadata = {};
|
|
335
|
+
await extractHwpxMetadata(zip, metadata);
|
|
243
336
|
const sectionPaths = await resolveSectionPaths(zip);
|
|
244
337
|
if (sectionPaths.length === 0) throw new KordocError("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
338
|
+
metadata.pageCount = sectionPaths.length;
|
|
339
|
+
const pageFilter = options?.pages ? parsePageRange(options.pages, sectionPaths.length) : null;
|
|
245
340
|
let totalDecompressed = 0;
|
|
246
341
|
const blocks = [];
|
|
247
|
-
for (
|
|
248
|
-
|
|
342
|
+
for (let si = 0; si < sectionPaths.length; si++) {
|
|
343
|
+
if (pageFilter && !pageFilter.has(si + 1)) continue;
|
|
344
|
+
const file = zip.file(sectionPaths[si]);
|
|
249
345
|
if (!file) continue;
|
|
250
346
|
const xml = await file.async("text");
|
|
251
347
|
totalDecompressed += xml.length * 2;
|
|
252
348
|
if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
253
349
|
blocks.push(...parseSectionXml(xml));
|
|
254
350
|
}
|
|
255
|
-
|
|
351
|
+
const markdown = blocksToMarkdown(blocks);
|
|
352
|
+
return { markdown, blocks, metadata };
|
|
353
|
+
}
|
|
354
|
+
/**
 * Fills `metadata` from the first metadata XML entry found in the archive.
 *
 * Candidate paths are tried in a fixed order; each is looked up exactly and
 * then by case-insensitive name. Scanning stops as soon as a title or an
 * author is known. The whole step is best-effort: any error is ignored and
 * `metadata` keeps whatever was written up to that point.
 *
 * @param {object} zip - Archive exposing `file(path)` and a `files` map whose
 *   entries have `name` and `async("text")`.
 * @param {object} metadata - Mutated in place.
 * @returns {Promise<void>}
 */
async function extractHwpxMetadata(zip, metadata) {
  const candidatePaths = ["meta.xml", "META-INF/meta.xml", "docProps/core.xml"];
  const locate = (path) => {
    const exact = zip.file(path);
    if (exact) return exact;
    const entries = Object.values(zip.files);
    return entries.find((entry) => entry.name.toLowerCase() === path.toLowerCase()) || null;
  };
  try {
    for (const path of candidatePaths) {
      const entry = locate(path);
      if (!entry) continue;
      const xml = await entry.async("text");
      parseDublinCoreMetadata(xml, metadata);
      if (metadata.title || metadata.author) return;
    }
  } catch {
    // Metadata is optional; failures here must not fail the parse.
  }
}
|
|
367
|
+
/**
 * Reads document metadata out of an XML string and merges it into `metadata`.
 *
 * For every field a list of tag names is tried in order; the first element
 * with non-blank text wins. Values already present (truthy) on `metadata` are
 * never overwritten. Keywords are split on "," or ";" and blank items are
 * dropped.
 *
 * @param {string} xml - Raw XML; any DOCTYPE is stripped before parsing.
 * @param {object} metadata - Mutated in place.
 * @returns {void}
 */
function parseDublinCoreMetadata(xml, metadata) {
  const doc = new import_xmldom.DOMParser().parseFromString(stripDtd(xml), "text/xml");
  if (!doc.documentElement) return;

  const firstText = (tagNames) => {
    for (const tagName of tagNames) {
      const matches = doc.getElementsByTagName(tagName);
      if (matches.length === 0) continue;
      const content = matches[0].textContent?.trim();
      if (content) return content;
    }
    return void 0;
  };

  // Field name on `metadata` -> tag names to try, in priority order.
  const scalarFields = [
    ["title", ["dc:title", "title"]],
    ["author", ["dc:creator", "creator", "cp:lastModifiedBy"]],
    ["description", ["dc:description", "description", "dc:subject", "subject"]],
    ["createdAt", ["dcterms:created", "meta:creation-date"]],
    ["modifiedAt", ["dcterms:modified", "meta:date"]],
  ];
  for (const [field, tagNames] of scalarFields) {
    metadata[field] = metadata[field] || firstText(tagNames);
  }

  const keywordText = firstText(["dc:keyword", "cp:keywords", "meta:keyword"]);
  if (keywordText && !metadata.keywords) {
    metadata.keywords = keywordText
      .split(/[,;]/)
      .map((keyword) => keyword.trim())
      .filter(Boolean);
  }
}
|
|
257
391
|
function precheckZipSize(buffer) {
|
|
258
392
|
try {
|
|
@@ -291,7 +425,7 @@ function extractFromBrokenZip(buffer) {
|
|
|
291
425
|
const data = new Uint8Array(buffer);
|
|
292
426
|
const view = new DataView(buffer);
|
|
293
427
|
let pos = 0;
|
|
294
|
-
const
|
|
428
|
+
const blocks = [];
|
|
295
429
|
let totalDecompressed = 0;
|
|
296
430
|
let entryCount = 0;
|
|
297
431
|
while (pos < data.length - 30) {
|
|
@@ -332,14 +466,14 @@ function extractFromBrokenZip(buffer) {
|
|
|
332
466
|
}
|
|
333
467
|
totalDecompressed += content.length * 2;
|
|
334
468
|
if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new KordocError("\uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC");
|
|
335
|
-
|
|
336
|
-
if (sectionText) texts.push(sectionText);
|
|
469
|
+
blocks.push(...parseSectionXml(content));
|
|
337
470
|
} catch {
|
|
338
471
|
continue;
|
|
339
472
|
}
|
|
340
473
|
}
|
|
341
|
-
if (
|
|
342
|
-
|
|
474
|
+
if (blocks.length === 0) throw new KordocError("\uC190\uC0C1\uB41C HWPX\uC5D0\uC11C \uC139\uC158 \uB370\uC774\uD130\uB97C \uBCF5\uAD6C\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
475
|
+
const markdown = blocksToMarkdown(blocks);
|
|
476
|
+
return { markdown, blocks };
|
|
343
477
|
}
|
|
344
478
|
async function resolveSectionPaths(zip) {
|
|
345
479
|
const manifestPaths = ["Contents/content.hpf", "content.hpf"];
|
|
@@ -612,7 +746,7 @@ var require2 = (0, import_module.createRequire)(import_meta.url);
|
|
|
612
746
|
var CFB = require2("cfb");
|
|
613
747
|
var MAX_SECTIONS = 100;
|
|
614
748
|
var MAX_TOTAL_DECOMPRESS = 100 * 1024 * 1024;
|
|
615
|
-
function parseHwp5Document(buffer) {
|
|
749
|
+
function parseHwp5Document(buffer, options) {
|
|
616
750
|
const cfb = CFB.parse(buffer);
|
|
617
751
|
const headerEntry = CFB.find(cfb, "/FileHeader");
|
|
618
752
|
if (!headerEntry?.content) throw new KordocError("FileHeader \uC2A4\uD2B8\uB9BC \uC5C6\uC74C");
|
|
@@ -621,18 +755,59 @@ function parseHwp5Document(buffer) {
|
|
|
621
755
|
if (header.flags & FLAG_ENCRYPTED) throw new KordocError("\uC554\uD638\uD654\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
|
|
622
756
|
if (header.flags & FLAG_DRM) throw new KordocError("DRM \uBCF4\uD638\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
|
|
623
757
|
const compressed = (header.flags & FLAG_COMPRESSED) !== 0;
|
|
758
|
+
const metadata = {
|
|
759
|
+
version: `${header.versionMajor}.x`
|
|
760
|
+
};
|
|
761
|
+
extractHwp5Metadata(cfb, metadata);
|
|
624
762
|
const sections = findSections(cfb);
|
|
625
763
|
if (sections.length === 0) throw new KordocError("\uC139\uC158 \uC2A4\uD2B8\uB9BC\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
764
|
+
metadata.pageCount = sections.length;
|
|
765
|
+
const pageFilter = options?.pages ? parsePageRange(options.pages, sections.length) : null;
|
|
626
766
|
const blocks = [];
|
|
627
767
|
let totalDecompressed = 0;
|
|
628
|
-
for (
|
|
768
|
+
for (let si = 0; si < sections.length; si++) {
|
|
769
|
+
if (pageFilter && !pageFilter.has(si + 1)) continue;
|
|
770
|
+
const sectionData = sections[si];
|
|
629
771
|
const data = compressed ? decompressStream(Buffer.from(sectionData)) : Buffer.from(sectionData);
|
|
630
772
|
totalDecompressed += data.length;
|
|
631
773
|
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
632
774
|
const records = readRecords(data);
|
|
633
775
|
blocks.push(...parseSection(records));
|
|
634
776
|
}
|
|
635
|
-
|
|
777
|
+
const markdown = blocksToMarkdown(blocks);
|
|
778
|
+
return { markdown, blocks, metadata };
|
|
779
|
+
}
|
|
780
|
+
/**
 * Best-effort extraction of title / author / description from the summary
 * information stream of a compound file.
 *
 * The stream is read as a binary property table: a set count at byte 24, the
 * offset of the first set at byte 44, then `(id, offset)` pairs. Only
 * properties with id 2, 4 or 6 and type tag 30 are used; their bytes are
 * decoded as UTF-8 with trailing NULs removed.
 * NOTE(review): the layout looks like an OLE property-set stream - confirm
 * against the format specification before relying on other fields.
 *
 * Every bounds violation or read error simply ends the extraction; nothing
 * is thrown to the caller.
 *
 * @param {object} cfb - Parsed compound file, passed through to `CFB.find`.
 * @param {object} metadata - Mutated in place.
 * @returns {void}
 */
function extractHwp5Metadata(cfb, metadata) {
  // Property id -> metadata field it populates.
  const fieldByPropId = new Map([
    [2, "title"],
    [4, "author"],
    [6, "description"],
  ]);
  try {
    const entry = CFB.find(cfb, "/HwpSummaryInformation") || CFB.find(cfb, "/SummaryInformation");
    if (!entry?.content) return;

    const bytes = Buffer.from(entry.content);
    if (bytes.length < 48) return;
    if (bytes.readUInt32LE(24) === 0) return;

    const setStart = bytes.readUInt32LE(44);
    if (setStart >= bytes.length - 8) return;

    const propCount = bytes.readUInt32LE(setStart + 4);
    if (propCount === 0 || propCount > 100) return;

    for (let index = 0; index < propCount; index++) {
      const entryAt = setStart + 8 + index * 8;
      if (entryAt + 8 > bytes.length) break;

      const propId = bytes.readUInt32LE(entryAt);
      const valueAt = setStart + bytes.readUInt32LE(entryAt + 4);
      if (valueAt + 8 > bytes.length) continue;

      const field = fieldByPropId.get(propId);
      if (field === void 0) continue;
      if (bytes.readUInt32LE(valueAt) !== 30) continue;

      const byteLength = bytes.readUInt32LE(valueAt + 4);
      const textStart = valueAt + 8;
      if (byteLength === 0 || byteLength > 1e4 || textStart + byteLength > bytes.length) continue;

      const value = bytes
        .subarray(textStart, textStart + byteLength)
        .toString("utf8")
        .replace(/\0+$/, "")
        .trim();
      if (!value) continue;
      metadata[field] = value;
    }
  } catch {
    // Metadata is optional; malformed streams are ignored.
  }
}
|
|
637
812
|
function findSections(cfb) {
|
|
638
813
|
const sections = [];
|
|
@@ -794,7 +969,7 @@ var import_pdf = require("pdfjs-dist/legacy/build/pdf.mjs");
|
|
|
794
969
|
import_pdf.GlobalWorkerOptions.workerSrc = "";
|
|
795
970
|
var MAX_PAGES = 5e3;
|
|
796
971
|
var MAX_TOTAL_TEXT = 100 * 1024 * 1024;
|
|
797
|
-
async function parsePdfDocument(buffer) {
|
|
972
|
+
async function parsePdfDocument(buffer, options) {
|
|
798
973
|
const doc = await (0, import_pdf.getDocument)({
|
|
799
974
|
data: new Uint8Array(buffer),
|
|
800
975
|
useSystemFonts: true,
|
|
@@ -803,12 +978,17 @@ async function parsePdfDocument(buffer) {
|
|
|
803
978
|
}).promise;
|
|
804
979
|
try {
|
|
805
980
|
const pageCount = doc.numPages;
|
|
806
|
-
if (pageCount === 0) return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4." };
|
|
981
|
+
if (pageCount === 0) return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.", blocks: [] };
|
|
982
|
+
const metadata = { pageCount };
|
|
983
|
+
await extractPdfMetadata(doc, metadata);
|
|
807
984
|
const pageTexts = [];
|
|
985
|
+
const blocks = [];
|
|
808
986
|
let totalChars = 0;
|
|
809
987
|
let totalTextBytes = 0;
|
|
810
988
|
const effectivePageCount = Math.min(pageCount, MAX_PAGES);
|
|
989
|
+
const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
|
|
811
990
|
for (let i = 1; i <= effectivePageCount; i++) {
|
|
991
|
+
if (pageFilter && !pageFilter.has(i)) continue;
|
|
812
992
|
const page = await doc.getPage(i);
|
|
813
993
|
const tc = await page.getTextContent();
|
|
814
994
|
const pageText = extractPageContent(tc.items);
|
|
@@ -816,18 +996,54 @@ async function parsePdfDocument(buffer) {
|
|
|
816
996
|
totalTextBytes += pageText.length * 2;
|
|
817
997
|
if (totalTextBytes > MAX_TOTAL_TEXT) throw new KordocError("\uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uD06C\uAE30 \uCD08\uACFC");
|
|
818
998
|
pageTexts.push(pageText);
|
|
999
|
+
blocks.push({ type: "paragraph", text: pageText });
|
|
819
1000
|
}
|
|
820
|
-
|
|
821
|
-
|
|
1001
|
+
const parsedPageCount = pageFilter ? pageFilter.size : effectivePageCount;
|
|
1002
|
+
if (totalChars / Math.max(parsedPageCount, 1) < 10) {
|
|
1003
|
+
if (options?.ocr) {
|
|
1004
|
+
try {
|
|
1005
|
+
const { ocrPages: ocrPages2 } = await Promise.resolve().then(() => (init_provider(), provider_exports));
|
|
1006
|
+
const ocrBlocks = await ocrPages2(doc, options.ocr, pageFilter, effectivePageCount);
|
|
1007
|
+
if (ocrBlocks.length > 0) {
|
|
1008
|
+
const ocrMarkdown = ocrBlocks.map((b) => b.text || "").filter(Boolean).join("\n\n");
|
|
1009
|
+
return { success: true, fileType: "pdf", markdown: ocrMarkdown, pageCount: parsedPageCount, blocks: ocrBlocks, metadata, isImageBased: true };
|
|
1010
|
+
}
|
|
1011
|
+
} catch {
|
|
1012
|
+
}
|
|
1013
|
+
}
|
|
1014
|
+
return { success: false, fileType: "pdf", pageCount, isImageBased: true, error: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`, code: "IMAGE_BASED_PDF" };
|
|
822
1015
|
}
|
|
823
1016
|
let markdown = pageTexts.filter((t) => t.trim()).join("\n\n");
|
|
824
1017
|
markdown = cleanPdfText(markdown);
|
|
825
|
-
return { success: true, fileType: "pdf", markdown, pageCount:
|
|
1018
|
+
return { success: true, fileType: "pdf", markdown, pageCount: parsedPageCount, blocks, metadata };
|
|
826
1019
|
} finally {
|
|
827
1020
|
await doc.destroy().catch(() => {
|
|
828
1021
|
});
|
|
829
1022
|
}
|
|
830
1023
|
}
|
|
1024
|
+
/**
 * Copies selected entries of the document's info dictionary into `metadata`.
 *
 * String entries are trimmed and skipped when blank. `Keywords` is split on
 * "," or ";". `CreationDate` / `ModDate` are converted with `parsePdfDate`
 * whenever they are strings. Any failure is ignored.
 *
 * @param {object} doc - Document exposing `getMetadata()` resolving to an
 *   object with an optional `info` record.
 * @param {object} metadata - Mutated in place.
 * @returns {Promise<void>}
 */
async function extractPdfMetadata(doc, metadata) {
  const trimmedOrEmpty = (value) => (typeof value === "string" ? value.trim() : "");
  // Info dictionary key -> metadata field.
  const textFields = [
    ["Title", "title"],
    ["Author", "author"],
    ["Creator", "creator"],
    ["Subject", "description"],
  ];
  try {
    const result = await doc.getMetadata();
    const info = result?.info;
    if (!info) return;

    for (const [infoKey, field] of textFields) {
      const value = trimmedOrEmpty(info[infoKey]);
      if (value) metadata[field] = value;
    }

    if (trimmedOrEmpty(info.Keywords)) {
      metadata.keywords = info.Keywords
        .split(/[,;]/)
        .map((keyword) => keyword.trim())
        .filter(Boolean);
    }

    if (typeof info.CreationDate === "string") metadata.createdAt = parsePdfDate(info.CreationDate);
    if (typeof info.ModDate === "string") metadata.modifiedAt = parsePdfDate(info.ModDate);
  } catch {
    // Metadata is optional; failures here must not fail the parse.
  }
}
|
|
1041
|
+
/**
 * Converts a PDF date string (`D:YYYYMMDDHHmmSSOHH'mm'`) into an ISO 8601
 * timestamp.
 *
 * Everything after the year is optional; missing month/day default to "01"
 * and missing time fields to "00". When a time of day is present and the
 * string carries a UTC relationship (`Z`, or `+HH'mm'` / `-HH'mm'`), it is
 * appended as `Z` or `+HH:mm`, so the result denotes an unambiguous instant
 * instead of silently dropping the offset. Without offset information the
 * result has no zone designator.
 *
 * @param {string} dateStr - Raw date value from the PDF info dictionary.
 * @returns {string|undefined} ISO 8601 timestamp, or `undefined` when the
 *   string contains no `D:YYYY` date.
 */
function parsePdfDate(dateStr) {
  const m = dateStr.match(
    /D:(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?(?:(Z)|([+-])(\d{2})'?(\d{2})?)?/
  );
  if (!m) return void 0;
  const [, year, month = "01", day = "01", hour = "00", min = "00", sec = "00"] = m;
  const [utcMarker, sign, offsetHour, offsetMin = "00"] = m.slice(7);

  let zone = "";
  // An offset is only meaningful when a time of day was actually given.
  if (m[4] !== void 0) {
    if (utcMarker) zone = "Z";
    else if (sign) zone = `${sign}${offsetHour}:${offsetMin}`;
  }
  return `${year}-${month}-${day}T${hour}:${min}:${sec}${zone}`;
}
|
|
831
1047
|
function extractPageContent(rawItems) {
|
|
832
1048
|
const items = normalizeItems(rawItems);
|
|
833
1049
|
if (items.length === 0) return "";
|
|
@@ -1100,53 +1316,447 @@ function mergeKoreanLines(text) {
|
|
|
1100
1316
|
return result.join("\n");
|
|
1101
1317
|
}
|
|
1102
1318
|
|
|
1319
|
+
// src/diff/text-diff.ts
|
|
1320
|
+
/**
 * Similarity of two strings in the range 0..1, derived from edit distance.
 *
 * Identical inputs score 1, an empty/falsy side scores 0, otherwise the
 * score is `1 - distance / length of the longer string`.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number}
 */
function similarity(a, b) {
  if (a === b) return 1;
  if (!a || !b) return 0;
  const longest = Math.max(a.length, b.length);
  return longest === 0 ? 1 : 1 - levenshtein(a, b) / longest;
}
|
|
1327
|
+
/**
 * Similarity of two strings after both have been passed through `normalize`.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} Result of `similarity` on the normalized inputs.
 */
function normalizedSimilarity(a, b) {
  const left = normalize(a);
  const right = normalize(b);
  return similarity(left, right);
}
|
|
1330
|
+
/**
 * Collapses every run of whitespace to a single space and removes leading
 * and trailing whitespace.
 *
 * @param {string} s
 * @returns {string}
 */
function normalize(s) {
  // Splitting on whitespace runs leaves empty strings only at the edges.
  return s.split(/\s+/).filter(Boolean).join(" ");
}
|
|
1333
|
+
/**
 * Edit distance (insert / delete / substitute, each cost 1) between two
 * strings, compared per UTF-16 code unit.
 *
 * Only two rows of the distance table are kept, sized by the shorter input.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number}
 */
function levenshtein(a, b) {
  const [shorter, longer] = a.length > b.length ? [b, a] : [a, b];
  const width = shorter.length;
  let previous = Array.from({ length: width + 1 }, (_, index) => index);
  let current = new Array(width + 1);

  for (let row = 1; row <= longer.length; row++) {
    const target = longer[row - 1];
    current[0] = row;
    for (let col = 1; col <= width; col++) {
      const diagonal = previous[col - 1];
      current[col] = shorter[col - 1] === target
        ? diagonal
        : 1 + Math.min(diagonal, previous[col], current[col - 1]);
    }
    // Reuse the two row buffers instead of allocating a new one per row.
    const recycled = previous;
    previous = current;
    current = recycled;
  }
  return previous[width];
}
|
|
1353
|
+
|
|
1354
|
+
// src/diff/compare.ts
|
|
1355
|
+
var SIMILARITY_THRESHOLD = 0.4;
|
|
1356
|
+
/**
 * Parses two documents concurrently and returns the block-level diff between
 * them.
 *
 * @param {*} bufferA - First document, passed to `parse`.
 * @param {*} bufferB - Second document, passed to `parse`.
 * @param {object} [options] - Forwarded unchanged to both `parse` calls.
 * @returns {Promise<object>} Result of `diffBlocks` on the two block lists.
 * @throws {Error} When either document fails to parse; document A is
 *   reported first if both fail.
 */
async function compare(bufferA, bufferB, options) {
  const [first, second] = await Promise.all([
    parse(bufferA, options),
    parse(bufferB, options),
  ]);

  let failure = null;
  if (!first.success) {
    failure = `\uBB38\uC11CA \uD30C\uC2F1 \uC2E4\uD328: ${first.error}`;
  } else if (!second.success) {
    failure = `\uBB38\uC11CB \uD30C\uC2F1 \uC2E4\uD328: ${second.error}`;
  }
  if (failure !== null) throw new Error(failure);

  return diffBlocks(first.blocks, second.blocks);
}
|
|
1365
|
+
/**
 * Produces a block-by-block diff of two block lists.
 *
 * Blocks are first paired with `alignBlocks`. A pair scoring at least 0.99
 * is reported as "unchanged" (with similarity 1), any other pair as
 * "modified"; one-sided entries become "removed" or "added". Modified
 * table pairs additionally carry `cellDiffs`.
 *
 * @param {object[]} blocksA - Blocks of the "before" document.
 * @param {object[]} blocksB - Blocks of the "after" document.
 * @returns {{stats: {added: number, removed: number, modified: number, unchanged: number}, diffs: object[]}}
 */
function diffBlocks(blocksA, blocksB) {
  const stats = { added: 0, removed: 0, modified: 0, unchanged: 0 };
  const diffs = [];
  // Each diff entry's `type` doubles as the name of its counter in `stats`.
  const record = (entry) => {
    diffs.push(entry);
    stats[entry.type]++;
  };

  for (const [before, after] of alignBlocks(blocksA, blocksB)) {
    if (before && after) {
      const score = blockSimilarity(before, after);
      if (score >= 0.99) {
        record({ type: "unchanged", before, after, similarity: 1 });
        continue;
      }
      const entry = { type: "modified", before, after, similarity: score };
      const bothTables = before.type === "table" && after.type === "table" && before.table && after.table;
      if (bothTables) {
        entry.cellDiffs = diffTableCells(before.table, after.table);
      }
      record(entry);
    } else if (before) {
      record({ type: "removed", before });
    } else if (after) {
      record({ type: "added", after });
    }
  }
  return { stats, diffs };
}
|
|
1393
|
+
function alignBlocks(a, b) {
|
|
1394
|
+
const m = a.length, n = b.length;
|
|
1395
|
+
if (m * n > 1e7) return fallbackAlign(a, b);
|
|
1396
|
+
const simCache = /* @__PURE__ */ new Map();
|
|
1397
|
+
const getSim = (i2, j2) => {
|
|
1398
|
+
const key = `${i2},${j2}`;
|
|
1399
|
+
let v = simCache.get(key);
|
|
1400
|
+
if (v === void 0) {
|
|
1401
|
+
v = blockSimilarity(a[i2], b[j2]);
|
|
1402
|
+
simCache.set(key, v);
|
|
1403
|
+
}
|
|
1404
|
+
return v;
|
|
1405
|
+
};
|
|
1406
|
+
const dp = Array.from({ length: m + 1 }, () => new Array(n + 1).fill(0));
|
|
1407
|
+
for (let i2 = 1; i2 <= m; i2++) {
|
|
1408
|
+
for (let j2 = 1; j2 <= n; j2++) {
|
|
1409
|
+
if (getSim(i2 - 1, j2 - 1) >= SIMILARITY_THRESHOLD) {
|
|
1410
|
+
dp[i2][j2] = dp[i2 - 1][j2 - 1] + 1;
|
|
1411
|
+
} else {
|
|
1412
|
+
dp[i2][j2] = Math.max(dp[i2 - 1][j2], dp[i2][j2 - 1]);
|
|
1413
|
+
}
|
|
1414
|
+
}
|
|
1415
|
+
}
|
|
1416
|
+
const pairs = [];
|
|
1417
|
+
let i = m, j = n;
|
|
1418
|
+
while (i > 0 && j > 0) {
|
|
1419
|
+
if (getSim(i - 1, j - 1) >= SIMILARITY_THRESHOLD && dp[i][j] === dp[i - 1][j - 1] + 1) {
|
|
1420
|
+
pairs.push([i - 1, j - 1]);
|
|
1421
|
+
i--;
|
|
1422
|
+
j--;
|
|
1423
|
+
} else if (dp[i - 1][j] >= dp[i][j - 1]) {
|
|
1424
|
+
i--;
|
|
1425
|
+
} else {
|
|
1426
|
+
j--;
|
|
1427
|
+
}
|
|
1428
|
+
}
|
|
1429
|
+
pairs.reverse();
|
|
1430
|
+
const result = [];
|
|
1431
|
+
let ai = 0, bi = 0;
|
|
1432
|
+
for (const [pi, pj] of pairs) {
|
|
1433
|
+
while (ai < pi) result.push([a[ai++], null]);
|
|
1434
|
+
while (bi < pj) result.push([null, b[bi++]]);
|
|
1435
|
+
result.push([a[ai++], b[bi++]]);
|
|
1436
|
+
}
|
|
1437
|
+
while (ai < m) result.push([a[ai++], null]);
|
|
1438
|
+
while (bi < n) result.push([null, b[bi++]]);
|
|
1439
|
+
return result;
|
|
1440
|
+
}
|
|
1441
|
+
/**
 * Pairs blocks purely by position: index i of `a` with index i of `b`.
 * A missing (or falsy) entry on either side is represented as `null`.
 *
 * @param {object[]} a
 * @param {object[]} b
 * @returns {Array<[object|null, object|null]>} One pair per index up to the
 *   longer list's length.
 */
function fallbackAlign(a, b) {
  const length = Math.max(a.length, b.length);
  return Array.from({ length }, (_, index) => [a[index] || null, b[index] || null]);
}
|
|
1449
|
+
/**
 * Similarity score of two blocks.
 *
 * Blocks of different types score 0. Paragraphs are compared by text
 * (missing text counts as ""), tables by `tableSimilarity` when both carry
 * table data. Every other case scores 0.
 *
 * @param {object} a
 * @param {object} b
 * @returns {number}
 */
function blockSimilarity(a, b) {
  if (a.type !== b.type) return 0;
  switch (a.type) {
    case "paragraph":
      return normalizedSimilarity(a.text || "", b.text || "");
    case "table":
      return a.table && b.table ? tableSimilarity(a.table, b.table) : 0;
    default:
      return 0;
  }
}
|
|
1459
|
+
/**
 * Similarity score of two tables: 30% from how close their cell counts
 * (rows x cols) are, 70% from the similarity of their concatenated cell
 * text.
 *
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} a
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} b
 * @returns {number}
 */
function tableSimilarity(a, b) {
  const areaA = a.rows * a.cols;
  const areaB = b.rows * b.cols;
  // The trailing 1 keeps the divisor non-zero for two empty tables.
  const dimSim = 1 - Math.abs(areaA - areaB) / Math.max(areaA, areaB, 1);
  const joinedText = (table) => table.cells.flat().map((cell) => cell.text).join(" ");
  const contentSim = normalizedSimilarity(joinedText(a), joinedText(b));
  return dimSim * 0.3 + contentSim * 0.7;
}
|
|
1466
|
+
/**
 * Cell-by-cell comparison of two tables over the union of their dimensions.
 *
 * A position that exists only in `b` is "added", only in `a` is "removed";
 * positions present in both are "unchanged" when the texts are strictly
 * equal and "modified" otherwise.
 *
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} a
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} b
 * @returns {Array<Array<{type: string, before: (string|undefined), after: (string|undefined)}>>}
 */
function diffTableCells(a, b) {
  const rowCount = Math.max(a.rows, b.rows);
  const colCount = Math.max(a.cols, b.cols);

  // Text at (r, c), or undefined when the position is outside the table.
  const textAt = (table, r, c) => (r < table.rows && c < table.cols ? table.cells[r][c].text : void 0);
  const classify = (before, after) => {
    if (before === void 0) return "added";
    if (after === void 0) return "removed";
    return before === after ? "unchanged" : "modified";
  };

  return Array.from({ length: rowCount }, (_row, r) =>
    Array.from({ length: colCount }, (_col, c) => {
      const before = textAt(a, r, c);
      const after = textAt(b, r, c);
      return { type: classify(before, after), before, after };
    })
  );
}
|
|
1486
|
+
|
|
1487
|
+
// src/form/recognize.ts
|
|
1488
|
+
// Korean words that mark a table cell as a form label when they occur
// anywhere in the cell text (kept as escapes, exactly as emitted by the build).
var LABEL_KEYWORDS = /* @__PURE__ */ new Set([
  "\uC131\uBA85",
  "\uC774\uB984",
  "\uC8FC\uC18C",
  "\uC804\uD654",
  "\uC804\uD654\uBC88\uD638",
  "\uD734\uB300\uD3F0",
  "\uD578\uB4DC\uD3F0",
  "\uC5F0\uB77D\uCC98",
  "\uC0DD\uB144\uC6D4\uC77C",
  "\uC8FC\uBBFC\uB4F1\uB85D\uBC88\uD638",
  "\uC18C\uC18D",
  "\uC9C1\uC704",
  "\uC9C1\uAE09",
  "\uBD80\uC11C",
  "\uC774\uBA54\uC77C",
  "\uD329\uC2A4",
  "\uD559\uAD50",
  "\uD559\uB144",
  "\uBC18",
  "\uBC88\uD638",
  "\uC2E0\uCCAD\uC778",
  "\uB300\uD45C\uC790",
  "\uB2F4\uB2F9\uC790",
  "\uC791\uC131\uC790",
  "\uD655\uC778\uC790",
  "\uC2B9\uC778\uC790",
  "\uC77C\uC2DC",
  "\uB0A0\uC9DC",
  "\uAE30\uAC04",
  "\uC7A5\uC18C",
  "\uBAA9\uC801",
  "\uC0AC\uC720",
  "\uBE44\uACE0",
  "\uAE08\uC561",
  "\uC218\uB7C9",
  "\uB2E8\uAC00",
  "\uD569\uACC4",
  "\uACC4",
  "\uC18C\uACC4"
]);

/**
 * Heuristically decides whether a table cell's text is a form label.
 *
 * A cell counts as a label when its trimmed text is 1-30 characters long and
 * one of the following holds:
 *  - it contains any entry of `LABEL_KEYWORDS`;
 *  - it is 2-8 characters of Hangul, whitespace, parentheses, "·" or ":";
 *  - it consists of Hangul/Latin letters and whitespace followed by a
 *    trailing colon, either ASCII ":" or full-width "\uFF1A".
 *
 * @param {string} text - Raw cell text.
 * @returns {boolean}
 */
function isLabelCell(text) {
  const trimmed = text.trim();
  if (!trimmed || trimmed.length > 30) return false;
  for (const kw of LABEL_KEYWORDS) {
    if (trimmed.includes(kw)) return true;
  }
  // The character class admits no digits, so no separate digit check is needed.
  if (/^[가-힣\s()·:]{2,8}$/.test(trimmed)) return true;
  // Accept both colon forms; the class previously listed ":" twice, which
  // left the full-width colon unmatched.
  if (/^[가-힣A-Za-z\s]+[:\uFF1A]$/.test(trimmed)) return true;
  return false;
}
|
|
1539
|
+
/**
 * Collects label/value form fields from a list of blocks.
 *
 * Table-derived fields come first (in block order), followed by fields found
 * inline in paragraph text. The confidence is the share of tables that
 * yielded at least one field; with no tables at all it is 0.3 when any
 * inline field was found and 0 otherwise.
 *
 * @param {object[]} blocks - Parsed document blocks.
 * @returns {{fields: object[], confidence: number}}
 */
function extractFormFields(blocks) {
  const tableBlocks = blocks.filter((block) => block.type === "table" && block.table);
  const fieldsPerTable = tableBlocks.map((block) => extractFromTable(block.table));
  const formTables = fieldsPerTable.filter((tableFields) => tableFields.length > 0).length;

  const inlineFields = blocks
    .filter((block) => block.type === "paragraph" && block.text)
    .flatMap((block) => extractInlineFields(block.text));

  const fields = [...fieldsPerTable.flat(), ...inlineFields];

  let confidence;
  if (tableBlocks.length > 0) {
    confidence = formTables / tableBlocks.length;
  } else {
    confidence = fields.length > 0 ? 0.3 : 0;
  }
  return { fields, confidence: Math.min(confidence, 1) };
}
|
|
1561
|
+
/**
 * Extracts label/value pairs from a single table.
 *
 * Strategy 1 (side by side): every cell recognized by `isLabelCell` whose
 * right-hand neighbour is non-blank yields a field; a trailing colon (ASCII
 * ":" or full-width "\uFF1A") is removed from the label.
 *
 * Strategy 2 (header row), used only when strategy 1 found nothing and the
 * table is at least 2x2: if every cell of the first row is non-blank and at
 * most 20 characters long, that row supplies the labels for each non-blank
 * cell below it.
 *
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} table
 * @returns {Array<{label: string, value: string, row: number, col: number}>}
 *   `row` / `col` locate the label cell (strategy 1) or the value cell
 *   (strategy 2).
 */
function extractFromTable(table) {
  const fields = [];

  if (table.cols >= 2) {
    for (let r = 0; r < table.rows; r++) {
      for (let c = 0; c < table.cols - 1; c++) {
        const labelCell = table.cells[r][c];
        const valueCell = table.cells[r][c + 1];
        const value = valueCell.text.trim();
        if (isLabelCell(labelCell.text) && value) {
          fields.push({
            // Both colon forms are stripped; the class previously listed ":"
            // twice and left a trailing full-width colon in the label.
            label: labelCell.text.trim().replace(/[:\uFF1A]\s*$/, ""),
            value,
            row: r,
            col: c
          });
        }
      }
    }
  }

  if (fields.length === 0 && table.rows >= 2 && table.cols >= 2) {
    const headerRow = table.cells[0];
    const allLabels = headerRow.every((cell) => {
      const headerText = cell.text.trim();
      return headerText.length > 0 && headerText.length <= 20;
    });
    if (allLabels) {
      for (let r = 1; r < table.rows; r++) {
        for (let c = 0; c < table.cols; c++) {
          const label = headerRow[c].text.trim();
          const value = table.cells[r][c].text.trim();
          if (label && value) {
            fields.push({ label, value, row: r, col: c });
          }
        }
      }
    }
  }
  return fields;
}
|
|
1599
|
+
/**
 * Find inline "label: value" fragments in paragraph text.
 *
 * A label is 2-10 Hangul or Latin letters; the separator is an ASCII colon or
 * the fullwidth colon "\uFF1A"; the value runs for up to 100 characters and
 * stops at a newline, comma or semicolon. Inline fields carry row/col of -1
 * because they do not come from a table.
 *
 * @param {string} text - Paragraph text to scan.
 * @returns {Array<{label: string, value: string, row: number, col: number}>}
 */
function extractInlineFields(text) {
  const fields = [];
  // The separator class previously contained the ASCII colon twice, so the
  // fullwidth colon was never matched.
  const pattern = /([가-힣A-Za-z]{2,10})\s*[:\uFF1A]\s*([^\n,;]{1,100})/g;
  let match;
  while ((match = pattern.exec(text)) !== null) {
    const label = match[1].trim();
    const value = match[2].trim();
    // A whitespace-only capture trims to "" and is dropped.
    if (value) {
      fields.push({ label, value, row: -1, col: -1 });
    }
  }
  return fields;
}
|
|
1612
|
+
|
|
1613
|
+
// src/hwpx/generator.ts
|
|
1614
|
+
var import_jszip2 = __toESM(require("jszip"), 1);
|
|
1615
|
+
// Namespace URI that blocksToSectionXml binds to both the hs: and hp: prefixes of the generated section XML.
var HWPML_NS = "http://www.hancom.co.kr/hwpml/2016/HwpMl";
|
|
1616
|
+
/**
 * Convert Markdown text into an HWPX archive.
 *
 * The Markdown is parsed into blocks, rendered as section XML, and zipped
 * together with an uncompressed "mimetype" entry and a manifest.
 *
 * @param {string} markdown - Markdown source.
 * @returns {Promise<ArrayBuffer>} The generated archive bytes.
 */
async function markdownToHwpx(markdown) {
  const parsedBlocks = parseMarkdownToBlocks(markdown);
  const sectionXml = blocksToSectionXml(parsedBlocks);

  const archive = new import_jszip2.default();
  // The mimetype entry is added first and stored without compression.
  archive.file("mimetype", "application/hwp+zip", { compression: "STORE" });
  archive.file("Contents/content.hpf", generateManifest());
  archive.file("Contents/section0.xml", sectionXml);

  const bytes = await archive.generateAsync({ type: "arraybuffer" });
  return bytes;
}
|
|
1625
|
+
/**
 * Parse a small Markdown subset into heading, table and paragraph blocks.
 *
 * - Blank lines are skipped.
 * - "# ..." through "###### ..." become { type: "heading", text, level }.
 * - Consecutive lines starting with "|" (after optional indentation) become
 *   one { type: "table", rows } block; delimiter rows such as "|---|:--:|"
 *   are dropped.
 * - Every other line becomes its own { type: "paragraph", text }.
 *
 * Both LF and CRLF line endings are accepted.
 *
 * @param {string} md - Markdown source.
 * @returns {Array<object>} Blocks in document order.
 */
function parseMarkdownToBlocks(md) {
  // Split on CRLF as well: a trailing "\r" would otherwise defeat the
  // heading regex, because "." and "$" do not match past a carriage return.
  const lines = md.split(/\r?\n/);
  const blocks = [];

  // Delimiter row: only pipes, dashes, colons and whitespace, with at least
  // one "---" run. Expects an already-trimmed row so indented tables work.
  const isDelimiterRow = (row) =>
    /^\|[\s\-:]+\|/.test(row) && row.includes("---") && /^[\s|:\-]+$/.test(row);

  // Split a trimmed row that starts with "|" into trimmed cell texts. The
  // trailing pipe is optional, so "| a | b" keeps its last cell.
  const splitRow = (row) => {
    let inner = row.slice(1);
    if (inner === "") return [];
    if (inner.endsWith("|")) inner = inner.slice(0, -1);
    return inner.split("|").map((c) => c.trim());
  };

  let i = 0;
  while (i < lines.length) {
    const line = lines[i];
    if (!line.trim()) {
      i++;
      continue;
    }
    const headingMatch = line.match(/^(#{1,6})\s+(.+)$/);
    if (headingMatch) {
      blocks.push({ type: "heading", text: headingMatch[2].trim(), level: headingMatch[1].length });
      i++;
      continue;
    }
    if (line.trimStart().startsWith("|")) {
      const tableRows = [];
      while (i < lines.length && lines[i].trimStart().startsWith("|")) {
        const row = lines[i].trim();
        i++;
        if (isDelimiterRow(row)) continue;
        const cells = splitRow(row);
        if (cells.length > 0) tableRows.push(cells);
      }
      if (tableRows.length > 0) {
        blocks.push({ type: "table", rows: tableRows });
      }
      continue;
    }
    blocks.push({ type: "paragraph", text: line.trim() });
    i++;
  }
  return blocks;
}
|
|
1663
|
+
/**
 * Escape text for safe inclusion in XML character data or a double-quoted
 * attribute value.
 *
 * "&" is replaced first so the ampersands introduced by the later
 * replacements are not escaped a second time.
 *
 * @param {string} text - Raw text.
 * @returns {string} Text with &, <, > and " replaced by entity references.
 */
function escapeXml(text) {
  return text
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;");
}
|
|
1666
|
+
/**
 * Wrap text in a single-run paragraph element.
 *
 * @param {string} text - Paragraph text; passed through escapeXml.
 * @returns {string} The <hp:p> element markup.
 */
function generateParagraph(text) {
  const opening = "<hp:p><hp:run><hp:t>";
  const closing = "</hp:t></hp:run></hp:p>";
  return opening.concat(escapeXml(text), closing);
}
|
|
1669
|
+
/**
 * Render a grid of cell texts as a table element.
 *
 * Every cell is emitted with a 1x1 span and holds one paragraph produced by
 * generateParagraph.
 *
 * @param {Array<Array<string>>} rows - Table rows, each an array of cell texts.
 * @returns {string} The <hp:tbl> element markup.
 */
function generateTable(rows) {
  const renderCell = (markup, cell) =>
    markup + `<hp:tc><hp:cellSpan colSpan="1" rowSpan="1"/>${generateParagraph(cell)}</hp:tc>`;
  const renderRow = (markup, row) =>
    markup + `<hp:tr>${row.reduce(renderCell, "")}</hp:tr>`;
  const tableBody = rows.reduce(renderRow, "");
  return `<hp:tbl>${tableBody}</hp:tbl>`;
}
|
|
1678
|
+
/**
 * Render parsed blocks as the section XML document.
 *
 * Headings and paragraphs are both emitted as plain paragraphs (the heading
 * level is not used); tables go through generateTable; a table without rows
 * or any unknown block type renders as an empty string.
 *
 * @param {Array<object>} blocks - Blocks with `type` and `text` or `rows`.
 * @returns {string} The XML document text.
 */
function blocksToSectionXml(blocks) {
  const renderBlock = (block) => {
    if (block.type === "heading" || block.type === "paragraph") {
      return generateParagraph(block.text || "");
    }
    if (block.type === "table") {
      return block.rows ? generateTable(block.rows) : "";
    }
    return "";
  };
  const body = blocks.map((block) => renderBlock(block)).join("\n ");
  const documentLines = [
    '<?xml version="1.0" encoding="UTF-8"?>',
    `<hs:sec xmlns:hs="${HWPML_NS}" xmlns:hp="${HWPML_NS}">`,
    body,
    "</hs:sec>"
  ];
  return documentLines.join("\n");
}
|
|
1696
|
+
/**
 * Build the package manifest stored at Contents/content.hpf.
 *
 * It declares one item (section0.xml) and lists it in the spine.
 *
 * @returns {string} The manifest XML text.
 */
function generateManifest() {
  const manifestLines = [
    '<?xml version="1.0" encoding="UTF-8"?>',
    '<opf:package xmlns:opf="http://www.idpf.org/2007/opf">',
    "<opf:manifest>",
    '<opf:item id="s0" href="section0.xml" media-type="application/xml"/>',
    "</opf:manifest>",
    "<opf:spine>",
    '<opf:itemref idref="s0"/>',
    "</opf:spine>",
    "</opf:package>"
  ];
  return manifestLines.join("\n");
}
|
|
1707
|
+
|
|
1103
1708
|
// src/index.ts
|
|
1104
|
-
async function parse(buffer) {
|
|
1709
|
+
/**
 * Detect the document format of a buffer and dispatch to the matching parser.
 *
 * @param {ArrayBuffer} buffer - File contents.
 * @param {object} [options] - Forwarded unchanged to the format parser.
 * @returns {Promise<object>} The parser's result, or a failure object with
 *   code "EMPTY_INPUT" (missing or zero-length buffer) or
 *   "UNSUPPORTED_FORMAT" (format not recognised).
 */
async function parse(buffer, options) {
  const isEmpty = !buffer || buffer.byteLength === 0;
  if (isEmpty) {
    return { success: false, fileType: "unknown", error: "\uBE48 \uBC84\uD37C\uC774\uAC70\uB098 \uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uC785\uB825\uC785\uB2C8\uB2E4.", code: "EMPTY_INPUT" };
  }
  const format = detectFormat(buffer);
  if (format === "hwpx") return parseHwpx(buffer, options);
  if (format === "hwp") return parseHwp(buffer, options);
  if (format === "pdf") return parsePdf(buffer, options);
  return { success: false, fileType: "unknown", error: "\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD30C\uC77C \uD615\uC2DD\uC785\uB2C8\uB2E4.", code: "UNSUPPORTED_FORMAT" };
}
|
|
1120
|
-
async function parseHwpx(buffer) {
|
|
1725
|
+
/**
 * Parse an HWPX buffer, converting any thrown error into a failure result.
 *
 * @param {ArrayBuffer} buffer - File contents.
 * @param {object} [options] - Forwarded to parseHwpxDocument.
 * @returns {Promise<object>} On success { success: true, fileType: "hwpx",
 *   markdown, blocks, metadata }; on failure { success: false, fileType,
 *   error, code } where code comes from classifyError.
 */
async function parseHwpx(buffer, options) {
  let parsed;
  try {
    const { markdown, blocks, metadata } = await parseHwpxDocument(buffer, options);
    parsed = { success: true, fileType: "hwpx", markdown, blocks, metadata };
  } catch (err) {
    // Non-Error throwables fall back to a generic message.
    const message = err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328";
    return { success: false, fileType: "hwpx", error: message, code: classifyError(err) };
  }
  return parsed;
}
|
|
1128
|
-
async function parseHwp(buffer) {
|
|
1733
|
+
/**
 * Parse a legacy HWP buffer, converting any thrown error into a failure
 * result.
 *
 * @param {ArrayBuffer} buffer - File contents; wrapped with Buffer.from
 *   before being handed to parseHwp5Document.
 * @param {object} [options] - Forwarded to parseHwp5Document.
 * @returns {Promise<object>} On success { success: true, fileType: "hwp",
 *   markdown, blocks, metadata }; on failure { success: false, fileType,
 *   error, code } where code comes from classifyError.
 */
async function parseHwp(buffer, options) {
  let parsed;
  try {
    const nodeBuffer = Buffer.from(buffer);
    const { markdown, blocks, metadata } = parseHwp5Document(nodeBuffer, options);
    parsed = { success: true, fileType: "hwp", markdown, blocks, metadata };
  } catch (err) {
    // Non-Error throwables fall back to a generic message.
    const message = err instanceof Error ? err.message : "HWP \uD30C\uC2F1 \uC2E4\uD328";
    return { success: false, fileType: "hwp", error: message, code: classifyError(err) };
  }
  return parsed;
}
|
|
1136
|
-
async function parsePdf(buffer) {
|
|
1741
|
+
/**
 * Parse a PDF buffer, converting any thrown error into a failure result.
 *
 * @param {ArrayBuffer} buffer - File contents.
 * @param {object} [options] - Forwarded to parsePdfDocument.
 * @returns {Promise<object>} Whatever parsePdfDocument resolves to, or on
 *   failure { success: false, fileType: "pdf", error, code } where code
 *   comes from classifyError.
 */
async function parsePdf(buffer, options) {
  try {
    const result = await parsePdfDocument(buffer, options);
    return result;
  } catch (err) {
    // Non-Error throwables fall back to a generic message.
    const message = err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328";
    return { success: false, fileType: "pdf", error: message, code: classifyError(err) };
  }
}
|
|
1143
1748
|
// Annotate the CommonJS export names for ESM import in node:
|
|
1144
1749
|
0 && (module.exports = {
|
|
1145
1750
|
VERSION,
|
|
1751
|
+
blocksToMarkdown,
|
|
1752
|
+
compare,
|
|
1146
1753
|
detectFormat,
|
|
1754
|
+
diffBlocks,
|
|
1755
|
+
extractFormFields,
|
|
1147
1756
|
isHwpxFile,
|
|
1148
1757
|
isOldHwpFile,
|
|
1149
1758
|
isPdfFile,
|
|
1759
|
+
markdownToHwpx,
|
|
1150
1760
|
parse,
|
|
1151
1761
|
parseHwp,
|
|
1152
1762
|
parseHwpx,
|