kordoc 1.2.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +111 -118
- package/dist/{chunk-4BKNDXGU.js → chunk-BWZW234S.js} +595 -86
- package/dist/chunk-BWZW234S.js.map +1 -0
- package/dist/cli.js +15 -3
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +665 -59
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +163 -6
- package/dist/index.d.ts +163 -6
- package/dist/index.js +667 -58
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +216 -13
- package/dist/mcp.js.map +1 -1
- package/dist/provider-JB7SY74K.js +38 -0
- package/dist/provider-JB7SY74K.js.map +1 -0
- package/dist/watch-LIGKH3QS.js +90 -0
- package/dist/watch-LIGKH3QS.js.map +1 -0
- package/package.json +1 -1
- package/dist/chunk-4BKNDXGU.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -5,6 +5,9 @@ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
|
5
5
|
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
6
6
|
var __getProtoOf = Object.getPrototypeOf;
|
|
7
7
|
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
8
|
+
var __esm = (fn, res) => function __init() {
|
|
9
|
+
return fn && (res = (0, fn[__getOwnPropNames(fn)[0]])(fn = 0)), res;
|
|
10
|
+
};
|
|
8
11
|
var __export = (target, all) => {
|
|
9
12
|
for (var name in all)
|
|
10
13
|
__defProp(target, name, { get: all[name], enumerable: true });
|
|
@@ -27,14 +30,61 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
|
|
|
27
30
|
));
|
|
28
31
|
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
29
32
|
|
|
33
|
+
// src/ocr/provider.ts
|
|
34
|
+
var provider_exports = {};
|
|
35
|
+
__export(provider_exports, {
|
|
36
|
+
ocrPages: () => ocrPages
|
|
37
|
+
});
|
|
38
|
+
/**
 * Runs OCR over the selected pages of a PDF document.
 *
 * @param {object} doc - Document exposing `getPage(pageNumber)`.
 * @param {Function} provider - Called as `provider(pngBytes, pageNumber, "image/png")`; resolves to text.
 * @param {Set<number>|null} pageFilter - When set, only page numbers it contains are processed.
 * @param {number} effectivePageCount - Highest 1-based page number to visit.
 * @returns {Promise<Array<{type: string, text: string}>>} One paragraph block per page that yielded non-blank text.
 */
async function ocrPages(doc, provider, pageFilter, effectivePageCount) {
  const recognized = [];
  for (let pageNo = 1; pageNo <= effectivePageCount; pageNo += 1) {
    const selected = !pageFilter || pageFilter.has(pageNo);
    if (!selected) continue;
    const page = await doc.getPage(pageNo);
    try {
      const png = await renderPageToPng(page);
      const raw = await provider(png, pageNo, "image/png");
      const cleaned = raw.trim();
      if (cleaned) recognized.push({ type: "paragraph", text: cleaned });
    } catch {
      // Best effort: a page that fails to render or recognize is skipped.
    }
  }
  return recognized;
}
|
|
54
|
+
/**
 * Renders one PDF page to PNG bytes at 2x scale using the optional `canvas` package.
 *
 * @param {object} page - Page exposing `getViewport({ scale })` and `render(...)`.
 * @returns {Promise<Uint8Array>} PNG-encoded image data.
 * @throws {Error} When the `canvas` package cannot be loaded.
 */
async function renderPageToPng(page) {
  const createCanvas = await import("canvas")
    .then((canvasModule) => canvasModule.createCanvas)
    .catch(() => {
      throw new Error("OCR\uC744 \uC0AC\uC6A9\uD558\uB824\uBA74 'canvas' \uD328\uD0A4\uC9C0\uB97C \uC124\uCE58\uD558\uC138\uC694: npm install canvas");
    });
  const viewport = page.getViewport({ scale: 2 });
  const width = Math.floor(viewport.width);
  const height = Math.floor(viewport.height);
  const canvas = createCanvas(width, height);
  const canvasContext = canvas.getContext("2d");
  await page.render({ canvasContext, viewport }).promise;
  const png = canvas.toBuffer("image/png");
  return new Uint8Array(png);
}
|
|
69
|
+
var init_provider = __esm({
|
|
70
|
+
"src/ocr/provider.ts"() {
|
|
71
|
+
"use strict";
|
|
72
|
+
}
|
|
73
|
+
});
|
|
74
|
+
|
|
30
75
|
// src/index.ts
|
|
31
76
|
var index_exports = {};
|
|
32
77
|
__export(index_exports, {
|
|
33
78
|
VERSION: () => VERSION,
|
|
79
|
+
blocksToMarkdown: () => blocksToMarkdown,
|
|
80
|
+
compare: () => compare,
|
|
34
81
|
detectFormat: () => detectFormat,
|
|
82
|
+
diffBlocks: () => diffBlocks,
|
|
83
|
+
extractFormFields: () => extractFormFields,
|
|
35
84
|
isHwpxFile: () => isHwpxFile,
|
|
36
85
|
isOldHwpFile: () => isOldHwpFile,
|
|
37
86
|
isPdfFile: () => isPdfFile,
|
|
87
|
+
markdownToHwpx: () => markdownToHwpx,
|
|
38
88
|
parse: () => parse,
|
|
39
89
|
parseHwp: () => parseHwp,
|
|
40
90
|
parseHwpx: () => parseHwpx,
|
|
@@ -201,7 +251,7 @@ function tableToMarkdown(table) {
|
|
|
201
251
|
}
|
|
202
252
|
|
|
203
253
|
// src/utils.ts
|
|
204
|
-
var VERSION = true ? "1.
|
|
254
|
+
var VERSION = true ? "1.4.0" : "0.0.0-dev";
|
|
205
255
|
var KordocError = class extends Error {
|
|
206
256
|
constructor(message) {
|
|
207
257
|
super(message);
|
|
@@ -212,6 +262,47 @@ function isPathTraversal(name) {
|
|
|
212
262
|
const normalized = name.replace(/\\/g, "/");
|
|
213
263
|
return normalized.includes("..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
|
|
214
264
|
}
|
|
265
|
+
/**
 * Maps a thrown value to an error code by looking for known phrases in its message.
 * Rules are checked in order and the first match wins.
 *
 * @param {unknown} err - Thrown value.
 * @returns {string} Error code; "PARSE_ERROR" for non-Error values or when nothing matches.
 */
function classifyError(err) {
  if (!(err instanceof Error)) return "PARSE_ERROR";
  const { message } = err;
  const has = (needle) => message.includes(needle);
  const rules = [
    ["ENCRYPTED", () => has("\uC554\uD638\uD654")],
    ["DRM_PROTECTED", () => has("DRM")],
    ["ZIP_BOMB", () => has("ZIP bomb") || has("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || has("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")],
    ["DECOMPRESSION_BOMB", () => has("bomb") || has("\uD06C\uAE30 \uCD08\uACFC") || has("\uC555\uCD95 \uD574\uC81C")],
    ["IMAGE_BASED_PDF", () => has("\uC774\uBBF8\uC9C0 \uAE30\uBC18")],
    ["NO_SECTIONS", () => has("\uC139\uC158") && (has("\uCC3E\uC744 \uC218 \uC5C6") || has("\uC5C6\uC74C"))],
    ["CORRUPTED", () => has("\uC2DC\uADF8\uB2C8\uCC98") || has("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")]
  ];
  const hit = rules.find(([, applies]) => applies());
  return hit ? hit[0] : "PARSE_ERROR";
}
|
|
277
|
+
|
|
278
|
+
// src/page-range.ts
|
|
279
|
+
/**
 * Expands a page selection into a set of 1-based page numbers.
 *
 * @param {number[]|string} spec - Either an array of page numbers (each rounded
 *   to the nearest integer) or a comma-separated string of pages and inclusive
 *   ranges such as "1,3-5".
 * @param {number} maxPages - Highest valid page number; out-of-range pages are dropped.
 * @returns {Set<number>} Selected pages; empty for unsupported or blank input.
 */
function parsePageRange(spec, maxPages) {
  const result = /* @__PURE__ */ new Set();
  if (maxPages <= 0) return result;
  if (Array.isArray(spec)) {
    for (const n of spec) {
      const page = Math.round(n);
      if (page >= 1 && page <= maxPages) result.add(page);
    }
    return result;
  }
  if (typeof spec !== "string" || spec.trim() === "") return result;
  for (const part of spec.split(",")) {
    const trimmed = part.trim();
    if (!trimmed) continue;
    const rangeMatch = trimmed.match(/^(\d+)\s*-\s*(\d+)$/);
    if (rangeMatch) {
      // Clamp the range to [1, maxPages]; a reversed range selects nothing.
      const start = Math.max(1, parseInt(rangeMatch[1], 10));
      const end = Math.min(maxPages, parseInt(rangeMatch[2], 10));
      for (let i = start; i <= end; i++) result.add(i);
    } else if (/^\d+$/.test(trimmed)) {
      // Only pure digit tokens count: parseInt alone would read "3abc" or "2-" as a page.
      const page = parseInt(trimmed, 10);
      if (page >= 1 && page <= maxPages) result.add(page);
    }
  }
  return result;
}
|
|
215
306
|
|
|
216
307
|
// src/hwpx/parser.ts
|
|
217
308
|
var MAX_DECOMPRESS_SIZE = 100 * 1024 * 1024;
|
|
@@ -222,7 +313,7 @@ function clampSpan(val, max) {
|
|
|
222
313
|
function stripDtd(xml) {
|
|
223
314
|
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
224
315
|
}
|
|
225
|
-
async function parseHwpxDocument(buffer) {
|
|
316
|
+
async function parseHwpxDocument(buffer, options) {
|
|
226
317
|
const precheck = precheckZipSize(buffer);
|
|
227
318
|
if (precheck.totalUncompressed > MAX_DECOMPRESS_SIZE) {
|
|
228
319
|
throw new KordocError("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
@@ -240,19 +331,62 @@ async function parseHwpxDocument(buffer) {
|
|
|
240
331
|
if (actualEntryCount > MAX_ZIP_ENTRIES) {
|
|
241
332
|
throw new KordocError("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
242
333
|
}
|
|
334
|
+
const metadata = {};
|
|
335
|
+
await extractHwpxMetadata(zip, metadata);
|
|
243
336
|
const sectionPaths = await resolveSectionPaths(zip);
|
|
244
337
|
if (sectionPaths.length === 0) throw new KordocError("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
338
|
+
metadata.pageCount = sectionPaths.length;
|
|
339
|
+
const pageFilter = options?.pages ? parsePageRange(options.pages, sectionPaths.length) : null;
|
|
245
340
|
let totalDecompressed = 0;
|
|
246
341
|
const blocks = [];
|
|
247
|
-
for (
|
|
248
|
-
|
|
342
|
+
for (let si = 0; si < sectionPaths.length; si++) {
|
|
343
|
+
if (pageFilter && !pageFilter.has(si + 1)) continue;
|
|
344
|
+
const file = zip.file(sectionPaths[si]);
|
|
249
345
|
if (!file) continue;
|
|
250
346
|
const xml = await file.async("text");
|
|
251
347
|
totalDecompressed += xml.length * 2;
|
|
252
348
|
if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
253
349
|
blocks.push(...parseSectionXml(xml));
|
|
254
350
|
}
|
|
255
|
-
|
|
351
|
+
const markdown = blocksToMarkdown(blocks);
|
|
352
|
+
return { markdown, blocks, metadata };
|
|
353
|
+
}
|
|
354
|
+
/**
 * Looks for a metadata XML entry in the archive and merges what it finds into `metadata`.
 * Candidate paths are tried in order, exact match first and then case-insensitively;
 * the search stops once a title or author is known. Any failure is ignored.
 *
 * @param {object} zip - Archive exposing `file(path)` and a `files` map.
 * @param {object} metadata - Mutated in place.
 * @returns {Promise<void>}
 */
async function extractHwpxMetadata(zip, metadata) {
  const locate = (wanted) => {
    const exact = zip.file(wanted);
    if (exact) return exact;
    const lowered = wanted.toLowerCase();
    return Object.values(zip.files).find((entry) => entry.name.toLowerCase() === lowered) || null;
  };
  try {
    for (const candidate of ["meta.xml", "META-INF/meta.xml", "docProps/core.xml"]) {
      const entry = locate(candidate);
      if (entry) {
        parseDublinCoreMetadata(await entry.async("text"), metadata);
        if (metadata.title || metadata.author) return;
      }
    }
  } catch {
    // Metadata is optional; a malformed entry must not fail the parse.
  }
}
|
|
367
|
+
/**
 * Reads Dublin Core style elements from an XML string and fills in metadata
 * fields that are not already set. For each field the first listed tag with
 * non-blank text wins.
 *
 * @param {string} xml - Raw XML; any DOCTYPE is stripped before parsing.
 * @param {object} metadata - Mutated in place.
 * @returns {void}
 */
function parseDublinCoreMetadata(xml, metadata) {
  const doc = new import_xmldom.DOMParser().parseFromString(stripDtd(xml), "text/xml");
  if (!doc.documentElement) return;
  const firstText = (tagNames) => {
    for (const tagName of tagNames) {
      const matches = doc.getElementsByTagName(tagName);
      if (matches.length === 0) continue;
      const content = matches[0].textContent?.trim();
      if (content) return content;
    }
    return void 0;
  };
  const candidates = [
    ["title", ["dc:title", "title"]],
    ["author", ["dc:creator", "creator", "cp:lastModifiedBy"]],
    ["description", ["dc:description", "description", "dc:subject", "subject"]],
    ["createdAt", ["dcterms:created", "meta:creation-date"]],
    ["modifiedAt", ["dcterms:modified", "meta:date"]]
  ];
  for (const [field, tags] of candidates) {
    metadata[field] = metadata[field] || firstText(tags);
  }
  const keywordText = firstText(["dc:keyword", "cp:keywords", "meta:keyword"]);
  if (keywordText && !metadata.keywords) {
    metadata.keywords = keywordText.split(/[,;]/).map((k) => k.trim()).filter(Boolean);
  }
}
|
|
257
391
|
function precheckZipSize(buffer) {
|
|
258
392
|
try {
|
|
@@ -291,7 +425,7 @@ function extractFromBrokenZip(buffer) {
|
|
|
291
425
|
const data = new Uint8Array(buffer);
|
|
292
426
|
const view = new DataView(buffer);
|
|
293
427
|
let pos = 0;
|
|
294
|
-
const
|
|
428
|
+
const blocks = [];
|
|
295
429
|
let totalDecompressed = 0;
|
|
296
430
|
let entryCount = 0;
|
|
297
431
|
while (pos < data.length - 30) {
|
|
@@ -332,14 +466,14 @@ function extractFromBrokenZip(buffer) {
|
|
|
332
466
|
}
|
|
333
467
|
totalDecompressed += content.length * 2;
|
|
334
468
|
if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new KordocError("\uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC");
|
|
335
|
-
|
|
336
|
-
if (sectionText) texts.push(sectionText);
|
|
469
|
+
blocks.push(...parseSectionXml(content));
|
|
337
470
|
} catch {
|
|
338
471
|
continue;
|
|
339
472
|
}
|
|
340
473
|
}
|
|
341
|
-
if (
|
|
342
|
-
|
|
474
|
+
if (blocks.length === 0) throw new KordocError("\uC190\uC0C1\uB41C HWPX\uC5D0\uC11C \uC139\uC158 \uB370\uC774\uD130\uB97C \uBCF5\uAD6C\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
475
|
+
const markdown = blocksToMarkdown(blocks);
|
|
476
|
+
return { markdown, blocks };
|
|
343
477
|
}
|
|
344
478
|
async function resolveSectionPaths(zip) {
|
|
345
479
|
const manifestPaths = ["Contents/content.hpf", "content.hpf"];
|
|
@@ -612,7 +746,7 @@ var require2 = (0, import_module.createRequire)(import_meta.url);
|
|
|
612
746
|
var CFB = require2("cfb");
|
|
613
747
|
var MAX_SECTIONS = 100;
|
|
614
748
|
var MAX_TOTAL_DECOMPRESS = 100 * 1024 * 1024;
|
|
615
|
-
function parseHwp5Document(buffer) {
|
|
749
|
+
function parseHwp5Document(buffer, options) {
|
|
616
750
|
const cfb = CFB.parse(buffer);
|
|
617
751
|
const headerEntry = CFB.find(cfb, "/FileHeader");
|
|
618
752
|
if (!headerEntry?.content) throw new KordocError("FileHeader \uC2A4\uD2B8\uB9BC \uC5C6\uC74C");
|
|
@@ -621,18 +755,59 @@ function parseHwp5Document(buffer) {
|
|
|
621
755
|
if (header.flags & FLAG_ENCRYPTED) throw new KordocError("\uC554\uD638\uD654\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
|
|
622
756
|
if (header.flags & FLAG_DRM) throw new KordocError("DRM \uBCF4\uD638\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
|
|
623
757
|
const compressed = (header.flags & FLAG_COMPRESSED) !== 0;
|
|
758
|
+
const metadata = {
|
|
759
|
+
version: `${header.versionMajor}.x`
|
|
760
|
+
};
|
|
761
|
+
extractHwp5Metadata(cfb, metadata);
|
|
624
762
|
const sections = findSections(cfb);
|
|
625
763
|
if (sections.length === 0) throw new KordocError("\uC139\uC158 \uC2A4\uD2B8\uB9BC\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
764
|
+
metadata.pageCount = sections.length;
|
|
765
|
+
const pageFilter = options?.pages ? parsePageRange(options.pages, sections.length) : null;
|
|
626
766
|
const blocks = [];
|
|
627
767
|
let totalDecompressed = 0;
|
|
628
|
-
for (
|
|
768
|
+
for (let si = 0; si < sections.length; si++) {
|
|
769
|
+
if (pageFilter && !pageFilter.has(si + 1)) continue;
|
|
770
|
+
const sectionData = sections[si];
|
|
629
771
|
const data = compressed ? decompressStream(Buffer.from(sectionData)) : Buffer.from(sectionData);
|
|
630
772
|
totalDecompressed += data.length;
|
|
631
773
|
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
632
774
|
const records = readRecords(data);
|
|
633
775
|
blocks.push(...parseSection(records));
|
|
634
776
|
}
|
|
635
|
-
|
|
777
|
+
const markdown = blocksToMarkdown(blocks);
|
|
778
|
+
return { markdown, blocks, metadata };
|
|
779
|
+
}
|
|
780
|
+
/**
 * Reads title, author and description from the summary-information stream of a
 * compound file and stores them on `metadata`. Only string properties with type
 * code 30 and ids 2 (title), 4 (author) and 6 (description) are used. Every
 * offset is bounds-checked and any failure is ignored.
 *
 * @param {object} cfb - Parsed compound file as accepted by `CFB.find`.
 * @param {object} metadata - Mutated in place.
 * @returns {void}
 */
function extractHwp5Metadata(cfb, metadata) {
  const fieldById = /* @__PURE__ */ new Map([[2, "title"], [4, "author"], [6, "description"]]);
  try {
    const entry = CFB.find(cfb, "/HwpSummaryInformation") || CFB.find(cfb, "/SummaryInformation");
    if (!entry?.content) return;
    const bytes = Buffer.from(entry.content);
    if (bytes.length < 48) return;
    // Offset 24 holds the property-set count, offset 44 the first set's position.
    if (bytes.readUInt32LE(24) === 0) return;
    const setStart = bytes.readUInt32LE(44);
    if (setStart >= bytes.length - 8) return;
    const propCount = bytes.readUInt32LE(setStart + 4);
    if (propCount === 0 || propCount > 100) return;
    for (let index = 0; index < propCount; index++) {
      const slot = setStart + 8 + index * 8;
      if (slot + 8 > bytes.length) break;
      const field = fieldById.get(bytes.readUInt32LE(slot));
      const valueAt = setStart + bytes.readUInt32LE(slot + 4);
      if (valueAt + 8 > bytes.length) continue;
      if (field === void 0) continue;
      if (bytes.readUInt32LE(valueAt) !== 30) continue;
      const byteLength = bytes.readUInt32LE(valueAt + 4);
      const textStart = valueAt + 8;
      const textEnd = textStart + byteLength;
      if (byteLength === 0 || byteLength > 1e4 || textEnd > bytes.length) continue;
      const value = bytes.subarray(textStart, textEnd).toString("utf8").replace(/\0+$/, "").trim();
      if (value) metadata[field] = value;
    }
  } catch {
    // Metadata is optional; a malformed stream must not fail the parse.
  }
}
|
|
637
812
|
function findSections(cfb) {
|
|
638
813
|
const sections = [];
|
|
@@ -772,34 +947,30 @@ function arrangeCells(rows, cols, cells) {
|
|
|
772
947
|
return grid.map((row) => row.map((c) => c || { text: "", colSpan: 1, rowSpan: 1 }));
|
|
773
948
|
}
|
|
774
949
|
|
|
950
|
+
// src/pdf/polyfill.ts
|
|
951
|
+
var pdfjsWorker = __toESM(require("pdfjs-dist/legacy/build/pdf.worker.mjs"), 1);
|
|
952
|
+
var g = globalThis;
|
|
953
|
+
if (typeof g.DOMMatrix === "undefined") {
|
|
954
|
+
g.DOMMatrix = class DOMMatrix {
|
|
955
|
+
m = [1, 0, 0, 1, 0, 0];
|
|
956
|
+
constructor(init) {
|
|
957
|
+
if (init) this.m = init;
|
|
958
|
+
}
|
|
959
|
+
};
|
|
960
|
+
}
|
|
961
|
+
if (typeof g.Path2D === "undefined") {
|
|
962
|
+
g.Path2D = class Path2D {
|
|
963
|
+
};
|
|
964
|
+
}
|
|
965
|
+
g.pdfjsWorker = pdfjsWorker;
|
|
966
|
+
|
|
775
967
|
// src/pdf/parser.ts
|
|
776
|
-
var
|
|
777
|
-
|
|
778
|
-
var import_meta2 = {};
|
|
968
|
+
var import_pdf = require("pdfjs-dist/legacy/build/pdf.mjs");
|
|
969
|
+
import_pdf.GlobalWorkerOptions.workerSrc = "";
|
|
779
970
|
var MAX_PAGES = 5e3;
|
|
780
971
|
var MAX_TOTAL_TEXT = 100 * 1024 * 1024;
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
if (pdfjsModule) return pdfjsModule;
|
|
784
|
-
try {
|
|
785
|
-
const mod = await import("pdfjs-dist/legacy/build/pdf.mjs");
|
|
786
|
-
const req = (0, import_module2.createRequire)(import_meta2.url);
|
|
787
|
-
const workerPath = req.resolve("pdfjs-dist/legacy/build/pdf.worker.mjs");
|
|
788
|
-
mod.GlobalWorkerOptions.workerSrc = (0, import_url.pathToFileURL)(workerPath).href;
|
|
789
|
-
pdfjsModule = mod;
|
|
790
|
-
return mod;
|
|
791
|
-
} catch (err) {
|
|
792
|
-
const msg = err instanceof Error ? err.message : String(err);
|
|
793
|
-
if (msg.includes("Cannot find") || msg.includes("MODULE_NOT_FOUND")) return null;
|
|
794
|
-
throw new KordocError(`pdfjs-dist \uB85C\uB529 \uC2E4\uD328: ${msg}`);
|
|
795
|
-
}
|
|
796
|
-
}
|
|
797
|
-
async function parsePdfDocument(buffer) {
|
|
798
|
-
const pdfjs = await loadPdfjs();
|
|
799
|
-
if (!pdfjs) {
|
|
800
|
-
return { success: false, fileType: "pdf", pageCount: 0, error: "pdfjs-dist\uAC00 \uC124\uCE58\uB418\uC9C0 \uC54A\uC558\uC2B5\uB2C8\uB2E4. npm install pdfjs-dist" };
|
|
801
|
-
}
|
|
802
|
-
const doc = await pdfjs.getDocument({
|
|
972
|
+
async function parsePdfDocument(buffer, options) {
|
|
973
|
+
const doc = await (0, import_pdf.getDocument)({
|
|
803
974
|
data: new Uint8Array(buffer),
|
|
804
975
|
useSystemFonts: true,
|
|
805
976
|
disableFontFace: true,
|
|
@@ -807,12 +978,17 @@ async function parsePdfDocument(buffer) {
|
|
|
807
978
|
}).promise;
|
|
808
979
|
try {
|
|
809
980
|
const pageCount = doc.numPages;
|
|
810
|
-
if (pageCount === 0) return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4." };
|
|
981
|
+
if (pageCount === 0) return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.", blocks: [] };
|
|
982
|
+
const metadata = { pageCount };
|
|
983
|
+
await extractPdfMetadata(doc, metadata);
|
|
811
984
|
const pageTexts = [];
|
|
985
|
+
const blocks = [];
|
|
812
986
|
let totalChars = 0;
|
|
813
987
|
let totalTextBytes = 0;
|
|
814
988
|
const effectivePageCount = Math.min(pageCount, MAX_PAGES);
|
|
989
|
+
const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
|
|
815
990
|
for (let i = 1; i <= effectivePageCount; i++) {
|
|
991
|
+
if (pageFilter && !pageFilter.has(i)) continue;
|
|
816
992
|
const page = await doc.getPage(i);
|
|
817
993
|
const tc = await page.getTextContent();
|
|
818
994
|
const pageText = extractPageContent(tc.items);
|
|
@@ -820,18 +996,54 @@ async function parsePdfDocument(buffer) {
|
|
|
820
996
|
totalTextBytes += pageText.length * 2;
|
|
821
997
|
if (totalTextBytes > MAX_TOTAL_TEXT) throw new KordocError("\uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uD06C\uAE30 \uCD08\uACFC");
|
|
822
998
|
pageTexts.push(pageText);
|
|
999
|
+
blocks.push({ type: "paragraph", text: pageText });
|
|
823
1000
|
}
|
|
824
|
-
|
|
825
|
-
|
|
1001
|
+
const parsedPageCount = pageFilter ? pageFilter.size : effectivePageCount;
|
|
1002
|
+
if (totalChars / Math.max(parsedPageCount, 1) < 10) {
|
|
1003
|
+
if (options?.ocr) {
|
|
1004
|
+
try {
|
|
1005
|
+
const { ocrPages: ocrPages2 } = await Promise.resolve().then(() => (init_provider(), provider_exports));
|
|
1006
|
+
const ocrBlocks = await ocrPages2(doc, options.ocr, pageFilter, effectivePageCount);
|
|
1007
|
+
if (ocrBlocks.length > 0) {
|
|
1008
|
+
const ocrMarkdown = ocrBlocks.map((b) => b.text || "").filter(Boolean).join("\n\n");
|
|
1009
|
+
return { success: true, fileType: "pdf", markdown: ocrMarkdown, pageCount: parsedPageCount, blocks: ocrBlocks, metadata, isImageBased: true };
|
|
1010
|
+
}
|
|
1011
|
+
} catch {
|
|
1012
|
+
}
|
|
1013
|
+
}
|
|
1014
|
+
return { success: false, fileType: "pdf", pageCount, isImageBased: true, error: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`, code: "IMAGE_BASED_PDF" };
|
|
826
1015
|
}
|
|
827
1016
|
let markdown = pageTexts.filter((t) => t.trim()).join("\n\n");
|
|
828
1017
|
markdown = cleanPdfText(markdown);
|
|
829
|
-
return { success: true, fileType: "pdf", markdown, pageCount:
|
|
1018
|
+
return { success: true, fileType: "pdf", markdown, pageCount: parsedPageCount, blocks, metadata };
|
|
830
1019
|
} finally {
|
|
831
1020
|
await doc.destroy().catch(() => {
|
|
832
1021
|
});
|
|
833
1022
|
}
|
|
834
1023
|
}
|
|
1024
|
+
/**
 * Copies entries from the PDF info dictionary onto `metadata`.
 * Only non-blank string values are used; any failure is ignored.
 *
 * @param {object} doc - Document exposing `getMetadata()`.
 * @param {object} metadata - Mutated in place.
 * @returns {Promise<void>}
 */
async function extractPdfMetadata(doc, metadata) {
  const trimmedString = (value) => (typeof value === "string" ? value.trim() : "");
  try {
    const info = (await doc.getMetadata())?.info;
    if (!info) return;
    const directFields = [
      ["Title", "title"],
      ["Author", "author"],
      ["Creator", "creator"],
      ["Subject", "description"]
    ];
    for (const [source, target] of directFields) {
      const value = trimmedString(info[source]);
      if (value) metadata[target] = value;
    }
    if (trimmedString(info.Keywords)) {
      metadata.keywords = info.Keywords.split(/[,;]/).map((k) => k.trim()).filter(Boolean);
    }
    const dateFields = [
      ["CreationDate", "createdAt"],
      ["ModDate", "modifiedAt"]
    ];
    for (const [source, target] of dateFields) {
      if (typeof info[source] === "string") metadata[target] = parsePdfDate(info[source]);
    }
  } catch {
    // Metadata is optional; a failure here must not fail the parse.
  }
}
|
|
1041
|
+
/**
 * Converts a PDF date string ("D:YYYYMMDDHHmmSSOHH'mm'") to ISO 8601 form.
 * Missing trailing fields default to the start of the period (month and day
 * "01", time "00").
 *
 * The UTC offset ("Z", "+HH'mm'", "-HH'mm'") is kept when a time of day is
 * present, so the result names an unambiguous instant. Without an offset the
 * result is a zone-less local timestamp.
 *
 * @param {string} dateStr - Raw PDF date value.
 * @returns {string|undefined} ISO-like timestamp, or undefined when no date is found.
 */
function parsePdfDate(dateStr) {
  const m = dateStr.match(/D:(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?/);
  if (!m) return void 0;
  const [matched, year, month = "01", day = "01", hour = "00", min = "00", sec = "00"] = m;
  const local = `${year}-${month}-${day}T${hour}:${min}:${sec}`;
  // An offset only qualifies a time of day; ignore anything after a bare date.
  if (m[4] === void 0) return local;
  const rest = dateStr.slice(m.index + matched.length);
  const zone = rest.match(/^(?:(Z)|([+-])(\d{2})(?:'?(\d{2}))?)/);
  if (!zone) return local;
  if (zone[1]) return `${local}Z`;
  const [, , sign, zoneHour, zoneMin = "00"] = zone;
  // Drop an implausible offset rather than emit an invalid timestamp.
  if (Number(zoneHour) > 23 || Number(zoneMin) > 59) return local;
  return `${local}${sign}${zoneHour}:${zoneMin}`;
}
|
|
835
1047
|
function extractPageContent(rawItems) {
|
|
836
1048
|
const items = normalizeItems(rawItems);
|
|
837
1049
|
if (items.length === 0) return "";
|
|
@@ -1104,53 +1316,447 @@ function mergeKoreanLines(text) {
|
|
|
1104
1316
|
return result.join("\n");
|
|
1105
1317
|
}
|
|
1106
1318
|
|
|
1319
|
+
// src/diff/text-diff.ts
|
|
1320
|
+
/**
 * Similarity of two strings in [0, 1], from Levenshtein distance scaled by the
 * longer length.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} 1 for identical inputs, 0 when exactly one side is empty or missing.
 */
function similarity(a, b) {
  if (a === b) return 1;
  if (!a || !b) return 0;
  // Both strings are non-empty here, so the divisor is at least 1; the former
  // zero-length guard could never run and has been removed.
  return 1 - levenshtein(a, b) / Math.max(a.length, b.length);
}
|
|
1327
|
+
/**
 * Similarity of two strings after each has been passed through `normalize`.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} Result of `similarity` on the normalized strings.
 */
function normalizedSimilarity(a, b) {
  const left = normalize(a);
  const right = normalize(b);
  return similarity(left, right);
}
|
|
1330
|
+
/**
 * Collapses every run of whitespace to a single space and strips leading and
 * trailing whitespace.
 *
 * @param {string} s
 * @returns {string}
 */
function normalize(s) {
  const words = s.split(/\s+/).filter(Boolean);
  return words.join(" ");
}
|
|
1333
|
+
/**
 * Levenshtein edit distance between two strings, compared by UTF-16 code unit.
 * Keeps two rows sized by the shorter string.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} Minimum number of insertions, deletions and substitutions.
 */
function levenshtein(a, b) {
  const [shorter, longer] = a.length > b.length ? [b, a] : [a, b];
  const width = shorter.length;
  let above = [];
  for (let k = 0; k <= width; k++) above.push(k);
  let row = new Array(width + 1);
  for (let r = 0; r < longer.length; r++) {
    const ch = longer[r];
    row[0] = r + 1;
    for (let c = 0; c < width; c++) {
      row[c + 1] = shorter[c] === ch
        ? above[c]
        : 1 + Math.min(above[c], above[c + 1], row[c]);
    }
    // Reuse the old row as scratch space for the next pass.
    const spare = above;
    above = row;
    row = spare;
  }
  return above[width];
}
|
|
1353
|
+
|
|
1354
|
+
// src/diff/compare.ts
|
|
1355
|
+
var SIMILARITY_THRESHOLD = 0.4;
|
|
1356
|
+
/**
 * Parses two documents concurrently and returns their block-level diff.
 *
 * @param {*} bufferA - First document, passed to `parse`.
 * @param {*} bufferB - Second document, passed to `parse`.
 * @param {object} [options] - Forwarded unchanged to both `parse` calls.
 * @returns {Promise<object>} Result of `diffBlocks` on the two block lists.
 * @throws {Error} When either document fails to parse (document A is checked first).
 */
async function compare(bufferA, bufferB, options) {
  const pendingA = parse(bufferA, options);
  const pendingB = parse(bufferB, options);
  const [resultA, resultB] = await Promise.all([pendingA, pendingB]);
  if (!resultA.success) {
    throw new Error(`\uBB38\uC11CA \uD30C\uC2F1 \uC2E4\uD328: ${resultA.error}`);
  }
  if (!resultB.success) {
    throw new Error(`\uBB38\uC11CB \uD30C\uC2F1 \uC2E4\uD328: ${resultB.error}`);
  }
  return diffBlocks(resultA.blocks, resultB.blocks);
}
|
|
1365
|
+
/**
 * Aligns two block lists and classifies each aligned pair.
 *
 * A pair with similarity of at least 0.99 is "unchanged" (reported with
 * similarity 1); any other pair is "modified", with per-cell diffs attached
 * when both sides are tables. A one-sided entry is "removed" or "added".
 *
 * @param {object[]} blocksA - Blocks of the earlier document.
 * @param {object[]} blocksB - Blocks of the later document.
 * @returns {{stats: object, diffs: object[]}} Counters per change type and the ordered diffs.
 */
function diffBlocks(blocksA, blocksB) {
  const stats = { added: 0, removed: 0, modified: 0, unchanged: 0 };
  const diffs = [];
  const record = (entry) => {
    diffs.push(entry);
    stats[entry.type] += 1;
  };
  for (const [before, after] of alignBlocks(blocksA, blocksB)) {
    if (before && after) {
      const score = blockSimilarity(before, after);
      if (score >= 0.99) {
        record({ type: "unchanged", before, after, similarity: 1 });
        continue;
      }
      const entry = { type: "modified", before, after, similarity: score };
      const bothTables = before.type === "table" && after.type === "table" && before.table && after.table;
      if (bothTables) entry.cellDiffs = diffTableCells(before.table, after.table);
      record(entry);
    } else if (before) {
      record({ type: "removed", before });
    } else if (after) {
      record({ type: "added", after });
    }
  }
  return { stats, diffs };
}
|
|
1393
|
+
/**
 * Pairs up blocks of two lists with a longest-common-subsequence alignment in
 * which two blocks "match" when their similarity reaches SIMILARITY_THRESHOLD.
 * Falls back to positional pairing when the table would exceed 1e7 cells.
 *
 * @param {object[]} a - Blocks of the earlier document.
 * @param {object[]} b - Blocks of the later document.
 * @returns {Array<[object|null, object|null]>} Ordered pairs; null marks a block with no counterpart.
 */
function alignBlocks(a, b) {
  const countA = a.length;
  const countB = b.length;
  if (countA * countB > 1e7) return fallbackAlign(a, b);
  // Similarities are memoized because the backtrack revisits cells.
  const memo = /* @__PURE__ */ new Map();
  const matches = (ia, ib) => {
    const id = `${ia},${ib}`;
    if (!memo.has(id)) memo.set(id, blockSimilarity(a[ia], b[ib]));
    return memo.get(id) >= SIMILARITY_THRESHOLD;
  };
  const lcs = [];
  for (let r = 0; r <= countA; r++) lcs.push(new Array(countB + 1).fill(0));
  for (let r = 1; r <= countA; r++) {
    for (let c = 1; c <= countB; c++) {
      lcs[r][c] = matches(r - 1, c - 1)
        ? lcs[r - 1][c - 1] + 1
        : Math.max(lcs[r - 1][c], lcs[r][c - 1]);
    }
  }
  // Walk back from the bottom-right corner to recover the matched index pairs.
  const matched = [];
  let r = countA;
  let c = countB;
  while (r > 0 && c > 0) {
    if (matches(r - 1, c - 1) && lcs[r][c] === lcs[r - 1][c - 1] + 1) {
      matched.push([r - 1, c - 1]);
      r -= 1;
      c -= 1;
    } else if (lcs[r - 1][c] >= lcs[r][c - 1]) {
      r -= 1;
    } else {
      c -= 1;
    }
  }
  matched.reverse();
  // Emit unmatched blocks around each matched pair, A-side first.
  const aligned = [];
  let nextA = 0;
  let nextB = 0;
  for (const [ia, ib] of matched) {
    for (; nextA < ia; nextA++) aligned.push([a[nextA], null]);
    for (; nextB < ib; nextB++) aligned.push([null, b[nextB]]);
    aligned.push([a[nextA], b[nextB]]);
    nextA += 1;
    nextB += 1;
  }
  for (; nextA < countA; nextA++) aligned.push([a[nextA], null]);
  for (; nextB < countB; nextB++) aligned.push([null, b[nextB]]);
  return aligned;
}
|
|
1441
|
+
/**
 * Pairs blocks purely by position; the shorter list is padded with null.
 *
 * @param {object[]} a
 * @param {object[]} b
 * @returns {Array<[object|null, object|null]>}
 */
function fallbackAlign(a, b) {
  const longest = Math.max(a.length, b.length);
  return Array.from({ length: longest }, (_, k) => [a[k] || null, b[k] || null]);
}
|
|
1449
|
+
/**
 * Similarity of two blocks in [0, 1]. Blocks of different types score 0, as
 * does any type other than "paragraph" or "table", and a table block that
 * lacks table data on either side.
 *
 * @param {object} a
 * @param {object} b
 * @returns {number}
 */
function blockSimilarity(a, b) {
  if (a.type !== b.type) return 0;
  switch (a.type) {
    case "paragraph":
      return normalizedSimilarity(a.text || "", b.text || "");
    case "table":
      return a.table && b.table ? tableSimilarity(a.table, b.table) : 0;
    default:
      return 0;
  }
}
|
|
1459
|
+
/**
 * Similarity of two tables: 30% from how close their cell counts (rows x cols)
 * are, 70% from the similarity of their concatenated cell text.
 *
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} a
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} b
 * @returns {number}
 */
function tableSimilarity(a, b) {
  const joinedText = (table) => table.cells.flat().map((cell) => cell.text).join(" ");
  const areaA = a.rows * a.cols;
  const areaB = b.rows * b.cols;
  // The trailing 1 keeps the divisor non-zero for two empty tables.
  const dimSim = 1 - Math.abs(areaA - areaB) / Math.max(areaA, areaB, 1);
  const contentSim = normalizedSimilarity(joinedText(a), joinedText(b));
  return dimSim * 0.3 + contentSim * 0.7;
}
|
|
1466
|
+
/**
 * Compares two tables cell by cell over the union of their dimensions.
 *
 * A cell outside a table's declared size, or missing from a ragged `cells`
 * array, counts as absent on that side instead of raising a TypeError.
 *
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} a - Earlier table.
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} b - Later table.
 * @returns {Array<Array<{type: string, before: (string|undefined), after: (string|undefined)}>>}
 *   One row per table row; `type` is "added", "removed", "modified" or "unchanged".
 */
function diffTableCells(a, b) {
  // Optional chaining tolerates rows or cells missing from the backing array.
  const textAt = (table, r, c) => (r < table.rows && c < table.cols ? table.cells?.[r]?.[c]?.text : void 0);
  const maxRows = Math.max(a.rows, b.rows);
  const maxCols = Math.max(a.cols, b.cols);
  const result = [];
  for (let r = 0; r < maxRows; r++) {
    const row = [];
    for (let c = 0; c < maxCols; c++) {
      const before = textAt(a, r, c);
      const after = textAt(b, r, c);
      let type;
      // Equality comes first so a cell absent on both sides is not reported as "added".
      if (before === after) type = "unchanged";
      else if (before === void 0) type = "added";
      else if (after === void 0) type = "removed";
      else type = "modified";
      row.push({ type, before, after });
    }
    result.push(row);
  }
  return result;
}
|
|
1486
|
+
|
|
1487
|
+
// src/form/recognize.ts
|
|
1488
|
+
var LABEL_KEYWORDS = /* @__PURE__ */ new Set([
|
|
1489
|
+
"\uC131\uBA85",
|
|
1490
|
+
"\uC774\uB984",
|
|
1491
|
+
"\uC8FC\uC18C",
|
|
1492
|
+
"\uC804\uD654",
|
|
1493
|
+
"\uC804\uD654\uBC88\uD638",
|
|
1494
|
+
"\uD734\uB300\uD3F0",
|
|
1495
|
+
"\uD578\uB4DC\uD3F0",
|
|
1496
|
+
"\uC5F0\uB77D\uCC98",
|
|
1497
|
+
"\uC0DD\uB144\uC6D4\uC77C",
|
|
1498
|
+
"\uC8FC\uBBFC\uB4F1\uB85D\uBC88\uD638",
|
|
1499
|
+
"\uC18C\uC18D",
|
|
1500
|
+
"\uC9C1\uC704",
|
|
1501
|
+
"\uC9C1\uAE09",
|
|
1502
|
+
"\uBD80\uC11C",
|
|
1503
|
+
"\uC774\uBA54\uC77C",
|
|
1504
|
+
"\uD329\uC2A4",
|
|
1505
|
+
"\uD559\uAD50",
|
|
1506
|
+
"\uD559\uB144",
|
|
1507
|
+
"\uBC18",
|
|
1508
|
+
"\uBC88\uD638",
|
|
1509
|
+
"\uC2E0\uCCAD\uC778",
|
|
1510
|
+
"\uB300\uD45C\uC790",
|
|
1511
|
+
"\uB2F4\uB2F9\uC790",
|
|
1512
|
+
"\uC791\uC131\uC790",
|
|
1513
|
+
"\uD655\uC778\uC790",
|
|
1514
|
+
"\uC2B9\uC778\uC790",
|
|
1515
|
+
"\uC77C\uC2DC",
|
|
1516
|
+
"\uB0A0\uC9DC",
|
|
1517
|
+
"\uAE30\uAC04",
|
|
1518
|
+
"\uC7A5\uC18C",
|
|
1519
|
+
"\uBAA9\uC801",
|
|
1520
|
+
"\uC0AC\uC720",
|
|
1521
|
+
"\uBE44\uACE0",
|
|
1522
|
+
"\uAE08\uC561",
|
|
1523
|
+
"\uC218\uB7C9",
|
|
1524
|
+
"\uB2E8\uAC00",
|
|
1525
|
+
"\uD569\uACC4",
|
|
1526
|
+
"\uACC4",
|
|
1527
|
+
"\uC18C\uACC4"
|
|
1528
|
+
]);
|
|
1529
|
+
/**
 * Heuristically decides whether a table cell's text looks like a form label.
 *
 * A cell is a label when, after trimming, it is non-empty and at most 30
 * characters long and either contains one of LABEL_KEYWORDS, is a short
 * (2-8 character) digit-free run of Hangul/space/paren/dot/colon characters,
 * or is Hangul/Latin text ending with a colon.
 *
 * @param {string} text Raw cell text.
 * @returns {boolean} True when the text is considered a label.
 */
function isLabelCell(text) {
  const candidate = text.trim();
  // Empty cells and long sentences are never labels.
  if (candidate.length === 0 || candidate.length > 30) return false;

  const hasKeyword = [...LABEL_KEYWORDS].some((keyword) => candidate.includes(keyword));
  if (hasKeyword) return true;

  const isShortHangul = /^[가-힣\s()·:]{2,8}$/.test(candidate) && !/\d/.test(candidate);
  if (isShortHangul) return true;

  return /^[가-힣A-Za-z\s]+[::]$/.test(candidate);
}
|
|
1539
|
+
/**
 * Collects label/value pairs from a document's blocks.
 *
 * Table blocks are scanned first (via extractFromTable), then paragraph
 * blocks (via extractInlineFields), so table fields precede inline fields in
 * the result.
 *
 * @param {Iterable<object>} blocks Parsed document blocks.
 * @returns {{fields: object[], confidence: number}} The fields plus a
 *   confidence score: the fraction of tables that yielded fields when the
 *   document has tables, otherwise 0.3 if any inline field was found, else 0.
 */
function extractFormFields(blocks) {
  const fields = [];
  let tableCount = 0;
  let tablesWithFields = 0;

  for (const block of blocks) {
    const isTable = block.type === "table" && block.table;
    if (!isTable) continue;
    tableCount += 1;
    const found = extractFromTable(block.table);
    if (found.length === 0) continue;
    tablesWithFields += 1;
    fields.push(...found);
  }

  for (const block of blocks) {
    if (block.type !== "paragraph" || !block.text) continue;
    fields.push(...extractInlineFields(block.text));
  }

  let confidence;
  if (tableCount > 0) {
    confidence = tablesWithFields / tableCount;
  } else {
    confidence = fields.length > 0 ? 0.3 : 0;
  }
  return { fields, confidence: Math.min(confidence, 1) };
}
|
|
1561
|
+
/**
 * Extracts label/value fields from a single table.
 *
 * Pass 1 (tables with at least two columns): every horizontally adjacent
 * cell pair is checked; when the left cell looks like a label (isLabelCell)
 * and the right cell is non-blank, the pair becomes a field. A trailing
 * colon is stripped from the label.
 *
 * Pass 2 (only when pass 1 found nothing and the table is at least 2x2):
 * if every cell of the first row holds 1-20 characters of text, that row is
 * used as a header and each non-blank body cell becomes a field labelled by
 * its column header.
 *
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} table
 * @returns {Array<{label: string, value: string, row: number, col: number}>}
 */
function extractFromTable(table) {
  const { rows: rowCount, cols: colCount } = table;
  const fields = [];

  if (colCount >= 2) {
    for (let r = 0; r < rowCount; r++) {
      for (let c = 0; c + 1 < colCount; c++) {
        const left = table.cells[r][c];
        const right = table.cells[r][c + 1];
        if (!isLabelCell(left.text)) continue;
        const value = right.text.trim();
        if (!value) continue;
        fields.push({
          label: left.text.trim().replace(/[::]\s*$/, ""),
          value,
          row: r,
          col: c
        });
      }
    }
  }

  const needsHeaderFallback = fields.length === 0 && rowCount >= 2 && colCount >= 2;
  if (!needsHeaderFallback) return fields;

  const header = table.cells[0];
  const headerLooksLikeLabels = header.every((cell) => {
    const size = cell.text.trim().length;
    return size > 0 && size <= 20;
  });
  if (!headerLooksLikeLabels) return fields;

  for (let r = 1; r < rowCount; r++) {
    for (let c = 0; c < colCount; c++) {
      const label = header[c].text.trim();
      const value = table.cells[r][c].text.trim();
      if (label && value) {
        fields.push({ label, value, row: r, col: c });
      }
    }
  }
  return fields;
}
|
|
1599
|
+
/**
 * Finds inline "label: value" pairs inside a paragraph of text.
 *
 * A label is 2-10 Hangul/Latin letters followed by a colon; the value runs
 * up to 100 characters and stops at a newline, comma, or semicolon.
 * Inline fields carry row/col of -1 because they do not come from a table.
 *
 * @param {string} text Paragraph text.
 * @returns {Array<{label: string, value: string, row: number, col: number}>}
 */
function extractInlineFields(text) {
  const labelValue = /([가-힣A-Za-z]{2,10})\s*[::]\s*([^\n,;]{1,100})/g;
  const found = [];
  for (let hit = labelValue.exec(text); hit !== null; hit = labelValue.exec(text)) {
    const value = hit[2].trim();
    // A value made only of whitespace is not a field.
    if (!value) continue;
    found.push({ label: hit[1].trim(), value, row: -1, col: -1 });
  }
  return found;
}
|
|
1612
|
+
|
|
1613
|
+
// src/hwpx/generator.ts
|
|
1614
|
+
var import_jszip2 = __toESM(require("jszip"), 1);
|
|
1615
|
+
// XML namespace URI that blocksToSectionXml binds to both the `hs` and `hp` prefixes.
var HWPML_NS = "http://www.hancom.co.kr/hwpml/2016/HwpMl";
|
|
1616
|
+
/**
 * Converts a Markdown string into an HWPX archive.
 *
 * The Markdown is parsed into blocks, rendered as one section XML document,
 * and zipped together with a mimetype entry and a package manifest.
 *
 * @param {string} markdown Markdown source.
 * @returns {Promise<ArrayBuffer>} The zipped HWPX document.
 */
async function markdownToHwpx(markdown) {
  const sectionXml = blocksToSectionXml(parseMarkdownToBlocks(markdown));

  const archive = new import_jszip2.default();
  // The mimetype entry is added first and stored without compression.
  archive.file("mimetype", "application/hwp+zip", { compression: "STORE" });
  archive.file("Contents/content.hpf", generateManifest());
  archive.file("Contents/section0.xml", sectionXml);

  const bytes = await archive.generateAsync({ type: "arraybuffer" });
  return bytes;
}
|
|
1625
|
+
/**
 * Splits Markdown into a flat list of heading, table and paragraph blocks.
 *
 * - Blank lines are skipped.
 * - `#`..`######` lines become `{ type: "heading", text, level }`.
 * - Consecutive lines starting with `|` (after optional indentation) become
 *   one `{ type: "table", rows }` block, `rows` being arrays of trimmed cell
 *   strings. Separator rows such as `| --- | :---: |` are dropped.
 * - Every other line becomes `{ type: "paragraph", text }`.
 *
 * @param {string} md Markdown source (LF or CRLF line endings).
 * @returns {Array<object>} Blocks in document order.
 */
function parseMarkdownToBlocks(md) {
  // Split on CRLF as well as LF: a trailing "\r" would otherwise stop the
  // heading pattern from matching, because "." never matches "\r".
  const lines = md.split(/\r?\n/);
  const blocks = [];
  const isTableLine = (candidate) => candidate.trimStart().startsWith("|");
  let i = 0;
  while (i < lines.length) {
    const line = lines[i];
    if (!line.trim()) {
      i++;
      continue;
    }
    const headingMatch = line.match(/^(#{1,6})\s+(.+)$/);
    if (headingMatch) {
      blocks.push({ type: "heading", text: headingMatch[2].trim(), level: headingMatch[1].length });
      i++;
      continue;
    }
    if (isTableLine(line)) {
      const tableRows = [];
      while (i < lines.length && isTableLine(lines[i])) {
        // Work on the trimmed row so indented tables behave like flush-left ones.
        const row = lines[i].trim();
        i++;
        // A separator row holds only pipes, colons, dashes and spaces and has a
        // run of three dashes; rows like "| - | - |" remain ordinary data.
        const isSeparator = row.includes("---") && /^[\s|:\-]+$/.test(row);
        if (isSeparator) continue;
        // Drop the leading pipe, and the trailing one only when present, so a
        // row written without a closing pipe keeps its last cell.
        let inner = row.slice(1);
        if (inner === "") continue; // a bare "|" carries no cells
        if (inner.endsWith("|")) inner = inner.slice(0, -1);
        tableRows.push(inner.split("|").map((cell) => cell.trim()));
      }
      if (tableRows.length > 0) {
        blocks.push({ type: "table", rows: tableRows });
      }
      continue;
    }
    blocks.push({ type: "paragraph", text: line.trim() });
    i++;
  }
  return blocks;
}
|
|
1663
|
+
/**
 * Escapes the characters that are special in XML text and attribute values.
 *
 * `&` is replaced first so the ampersands introduced by the later
 * replacements are not escaped a second time.
 *
 * @param {string} text Raw text.
 * @returns {string} Text safe to embed in XML content or a double-quoted attribute.
 */
function escapeXml(text) {
  return text
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;");
}
|
|
1666
|
+
/**
 * Wraps text in a single-run `hp:p` paragraph element.
 *
 * @param {string} text Paragraph text; it is XML-escaped before embedding.
 * @returns {string} The paragraph XML fragment.
 */
function generateParagraph(text) {
  const safeText = escapeXml(text);
  return `<hp:p><hp:run><hp:t>${safeText}</hp:t></hp:run></hp:p>`;
}
|
|
1669
|
+
/**
 * Renders rows of cell strings as an `hp:tbl` element.
 *
 * Every cell becomes an `hp:tc` with a 1x1 span that contains one paragraph.
 *
 * @param {string[][]} rows Table rows, each an array of cell texts.
 * @returns {string} The table XML fragment.
 */
function generateTable(rows) {
  const cellXml = (cell) => `<hp:tc><hp:cellSpan colSpan="1" rowSpan="1"/>${generateParagraph(cell)}</hp:tc>`;
  const rowXml = (row) => `<hp:tr>${row.map((cell) => cellXml(cell)).join("")}</hp:tr>`;
  const allRows = rows.map((row) => rowXml(row)).join("");
  return `<hp:tbl>${allRows}</hp:tbl>`;
}
|
|
1678
|
+
/**
 * Renders parsed blocks as a complete `hs:sec` section XML document.
 *
 * Headings and paragraphs are both emitted as plain paragraphs (the heading
 * level is not used); tables are emitted via generateTable; unknown block
 * types produce an empty string.
 *
 * NOTE(review): the `hs` and `hp` prefixes are bound to the same namespace
 * URI here; confirm that is what consumers of the file expect.
 *
 * @param {Array<object>} blocks Blocks produced by parseMarkdownToBlocks.
 * @returns {string} The section XML.
 */
function blocksToSectionXml(blocks) {
  const renderBlock = (block) => {
    if (block.type === "heading" || block.type === "paragraph") {
      return generateParagraph(block.text || "");
    }
    if (block.type === "table") {
      return block.rows ? generateTable(block.rows) : "";
    }
    return "";
  };
  const body = blocks.map((block) => renderBlock(block)).join("\n ");
  return `<?xml version="1.0" encoding="UTF-8"?>
<hs:sec xmlns:hs="${HWPML_NS}" xmlns:hp="${HWPML_NS}">
${body}
</hs:sec>`;
}
|
|
1696
|
+
/**
 * Builds the package manifest written to `Contents/content.hpf`.
 *
 * The manifest declares a single item, `section0.xml`, and lists it as the
 * only entry of the spine.
 *
 * @returns {string} The manifest XML.
 */
function generateManifest() {
  const manifestXml = `<?xml version="1.0" encoding="UTF-8"?>
<opf:package xmlns:opf="http://www.idpf.org/2007/opf">
<opf:manifest>
<opf:item id="s0" href="section0.xml" media-type="application/xml"/>
</opf:manifest>
<opf:spine>
<opf:itemref idref="s0"/>
</opf:spine>
</opf:package>`;
  return manifestXml;
}
|
|
1707
|
+
|
|
1107
1708
|
// src/index.ts
|
|
1108
|
-
async function parse(buffer) {
|
|
1709
|
+
/**
 * Detects the document format of a buffer and parses it with the matching parser.
 *
 * @param {ArrayBuffer} buffer Raw file contents.
 * @param {object} [options] Forwarded unchanged to the format-specific parser.
 * @returns {Promise<object>} The parser's result, or a failure object with
 *   code "EMPTY_INPUT" (missing or zero-length buffer) or
 *   "UNSUPPORTED_FORMAT" (format not recognised).
 */
async function parse(buffer, options) {
  const isEmpty = !buffer || buffer.byteLength === 0;
  if (isEmpty) {
    return { success: false, fileType: "unknown", error: "\uBE48 \uBC84\uD37C\uC774\uAC70\uB098 \uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uC785\uB825\uC785\uB2C8\uB2E4.", code: "EMPTY_INPUT" };
  }

  const format = detectFormat(buffer);
  if (format === "hwpx") return parseHwpx(buffer, options);
  if (format === "hwp") return parseHwp(buffer, options);
  if (format === "pdf") return parsePdf(buffer, options);

  return { success: false, fileType: "unknown", error: "\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD30C\uC77C \uD615\uC2DD\uC785\uB2C8\uB2E4.", code: "UNSUPPORTED_FORMAT" };
}
|
|
1124
|
-
async function parseHwpx(buffer) {
|
|
1725
|
+
/**
 * Parses an HWPX document and reports the outcome as a result object.
 *
 * Any error raised while parsing is caught and converted into a failure
 * object instead of being rethrown.
 *
 * @param {ArrayBuffer} buffer Raw HWPX file contents.
 * @param {object} [options] Forwarded to parseHwpxDocument.
 * @returns {Promise<object>} On success `{ success: true, fileType: "hwpx",
 *   markdown, blocks, metadata }`; on failure `{ success: false, fileType:
 *   "hwpx", error, code }` with `code` taken from classifyError.
 */
async function parseHwpx(buffer, options) {
  try {
    const parsed = await parseHwpxDocument(buffer, options);
    const { markdown, blocks, metadata } = parsed;
    return { success: true, fileType: "hwpx", markdown, blocks, metadata };
  } catch (err) {
    // Non-Error throwables have no usable message, so a fixed one is reported.
    const message = err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328";
    return { success: false, fileType: "hwpx", error: message, code: classifyError(err) };
  }
}
|
|
1132
|
-
async function parseHwp(buffer) {
|
|
1733
|
+
/**
 * Parses an HWP document and reports the outcome as a result object.
 *
 * The input is wrapped with `Buffer.from` before being handed to
 * parseHwp5Document. Any error raised while parsing is caught and converted
 * into a failure object instead of being rethrown.
 *
 * @param {ArrayBuffer} buffer Raw HWP file contents.
 * @param {object} [options] Forwarded to parseHwp5Document.
 * @returns {Promise<object>} On success `{ success: true, fileType: "hwp",
 *   markdown, blocks, metadata }`; on failure `{ success: false, fileType:
 *   "hwp", error, code }` with `code` taken from classifyError.
 */
async function parseHwp(buffer, options) {
  try {
    const parsed = parseHwp5Document(Buffer.from(buffer), options);
    const { markdown, blocks, metadata } = parsed;
    return { success: true, fileType: "hwp", markdown, blocks, metadata };
  } catch (err) {
    // Non-Error throwables have no usable message, so a fixed one is reported.
    const message = err instanceof Error ? err.message : "HWP \uD30C\uC2F1 \uC2E4\uD328";
    return { success: false, fileType: "hwp", error: message, code: classifyError(err) };
  }
}
|
|
1140
|
-
async function parsePdf(buffer) {
|
|
1741
|
+
/**
 * Parses a PDF document and reports the outcome as a result object.
 *
 * The result of parsePdfDocument is returned as-is; a rejection or thrown
 * error is caught and converted into a failure object.
 *
 * @param {ArrayBuffer} buffer Raw PDF file contents.
 * @param {object} [options] Forwarded to parsePdfDocument.
 * @returns {Promise<object>} The parser's result, or `{ success: false,
 *   fileType: "pdf", error, code }` with `code` taken from classifyError.
 */
async function parsePdf(buffer, options) {
  try {
    // Awaited inside the try so that a rejection lands in the catch below.
    const result = await parsePdfDocument(buffer, options);
    return result;
  } catch (err) {
    const message = err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328";
    return { success: false, fileType: "pdf", error: message, code: classifyError(err) };
  }
}
|
|
1147
1748
|
// Annotate the CommonJS export names for ESM import in node:
|
|
1148
1749
|
0 && (module.exports = {
|
|
1149
1750
|
VERSION,
|
|
1751
|
+
blocksToMarkdown,
|
|
1752
|
+
compare,
|
|
1150
1753
|
detectFormat,
|
|
1754
|
+
diffBlocks,
|
|
1755
|
+
extractFormFields,
|
|
1151
1756
|
isHwpxFile,
|
|
1152
1757
|
isOldHwpFile,
|
|
1153
1758
|
isPdfFile,
|
|
1759
|
+
markdownToHwpx,
|
|
1154
1760
|
parse,
|
|
1155
1761
|
parseHwp,
|
|
1156
1762
|
parseHwpx,
|