kordoc 1.3.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -5,6 +5,9 @@ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
5
5
  var __getOwnPropNames = Object.getOwnPropertyNames;
6
6
  var __getProtoOf = Object.getPrototypeOf;
7
7
  var __hasOwnProp = Object.prototype.hasOwnProperty;
8
// Bundler-generated lazy initializer. `fn` is an object whose first own
// property is the module body; the returned __init runs that body on the first
// call (then sets fn to 0 so it never runs again) and returns the cached `res`.
+ var __esm = (fn, res) => function __init() {
9
+ return fn && (res = (0, fn[__getOwnPropNames(fn)[0]])(fn = 0)), res;
10
+ };
8
11
  var __export = (target, all) => {
9
12
  for (var name in all)
10
13
  __defProp(target, name, { get: all[name], enumerable: true });
@@ -27,14 +30,61 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
27
30
  ));
28
31
  var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
29
32
 
33
+ // src/ocr/provider.ts
34
// Export namespace object for src/ocr/provider.ts; __export installs an
// enumerable getter so `provider_exports.ocrPages` resolves to ocrPages.
+ var provider_exports = {};
35
+ __export(provider_exports, {
36
+ ocrPages: () => ocrPages
37
+ });
38
/**
 * Runs a caller-supplied OCR provider over the pages of a document and
 * collects the recognized text as paragraph blocks.
 *
 * Processing is best-effort per page: any failure while loading, rendering or
 * recognizing a page skips that page instead of aborting the whole run.
 *
 * @param {{ getPage(n: number): Promise<object> }} doc - Document exposing getPage(1-based index).
 * @param {(image: Uint8Array, pageNumber: number, mimeType: string) => Promise<string>} provider
 *   OCR callback; receives the PNG bytes, the 1-based page number and "image/png".
 * @param {Set<number>|null|undefined} pageFilter - When set, only listed page numbers are processed.
 * @param {number} effectivePageCount - Highest page number to visit.
 * @returns {Promise<Array<{type: "paragraph", text: string}>>} One block per page with non-blank text.
 */
async function ocrPages(doc, provider, pageFilter, effectivePageCount) {
  const blocks = [];
  for (let pageNumber = 1; pageNumber <= effectivePageCount; pageNumber++) {
    if (pageFilter && !pageFilter.has(pageNumber)) continue;
    try {
      // getPage lives inside the try so a single unreadable page is skipped
      // like any other per-page failure.
      const page = await doc.getPage(pageNumber);
      const imageData = await renderPageToPng(page);
      const text = await provider(imageData, pageNumber, "image/png");
      const trimmed = typeof text === "string" ? text.trim() : "";
      if (trimmed) {
        blocks.push({ type: "paragraph", text: trimmed });
      }
    } catch {
      // Deliberate best-effort: a failing page contributes no block.
    }
  }
  return blocks;
}
54
/**
 * Renders one page to PNG bytes using the optional `canvas` package.
 *
 * @param {object} page - Page object exposing getViewport({ scale }) and render({ canvasContext, viewport }).
 * @returns {Promise<Uint8Array>} PNG-encoded image of the page rendered at scale 2.
 * @throws {Error} When the `canvas` package cannot be loaded; the underlying
 *   load failure is attached as `cause`.
 */
async function renderPageToPng(page) {
  let createCanvas;
  try {
    const canvasModule = await import("canvas");
    // A CommonJS package loaded through import() may expose its API only on
    // the `default` export, so fall back to it.
    createCanvas = canvasModule.createCanvas ?? canvasModule.default?.createCanvas;
  } catch (cause) {
    throw new Error(
      "OCR\uC744 \uC0AC\uC6A9\uD558\uB824\uBA74 'canvas' \uD328\uD0A4\uC9C0\uB97C \uC124\uCE58\uD558\uC138\uC694: npm install canvas",
      { cause }
    );
  }
  const scale = 2;
  const viewport = page.getViewport({ scale });
  const canvas = createCanvas(Math.floor(viewport.width), Math.floor(viewport.height));
  const ctx = canvas.getContext("2d");
  await page.render({ canvasContext: ctx, viewport }).promise;
  return new Uint8Array(canvas.toBuffer("image/png"));
}
69
// Lazy initializer for src/ocr/provider.ts built with __esm; the wrapped
// module body contains only the "use strict" directive.
+ var init_provider = __esm({
70
+ "src/ocr/provider.ts"() {
71
+ "use strict";
72
+ }
73
+ });
74
+
30
75
  // src/index.ts
31
76
  var index_exports = {};
32
77
  __export(index_exports, {
33
78
  VERSION: () => VERSION,
79
+ blocksToMarkdown: () => blocksToMarkdown,
80
+ compare: () => compare,
34
81
  detectFormat: () => detectFormat,
82
+ diffBlocks: () => diffBlocks,
83
+ extractFormFields: () => extractFormFields,
35
84
  isHwpxFile: () => isHwpxFile,
36
85
  isOldHwpFile: () => isOldHwpFile,
37
86
  isPdfFile: () => isPdfFile,
87
+ markdownToHwpx: () => markdownToHwpx,
38
88
  parse: () => parse,
39
89
  parseHwp: () => parseHwp,
40
90
  parseHwpx: () => parseHwpx,
@@ -201,7 +251,7 @@ function tableToMarkdown(table) {
201
251
  }
202
252
 
203
253
  // src/utils.ts
204
- var VERSION = true ? "1.3.0" : "0.0.0-dev";
254
+ var VERSION = true ? "1.4.0" : "0.0.0-dev";
205
255
  var KordocError = class extends Error {
206
256
  constructor(message) {
207
257
  super(message);
@@ -212,6 +262,47 @@ function isPathTraversal(name) {
212
262
  const normalized = name.replace(/\\/g, "/");
213
263
  return normalized.includes("..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
214
264
  }
265
/**
 * Maps an error to a coarse error code by looking for known substrings in its
 * message. Rules are evaluated in order and the first match wins.
 *
 * @param {unknown} err - Thrown value to classify.
 * @returns {string} One of "ENCRYPTED", "DRM_PROTECTED", "ZIP_BOMB",
 *   "DECOMPRESSION_BOMB", "IMAGE_BASED_PDF", "NO_SECTIONS", "CORRUPTED",
 *   or "PARSE_ERROR" (also returned for non-Error values).
 */
function classifyError(err) {
  const fallbackCode = "PARSE_ERROR";
  if (!(err instanceof Error)) return fallbackCode;

  const message = err.message;
  const hasAny = (...needles) => needles.some((needle) => message.includes(needle));

  // Order matters: the ZIP-specific rule must run before the generic bomb rule.
  const rules = [
    ["ENCRYPTED", () => hasAny("\uC554\uD638\uD654")],
    ["DRM_PROTECTED", () => hasAny("DRM")],
    ["ZIP_BOMB", () => hasAny("ZIP bomb", "ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC", "ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")],
    ["DECOMPRESSION_BOMB", () => hasAny("bomb", "\uD06C\uAE30 \uCD08\uACFC", "\uC555\uCD95 \uD574\uC81C")],
    ["IMAGE_BASED_PDF", () => hasAny("\uC774\uBBF8\uC9C0 \uAE30\uBC18")],
    ["NO_SECTIONS", () => hasAny("\uC139\uC158") && hasAny("\uCC3E\uC744 \uC218 \uC5C6", "\uC5C6\uC74C")],
    ["CORRUPTED", () => hasAny("\uC2DC\uADF8\uB2C8\uCC98", "\uBCF5\uAD6C\uD560 \uC218 \uC5C6")],
  ];

  const matched = rules.find(([, applies]) => applies());
  return matched ? matched[0] : fallbackCode;
}
277
+
278
+ // src/page-range.ts
279
/**
 * Expands a page specification into the set of 1-based page numbers it selects.
 *
 * Accepted forms:
 *  - an array of numbers (each rounded to the nearest integer), or
 *  - a comma-separated string of single pages and inclusive ranges, e.g. "1-3, 7".
 *
 * Pages outside 1..maxPages are dropped; ranges are clamped to that interval.
 * Malformed tokens (for example "3abc", "1-" or "2.5") are ignored rather than
 * being partially parsed.
 *
 * @param {number[]|string|unknown} spec - Page specification.
 * @param {number} maxPages - Number of pages available.
 * @returns {Set<number>} Selected page numbers (empty when nothing valid is selected).
 */
function parsePageRange(spec, maxPages) {
  const result = /* @__PURE__ */ new Set();
  if (maxPages <= 0) return result;

  if (Array.isArray(spec)) {
    for (const n of spec) {
      const page = Math.round(n);
      if (page >= 1 && page <= maxPages) result.add(page);
    }
    return result;
  }

  if (typeof spec !== "string" || spec.trim() === "") return result;

  for (const part of spec.split(",")) {
    const token = part.trim();
    if (!token) continue;

    const rangeMatch = token.match(/^(\d+)\s*-\s*(\d+)$/);
    if (rangeMatch) {
      const start = Math.max(1, Number.parseInt(rangeMatch[1], 10));
      const end = Math.min(maxPages, Number.parseInt(rangeMatch[2], 10));
      for (let page = start; page <= end; page++) result.add(page);
      continue;
    }

    // Require the whole token to be digits; parseInt alone would accept
    // leading-digit garbage such as "3abc".
    if (/^\d+$/.test(token)) {
      const page = Number.parseInt(token, 10);
      if (page >= 1 && page <= maxPages) result.add(page);
    }
  }
  return result;
}
215
306
 
216
307
  // src/hwpx/parser.ts
217
308
  var MAX_DECOMPRESS_SIZE = 100 * 1024 * 1024;
@@ -222,7 +313,7 @@ function clampSpan(val, max) {
222
313
  function stripDtd(xml) {
223
314
  return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
224
315
  }
225
- async function parseHwpxDocument(buffer) {
316
+ async function parseHwpxDocument(buffer, options) {
226
317
  const precheck = precheckZipSize(buffer);
227
318
  if (precheck.totalUncompressed > MAX_DECOMPRESS_SIZE) {
228
319
  throw new KordocError("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
@@ -240,19 +331,62 @@ async function parseHwpxDocument(buffer) {
240
331
  if (actualEntryCount > MAX_ZIP_ENTRIES) {
241
332
  throw new KordocError("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
242
333
  }
334
+ const metadata = {};
335
+ await extractHwpxMetadata(zip, metadata);
243
336
  const sectionPaths = await resolveSectionPaths(zip);
244
337
  if (sectionPaths.length === 0) throw new KordocError("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
338
+ metadata.pageCount = sectionPaths.length;
339
+ const pageFilter = options?.pages ? parsePageRange(options.pages, sectionPaths.length) : null;
245
340
  let totalDecompressed = 0;
246
341
  const blocks = [];
247
- for (const path of sectionPaths) {
248
- const file = zip.file(path);
342
+ for (let si = 0; si < sectionPaths.length; si++) {
343
+ if (pageFilter && !pageFilter.has(si + 1)) continue;
344
+ const file = zip.file(sectionPaths[si]);
249
345
  if (!file) continue;
250
346
  const xml = await file.async("text");
251
347
  totalDecompressed += xml.length * 2;
252
348
  if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
253
349
  blocks.push(...parseSectionXml(xml));
254
350
  }
255
- return blocksToMarkdown(blocks);
351
+ const markdown = blocksToMarkdown(blocks);
352
+ return { markdown, blocks, metadata };
353
+ }
354
/**
 * Best-effort metadata extraction from an HWPX archive.
 *
 * Tries a fixed list of candidate metadata files (exact path first, then a
 * case-insensitive name match) and feeds each one to parseDublinCoreMetadata.
 * Stops as soon as a title or author has been found. Never throws: a candidate
 * that cannot be read or parsed is skipped and the next one is tried.
 *
 * @param {object} zip - Archive object exposing file(path), files and entry.async("text").
 * @param {object} metadata - Metadata object that is filled in place.
 * @returns {Promise<void>}
 */
async function extractHwpxMetadata(zip, metadata) {
  const metaPaths = ["meta.xml", "META-INF/meta.xml", "docProps/core.xml"];
  for (const mp of metaPaths) {
    try {
      const wanted = mp.toLowerCase();
      const file = zip.file(mp) || Object.values(zip.files).find((f) => f.name.toLowerCase() === wanted) || null;
      if (!file) continue;
      const xml = await file.async("text");
      parseDublinCoreMetadata(xml, metadata);
    } catch {
      // One unreadable candidate must not prevent the fallbacks from being tried.
      continue;
    }
    if (metadata.title || metadata.author) return;
  }
}
367
/**
 * Fills missing metadata fields from a metadata XML document.
 *
 * For every field a list of candidate element names is tried in order; the
 * trimmed text of the first element of the first non-empty candidate wins.
 * Fields that already hold a truthy value are left untouched, and fields with
 * no value in the XML are not created (no `undefined`-valued keys are written).
 *
 * @param {string} xml - Raw XML text (any DOCTYPE is stripped before parsing).
 * @param {object} metadata - Metadata object that is filled in place.
 * @returns {void}
 */
function parseDublinCoreMetadata(xml, metadata) {
  const parser = new import_xmldom.DOMParser();
  const doc = parser.parseFromString(stripDtd(xml), "text/xml");
  if (!doc.documentElement) return;

  const getText = (tagNames) => {
    for (const tag of tagNames) {
      const els = doc.getElementsByTagName(tag);
      if (els.length > 0) {
        const text = els[0].textContent?.trim();
        if (text) return text;
      }
    }
    return void 0;
  };

  // Only assign when a value was actually found, so absent fields stay absent.
  const fillIfMissing = (key, tagNames) => {
    if (metadata[key]) return;
    const value = getText(tagNames);
    if (value) metadata[key] = value;
  };

  fillIfMissing("title", ["dc:title", "title"]);
  fillIfMissing("author", ["dc:creator", "creator", "cp:lastModifiedBy"]);
  fillIfMissing("description", ["dc:description", "description", "dc:subject", "subject"]);
  fillIfMissing("createdAt", ["dcterms:created", "meta:creation-date"]);
  fillIfMissing("modifiedAt", ["dcterms:modified", "meta:date"]);

  if (!metadata.keywords) {
    const keywords = getText(["dc:keyword", "cp:keywords", "meta:keyword"]);
    if (keywords) {
      metadata.keywords = keywords.split(/[,;]/).map((k) => k.trim()).filter(Boolean);
    }
  }
}
257
391
  function precheckZipSize(buffer) {
258
392
  try {
@@ -291,7 +425,7 @@ function extractFromBrokenZip(buffer) {
291
425
  const data = new Uint8Array(buffer);
292
426
  const view = new DataView(buffer);
293
427
  let pos = 0;
294
- const texts = [];
428
+ const blocks = [];
295
429
  let totalDecompressed = 0;
296
430
  let entryCount = 0;
297
431
  while (pos < data.length - 30) {
@@ -332,14 +466,14 @@ function extractFromBrokenZip(buffer) {
332
466
  }
333
467
  totalDecompressed += content.length * 2;
334
468
  if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new KordocError("\uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC");
335
- const sectionText = blocksToMarkdown(parseSectionXml(content));
336
- if (sectionText) texts.push(sectionText);
469
+ blocks.push(...parseSectionXml(content));
337
470
  } catch {
338
471
  continue;
339
472
  }
340
473
  }
341
- if (texts.length === 0) throw new KordocError("\uC190\uC0C1\uB41C HWPX\uC5D0\uC11C \uC139\uC158 \uB370\uC774\uD130\uB97C \uBCF5\uAD6C\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
342
- return texts.join("\n\n");
474
+ if (blocks.length === 0) throw new KordocError("\uC190\uC0C1\uB41C HWPX\uC5D0\uC11C \uC139\uC158 \uB370\uC774\uD130\uB97C \uBCF5\uAD6C\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
475
+ const markdown = blocksToMarkdown(blocks);
476
+ return { markdown, blocks };
343
477
  }
344
478
  async function resolveSectionPaths(zip) {
345
479
  const manifestPaths = ["Contents/content.hpf", "content.hpf"];
@@ -612,7 +746,7 @@ var require2 = (0, import_module.createRequire)(import_meta.url);
612
746
  var CFB = require2("cfb");
613
747
  var MAX_SECTIONS = 100;
614
748
  var MAX_TOTAL_DECOMPRESS = 100 * 1024 * 1024;
615
- function parseHwp5Document(buffer) {
749
+ function parseHwp5Document(buffer, options) {
616
750
  const cfb = CFB.parse(buffer);
617
751
  const headerEntry = CFB.find(cfb, "/FileHeader");
618
752
  if (!headerEntry?.content) throw new KordocError("FileHeader \uC2A4\uD2B8\uB9BC \uC5C6\uC74C");
@@ -621,18 +755,59 @@ function parseHwp5Document(buffer) {
621
755
  if (header.flags & FLAG_ENCRYPTED) throw new KordocError("\uC554\uD638\uD654\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
622
756
  if (header.flags & FLAG_DRM) throw new KordocError("DRM \uBCF4\uD638\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
623
757
  const compressed = (header.flags & FLAG_COMPRESSED) !== 0;
758
+ const metadata = {
759
+ version: `${header.versionMajor}.x`
760
+ };
761
+ extractHwp5Metadata(cfb, metadata);
624
762
  const sections = findSections(cfb);
625
763
  if (sections.length === 0) throw new KordocError("\uC139\uC158 \uC2A4\uD2B8\uB9BC\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
764
+ metadata.pageCount = sections.length;
765
+ const pageFilter = options?.pages ? parsePageRange(options.pages, sections.length) : null;
626
766
  const blocks = [];
627
767
  let totalDecompressed = 0;
628
- for (const sectionData of sections) {
768
+ for (let si = 0; si < sections.length; si++) {
769
+ if (pageFilter && !pageFilter.has(si + 1)) continue;
770
+ const sectionData = sections[si];
629
771
  const data = compressed ? decompressStream(Buffer.from(sectionData)) : Buffer.from(sectionData);
630
772
  totalDecompressed += data.length;
631
773
  if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
632
774
  const records = readRecords(data);
633
775
  blocks.push(...parseSection(records));
634
776
  }
635
- return blocksToMarkdown(blocks);
777
+ const markdown = blocksToMarkdown(blocks);
778
+ return { markdown, blocks, metadata };
779
+ }
780
/**
 * Best-effort extraction of title / author / description from the summary
 * property-set stream of an HWP 5 compound file.
 *
 * Reads the first property set only and handles string properties with ids
 * 2 (title), 4 (author) and 6 (description). Both string encodings of the OLE
 * property-set format are supported: VT_LPSTR (byte-counted, decoded as UTF-8)
 * and VT_LPWSTR (character-counted UTF-16LE). Every offset is bounds-checked
 * and any failure is swallowed, because metadata is optional.
 *
 * @param {object} cfb - Parsed compound-file container (as accepted by CFB.find).
 * @param {object} metadata - Metadata object that is filled in place.
 * @returns {void}
 */
function extractHwp5Metadata(cfb, metadata) {
  const VT_LPSTR = 30;
  const VT_LPWSTR = 31;
  const FIELD_BY_PROP_ID = { 2: "title", 4: "author", 6: "description" };
  try {
    const summaryEntry = CFB.find(cfb, "/HwpSummaryInformation") || CFB.find(cfb, "/SummaryInformation");
    if (!summaryEntry?.content) return;
    const data = Buffer.from(summaryEntry.content);
    if (data.length < 48) return;
    const numSets = data.readUInt32LE(24);
    if (numSets === 0) return;
    const setOffset = data.readUInt32LE(44);
    if (setOffset >= data.length - 8) return;
    const numProps = data.readUInt32LE(setOffset + 4);
    if (numProps === 0 || numProps > 100) return;

    for (let i = 0; i < numProps; i++) {
      const entryOffset = setOffset + 8 + i * 8;
      if (entryOffset + 8 > data.length) break;
      const propId = data.readUInt32LE(entryOffset);
      const propOffset = setOffset + data.readUInt32LE(entryOffset + 4);
      if (propOffset + 8 > data.length) continue;

      const field = FIELD_BY_PROP_ID[propId];
      if (!field) continue;

      const propType = data.readUInt32LE(propOffset);
      if (propType !== VT_LPSTR && propType !== VT_LPWSTR) continue;
      const isWide = propType === VT_LPWSTR;

      // VT_LPSTR stores a byte count, VT_LPWSTR a count of 16-bit characters.
      const declaredLen = data.readUInt32LE(propOffset + 4);
      const byteLen = isWide ? declaredLen * 2 : declaredLen;
      if (declaredLen === 0 || declaredLen > 1e4 || propOffset + 8 + byteLen > data.length) continue;

      const str = data
        .subarray(propOffset + 8, propOffset + 8 + byteLen)
        .toString(isWide ? "utf16le" : "utf8")
        .replace(/\0+$/, "")
        .trim();
      if (!str) continue;
      metadata[field] = str;
    }
  } catch {
    // Metadata is optional; a malformed summary stream is ignored.
  }
}
637
812
  function findSections(cfb) {
638
813
  const sections = [];
@@ -794,7 +969,7 @@ var import_pdf = require("pdfjs-dist/legacy/build/pdf.mjs");
794
969
  import_pdf.GlobalWorkerOptions.workerSrc = "";
795
970
  var MAX_PAGES = 5e3;
796
971
  var MAX_TOTAL_TEXT = 100 * 1024 * 1024;
797
- async function parsePdfDocument(buffer) {
972
+ async function parsePdfDocument(buffer, options) {
798
973
  const doc = await (0, import_pdf.getDocument)({
799
974
  data: new Uint8Array(buffer),
800
975
  useSystemFonts: true,
@@ -803,12 +978,17 @@ async function parsePdfDocument(buffer) {
803
978
  }).promise;
804
979
  try {
805
980
  const pageCount = doc.numPages;
806
- if (pageCount === 0) return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4." };
981
+ if (pageCount === 0) return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.", blocks: [] };
982
+ const metadata = { pageCount };
983
+ await extractPdfMetadata(doc, metadata);
807
984
  const pageTexts = [];
985
+ const blocks = [];
808
986
  let totalChars = 0;
809
987
  let totalTextBytes = 0;
810
988
  const effectivePageCount = Math.min(pageCount, MAX_PAGES);
989
+ const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
811
990
  for (let i = 1; i <= effectivePageCount; i++) {
991
+ if (pageFilter && !pageFilter.has(i)) continue;
812
992
  const page = await doc.getPage(i);
813
993
  const tc = await page.getTextContent();
814
994
  const pageText = extractPageContent(tc.items);
@@ -816,18 +996,54 @@ async function parsePdfDocument(buffer) {
816
996
  totalTextBytes += pageText.length * 2;
817
997
  if (totalTextBytes > MAX_TOTAL_TEXT) throw new KordocError("\uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uD06C\uAE30 \uCD08\uACFC");
818
998
  pageTexts.push(pageText);
999
+ blocks.push({ type: "paragraph", text: pageText });
819
1000
  }
820
- if (totalChars / effectivePageCount < 10) {
821
- return { success: false, fileType: "pdf", pageCount, isImageBased: true, error: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)` };
1001
+ const parsedPageCount = pageFilter ? pageFilter.size : effectivePageCount;
1002
+ if (totalChars / Math.max(parsedPageCount, 1) < 10) {
1003
+ if (options?.ocr) {
1004
+ try {
1005
+ const { ocrPages: ocrPages2 } = await Promise.resolve().then(() => (init_provider(), provider_exports));
1006
+ const ocrBlocks = await ocrPages2(doc, options.ocr, pageFilter, effectivePageCount);
1007
+ if (ocrBlocks.length > 0) {
1008
+ const ocrMarkdown = ocrBlocks.map((b) => b.text || "").filter(Boolean).join("\n\n");
1009
+ return { success: true, fileType: "pdf", markdown: ocrMarkdown, pageCount: parsedPageCount, blocks: ocrBlocks, metadata, isImageBased: true };
1010
+ }
1011
+ } catch {
1012
+ }
1013
+ }
1014
+ return { success: false, fileType: "pdf", pageCount, isImageBased: true, error: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`, code: "IMAGE_BASED_PDF" };
822
1015
  }
823
1016
  let markdown = pageTexts.filter((t) => t.trim()).join("\n\n");
824
1017
  markdown = cleanPdfText(markdown);
825
- return { success: true, fileType: "pdf", markdown, pageCount: effectivePageCount };
1018
+ return { success: true, fileType: "pdf", markdown, pageCount: parsedPageCount, blocks, metadata };
826
1019
  } finally {
827
1020
  await doc.destroy().catch(() => {
828
1021
  });
829
1022
  }
830
1023
  }
1024
/**
 * Best-effort copy of the PDF information dictionary into `metadata`.
 *
 * String fields are trimmed and only assigned when non-empty; date fields are
 * only assigned when parsePdfDate could parse them, so no `undefined`-valued
 * keys are written. Any failure while reading metadata is swallowed.
 *
 * @param {{ getMetadata(): Promise<{info?: object}> }} doc - Document exposing getMetadata().
 * @param {object} metadata - Metadata object that is filled in place.
 * @returns {Promise<void>}
 */
async function extractPdfMetadata(doc, metadata) {
  try {
    const result = await doc.getMetadata();
    const info = result?.info;
    if (!info) return;

    const trimmedString = (value) => (typeof value === "string" ? value.trim() : "");
    const assignText = (key, value) => {
      const text = trimmedString(value);
      if (text) metadata[key] = text;
    };
    const assignDate = (key, value) => {
      if (typeof value !== "string") return;
      const parsed = parsePdfDate(value);
      if (parsed) metadata[key] = parsed;
    };

    assignText("title", info.Title);
    assignText("author", info.Author);
    assignText("creator", info.Creator);
    assignText("description", info.Subject);
    if (trimmedString(info.Keywords)) {
      metadata.keywords = info.Keywords.split(/[,;]/).map((k) => k.trim()).filter(Boolean);
    }
    assignDate("createdAt", info.CreationDate);
    assignDate("modifiedAt", info.ModDate);
  } catch {
    // Metadata is optional; ignore failures.
  }
}
1041
/**
 * Converts a PDF date string (D:YYYYMMDDHHmmSSOHH'mm') into an ISO-8601-style
 * timestamp.
 *
 * Missing components default to "01" (month, day) or "00" (time). When the
 * input carries a timezone designator (Z, +HH'mm' or -HH'mm') it is appended
 * as "Z" / "+HH:mm" / "-HH:mm"; otherwise the result has no offset. The "D:"
 * prefix may be omitted when the string starts with the date itself.
 *
 * @param {string} dateStr - Raw PDF date string.
 * @returns {string|undefined} Formatted timestamp, or undefined when no date is found.
 */
function parsePdfDate(dateStr) {
  const m = dateStr.match(
    /(?:D:|^\s*)(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?(?:(Z)|([+-])(\d{2})'?(\d{2})?'?)?/
  );
  if (!m) return void 0;
  const [, year, month = "01", day = "01", hour = "00", min = "00", sec = "00", utc, sign, tzHour, tzMin = "00"] = m;
  let offset = "";
  if (utc) offset = "Z";
  else if (sign) offset = `${sign}${tzHour}:${tzMin}`;
  return `${year}-${month}-${day}T${hour}:${min}:${sec}${offset}`;
}
831
1047
  function extractPageContent(rawItems) {
832
1048
  const items = normalizeItems(rawItems);
833
1049
  if (items.length === 0) return "";
@@ -1100,53 +1316,447 @@ function mergeKoreanLines(text) {
1100
1316
  return result.join("\n");
1101
1317
  }
1102
1318
 
1319
+ // src/diff/text-diff.ts
1320
/**
 * Similarity of two strings in the range 0..1, based on edit distance.
 *
 * Identical strings (including two empty strings) score 1; if exactly one side
 * is empty/falsy the score is 0; otherwise it is
 * 1 - levenshtein(a, b) / max(length).
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} Similarity score between 0 and 1.
 */
function similarity(a, b) {
  if (a === b) return 1;
  if (!a || !b) return 0;
  // Both strings are non-empty here, so the longer length is at least 1 and the
  // division is always safe.
  const maxLen = Math.max(a.length, b.length);
  return 1 - levenshtein(a, b) / maxLen;
}
1327
/**
 * Similarity of two strings after whitespace normalization of both sides.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} Result of similarity() on the normalized strings.
 */
function normalizedSimilarity(a, b) {
  const left = normalize(a);
  const right = normalize(b);
  return similarity(left, right);
}
1330
/**
 * Collapses every run of whitespace to a single space and removes leading and
 * trailing whitespace.
 *
 * @param {string} s
 * @returns {string}
 */
function normalize(s) {
  const words = s.split(/\s+/).filter((word) => word !== "");
  return words.join(" ");
}
1333
/**
 * Levenshtein edit distance between two strings, computed with two rolling
 * rows sized by the shorter input.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} Minimum number of single-unit insertions, deletions and substitutions.
 */
function levenshtein(a, b) {
  let shorter = a;
  let longer = b;
  if (shorter.length > longer.length) {
    shorter = b;
    longer = a;
  }
  const width = shorter.length;

  let previousRow = [];
  for (let k = 0; k <= width; k++) previousRow.push(k);
  let currentRow = new Array(width + 1);

  for (let row = 1; row <= longer.length; row++) {
    currentRow[0] = row;
    const unit = longer[row - 1];
    for (let col = 1; col <= width; col++) {
      const diagonal = previousRow[col - 1];
      currentRow[col] = shorter[col - 1] === unit
        ? diagonal
        : 1 + Math.min(diagonal, previousRow[col], currentRow[col - 1]);
    }
    // Swap the rows so the one just filled becomes "previous".
    const spare = previousRow;
    previousRow = currentRow;
    currentRow = spare;
  }
  return previousRow[width];
}
1353
+
1354
+ // src/diff/compare.ts
1355
// Minimum blockSimilarity score for two blocks to be paired by alignBlocks.
+ var SIMILARITY_THRESHOLD = 0.4;
1356
/**
 * Parses two documents concurrently and returns the block-level diff.
 *
 * @param {ArrayBuffer} bufferA - First document.
 * @param {ArrayBuffer} bufferB - Second document.
 * @param {object} [options] - Forwarded unchanged to parse() for both documents.
 * @returns {Promise<{stats: object, diffs: object[]}>} Result of diffBlocks().
 * @throws {Error} When either document fails to parse.
 */
async function compare(bufferA, bufferB, options) {
  const parsed = await Promise.all([bufferA, bufferB].map((buffer) => parse(buffer, options)));
  const [resultA, resultB] = parsed;
  if (!resultA.success) {
    throw new Error(`\uBB38\uC11CA \uD30C\uC2F1 \uC2E4\uD328: ${resultA.error}`);
  }
  if (!resultB.success) {
    throw new Error(`\uBB38\uC11CB \uD30C\uC2F1 \uC2E4\uD328: ${resultB.error}`);
  }
  return diffBlocks(resultA.blocks, resultB.blocks);
}
1365
/**
 * Produces a block-level diff between two block lists.
 *
 * Blocks are first paired by alignBlocks. A pair scoring at least 0.99 is
 * reported as "unchanged" (with similarity 1); any other pair is "modified",
 * with per-cell diffs attached when both sides are tables. Unpaired blocks are
 * reported as "removed" (left only) or "added" (right only).
 *
 * @param {object[]} blocksA - Blocks of the "before" document.
 * @param {object[]} blocksB - Blocks of the "after" document.
 * @returns {{stats: {added: number, removed: number, modified: number, unchanged: number}, diffs: object[]}}
 */
function diffBlocks(blocksA, blocksB) {
  const stats = { added: 0, removed: 0, modified: 0, unchanged: 0 };
  const diffs = [];
  const record = (entry) => {
    diffs.push(entry);
    stats[entry.type] += 1;
  };

  alignBlocks(blocksA, blocksB).forEach(([before, after]) => {
    if (before && after) {
      const score = blockSimilarity(before, after);
      if (score >= 0.99) {
        record({ type: "unchanged", before, after, similarity: 1 });
        return;
      }
      const entry = { type: "modified", before, after, similarity: score };
      const bothTables = before.type === "table" && after.type === "table" && before.table && after.table;
      if (bothTables) {
        entry.cellDiffs = diffTableCells(before.table, after.table);
      }
      record(entry);
      return;
    }
    if (before) {
      record({ type: "removed", before });
    } else if (after) {
      record({ type: "added", after });
    }
  });

  return { stats, diffs };
}
1393
/**
 * Aligns two block lists with a longest-common-subsequence pass in which two
 * blocks "match" when their similarity reaches SIMILARITY_THRESHOLD.
 *
 * Returns [left, right] pairs in document order; an unmatched block is paired
 * with null. When the DP table would exceed 1e7 cells the positional
 * fallbackAlign is used instead.
 *
 * @param {object[]} a - Left block list.
 * @param {object[]} b - Right block list.
 * @returns {Array<[object|null, object|null]>}
 */
function alignBlocks(a, b) {
  const leftCount = a.length;
  const rightCount = b.length;
  if (leftCount * rightCount > 1e7) return fallbackAlign(a, b);

  // Memoize pair scores: the fill and the backtrack both query them.
  const scores = /* @__PURE__ */ new Map();
  const isMatch = (li, ri) => {
    const key = `${li},${ri}`;
    if (!scores.has(key)) scores.set(key, blockSimilarity(a[li], b[ri]));
    return scores.get(key) >= SIMILARITY_THRESHOLD;
  };

  // lcs[r][c] = length of the best alignment of a[0..r) with b[0..c).
  const lcs = [];
  for (let r = 0; r <= leftCount; r++) lcs.push(new Array(rightCount + 1).fill(0));
  for (let r = 1; r <= leftCount; r++) {
    for (let c = 1; c <= rightCount; c++) {
      lcs[r][c] = isMatch(r - 1, c - 1)
        ? lcs[r - 1][c - 1] + 1
        : Math.max(lcs[r - 1][c], lcs[r][c - 1]);
    }
  }

  // Walk back from the bottom-right corner to recover the matched index pairs.
  const matched = [];
  let r = leftCount;
  let c = rightCount;
  while (r > 0 && c > 0) {
    const tookDiagonal = isMatch(r - 1, c - 1) && lcs[r][c] === lcs[r - 1][c - 1] + 1;
    if (tookDiagonal) {
      matched.push([r - 1, c - 1]);
      r -= 1;
      c -= 1;
    } else if (lcs[r - 1][c] >= lcs[r][c - 1]) {
      r -= 1;
    } else {
      c -= 1;
    }
  }
  matched.reverse();

  // Interleave unmatched blocks (left-only first, then right-only) around each match.
  const aligned = [];
  let nextLeft = 0;
  let nextRight = 0;
  for (const [li, ri] of matched) {
    for (; nextLeft < li; nextLeft++) aligned.push([a[nextLeft], null]);
    for (; nextRight < ri; nextRight++) aligned.push([null, b[nextRight]]);
    aligned.push([a[nextLeft], b[nextRight]]);
    nextLeft += 1;
    nextRight += 1;
  }
  for (; nextLeft < leftCount; nextLeft++) aligned.push([a[nextLeft], null]);
  for (; nextRight < rightCount; nextRight++) aligned.push([null, b[nextRight]]);
  return aligned;
}
1441
/**
 * Positional alignment: pairs the i-th element of each list, padding the
 * shorter side (and any falsy entry) with null.
 *
 * @param {Array} a
 * @param {Array} b
 * @returns {Array<[any, any]>}
 */
function fallbackAlign(a, b) {
  const pairCount = Math.max(a.length, b.length);
  return Array.from({ length: pairCount }, (_, index) => [a[index] || null, b[index] || null]);
}
1449
/**
 * Similarity score (0..1) between two blocks.
 *
 * Blocks of different types score 0. Tables are compared with
 * tableSimilarity (0 when either table payload is missing). Paragraphs are
 * compared by normalized text, treating missing text as empty. Any other block
 * type is compared by normalized text when both sides carry a string `text`,
 * so identical blocks of such types no longer score 0.
 *
 * @param {{type: string, text?: string, table?: object}} a
 * @param {{type: string, text?: string, table?: object}} b
 * @returns {number}
 */
function blockSimilarity(a, b) {
  if (a.type !== b.type) return 0;
  if (a.type === "table") {
    return a.table && b.table ? tableSimilarity(a.table, b.table) : 0;
  }
  if (a.type === "paragraph") {
    return normalizedSimilarity(a.text || "", b.text || "");
  }
  // Other text-bearing block types: compare their text instead of always scoring 0.
  if (typeof a.text === "string" && typeof b.text === "string") {
    return normalizedSimilarity(a.text, b.text);
  }
  return 0;
}
1459
/**
 * Similarity of two tables: 30% from how close their cell counts
 * (rows × cols) are, 70% from the normalized similarity of all cell texts
 * joined with spaces.
 *
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} a
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} b
 * @returns {number}
 */
function tableSimilarity(a, b) {
  const areaA = a.rows * a.cols;
  const areaB = b.rows * b.cols;
  const dimSim = 1 - Math.abs(areaA - areaB) / Math.max(areaA, areaB, 1);

  const joinedText = (table) => table.cells.flat().map((cell) => cell.text).join(" ");
  const contentSim = normalizedSimilarity(joinedText(a), joinedText(b));

  return dimSim * 0.3 + contentSim * 0.7;
}
1466
/**
 * Cell-by-cell diff of two tables over the union of their dimensions.
 *
 * Each output cell is { type, before, after } where type is:
 *  - "added":     the cell exists only in `b`
 *  - "removed":   the cell exists only in `a`
 *  - "unchanged": both texts are equal, or the position exists in neither table
 *  - "modified":  both exist with different text
 *
 * Rows shorter than the declared column count (ragged tables) are tolerated:
 * a missing cell is treated as absent instead of throwing.
 *
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} a
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} b
 * @returns {Array<Array<{type: string, before: string|undefined, after: string|undefined}>>}
 */
function diffTableCells(a, b) {
  const maxRows = Math.max(a.rows, b.rows);
  const maxCols = Math.max(a.cols, b.cols);
  const textAt = (table, r, c) =>
    r < table.rows && c < table.cols ? table.cells[r]?.[c]?.text : void 0;

  const result = [];
  for (let r = 0; r < maxRows; r++) {
    const row = [];
    for (let c = 0; c < maxCols; c++) {
      const cellA = textAt(a, r, c);
      const cellB = textAt(b, r, c);
      let type;
      if (cellA === cellB) type = "unchanged"; // also covers "absent on both sides"
      else if (cellA === void 0) type = "added";
      else if (cellB === void 0) type = "removed";
      else type = "modified";
      row.push({ type, before: cellA, after: cellB });
    }
    result.push(row);
  }
  return result;
}
1486
+
1487
+ // src/form/recognize.ts
1488
// Korean keywords that commonly appear as form-field labels (name, address,
// phone, date, amount, ...). A cell containing any of them is treated as a label.
var LABEL_KEYWORDS = /* @__PURE__ */ new Set([
  "\uC131\uBA85",
  "\uC774\uB984",
  "\uC8FC\uC18C",
  "\uC804\uD654",
  "\uC804\uD654\uBC88\uD638",
  "\uD734\uB300\uD3F0",
  "\uD578\uB4DC\uD3F0",
  "\uC5F0\uB77D\uCC98",
  "\uC0DD\uB144\uC6D4\uC77C",
  "\uC8FC\uBBFC\uB4F1\uB85D\uBC88\uD638",
  "\uC18C\uC18D",
  "\uC9C1\uC704",
  "\uC9C1\uAE09",
  "\uBD80\uC11C",
  "\uC774\uBA54\uC77C",
  "\uD329\uC2A4",
  "\uD559\uAD50",
  "\uD559\uB144",
  "\uBC18",
  "\uBC88\uD638",
  "\uC2E0\uCCAD\uC778",
  "\uB300\uD45C\uC790",
  "\uB2F4\uB2F9\uC790",
  "\uC791\uC131\uC790",
  "\uD655\uC778\uC790",
  "\uC2B9\uC778\uC790",
  "\uC77C\uC2DC",
  "\uB0A0\uC9DC",
  "\uAE30\uAC04",
  "\uC7A5\uC18C",
  "\uBAA9\uC801",
  "\uC0AC\uC720",
  "\uBE44\uACE0",
  "\uAE08\uC561",
  "\uC218\uB7C9",
  "\uB2E8\uAC00",
  "\uD569\uACC4",
  "\uACC4",
  "\uC18C\uACC4"
]);

/**
 * Heuristically decides whether a table cell's text is a field label.
 *
 * A non-empty text of at most 30 characters is a label when it
 *  1. contains one of LABEL_KEYWORDS, or
 *  2. is 2–8 characters made only of Hangul syllables, whitespace,
 *     parentheses, a middle dot or a colon, or
 *  3. consists of Hangul/Latin letters and whitespace followed by a colon.
 * Both the ASCII colon and the full-width colon (U+FF1A) are accepted.
 *
 * @param {string} text - Cell text.
 * @returns {boolean}
 */
function isLabelCell(text) {
  const trimmed = text.trim();
  if (!trimmed || trimmed.length > 30) return false;
  for (const kw of LABEL_KEYWORDS) {
    if (trimmed.includes(kw)) return true;
  }
  // The character class contains no digits, so no separate digit check is needed.
  if (/^[가-힣\s()·:\uFF1A]{2,8}$/.test(trimmed)) return true;
  if (/^[가-힣A-Za-z\s]+[:\uFF1A]$/.test(trimmed)) return true;
  return false;
}
1539
/**
 * Collects label/value form fields from a list of blocks.
 *
 * Table blocks are scanned with extractFromTable and paragraph blocks with
 * extractInlineFields; table-derived fields come first in the result.
 * Confidence is the share of tables that yielded at least one field; when
 * there are no tables it is 0.3 if any field was found and 0 otherwise.
 *
 * @param {Array<{type: string, text?: string, table?: object}>} blocks
 * @returns {{fields: object[], confidence: number}}
 */
function extractFormFields(blocks) {
  const fromTables = [];
  const fromParagraphs = [];
  let tableCount = 0;
  let tablesWithFields = 0;

  for (const block of blocks) {
    if (block.type === "table" && block.table) {
      tableCount += 1;
      const found = extractFromTable(block.table);
      if (found.length > 0) {
        tablesWithFields += 1;
        fromTables.push(...found);
      }
    } else if (block.type === "paragraph" && block.text) {
      fromParagraphs.push(...extractInlineFields(block.text));
    }
  }

  const fields = [...fromTables, ...fromParagraphs];
  let confidence;
  if (tableCount > 0) {
    confidence = tablesWithFields / tableCount;
  } else {
    confidence = fields.length > 0 ? 0.3 : 0;
  }
  return { fields, confidence: Math.min(confidence, 1) };
}
1561
/**
 * Extracts label/value fields from a single table.
 *
 * Pass 1 (key–value layout): every cell recognized by isLabelCell whose right
 * neighbour is non-blank yields a field; a trailing ASCII or full-width colon
 * is stripped from the label.
 * Pass 2 (header layout, only if pass 1 found nothing): when every header cell
 * in the first row is 1–20 characters long, each non-blank body cell becomes a
 * field labelled by its column header.
 *
 * Missing rows or cells (ragged tables) are treated as empty text instead of
 * throwing.
 *
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} table
 * @returns {Array<{label: string, value: string, row: number, col: number}>}
 */
function extractFromTable(table) {
  const fields = [];
  const textOf = (cell) => (typeof cell?.text === "string" ? cell.text : "");

  if (table.cols >= 2) {
    for (let r = 0; r < table.rows; r++) {
      for (let c = 0; c < table.cols - 1; c++) {
        const labelText = textOf(table.cells[r]?.[c]);
        const valueText = textOf(table.cells[r]?.[c + 1]).trim();
        if (valueText && isLabelCell(labelText)) {
          fields.push({
            label: labelText.trim().replace(/[:\uFF1A]\s*$/, ""),
            value: valueText,
            row: r,
            col: c
          });
        }
      }
    }
  }

  if (fields.length === 0 && table.rows >= 2 && table.cols >= 2) {
    const headerRow = table.cells[0];
    const allLabels = Array.isArray(headerRow) && headerRow.every((cell) => {
      const t = textOf(cell).trim();
      return t.length > 0 && t.length <= 20;
    });
    if (allLabels) {
      for (let r = 1; r < table.rows; r++) {
        for (let c = 0; c < table.cols; c++) {
          const label = textOf(headerRow[c]).trim();
          const value = textOf(table.cells[r]?.[c]).trim();
          if (label && value) {
            fields.push({ label, value, row: r, col: c });
          }
        }
      }
    }
  }
  return fields;
}
1599
/**
 * Finds inline "label: value" pairs inside free text.
 *
 * A label is 2–10 Hangul or Latin letters followed by an ASCII or full-width
 * colon; the value runs up to the next newline, comma or semicolon (at most
 * 100 characters). Fields found this way carry row/col of -1.
 *
 * @param {string} text - Paragraph text.
 * @returns {Array<{label: string, value: string, row: number, col: number}>}
 */
function extractInlineFields(text) {
  const pattern = /([가-힣A-Za-z]{2,10})\s*[:\uFF1A]\s*([^\n,;]{1,100})/g;
  const fields = [];
  for (const match of text.matchAll(pattern)) {
    const label = match[1].trim();
    const value = match[2].trim();
    if (value) {
      fields.push({ label, value, row: -1, col: -1 });
    }
  }
  return fields;
}
1612
+
1613
+ // src/hwpx/generator.ts
1614
+ var import_jszip2 = __toESM(require("jszip"), 1);
1615
// XML namespace URI used for the generated section markup (bound to both the
// `hs` and `hp` prefixes in blocksToSectionXml).
// NOTE(review): confirm this URI against the HWPX/OWPML schema expected by consumers.
+ var HWPML_NS = "http://www.hancom.co.kr/hwpml/2016/HwpMl";
1616
/**
 * Converts Markdown text into a minimal HWPX archive.
 *
 * The archive holds three entries, added in this order: an uncompressed
 * `mimetype`, the manifest `Contents/content.hpf`, and the single section
 * `Contents/section0.xml` generated from the Markdown.
 *
 * @param {string} markdown - Markdown source.
 * @returns {Promise<ArrayBuffer>} The zipped HWPX document.
 */
async function markdownToHwpx(markdown) {
  const sectionXml = blocksToSectionXml(parseMarkdownToBlocks(markdown));
  const archive = new import_jszip2.default();
  archive.file("mimetype", "application/hwp+zip", { compression: "STORE" });
  archive.file("Contents/content.hpf", generateManifest());
  archive.file("Contents/section0.xml", sectionXml);
  const bytes = await archive.generateAsync({ type: "arraybuffer" });
  return bytes;
}
1625
/**
 * Splits Markdown into a flat list of heading, table and paragraph blocks.
 *
 *  - `#`–`######` lines become { type: "heading", text, level }.
 *  - Consecutive lines starting with `|` become one
 *    { type: "table", rows: string[][] }; the separator row (`|---|---|`) is dropped.
 *  - Every other non-blank line becomes its own { type: "paragraph", text }.
 *
 * Handles both LF and CRLF line endings, indented table rows, and table rows
 * without a trailing pipe.
 *
 * @param {string} md - Markdown source.
 * @returns {Array<object>} Blocks in document order.
 */
function parseMarkdownToBlocks(md) {
  // Split on CRLF as well: a stray "\r" would stop the heading regex from matching.
  const lines = md.split(/\r?\n/);
  const blocks = [];

  const isTableLine = (line) => line.trimStart().startsWith("|");
  // A separator row consists only of pipes, dashes, colons and whitespace and
  // contains at least one "---" run. `row` is already trimmed.
  const isSeparatorRow = (row) => row.includes("---") && /^\|[\s|:\-]+$/.test(row);
  // Split a trimmed row that starts with "|" into trimmed cell texts.
  const splitCells = (row) => {
    let inner = row.slice(1);
    if (inner.endsWith("|")) inner = inner.slice(0, -1);
    else if (inner === "") return [];
    return inner.split("|").map((cell) => cell.trim());
  };

  let i = 0;
  while (i < lines.length) {
    const line = lines[i];
    if (!line.trim()) {
      i++;
      continue;
    }

    const headingMatch = line.match(/^(#{1,6})\s+(.+)$/);
    if (headingMatch) {
      blocks.push({ type: "heading", text: headingMatch[2].trim(), level: headingMatch[1].length });
      i++;
      continue;
    }

    if (isTableLine(line)) {
      const tableRows = [];
      for (; i < lines.length && isTableLine(lines[i]); i++) {
        const row = lines[i].trim();
        if (isSeparatorRow(row)) continue;
        const cells = splitCells(row);
        if (cells.length > 0) tableRows.push(cells);
      }
      if (tableRows.length > 0) {
        blocks.push({ type: "table", rows: tableRows });
      }
      continue;
    }

    blocks.push({ type: "paragraph", text: line.trim() });
    i++;
  }
  return blocks;
}
1663
/**
 * Escapes text for inclusion in XML content or double-quoted attributes.
 *
 * Replaces &, <, > and " with their entities and removes characters that are
 * not allowed in XML 1.0 documents (C0 controls other than tab/LF/CR, U+FFFE
 * and U+FFFF), which would otherwise make the generated document malformed.
 *
 * @param {string} text
 * @returns {string}
 */
function escapeXml(text) {
  return text
    .replace(/[\u0000-\u0008\u000B\u000C\u000E-\u001F\uFFFE\uFFFF]/g, "")
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;");
}
1666
/**
 * Wraps XML-escaped text in a paragraph element containing one run and one
 * text node.
 *
 * @param {string} text - Raw (unescaped) paragraph text.
 * @returns {string} Paragraph markup.
 */
function generateParagraph(text) {
  const escaped = escapeXml(text);
  return "<hp:p><hp:run><hp:t>" + escaped + "</hp:t></hp:run></hp:p>";
}
1669
/**
 * Serializes rows of cell texts into table markup: one row element per row and
 * one cell element (with a 1×1 cell span and a paragraph) per cell.
 *
 * @param {string[][]} rows - Table rows, each an array of cell texts.
 * @returns {string} Table markup.
 */
function generateTable(rows) {
  let xml = "<hp:tbl>";
  for (const row of rows) {
    xml += "<hp:tr>";
    for (const cell of row) {
      xml += `<hp:tc><hp:cellSpan colSpan="1" rowSpan="1"/>${generateParagraph(cell)}</hp:tc>`;
    }
    xml += "</hp:tr>";
  }
  return xml + "</hp:tbl>";
}
1678
// Build the XML text of a section from a list of parsed blocks.
//
// Each block is rendered according to its `type`:
//   - "heading" and "paragraph" both become a plain paragraph via
//     generateParagraph(block.text || ""); the heading `level` is not used here.
//   - "table" becomes generateTable(block.rows), or "" when `rows` is missing.
//   - any other type renders as "".
// The rendered pieces are joined with a newline separator and wrapped in an
// <hs:sec> root element preceded by an XML declaration.
//
// NOTE(review): the `hs` and `hp` prefixes are both bound to the same
// HWPML_NS value; HWPML_NS is defined outside this view, so confirm that a
// single namespace for both prefixes is intended.
+ function blocksToSectionXml(blocks) {
1679
+ const body = blocks.map((block) => {
1680
+ switch (block.type) {
1681
+ case "heading":
1682
+ return generateParagraph(block.text || "");
1683
+ case "table":
1684
+ return block.rows ? generateTable(block.rows) : "";
1685
+ case "paragraph":
1686
+ return generateParagraph(block.text || "");
1687
+ default:
1688
+ return "";
1689
+ }
1690
+ }).join("\n ");
1691
+ return `<?xml version="1.0" encoding="UTF-8"?>
1692
+ <hs:sec xmlns:hs="${HWPML_NS}" xmlns:hp="${HWPML_NS}">
1693
+ ${body}
1694
+ </hs:sec>`;
1695
+ }
1696
// Return the fixed package manifest XML for a generated document.
//
// The manifest declares exactly one item (id "s0", href "section0.xml",
// media type application/xml) and a spine with a single itemref pointing at
// that item. The function takes no arguments, so the output is constant.
//
// NOTE(review): the caller stores this manifest at "Contents/content.hpf" and
// the section at "Contents/section0.xml"; presumably the href is resolved
// relative to the manifest's own folder - confirm against the HWPX reader.
+ function generateManifest() {
1697
+ return `<?xml version="1.0" encoding="UTF-8"?>
1698
+ <opf:package xmlns:opf="http://www.idpf.org/2007/opf">
1699
+ <opf:manifest>
1700
+ <opf:item id="s0" href="section0.xml" media-type="application/xml"/>
1701
+ </opf:manifest>
1702
+ <opf:spine>
1703
+ <opf:itemref idref="s0"/>
1704
+ </opf:spine>
1705
+ </opf:package>`;
1706
+ }
1707
+
1103
1708
  // src/index.ts
1104
// Entry point: detect the document format of `buffer` and delegate to the
// matching parser. ("-" lines are the 1.3.0 version, "+" lines the 1.4.0 one.)
//
// 1.4.0 behaviour:
//   - `options` is forwarded unchanged to parseHwpx / parseHwp / parsePdf.
//   - A falsy buffer or one with byteLength 0 yields a failure result with
//     code "EMPTY_INPUT" (the message is Korean for "empty buffer or invalid
//     input").
//   - A format other than "hwpx", "hwp" or "pdf" yields a failure result with
//     code "UNSUPPORTED_FORMAT" (Korean for "unsupported file format").
// Both failure results use fileType "unknown".
- async function parse(buffer) {
1709
+ async function parse(buffer, options) {
1105
1710
  if (!buffer || buffer.byteLength === 0) {
1106
- return { success: false, fileType: "unknown", error: "\uBE48 \uBC84\uD37C\uC774\uAC70\uB098 \uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uC785\uB825\uC785\uB2C8\uB2E4." };
1711
+ return { success: false, fileType: "unknown", error: "\uBE48 \uBC84\uD37C\uC774\uAC70\uB098 \uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uC785\uB825\uC785\uB2C8\uB2E4.", code: "EMPTY_INPUT" };
1107
1712
  }
1108
1713
  const format = detectFormat(buffer);
1109
1714
  switch (format) {
1110
1715
  case "hwpx":
1111
- return parseHwpx(buffer);
1716
+ return parseHwpx(buffer, options);
1112
1717
  case "hwp":
1113
- return parseHwp(buffer);
1718
+ return parseHwp(buffer, options);
1114
1719
  case "pdf":
1115
- return parsePdf(buffer);
1720
+ return parsePdf(buffer, options);
1116
1721
  default:
1117
- return { success: false, fileType: "unknown", error: "\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD30C\uC77C \uD615\uC2DD\uC785\uB2C8\uB2E4." };
1722
+ return { success: false, fileType: "unknown", error: "\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD30C\uC77C \uD615\uC2DD\uC785\uB2C8\uB2E4.", code: "UNSUPPORTED_FORMAT" };
1118
1723
  }
1119
1724
  }
1120
// Parse an HWPX buffer and wrap the outcome in a result object instead of
// throwing. ("-" lines are the 1.3.0 version, "+" lines the 1.4.0 one.)
//
// 1.4.0 behaviour: awaits parseHwpxDocument(buffer, options), which is
// destructured into { markdown, blocks, metadata }, and returns them with
// success: true and fileType "hwpx". Anything thrown is caught and turned
// into a failure result: `error` is err.message for Error instances, else a
// fixed Korean fallback ("HWPX parsing failed"); `code` comes from
// classifyError(err).
- async function parseHwpx(buffer) {
1725
+ async function parseHwpx(buffer, options) {
1121
1726
  try {
1122
- const markdown = await parseHwpxDocument(buffer);
1123
- return { success: true, fileType: "hwpx", markdown };
1727
+ const { markdown, blocks, metadata } = await parseHwpxDocument(buffer, options);
1728
+ return { success: true, fileType: "hwpx", markdown, blocks, metadata };
1124
1729
  } catch (err) {
1125
- return { success: false, fileType: "hwpx", error: err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328" };
1730
+ return { success: false, fileType: "hwpx", error: err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
1126
1731
  }
1127
1732
  }
1128
// Parse a legacy HWP buffer and wrap the outcome in a result object instead
// of throwing. ("-" lines are the 1.3.0 version, "+" lines the 1.4.0 one.)
//
// 1.4.0 behaviour: the input is converted with Buffer.from(buffer) and passed
// with `options` to parseHwp5Document, which is called without `await`. Its
// return value is destructured into { markdown, blocks, metadata } and
// returned with success: true and fileType "hwp". Anything thrown is caught
// and turned into a failure result: `error` is err.message for Error
// instances, else a fixed Korean fallback ("HWP parsing failed"); `code`
// comes from classifyError(err).
- async function parseHwp(buffer) {
1733
+ async function parseHwp(buffer, options) {
1129
1734
  try {
1130
- const markdown = parseHwp5Document(Buffer.from(buffer));
1131
- return { success: true, fileType: "hwp", markdown };
1735
+ const { markdown, blocks, metadata } = parseHwp5Document(Buffer.from(buffer), options);
1736
+ return { success: true, fileType: "hwp", markdown, blocks, metadata };
1132
1737
  } catch (err) {
1133
- return { success: false, fileType: "hwp", error: err instanceof Error ? err.message : "HWP \uD30C\uC2F1 \uC2E4\uD328" };
1738
+ return { success: false, fileType: "hwp", error: err instanceof Error ? err.message : "HWP \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
1134
1739
  }
1135
1740
  }
1136
// Parse a PDF buffer and wrap failures in a result object instead of
// throwing. ("-" lines are the 1.3.0 version, "+" lines the 1.4.0 one.)
//
// 1.4.0 behaviour: returns whatever parsePdfDocument(buffer, options)
// resolves to. The call is awaited inside the try block, so a rejection is
// caught here as well. On failure the result has fileType "pdf"; `error` is
// err.message for Error instances, else a fixed Korean fallback ("PDF parsing
// failed"); `code` comes from classifyError(err).
- async function parsePdf(buffer) {
1741
+ async function parsePdf(buffer, options) {
1137
1742
  try {
1138
- return await parsePdfDocument(buffer);
1743
+ return await parsePdfDocument(buffer, options);
1139
1744
  } catch (err) {
1140
- return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328" };
1745
+ return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
1141
1746
  }
1142
1747
  }
1143
1748
  // Annotate the CommonJS export names for ESM import in node:
1144
1749
  0 && (module.exports = {
1145
1750
  VERSION,
1751
+ blocksToMarkdown,
1752
+ compare,
1146
1753
  detectFormat,
1754
+ diffBlocks,
1755
+ extractFormFields,
1147
1756
  isHwpxFile,
1148
1757
  isOldHwpFile,
1149
1758
  isPdfFile,
1759
+ markdownToHwpx,
1150
1760
  parse,
1151
1761
  parseHwp,
1152
1762
  parseHwpx,