kordoc 1.3.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,3 +1,55 @@
1
// Module-wiring helpers (they look bundler-generated). They stay `var`
// bindings so the hoisting behaviour of the surrounding bundle is unchanged.
var __defProp = Object.defineProperty;
var __getOwnPropNames = Object.getOwnPropertyNames;

/**
 * Wraps a lazily-initialised module.
 *
 * @param {object} fn  Object whose first own property is the initialiser.
 * @param {*} [res]    Slot for the cached initialiser result.
 * @returns {Function} `__init`, which runs the initialiser on first call and
 *                     returns the cached result afterwards.
 */
var __esm = (fn, res) => function __init() {
  if (fn) {
    const initialise = fn[__getOwnPropNames(fn)[0]];
    // Clear `fn` before the call so the initialiser can never run twice,
    // even when it throws or re-enters `__init`.
    fn = 0;
    res = initialise(0);
  }
  return res;
};

/**
 * Defines an enumerable getter on `target` for every enumerable key of `all`.
 *
 * @param {object} target  Namespace object receiving the getters.
 * @param {object} all     Map of export name to getter function.
 */
var __export = (target, all) => {
  for (const name in all) {
    __defProp(target, name, { get: all[name], enumerable: true });
  }
};
10
+
11
+ // src/ocr/provider.ts
12
// Namespace object for the OCR provider module. `ocrPages` is exposed through
// a getter, so the binding is resolved when the property is read.
var provider_exports = {};
__export(provider_exports, { ocrPages: () => ocrPages });
16
/**
 * Runs the supplied OCR provider over the selected pages of a document.
 *
 * Pages are visited in order from 1 to `effectivePageCount`. Each page is
 * rendered with `renderPageToPng` and handed to `provider` together with its
 * page number and the MIME type "image/png". Non-blank results become
 * paragraph blocks.
 *
 * @param {object} doc                 Document exposing `getPage(pageNumber)`.
 * @param {Function} provider          `(imageData, pageNumber, mimeType) => text`.
 * @param {Set<number>|null} pageFilter Pages to process; falsy means all pages.
 * @param {number} effectivePageCount  Highest page number to visit.
 * @returns {Promise<Array<{type: string, text: string}>>} Recognised blocks.
 */
async function ocrPages(doc, provider, pageFilter, effectivePageCount) {
  const blocks = [];
  for (let pageNumber = 1; pageNumber <= effectivePageCount; pageNumber += 1) {
    const selected = !pageFilter || pageFilter.has(pageNumber);
    if (!selected) continue;

    const page = await doc.getPage(pageNumber);
    try {
      const png = await renderPageToPng(page);
      const recognised = (await provider(png, pageNumber, "image/png")).trim();
      if (recognised) {
        blocks.push({ type: "paragraph", text: recognised });
      }
    } catch {
      // Best effort: a page that fails to render or recognise is skipped and
      // the remaining pages are still processed.
    }
  }
  return blocks;
}
32
/**
 * Renders one page to PNG bytes at 2x scale using the optional `canvas`
 * package.
 *
 * @param {object} page  Page exposing `getViewport({ scale })` and
 *                       `render({ canvasContext, viewport }).promise`.
 * @returns {Promise<Uint8Array>} PNG-encoded image data.
 * @throws {Error} When the `canvas` package cannot be imported.
 */
async function renderPageToPng(page) {
  let canvasModule;
  try {
    canvasModule = await import("canvas");
  } catch {
    throw new Error("OCR\uC744 \uC0AC\uC6A9\uD558\uB824\uBA74 'canvas' \uD328\uD0A4\uC9C0\uB97C \uC124\uCE58\uD558\uC138\uC694: npm install canvas");
  }
  const { createCanvas } = canvasModule;

  const viewport = page.getViewport({ scale: 2 });
  const width = Math.floor(viewport.width);
  const height = Math.floor(viewport.height);
  const canvas = createCanvas(width, height);
  const canvasContext = canvas.getContext("2d");

  await page.render({ canvasContext, viewport }).promise;
  return new Uint8Array(canvas.toBuffer("image/png"));
}
47
// Lazy initialiser for the OCR provider module. The module body has no
// start-up work beyond its directive.
var init_provider = __esm({
  "src/ocr/provider.ts"() {
    "use strict";
  },
});
52
+
1
53
  // src/detect.ts
2
54
  function magicBytes(buffer) {
3
55
  return new Uint8Array(buffer, 0, Math.min(4, buffer.byteLength));
@@ -157,7 +209,7 @@ function tableToMarkdown(table) {
157
209
  }
158
210
 
159
211
  // src/utils.ts
160
- var VERSION = true ? "1.3.0" : "0.0.0-dev";
212
+ var VERSION = true ? "1.4.0" : "0.0.0-dev";
161
213
  var KordocError = class extends Error {
162
214
  constructor(message) {
163
215
  super(message);
@@ -168,6 +220,47 @@ function isPathTraversal(name) {
168
220
  const normalized = name.replace(/\\/g, "/");
169
221
  return normalized.includes("..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
170
222
  }
223
/**
 * Maps a thrown value to an error code by looking for known fragments in its
 * message. Rules are evaluated in order and the first hit wins.
 *
 * @param {unknown} err  Value caught from a parser.
 * @returns {string} One of "ENCRYPTED", "DRM_PROTECTED", "ZIP_BOMB",
 *   "DECOMPRESSION_BOMB", "IMAGE_BASED_PDF", "NO_SECTIONS", "CORRUPTED" or
 *   "PARSE_ERROR" (also used for non-Error values).
 */
function classifyError(err) {
  if (!(err instanceof Error)) return "PARSE_ERROR";

  const msg = err.message;
  const hasAny = (...fragments) => fragments.some((fragment) => msg.includes(fragment));

  // Order matters: the ZIP-specific rule must precede the generic "bomb" rule.
  const rules = [
    ["ENCRYPTED", () => hasAny("\uC554\uD638\uD654")],
    ["DRM_PROTECTED", () => hasAny("DRM")],
    ["ZIP_BOMB", () => hasAny("ZIP bomb", "ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC", "ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")],
    ["DECOMPRESSION_BOMB", () => hasAny("bomb", "\uD06C\uAE30 \uCD08\uACFC", "\uC555\uCD95 \uD574\uC81C")],
    ["IMAGE_BASED_PDF", () => hasAny("\uC774\uBBF8\uC9C0 \uAE30\uBC18")],
    ["NO_SECTIONS", () => hasAny("\uC139\uC158") && hasAny("\uCC3E\uC744 \uC218 \uC5C6", "\uC5C6\uC74C")],
    ["CORRUPTED", () => hasAny("\uC2DC\uADF8\uB2C8\uCC98", "\uBCF5\uAD6C\uD560 \uC218 \uC5C6")],
  ];

  const hit = rules.find(([, matches]) => matches());
  return hit ? hit[0] : "PARSE_ERROR";
}
235
+
236
+ // src/page-range.ts
237
/**
 * Expands a page specification into a set of 1-based page numbers.
 *
 * Accepted forms:
 *  - an array of numbers (each rounded to the nearest integer), or
 *  - a comma-separated string of single pages ("3") and inclusive ranges
 *    ("2-5"), with optional whitespace around tokens and the dash.
 *
 * Pages outside 1..maxPages are dropped, and range ends are clamped to that
 * interval. A reversed range ("5-3") selects nothing. Malformed tokens such
 * as "1-", "3-5x" or "2abc" are ignored rather than partially parsed.
 *
 * @param {number[]|string} spec  Page specification.
 * @param {number} maxPages       Number of pages available.
 * @returns {Set<number>} Selected page numbers (possibly empty).
 */
function parsePageRange(spec, maxPages) {
  const result = /* @__PURE__ */ new Set();
  if (maxPages <= 0) return result;

  if (Array.isArray(spec)) {
    for (const n of spec) {
      const page = Math.round(n);
      if (page >= 1 && page <= maxPages) result.add(page);
    }
    return result;
  }

  if (typeof spec !== "string" || spec.trim() === "") return result;

  for (const part of spec.split(",")) {
    const token = part.trim();
    if (!token) continue;

    const rangeMatch = token.match(/^(\d+)\s*-\s*(\d+)$/);
    if (rangeMatch) {
      const start = Math.max(1, Number.parseInt(rangeMatch[1], 10));
      const end = Math.min(maxPages, Number.parseInt(rangeMatch[2], 10));
      for (let page = start; page <= end; page++) result.add(page);
      continue;
    }

    // Require the whole token to be digits. parseInt alone would accept a
    // numeric prefix and turn e.g. "3-5x" into page 3.
    if (!/^\d+$/.test(token)) continue;
    const page = Number.parseInt(token, 10);
    if (page >= 1 && page <= maxPages) result.add(page);
  }
  return result;
}
171
264
 
172
265
  // src/hwpx/parser.ts
173
266
  var MAX_DECOMPRESS_SIZE = 100 * 1024 * 1024;
@@ -178,7 +271,7 @@ function clampSpan(val, max) {
178
271
  function stripDtd(xml) {
179
272
  return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
180
273
  }
181
- async function parseHwpxDocument(buffer) {
274
+ async function parseHwpxDocument(buffer, options) {
182
275
  const precheck = precheckZipSize(buffer);
183
276
  if (precheck.totalUncompressed > MAX_DECOMPRESS_SIZE) {
184
277
  throw new KordocError("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
@@ -196,19 +289,62 @@ async function parseHwpxDocument(buffer) {
196
289
  if (actualEntryCount > MAX_ZIP_ENTRIES) {
197
290
  throw new KordocError("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
198
291
  }
292
+ const metadata = {};
293
+ await extractHwpxMetadata(zip, metadata);
199
294
  const sectionPaths = await resolveSectionPaths(zip);
200
295
  if (sectionPaths.length === 0) throw new KordocError("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
296
+ metadata.pageCount = sectionPaths.length;
297
+ const pageFilter = options?.pages ? parsePageRange(options.pages, sectionPaths.length) : null;
201
298
  let totalDecompressed = 0;
202
299
  const blocks = [];
203
- for (const path of sectionPaths) {
204
- const file = zip.file(path);
300
+ for (let si = 0; si < sectionPaths.length; si++) {
301
+ if (pageFilter && !pageFilter.has(si + 1)) continue;
302
+ const file = zip.file(sectionPaths[si]);
205
303
  if (!file) continue;
206
304
  const xml = await file.async("text");
207
305
  totalDecompressed += xml.length * 2;
208
306
  if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
209
307
  blocks.push(...parseSectionXml(xml));
210
308
  }
211
- return blocksToMarkdown(blocks);
309
+ const markdown = blocksToMarkdown(blocks);
310
+ return { markdown, blocks, metadata };
311
+ }
312
/**
 * Fills `metadata` from the first metadata file in the archive that yields a
 * title or an author.
 *
 * Candidate paths are tried in order. Each is looked up exactly first and
 * then case-insensitively among all archive entries. Any failure is ignored,
 * so metadata extraction never aborts parsing.
 *
 * @param {object} zip       Archive exposing `file(path)` and a `files` map
 *                           whose entries have `name` and `async("text")`.
 * @param {object} metadata  Object mutated in place.
 * @returns {Promise<void>}
 */
async function extractHwpxMetadata(zip, metadata) {
  const candidates = ["meta.xml", "META-INF/meta.xml", "docProps/core.xml"];
  try {
    for (const candidate of candidates) {
      let entry = zip.file(candidate);
      if (!entry) {
        const wanted = candidate.toLowerCase();
        entry = Object.values(zip.files).find((f) => f.name.toLowerCase() === wanted) || null;
      }
      if (!entry) continue;

      parseDublinCoreMetadata(await entry.async("text"), metadata);
      if (metadata.title || metadata.author) return;
    }
  } catch {
    // Best effort: metadata is optional.
  }
}
325
/**
 * Reads title, author, description, dates and keywords from a metadata XML
 * string into `metadata`. Values already present on `metadata` are kept.
 *
 * For each field the listed tag names are tried in order and the first
 * non-blank text content of the first matching element wins. Keywords are
 * split on commas and semicolons.
 *
 * @param {string} xml       Metadata XML (DTD is stripped before parsing).
 * @param {object} metadata  Object mutated in place.
 */
function parseDublinCoreMetadata(xml, metadata) {
  const doc = new DOMParser().parseFromString(stripDtd(xml), "text/xml");
  if (!doc.documentElement) return;

  const firstText = (tagNames) => {
    for (const tag of tagNames) {
      const matches = doc.getElementsByTagName(tag);
      if (matches.length === 0) continue;
      const text = matches[0].textContent?.trim();
      if (text) return text;
    }
    return void 0;
  };

  const fieldTags = [
    ["title", ["dc:title", "title"]],
    ["author", ["dc:creator", "creator", "cp:lastModifiedBy"]],
    ["description", ["dc:description", "description", "dc:subject", "subject"]],
    ["createdAt", ["dcterms:created", "meta:creation-date"]],
    ["modifiedAt", ["dcterms:modified", "meta:date"]],
  ];
  for (const [field, tags] of fieldTags) {
    metadata[field] = metadata[field] || firstText(tags);
  }

  const keywords = firstText(["dc:keyword", "cp:keywords", "meta:keyword"]);
  if (keywords && !metadata.keywords) {
    metadata.keywords = keywords.split(/[,;]/).map((k) => k.trim()).filter(Boolean);
  }
}
213
349
  function precheckZipSize(buffer) {
214
350
  try {
@@ -247,7 +383,7 @@ function extractFromBrokenZip(buffer) {
247
383
  const data = new Uint8Array(buffer);
248
384
  const view = new DataView(buffer);
249
385
  let pos = 0;
250
- const texts = [];
386
+ const blocks = [];
251
387
  let totalDecompressed = 0;
252
388
  let entryCount = 0;
253
389
  while (pos < data.length - 30) {
@@ -288,14 +424,14 @@ function extractFromBrokenZip(buffer) {
288
424
  }
289
425
  totalDecompressed += content.length * 2;
290
426
  if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new KordocError("\uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC");
291
- const sectionText = blocksToMarkdown(parseSectionXml(content));
292
- if (sectionText) texts.push(sectionText);
427
+ blocks.push(...parseSectionXml(content));
293
428
  } catch {
294
429
  continue;
295
430
  }
296
431
  }
297
- if (texts.length === 0) throw new KordocError("\uC190\uC0C1\uB41C HWPX\uC5D0\uC11C \uC139\uC158 \uB370\uC774\uD130\uB97C \uBCF5\uAD6C\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
298
- return texts.join("\n\n");
432
+ if (blocks.length === 0) throw new KordocError("\uC190\uC0C1\uB41C HWPX\uC5D0\uC11C \uC139\uC158 \uB370\uC774\uD130\uB97C \uBCF5\uAD6C\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
433
+ const markdown = blocksToMarkdown(blocks);
434
+ return { markdown, blocks };
299
435
  }
300
436
  async function resolveSectionPaths(zip) {
301
437
  const manifestPaths = ["Contents/content.hpf", "content.hpf"];
@@ -567,7 +703,7 @@ var require2 = createRequire(import.meta.url);
567
703
  var CFB = require2("cfb");
568
704
  var MAX_SECTIONS = 100;
569
705
  var MAX_TOTAL_DECOMPRESS = 100 * 1024 * 1024;
570
- function parseHwp5Document(buffer) {
706
+ function parseHwp5Document(buffer, options) {
571
707
  const cfb = CFB.parse(buffer);
572
708
  const headerEntry = CFB.find(cfb, "/FileHeader");
573
709
  if (!headerEntry?.content) throw new KordocError("FileHeader \uC2A4\uD2B8\uB9BC \uC5C6\uC74C");
@@ -576,18 +712,59 @@ function parseHwp5Document(buffer) {
576
712
  if (header.flags & FLAG_ENCRYPTED) throw new KordocError("\uC554\uD638\uD654\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
577
713
  if (header.flags & FLAG_DRM) throw new KordocError("DRM \uBCF4\uD638\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
578
714
  const compressed = (header.flags & FLAG_COMPRESSED) !== 0;
715
+ const metadata = {
716
+ version: `${header.versionMajor}.x`
717
+ };
718
+ extractHwp5Metadata(cfb, metadata);
579
719
  const sections = findSections(cfb);
580
720
  if (sections.length === 0) throw new KordocError("\uC139\uC158 \uC2A4\uD2B8\uB9BC\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
721
+ metadata.pageCount = sections.length;
722
+ const pageFilter = options?.pages ? parsePageRange(options.pages, sections.length) : null;
581
723
  const blocks = [];
582
724
  let totalDecompressed = 0;
583
- for (const sectionData of sections) {
725
+ for (let si = 0; si < sections.length; si++) {
726
+ if (pageFilter && !pageFilter.has(si + 1)) continue;
727
+ const sectionData = sections[si];
584
728
  const data = compressed ? decompressStream(Buffer.from(sectionData)) : Buffer.from(sectionData);
585
729
  totalDecompressed += data.length;
586
730
  if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
587
731
  const records = readRecords(data);
588
732
  blocks.push(...parseSection(records));
589
733
  }
590
- return blocksToMarkdown(blocks);
734
+ const markdown = blocksToMarkdown(blocks);
735
+ return { markdown, blocks, metadata };
736
+ }
737
/**
 * Reads title, author and description from the document's summary stream
 * ("/HwpSummaryInformation", falling back to "/SummaryInformation") into
 * `metadata`.
 *
 * Only property ids 2, 4 and 6 with value type 30 are read; their bytes are
 * decoded as UTF-8 and trailing NULs are stripped. Every offset is
 * bounds-checked, and any failure is ignored so metadata never aborts parsing.
 *
 * @param {object} cfb       Parsed compound file passed to `CFB.find`.
 * @param {object} metadata  Object mutated in place.
 */
function extractHwp5Metadata(cfb, metadata) {
  const fieldByPropId = new Map([
    [2, "title"],
    [4, "author"],
    [6, "description"],
  ]);
  try {
    const summaryEntry = CFB.find(cfb, "/HwpSummaryInformation") || CFB.find(cfb, "/SummaryInformation");
    if (!summaryEntry?.content) return;

    const data = Buffer.from(summaryEntry.content);
    if (data.length < 48) return;
    if (data.readUInt32LE(24) === 0) return; // number of property sets

    const setOffset = data.readUInt32LE(44);
    if (setOffset >= data.length - 8) return;

    const propertyCount = data.readUInt32LE(setOffset + 4);
    if (propertyCount === 0 || propertyCount > 100) return;

    for (let index = 0; index < propertyCount; index++) {
      const entryOffset = setOffset + 8 + index * 8;
      if (entryOffset + 8 > data.length) break;

      const propId = data.readUInt32LE(entryOffset);
      const valueOffset = setOffset + data.readUInt32LE(entryOffset + 4);
      if (valueOffset + 8 > data.length) continue;

      const field = fieldByPropId.get(propId);
      if (field === undefined) continue;
      if (data.readUInt32LE(valueOffset) !== 30) continue;

      const byteLength = data.readUInt32LE(valueOffset + 4);
      const textStart = valueOffset + 8;
      if (byteLength === 0 || byteLength > 1e4 || textStart + byteLength > data.length) continue;

      const text = data
        .subarray(textStart, textStart + byteLength)
        .toString("utf8")
        .replace(/\0+$/, "")
        .trim();
      if (text) metadata[field] = text;
    }
  } catch {
    // Best effort: metadata is optional.
  }
}
592
769
  function findSections(cfb) {
593
770
  const sections = [];
@@ -749,7 +926,7 @@ import { getDocument, GlobalWorkerOptions } from "pdfjs-dist/legacy/build/pdf.mj
749
926
  GlobalWorkerOptions.workerSrc = "";
750
927
  var MAX_PAGES = 5e3;
751
928
  var MAX_TOTAL_TEXT = 100 * 1024 * 1024;
752
- async function parsePdfDocument(buffer) {
929
+ async function parsePdfDocument(buffer, options) {
753
930
  const doc = await getDocument({
754
931
  data: new Uint8Array(buffer),
755
932
  useSystemFonts: true,
@@ -758,12 +935,17 @@ async function parsePdfDocument(buffer) {
758
935
  }).promise;
759
936
  try {
760
937
  const pageCount = doc.numPages;
761
- if (pageCount === 0) return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4." };
938
+ if (pageCount === 0) return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.", blocks: [] };
939
+ const metadata = { pageCount };
940
+ await extractPdfMetadata(doc, metadata);
762
941
  const pageTexts = [];
942
+ const blocks = [];
763
943
  let totalChars = 0;
764
944
  let totalTextBytes = 0;
765
945
  const effectivePageCount = Math.min(pageCount, MAX_PAGES);
946
+ const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
766
947
  for (let i = 1; i <= effectivePageCount; i++) {
948
+ if (pageFilter && !pageFilter.has(i)) continue;
767
949
  const page = await doc.getPage(i);
768
950
  const tc = await page.getTextContent();
769
951
  const pageText = extractPageContent(tc.items);
@@ -771,18 +953,54 @@ async function parsePdfDocument(buffer) {
771
953
  totalTextBytes += pageText.length * 2;
772
954
  if (totalTextBytes > MAX_TOTAL_TEXT) throw new KordocError("\uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uD06C\uAE30 \uCD08\uACFC");
773
955
  pageTexts.push(pageText);
956
+ blocks.push({ type: "paragraph", text: pageText });
774
957
  }
775
- if (totalChars / effectivePageCount < 10) {
776
- return { success: false, fileType: "pdf", pageCount, isImageBased: true, error: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)` };
958
+ const parsedPageCount = pageFilter ? pageFilter.size : effectivePageCount;
959
+ if (totalChars / Math.max(parsedPageCount, 1) < 10) {
960
+ if (options?.ocr) {
961
+ try {
962
+ const { ocrPages: ocrPages2 } = await Promise.resolve().then(() => (init_provider(), provider_exports));
963
+ const ocrBlocks = await ocrPages2(doc, options.ocr, pageFilter, effectivePageCount);
964
+ if (ocrBlocks.length > 0) {
965
+ const ocrMarkdown = ocrBlocks.map((b) => b.text || "").filter(Boolean).join("\n\n");
966
+ return { success: true, fileType: "pdf", markdown: ocrMarkdown, pageCount: parsedPageCount, blocks: ocrBlocks, metadata, isImageBased: true };
967
+ }
968
+ } catch {
969
+ }
970
+ }
971
+ return { success: false, fileType: "pdf", pageCount, isImageBased: true, error: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`, code: "IMAGE_BASED_PDF" };
777
972
  }
778
973
  let markdown = pageTexts.filter((t) => t.trim()).join("\n\n");
779
974
  markdown = cleanPdfText(markdown);
780
- return { success: true, fileType: "pdf", markdown, pageCount: effectivePageCount };
975
+ return { success: true, fileType: "pdf", markdown, pageCount: parsedPageCount, blocks, metadata };
781
976
  } finally {
782
977
  await doc.destroy().catch(() => {
783
978
  });
784
979
  }
785
980
  }
981
/**
 * Copies selected entries of the document's info dictionary into `metadata`.
 *
 * Title, Author, Creator and Subject are copied (trimmed) when they are
 * non-blank strings. Keywords are split on commas and semicolons. The
 * creation and modification dates are converted with `parsePdfDate`. Any
 * failure is ignored.
 *
 * @param {object} doc       Document exposing `getMetadata()`.
 * @param {object} metadata  Object mutated in place.
 * @returns {Promise<void>}
 */
async function extractPdfMetadata(doc, metadata) {
  const trimmedString = (value) => (typeof value === "string" ? value.trim() : "");
  const textFields = [
    ["Title", "title"],
    ["Author", "author"],
    ["Creator", "creator"],
    ["Subject", "description"],
  ];
  try {
    const result = await doc.getMetadata();
    const info = result?.info;
    if (!info) return;

    for (const [source, target] of textFields) {
      const value = trimmedString(info[source]);
      if (value) metadata[target] = value;
    }
    if (trimmedString(info.Keywords)) {
      metadata.keywords = info.Keywords.split(/[,;]/).map((k) => k.trim()).filter(Boolean);
    }
    if (typeof info.CreationDate === "string") metadata.createdAt = parsePdfDate(info.CreationDate);
    if (typeof info.ModDate === "string") metadata.modifiedAt = parsePdfDate(info.ModDate);
  } catch {
    // Best effort: metadata is optional.
  }
}
998
/**
 * Converts a PDF date string ("D:YYYYMMDDHHmmSS...") into
 * "YYYY-MM-DDTHH:mm:ss". Missing components default to "01" (month, day) or
 * "00" (time). Anything after the seconds is ignored.
 *
 * @param {string} dateStr  Raw PDF date string.
 * @returns {string|undefined} Formatted date, or undefined when no "D:YYYY"
 *                             prefix is found.
 */
function parsePdfDate(dateStr) {
  const parts = dateStr.match(/D:(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?/);
  if (!parts) return void 0;

  const component = (index, fallback) => (parts[index] === undefined ? fallback : parts[index]);
  const date = [parts[1], component(2, "01"), component(3, "01")].join("-");
  const time = [component(4, "00"), component(5, "00"), component(6, "00")].join(":");
  return `${date}T${time}`;
}
786
1004
  function extractPageContent(rawItems) {
787
1005
  const items = normalizeItems(rawItems);
788
1006
  if (items.length === 0) return "";
@@ -1055,52 +1273,446 @@ function mergeKoreanLines(text) {
1055
1273
  return result.join("\n");
1056
1274
  }
1057
1275
 
1276
+ // src/diff/text-diff.ts
1277
/**
 * Similarity of two strings in [0, 1], derived from their edit distance
 * relative to the longer string.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} 1 for identical values, 0 when exactly one side is
 *                   empty or missing, otherwise `1 - distance / maxLength`.
 */
function similarity(a, b) {
  if (a === b) return 1;
  if (!a || !b) return 0;
  const longest = Math.max(a.length, b.length);
  return longest === 0 ? 1 : 1 - levenshtein(a, b) / longest;
}
1284
/**
 * Similarity of two strings after whitespace normalisation.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} Result of `similarity` on the normalised inputs.
 */
function normalizedSimilarity(a, b) {
  const left = normalize(a);
  const right = normalize(b);
  return similarity(left, right);
}

/**
 * Collapses every run of whitespace to a single space and removes leading
 * and trailing whitespace.
 *
 * @param {string} s
 * @returns {string}
 */
function normalize(s) {
  return s.split(/\s+/).filter(Boolean).join(" ");
}
1290
/**
 * Levenshtein edit distance between two strings, computed with two rolling
 * rows sized by the shorter string. Comparison is per UTF-16 code unit.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} Minimum number of single-unit insertions, deletions and
 *                   substitutions that turn one string into the other.
 */
function levenshtein(a, b) {
  // The rows are indexed by the shorter string to keep them small.
  const [shorter, longer] = a.length > b.length ? [b, a] : [a, b];
  const width = shorter.length;

  let previous = Array.from({ length: width + 1 }, (_, k) => k);
  let current = new Array(width + 1);

  for (let row = 1; row <= longer.length; row++) {
    current[0] = row;
    for (let col = 1; col <= width; col++) {
      current[col] = shorter[col - 1] === longer[row - 1]
        ? previous[col - 1]
        : 1 + Math.min(previous[col - 1], previous[col], current[col - 1]);
    }
    [previous, current] = [current, previous];
  }
  return previous[width];
}
1310
+
1311
+ // src/diff/compare.ts
1312
// Minimum block similarity for two blocks to be aligned as a pair.
var SIMILARITY_THRESHOLD = 0.4;

/**
 * Parses two documents with the same options and diffs their blocks.
 *
 * @param {ArrayBuffer} bufferA  First document.
 * @param {ArrayBuffer} bufferB  Second document.
 * @param {object} [options]     Options forwarded to `parse` for both inputs.
 * @returns {Promise<object>} Result of `diffBlocks` on the parsed blocks.
 * @throws {Error} When either document fails to parse.
 */
async function compare(bufferA, bufferB, options) {
  const pending = [bufferA, bufferB].map((buffer) => parse(buffer, options));
  const [resultA, resultB] = await Promise.all(pending);

  if (!resultA.success) {
    throw new Error(`\uBB38\uC11CA \uD30C\uC2F1 \uC2E4\uD328: ${resultA.error}`);
  }
  if (!resultB.success) {
    throw new Error(`\uBB38\uC11CB \uD30C\uC2F1 \uC2E4\uD328: ${resultB.error}`);
  }
  return diffBlocks(resultA.blocks, resultB.blocks);
}
1322
/**
 * Diffs two block lists.
 *
 * Blocks are first paired by `alignBlocks`. A pair with similarity >= 0.99
 * is reported as "unchanged" (with similarity 1), any other pair as
 * "modified". A block with no partner is "removed" (left side) or "added"
 * (right side). Modified table pairs also carry per-cell diffs.
 *
 * @param {Array<object>} blocksA  Blocks of the first document.
 * @param {Array<object>} blocksB  Blocks of the second document.
 * @returns {{stats: {added: number, removed: number, modified: number,
 *   unchanged: number}, diffs: Array<object>}}
 */
function diffBlocks(blocksA, blocksB) {
  const stats = { added: 0, removed: 0, modified: 0, unchanged: 0 };
  const diffs = [];
  const record = (entry) => {
    diffs.push(entry);
    stats[entry.type] += 1;
  };

  for (const [before, after] of alignBlocks(blocksA, blocksB)) {
    if (!before && !after) continue;
    if (!after) {
      record({ type: "removed", before });
      continue;
    }
    if (!before) {
      record({ type: "added", after });
      continue;
    }

    const score = blockSimilarity(before, after);
    if (score >= 0.99) {
      record({ type: "unchanged", before, after, similarity: 1 });
      continue;
    }

    const entry = { type: "modified", before, after, similarity: score };
    const bothTables = before.type === "table" && after.type === "table";
    if (bothTables && before.table && after.table) {
      entry.cellDiffs = diffTableCells(before.table, after.table);
    }
    record(entry);
  }
  return { stats, diffs };
}
1350
/**
 * Aligns two block lists with a longest-common-subsequence pass in which two
 * blocks "match" when their similarity reaches SIMILARITY_THRESHOLD.
 *
 * The result keeps both lists in order. Each entry is `[blockA, blockB]`
 * for a matched pair, `[blockA, null]` for an unmatched left block, or
 * `[null, blockB]` for an unmatched right block. Inputs whose size product
 * exceeds 1e7 are aligned positionally by `fallbackAlign` instead.
 *
 * @param {Array<object>} a  Blocks of the first document.
 * @param {Array<object>} b  Blocks of the second document.
 * @returns {Array<[object|null, object|null]>} Aligned pairs.
 */
function alignBlocks(a, b) {
  const m = a.length;
  const n = b.length;
  if (m * n > 1e7) return fallbackAlign(a, b);

  // Each pair is scored exactly once. The outcome is kept in a flat byte
  // matrix so the traceback can reuse it without a string-keyed Map of up
  // to m*n entries.
  const matched = new Uint8Array(m * n);
  const isMatch = (i, j) => matched[i * n + j] === 1;

  const dp = Array.from({ length: m + 1 }, () => new Uint32Array(n + 1));
  for (let i = 1; i <= m; i++) {
    for (let j = 1; j <= n; j++) {
      if (blockSimilarity(a[i - 1], b[j - 1]) >= SIMILARITY_THRESHOLD) {
        matched[(i - 1) * n + (j - 1)] = 1;
        dp[i][j] = dp[i - 1][j - 1] + 1;
      } else {
        dp[i][j] = Math.max(dp[i - 1][j], dp[i][j - 1]);
      }
    }
  }

  // Walk back from the bottom-right corner to recover the matched index pairs.
  const pairs = [];
  let i = m;
  let j = n;
  while (i > 0 && j > 0) {
    if (isMatch(i - 1, j - 1) && dp[i][j] === dp[i - 1][j - 1] + 1) {
      pairs.push([i - 1, j - 1]);
      i--;
      j--;
    } else if (dp[i - 1][j] >= dp[i][j - 1]) {
      i--;
    } else {
      j--;
    }
  }
  pairs.reverse();

  // Emit unmatched blocks that precede each pair (left side first), then the pair.
  const result = [];
  let ai = 0;
  let bi = 0;
  for (const [pi, pj] of pairs) {
    while (ai < pi) result.push([a[ai++], null]);
    while (bi < pj) result.push([null, b[bi++]]);
    result.push([a[ai++], b[bi++]]);
  }
  while (ai < m) result.push([a[ai++], null]);
  while (bi < n) result.push([null, b[bi++]]);
  return result;
}
1398
/**
 * Pairs blocks purely by position. The shorter list is padded with null;
 * falsy entries are also reported as null.
 *
 * @param {Array<object>} a
 * @param {Array<object>} b
 * @returns {Array<[object|null, object|null]>}
 */
function fallbackAlign(a, b) {
  const length = Math.max(a.length, b.length);
  return Array.from({ length }, (_, index) => [a[index] || null, b[index] || null]);
}
1406
/**
 * Similarity of two blocks. Blocks of different types, and block types other
 * than "paragraph" and "table", score 0.
 *
 * @param {object} a
 * @param {object} b
 * @returns {number} Paragraphs: normalised text similarity. Tables (when
 *                   both carry a `table`): table similarity. Otherwise 0.
 */
function blockSimilarity(a, b) {
  if (a.type !== b.type) return 0;
  switch (a.type) {
    case "paragraph":
      return normalizedSimilarity(a.text || "", b.text || "");
    case "table":
      return a.table && b.table ? tableSimilarity(a.table, b.table) : 0;
    default:
      return 0;
  }
}
1416
/**
 * Similarity of two tables: 30% from how close their cell counts
 * (rows x cols) are, 70% from the normalised similarity of all cell texts
 * joined with spaces.
 *
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} a
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} b
 * @returns {number}
 */
function tableSimilarity(a, b) {
  const sizeA = a.rows * a.cols;
  const sizeB = b.rows * b.cols;
  const dimSim = 1 - Math.abs(sizeA - sizeB) / Math.max(sizeA, sizeB, 1);

  const joinedText = (table) => table.cells.flat().map((cell) => cell.text).join(" ");
  const contentSim = normalizedSimilarity(joinedText(a), joinedText(b));

  return dimSim * 0.3 + contentSim * 0.7;
}
1423
/**
 * Compares two tables cell by cell over the union of their dimensions.
 *
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} a
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} b
 * @returns {Array<Array<{type: string, before: (string|undefined),
 *   after: (string|undefined)}>>} One entry per cell position. `type` is
 *   "added" when the left cell is missing, "removed" when only the right
 *   cell is missing, otherwise "unchanged" or "modified".
 */
function diffTableCells(a, b) {
  const textAt = (table, r, c) => (r < table.rows && c < table.cols ? table.cells[r][c].text : void 0);
  const classify = (before, after) => {
    if (before === void 0) return "added";
    if (after === void 0) return "removed";
    return before === after ? "unchanged" : "modified";
  };

  const rowCount = Math.max(a.rows, b.rows);
  const colCount = Math.max(a.cols, b.cols);
  return Array.from({ length: rowCount }, (_, r) =>
    Array.from({ length: colCount }, (_unused, c) => {
      const before = textAt(a, r, c);
      const after = textAt(b, r, c);
      return { type: classify(before, after), before, after };
    }),
  );
}
1443
+
1444
+ // src/form/recognize.ts
1445
// Korean form-label keywords. A cell whose text contains any of these is
// treated as a label by `isLabelCell`.
var LABEL_KEYWORDS = /* @__PURE__ */ new Set([
  "\uC131\uBA85", "\uC774\uB984", "\uC8FC\uC18C",
  "\uC804\uD654", "\uC804\uD654\uBC88\uD638", "\uD734\uB300\uD3F0", "\uD578\uB4DC\uD3F0", "\uC5F0\uB77D\uCC98",
  "\uC0DD\uB144\uC6D4\uC77C", "\uC8FC\uBBFC\uB4F1\uB85D\uBC88\uD638",
  "\uC18C\uC18D", "\uC9C1\uC704", "\uC9C1\uAE09", "\uBD80\uC11C",
  "\uC774\uBA54\uC77C", "\uD329\uC2A4",
  "\uD559\uAD50", "\uD559\uB144", "\uBC18", "\uBC88\uD638",
  "\uC2E0\uCCAD\uC778", "\uB300\uD45C\uC790", "\uB2F4\uB2F9\uC790", "\uC791\uC131\uC790", "\uD655\uC778\uC790", "\uC2B9\uC778\uC790",
  "\uC77C\uC2DC", "\uB0A0\uC9DC", "\uAE30\uAC04", "\uC7A5\uC18C",
  "\uBAA9\uC801", "\uC0AC\uC720", "\uBE44\uACE0",
  "\uAE08\uC561", "\uC218\uB7C9", "\uB2E8\uAC00", "\uD569\uACC4", "\uACC4", "\uC18C\uACC4",
]);
1486
/**
 * Heuristically decides whether a table cell holds a form label.
 *
 * A cell is a label when its trimmed text is 1-30 characters long and any of
 * these holds:
 *  - it contains one of LABEL_KEYWORDS,
 *  - it is 2-8 characters of Hangul, whitespace, parentheses, middle dot or
 *    colon, with no digits, or
 *  - it is Hangul/Latin text ending in a colon (ASCII ":" or full-width
 *    U+FF1A).
 *
 * @param {string} text  Cell text.
 * @returns {boolean}
 */
function isLabelCell(text) {
  const trimmed = text.trim();
  if (!trimmed || trimmed.length > 30) return false;

  for (const kw of LABEL_KEYWORDS) {
    if (trimmed.includes(kw)) return true;
  }
  if (/^[가-힣\s()·:]{2,8}$/.test(trimmed) && !/\d/.test(trimmed)) return true;
  // The class previously listed ":" twice; the second entry is now the
  // full-width colon so labels such as "Name\uFF1A" are recognised.
  if (/^[가-힣A-Za-z\s]+[:\uFF1A]$/.test(trimmed)) return true;
  return false;
}
1496
/**
 * Collects label/value fields from a block list.
 *
 * Table fields come first (in block order), followed by inline
 * "label: value" fields from paragraphs. Confidence is the share of tables
 * that produced at least one field. Without tables it is 0.3 when any inline
 * field was found and 0 otherwise.
 *
 * @param {Array<object>} blocks  Parsed document blocks.
 * @returns {{fields: Array<object>, confidence: number}}
 */
function extractFormFields(blocks) {
  const perTable = blocks
    .filter((block) => block.type === "table" && block.table)
    .map((block) => extractFromTable(block.table));
  const productiveTables = perTable.filter((tableFields) => tableFields.length > 0);

  const inlineFields = blocks
    .filter((block) => block.type === "paragraph" && block.text)
    .flatMap((block) => extractInlineFields(block.text));

  const fields = [...productiveTables.flat(), ...inlineFields];

  let confidence;
  if (perTable.length > 0) {
    confidence = productiveTables.length / perTable.length;
  } else {
    confidence = fields.length > 0 ? 0.3 : 0;
  }
  return { fields, confidence: Math.min(confidence, 1) };
}
1518
/**
 * Extracts label/value fields from one table.
 *
 * Pass 1 (tables with at least 2 columns): every cell recognised by
 * `isLabelCell` whose right-hand neighbour is non-blank yields a field. A
 * trailing colon (ASCII ":" or full-width U+FF1A) is removed from the label.
 *
 * Pass 2 (only when pass 1 found nothing, at least 2 rows and 2 columns):
 * when every first-row cell is 1-20 characters long, the first row is
 * treated as a header and every non-blank body cell yields a field labelled
 * with its column header.
 *
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} table
 * @returns {Array<{label: string, value: string, row: number, col: number}>}
 */
function extractFromTable(table) {
  const fields = [];

  if (table.cols >= 2) {
    for (let r = 0; r < table.rows; r++) {
      for (let c = 0; c < table.cols - 1; c++) {
        const labelCell = table.cells[r][c];
        const valueCell = table.cells[r][c + 1];
        if (isLabelCell(labelCell.text) && valueCell.text.trim()) {
          fields.push({
            // The class previously listed ":" twice; the second entry is
            // now the full-width colon.
            label: labelCell.text.trim().replace(/[:\uFF1A]\s*$/, ""),
            value: valueCell.text.trim(),
            row: r,
            col: c,
          });
        }
      }
    }
  }

  if (fields.length === 0 && table.rows >= 2 && table.cols >= 2) {
    const headerRow = table.cells[0];
    const allLabels = headerRow.every((cell) => {
      const t = cell.text.trim();
      return t.length > 0 && t.length <= 20;
    });
    if (allLabels) {
      for (let r = 1; r < table.rows; r++) {
        for (let c = 0; c < table.cols; c++) {
          const label = headerRow[c].text.trim();
          const value = table.cells[r][c].text.trim();
          if (label && value) {
            fields.push({ label, value, row: r, col: c });
          }
        }
      }
    }
  }
  return fields;
}
1556
/**
 * Extracts inline "label: value" fields from paragraph text.
 *
 * A label is 2-10 Hangul or Latin letters followed by a colon (ASCII ":" or
 * full-width U+FF1A). The value runs up to 100 characters and stops at a
 * newline, comma or semicolon. Fields with a blank value are skipped.
 *
 * @param {string} text  Paragraph text.
 * @returns {Array<{label: string, value: string, row: number, col: number}>}
 *   Fields with `row` and `col` set to -1.
 */
function extractInlineFields(text) {
  const fields = [];
  // The separator class previously listed ":" twice; the second entry is now
  // the full-width colon. The regex is created per call, so its `lastIndex`
  // state is never shared.
  const pattern = /([가-힣A-Za-z]{2,10})\s*[:\uFF1A]\s*([^\n,;]{1,100})/g;
  for (const match of text.matchAll(pattern)) {
    const label = match[1].trim();
    const value = match[2].trim();
    if (value) {
      fields.push({ label, value, row: -1, col: -1 });
    }
  }
  return fields;
}
1569
+
1570
+ // src/hwpx/generator.ts
1571
+ import JSZip2 from "jszip";
1572
// XML namespace URI written on the generated section root element.
var HWPML_NS = "http://www.hancom.co.kr/hwpml/2016/HwpMl";

/**
 * Builds an HWPX archive from Markdown text.
 *
 * The archive holds three entries, added in this order: an uncompressed
 * "mimetype", the manifest "Contents/content.hpf", and the single section
 * "Contents/section0.xml".
 *
 * @param {string} markdown  Markdown source.
 * @returns {Promise<ArrayBuffer>} The generated archive.
 */
async function markdownToHwpx(markdown) {
  const sectionXml = blocksToSectionXml(parseMarkdownToBlocks(markdown));

  const zip = new JSZip2();
  const entries = [
    ["mimetype", "application/hwp+zip", { compression: "STORE" }],
    ["Contents/content.hpf", generateManifest()],
    ["Contents/section0.xml", sectionXml],
  ];
  for (const [path, content, entryOptions] of entries) {
    if (entryOptions) {
      zip.file(path, content, entryOptions);
    } else {
      zip.file(path, content);
    }
  }
  return await zip.generateAsync({ type: "arraybuffer" });
}
1582
/**
 * Splits Markdown text into simple blocks.
 *
 *  - `# ...` to `###### ...` lines become `{ type: "heading", text, level }`.
 *  - A run of consecutive lines starting with "|" becomes
 *    `{ type: "table", rows }`, where `rows` is an array of trimmed cell
 *    arrays. Delimiter rows such as `|---|:---:|` are skipped.
 *  - Every other non-blank line becomes `{ type: "paragraph", text }`.
 *
 * Both LF and CRLF line endings are accepted. Table rows may be indented
 * and may omit the trailing pipe.
 *
 * @param {string} md  Markdown source.
 * @returns {Array<object>} Blocks in document order.
 */
function parseMarkdownToBlocks(md) {
  // Split on CRLF as well: a stray "\r" would stop `(.+)$` from matching
  // and silently turn headings into paragraphs.
  const lines = md.split(/\r?\n/);
  const blocks = [];

  const isTableLine = (line) => line.trimStart().startsWith("|");

  // Expects a trimmed row: it contains "---" and nothing but pipes, dashes,
  // colons and whitespace.
  const isDelimiterRow = (row) =>
    row.includes("---") && /^\|[\s\-:]+\|/.test(row) && /^[\s|:\-]+$/.test(row);

  // Expects a trimmed row starting with "|". The trailing pipe is optional,
  // so the last cell is kept when it is missing.
  const splitRow = (row) => {
    if (row === "|") return [];
    const inner = row.endsWith("|") ? row.slice(1, -1) : row.slice(1);
    return inner.split("|").map((cell) => cell.trim());
  };

  let i = 0;
  while (i < lines.length) {
    const line = lines[i];
    if (!line.trim()) {
      i++;
      continue;
    }

    const headingMatch = line.match(/^(#{1,6})\s+(.+)$/);
    if (headingMatch) {
      blocks.push({ type: "heading", text: headingMatch[2].trim(), level: headingMatch[1].length });
      i++;
      continue;
    }

    if (isTableLine(line)) {
      const tableRows = [];
      while (i < lines.length && isTableLine(lines[i])) {
        const row = lines[i].trim();
        i++;
        if (isDelimiterRow(row)) continue;
        const cells = splitRow(row);
        if (cells.length > 0) tableRows.push(cells);
      }
      if (tableRows.length > 0) {
        blocks.push({ type: "table", rows: tableRows });
      }
      continue;
    }

    blocks.push({ type: "paragraph", text: line.trim() });
    i++;
  }
  return blocks;
}
1620
/**
 * Makes text safe to embed in XML character data or a double-quoted
 * attribute value.
 *
 * Characters that XML 1.0 does not allow at all (C0 controls other than tab,
 * LF and CR, plus U+FFFE and U+FFFF) are removed, because escaping cannot
 * make them legal and they would leave the generated document malformed.
 * Then `&`, `<`, `>` and `"` are replaced by their entities.
 *
 * @param {string} text  Raw text.
 * @returns {string} Escaped text.
 */
function escapeXml(text) {
  return text
    .replace(/[\u0000-\u0008\u000B\u000C\u000E-\u001F\uFFFE\uFFFF]/g, "")
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;");
}
1623
/**
 * Wraps text in a paragraph element with a single run.
 *
 * @param {string} text  Raw text (escaped here).
 * @returns {string} `<hp:p>` markup.
 */
function generateParagraph(text) {
  const escaped = escapeXml(text);
  return ["<hp:p><hp:run><hp:t>", escaped, "</hp:t></hp:run></hp:p>"].join("");
}
1626
/**
 * Renders rows of cell texts as table markup. Every cell spans one row and
 * one column and contains a single paragraph.
 *
 * @param {string[][]} rows  Cell texts, row by row.
 * @returns {string} `<hp:tbl>` markup.
 */
function generateTable(rows) {
  const renderCell = (cell) =>
    `<hp:tc><hp:cellSpan colSpan="1" rowSpan="1"/>${generateParagraph(cell)}</hp:tc>`;
  const renderRow = (row) => `<hp:tr>${row.map((cell) => renderCell(cell)).join("")}</hp:tr>`;

  let markup = "";
  for (const row of rows) {
    markup += renderRow(row);
  }
  return `<hp:tbl>${markup}</hp:tbl>`;
}
1635
/**
 * Serialises blocks into the section XML document.
 *
 * Headings and paragraphs are both written as plain paragraphs (the heading
 * level is not emitted). Tables are written when they carry `rows`. Any
 * other block contributes an empty string.
 *
 * @param {Array<object>} blocks  Blocks from `parseMarkdownToBlocks`.
 * @returns {string} XML document with an `<hs:sec>` root.
 */
function blocksToSectionXml(blocks) {
  const renderBlock = (block) => {
    if (block.type === "heading" || block.type === "paragraph") {
      return generateParagraph(block.text || "");
    }
    if (block.type === "table" && block.rows) {
      return generateTable(block.rows);
    }
    return "";
  };

  const body = blocks.map((block) => renderBlock(block)).join("\n ");
  return [
    `<?xml version="1.0" encoding="UTF-8"?>`,
    `<hs:sec xmlns:hs="${HWPML_NS}" xmlns:hp="${HWPML_NS}">`,
    `${body}`,
    `</hs:sec>`,
  ].join("\n");
}
1652
+ }
1653
/**
 * Returns the fixed package manifest. It declares the single section file
 * "section0.xml" under id "s0" and lists it in the spine.
 *
 * @returns {string} Manifest XML.
 */
function generateManifest() {
  const manifestLines = [
    `<?xml version="1.0" encoding="UTF-8"?>`,
    `<opf:package xmlns:opf="http://www.idpf.org/2007/opf">`,
    `<opf:manifest>`,
    `<opf:item id="s0" href="section0.xml" media-type="application/xml"/>`,
    `</opf:manifest>`,
    `<opf:spine>`,
    `<opf:itemref idref="s0"/>`,
    `</opf:spine>`,
    `</opf:package>`,
  ];
  return manifestLines.join("\n");
}
1664
+
1058
1665
  // src/index.ts
1059
- async function parse(buffer) {
1666
+ async function parse(buffer, options) {
1060
1667
  if (!buffer || buffer.byteLength === 0) {
1061
- return { success: false, fileType: "unknown", error: "\uBE48 \uBC84\uD37C\uC774\uAC70\uB098 \uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uC785\uB825\uC785\uB2C8\uB2E4." };
1668
+ return { success: false, fileType: "unknown", error: "\uBE48 \uBC84\uD37C\uC774\uAC70\uB098 \uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uC785\uB825\uC785\uB2C8\uB2E4.", code: "EMPTY_INPUT" };
1062
1669
  }
1063
1670
  const format = detectFormat(buffer);
1064
1671
  switch (format) {
1065
1672
  case "hwpx":
1066
- return parseHwpx(buffer);
1673
+ return parseHwpx(buffer, options);
1067
1674
  case "hwp":
1068
- return parseHwp(buffer);
1675
+ return parseHwp(buffer, options);
1069
1676
  case "pdf":
1070
- return parsePdf(buffer);
1677
+ return parsePdf(buffer, options);
1071
1678
  default:
1072
- return { success: false, fileType: "unknown", error: "\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD30C\uC77C \uD615\uC2DD\uC785\uB2C8\uB2E4." };
1679
+ return { success: false, fileType: "unknown", error: "\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD30C\uC77C \uD615\uC2DD\uC785\uB2C8\uB2E4.", code: "UNSUPPORTED_FORMAT" };
1073
1680
  }
1074
1681
  }
1075
- async function parseHwpx(buffer) {
1682
+ async function parseHwpx(buffer, options) {
1076
1683
  try {
1077
- const markdown = await parseHwpxDocument(buffer);
1078
- return { success: true, fileType: "hwpx", markdown };
1684
+ const { markdown, blocks, metadata } = await parseHwpxDocument(buffer, options);
1685
+ return { success: true, fileType: "hwpx", markdown, blocks, metadata };
1079
1686
  } catch (err) {
1080
- return { success: false, fileType: "hwpx", error: err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328" };
1687
+ return { success: false, fileType: "hwpx", error: err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
1081
1688
  }
1082
1689
  }
1083
- async function parseHwp(buffer) {
1690
+ async function parseHwp(buffer, options) {
1084
1691
  try {
1085
- const markdown = parseHwp5Document(Buffer.from(buffer));
1086
- return { success: true, fileType: "hwp", markdown };
1692
+ const { markdown, blocks, metadata } = parseHwp5Document(Buffer.from(buffer), options);
1693
+ return { success: true, fileType: "hwp", markdown, blocks, metadata };
1087
1694
  } catch (err) {
1088
- return { success: false, fileType: "hwp", error: err instanceof Error ? err.message : "HWP \uD30C\uC2F1 \uC2E4\uD328" };
1695
+ return { success: false, fileType: "hwp", error: err instanceof Error ? err.message : "HWP \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
1089
1696
  }
1090
1697
  }
1091
- async function parsePdf(buffer) {
1698
+ async function parsePdf(buffer, options) {
1092
1699
  try {
1093
- return await parsePdfDocument(buffer);
1700
+ return await parsePdfDocument(buffer, options);
1094
1701
  } catch (err) {
1095
- return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328" };
1702
+ return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
1096
1703
  }
1097
1704
  }
1098
1705
  export {
1099
1706
  VERSION,
1707
+ blocksToMarkdown,
1708
+ compare,
1100
1709
  detectFormat,
1710
+ diffBlocks,
1711
+ extractFormFields,
1101
1712
  isHwpxFile,
1102
1713
  isOldHwpFile,
1103
1714
  isPdfFile,
1715
+ markdownToHwpx,
1104
1716
  parse,
1105
1717
  parseHwp,
1106
1718
  parseHwpx,