kordoc 1.2.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,3 +1,55 @@
1
+ var __defProp = Object.defineProperty;
2
+ var __getOwnPropNames = Object.getOwnPropertyNames;
3
+ var __esm = (fn, res) => function __init() {
4
+ return fn && (res = (0, fn[__getOwnPropNames(fn)[0]])(fn = 0)), res;
5
+ };
6
+ var __export = (target, all) => {
7
+ for (var name in all)
8
+ __defProp(target, name, { get: all[name], enumerable: true });
9
+ };
10
+
11
+ // src/ocr/provider.ts
12
+ var provider_exports = {};
13
+ __export(provider_exports, {
14
+ ocrPages: () => ocrPages
15
+ });
16
+ async function ocrPages(doc, provider, pageFilter, effectivePageCount) {
17
+ const blocks = [];
18
+ for (let i = 1; i <= effectivePageCount; i++) {
19
+ if (pageFilter && !pageFilter.has(i)) continue;
20
+ const page = await doc.getPage(i);
21
+ try {
22
+ const imageData = await renderPageToPng(page);
23
+ const text = await provider(imageData, i, "image/png");
24
+ if (text.trim()) {
25
+ blocks.push({ type: "paragraph", text: text.trim() });
26
+ }
27
+ } catch {
28
+ }
29
+ }
30
+ return blocks;
31
+ }
32
+ async function renderPageToPng(page) {
33
+ let createCanvas;
34
+ try {
35
+ const canvasModule = await import("canvas");
36
+ createCanvas = canvasModule.createCanvas;
37
+ } catch {
38
+ throw new Error("OCR\uC744 \uC0AC\uC6A9\uD558\uB824\uBA74 'canvas' \uD328\uD0A4\uC9C0\uB97C \uC124\uCE58\uD558\uC138\uC694: npm install canvas");
39
+ }
40
+ const scale = 2;
41
+ const viewport = page.getViewport({ scale });
42
+ const canvas = createCanvas(Math.floor(viewport.width), Math.floor(viewport.height));
43
+ const ctx = canvas.getContext("2d");
44
+ await page.render({ canvasContext: ctx, viewport }).promise;
45
+ return new Uint8Array(canvas.toBuffer("image/png"));
46
+ }
47
+ var init_provider = __esm({
48
+ "src/ocr/provider.ts"() {
49
+ "use strict";
50
+ }
51
+ });
52
+
1
53
  // src/detect.ts
2
54
  function magicBytes(buffer) {
3
55
  return new Uint8Array(buffer, 0, Math.min(4, buffer.byteLength));
@@ -157,7 +209,7 @@ function tableToMarkdown(table) {
157
209
  }
158
210
 
159
211
  // src/utils.ts
160
- var VERSION = true ? "1.2.0" : "0.0.0-dev";
212
+ var VERSION = true ? "1.4.0" : "0.0.0-dev";
161
213
  var KordocError = class extends Error {
162
214
  constructor(message) {
163
215
  super(message);
@@ -168,6 +220,47 @@ function isPathTraversal(name) {
168
220
  const normalized = name.replace(/\\/g, "/");
169
221
  return normalized.includes("..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
170
222
  }
223
+ function classifyError(err) {
224
+ if (!(err instanceof Error)) return "PARSE_ERROR";
225
+ const msg = err.message;
226
+ if (msg.includes("\uC554\uD638\uD654")) return "ENCRYPTED";
227
+ if (msg.includes("DRM")) return "DRM_PROTECTED";
228
+ if (msg.includes("ZIP bomb") || msg.includes("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || msg.includes("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")) return "ZIP_BOMB";
229
+ if (msg.includes("bomb") || msg.includes("\uD06C\uAE30 \uCD08\uACFC") || msg.includes("\uC555\uCD95 \uD574\uC81C")) return "DECOMPRESSION_BOMB";
230
+ if (msg.includes("\uC774\uBBF8\uC9C0 \uAE30\uBC18")) return "IMAGE_BASED_PDF";
231
+ if (msg.includes("\uC139\uC158") && (msg.includes("\uCC3E\uC744 \uC218 \uC5C6") || msg.includes("\uC5C6\uC74C"))) return "NO_SECTIONS";
232
+ if (msg.includes("\uC2DC\uADF8\uB2C8\uCC98") || msg.includes("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")) return "CORRUPTED";
233
+ return "PARSE_ERROR";
234
+ }
235
+
236
+ // src/page-range.ts
237
+ function parsePageRange(spec, maxPages) {
238
+ const result = /* @__PURE__ */ new Set();
239
+ if (maxPages <= 0) return result;
240
+ if (Array.isArray(spec)) {
241
+ for (const n of spec) {
242
+ const page = Math.round(n);
243
+ if (page >= 1 && page <= maxPages) result.add(page);
244
+ }
245
+ return result;
246
+ }
247
+ if (typeof spec !== "string" || spec.trim() === "") return result;
248
+ const parts = spec.split(",");
249
+ for (const part of parts) {
250
+ const trimmed = part.trim();
251
+ if (!trimmed) continue;
252
+ const rangeMatch = trimmed.match(/^(\d+)\s*-\s*(\d+)$/);
253
+ if (rangeMatch) {
254
+ const start = Math.max(1, parseInt(rangeMatch[1], 10));
255
+ const end = Math.min(maxPages, parseInt(rangeMatch[2], 10));
256
+ for (let i = start; i <= end; i++) result.add(i);
257
+ } else {
258
+ const page = parseInt(trimmed, 10);
259
+ if (!isNaN(page) && page >= 1 && page <= maxPages) result.add(page);
260
+ }
261
+ }
262
+ return result;
263
+ }
171
264
 
172
265
  // src/hwpx/parser.ts
173
266
  var MAX_DECOMPRESS_SIZE = 100 * 1024 * 1024;
@@ -178,7 +271,7 @@ function clampSpan(val, max) {
178
271
  function stripDtd(xml) {
179
272
  return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
180
273
  }
181
- async function parseHwpxDocument(buffer) {
274
+ async function parseHwpxDocument(buffer, options) {
182
275
  const precheck = precheckZipSize(buffer);
183
276
  if (precheck.totalUncompressed > MAX_DECOMPRESS_SIZE) {
184
277
  throw new KordocError("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
@@ -196,19 +289,62 @@ async function parseHwpxDocument(buffer) {
196
289
  if (actualEntryCount > MAX_ZIP_ENTRIES) {
197
290
  throw new KordocError("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
198
291
  }
292
+ const metadata = {};
293
+ await extractHwpxMetadata(zip, metadata);
199
294
  const sectionPaths = await resolveSectionPaths(zip);
200
295
  if (sectionPaths.length === 0) throw new KordocError("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
296
+ metadata.pageCount = sectionPaths.length;
297
+ const pageFilter = options?.pages ? parsePageRange(options.pages, sectionPaths.length) : null;
201
298
  let totalDecompressed = 0;
202
299
  const blocks = [];
203
- for (const path of sectionPaths) {
204
- const file = zip.file(path);
300
+ for (let si = 0; si < sectionPaths.length; si++) {
301
+ if (pageFilter && !pageFilter.has(si + 1)) continue;
302
+ const file = zip.file(sectionPaths[si]);
205
303
  if (!file) continue;
206
304
  const xml = await file.async("text");
207
305
  totalDecompressed += xml.length * 2;
208
306
  if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
209
307
  blocks.push(...parseSectionXml(xml));
210
308
  }
211
- return blocksToMarkdown(blocks);
309
+ const markdown = blocksToMarkdown(blocks);
310
+ return { markdown, blocks, metadata };
311
+ }
312
+ async function extractHwpxMetadata(zip, metadata) {
313
+ try {
314
+ const metaPaths = ["meta.xml", "META-INF/meta.xml", "docProps/core.xml"];
315
+ for (const mp of metaPaths) {
316
+ const file = zip.file(mp) || Object.values(zip.files).find((f) => f.name.toLowerCase() === mp.toLowerCase()) || null;
317
+ if (!file) continue;
318
+ const xml = await file.async("text");
319
+ parseDublinCoreMetadata(xml, metadata);
320
+ if (metadata.title || metadata.author) return;
321
+ }
322
+ } catch {
323
+ }
324
+ }
325
+ function parseDublinCoreMetadata(xml, metadata) {
326
+ const parser = new DOMParser();
327
+ const doc = parser.parseFromString(stripDtd(xml), "text/xml");
328
+ if (!doc.documentElement) return;
329
+ const getText = (tagNames) => {
330
+ for (const tag of tagNames) {
331
+ const els = doc.getElementsByTagName(tag);
332
+ if (els.length > 0) {
333
+ const text = els[0].textContent?.trim();
334
+ if (text) return text;
335
+ }
336
+ }
337
+ return void 0;
338
+ };
339
+ metadata.title = metadata.title || getText(["dc:title", "title"]);
340
+ metadata.author = metadata.author || getText(["dc:creator", "creator", "cp:lastModifiedBy"]);
341
+ metadata.description = metadata.description || getText(["dc:description", "description", "dc:subject", "subject"]);
342
+ metadata.createdAt = metadata.createdAt || getText(["dcterms:created", "meta:creation-date"]);
343
+ metadata.modifiedAt = metadata.modifiedAt || getText(["dcterms:modified", "meta:date"]);
344
+ const keywords = getText(["dc:keyword", "cp:keywords", "meta:keyword"]);
345
+ if (keywords && !metadata.keywords) {
346
+ metadata.keywords = keywords.split(/[,;]/).map((k) => k.trim()).filter(Boolean);
347
+ }
212
348
  }
213
349
  function precheckZipSize(buffer) {
214
350
  try {
@@ -247,7 +383,7 @@ function extractFromBrokenZip(buffer) {
247
383
  const data = new Uint8Array(buffer);
248
384
  const view = new DataView(buffer);
249
385
  let pos = 0;
250
- const texts = [];
386
+ const blocks = [];
251
387
  let totalDecompressed = 0;
252
388
  let entryCount = 0;
253
389
  while (pos < data.length - 30) {
@@ -288,14 +424,14 @@ function extractFromBrokenZip(buffer) {
288
424
  }
289
425
  totalDecompressed += content.length * 2;
290
426
  if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new KordocError("\uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC");
291
- const sectionText = blocksToMarkdown(parseSectionXml(content));
292
- if (sectionText) texts.push(sectionText);
427
+ blocks.push(...parseSectionXml(content));
293
428
  } catch {
294
429
  continue;
295
430
  }
296
431
  }
297
- if (texts.length === 0) throw new KordocError("\uC190\uC0C1\uB41C HWPX\uC5D0\uC11C \uC139\uC158 \uB370\uC774\uD130\uB97C \uBCF5\uAD6C\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
298
- return texts.join("\n\n");
432
+ if (blocks.length === 0) throw new KordocError("\uC190\uC0C1\uB41C HWPX\uC5D0\uC11C \uC139\uC158 \uB370\uC774\uD130\uB97C \uBCF5\uAD6C\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
433
+ const markdown = blocksToMarkdown(blocks);
434
+ return { markdown, blocks };
299
435
  }
300
436
  async function resolveSectionPaths(zip) {
301
437
  const manifestPaths = ["Contents/content.hpf", "content.hpf"];
@@ -567,7 +703,7 @@ var require2 = createRequire(import.meta.url);
567
703
  var CFB = require2("cfb");
568
704
  var MAX_SECTIONS = 100;
569
705
  var MAX_TOTAL_DECOMPRESS = 100 * 1024 * 1024;
570
- function parseHwp5Document(buffer) {
706
+ function parseHwp5Document(buffer, options) {
571
707
  const cfb = CFB.parse(buffer);
572
708
  const headerEntry = CFB.find(cfb, "/FileHeader");
573
709
  if (!headerEntry?.content) throw new KordocError("FileHeader \uC2A4\uD2B8\uB9BC \uC5C6\uC74C");
@@ -576,18 +712,59 @@ function parseHwp5Document(buffer) {
576
712
  if (header.flags & FLAG_ENCRYPTED) throw new KordocError("\uC554\uD638\uD654\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
577
713
  if (header.flags & FLAG_DRM) throw new KordocError("DRM \uBCF4\uD638\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
578
714
  const compressed = (header.flags & FLAG_COMPRESSED) !== 0;
715
+ const metadata = {
716
+ version: `${header.versionMajor}.x`
717
+ };
718
+ extractHwp5Metadata(cfb, metadata);
579
719
  const sections = findSections(cfb);
580
720
  if (sections.length === 0) throw new KordocError("\uC139\uC158 \uC2A4\uD2B8\uB9BC\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
721
+ metadata.pageCount = sections.length;
722
+ const pageFilter = options?.pages ? parsePageRange(options.pages, sections.length) : null;
581
723
  const blocks = [];
582
724
  let totalDecompressed = 0;
583
- for (const sectionData of sections) {
725
+ for (let si = 0; si < sections.length; si++) {
726
+ if (pageFilter && !pageFilter.has(si + 1)) continue;
727
+ const sectionData = sections[si];
584
728
  const data = compressed ? decompressStream(Buffer.from(sectionData)) : Buffer.from(sectionData);
585
729
  totalDecompressed += data.length;
586
730
  if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
587
731
  const records = readRecords(data);
588
732
  blocks.push(...parseSection(records));
589
733
  }
590
- return blocksToMarkdown(blocks);
734
+ const markdown = blocksToMarkdown(blocks);
735
+ return { markdown, blocks, metadata };
736
+ }
737
+ function extractHwp5Metadata(cfb, metadata) {
738
+ try {
739
+ const summaryEntry = CFB.find(cfb, "/HwpSummaryInformation") || CFB.find(cfb, "/SummaryInformation");
740
+ if (!summaryEntry?.content) return;
741
+ const data = Buffer.from(summaryEntry.content);
742
+ if (data.length < 48) return;
743
+ const numSets = data.readUInt32LE(24);
744
+ if (numSets === 0) return;
745
+ const setOffset = data.readUInt32LE(44);
746
+ if (setOffset >= data.length - 8) return;
747
+ const numProps = data.readUInt32LE(setOffset + 4);
748
+ if (numProps === 0 || numProps > 100) return;
749
+ for (let i = 0; i < numProps; i++) {
750
+ const entryOffset = setOffset + 8 + i * 8;
751
+ if (entryOffset + 8 > data.length) break;
752
+ const propId = data.readUInt32LE(entryOffset);
753
+ const propOffset = setOffset + data.readUInt32LE(entryOffset + 4);
754
+ if (propOffset + 8 > data.length) continue;
755
+ if (propId !== 2 && propId !== 4 && propId !== 6) continue;
756
+ const propType = data.readUInt32LE(propOffset);
757
+ if (propType !== 30) continue;
758
+ const strLen = data.readUInt32LE(propOffset + 4);
759
+ if (strLen === 0 || strLen > 1e4 || propOffset + 8 + strLen > data.length) continue;
760
+ const str = data.subarray(propOffset + 8, propOffset + 8 + strLen).toString("utf8").replace(/\0+$/, "").trim();
761
+ if (!str) continue;
762
+ if (propId === 2) metadata.title = str;
763
+ else if (propId === 4) metadata.author = str;
764
+ else if (propId === 6) metadata.description = str;
765
+ }
766
+ } catch {
767
+ }
591
768
  }
592
769
  function findSections(cfb) {
593
770
  const sections = [];
@@ -727,33 +904,30 @@ function arrangeCells(rows, cols, cells) {
727
904
  return grid.map((row) => row.map((c) => c || { text: "", colSpan: 1, rowSpan: 1 }));
728
905
  }
729
906
 
907
+ // src/pdf/polyfill.ts
908
+ import * as pdfjsWorker from "pdfjs-dist/legacy/build/pdf.worker.mjs";
909
+ var g = globalThis;
910
+ if (typeof g.DOMMatrix === "undefined") {
911
+ g.DOMMatrix = class DOMMatrix {
912
+ m = [1, 0, 0, 1, 0, 0];
913
+ constructor(init) {
914
+ if (init) this.m = init;
915
+ }
916
+ };
917
+ }
918
+ if (typeof g.Path2D === "undefined") {
919
+ g.Path2D = class Path2D {
920
+ };
921
+ }
922
+ g.pdfjsWorker = pdfjsWorker;
923
+
730
924
  // src/pdf/parser.ts
731
- import { createRequire as createRequire2 } from "module";
732
- import { pathToFileURL } from "url";
925
+ import { getDocument, GlobalWorkerOptions } from "pdfjs-dist/legacy/build/pdf.mjs";
926
+ GlobalWorkerOptions.workerSrc = "";
733
927
  var MAX_PAGES = 5e3;
734
928
  var MAX_TOTAL_TEXT = 100 * 1024 * 1024;
735
- var pdfjsModule = null;
736
- async function loadPdfjs() {
737
- if (pdfjsModule) return pdfjsModule;
738
- try {
739
- const mod = await import("pdfjs-dist/legacy/build/pdf.mjs");
740
- const req = createRequire2(import.meta.url);
741
- const workerPath = req.resolve("pdfjs-dist/legacy/build/pdf.worker.mjs");
742
- mod.GlobalWorkerOptions.workerSrc = pathToFileURL(workerPath).href;
743
- pdfjsModule = mod;
744
- return mod;
745
- } catch (err) {
746
- const msg = err instanceof Error ? err.message : String(err);
747
- if (msg.includes("Cannot find") || msg.includes("MODULE_NOT_FOUND")) return null;
748
- throw new KordocError(`pdfjs-dist \uB85C\uB529 \uC2E4\uD328: ${msg}`);
749
- }
750
- }
751
- async function parsePdfDocument(buffer) {
752
- const pdfjs = await loadPdfjs();
753
- if (!pdfjs) {
754
- return { success: false, fileType: "pdf", pageCount: 0, error: "pdfjs-dist\uAC00 \uC124\uCE58\uB418\uC9C0 \uC54A\uC558\uC2B5\uB2C8\uB2E4. npm install pdfjs-dist" };
755
- }
756
- const doc = await pdfjs.getDocument({
929
+ async function parsePdfDocument(buffer, options) {
930
+ const doc = await getDocument({
757
931
  data: new Uint8Array(buffer),
758
932
  useSystemFonts: true,
759
933
  disableFontFace: true,
@@ -761,12 +935,17 @@ async function parsePdfDocument(buffer) {
761
935
  }).promise;
762
936
  try {
763
937
  const pageCount = doc.numPages;
764
- if (pageCount === 0) return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4." };
938
+ if (pageCount === 0) return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.", blocks: [] };
939
+ const metadata = { pageCount };
940
+ await extractPdfMetadata(doc, metadata);
765
941
  const pageTexts = [];
942
+ const blocks = [];
766
943
  let totalChars = 0;
767
944
  let totalTextBytes = 0;
768
945
  const effectivePageCount = Math.min(pageCount, MAX_PAGES);
946
+ const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
769
947
  for (let i = 1; i <= effectivePageCount; i++) {
948
+ if (pageFilter && !pageFilter.has(i)) continue;
770
949
  const page = await doc.getPage(i);
771
950
  const tc = await page.getTextContent();
772
951
  const pageText = extractPageContent(tc.items);
@@ -774,18 +953,54 @@ async function parsePdfDocument(buffer) {
774
953
  totalTextBytes += pageText.length * 2;
775
954
  if (totalTextBytes > MAX_TOTAL_TEXT) throw new KordocError("\uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uD06C\uAE30 \uCD08\uACFC");
776
955
  pageTexts.push(pageText);
956
+ blocks.push({ type: "paragraph", text: pageText });
777
957
  }
778
- if (totalChars / effectivePageCount < 10) {
779
- return { success: false, fileType: "pdf", pageCount, isImageBased: true, error: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)` };
958
+ const parsedPageCount = pageFilter ? pageFilter.size : effectivePageCount;
959
+ if (totalChars / Math.max(parsedPageCount, 1) < 10) {
960
+ if (options?.ocr) {
961
+ try {
962
+ const { ocrPages: ocrPages2 } = await Promise.resolve().then(() => (init_provider(), provider_exports));
963
+ const ocrBlocks = await ocrPages2(doc, options.ocr, pageFilter, effectivePageCount);
964
+ if (ocrBlocks.length > 0) {
965
+ const ocrMarkdown = ocrBlocks.map((b) => b.text || "").filter(Boolean).join("\n\n");
966
+ return { success: true, fileType: "pdf", markdown: ocrMarkdown, pageCount: parsedPageCount, blocks: ocrBlocks, metadata, isImageBased: true };
967
+ }
968
+ } catch {
969
+ }
970
+ }
971
+ return { success: false, fileType: "pdf", pageCount, isImageBased: true, error: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`, code: "IMAGE_BASED_PDF" };
780
972
  }
781
973
  let markdown = pageTexts.filter((t) => t.trim()).join("\n\n");
782
974
  markdown = cleanPdfText(markdown);
783
- return { success: true, fileType: "pdf", markdown, pageCount: effectivePageCount };
975
+ return { success: true, fileType: "pdf", markdown, pageCount: parsedPageCount, blocks, metadata };
784
976
  } finally {
785
977
  await doc.destroy().catch(() => {
786
978
  });
787
979
  }
788
980
  }
981
+ async function extractPdfMetadata(doc, metadata) {
982
+ try {
983
+ const result = await doc.getMetadata();
984
+ if (!result?.info) return;
985
+ const info = result.info;
986
+ if (typeof info.Title === "string" && info.Title.trim()) metadata.title = info.Title.trim();
987
+ if (typeof info.Author === "string" && info.Author.trim()) metadata.author = info.Author.trim();
988
+ if (typeof info.Creator === "string" && info.Creator.trim()) metadata.creator = info.Creator.trim();
989
+ if (typeof info.Subject === "string" && info.Subject.trim()) metadata.description = info.Subject.trim();
990
+ if (typeof info.Keywords === "string" && info.Keywords.trim()) {
991
+ metadata.keywords = info.Keywords.split(/[,;]/).map((k) => k.trim()).filter(Boolean);
992
+ }
993
+ if (typeof info.CreationDate === "string") metadata.createdAt = parsePdfDate(info.CreationDate);
994
+ if (typeof info.ModDate === "string") metadata.modifiedAt = parsePdfDate(info.ModDate);
995
+ } catch {
996
+ }
997
+ }
998
+ function parsePdfDate(dateStr) {
999
+ const m = dateStr.match(/D:(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?/);
1000
+ if (!m) return void 0;
1001
+ const [, year, month = "01", day = "01", hour = "00", min = "00", sec = "00"] = m;
1002
+ return `${year}-${month}-${day}T${hour}:${min}:${sec}`;
1003
+ }
789
1004
  function extractPageContent(rawItems) {
790
1005
  const items = normalizeItems(rawItems);
791
1006
  if (items.length === 0) return "";
@@ -1058,52 +1273,446 @@ function mergeKoreanLines(text) {
1058
1273
  return result.join("\n");
1059
1274
  }
1060
1275
 
1276
+ // src/diff/text-diff.ts
1277
+ function similarity(a, b) {
1278
+ if (a === b) return 1;
1279
+ if (!a || !b) return 0;
1280
+ const maxLen = Math.max(a.length, b.length);
1281
+ if (maxLen === 0) return 1;
1282
+ return 1 - levenshtein(a, b) / maxLen;
1283
+ }
1284
+ function normalizedSimilarity(a, b) {
1285
+ return similarity(normalize(a), normalize(b));
1286
+ }
1287
+ function normalize(s) {
1288
+ return s.replace(/\s+/g, " ").trim();
1289
+ }
1290
+ function levenshtein(a, b) {
1291
+ if (a.length > b.length) [a, b] = [b, a];
1292
+ const m = a.length;
1293
+ const n = b.length;
1294
+ let prev = Array.from({ length: m + 1 }, (_, i) => i);
1295
+ let curr = new Array(m + 1);
1296
+ for (let j = 1; j <= n; j++) {
1297
+ curr[0] = j;
1298
+ for (let i = 1; i <= m; i++) {
1299
+ if (a[i - 1] === b[j - 1]) {
1300
+ curr[i] = prev[i - 1];
1301
+ } else {
1302
+ curr[i] = 1 + Math.min(prev[i - 1], prev[i], curr[i - 1]);
1303
+ }
1304
+ }
1305
+ ;
1306
+ [prev, curr] = [curr, prev];
1307
+ }
1308
+ return prev[m];
1309
+ }
1310
+
1311
+ // src/diff/compare.ts
1312
+ var SIMILARITY_THRESHOLD = 0.4;
1313
+ async function compare(bufferA, bufferB, options) {
1314
+ const [resultA, resultB] = await Promise.all([
1315
+ parse(bufferA, options),
1316
+ parse(bufferB, options)
1317
+ ]);
1318
+ if (!resultA.success) throw new Error(`\uBB38\uC11CA \uD30C\uC2F1 \uC2E4\uD328: ${resultA.error}`);
1319
+ if (!resultB.success) throw new Error(`\uBB38\uC11CB \uD30C\uC2F1 \uC2E4\uD328: ${resultB.error}`);
1320
+ return diffBlocks(resultA.blocks, resultB.blocks);
1321
+ }
1322
+ function diffBlocks(blocksA, blocksB) {
1323
+ const aligned = alignBlocks(blocksA, blocksB);
1324
+ const stats = { added: 0, removed: 0, modified: 0, unchanged: 0 };
1325
+ const diffs = [];
1326
+ for (const [a, b] of aligned) {
1327
+ if (a && b) {
1328
+ const sim = blockSimilarity(a, b);
1329
+ if (sim >= 0.99) {
1330
+ diffs.push({ type: "unchanged", before: a, after: b, similarity: 1 });
1331
+ stats.unchanged++;
1332
+ } else {
1333
+ const diff = { type: "modified", before: a, after: b, similarity: sim };
1334
+ if (a.type === "table" && b.type === "table" && a.table && b.table) {
1335
+ diff.cellDiffs = diffTableCells(a.table, b.table);
1336
+ }
1337
+ diffs.push(diff);
1338
+ stats.modified++;
1339
+ }
1340
+ } else if (a) {
1341
+ diffs.push({ type: "removed", before: a });
1342
+ stats.removed++;
1343
+ } else if (b) {
1344
+ diffs.push({ type: "added", after: b });
1345
+ stats.added++;
1346
+ }
1347
+ }
1348
+ return { stats, diffs };
1349
+ }
1350
+ function alignBlocks(a, b) {
1351
+ const m = a.length, n = b.length;
1352
+ if (m * n > 1e7) return fallbackAlign(a, b);
1353
+ const simCache = /* @__PURE__ */ new Map();
1354
+ const getSim = (i2, j2) => {
1355
+ const key = `${i2},${j2}`;
1356
+ let v = simCache.get(key);
1357
+ if (v === void 0) {
1358
+ v = blockSimilarity(a[i2], b[j2]);
1359
+ simCache.set(key, v);
1360
+ }
1361
+ return v;
1362
+ };
1363
+ const dp = Array.from({ length: m + 1 }, () => new Array(n + 1).fill(0));
1364
+ for (let i2 = 1; i2 <= m; i2++) {
1365
+ for (let j2 = 1; j2 <= n; j2++) {
1366
+ if (getSim(i2 - 1, j2 - 1) >= SIMILARITY_THRESHOLD) {
1367
+ dp[i2][j2] = dp[i2 - 1][j2 - 1] + 1;
1368
+ } else {
1369
+ dp[i2][j2] = Math.max(dp[i2 - 1][j2], dp[i2][j2 - 1]);
1370
+ }
1371
+ }
1372
+ }
1373
+ const pairs = [];
1374
+ let i = m, j = n;
1375
+ while (i > 0 && j > 0) {
1376
+ if (getSim(i - 1, j - 1) >= SIMILARITY_THRESHOLD && dp[i][j] === dp[i - 1][j - 1] + 1) {
1377
+ pairs.push([i - 1, j - 1]);
1378
+ i--;
1379
+ j--;
1380
+ } else if (dp[i - 1][j] >= dp[i][j - 1]) {
1381
+ i--;
1382
+ } else {
1383
+ j--;
1384
+ }
1385
+ }
1386
+ pairs.reverse();
1387
+ const result = [];
1388
+ let ai = 0, bi = 0;
1389
+ for (const [pi, pj] of pairs) {
1390
+ while (ai < pi) result.push([a[ai++], null]);
1391
+ while (bi < pj) result.push([null, b[bi++]]);
1392
+ result.push([a[ai++], b[bi++]]);
1393
+ }
1394
+ while (ai < m) result.push([a[ai++], null]);
1395
+ while (bi < n) result.push([null, b[bi++]]);
1396
+ return result;
1397
+ }
1398
+ function fallbackAlign(a, b) {
1399
+ const result = [];
1400
+ const len = Math.max(a.length, b.length);
1401
+ for (let i = 0; i < len; i++) {
1402
+ result.push([a[i] || null, b[i] || null]);
1403
+ }
1404
+ return result;
1405
+ }
1406
+ function blockSimilarity(a, b) {
1407
+ if (a.type !== b.type) return 0;
1408
+ if (a.type === "paragraph") {
1409
+ return normalizedSimilarity(a.text || "", b.text || "");
1410
+ }
1411
+ if (a.type === "table" && a.table && b.table) {
1412
+ return tableSimilarity(a.table, b.table);
1413
+ }
1414
+ return 0;
1415
+ }
1416
+ function tableSimilarity(a, b) {
1417
+ const dimSim = 1 - Math.abs(a.rows * a.cols - b.rows * b.cols) / Math.max(a.rows * a.cols, b.rows * b.cols, 1);
1418
+ const textsA = a.cells.flat().map((c) => c.text).join(" ");
1419
+ const textsB = b.cells.flat().map((c) => c.text).join(" ");
1420
+ const contentSim = normalizedSimilarity(textsA, textsB);
1421
+ return dimSim * 0.3 + contentSim * 0.7;
1422
+ }
1423
+ function diffTableCells(a, b) {
1424
+ const maxRows = Math.max(a.rows, b.rows);
1425
+ const maxCols = Math.max(a.cols, b.cols);
1426
+ const result = [];
1427
+ for (let r = 0; r < maxRows; r++) {
1428
+ const row = [];
1429
+ for (let c = 0; c < maxCols; c++) {
1430
+ const cellA = r < a.rows && c < a.cols ? a.cells[r][c].text : void 0;
1431
+ const cellB = r < b.rows && c < b.cols ? b.cells[r][c].text : void 0;
1432
+ let type;
1433
+ if (cellA === void 0) type = "added";
1434
+ else if (cellB === void 0) type = "removed";
1435
+ else if (cellA === cellB) type = "unchanged";
1436
+ else type = "modified";
1437
+ row.push({ type, before: cellA, after: cellB });
1438
+ }
1439
+ result.push(row);
1440
+ }
1441
+ return result;
1442
+ }
1443
+
1444
+ // src/form/recognize.ts
1445
+ var LABEL_KEYWORDS = /* @__PURE__ */ new Set([
1446
+ "\uC131\uBA85",
1447
+ "\uC774\uB984",
1448
+ "\uC8FC\uC18C",
1449
+ "\uC804\uD654",
1450
+ "\uC804\uD654\uBC88\uD638",
1451
+ "\uD734\uB300\uD3F0",
1452
+ "\uD578\uB4DC\uD3F0",
1453
+ "\uC5F0\uB77D\uCC98",
1454
+ "\uC0DD\uB144\uC6D4\uC77C",
1455
+ "\uC8FC\uBBFC\uB4F1\uB85D\uBC88\uD638",
1456
+ "\uC18C\uC18D",
1457
+ "\uC9C1\uC704",
1458
+ "\uC9C1\uAE09",
1459
+ "\uBD80\uC11C",
1460
+ "\uC774\uBA54\uC77C",
1461
+ "\uD329\uC2A4",
1462
+ "\uD559\uAD50",
1463
+ "\uD559\uB144",
1464
+ "\uBC18",
1465
+ "\uBC88\uD638",
1466
+ "\uC2E0\uCCAD\uC778",
1467
+ "\uB300\uD45C\uC790",
1468
+ "\uB2F4\uB2F9\uC790",
1469
+ "\uC791\uC131\uC790",
1470
+ "\uD655\uC778\uC790",
1471
+ "\uC2B9\uC778\uC790",
1472
+ "\uC77C\uC2DC",
1473
+ "\uB0A0\uC9DC",
1474
+ "\uAE30\uAC04",
1475
+ "\uC7A5\uC18C",
1476
+ "\uBAA9\uC801",
1477
+ "\uC0AC\uC720",
1478
+ "\uBE44\uACE0",
1479
+ "\uAE08\uC561",
1480
+ "\uC218\uB7C9",
1481
+ "\uB2E8\uAC00",
1482
+ "\uD569\uACC4",
1483
+ "\uACC4",
1484
+ "\uC18C\uACC4"
1485
+ ]);
1486
+ function isLabelCell(text) {
1487
+ const trimmed = text.trim();
1488
+ if (!trimmed || trimmed.length > 30) return false;
1489
+ for (const kw of LABEL_KEYWORDS) {
1490
+ if (trimmed.includes(kw)) return true;
1491
+ }
1492
+ if (/^[가-힣\s()·:]{2,8}$/.test(trimmed) && !/\d/.test(trimmed)) return true;
1493
+ if (/^[가-힣A-Za-z\s]+[::]$/.test(trimmed)) return true;
1494
+ return false;
1495
+ }
1496
+ function extractFormFields(blocks) {
1497
+ const fields = [];
1498
+ let totalTables = 0;
1499
+ let formTables = 0;
1500
+ for (const block of blocks) {
1501
+ if (block.type !== "table" || !block.table) continue;
1502
+ totalTables++;
1503
+ const tableFields = extractFromTable(block.table);
1504
+ if (tableFields.length > 0) {
1505
+ formTables++;
1506
+ fields.push(...tableFields);
1507
+ }
1508
+ }
1509
+ for (const block of blocks) {
1510
+ if (block.type === "paragraph" && block.text) {
1511
+ const inlineFields = extractInlineFields(block.text);
1512
+ fields.push(...inlineFields);
1513
+ }
1514
+ }
1515
+ const confidence = totalTables > 0 ? formTables / totalTables : fields.length > 0 ? 0.3 : 0;
1516
+ return { fields, confidence: Math.min(confidence, 1) };
1517
+ }
1518
+ function extractFromTable(table) {
1519
+ const fields = [];
1520
+ if (table.cols >= 2) {
1521
+ for (let r = 0; r < table.rows; r++) {
1522
+ for (let c = 0; c < table.cols - 1; c++) {
1523
+ const labelCell = table.cells[r][c];
1524
+ const valueCell = table.cells[r][c + 1];
1525
+ if (isLabelCell(labelCell.text) && valueCell.text.trim()) {
1526
+ fields.push({
1527
+ label: labelCell.text.trim().replace(/[::]\s*$/, ""),
1528
+ value: valueCell.text.trim(),
1529
+ row: r,
1530
+ col: c
1531
+ });
1532
+ }
1533
+ }
1534
+ }
1535
+ }
1536
+ if (fields.length === 0 && table.rows >= 2 && table.cols >= 2) {
1537
+ const headerRow = table.cells[0];
1538
+ const allLabels = headerRow.every((cell) => {
1539
+ const t = cell.text.trim();
1540
+ return t.length > 0 && t.length <= 20;
1541
+ });
1542
+ if (allLabels) {
1543
+ for (let r = 1; r < table.rows; r++) {
1544
+ for (let c = 0; c < table.cols; c++) {
1545
+ const label = headerRow[c].text.trim();
1546
+ const value = table.cells[r][c].text.trim();
1547
+ if (label && value) {
1548
+ fields.push({ label, value, row: r, col: c });
1549
+ }
1550
+ }
1551
+ }
1552
+ }
1553
+ }
1554
+ return fields;
1555
+ }
1556
+ function extractInlineFields(text) {
1557
+ const fields = [];
1558
+ const pattern = /([가-힣A-Za-z]{2,10})\s*[::]\s*([^\n,;]{1,100})/g;
1559
+ let match;
1560
+ while ((match = pattern.exec(text)) !== null) {
1561
+ const label = match[1].trim();
1562
+ const value = match[2].trim();
1563
+ if (value) {
1564
+ fields.push({ label, value, row: -1, col: -1 });
1565
+ }
1566
+ }
1567
+ return fields;
1568
+ }
1569
+
1570
+ // src/hwpx/generator.ts
1571
+ import JSZip2 from "jszip";
1572
+ var HWPML_NS = "http://www.hancom.co.kr/hwpml/2016/HwpMl";
1573
+ async function markdownToHwpx(markdown) {
1574
+ const blocks = parseMarkdownToBlocks(markdown);
1575
+ const sectionXml = blocksToSectionXml(blocks);
1576
+ const zip = new JSZip2();
1577
+ zip.file("mimetype", "application/hwp+zip", { compression: "STORE" });
1578
+ zip.file("Contents/content.hpf", generateManifest());
1579
+ zip.file("Contents/section0.xml", sectionXml);
1580
+ return await zip.generateAsync({ type: "arraybuffer" });
1581
+ }
1582
/**
 * Parses Markdown into a flat list of heading, table, and paragraph blocks.
 *
 * Recognized constructs:
 * - ATX headings (`#` through `######`) become `{ type: "heading", text, level }`.
 * - Consecutive lines starting with `|` become one `{ type: "table", rows }`;
 *   divider rows such as `|---|:---:|` are dropped.
 * - Every other non-blank line becomes `{ type: "paragraph", text }`.
 *
 * @param {string} md - Markdown source; LF and CRLF line endings are accepted.
 * @returns {Array<{type: string, text?: string, level?: number, rows?: string[][]}>}
 *   Blocks in document order.
 */
function parseMarkdownToBlocks(md) {
  // True for a header/body divider row: only pipes, dashes, colons and
  // whitespace, with at least one run of three dashes.
  const isDividerRow = (trimmedRow) =>
    /^\|[\s|:-]+$/.test(trimmedRow) && trimmedRow.includes("---");

  // Splits one table row into trimmed cell texts. The trailing pipe is
  // optional, so the final cell of "| a | b" is kept.
  const splitRow = (trimmedRow) => {
    const parts = trimmedRow.split("|");
    parts.shift(); // text before the leading pipe (always empty after trimming)
    if (parts.length > 0 && parts[parts.length - 1].trim() === "") {
      parts.pop(); // empty remainder after a trailing pipe
    }
    return parts.map((cell) => cell.trim());
  };

  // Split on CRLF as well as LF: a leftover "\r" is a line terminator that
  // `.` cannot match, which would stop the heading pattern from matching.
  const lines = md.split(/\r?\n/);
  const blocks = [];
  let i = 0;
  while (i < lines.length) {
    const line = lines[i];
    if (!line.trim()) {
      i++;
      continue;
    }
    const headingMatch = line.match(/^(#{1,6})\s+(.+)$/);
    if (headingMatch) {
      blocks.push({ type: "heading", text: headingMatch[2].trim(), level: headingMatch[1].length });
      i++;
      continue;
    }
    if (line.trimStart().startsWith("|")) {
      const tableRows = [];
      while (i < lines.length && lines[i].trimStart().startsWith("|")) {
        const row = lines[i].trim();
        i++;
        if (isDividerRow(row)) continue;
        const cells = splitRow(row);
        if (cells.length > 0) tableRows.push(cells);
      }
      if (tableRows.length > 0) {
        blocks.push({ type: "table", rows: tableRows });
      }
      continue;
    }
    blocks.push({ type: "paragraph", text: line.trim() });
    i++;
  }
  return blocks;
}
1620
/**
 * Escapes the characters `&`, `<`, `>` and `"` as XML entity references.
 *
 * @param {string} text - Raw text.
 * @returns {string} Text with the four characters replaced by
 *   `&amp;`, `&lt;`, `&gt;` and `&quot;`.
 */
function escapeXml(text) {
  const entityFor = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;" };
  // One pass over the input; each matched character maps to its entity.
  return text.replace(/[&<>"]/g, (ch) => entityFor[ch]);
}
1623
/**
 * Wraps text in an `<hp:p><hp:run><hp:t>` paragraph element.
 *
 * @param {string} text - Paragraph text; it is passed through `escapeXml`.
 * @returns {string} The paragraph XML fragment.
 */
function generateParagraph(text) {
  const opening = "<hp:p><hp:run><hp:t>";
  const closing = "</hp:t></hp:run></hp:p>";
  return `${opening}${escapeXml(text)}${closing}`;
}
1626
/**
 * Renders table rows as an `<hp:tbl>` element.
 *
 * Each row becomes an `<hp:tr>`; each cell becomes an `<hp:tc>` holding a
 * 1x1 `<hp:cellSpan>` followed by the cell text as a paragraph.
 *
 * @param {string[][]} rows - Cell texts, row by row.
 * @returns {string} The table XML fragment.
 */
function generateTable(rows) {
  const renderRow = (cells) => {
    const cellXml = cells.reduce(
      (acc, cell) => acc + `<hp:tc><hp:cellSpan colSpan="1" rowSpan="1"/>${generateParagraph(cell)}</hp:tc>`,
      ""
    );
    return `<hp:tr>${cellXml}</hp:tr>`;
  };
  const rowXml = rows.reduce((acc, row) => acc + renderRow(row), "");
  return `<hp:tbl>${rowXml}</hp:tbl>`;
}
1635
/**
 * Renders parsed blocks as a complete section XML document.
 *
 * Headings and paragraphs are both emitted as plain paragraphs (the heading
 * level is not used here); tables are emitted via `generateTable`. Blocks of
 * any other type, and tables without rows, contribute an empty string.
 *
 * @param {Array<{type: string, text?: string, rows?: string[][]}>} blocks
 *   Blocks in document order.
 * @returns {string} XML document with an `<hs:sec>` root element.
 */
function blocksToSectionXml(blocks) {
  const renderBlock = (block) => {
    if (block.type === "heading" || block.type === "paragraph") {
      return generateParagraph(block.text || "");
    }
    if (block.type === "table" && block.rows) {
      return generateTable(block.rows);
    }
    return "";
  };
  const body = blocks.map(renderBlock).join("\n ");
  // Both the hs: and hp: prefixes are bound to the same HWPML_NS URI.
  return [
    '<?xml version="1.0" encoding="UTF-8"?>',
    `<hs:sec xmlns:hs="${HWPML_NS}" xmlns:hp="${HWPML_NS}">`,
    body,
    "</hs:sec>",
  ].join("\n");
}
1653
/**
 * Builds the package manifest XML stored at `Contents/content.hpf`.
 *
 * The manifest declares a single item, `section0.xml`, and references it
 * from the spine.
 *
 * @returns {string} The manifest XML document.
 */
function generateManifest() {
  const manifestLines = [
    '<?xml version="1.0" encoding="UTF-8"?>',
    '<opf:package xmlns:opf="http://www.idpf.org/2007/opf">',
    "<opf:manifest>",
    '<opf:item id="s0" href="section0.xml" media-type="application/xml"/>',
    "</opf:manifest>",
    "<opf:spine>",
    '<opf:itemref idref="s0"/>',
    "</opf:spine>",
    "</opf:package>",
  ];
  return manifestLines.join("\n");
}
1664
+
1061
1665
  // src/index.ts
1062
/**
 * Detects the document format of `buffer` and dispatches to the matching parser.
 *
 * @param {ArrayBuffer} buffer - Raw file bytes.
 * @param {object} [options] - Passed through unchanged to the format parser.
 * @returns {Promise<object>} The format parser's result, or a failure result
 *   with `code: "EMPTY_INPUT"` for a missing/zero-length buffer, or
 *   `code: "UNSUPPORTED_FORMAT"` when the detected format is not
 *   "hwpx", "hwp" or "pdf".
 */
async function parse(buffer, options) {
  const isEmpty = !buffer || buffer.byteLength === 0;
  if (isEmpty) {
    return { success: false, fileType: "unknown", error: "\uBE48 \uBC84\uD37C\uC774\uAC70\uB098 \uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uC785\uB825\uC785\uB2C8\uB2E4.", code: "EMPTY_INPUT" };
  }
  const format = detectFormat(buffer);
  if (format === "hwpx") return parseHwpx(buffer, options);
  if (format === "hwp") return parseHwp(buffer, options);
  if (format === "pdf") return parsePdf(buffer, options);
  return { success: false, fileType: "unknown", error: "\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD30C\uC77C \uD615\uC2DD\uC785\uB2C8\uB2E4.", code: "UNSUPPORTED_FORMAT" };
}
1078
/**
 * Parses an HWPX document, converting any thrown error into a failure result.
 *
 * @param {ArrayBuffer} buffer - Raw file bytes.
 * @param {object} [options] - Passed through to `parseHwpxDocument`.
 * @returns {Promise<object>} On success, `{ success: true, fileType: "hwpx",
 *   markdown, blocks, metadata }`; on failure, `{ success: false,
 *   fileType: "hwpx", error, code }` where `code` comes from `classifyError`.
 */
async function parseHwpx(buffer, options) {
  try {
    const parsed = await parseHwpxDocument(buffer, options);
    return {
      success: true,
      fileType: "hwpx",
      markdown: parsed.markdown,
      blocks: parsed.blocks,
      metadata: parsed.metadata,
    };
  } catch (err) {
    // Non-Error throwables carry no message, so a fixed fallback text is used.
    const message = err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328";
    return { success: false, fileType: "hwpx", error: message, code: classifyError(err) };
  }
}
1086
/**
 * Parses an HWP document, converting any thrown error into a failure result.
 *
 * @param {ArrayBuffer} buffer - Raw file bytes; wrapped with `Buffer.from`
 *   before being handed to `parseHwp5Document`.
 * @param {object} [options] - Passed through to `parseHwp5Document`.
 * @returns {Promise<object>} On success, `{ success: true, fileType: "hwp",
 *   markdown, blocks, metadata }`; on failure, `{ success: false,
 *   fileType: "hwp", error, code }` where `code` comes from `classifyError`.
 */
async function parseHwp(buffer, options) {
  try {
    const parsed = parseHwp5Document(Buffer.from(buffer), options);
    return {
      success: true,
      fileType: "hwp",
      markdown: parsed.markdown,
      blocks: parsed.blocks,
      metadata: parsed.metadata,
    };
  } catch (err) {
    // Non-Error throwables carry no message, so a fixed fallback text is used.
    const message = err instanceof Error ? err.message : "HWP \uD30C\uC2F1 \uC2E4\uD328";
    return { success: false, fileType: "hwp", error: message, code: classifyError(err) };
  }
}
1094
/**
 * Parses a PDF document, converting any thrown error into a failure result.
 *
 * @param {ArrayBuffer} buffer - Raw file bytes.
 * @param {object} [options] - Passed through to `parsePdfDocument`.
 * @returns {Promise<object>} Whatever `parsePdfDocument` resolves to, or
 *   `{ success: false, fileType: "pdf", error, code }` if it throws or
 *   rejects, where `code` comes from `classifyError`.
 */
async function parsePdf(buffer, options) {
  try {
    // Awaited inside the try so a rejection is caught below.
    const result = await parsePdfDocument(buffer, options);
    return result;
  } catch (err) {
    // Non-Error throwables carry no message, so a fixed fallback text is used.
    const message = err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328";
    return { success: false, fileType: "pdf", error: message, code: classifyError(err) };
  }
}
1101
1705
  export {
1102
1706
  VERSION,
1707
+ blocksToMarkdown,
1708
+ compare,
1103
1709
  detectFormat,
1710
+ diffBlocks,
1711
+ extractFormFields,
1104
1712
  isHwpxFile,
1105
1713
  isOldHwpFile,
1106
1714
  isPdfFile,
1715
+ markdownToHwpx,
1107
1716
  parse,
1108
1717
  parseHwp,
1109
1718
  parseHwpx,