kordoc 1.3.0 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -24,34 +24,6 @@ function detectFormat(buffer) {
24
24
  return "unknown";
25
25
  }
26
26
 
27
- // src/utils.ts
28
- var VERSION = true ? "1.3.0" : "0.0.0-dev";
29
- function toArrayBuffer(buf) {
30
- if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
31
- return buf.buffer;
32
- }
33
- return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
34
- }
35
- var KordocError = class extends Error {
36
- constructor(message) {
37
- super(message);
38
- this.name = "KordocError";
39
- }
40
- };
41
- function sanitizeError(err) {
42
- if (err instanceof KordocError) return err.message;
43
- return "\uBB38\uC11C \uCC98\uB9AC \uC911 \uC624\uB958\uAC00 \uBC1C\uC0DD\uD588\uC2B5\uB2C8\uB2E4";
44
- }
45
- function isPathTraversal(name) {
46
- const normalized = name.replace(/\\/g, "/");
47
- return normalized.includes("..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
48
- }
49
-
50
- // src/hwpx/parser.ts
51
- import JSZip from "jszip";
52
- import { inflateRawSync } from "zlib";
53
- import { DOMParser } from "@xmldom/xmldom";
54
-
55
27
  // src/table/builder.ts
56
28
  var MAX_COLS = 200;
57
29
  var MAX_ROWS = 1e4;
@@ -181,6 +153,75 @@ function tableToMarkdown(table) {
181
153
  return md.join("\n");
182
154
  }
183
155
 
156
+ // src/utils.ts
157
+ var VERSION = true ? "1.4.1" : "0.0.0-dev";
158
+ function toArrayBuffer(buf) {
159
+ if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
160
+ return buf.buffer;
161
+ }
162
+ return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
163
+ }
164
+ var KordocError = class extends Error {
165
+ constructor(message) {
166
+ super(message);
167
+ this.name = "KordocError";
168
+ }
169
+ };
170
+ function sanitizeError(err) {
171
+ if (err instanceof KordocError) return err.message;
172
+ return "\uBB38\uC11C \uCC98\uB9AC \uC911 \uC624\uB958\uAC00 \uBC1C\uC0DD\uD588\uC2B5\uB2C8\uB2E4";
173
+ }
174
+ function isPathTraversal(name) {
175
+ const normalized = name.replace(/\\/g, "/");
176
+ return normalized.includes("..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
177
+ }
178
+ function classifyError(err) {
179
+ if (!(err instanceof Error)) return "PARSE_ERROR";
180
+ const msg = err.message;
181
+ if (msg.includes("\uC554\uD638\uD654")) return "ENCRYPTED";
182
+ if (msg.includes("DRM")) return "DRM_PROTECTED";
183
+ if (msg.includes("ZIP bomb") || msg.includes("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || msg.includes("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")) return "ZIP_BOMB";
184
+ if (msg.includes("bomb") || msg.includes("\uD06C\uAE30 \uCD08\uACFC") || msg.includes("\uC555\uCD95 \uD574\uC81C")) return "DECOMPRESSION_BOMB";
185
+ if (msg.includes("\uC774\uBBF8\uC9C0 \uAE30\uBC18")) return "IMAGE_BASED_PDF";
186
+ if (msg.includes("\uC139\uC158") && (msg.includes("\uCC3E\uC744 \uC218 \uC5C6") || msg.includes("\uC5C6\uC74C"))) return "NO_SECTIONS";
187
+ if (msg.includes("\uC2DC\uADF8\uB2C8\uCC98") || msg.includes("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")) return "CORRUPTED";
188
+ return "PARSE_ERROR";
189
+ }
190
+
191
+ // src/hwpx/parser.ts
192
+ import JSZip from "jszip";
193
+ import { inflateRawSync } from "zlib";
194
+ import { DOMParser } from "@xmldom/xmldom";
195
+
196
+ // src/page-range.ts
197
+ function parsePageRange(spec, maxPages) {
198
+ const result = /* @__PURE__ */ new Set();
199
+ if (maxPages <= 0) return result;
200
+ if (Array.isArray(spec)) {
201
+ for (const n of spec) {
202
+ const page = Math.round(n);
203
+ if (page >= 1 && page <= maxPages) result.add(page);
204
+ }
205
+ return result;
206
+ }
207
+ if (typeof spec !== "string" || spec.trim() === "") return result;
208
+ const parts = spec.split(",");
209
+ for (const part of parts) {
210
+ const trimmed = part.trim();
211
+ if (!trimmed) continue;
212
+ const rangeMatch = trimmed.match(/^(\d+)\s*-\s*(\d+)$/);
213
+ if (rangeMatch) {
214
+ const start = Math.max(1, parseInt(rangeMatch[1], 10));
215
+ const end = Math.min(maxPages, parseInt(rangeMatch[2], 10));
216
+ for (let i = start; i <= end; i++) result.add(i);
217
+ } else {
218
+ const page = parseInt(trimmed, 10);
219
+ if (!isNaN(page) && page >= 1 && page <= maxPages) result.add(page);
220
+ }
221
+ }
222
+ return result;
223
+ }
224
+
184
225
  // src/hwpx/parser.ts
185
226
  var MAX_DECOMPRESS_SIZE = 100 * 1024 * 1024;
186
227
  var MAX_ZIP_ENTRIES = 500;
@@ -190,7 +231,7 @@ function clampSpan(val, max) {
190
231
  function stripDtd(xml) {
191
232
  return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
192
233
  }
193
- async function parseHwpxDocument(buffer) {
234
+ async function parseHwpxDocument(buffer, options) {
194
235
  const precheck = precheckZipSize(buffer);
195
236
  if (precheck.totalUncompressed > MAX_DECOMPRESS_SIZE) {
196
237
  throw new KordocError("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
@@ -208,19 +249,75 @@ async function parseHwpxDocument(buffer) {
208
249
  if (actualEntryCount > MAX_ZIP_ENTRIES) {
209
250
  throw new KordocError("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
210
251
  }
252
+ const metadata = {};
253
+ await extractHwpxMetadata(zip, metadata);
211
254
  const sectionPaths = await resolveSectionPaths(zip);
212
255
  if (sectionPaths.length === 0) throw new KordocError("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
256
+ metadata.pageCount = sectionPaths.length;
257
+ const pageFilter = options?.pages ? parsePageRange(options.pages, sectionPaths.length) : null;
213
258
  let totalDecompressed = 0;
214
259
  const blocks = [];
215
- for (const path of sectionPaths) {
216
- const file = zip.file(path);
260
+ for (let si = 0; si < sectionPaths.length; si++) {
261
+ if (pageFilter && !pageFilter.has(si + 1)) continue;
262
+ const file = zip.file(sectionPaths[si]);
217
263
  if (!file) continue;
218
264
  const xml = await file.async("text");
219
265
  totalDecompressed += xml.length * 2;
220
266
  if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
221
267
  blocks.push(...parseSectionXml(xml));
222
268
  }
223
- return blocksToMarkdown(blocks);
269
+ const markdown = blocksToMarkdown(blocks);
270
+ return { markdown, blocks, metadata };
271
+ }
272
+ async function extractHwpxMetadata(zip, metadata) {
273
+ try {
274
+ const metaPaths = ["meta.xml", "META-INF/meta.xml", "docProps/core.xml"];
275
+ for (const mp of metaPaths) {
276
+ const file = zip.file(mp) || Object.values(zip.files).find((f) => f.name.toLowerCase() === mp.toLowerCase()) || null;
277
+ if (!file) continue;
278
+ const xml = await file.async("text");
279
+ parseDublinCoreMetadata(xml, metadata);
280
+ if (metadata.title || metadata.author) return;
281
+ }
282
+ } catch {
283
+ }
284
+ }
285
+ function parseDublinCoreMetadata(xml, metadata) {
286
+ const parser = new DOMParser();
287
+ const doc = parser.parseFromString(stripDtd(xml), "text/xml");
288
+ if (!doc.documentElement) return;
289
+ const getText = (tagNames) => {
290
+ for (const tag of tagNames) {
291
+ const els = doc.getElementsByTagName(tag);
292
+ if (els.length > 0) {
293
+ const text = els[0].textContent?.trim();
294
+ if (text) return text;
295
+ }
296
+ }
297
+ return void 0;
298
+ };
299
+ metadata.title = metadata.title || getText(["dc:title", "title"]);
300
+ metadata.author = metadata.author || getText(["dc:creator", "creator", "cp:lastModifiedBy"]);
301
+ metadata.description = metadata.description || getText(["dc:description", "description", "dc:subject", "subject"]);
302
+ metadata.createdAt = metadata.createdAt || getText(["dcterms:created", "meta:creation-date"]);
303
+ metadata.modifiedAt = metadata.modifiedAt || getText(["dcterms:modified", "meta:date"]);
304
+ const keywords = getText(["dc:keyword", "cp:keywords", "meta:keyword"]);
305
+ if (keywords && !metadata.keywords) {
306
+ metadata.keywords = keywords.split(/[,;]/).map((k) => k.trim()).filter(Boolean);
307
+ }
308
+ }
309
+ async function extractHwpxMetadataOnly(buffer) {
310
+ let zip;
311
+ try {
312
+ zip = await JSZip.loadAsync(buffer);
313
+ } catch {
314
+ throw new KordocError("HWPX ZIP\uC744 \uC5F4 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
315
+ }
316
+ const metadata = {};
317
+ await extractHwpxMetadata(zip, metadata);
318
+ const sectionPaths = await resolveSectionPaths(zip);
319
+ metadata.pageCount = sectionPaths.length;
320
+ return metadata;
224
321
  }
225
322
  function precheckZipSize(buffer) {
226
323
  try {
@@ -259,7 +356,7 @@ function extractFromBrokenZip(buffer) {
259
356
  const data = new Uint8Array(buffer);
260
357
  const view = new DataView(buffer);
261
358
  let pos = 0;
262
- const texts = [];
359
+ const blocks = [];
263
360
  let totalDecompressed = 0;
264
361
  let entryCount = 0;
265
362
  while (pos < data.length - 30) {
@@ -300,14 +397,14 @@ function extractFromBrokenZip(buffer) {
300
397
  }
301
398
  totalDecompressed += content.length * 2;
302
399
  if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new KordocError("\uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC");
303
- const sectionText = blocksToMarkdown(parseSectionXml(content));
304
- if (sectionText) texts.push(sectionText);
400
+ blocks.push(...parseSectionXml(content));
305
401
  } catch {
306
402
  continue;
307
403
  }
308
404
  }
309
- if (texts.length === 0) throw new KordocError("\uC190\uC0C1\uB41C HWPX\uC5D0\uC11C \uC139\uC158 \uB370\uC774\uD130\uB97C \uBCF5\uAD6C\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
310
- return texts.join("\n\n");
405
+ if (blocks.length === 0) throw new KordocError("\uC190\uC0C1\uB41C HWPX\uC5D0\uC11C \uC139\uC158 \uB370\uC774\uD130\uB97C \uBCF5\uAD6C\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
406
+ const markdown = blocksToMarkdown(blocks);
407
+ return { markdown, blocks };
311
408
  }
312
409
  async function resolveSectionPaths(zip) {
313
410
  const manifestPaths = ["Contents/content.hpf", "content.hpf"];
@@ -579,7 +676,7 @@ var require2 = createRequire(import.meta.url);
579
676
  var CFB = require2("cfb");
580
677
  var MAX_SECTIONS = 100;
581
678
  var MAX_TOTAL_DECOMPRESS = 100 * 1024 * 1024;
582
- function parseHwp5Document(buffer) {
679
+ function parseHwp5Document(buffer, options) {
583
680
  const cfb = CFB.parse(buffer);
584
681
  const headerEntry = CFB.find(cfb, "/FileHeader");
585
682
  if (!headerEntry?.content) throw new KordocError("FileHeader \uC2A4\uD2B8\uB9BC \uC5C6\uC74C");
@@ -588,18 +685,73 @@ function parseHwp5Document(buffer) {
588
685
  if (header.flags & FLAG_ENCRYPTED) throw new KordocError("\uC554\uD638\uD654\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
589
686
  if (header.flags & FLAG_DRM) throw new KordocError("DRM \uBCF4\uD638\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
590
687
  const compressed = (header.flags & FLAG_COMPRESSED) !== 0;
688
+ const metadata = {
689
+ version: `${header.versionMajor}.x`
690
+ };
691
+ extractHwp5Metadata(cfb, metadata);
591
692
  const sections = findSections(cfb);
592
693
  if (sections.length === 0) throw new KordocError("\uC139\uC158 \uC2A4\uD2B8\uB9BC\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
694
+ metadata.pageCount = sections.length;
695
+ const pageFilter = options?.pages ? parsePageRange(options.pages, sections.length) : null;
593
696
  const blocks = [];
594
697
  let totalDecompressed = 0;
595
- for (const sectionData of sections) {
698
+ for (let si = 0; si < sections.length; si++) {
699
+ if (pageFilter && !pageFilter.has(si + 1)) continue;
700
+ const sectionData = sections[si];
596
701
  const data = compressed ? decompressStream(Buffer.from(sectionData)) : Buffer.from(sectionData);
597
702
  totalDecompressed += data.length;
598
703
  if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
599
704
  const records = readRecords(data);
600
705
  blocks.push(...parseSection(records));
601
706
  }
602
- return blocksToMarkdown(blocks);
707
+ const markdown = blocksToMarkdown(blocks);
708
+ return { markdown, blocks, metadata };
709
+ }
710
+ function extractHwp5Metadata(cfb, metadata) {
711
+ try {
712
+ const summaryEntry = CFB.find(cfb, "/HwpSummaryInformation") || CFB.find(cfb, "/SummaryInformation");
713
+ if (!summaryEntry?.content) return;
714
+ const data = Buffer.from(summaryEntry.content);
715
+ if (data.length < 48) return;
716
+ const numSets = data.readUInt32LE(24);
717
+ if (numSets === 0) return;
718
+ const setOffset = data.readUInt32LE(44);
719
+ if (setOffset >= data.length - 8) return;
720
+ const numProps = data.readUInt32LE(setOffset + 4);
721
+ if (numProps === 0 || numProps > 100) return;
722
+ for (let i = 0; i < numProps; i++) {
723
+ const entryOffset = setOffset + 8 + i * 8;
724
+ if (entryOffset + 8 > data.length) break;
725
+ const propId = data.readUInt32LE(entryOffset);
726
+ const propOffset = setOffset + data.readUInt32LE(entryOffset + 4);
727
+ if (propOffset + 8 > data.length) continue;
728
+ if (propId !== 2 && propId !== 4 && propId !== 6) continue;
729
+ const propType = data.readUInt32LE(propOffset);
730
+ if (propType !== 30) continue;
731
+ const strLen = data.readUInt32LE(propOffset + 4);
732
+ if (strLen === 0 || strLen > 1e4 || propOffset + 8 + strLen > data.length) continue;
733
+ const str = data.subarray(propOffset + 8, propOffset + 8 + strLen).toString("utf8").replace(/\0+$/, "").trim();
734
+ if (!str) continue;
735
+ if (propId === 2) metadata.title = str;
736
+ else if (propId === 4) metadata.author = str;
737
+ else if (propId === 6) metadata.description = str;
738
+ }
739
+ } catch {
740
+ }
741
+ }
742
+ function extractHwp5MetadataOnly(buffer) {
743
+ const cfb = CFB.parse(buffer);
744
+ const headerEntry = CFB.find(cfb, "/FileHeader");
745
+ if (!headerEntry?.content) throw new KordocError("FileHeader \uC2A4\uD2B8\uB9BC \uC5C6\uC74C");
746
+ const header = parseFileHeader(Buffer.from(headerEntry.content));
747
+ if (header.signature !== "HWP Document File") throw new KordocError("HWP \uC2DC\uADF8\uB2C8\uCC98 \uBD88\uC77C\uCE58");
748
+ const metadata = {
749
+ version: `${header.versionMajor}.x`
750
+ };
751
+ extractHwp5Metadata(cfb, metadata);
752
+ const sections = findSections(cfb);
753
+ metadata.pageCount = sections.length;
754
+ return metadata;
603
755
  }
604
756
  function findSections(cfb) {
605
757
  const sections = [];
@@ -761,7 +913,7 @@ import { getDocument, GlobalWorkerOptions } from "pdfjs-dist/legacy/build/pdf.mj
761
913
  GlobalWorkerOptions.workerSrc = "";
762
914
  var MAX_PAGES = 5e3;
763
915
  var MAX_TOTAL_TEXT = 100 * 1024 * 1024;
764
- async function parsePdfDocument(buffer) {
916
+ async function parsePdfDocument(buffer, options) {
765
917
  const doc = await getDocument({
766
918
  data: new Uint8Array(buffer),
767
919
  useSystemFonts: true,
@@ -770,12 +922,17 @@ async function parsePdfDocument(buffer) {
770
922
  }).promise;
771
923
  try {
772
924
  const pageCount = doc.numPages;
773
- if (pageCount === 0) return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4." };
925
+ if (pageCount === 0) return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.", blocks: [] };
926
+ const metadata = { pageCount };
927
+ await extractPdfMetadata(doc, metadata);
774
928
  const pageTexts = [];
929
+ const blocks = [];
775
930
  let totalChars = 0;
776
931
  let totalTextBytes = 0;
777
932
  const effectivePageCount = Math.min(pageCount, MAX_PAGES);
933
+ const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
778
934
  for (let i = 1; i <= effectivePageCount; i++) {
935
+ if (pageFilter && !pageFilter.has(i)) continue;
779
936
  const page = await doc.getPage(i);
780
937
  const tc = await page.getTextContent();
781
938
  const pageText = extractPageContent(tc.items);
@@ -783,13 +940,65 @@ async function parsePdfDocument(buffer) {
783
940
  totalTextBytes += pageText.length * 2;
784
941
  if (totalTextBytes > MAX_TOTAL_TEXT) throw new KordocError("\uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uD06C\uAE30 \uCD08\uACFC");
785
942
  pageTexts.push(pageText);
943
+ blocks.push({ type: "paragraph", text: pageText });
786
944
  }
787
- if (totalChars / effectivePageCount < 10) {
788
- return { success: false, fileType: "pdf", pageCount, isImageBased: true, error: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)` };
945
+ const parsedPageCount = pageFilter ? pageFilter.size : effectivePageCount;
946
+ if (totalChars / Math.max(parsedPageCount, 1) < 10) {
947
+ if (options?.ocr) {
948
+ try {
949
+ const { ocrPages } = await import("./provider-JB7SY74K.js");
950
+ const ocrBlocks = await ocrPages(doc, options.ocr, pageFilter, effectivePageCount);
951
+ if (ocrBlocks.length > 0) {
952
+ const ocrMarkdown = ocrBlocks.map((b) => b.text || "").filter(Boolean).join("\n\n");
953
+ return { success: true, fileType: "pdf", markdown: ocrMarkdown, pageCount: parsedPageCount, blocks: ocrBlocks, metadata, isImageBased: true };
954
+ }
955
+ } catch {
956
+ }
957
+ }
958
+ return { success: false, fileType: "pdf", pageCount, isImageBased: true, error: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`, code: "IMAGE_BASED_PDF" };
789
959
  }
790
960
  let markdown = pageTexts.filter((t) => t.trim()).join("\n\n");
791
961
  markdown = cleanPdfText(markdown);
792
- return { success: true, fileType: "pdf", markdown, pageCount: effectivePageCount };
962
+ return { success: true, fileType: "pdf", markdown, pageCount: parsedPageCount, blocks, metadata };
963
+ } finally {
964
+ await doc.destroy().catch(() => {
965
+ });
966
+ }
967
+ }
968
+ async function extractPdfMetadata(doc, metadata) {
969
+ try {
970
+ const result = await doc.getMetadata();
971
+ if (!result?.info) return;
972
+ const info = result.info;
973
+ if (typeof info.Title === "string" && info.Title.trim()) metadata.title = info.Title.trim();
974
+ if (typeof info.Author === "string" && info.Author.trim()) metadata.author = info.Author.trim();
975
+ if (typeof info.Creator === "string" && info.Creator.trim()) metadata.creator = info.Creator.trim();
976
+ if (typeof info.Subject === "string" && info.Subject.trim()) metadata.description = info.Subject.trim();
977
+ if (typeof info.Keywords === "string" && info.Keywords.trim()) {
978
+ metadata.keywords = info.Keywords.split(/[,;]/).map((k) => k.trim()).filter(Boolean);
979
+ }
980
+ if (typeof info.CreationDate === "string") metadata.createdAt = parsePdfDate(info.CreationDate);
981
+ if (typeof info.ModDate === "string") metadata.modifiedAt = parsePdfDate(info.ModDate);
982
+ } catch {
983
+ }
984
+ }
985
+ function parsePdfDate(dateStr) {
986
+ const m = dateStr.match(/D:(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?/);
987
+ if (!m) return void 0;
988
+ const [, year, month = "01", day = "01", hour = "00", min = "00", sec = "00"] = m;
989
+ return `${year}-${month}-${day}T${hour}:${min}:${sec}`;
990
+ }
991
+ async function extractPdfMetadataOnly(buffer) {
992
+ const doc = await getDocument({
993
+ data: new Uint8Array(buffer),
994
+ useSystemFonts: true,
995
+ disableFontFace: true,
996
+ isEvalSupported: false
997
+ }).promise;
998
+ try {
999
+ const metadata = { pageCount: doc.numPages };
1000
+ await extractPdfMetadata(doc, metadata);
1001
+ return metadata;
793
1002
  } finally {
794
1003
  await doc.destroy().catch(() => {
795
1004
  });
@@ -1067,53 +1276,356 @@ function mergeKoreanLines(text) {
1067
1276
  return result.join("\n");
1068
1277
  }
1069
1278
 
1279
+ // src/form/recognize.ts
1280
+ var LABEL_KEYWORDS = /* @__PURE__ */ new Set([
1281
+ "\uC131\uBA85",
1282
+ "\uC774\uB984",
1283
+ "\uC8FC\uC18C",
1284
+ "\uC804\uD654",
1285
+ "\uC804\uD654\uBC88\uD638",
1286
+ "\uD734\uB300\uD3F0",
1287
+ "\uD578\uB4DC\uD3F0",
1288
+ "\uC5F0\uB77D\uCC98",
1289
+ "\uC0DD\uB144\uC6D4\uC77C",
1290
+ "\uC8FC\uBBFC\uB4F1\uB85D\uBC88\uD638",
1291
+ "\uC18C\uC18D",
1292
+ "\uC9C1\uC704",
1293
+ "\uC9C1\uAE09",
1294
+ "\uBD80\uC11C",
1295
+ "\uC774\uBA54\uC77C",
1296
+ "\uD329\uC2A4",
1297
+ "\uD559\uAD50",
1298
+ "\uD559\uB144",
1299
+ "\uBC18",
1300
+ "\uBC88\uD638",
1301
+ "\uC2E0\uCCAD\uC778",
1302
+ "\uB300\uD45C\uC790",
1303
+ "\uB2F4\uB2F9\uC790",
1304
+ "\uC791\uC131\uC790",
1305
+ "\uD655\uC778\uC790",
1306
+ "\uC2B9\uC778\uC790",
1307
+ "\uC77C\uC2DC",
1308
+ "\uB0A0\uC9DC",
1309
+ "\uAE30\uAC04",
1310
+ "\uC7A5\uC18C",
1311
+ "\uBAA9\uC801",
1312
+ "\uC0AC\uC720",
1313
+ "\uBE44\uACE0",
1314
+ "\uAE08\uC561",
1315
+ "\uC218\uB7C9",
1316
+ "\uB2E8\uAC00",
1317
+ "\uD569\uACC4",
1318
+ "\uACC4",
1319
+ "\uC18C\uACC4"
1320
+ ]);
1321
+ function isLabelCell(text) {
1322
+ const trimmed = text.trim();
1323
+ if (!trimmed || trimmed.length > 30) return false;
1324
+ for (const kw of LABEL_KEYWORDS) {
1325
+ if (trimmed.includes(kw)) return true;
1326
+ }
1327
+ if (/^[가-힣\s()·:]{2,8}$/.test(trimmed) && !/\d/.test(trimmed)) return true;
1328
+ if (/^[가-힣A-Za-z\s]+[::]$/.test(trimmed)) return true;
1329
+ return false;
1330
+ }
1331
+ function extractFormFields(blocks) {
1332
+ const fields = [];
1333
+ let totalTables = 0;
1334
+ let formTables = 0;
1335
+ for (const block of blocks) {
1336
+ if (block.type !== "table" || !block.table) continue;
1337
+ totalTables++;
1338
+ const tableFields = extractFromTable(block.table);
1339
+ if (tableFields.length > 0) {
1340
+ formTables++;
1341
+ fields.push(...tableFields);
1342
+ }
1343
+ }
1344
+ for (const block of blocks) {
1345
+ if (block.type === "paragraph" && block.text) {
1346
+ const inlineFields = extractInlineFields(block.text);
1347
+ fields.push(...inlineFields);
1348
+ }
1349
+ }
1350
+ const confidence = totalTables > 0 ? formTables / totalTables : fields.length > 0 ? 0.3 : 0;
1351
+ return { fields, confidence: Math.min(confidence, 1) };
1352
+ }
1353
+ function extractFromTable(table) {
1354
+ const fields = [];
1355
+ if (table.cols >= 2) {
1356
+ for (let r = 0; r < table.rows; r++) {
1357
+ for (let c = 0; c < table.cols - 1; c++) {
1358
+ const labelCell = table.cells[r][c];
1359
+ const valueCell = table.cells[r][c + 1];
1360
+ if (isLabelCell(labelCell.text) && valueCell.text.trim()) {
1361
+ fields.push({
1362
+ label: labelCell.text.trim().replace(/[::]\s*$/, ""),
1363
+ value: valueCell.text.trim(),
1364
+ row: r,
1365
+ col: c
1366
+ });
1367
+ }
1368
+ }
1369
+ }
1370
+ }
1371
+ if (fields.length === 0 && table.rows >= 2 && table.cols >= 2) {
1372
+ const headerRow = table.cells[0];
1373
+ const allLabels = headerRow.every((cell) => {
1374
+ const t = cell.text.trim();
1375
+ return t.length > 0 && t.length <= 20;
1376
+ });
1377
+ if (allLabels) {
1378
+ for (let r = 1; r < table.rows; r++) {
1379
+ for (let c = 0; c < table.cols; c++) {
1380
+ const label = headerRow[c].text.trim();
1381
+ const value = table.cells[r][c].text.trim();
1382
+ if (label && value) {
1383
+ fields.push({ label, value, row: r, col: c });
1384
+ }
1385
+ }
1386
+ }
1387
+ }
1388
+ }
1389
+ return fields;
1390
+ }
1391
+ function extractInlineFields(text) {
1392
+ const fields = [];
1393
+ const pattern = /([가-힣A-Za-z]{2,10})\s*[::]\s*([^\n,;]{1,100})/g;
1394
+ let match;
1395
+ while ((match = pattern.exec(text)) !== null) {
1396
+ const label = match[1].trim();
1397
+ const value = match[2].trim();
1398
+ if (value) {
1399
+ fields.push({ label, value, row: -1, col: -1 });
1400
+ }
1401
+ }
1402
+ return fields;
1403
+ }
1404
+
1405
+ // src/hwpx/generator.ts
1406
+ import JSZip2 from "jszip";
1407
+
1070
1408
  // src/index.ts
1071
- async function parse(buffer) {
1409
+ async function parse(buffer, options) {
1072
1410
  if (!buffer || buffer.byteLength === 0) {
1073
- return { success: false, fileType: "unknown", error: "\uBE48 \uBC84\uD37C\uC774\uAC70\uB098 \uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uC785\uB825\uC785\uB2C8\uB2E4." };
1411
+ return { success: false, fileType: "unknown", error: "\uBE48 \uBC84\uD37C\uC774\uAC70\uB098 \uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uC785\uB825\uC785\uB2C8\uB2E4.", code: "EMPTY_INPUT" };
1074
1412
  }
1075
1413
  const format = detectFormat(buffer);
1076
1414
  switch (format) {
1077
1415
  case "hwpx":
1078
- return parseHwpx(buffer);
1416
+ return parseHwpx(buffer, options);
1079
1417
  case "hwp":
1080
- return parseHwp(buffer);
1418
+ return parseHwp(buffer, options);
1081
1419
  case "pdf":
1082
- return parsePdf(buffer);
1420
+ return parsePdf(buffer, options);
1083
1421
  default:
1084
- return { success: false, fileType: "unknown", error: "\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD30C\uC77C \uD615\uC2DD\uC785\uB2C8\uB2E4." };
1422
+ return { success: false, fileType: "unknown", error: "\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD30C\uC77C \uD615\uC2DD\uC785\uB2C8\uB2E4.", code: "UNSUPPORTED_FORMAT" };
1085
1423
  }
1086
1424
  }
1087
- async function parseHwpx(buffer) {
1425
+ async function parseHwpx(buffer, options) {
1088
1426
  try {
1089
- const markdown = await parseHwpxDocument(buffer);
1090
- return { success: true, fileType: "hwpx", markdown };
1427
+ const { markdown, blocks, metadata } = await parseHwpxDocument(buffer, options);
1428
+ return { success: true, fileType: "hwpx", markdown, blocks, metadata };
1091
1429
  } catch (err) {
1092
- return { success: false, fileType: "hwpx", error: err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328" };
1430
+ return { success: false, fileType: "hwpx", error: err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
1093
1431
  }
1094
1432
  }
1095
- async function parseHwp(buffer) {
1433
+ async function parseHwp(buffer, options) {
1096
1434
  try {
1097
- const markdown = parseHwp5Document(Buffer.from(buffer));
1098
- return { success: true, fileType: "hwp", markdown };
1435
+ const { markdown, blocks, metadata } = parseHwp5Document(Buffer.from(buffer), options);
1436
+ return { success: true, fileType: "hwp", markdown, blocks, metadata };
1099
1437
  } catch (err) {
1100
- return { success: false, fileType: "hwp", error: err instanceof Error ? err.message : "HWP \uD30C\uC2F1 \uC2E4\uD328" };
1438
+ return { success: false, fileType: "hwp", error: err instanceof Error ? err.message : "HWP \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
1101
1439
  }
1102
1440
  }
1103
- async function parsePdf(buffer) {
1441
+ async function parsePdf(buffer, options) {
1104
1442
  try {
1105
- return await parsePdfDocument(buffer);
1443
+ return await parsePdfDocument(buffer, options);
1106
1444
  } catch (err) {
1107
- return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328" };
1445
+ return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
1446
+ }
1447
+ }
1448
+
1449
+ // src/diff/text-diff.ts
1450
+ function similarity(a, b) {
1451
+ if (a === b) return 1;
1452
+ if (!a || !b) return 0;
1453
+ const maxLen = Math.max(a.length, b.length);
1454
+ if (maxLen === 0) return 1;
1455
+ return 1 - levenshtein(a, b) / maxLen;
1456
+ }
1457
+ function normalizedSimilarity(a, b) {
1458
+ return similarity(normalize(a), normalize(b));
1459
+ }
1460
+ function normalize(s) {
1461
+ return s.replace(/\s+/g, " ").trim();
1462
+ }
1463
+ function levenshtein(a, b) {
1464
+ if (a.length > b.length) [a, b] = [b, a];
1465
+ const m = a.length;
1466
+ const n = b.length;
1467
+ let prev = Array.from({ length: m + 1 }, (_, i) => i);
1468
+ let curr = new Array(m + 1);
1469
+ for (let j = 1; j <= n; j++) {
1470
+ curr[0] = j;
1471
+ for (let i = 1; i <= m; i++) {
1472
+ if (a[i - 1] === b[j - 1]) {
1473
+ curr[i] = prev[i - 1];
1474
+ } else {
1475
+ curr[i] = 1 + Math.min(prev[i - 1], prev[i], curr[i - 1]);
1476
+ }
1477
+ }
1478
+ ;
1479
+ [prev, curr] = [curr, prev];
1108
1480
  }
1481
+ return prev[m];
1482
+ }
1483
+
1484
+ // src/diff/compare.ts
1485
+ var SIMILARITY_THRESHOLD = 0.4;
1486
+ async function compare(bufferA, bufferB, options) {
1487
+ const [resultA, resultB] = await Promise.all([
1488
+ parse(bufferA, options),
1489
+ parse(bufferB, options)
1490
+ ]);
1491
+ if (!resultA.success) throw new Error(`\uBB38\uC11CA \uD30C\uC2F1 \uC2E4\uD328: ${resultA.error}`);
1492
+ if (!resultB.success) throw new Error(`\uBB38\uC11CB \uD30C\uC2F1 \uC2E4\uD328: ${resultB.error}`);
1493
+ return diffBlocks(resultA.blocks, resultB.blocks);
1494
+ }
1495
+ function diffBlocks(blocksA, blocksB) {
1496
+ const aligned = alignBlocks(blocksA, blocksB);
1497
+ const stats = { added: 0, removed: 0, modified: 0, unchanged: 0 };
1498
+ const diffs = [];
1499
+ for (const [a, b] of aligned) {
1500
+ if (a && b) {
1501
+ const sim = blockSimilarity(a, b);
1502
+ if (sim >= 0.99) {
1503
+ diffs.push({ type: "unchanged", before: a, after: b, similarity: 1 });
1504
+ stats.unchanged++;
1505
+ } else {
1506
+ const diff = { type: "modified", before: a, after: b, similarity: sim };
1507
+ if (a.type === "table" && b.type === "table" && a.table && b.table) {
1508
+ diff.cellDiffs = diffTableCells(a.table, b.table);
1509
+ }
1510
+ diffs.push(diff);
1511
+ stats.modified++;
1512
+ }
1513
+ } else if (a) {
1514
+ diffs.push({ type: "removed", before: a });
1515
+ stats.removed++;
1516
+ } else if (b) {
1517
+ diffs.push({ type: "added", after: b });
1518
+ stats.added++;
1519
+ }
1520
+ }
1521
+ return { stats, diffs };
1522
+ }
1523
+ function alignBlocks(a, b) {
1524
+ const m = a.length, n = b.length;
1525
+ if (m * n > 1e7) return fallbackAlign(a, b);
1526
+ const simCache = /* @__PURE__ */ new Map();
1527
+ const getSim = (i2, j2) => {
1528
+ const key = `${i2},${j2}`;
1529
+ let v = simCache.get(key);
1530
+ if (v === void 0) {
1531
+ v = blockSimilarity(a[i2], b[j2]);
1532
+ simCache.set(key, v);
1533
+ }
1534
+ return v;
1535
+ };
1536
+ const dp = Array.from({ length: m + 1 }, () => new Array(n + 1).fill(0));
1537
+ for (let i2 = 1; i2 <= m; i2++) {
1538
+ for (let j2 = 1; j2 <= n; j2++) {
1539
+ if (getSim(i2 - 1, j2 - 1) >= SIMILARITY_THRESHOLD) {
1540
+ dp[i2][j2] = dp[i2 - 1][j2 - 1] + 1;
1541
+ } else {
1542
+ dp[i2][j2] = Math.max(dp[i2 - 1][j2], dp[i2][j2 - 1]);
1543
+ }
1544
+ }
1545
+ }
1546
+ const pairs = [];
1547
+ let i = m, j = n;
1548
+ while (i > 0 && j > 0) {
1549
+ if (getSim(i - 1, j - 1) >= SIMILARITY_THRESHOLD && dp[i][j] === dp[i - 1][j - 1] + 1) {
1550
+ pairs.push([i - 1, j - 1]);
1551
+ i--;
1552
+ j--;
1553
+ } else if (dp[i - 1][j] >= dp[i][j - 1]) {
1554
+ i--;
1555
+ } else {
1556
+ j--;
1557
+ }
1558
+ }
1559
+ pairs.reverse();
1560
+ const result = [];
1561
+ let ai = 0, bi = 0;
1562
+ for (const [pi, pj] of pairs) {
1563
+ while (ai < pi) result.push([a[ai++], null]);
1564
+ while (bi < pj) result.push([null, b[bi++]]);
1565
+ result.push([a[ai++], b[bi++]]);
1566
+ }
1567
+ while (ai < m) result.push([a[ai++], null]);
1568
+ while (bi < n) result.push([null, b[bi++]]);
1569
+ return result;
1570
+ }
1571
+ function fallbackAlign(a, b) {
1572
+ const result = [];
1573
+ const len = Math.max(a.length, b.length);
1574
+ for (let i = 0; i < len; i++) {
1575
+ result.push([a[i] || null, b[i] || null]);
1576
+ }
1577
+ return result;
1578
+ }
1579
+ function blockSimilarity(a, b) {
1580
+ if (a.type !== b.type) return 0;
1581
+ if (a.type === "paragraph") {
1582
+ return normalizedSimilarity(a.text || "", b.text || "");
1583
+ }
1584
+ if (a.type === "table" && a.table && b.table) {
1585
+ return tableSimilarity(a.table, b.table);
1586
+ }
1587
+ return 0;
1588
+ }
1589
+ function tableSimilarity(a, b) {
1590
+ const dimSim = 1 - Math.abs(a.rows * a.cols - b.rows * b.cols) / Math.max(a.rows * a.cols, b.rows * b.cols, 1);
1591
+ const textsA = a.cells.flat().map((c) => c.text).join(" ");
1592
+ const textsB = b.cells.flat().map((c) => c.text).join(" ");
1593
+ const contentSim = normalizedSimilarity(textsA, textsB);
1594
+ return dimSim * 0.3 + contentSim * 0.7;
1595
+ }
1596
+ function diffTableCells(a, b) {
1597
+ const maxRows = Math.max(a.rows, b.rows);
1598
+ const maxCols = Math.max(a.cols, b.cols);
1599
+ const result = [];
1600
+ for (let r = 0; r < maxRows; r++) {
1601
+ const row = [];
1602
+ for (let c = 0; c < maxCols; c++) {
1603
+ const cellA = r < a.rows && c < a.cols ? a.cells[r][c].text : void 0;
1604
+ const cellB = r < b.rows && c < b.cols ? b.cells[r][c].text : void 0;
1605
+ let type;
1606
+ if (cellA === void 0) type = "added";
1607
+ else if (cellB === void 0) type = "removed";
1608
+ else if (cellA === cellB) type = "unchanged";
1609
+ else type = "modified";
1610
+ row.push({ type, before: cellA, after: cellB });
1611
+ }
1612
+ result.push(row);
1613
+ }
1614
+ return result;
1109
1615
  }
1110
1616
 
1111
1617
  export {
1112
1618
  detectFormat,
1619
+ blocksToMarkdown,
1113
1620
  VERSION,
1114
1621
  toArrayBuffer,
1115
1622
  KordocError,
1116
1623
  sanitizeError,
1624
+ extractHwpxMetadataOnly,
1625
+ extractHwp5MetadataOnly,
1626
+ extractPdfMetadataOnly,
1627
+ compare,
1628
+ extractFormFields,
1117
1629
  parse
1118
1630
  };
1119
- //# sourceMappingURL=chunk-KCGDEP7Q.js.map
1631
+ //# sourceMappingURL=chunk-FC5R5FMV.js.map