kordoc 1.2.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -5,6 +5,9 @@ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
5
5
  var __getOwnPropNames = Object.getOwnPropertyNames;
6
6
  var __getProtoOf = Object.getPrototypeOf;
7
7
  var __hasOwnProp = Object.prototype.hasOwnProperty;
8
// Bundler runtime helper: wraps a one-property object whose single value is a
// module initializer and returns a function that runs that initializer at most
// once, caching and returning its result on every call.
var __esm = (fn, res) => function __init() {
  if (fn) {
    const initKey = __getOwnPropNames(fn)[0];
    const initialize = fn[initKey];
    // Clear `fn` before invoking so a second call never re-runs the initializer.
    fn = 0;
    res = initialize(0);
  }
  return res;
};
8
11
  var __export = (target, all) => {
9
12
  for (var name in all)
10
13
  __defProp(target, name, { get: all[name], enumerable: true });
@@ -27,14 +30,61 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
27
30
  ));
28
31
  var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
29
32
 
33
+ // src/ocr/provider.ts
34
+ var provider_exports = {};
35
+ __export(provider_exports, {
36
+ ocrPages: () => ocrPages
37
+ });
38
/**
 * Runs the supplied OCR provider over pages 1..effectivePageCount of `doc`.
 *
 * @param doc - object exposing `getPage(pageNumber)`.
 * @param provider - async callback `(pngBytes, pageNumber, "image/png")` returning text.
 * @param pageFilter - optional set-like object; when given, only pages it `has` are processed.
 * @param effectivePageCount - last page number to visit (inclusive).
 * @returns array of `{ type: "paragraph", text }` blocks, one per page with non-blank text.
 */
async function ocrPages(doc, provider, pageFilter, effectivePageCount) {
  const recognized = [];
  for (let pageNumber = 1; pageNumber <= effectivePageCount; pageNumber += 1) {
    const selected = !pageFilter || pageFilter.has(pageNumber);
    if (!selected) continue;
    const page = await doc.getPage(pageNumber);
    try {
      const png = await renderPageToPng(page);
      const rawText = await provider(png, pageNumber, "image/png");
      const cleaned = rawText.trim();
      if (cleaned) {
        recognized.push({ type: "paragraph", text: cleaned });
      }
    } catch {
      // Rendering/provider failures for one page are ignored; the loop moves on.
    }
  }
  return recognized;
}
54
/**
 * Renders one page to PNG bytes at 2x scale using the optional `canvas` package.
 *
 * @param page - object exposing `getViewport({ scale })` and `render(...)` (whose
 *   result carries a `promise`).
 * @returns Uint8Array holding the PNG produced by `canvas.toBuffer("image/png")`.
 * @throws Error when the `canvas` module cannot be loaded.
 */
async function renderPageToPng(page) {
  const loadCanvasFactory = async () => {
    try {
      const canvasModule = await import("canvas");
      return canvasModule.createCanvas;
    } catch {
      throw new Error("OCR\uC744 \uC0AC\uC6A9\uD558\uB824\uBA74 'canvas' \uD328\uD0A4\uC9C0\uB97C \uC124\uCE58\uD558\uC138\uC694: npm install canvas");
    }
  };
  const createCanvas = await loadCanvasFactory();
  const viewport = page.getViewport({ scale: 2 });
  const width = Math.floor(viewport.width);
  const height = Math.floor(viewport.height);
  const canvas = createCanvas(width, height);
  const canvasContext = canvas.getContext("2d");
  await page.render({ canvasContext, viewport }).promise;
  const pngBuffer = canvas.toBuffer("image/png");
  return new Uint8Array(pngBuffer);
}
69
+ var init_provider = __esm({
70
+ "src/ocr/provider.ts"() {
71
+ "use strict";
72
+ }
73
+ });
74
+
30
75
  // src/index.ts
31
76
  var index_exports = {};
32
77
  __export(index_exports, {
33
78
  VERSION: () => VERSION,
79
+ blocksToMarkdown: () => blocksToMarkdown,
80
+ compare: () => compare,
34
81
  detectFormat: () => detectFormat,
82
+ diffBlocks: () => diffBlocks,
83
+ extractFormFields: () => extractFormFields,
35
84
  isHwpxFile: () => isHwpxFile,
36
85
  isOldHwpFile: () => isOldHwpFile,
37
86
  isPdfFile: () => isPdfFile,
87
+ markdownToHwpx: () => markdownToHwpx,
38
88
  parse: () => parse,
39
89
  parseHwp: () => parseHwp,
40
90
  parseHwpx: () => parseHwpx,
@@ -201,7 +251,7 @@ function tableToMarkdown(table) {
201
251
  }
202
252
 
203
253
  // src/utils.ts
204
- var VERSION = true ? "1.2.0" : "0.0.0-dev";
254
+ var VERSION = true ? "1.4.0" : "0.0.0-dev";
205
255
  var KordocError = class extends Error {
206
256
  constructor(message) {
207
257
  super(message);
@@ -212,6 +262,47 @@ function isPathTraversal(name) {
212
262
  const normalized = name.replace(/\\/g, "/");
213
263
  return normalized.includes("..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
214
264
  }
265
/**
 * Maps an error to a coarse error code by looking for marker substrings in its
 * message. Rules are checked in order; the first match wins.
 *
 * @param err - any thrown value.
 * @returns one of "ENCRYPTED", "DRM_PROTECTED", "ZIP_BOMB", "DECOMPRESSION_BOMB",
 *   "IMAGE_BASED_PDF", "NO_SECTIONS", "CORRUPTED", or "PARSE_ERROR" (also used
 *   for non-Error values and unmatched messages).
 */
function classifyError(err) {
  if (!(err instanceof Error)) return "PARSE_ERROR";
  const message = err.message;
  const hasAny = (...needles) => needles.some((needle) => message.includes(needle));
  const rules = [
    ["ENCRYPTED", () => hasAny("\uC554\uD638\uD654")],
    ["DRM_PROTECTED", () => hasAny("DRM")],
    ["ZIP_BOMB", () => hasAny("ZIP bomb", "ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC", "ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")],
    ["DECOMPRESSION_BOMB", () => hasAny("bomb", "\uD06C\uAE30 \uCD08\uACFC", "\uC555\uCD95 \uD574\uC81C")],
    ["IMAGE_BASED_PDF", () => hasAny("\uC774\uBBF8\uC9C0 \uAE30\uBC18")],
    ["NO_SECTIONS", () => hasAny("\uC139\uC158") && hasAny("\uCC3E\uC744 \uC218 \uC5C6", "\uC5C6\uC74C")],
    ["CORRUPTED", () => hasAny("\uC2DC\uADF8\uB2C8\uCC98", "\uBCF5\uAD6C\uD560 \uC218 \uC5C6")],
  ];
  for (const [code, matches] of rules) {
    if (matches()) return code;
  }
  return "PARSE_ERROR";
}
277
+
278
+ // src/page-range.ts
279
/**
 * Converts a page specification into a set of 1-based page numbers.
 *
 * @param spec - either an array of numbers (each rounded with Math.round) or a
 *   comma-separated string of pages and inclusive ranges, e.g. "1, 3-5".
 * @param maxPages - highest valid page number; anything outside 1..maxPages is dropped.
 * @returns Set of selected page numbers (empty for unusable input or maxPages <= 0).
 */
function parsePageRange(spec, maxPages) {
  const result = /* @__PURE__ */ new Set();
  if (maxPages <= 0) return result;
  if (Array.isArray(spec)) {
    for (const n of spec) {
      const page = Math.round(n);
      if (page >= 1 && page <= maxPages) result.add(page);
    }
    return result;
  }
  if (typeof spec !== "string" || spec.trim() === "") return result;
  for (const part of spec.split(",")) {
    const token = part.trim();
    if (!token) continue;
    const rangeMatch = token.match(/^(\d+)\s*-\s*(\d+)$/);
    if (rangeMatch) {
      // Ranges are clamped to 1..maxPages; a reversed range selects nothing.
      const start = Math.max(1, Number.parseInt(rangeMatch[1], 10));
      const end = Math.min(maxPages, Number.parseInt(rangeMatch[2], 10));
      for (let page = start; page <= end; page++) result.add(page);
    } else if (/^\d+$/.test(token)) {
      // Only pure digit tokens count as a page: parseInt alone would accept
      // malformed input such as "3abc", "1.5" or a dangling "2-".
      const page = Number.parseInt(token, 10);
      if (page >= 1 && page <= maxPages) result.add(page);
    }
  }
  return result;
}
215
306
 
216
307
  // src/hwpx/parser.ts
217
308
  var MAX_DECOMPRESS_SIZE = 100 * 1024 * 1024;
@@ -222,7 +313,7 @@ function clampSpan(val, max) {
222
313
  function stripDtd(xml) {
223
314
  return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
224
315
  }
225
- async function parseHwpxDocument(buffer) {
316
+ async function parseHwpxDocument(buffer, options) {
226
317
  const precheck = precheckZipSize(buffer);
227
318
  if (precheck.totalUncompressed > MAX_DECOMPRESS_SIZE) {
228
319
  throw new KordocError("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
@@ -240,19 +331,62 @@ async function parseHwpxDocument(buffer) {
240
331
  if (actualEntryCount > MAX_ZIP_ENTRIES) {
241
332
  throw new KordocError("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
242
333
  }
334
+ const metadata = {};
335
+ await extractHwpxMetadata(zip, metadata);
243
336
  const sectionPaths = await resolveSectionPaths(zip);
244
337
  if (sectionPaths.length === 0) throw new KordocError("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
338
+ metadata.pageCount = sectionPaths.length;
339
+ const pageFilter = options?.pages ? parsePageRange(options.pages, sectionPaths.length) : null;
245
340
  let totalDecompressed = 0;
246
341
  const blocks = [];
247
- for (const path of sectionPaths) {
248
- const file = zip.file(path);
342
+ for (let si = 0; si < sectionPaths.length; si++) {
343
+ if (pageFilter && !pageFilter.has(si + 1)) continue;
344
+ const file = zip.file(sectionPaths[si]);
249
345
  if (!file) continue;
250
346
  const xml = await file.async("text");
251
347
  totalDecompressed += xml.length * 2;
252
348
  if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
253
349
  blocks.push(...parseSectionXml(xml));
254
350
  }
255
- return blocksToMarkdown(blocks);
351
+ const markdown = blocksToMarkdown(blocks);
352
+ return { markdown, blocks, metadata };
353
+ }
354
/**
 * Best-effort metadata extraction from a zip archive: tries a fixed list of
 * metadata entry paths (exact match first, then case-insensitive by entry
 * name), feeds each found XML to parseDublinCoreMetadata, and stops as soon as
 * a title or author has been filled in. Any error is swallowed.
 *
 * @param zip - archive object exposing `file(path)` and a `files` map.
 * @param metadata - object mutated in place with whatever could be read.
 */
async function extractHwpxMetadata(zip, metadata) {
  const candidatePaths = ["meta.xml", "META-INF/meta.xml", "docProps/core.xml"];
  const locateEntry = (wantedPath) => {
    const exact = zip.file(wantedPath);
    if (exact) return exact;
    const caseInsensitive = Object.values(zip.files).find(
      (entry) => entry.name.toLowerCase() === wantedPath.toLowerCase()
    );
    return caseInsensitive || null;
  };
  try {
    for (const candidate of candidatePaths) {
      const entry = locateEntry(candidate);
      if (entry) {
        const xmlText = await entry.async("text");
        parseDublinCoreMetadata(xmlText, metadata);
        if (metadata.title || metadata.author) return;
      }
    }
  } catch {
    // Metadata is optional; failures here are deliberately ignored.
  }
}
367
/**
 * Parses a metadata XML document and fills in the fields of `metadata` that are
 * still empty. For each field, the first listed tag with non-blank text wins.
 *
 * @param xml - XML source text (DTD is stripped before parsing).
 * @param metadata - object mutated in place; existing truthy values are kept.
 */
function parseDublinCoreMetadata(xml, metadata) {
  const doc = new import_xmldom.DOMParser().parseFromString(stripDtd(xml), "text/xml");
  if (!doc.documentElement) return;
  const firstText = (tagNames) => {
    for (const tagName of tagNames) {
      const matches = doc.getElementsByTagName(tagName);
      if (!(matches.length > 0)) continue;
      const value = matches[0].textContent?.trim();
      if (value) return value;
    }
    return void 0;
  };
  const fieldSources = [
    ["title", ["dc:title", "title"]],
    ["author", ["dc:creator", "creator", "cp:lastModifiedBy"]],
    ["description", ["dc:description", "description", "dc:subject", "subject"]],
    ["createdAt", ["dcterms:created", "meta:creation-date"]],
    ["modifiedAt", ["dcterms:modified", "meta:date"]],
  ];
  for (const [field, tagNames] of fieldSources) {
    metadata[field] = metadata[field] || firstText(tagNames);
  }
  const rawKeywords = firstText(["dc:keyword", "cp:keywords", "meta:keyword"]);
  if (rawKeywords && !metadata.keywords) {
    metadata.keywords = rawKeywords
      .split(/[,;]/)
      .map((keyword) => keyword.trim())
      .filter(Boolean);
  }
}
257
391
  function precheckZipSize(buffer) {
258
392
  try {
@@ -291,7 +425,7 @@ function extractFromBrokenZip(buffer) {
291
425
  const data = new Uint8Array(buffer);
292
426
  const view = new DataView(buffer);
293
427
  let pos = 0;
294
- const texts = [];
428
+ const blocks = [];
295
429
  let totalDecompressed = 0;
296
430
  let entryCount = 0;
297
431
  while (pos < data.length - 30) {
@@ -332,14 +466,14 @@ function extractFromBrokenZip(buffer) {
332
466
  }
333
467
  totalDecompressed += content.length * 2;
334
468
  if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new KordocError("\uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC");
335
- const sectionText = blocksToMarkdown(parseSectionXml(content));
336
- if (sectionText) texts.push(sectionText);
469
+ blocks.push(...parseSectionXml(content));
337
470
  } catch {
338
471
  continue;
339
472
  }
340
473
  }
341
- if (texts.length === 0) throw new KordocError("\uC190\uC0C1\uB41C HWPX\uC5D0\uC11C \uC139\uC158 \uB370\uC774\uD130\uB97C \uBCF5\uAD6C\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
342
- return texts.join("\n\n");
474
+ if (blocks.length === 0) throw new KordocError("\uC190\uC0C1\uB41C HWPX\uC5D0\uC11C \uC139\uC158 \uB370\uC774\uD130\uB97C \uBCF5\uAD6C\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
475
+ const markdown = blocksToMarkdown(blocks);
476
+ return { markdown, blocks };
343
477
  }
344
478
  async function resolveSectionPaths(zip) {
345
479
  const manifestPaths = ["Contents/content.hpf", "content.hpf"];
@@ -612,7 +746,7 @@ var require2 = (0, import_module.createRequire)(import_meta.url);
612
746
  var CFB = require2("cfb");
613
747
  var MAX_SECTIONS = 100;
614
748
  var MAX_TOTAL_DECOMPRESS = 100 * 1024 * 1024;
615
- function parseHwp5Document(buffer) {
749
+ function parseHwp5Document(buffer, options) {
616
750
  const cfb = CFB.parse(buffer);
617
751
  const headerEntry = CFB.find(cfb, "/FileHeader");
618
752
  if (!headerEntry?.content) throw new KordocError("FileHeader \uC2A4\uD2B8\uB9BC \uC5C6\uC74C");
@@ -621,18 +755,59 @@ function parseHwp5Document(buffer) {
621
755
  if (header.flags & FLAG_ENCRYPTED) throw new KordocError("\uC554\uD638\uD654\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
622
756
  if (header.flags & FLAG_DRM) throw new KordocError("DRM \uBCF4\uD638\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
623
757
  const compressed = (header.flags & FLAG_COMPRESSED) !== 0;
758
+ const metadata = {
759
+ version: `${header.versionMajor}.x`
760
+ };
761
+ extractHwp5Metadata(cfb, metadata);
624
762
  const sections = findSections(cfb);
625
763
  if (sections.length === 0) throw new KordocError("\uC139\uC158 \uC2A4\uD2B8\uB9BC\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
764
+ metadata.pageCount = sections.length;
765
+ const pageFilter = options?.pages ? parsePageRange(options.pages, sections.length) : null;
626
766
  const blocks = [];
627
767
  let totalDecompressed = 0;
628
- for (const sectionData of sections) {
768
+ for (let si = 0; si < sections.length; si++) {
769
+ if (pageFilter && !pageFilter.has(si + 1)) continue;
770
+ const sectionData = sections[si];
629
771
  const data = compressed ? decompressStream(Buffer.from(sectionData)) : Buffer.from(sectionData);
630
772
  totalDecompressed += data.length;
631
773
  if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
632
774
  const records = readRecords(data);
633
775
  blocks.push(...parseSection(records));
634
776
  }
635
- return blocksToMarkdown(blocks);
777
+ const markdown = blocksToMarkdown(blocks);
778
+ return { markdown, blocks, metadata };
779
+ }
780
/**
 * Best-effort extraction of title/author/description from the summary
 * information stream of a compound file. Any error is swallowed.
 *
 * Reads the first property set only. Property ids 2, 4 and 6 map to title,
 * author and description. Both VT_LPSTR (30, decoded as UTF-8 as before) and
 * VT_LPWSTR (31, UTF-16LE, length counted in characters) values are accepted;
 * previously wide-string properties were skipped, leaving metadata empty.
 * NOTE(review): VT_LPSTR is nominally code-page encoded; UTF-8 decoding is kept
 * for backward compatibility.
 *
 * @param cfb - parsed compound file handed to `CFB.find`.
 * @param metadata - object mutated in place with any fields found.
 */
function extractHwp5Metadata(cfb, metadata) {
  const VT_LPSTR = 30;
  const VT_LPWSTR = 31;
  const MAX_STRING_UNITS = 1e4;
  const fieldByPropId = new Map([[2, "title"], [4, "author"], [6, "description"]]);
  try {
    const summaryEntry = CFB.find(cfb, "/HwpSummaryInformation") || CFB.find(cfb, "/SummaryInformation");
    if (!summaryEntry?.content) return;
    const data = Buffer.from(summaryEntry.content);
    if (data.length < 48) return;
    const numSets = data.readUInt32LE(24);
    if (numSets === 0) return;
    const setOffset = data.readUInt32LE(44);
    if (setOffset >= data.length - 8) return;
    const numProps = data.readUInt32LE(setOffset + 4);
    if (numProps === 0 || numProps > 100) return;
    for (let i = 0; i < numProps; i++) {
      // Each directory entry is (property id, offset relative to the set start).
      const entryOffset = setOffset + 8 + i * 8;
      if (entryOffset + 8 > data.length) break;
      const propId = data.readUInt32LE(entryOffset);
      const propOffset = setOffset + data.readUInt32LE(entryOffset + 4);
      if (propOffset + 8 > data.length) continue;
      const field = fieldByPropId.get(propId);
      if (!field) continue;
      const propType = data.readUInt32LE(propOffset);
      if (propType !== VT_LPSTR && propType !== VT_LPWSTR) continue;
      const isWide = propType === VT_LPWSTR;
      const declaredLen = data.readUInt32LE(propOffset + 4);
      // Wide strings declare their length in 16-bit units, narrow ones in bytes.
      const byteLen = isWide ? declaredLen * 2 : declaredLen;
      const start = propOffset + 8;
      if (declaredLen === 0 || declaredLen > MAX_STRING_UNITS || start + byteLen > data.length) continue;
      const str = data
        .subarray(start, start + byteLen)
        .toString(isWide ? "utf16le" : "utf8")
        .replace(/\0+$/, "")
        .trim();
      if (!str) continue;
      metadata[field] = str;
    }
  } catch {
    // Metadata is optional; malformed summary streams are ignored.
  }
}
637
812
  function findSections(cfb) {
638
813
  const sections = [];
@@ -772,34 +947,30 @@ function arrangeCells(rows, cols, cells) {
772
947
  return grid.map((row) => row.map((c) => c || { text: "", colSpan: 1, rowSpan: 1 }));
773
948
  }
774
949
 
950
+ // src/pdf/polyfill.ts
951
// Loads the pdf.js worker module and installs minimal stand-ins for the
// DOMMatrix and Path2D globals when the runtime does not provide them, then
// exposes the worker module on globalThis.
var pdfjsWorker = __toESM(require("pdfjs-dist/legacy/build/pdf.worker.mjs"), 1);
var g = globalThis;
if (typeof g.DOMMatrix === "undefined") {
  // Stub that only stores the six matrix values it was given (identity by default).
  class DOMMatrix {
    m = [1, 0, 0, 1, 0, 0];
    constructor(init) {
      if (init) this.m = init;
    }
  }
  g.DOMMatrix = DOMMatrix;
}
if (typeof g.Path2D === "undefined") {
  // Empty placeholder class.
  class Path2D {
  }
  g.Path2D = Path2D;
}
g.pdfjsWorker = pdfjsWorker;
966
+
775
967
  // src/pdf/parser.ts
776
- var import_module2 = require("module");
777
- var import_url = require("url");
778
- var import_meta2 = {};
968
+ var import_pdf = require("pdfjs-dist/legacy/build/pdf.mjs");
969
+ import_pdf.GlobalWorkerOptions.workerSrc = "";
779
970
  var MAX_PAGES = 5e3;
780
971
  var MAX_TOTAL_TEXT = 100 * 1024 * 1024;
781
- var pdfjsModule = null;
782
- async function loadPdfjs() {
783
- if (pdfjsModule) return pdfjsModule;
784
- try {
785
- const mod = await import("pdfjs-dist/legacy/build/pdf.mjs");
786
- const req = (0, import_module2.createRequire)(import_meta2.url);
787
- const workerPath = req.resolve("pdfjs-dist/legacy/build/pdf.worker.mjs");
788
- mod.GlobalWorkerOptions.workerSrc = (0, import_url.pathToFileURL)(workerPath).href;
789
- pdfjsModule = mod;
790
- return mod;
791
- } catch (err) {
792
- const msg = err instanceof Error ? err.message : String(err);
793
- if (msg.includes("Cannot find") || msg.includes("MODULE_NOT_FOUND")) return null;
794
- throw new KordocError(`pdfjs-dist \uB85C\uB529 \uC2E4\uD328: ${msg}`);
795
- }
796
- }
797
- async function parsePdfDocument(buffer) {
798
- const pdfjs = await loadPdfjs();
799
- if (!pdfjs) {
800
- return { success: false, fileType: "pdf", pageCount: 0, error: "pdfjs-dist\uAC00 \uC124\uCE58\uB418\uC9C0 \uC54A\uC558\uC2B5\uB2C8\uB2E4. npm install pdfjs-dist" };
801
- }
802
- const doc = await pdfjs.getDocument({
972
+ async function parsePdfDocument(buffer, options) {
973
+ const doc = await (0, import_pdf.getDocument)({
803
974
  data: new Uint8Array(buffer),
804
975
  useSystemFonts: true,
805
976
  disableFontFace: true,
@@ -807,12 +978,17 @@ async function parsePdfDocument(buffer) {
807
978
  }).promise;
808
979
  try {
809
980
  const pageCount = doc.numPages;
810
- if (pageCount === 0) return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4." };
981
+ if (pageCount === 0) return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.", blocks: [] };
982
+ const metadata = { pageCount };
983
+ await extractPdfMetadata(doc, metadata);
811
984
  const pageTexts = [];
985
+ const blocks = [];
812
986
  let totalChars = 0;
813
987
  let totalTextBytes = 0;
814
988
  const effectivePageCount = Math.min(pageCount, MAX_PAGES);
989
+ const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
815
990
  for (let i = 1; i <= effectivePageCount; i++) {
991
+ if (pageFilter && !pageFilter.has(i)) continue;
816
992
  const page = await doc.getPage(i);
817
993
  const tc = await page.getTextContent();
818
994
  const pageText = extractPageContent(tc.items);
@@ -820,18 +996,54 @@ async function parsePdfDocument(buffer) {
820
996
  totalTextBytes += pageText.length * 2;
821
997
  if (totalTextBytes > MAX_TOTAL_TEXT) throw new KordocError("\uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uD06C\uAE30 \uCD08\uACFC");
822
998
  pageTexts.push(pageText);
999
+ blocks.push({ type: "paragraph", text: pageText });
823
1000
  }
824
- if (totalChars / effectivePageCount < 10) {
825
- return { success: false, fileType: "pdf", pageCount, isImageBased: true, error: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)` };
1001
+ const parsedPageCount = pageFilter ? pageFilter.size : effectivePageCount;
1002
+ if (totalChars / Math.max(parsedPageCount, 1) < 10) {
1003
+ if (options?.ocr) {
1004
+ try {
1005
+ const { ocrPages: ocrPages2 } = await Promise.resolve().then(() => (init_provider(), provider_exports));
1006
+ const ocrBlocks = await ocrPages2(doc, options.ocr, pageFilter, effectivePageCount);
1007
+ if (ocrBlocks.length > 0) {
1008
+ const ocrMarkdown = ocrBlocks.map((b) => b.text || "").filter(Boolean).join("\n\n");
1009
+ return { success: true, fileType: "pdf", markdown: ocrMarkdown, pageCount: parsedPageCount, blocks: ocrBlocks, metadata, isImageBased: true };
1010
+ }
1011
+ } catch {
1012
+ }
1013
+ }
1014
+ return { success: false, fileType: "pdf", pageCount, isImageBased: true, error: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`, code: "IMAGE_BASED_PDF" };
826
1015
  }
827
1016
  let markdown = pageTexts.filter((t) => t.trim()).join("\n\n");
828
1017
  markdown = cleanPdfText(markdown);
829
- return { success: true, fileType: "pdf", markdown, pageCount: effectivePageCount };
1018
+ return { success: true, fileType: "pdf", markdown, pageCount: parsedPageCount, blocks, metadata };
830
1019
  } finally {
831
1020
  await doc.destroy().catch(() => {
832
1021
  });
833
1022
  }
834
1023
  }
1024
/**
 * Best-effort copy of the PDF info dictionary into `metadata`. Any error is
 * swallowed.
 *
 * Title/Author/Creator/Subject are copied (trimmed) when they are non-blank
 * strings; Keywords is split on "," and ";"; CreationDate/ModDate strings are
 * passed through parsePdfDate.
 *
 * @param doc - document object exposing async `getMetadata()`.
 * @param metadata - object mutated in place.
 */
async function extractPdfMetadata(doc, metadata) {
  const trimmedOrEmpty = (value) => (typeof value === "string" ? value.trim() : "");
  try {
    const result = await doc.getMetadata();
    const info = result?.info;
    if (!info) return;
    const textFields = [
      ["Title", "title"],
      ["Author", "author"],
      ["Creator", "creator"],
      ["Subject", "description"],
    ];
    for (const [infoKey, field] of textFields) {
      const cleaned = trimmedOrEmpty(info[infoKey]);
      if (cleaned) metadata[field] = cleaned;
    }
    if (trimmedOrEmpty(info.Keywords)) {
      metadata.keywords = info.Keywords.split(/[,;]/)
        .map((keyword) => keyword.trim())
        .filter(Boolean);
    }
    if (typeof info.CreationDate === "string") {
      metadata.createdAt = parsePdfDate(info.CreationDate);
    }
    if (typeof info.ModDate === "string") {
      metadata.modifiedAt = parsePdfDate(info.ModDate);
    }
  } catch {
    // Metadata is optional; failures are ignored.
  }
}
1041
/**
 * Converts a PDF date string (`D:YYYYMMDDHHmmSS...`) to `YYYY-MM-DDTHH:mm:SS`.
 * Missing components default to "01" (month/day) or "00" (time). Any time-zone
 * suffix is ignored, as before.
 *
 * The `D:` prefix is optional in PDF date strings, so a bare date is accepted
 * too — but only when the whole string is digits plus an optional zone suffix,
 * so that unrelated text (e.g. "2024-03-15") is not misread.
 *
 * @param dateStr - raw date string from the PDF info dictionary.
 * @returns the formatted timestamp, or undefined when the input is not a PDF date.
 */
function parsePdfDate(dateStr) {
  const prefixed = /D:(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?/;
  const bare = /^\s*(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?(?:[Zz]|[+\-]\d{2}'?(?:\d{2}'?)?)?\s*$/;
  const m = dateStr.match(prefixed) ?? dateStr.match(bare);
  if (!m) return void 0;
  const [, year, month = "01", day = "01", hour = "00", min = "00", sec = "00"] = m;
  return `${year}-${month}-${day}T${hour}:${min}:${sec}`;
}
835
1047
  function extractPageContent(rawItems) {
836
1048
  const items = normalizeItems(rawItems);
837
1049
  if (items.length === 0) return "";
@@ -1104,53 +1316,447 @@ function mergeKoreanLines(text) {
1104
1316
  return result.join("\n");
1105
1317
  }
1106
1318
 
1319
+ // src/diff/text-diff.ts
1320
/**
 * Similarity of two strings in [0, 1]: 1 for identical inputs, 0 when either is
 * empty/falsy, otherwise 1 minus the edit distance divided by the longer length.
 */
function similarity(a, b) {
  if (a === b) return 1;
  const eitherMissing = !a || !b;
  if (eitherMissing) return 0;
  const longest = Math.max(a.length, b.length);
  return longest === 0 ? 1 : 1 - levenshtein(a, b) / longest;
}
1327
/** Similarity of two strings after both have been passed through normalize(). */
function normalizedSimilarity(a, b) {
  const left = normalize(a);
  const right = normalize(b);
  return similarity(left, right);
}
1330
/** Strips leading/trailing whitespace and collapses every inner whitespace run to one space. */
function normalize(s) {
  const trimmed = s.trim();
  return trimmed.replace(/\s+/g, " ");
}
1333
/**
 * Edit distance (insert/delete/substitute, each cost 1) between two strings,
 * compared by UTF-16 code unit. Keeps two rolling rows sized by the shorter
 * string.
 */
function levenshtein(a, b) {
  const [shorter, longer] = a.length > b.length ? [b, a] : [a, b];
  const width = shorter.length;
  const height = longer.length;
  let previousRow = Array.from({ length: width + 1 }, (_, col) => col);
  let currentRow = new Array(width + 1);
  for (let row = 1; row <= height; row++) {
    const rowChar = longer[row - 1];
    currentRow[0] = row;
    for (let col = 1; col <= width; col++) {
      const diagonal = previousRow[col - 1];
      currentRow[col] = shorter[col - 1] === rowChar
        ? diagonal
        : 1 + Math.min(diagonal, previousRow[col], currentRow[col - 1]);
    }
    // Swap rows instead of allocating a new one per iteration.
    const spare = previousRow;
    previousRow = currentRow;
    currentRow = spare;
  }
  return previousRow[width];
}
1353
+
1354
+ // src/diff/compare.ts
1355
+ var SIMILARITY_THRESHOLD = 0.4;
1356
/**
 * Parses two documents concurrently and returns the block-level diff.
 *
 * @param bufferA - first document, passed to parse().
 * @param bufferB - second document, passed to parse().
 * @param options - forwarded to both parse() calls.
 * @returns result of diffBlocks over the two parsed block lists.
 * @throws Error when either parse result reports `success: false`.
 */
async function compare(bufferA, bufferB, options) {
  const pending = [bufferA, bufferB].map((buffer) => parse(buffer, options));
  const [resultA, resultB] = await Promise.all(pending);
  if (!resultA.success) {
    throw new Error(`\uBB38\uC11CA \uD30C\uC2F1 \uC2E4\uD328: ${resultA.error}`);
  }
  if (!resultB.success) {
    throw new Error(`\uBB38\uC11CB \uD30C\uC2F1 \uC2E4\uD328: ${resultB.error}`);
  }
  return diffBlocks(resultA.blocks, resultB.blocks);
}
1365
/**
 * Builds a block-level diff from two block lists.
 *
 * Pairs come from alignBlocks(). A pair with both sides is "unchanged" when its
 * similarity is at least 0.99 (reported as 1), otherwise "modified" (with
 * per-cell diffs when both sides are tables). One-sided pairs are "removed" or
 * "added".
 *
 * @returns `{ stats, diffs }` where stats counts entries per diff type.
 */
function diffBlocks(blocksA, blocksB) {
  const aligned = alignBlocks(blocksA, blocksB);
  const stats = { added: 0, removed: 0, modified: 0, unchanged: 0 };
  const diffs = [];
  const record = (entry) => {
    diffs.push(entry);
    stats[entry.type]++;
  };
  for (const [before, after] of aligned) {
    if (before && after) {
      const score = blockSimilarity(before, after);
      if (score >= 0.99) {
        record({ type: "unchanged", before, after, similarity: 1 });
        continue;
      }
      const entry = { type: "modified", before, after, similarity: score };
      const bothTables = before.type === "table" && after.type === "table" && before.table && after.table;
      if (bothTables) {
        entry.cellDiffs = diffTableCells(before.table, after.table);
      }
      record(entry);
    } else if (before) {
      record({ type: "removed", before });
    } else if (after) {
      record({ type: "added", after });
    }
  }
  return { stats, diffs };
}
1393
/**
 * Aligns two block lists into `[blockA | null, blockB | null]` pairs using a
 * longest-common-subsequence table, where two blocks "match" when their
 * similarity reaches SIMILARITY_THRESHOLD. Unmatched blocks are emitted with
 * null on the other side. Inputs whose size product exceeds 1e7 are handed to
 * fallbackAlign instead.
 */
function alignBlocks(a, b) {
  const rows = a.length;
  const cols = b.length;
  if (rows * cols > 1e7) return fallbackAlign(a, b);

  // Memoize pairwise similarity; the table fill and the backtrack both need it.
  const memo = /* @__PURE__ */ new Map();
  const matches = (ai, bj) => {
    const key = `${ai},${bj}`;
    let score = memo.get(key);
    if (score === void 0) {
      score = blockSimilarity(a[ai], b[bj]);
      memo.set(key, score);
    }
    return score >= SIMILARITY_THRESHOLD;
  };

  const table = Array.from({ length: rows + 1 }, () => new Array(cols + 1).fill(0));
  for (let r = 1; r <= rows; r++) {
    for (let c = 1; c <= cols; c++) {
      table[r][c] = matches(r - 1, c - 1)
        ? table[r - 1][c - 1] + 1
        : Math.max(table[r - 1][c], table[r][c - 1]);
    }
  }

  // Walk back from the bottom-right corner, collecting matched index pairs
  // (last pair first).
  const matchedBackwards = [];
  let r = rows;
  let c = cols;
  while (r > 0 && c > 0) {
    if (matches(r - 1, c - 1) && table[r][c] === table[r - 1][c - 1] + 1) {
      matchedBackwards.push([r - 1, c - 1]);
      r--;
      c--;
    } else if (table[r - 1][c] >= table[r][c - 1]) {
      r--;
    } else {
      c--;
    }
  }

  const aligned = [];
  let nextA = 0;
  let nextB = 0;
  for (let k = matchedBackwards.length - 1; k >= 0; k--) {
    const [matchA, matchB] = matchedBackwards[k];
    while (nextA < matchA) aligned.push([a[nextA++], null]);
    while (nextB < matchB) aligned.push([null, b[nextB++]]);
    aligned.push([a[nextA++], b[nextB++]]);
  }
  while (nextA < rows) aligned.push([a[nextA++], null]);
  while (nextB < cols) aligned.push([null, b[nextB++]]);
  return aligned;
}
1441
/**
 * Positional alignment: pairs element i of `a` with element i of `b`, using
 * null wherever a side has no (truthy) element at that index.
 */
function fallbackAlign(a, b) {
  const longest = Math.max(a.length, b.length);
  return Array.from({ length: longest }, (_, index) => [a[index] || null, b[index] || null]);
}
1449
/**
 * Similarity of two blocks: 0 for different types; paragraphs are compared by
 * normalized text; tables (when both carry table data) by tableSimilarity;
 * every other case yields 0.
 */
function blockSimilarity(a, b) {
  if (a.type !== b.type) return 0;
  switch (a.type) {
    case "paragraph":
      return normalizedSimilarity(a.text || "", b.text || "");
    case "table":
      return a.table && b.table ? tableSimilarity(a.table, b.table) : 0;
    default:
      return 0;
  }
}
1459
/**
 * Similarity of two tables: 30% from how close their cell counts (rows x cols)
 * are, 70% from the normalized similarity of all cell texts joined by spaces.
 */
function tableSimilarity(a, b) {
  const areaA = a.rows * a.cols;
  const areaB = b.rows * b.cols;
  const dimSim = 1 - Math.abs(areaA - areaB) / Math.max(areaA, areaB, 1);
  const joinedText = (table) => table.cells.flat().map((cell) => cell.text).join(" ");
  const contentSim = normalizedSimilarity(joinedText(a), joinedText(b));
  return dimSim * 0.3 + contentSim * 0.7;
}
1466
/**
 * Cell-by-cell comparison of two tables over the union of their dimensions.
 *
 * @returns a rows x cols grid of `{ type, before, after }` where type is
 *   "added" (cell absent in `a`), "removed" (absent in `b`), "unchanged" or
 *   "modified"; before/after are the cell texts (undefined where absent).
 */
function diffTableCells(a, b) {
  const rowCount = Math.max(a.rows, b.rows);
  const colCount = Math.max(a.cols, b.cols);
  const textAt = (table, r, c) => (r < table.rows && c < table.cols ? table.cells[r][c].text : void 0);
  const classify = (before, after) => {
    if (before === void 0) return "added";
    if (after === void 0) return "removed";
    return before === after ? "unchanged" : "modified";
  };
  return Array.from({ length: rowCount }, (_, r) =>
    Array.from({ length: colCount }, (_unused, c) => {
      const before = textAt(a, r, c);
      const after = textAt(b, r, c);
      return { type: classify(before, after), before, after };
    })
  );
}
1486
+
1487
+ // src/form/recognize.ts
1488
// Substrings that mark a cell as a form label (Korean form vocabulary).
// Grouping comments are approximate glosses of each row.
var LABEL_KEYWORDS = /* @__PURE__ */ new Set([
  // name, address, phone numbers, contact
  "\uC131\uBA85", "\uC774\uB984", "\uC8FC\uC18C", "\uC804\uD654", "\uC804\uD654\uBC88\uD638",
  "\uD734\uB300\uD3F0", "\uD578\uB4DC\uD3F0", "\uC5F0\uB77D\uCC98",
  // birth date, resident registration number
  "\uC0DD\uB144\uC6D4\uC77C", "\uC8FC\uBBFC\uB4F1\uB85D\uBC88\uD638",
  // affiliation, position, rank, department, e-mail, fax
  "\uC18C\uC18D", "\uC9C1\uC704", "\uC9C1\uAE09", "\uBD80\uC11C", "\uC774\uBA54\uC77C", "\uD329\uC2A4",
  // school, grade, class, number
  "\uD559\uAD50", "\uD559\uB144", "\uBC18", "\uBC88\uD638",
  // applicant, representative, person in charge, author, checker, approver
  "\uC2E0\uCCAD\uC778", "\uB300\uD45C\uC790", "\uB2F4\uB2F9\uC790", "\uC791\uC131\uC790", "\uD655\uC778\uC790", "\uC2B9\uC778\uC790",
  // date/time, date, period, place, purpose, reason, remarks
  "\uC77C\uC2DC", "\uB0A0\uC9DC", "\uAE30\uAC04", "\uC7A5\uC18C", "\uBAA9\uC801", "\uC0AC\uC720", "\uBE44\uACE0",
  // amount, quantity, unit price, total, sum, subtotal
  "\uAE08\uC561", "\uC218\uB7C9", "\uB2E8\uAC00", "\uD569\uACC4", "\uACC4", "\uC18C\uACC4"
]);
1529
/**
 * Heuristic: does this cell text look like a form label?
 *
 * True when the trimmed text (1-30 chars) contains any LABEL_KEYWORDS entry,
 * is a short (2-8 char) digit-free Hangul phrase, or is Hangul/Latin text
 * ending in a colon. The colon may be ASCII ":" or fullwidth "：" (U+FF1A):
 * the previous character class listed the ASCII colon twice, so labels typed
 * with the fullwidth colon were not recognized.
 *
 * @param text - raw cell text.
 * @returns true when the text is classified as a label.
 */
function isLabelCell(text) {
  const trimmed = text.trim();
  if (!trimmed || trimmed.length > 30) return false;
  for (const kw of LABEL_KEYWORDS) {
    if (trimmed.includes(kw)) return true;
  }
  if (/^[가-힣\s()·:]{2,8}$/.test(trimmed) && !/\d/.test(trimmed)) return true;
  if (/^[가-힣A-Za-z\s]+[:\uFF1A]$/.test(trimmed)) return true;
  return false;
}
1539
/**
 * Collects label/value fields from a block list: first from every table block
 * (via extractFromTable), then from every non-empty paragraph (via
 * extractInlineFields).
 *
 * @returns `{ fields, confidence }` — confidence is the share of tables that
 *   yielded fields; with no tables it is 0.3 if any field was found, else 0.
 */
function extractFormFields(blocks) {
  const tableBlocks = blocks.filter((block) => block.type === "table" && block.table);
  const fieldsPerTable = tableBlocks.map((block) => extractFromTable(block.table));
  const productiveTables = fieldsPerTable.filter((tableFields) => tableFields.length > 0);

  const inlineFields = blocks
    .filter((block) => block.type === "paragraph" && block.text)
    .flatMap((block) => extractInlineFields(block.text));

  const fields = [...productiveTables.flat(), ...inlineFields];

  let confidence;
  if (tableBlocks.length > 0) {
    confidence = productiveTables.length / tableBlocks.length;
  } else {
    confidence = fields.length > 0 ? 0.3 : 0;
  }
  return { fields, confidence: Math.min(confidence, 1) };
}
1561
/**
 * Extracts label/value fields from one table.
 *
 * Strategy 1 (tables with >= 2 columns): every cell that isLabelCell() accepts
 * and whose right-hand neighbour is non-blank becomes a field; a trailing colon
 * is removed from the label. The colon may be ASCII ":" or fullwidth "："
 * (U+FF1A) — the previous pattern listed the ASCII colon twice and left the
 * fullwidth one on the label.
 *
 * Strategy 2 (only when strategy 1 found nothing, table >= 2x2): if every
 * header cell is 1-20 chars long, each non-blank cell below becomes a field
 * labelled by its column header.
 *
 * Missing cells in ragged rows are treated as empty text instead of throwing.
 *
 * @param table - `{ rows, cols, cells }` where cells[r][c] has a `text` string.
 * @returns array of `{ label, value, row, col }`.
 */
function extractFromTable(table) {
  const fields = [];
  const textAt = (r, c) => table.cells[r]?.[c]?.text ?? "";

  if (table.cols >= 2) {
    for (let r = 0; r < table.rows; r++) {
      for (let c = 0; c < table.cols - 1; c++) {
        const labelText = textAt(r, c);
        const valueText = textAt(r, c + 1).trim();
        if (isLabelCell(labelText) && valueText) {
          fields.push({
            label: labelText.trim().replace(/[:\uFF1A]\s*$/, ""),
            value: valueText,
            row: r,
            col: c
          });
        }
      }
    }
  }

  if (fields.length === 0 && table.rows >= 2 && table.cols >= 2) {
    const headerRow = table.cells[0];
    if (!headerRow) return fields;
    const allLabels = headerRow.every((cell) => {
      const t = cell.text.trim();
      return t.length > 0 && t.length <= 20;
    });
    if (allLabels) {
      for (let r = 1; r < table.rows; r++) {
        for (let c = 0; c < table.cols; c++) {
          const label = textAt(0, c).trim();
          const value = textAt(r, c).trim();
          if (label && value) {
            fields.push({ label, value, row: r, col: c });
          }
        }
      }
    }
  }
  return fields;
}
1599
/**
 * Finds inline `label: value` pairs in free text.
 *
 * A label is 2-10 Hangul/Latin letters, followed by a colon, followed by up to
 * 100 characters that are not a newline, comma or semicolon. The colon may be
 * ASCII ":" or fullwidth "：" (U+FF1A); the previous character class listed the
 * ASCII colon twice, so fullwidth-colon fields were missed.
 *
 * @param text - paragraph text to scan.
 * @returns array of `{ label, value, row: -1, col: -1 }` in order of appearance.
 */
function extractInlineFields(text) {
  const pattern = /([가-힣A-Za-z]{2,10})\s*[:\uFF1A]\s*([^\n,;]{1,100})/g;
  const fields = [];
  for (const match of text.matchAll(pattern)) {
    const label = match[1].trim();
    const value = match[2].trim();
    if (value) {
      fields.push({ label, value, row: -1, col: -1 });
    }
  }
  return fields;
}
1612
+
1613
+ // src/hwpx/generator.ts
1614
+ var import_jszip2 = __toESM(require("jszip"), 1);
1615
+ var HWPML_NS = "http://www.hancom.co.kr/hwpml/2016/HwpMl";
1616
/**
 * Builds a zip archive from Markdown: a stored (uncompressed) `mimetype`
 * entry, a manifest at `Contents/content.hpf`, and the converted body at
 * `Contents/section0.xml`.
 *
 * @param markdown - Markdown source text.
 * @returns the archive as produced by `generateAsync({ type: "arraybuffer" })`.
 */
async function markdownToHwpx(markdown) {
  const parsedBlocks = parseMarkdownToBlocks(markdown);
  const sectionXml = blocksToSectionXml(parsedBlocks);
  const JSZipCtor = import_jszip2.default;
  const archive = new JSZipCtor();
  archive.file("mimetype", "application/hwp+zip", { compression: "STORE" });
  const manifestXml = generateManifest();
  archive.file("Contents/content.hpf", manifestXml);
  archive.file("Contents/section0.xml", sectionXml);
  const bytes = await archive.generateAsync({ type: "arraybuffer" });
  return bytes;
}
1625
/**
 * Parses a small Markdown subset into a flat list of blocks.
 *
 * Recognized constructs:
 *  - ATX headings (`#` to `######`)  -> `{ type: "heading", text, level }`
 *  - pipe tables                     -> `{ type: "table", rows: string[][] }`
 *  - any other non-blank line        -> `{ type: "paragraph", text }`
 *
 * Blank lines are skipped. Both LF and CRLF line endings are accepted.
 *
 * @param {string} md - Markdown source text.
 * @returns {Array<object>} Blocks in document order.
 */
function parseMarkdownToBlocks(md) {
  // Split on CRLF as well as LF: a trailing "\r" would otherwise stop the
  // heading pattern from matching, since "." does not match "\r".
  const lines = md.split(/\r?\n/);
  const blocks = [];
  let i = 0;
  while (i < lines.length) {
    const line = lines[i];
    if (!line.trim()) {
      i++;
      continue;
    }
    const headingMatch = line.match(/^(#{1,6})\s+(.+)$/);
    if (headingMatch) {
      blocks.push({ type: "heading", text: headingMatch[2].trim(), level: headingMatch[1].length });
      i++;
      continue;
    }
    if (line.trimStart().startsWith("|")) {
      const tableRows = [];
      while (i < lines.length && lines[i].trimStart().startsWith("|")) {
        // Work on the trimmed row so indented tables behave like unindented ones.
        const row = lines[i].trim();
        i++;
        // Delimiter row: only pipes, colons, dashes and whitespace, with a
        // run of at least three dashes (e.g. "|---|:---:|").
        const isSeparator = row.includes("---") && /^[\s|:\-]+$/.test(row);
        if (isSeparator) {
          continue;
        }
        const parts = row.split("|");
        parts.shift(); // text before the leading pipe (always empty here)
        // Drop the empty tail only when the row really ends with a pipe, so
        // "| a | b" keeps its last cell.
        if (row.endsWith("|")) {
          parts.pop();
        }
        const cells = parts.map((c) => c.trim());
        if (cells.length > 0) tableRows.push(cells);
      }
      if (tableRows.length > 0) {
        blocks.push({ type: "table", rows: tableRows });
      }
      continue;
    }
    blocks.push({ type: "paragraph", text: line.trim() });
    i++;
  }
  return blocks;
}
1663
/**
 * Makes a string safe to embed in XML text content or attribute values.
 *
 * Characters that XML 1.0 forbids outright (C0 controls other than tab, LF
 * and CR, plus U+FFFE/U+FFFF) are removed, because they cannot be escaped
 * and would make the whole document malformed. The five predefined entities
 * are then escaped.
 *
 * @param {string} text - Raw text.
 * @returns {string} Escaped text.
 */
function escapeXml(text) {
  return text
    .replace(/[\u0000-\u0008\u000B\u000C\u000E-\u001F\uFFFE\uFFFF]/g, "")
    // "&" must be escaped first so later entities are not double-escaped.
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&apos;");
}
1666
/**
 * Wraps text in a paragraph element containing a single run.
 *
 * @param {string} text - Raw (unescaped) paragraph text.
 * @returns {string} The `<hp:p>` element markup.
 */
function generateParagraph(text) {
  const safeText = escapeXml(text);
  return "<hp:p><hp:run><hp:t>" + safeText + "</hp:t></hp:run></hp:p>";
}
1669
/**
 * Renders a grid of cell texts as a table element.
 *
 * Every cell is emitted with a 1x1 span and its text wrapped in a paragraph.
 *
 * @param {string[][]} rows - Table rows, each an array of cell texts.
 * @returns {string} The `<hp:tbl>` element markup.
 */
function generateTable(rows) {
  let tableBody = "";
  for (const row of rows) {
    let rowBody = "";
    for (const cell of row) {
      rowBody += `<hp:tc><hp:cellSpan colSpan="1" rowSpan="1"/>${generateParagraph(cell)}</hp:tc>`;
    }
    tableBody += `<hp:tr>${rowBody}</hp:tr>`;
  }
  return `<hp:tbl>${tableBody}</hp:tbl>`;
}
1678
/**
 * Serializes parsed blocks into the section XML document.
 *
 * Headings and paragraphs are both rendered as plain paragraphs (the heading
 * level is not used), tables are rendered via `generateTable`, and any other
 * block type contributes an empty string.
 *
 * @param {Array<object>} blocks - Blocks with a `type` and `text` or `rows`.
 * @returns {string} The complete section XML, including the XML declaration.
 */
function blocksToSectionXml(blocks) {
  const renderBlock = (block) => {
    if (block.type === "heading" || block.type === "paragraph") {
      return generateParagraph(block.text || "");
    }
    if (block.type === "table") {
      // A table block without rows renders as nothing.
      return block.rows ? generateTable(block.rows) : "";
    }
    return "";
  };
  const body = blocks.map(renderBlock).join("\n ");
  return `<?xml version="1.0" encoding="UTF-8"?>
<hs:sec xmlns:hs="${HWPML_NS}" xmlns:hp="${HWPML_NS}">
${body}
</hs:sec>`;
}
1696
/**
 * Builds the package manifest stored at `Contents/content.hpf`.
 *
 * The manifest declares one item (`section0.xml`) and lists it in the spine.
 *
 * @returns {string} The manifest XML, including the XML declaration.
 */
function generateManifest() {
  const manifestLines = [
    `<?xml version="1.0" encoding="UTF-8"?>`,
    `<opf:package xmlns:opf="http://www.idpf.org/2007/opf">`,
    `<opf:manifest>`,
    `<opf:item id="s0" href="section0.xml" media-type="application/xml"/>`,
    `</opf:manifest>`,
    `<opf:spine>`,
    `<opf:itemref idref="s0"/>`,
    `</opf:spine>`,
    `</opf:package>`,
  ];
  return manifestLines.join("\n");
}
1707
+
1107
1708
  // src/index.ts
1108
/**
 * Detects the format of a document buffer and dispatches to the matching parser.
 *
 * @param {ArrayBuffer} buffer - Raw document bytes.
 * @param {object} [options] - Parser options, passed through unchanged.
 * @returns {Promise<object>} The parser's result, or a failure result with
 *   code "EMPTY_INPUT" (missing or zero-length buffer) or
 *   "UNSUPPORTED_FORMAT" (format not recognized).
 */
async function parse(buffer, options) {
  const isEmpty = !buffer || buffer.byteLength === 0;
  if (isEmpty) {
    return { success: false, fileType: "unknown", error: "\uBE48 \uBC84\uD37C\uC774\uAC70\uB098 \uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uC785\uB825\uC785\uB2C8\uB2E4.", code: "EMPTY_INPUT" };
  }

  const format = detectFormat(buffer);
  if (format === "hwpx") {
    return parseHwpx(buffer, options);
  }
  if (format === "hwp") {
    return parseHwp(buffer, options);
  }
  if (format === "pdf") {
    return parsePdf(buffer, options);
  }
  return { success: false, fileType: "unknown", error: "\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD30C\uC77C \uD615\uC2DD\uC785\uB2C8\uB2E4.", code: "UNSUPPORTED_FORMAT" };
}
1124
/**
 * Parses an HWPX document.
 *
 * Errors thrown by the underlying parser are converted into a failure result
 * whose `code` comes from `classifyError`.
 *
 * @param {ArrayBuffer} buffer - Raw HWPX bytes.
 * @param {object} [options] - Parser options, passed through unchanged.
 * @returns {Promise<object>} A success result with `markdown`, `blocks` and
 *   `metadata`, or a failure result with `error` and `code`.
 */
async function parseHwpx(buffer, options) {
  try {
    const { markdown, blocks, metadata } = await parseHwpxDocument(buffer, options);
    const result = { success: true, fileType: "hwpx", markdown, blocks, metadata };
    return result;
  } catch (failure) {
    // Non-Error throwables carry no message, so fall back to a fixed text.
    const message = failure instanceof Error ? failure.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328";
    return { success: false, fileType: "hwpx", error: message, code: classifyError(failure) };
  }
}
1132
/**
 * Parses a legacy HWP document.
 *
 * The input is copied into a Node.js `Buffer` before it is handed to the
 * parser. Errors thrown while parsing are converted into a failure result
 * whose `code` comes from `classifyError`.
 *
 * @param {ArrayBuffer} buffer - Raw HWP bytes.
 * @param {object} [options] - Parser options, passed through unchanged.
 * @returns {Promise<object>} A success result with `markdown`, `blocks` and
 *   `metadata`, or a failure result with `error` and `code`.
 */
async function parseHwp(buffer, options) {
  try {
    const nodeBuffer = Buffer.from(buffer);
    const { markdown, blocks, metadata } = parseHwp5Document(nodeBuffer, options);
    const result = { success: true, fileType: "hwp", markdown, blocks, metadata };
    return result;
  } catch (failure) {
    // Non-Error throwables carry no message, so fall back to a fixed text.
    const message = failure instanceof Error ? failure.message : "HWP \uD30C\uC2F1 \uC2E4\uD328";
    return { success: false, fileType: "hwp", error: message, code: classifyError(failure) };
  }
}
1140
/**
 * Parses a PDF document.
 *
 * The underlying parser's result is returned as-is. Errors it throws or
 * rejects with are converted into a failure result whose `code` comes from
 * `classifyError`.
 *
 * @param {ArrayBuffer} buffer - Raw PDF bytes.
 * @param {object} [options] - Parser options, passed through unchanged.
 * @returns {Promise<object>} The parser's result, or a failure result with
 *   `error` and `code`.
 */
async function parsePdf(buffer, options) {
  try {
    // Await inside the try block so rejections reach the catch below.
    const result = await parsePdfDocument(buffer, options);
    return result;
  } catch (failure) {
    // Non-Error throwables carry no message, so fall back to a fixed text.
    const message = failure instanceof Error ? failure.message : "PDF \uD30C\uC2F1 \uC2E4\uD328";
    return { success: false, fileType: "pdf", error: message, code: classifyError(failure) };
  }
}
1147
1748
  // Annotate the CommonJS export names for ESM import in node:
1148
1749
  0 && (module.exports = {
1149
1750
  VERSION,
1751
+ blocksToMarkdown,
1752
+ compare,
1150
1753
  detectFormat,
1754
+ diffBlocks,
1755
+ extractFormFields,
1151
1756
  isHwpxFile,
1152
1757
  isOldHwpFile,
1153
1758
  isPdfFile,
1759
+ markdownToHwpx,
1154
1760
  parse,
1155
1761
  parseHwp,
1156
1762
  parseHwpx,