kordoc 1.2.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +111 -118
- package/dist/{chunk-4BKNDXGU.js → chunk-BWZW234S.js} +595 -86
- package/dist/chunk-BWZW234S.js.map +1 -0
- package/dist/cli.js +15 -3
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +665 -59
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +163 -6
- package/dist/index.d.ts +163 -6
- package/dist/index.js +667 -58
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +216 -13
- package/dist/mcp.js.map +1 -1
- package/dist/provider-JB7SY74K.js +38 -0
- package/dist/provider-JB7SY74K.js.map +1 -0
- package/dist/watch-LIGKH3QS.js +90 -0
- package/dist/watch-LIGKH3QS.js.map +1 -0
- package/package.json +1 -1
- package/dist/chunk-4BKNDXGU.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -1,3 +1,55 @@
|
|
|
1
|
+
// Bundler runtime helpers used to emulate lazily-initialised ES modules.
var __defProp = Object.defineProperty;
var __getOwnPropNames = Object.getOwnPropertyNames;

/**
 * Wraps a single-entry object `{ "path"() { ... } }` into a run-once initialiser.
 * The first call runs the entry and remembers its return value; later calls
 * just return the remembered value.
 */
var __esm = (fn, res) => function __init() {
  if (fn) {
    const initializer = fn[__getOwnPropNames(fn)[0]];
    // Clear `fn` before invoking so a re-entrant call does not run it again.
    fn = 0;
    res = initializer(0);
  }
  return res;
};

/**
 * Defines an enumerable getter on `target` for every enumerable key of `all`,
 * using the function stored under that key as the getter.
 */
var __export = (target, all) => {
  for (var exportName in all) {
    __defProp(target, exportName, { get: all[exportName], enumerable: true });
  }
};
|
|
10
|
+
|
|
11
|
+
// src/ocr/provider.ts
|
|
12
|
+
var provider_exports = {};
|
|
13
|
+
__export(provider_exports, {
|
|
14
|
+
ocrPages: () => ocrPages
|
|
15
|
+
});
|
|
16
|
+
/**
 * Runs the OCR provider over the selected pages and collects the recognised
 * text as paragraph blocks.
 *
 * @param doc Document object exposing `getPage(pageNumber)`.
 * @param provider Async callback `(imageData, pageNumber, mimeType) => text`.
 * @param pageFilter Optional set of 1-based page numbers; when falsy every page is processed.
 * @param effectivePageCount Highest page number to visit.
 * @returns Array of `{ type: "paragraph", text }` blocks, one per page that produced text.
 */
async function ocrPages(doc, provider, pageFilter, effectivePageCount) {
  const collected = [];
  for (let pageNo = 1; pageNo <= effectivePageCount; pageNo++) {
    const wanted = !pageFilter || pageFilter.has(pageNo);
    if (!wanted) continue;
    const page = await doc.getPage(pageNo);
    try {
      const png = await renderPageToPng(page);
      const recognized = await provider(png, pageNo, "image/png");
      const cleaned = recognized.trim();
      if (cleaned) {
        collected.push({ type: "paragraph", text: cleaned });
      }
    } catch {
      // Best effort: a page that fails to render or recognise is skipped.
    }
  }
  return collected;
}
|
|
32
|
+
/**
 * Renders one page at 2x scale onto a canvas from the optional `canvas`
 * package and returns the PNG bytes.
 *
 * @param page Page object exposing `getViewport({ scale })` and `render(...)`.
 * @returns Uint8Array holding the PNG-encoded page image.
 * @throws Error when the `canvas` package cannot be imported.
 */
async function renderPageToPng(page) {
  let createCanvas;
  try {
    ({ createCanvas } = await import("canvas"));
  } catch {
    throw new Error("OCR\uC744 \uC0AC\uC6A9\uD558\uB824\uBA74 'canvas' \uD328\uD0A4\uC9C0\uB97C \uC124\uCE58\uD558\uC138\uC694: npm install canvas");
  }
  const viewport = page.getViewport({ scale: 2 });
  const width = Math.floor(viewport.width);
  const height = Math.floor(viewport.height);
  const canvas = createCanvas(width, height);
  const canvasContext = canvas.getContext("2d");
  await page.render({ canvasContext, viewport }).promise;
  const png = canvas.toBuffer("image/png");
  return new Uint8Array(png);
}
|
|
47
|
+
var init_provider = __esm({
|
|
48
|
+
"src/ocr/provider.ts"() {
|
|
49
|
+
"use strict";
|
|
50
|
+
}
|
|
51
|
+
});
|
|
52
|
+
|
|
1
53
|
// src/detect.ts
|
|
2
54
|
function magicBytes(buffer) {
|
|
3
55
|
return new Uint8Array(buffer, 0, Math.min(4, buffer.byteLength));
|
|
@@ -157,7 +209,7 @@ function tableToMarkdown(table) {
|
|
|
157
209
|
}
|
|
158
210
|
|
|
159
211
|
// src/utils.ts
|
|
160
|
-
var VERSION = true ? "1.
|
|
212
|
+
var VERSION = true ? "1.4.0" : "0.0.0-dev";
|
|
161
213
|
var KordocError = class extends Error {
|
|
162
214
|
constructor(message) {
|
|
163
215
|
super(message);
|
|
@@ -168,6 +220,47 @@ function isPathTraversal(name) {
|
|
|
168
220
|
const normalized = name.replace(/\\/g, "/");
|
|
169
221
|
return normalized.includes("..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
|
|
170
222
|
}
|
|
223
|
+
/**
 * Maps an error to a coarse error code by looking for known fragments in its
 * message. Rules are checked in order and the first match wins.
 *
 * @param err Any thrown value.
 * @returns One of "ENCRYPTED", "DRM_PROTECTED", "ZIP_BOMB", "DECOMPRESSION_BOMB",
 *   "IMAGE_BASED_PDF", "NO_SECTIONS", "CORRUPTED" or "PARSE_ERROR" (the default,
 *   also used for non-Error values).
 */
function classifyError(err) {
  if (!(err instanceof Error)) return "PARSE_ERROR";
  const msg = err.message;
  const mentions = (...needles) => needles.some((needle) => msg.includes(needle));
  // Order matters: the ZIP-specific rule must run before the generic "bomb" rule.
  const rules = [
    ["ENCRYPTED", () => mentions("\uC554\uD638\uD654")],
    ["DRM_PROTECTED", () => mentions("DRM")],
    ["ZIP_BOMB", () => mentions("ZIP bomb", "ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC", "ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")],
    ["DECOMPRESSION_BOMB", () => mentions("bomb", "\uD06C\uAE30 \uCD08\uACFC", "\uC555\uCD95 \uD574\uC81C")],
    ["IMAGE_BASED_PDF", () => mentions("\uC774\uBBF8\uC9C0 \uAE30\uBC18")],
    ["NO_SECTIONS", () => mentions("\uC139\uC158") && mentions("\uCC3E\uC744 \uC218 \uC5C6", "\uC5C6\uC74C")],
    ["CORRUPTED", () => mentions("\uC2DC\uADF8\uB2C8\uCC98", "\uBCF5\uAD6C\uD560 \uC218 \uC5C6")],
  ];
  for (const [code, matches] of rules) {
    if (matches()) return code;
  }
  return "PARSE_ERROR";
}
|
|
235
|
+
|
|
236
|
+
// src/page-range.ts
|
|
237
|
+
/**
 * Converts a page selection into a set of 1-based page numbers within
 * `[1, maxPages]`.
 *
 * @param spec Either an array of numbers (each rounded to the nearest integer)
 *   or a comma-separated string of single pages and `start-end` ranges, e.g. "1-3, 7".
 * @param maxPages Highest valid page number; values outside the range are ignored
 *   and ranges are clamped to it.
 * @returns Set of selected page numbers (empty for unsupported or blank specs).
 */
function parsePageRange(spec, maxPages) {
  const selected = /* @__PURE__ */ new Set();
  if (maxPages <= 0) return selected;

  // NaN fails both comparisons, so unparsable values are dropped here as well.
  const accept = (pageNo) => {
    if (pageNo >= 1 && pageNo <= maxPages) selected.add(pageNo);
  };

  if (Array.isArray(spec)) {
    for (const raw of spec) accept(Math.round(raw));
    return selected;
  }
  if (typeof spec !== "string" || spec.trim() === "") return selected;

  for (const piece of spec.split(",")) {
    const token = piece.trim();
    if (token === "") continue;
    const bounds = token.match(/^(\d+)\s*-\s*(\d+)$/);
    if (!bounds) {
      accept(Number.parseInt(token, 10));
      continue;
    }
    const first = Math.max(1, Number.parseInt(bounds[1], 10));
    const last = Math.min(maxPages, Number.parseInt(bounds[2], 10));
    for (let pageNo = first; pageNo <= last; pageNo++) selected.add(pageNo);
  }
  return selected;
}
|
|
171
264
|
|
|
172
265
|
// src/hwpx/parser.ts
|
|
173
266
|
var MAX_DECOMPRESS_SIZE = 100 * 1024 * 1024;
|
|
@@ -178,7 +271,7 @@ function clampSpan(val, max) {
|
|
|
178
271
|
/**
 * Removes every `<!DOCTYPE ...>` declaration (including an optional internal
 * subset in square brackets) from an XML string.
 *
 * @param xml XML source text.
 * @returns The text with DOCTYPE declarations removed.
 */
function stripDtd(xml) {
  const doctypeDeclaration = /<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi;
  const withoutDoctype = xml.replace(doctypeDeclaration, "");
  return withoutDoctype;
}
|
|
181
|
-
async function parseHwpxDocument(buffer) {
|
|
274
|
+
async function parseHwpxDocument(buffer, options) {
|
|
182
275
|
const precheck = precheckZipSize(buffer);
|
|
183
276
|
if (precheck.totalUncompressed > MAX_DECOMPRESS_SIZE) {
|
|
184
277
|
throw new KordocError("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
@@ -196,19 +289,62 @@ async function parseHwpxDocument(buffer) {
|
|
|
196
289
|
if (actualEntryCount > MAX_ZIP_ENTRIES) {
|
|
197
290
|
throw new KordocError("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
198
291
|
}
|
|
292
|
+
const metadata = {};
|
|
293
|
+
await extractHwpxMetadata(zip, metadata);
|
|
199
294
|
const sectionPaths = await resolveSectionPaths(zip);
|
|
200
295
|
if (sectionPaths.length === 0) throw new KordocError("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
296
|
+
metadata.pageCount = sectionPaths.length;
|
|
297
|
+
const pageFilter = options?.pages ? parsePageRange(options.pages, sectionPaths.length) : null;
|
|
201
298
|
let totalDecompressed = 0;
|
|
202
299
|
const blocks = [];
|
|
203
|
-
for (
|
|
204
|
-
|
|
300
|
+
for (let si = 0; si < sectionPaths.length; si++) {
|
|
301
|
+
if (pageFilter && !pageFilter.has(si + 1)) continue;
|
|
302
|
+
const file = zip.file(sectionPaths[si]);
|
|
205
303
|
if (!file) continue;
|
|
206
304
|
const xml = await file.async("text");
|
|
207
305
|
totalDecompressed += xml.length * 2;
|
|
208
306
|
if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
209
307
|
blocks.push(...parseSectionXml(xml));
|
|
210
308
|
}
|
|
211
|
-
|
|
309
|
+
const markdown = blocksToMarkdown(blocks);
|
|
310
|
+
return { markdown, blocks, metadata };
|
|
311
|
+
}
|
|
312
|
+
/**
 * Fills `metadata` from the first metadata file in the archive that yields a
 * title or an author. Candidate paths are tried in a fixed order, first by
 * exact name and then case-insensitively.
 *
 * @param zip Archive object exposing `file(path)` and a `files` map of entries.
 * @param metadata Object mutated in place with whatever fields are found.
 */
async function extractHwpxMetadata(zip, metadata) {
  const candidates = ["meta.xml", "META-INF/meta.xml", "docProps/core.xml"];
  const locate = (wanted) => {
    const exact = zip.file(wanted);
    if (exact) return exact;
    const lowered = wanted.toLowerCase();
    const loose = Object.values(zip.files).find((entry) => entry.name.toLowerCase() === lowered);
    return loose || null;
  };
  try {
    for (const candidate of candidates) {
      const entry = locate(candidate);
      if (!entry) continue;
      const xml = await entry.async("text");
      parseDublinCoreMetadata(xml, metadata);
      if (metadata.title || metadata.author) return;
    }
  } catch {
    // Metadata is optional: any failure leaves `metadata` as far as it got.
  }
}
|
|
325
|
+
/**
 * Parses a metadata XML document and copies title, author, description,
 * creation/modification dates and keywords into `metadata`. Fields that are
 * already truthy on `metadata` are kept.
 *
 * @param xml Metadata XML text (DOCTYPE declarations are stripped before parsing).
 * @param metadata Object mutated in place.
 */
function parseDublinCoreMetadata(xml, metadata) {
  const doc = new DOMParser().parseFromString(stripDtd(xml), "text/xml");
  if (!doc.documentElement) return;

  // Returns the trimmed text of the first non-empty element among the given tag names.
  const firstText = (tagNames) => {
    for (const tag of tagNames) {
      const matches = doc.getElementsByTagName(tag);
      if (matches.length === 0) continue;
      const text = matches[0].textContent?.trim();
      if (text) return text;
    }
    return void 0;
  };

  const fieldTags = [
    ["title", ["dc:title", "title"]],
    ["author", ["dc:creator", "creator", "cp:lastModifiedBy"]],
    ["description", ["dc:description", "description", "dc:subject", "subject"]],
    ["createdAt", ["dcterms:created", "meta:creation-date"]],
    ["modifiedAt", ["dcterms:modified", "meta:date"]],
  ];
  for (const [field, tags] of fieldTags) {
    metadata[field] = metadata[field] || firstText(tags);
  }

  const keywordText = firstText(["dc:keyword", "cp:keywords", "meta:keyword"]);
  if (keywordText && !metadata.keywords) {
    metadata.keywords = keywordText
      .split(/[,;]/)
      .map((keyword) => keyword.trim())
      .filter(Boolean);
  }
}
|
|
213
349
|
function precheckZipSize(buffer) {
|
|
214
350
|
try {
|
|
@@ -247,7 +383,7 @@ function extractFromBrokenZip(buffer) {
|
|
|
247
383
|
const data = new Uint8Array(buffer);
|
|
248
384
|
const view = new DataView(buffer);
|
|
249
385
|
let pos = 0;
|
|
250
|
-
const
|
|
386
|
+
const blocks = [];
|
|
251
387
|
let totalDecompressed = 0;
|
|
252
388
|
let entryCount = 0;
|
|
253
389
|
while (pos < data.length - 30) {
|
|
@@ -288,14 +424,14 @@ function extractFromBrokenZip(buffer) {
|
|
|
288
424
|
}
|
|
289
425
|
totalDecompressed += content.length * 2;
|
|
290
426
|
if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new KordocError("\uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC");
|
|
291
|
-
|
|
292
|
-
if (sectionText) texts.push(sectionText);
|
|
427
|
+
blocks.push(...parseSectionXml(content));
|
|
293
428
|
} catch {
|
|
294
429
|
continue;
|
|
295
430
|
}
|
|
296
431
|
}
|
|
297
|
-
if (
|
|
298
|
-
|
|
432
|
+
if (blocks.length === 0) throw new KordocError("\uC190\uC0C1\uB41C HWPX\uC5D0\uC11C \uC139\uC158 \uB370\uC774\uD130\uB97C \uBCF5\uAD6C\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
433
|
+
const markdown = blocksToMarkdown(blocks);
|
|
434
|
+
return { markdown, blocks };
|
|
299
435
|
}
|
|
300
436
|
async function resolveSectionPaths(zip) {
|
|
301
437
|
const manifestPaths = ["Contents/content.hpf", "content.hpf"];
|
|
@@ -567,7 +703,7 @@ var require2 = createRequire(import.meta.url);
|
|
|
567
703
|
var CFB = require2("cfb");
|
|
568
704
|
var MAX_SECTIONS = 100;
|
|
569
705
|
var MAX_TOTAL_DECOMPRESS = 100 * 1024 * 1024;
|
|
570
|
-
function parseHwp5Document(buffer) {
|
|
706
|
+
function parseHwp5Document(buffer, options) {
|
|
571
707
|
const cfb = CFB.parse(buffer);
|
|
572
708
|
const headerEntry = CFB.find(cfb, "/FileHeader");
|
|
573
709
|
if (!headerEntry?.content) throw new KordocError("FileHeader \uC2A4\uD2B8\uB9BC \uC5C6\uC74C");
|
|
@@ -576,18 +712,59 @@ function parseHwp5Document(buffer) {
|
|
|
576
712
|
if (header.flags & FLAG_ENCRYPTED) throw new KordocError("\uC554\uD638\uD654\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
|
|
577
713
|
if (header.flags & FLAG_DRM) throw new KordocError("DRM \uBCF4\uD638\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
|
|
578
714
|
const compressed = (header.flags & FLAG_COMPRESSED) !== 0;
|
|
715
|
+
const metadata = {
|
|
716
|
+
version: `${header.versionMajor}.x`
|
|
717
|
+
};
|
|
718
|
+
extractHwp5Metadata(cfb, metadata);
|
|
579
719
|
const sections = findSections(cfb);
|
|
580
720
|
if (sections.length === 0) throw new KordocError("\uC139\uC158 \uC2A4\uD2B8\uB9BC\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
721
|
+
metadata.pageCount = sections.length;
|
|
722
|
+
const pageFilter = options?.pages ? parsePageRange(options.pages, sections.length) : null;
|
|
581
723
|
const blocks = [];
|
|
582
724
|
let totalDecompressed = 0;
|
|
583
|
-
for (
|
|
725
|
+
for (let si = 0; si < sections.length; si++) {
|
|
726
|
+
if (pageFilter && !pageFilter.has(si + 1)) continue;
|
|
727
|
+
const sectionData = sections[si];
|
|
584
728
|
const data = compressed ? decompressStream(Buffer.from(sectionData)) : Buffer.from(sectionData);
|
|
585
729
|
totalDecompressed += data.length;
|
|
586
730
|
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
587
731
|
const records = readRecords(data);
|
|
588
732
|
blocks.push(...parseSection(records));
|
|
589
733
|
}
|
|
590
|
-
|
|
734
|
+
const markdown = blocksToMarkdown(blocks);
|
|
735
|
+
return { markdown, blocks, metadata };
|
|
736
|
+
}
|
|
737
|
+
/**
 * Reads title, author and description from the summary-information stream of
 * a compound file and stores them on `metadata`. Any parsing failure is
 * ignored and leaves `metadata` as far as it got.
 *
 * @param cfb Parsed compound-file container passed to `CFB.find`.
 * @param metadata Object mutated in place.
 */
function extractHwp5Metadata(cfb, metadata) {
  // Property ids that are extracted, and the metadata field each one fills.
  const fieldByPropId = new Map([
    [2, "title"],
    [4, "author"],
    [6, "description"],
  ]);
  // Only values tagged with type 30 are read
  // (presumably the byte-string type of the OLE property-set format; confirm).
  const STRING_PROP_TYPE = 30;
  try {
    const stream = CFB.find(cfb, "/HwpSummaryInformation") || CFB.find(cfb, "/SummaryInformation");
    if (!stream?.content) return;
    const bytes = Buffer.from(stream.content);
    if (bytes.length < 48) return;
    if (bytes.readUInt32LE(24) === 0) return;
    const setStart = bytes.readUInt32LE(44);
    if (setStart >= bytes.length - 8) return;
    const propCount = bytes.readUInt32LE(setStart + 4);
    if (propCount === 0 || propCount > 100) return;
    for (let index = 0; index < propCount; index++) {
      const slot = setStart + 8 + index * 8;
      if (slot + 8 > bytes.length) break;
      const propId = bytes.readUInt32LE(slot);
      // Value offsets are stored relative to the start of the property set.
      const valueStart = setStart + bytes.readUInt32LE(slot + 4);
      if (valueStart + 8 > bytes.length) continue;
      const field = fieldByPropId.get(propId);
      if (field === void 0) continue;
      if (bytes.readUInt32LE(valueStart) !== STRING_PROP_TYPE) continue;
      const byteLen = bytes.readUInt32LE(valueStart + 4);
      const valueEnd = valueStart + 8 + byteLen;
      if (byteLen === 0 || byteLen > 1e4 || valueEnd > bytes.length) continue;
      const text = bytes.subarray(valueStart + 8, valueEnd).toString("utf8").replace(/\0+$/, "").trim();
      if (text) metadata[field] = text;
    }
  } catch {
    // Metadata is optional; malformed streams are ignored.
  }
}
|
|
592
769
|
function findSections(cfb) {
|
|
593
770
|
const sections = [];
|
|
@@ -727,33 +904,30 @@ function arrangeCells(rows, cols, cells) {
|
|
|
727
904
|
return grid.map((row) => row.map((c) => c || { text: "", colSpan: 1, rowSpan: 1 }));
|
|
728
905
|
}
|
|
729
906
|
|
|
907
|
+
// src/pdf/polyfill.ts
|
|
908
|
+
import * as pdfjsWorker from "pdfjs-dist/legacy/build/pdf.worker.mjs";
|
|
909
|
+
// Install minimal stand-ins for browser globals when the runtime lacks them.
var g = globalThis;
if (typeof g.DOMMatrix === "undefined") {
  g.DOMMatrix = class DOMMatrix {
    // Six-element matrix; defaults to the identity when no `init` is given.
    m;
    constructor(init) {
      this.m = init ? init : [1, 0, 0, 1, 0, 0];
    }
  };
}
if (typeof g.Path2D === "undefined") {
  // Empty placeholder class.
  g.Path2D = class Path2D {};
}
|
|
922
|
+
g.pdfjsWorker = pdfjsWorker;
|
|
923
|
+
|
|
730
924
|
// src/pdf/parser.ts
|
|
731
|
-
import {
|
|
732
|
-
|
|
925
|
+
import { getDocument, GlobalWorkerOptions } from "pdfjs-dist/legacy/build/pdf.mjs";
|
|
926
|
+
GlobalWorkerOptions.workerSrc = "";
|
|
733
927
|
var MAX_PAGES = 5e3;
|
|
734
928
|
var MAX_TOTAL_TEXT = 100 * 1024 * 1024;
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
if (pdfjsModule) return pdfjsModule;
|
|
738
|
-
try {
|
|
739
|
-
const mod = await import("pdfjs-dist/legacy/build/pdf.mjs");
|
|
740
|
-
const req = createRequire2(import.meta.url);
|
|
741
|
-
const workerPath = req.resolve("pdfjs-dist/legacy/build/pdf.worker.mjs");
|
|
742
|
-
mod.GlobalWorkerOptions.workerSrc = pathToFileURL(workerPath).href;
|
|
743
|
-
pdfjsModule = mod;
|
|
744
|
-
return mod;
|
|
745
|
-
} catch (err) {
|
|
746
|
-
const msg = err instanceof Error ? err.message : String(err);
|
|
747
|
-
if (msg.includes("Cannot find") || msg.includes("MODULE_NOT_FOUND")) return null;
|
|
748
|
-
throw new KordocError(`pdfjs-dist \uB85C\uB529 \uC2E4\uD328: ${msg}`);
|
|
749
|
-
}
|
|
750
|
-
}
|
|
751
|
-
async function parsePdfDocument(buffer) {
|
|
752
|
-
const pdfjs = await loadPdfjs();
|
|
753
|
-
if (!pdfjs) {
|
|
754
|
-
return { success: false, fileType: "pdf", pageCount: 0, error: "pdfjs-dist\uAC00 \uC124\uCE58\uB418\uC9C0 \uC54A\uC558\uC2B5\uB2C8\uB2E4. npm install pdfjs-dist" };
|
|
755
|
-
}
|
|
756
|
-
const doc = await pdfjs.getDocument({
|
|
929
|
+
async function parsePdfDocument(buffer, options) {
|
|
930
|
+
const doc = await getDocument({
|
|
757
931
|
data: new Uint8Array(buffer),
|
|
758
932
|
useSystemFonts: true,
|
|
759
933
|
disableFontFace: true,
|
|
@@ -761,12 +935,17 @@ async function parsePdfDocument(buffer) {
|
|
|
761
935
|
}).promise;
|
|
762
936
|
try {
|
|
763
937
|
const pageCount = doc.numPages;
|
|
764
|
-
if (pageCount === 0) return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4." };
|
|
938
|
+
if (pageCount === 0) return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.", blocks: [] };
|
|
939
|
+
const metadata = { pageCount };
|
|
940
|
+
await extractPdfMetadata(doc, metadata);
|
|
765
941
|
const pageTexts = [];
|
|
942
|
+
const blocks = [];
|
|
766
943
|
let totalChars = 0;
|
|
767
944
|
let totalTextBytes = 0;
|
|
768
945
|
const effectivePageCount = Math.min(pageCount, MAX_PAGES);
|
|
946
|
+
const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
|
|
769
947
|
for (let i = 1; i <= effectivePageCount; i++) {
|
|
948
|
+
if (pageFilter && !pageFilter.has(i)) continue;
|
|
770
949
|
const page = await doc.getPage(i);
|
|
771
950
|
const tc = await page.getTextContent();
|
|
772
951
|
const pageText = extractPageContent(tc.items);
|
|
@@ -774,18 +953,54 @@ async function parsePdfDocument(buffer) {
|
|
|
774
953
|
totalTextBytes += pageText.length * 2;
|
|
775
954
|
if (totalTextBytes > MAX_TOTAL_TEXT) throw new KordocError("\uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uD06C\uAE30 \uCD08\uACFC");
|
|
776
955
|
pageTexts.push(pageText);
|
|
956
|
+
blocks.push({ type: "paragraph", text: pageText });
|
|
777
957
|
}
|
|
778
|
-
|
|
779
|
-
|
|
958
|
+
const parsedPageCount = pageFilter ? pageFilter.size : effectivePageCount;
|
|
959
|
+
if (totalChars / Math.max(parsedPageCount, 1) < 10) {
|
|
960
|
+
if (options?.ocr) {
|
|
961
|
+
try {
|
|
962
|
+
const { ocrPages: ocrPages2 } = await Promise.resolve().then(() => (init_provider(), provider_exports));
|
|
963
|
+
const ocrBlocks = await ocrPages2(doc, options.ocr, pageFilter, effectivePageCount);
|
|
964
|
+
if (ocrBlocks.length > 0) {
|
|
965
|
+
const ocrMarkdown = ocrBlocks.map((b) => b.text || "").filter(Boolean).join("\n\n");
|
|
966
|
+
return { success: true, fileType: "pdf", markdown: ocrMarkdown, pageCount: parsedPageCount, blocks: ocrBlocks, metadata, isImageBased: true };
|
|
967
|
+
}
|
|
968
|
+
} catch {
|
|
969
|
+
}
|
|
970
|
+
}
|
|
971
|
+
return { success: false, fileType: "pdf", pageCount, isImageBased: true, error: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`, code: "IMAGE_BASED_PDF" };
|
|
780
972
|
}
|
|
781
973
|
let markdown = pageTexts.filter((t) => t.trim()).join("\n\n");
|
|
782
974
|
markdown = cleanPdfText(markdown);
|
|
783
|
-
return { success: true, fileType: "pdf", markdown, pageCount:
|
|
975
|
+
return { success: true, fileType: "pdf", markdown, pageCount: parsedPageCount, blocks, metadata };
|
|
784
976
|
} finally {
|
|
785
977
|
await doc.destroy().catch(() => {
|
|
786
978
|
});
|
|
787
979
|
}
|
|
788
980
|
}
|
|
981
|
+
/**
 * Copies document-information fields returned by `doc.getMetadata()` onto
 * `metadata`. Blank or non-string values are skipped; failures are ignored.
 *
 * @param doc Document object exposing an async `getMetadata()`.
 * @param metadata Object mutated in place.
 */
async function extractPdfMetadata(doc, metadata) {
  const trimmedOrUndefined = (value) =>
    typeof value === "string" && value.trim() ? value.trim() : void 0;
  const textFields = [
    ["Title", "title"],
    ["Author", "author"],
    ["Creator", "creator"],
    ["Subject", "description"],
  ];
  try {
    const result = await doc.getMetadata();
    const info = result?.info;
    if (!info) return;
    for (const [source, target] of textFields) {
      const text = trimmedOrUndefined(info[source]);
      if (text !== void 0) metadata[target] = text;
    }
    const keywordText = info.Keywords;
    if (typeof keywordText === "string" && keywordText.trim()) {
      metadata.keywords = keywordText
        .split(/[,;]/)
        .map((keyword) => keyword.trim())
        .filter(Boolean);
    }
    if (typeof info.CreationDate === "string") {
      metadata.createdAt = parsePdfDate(info.CreationDate);
    }
    if (typeof info.ModDate === "string") {
      metadata.modifiedAt = parsePdfDate(info.ModDate);
    }
  } catch {
    // Metadata is optional; errors from the document are ignored.
  }
}
|
|
998
|
+
/**
 * Converts a PDF date string of the form `D:YYYY[MM[DD[HH[mm[SS]]]]]` into
 * `YYYY-MM-DDTHH:mm:SS`. Missing components default to "01" for month/day and
 * "00" for time parts; anything after the seconds is ignored.
 *
 * @param dateStr Raw date string.
 * @returns The reformatted timestamp, or undefined when no `D:YYYY` prefix is found.
 */
function parsePdfDate(dateStr) {
  const match = dateStr.match(/D:(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?/);
  if (!match) return void 0;
  const year = match[1];
  const fallbacks = ["01", "01", "00", "00", "00"];
  const [month, day, hour, min, sec] = fallbacks.map((fallback, idx) => {
    const captured = match[idx + 2];
    return captured === void 0 ? fallback : captured;
  });
  return `${year}-${month}-${day}T${hour}:${min}:${sec}`;
}
|
|
789
1004
|
function extractPageContent(rawItems) {
|
|
790
1005
|
const items = normalizeItems(rawItems);
|
|
791
1006
|
if (items.length === 0) return "";
|
|
@@ -1058,52 +1273,446 @@ function mergeKoreanLines(text) {
|
|
|
1058
1273
|
return result.join("\n");
|
|
1059
1274
|
}
|
|
1060
1275
|
|
|
1276
|
+
// src/diff/text-diff.ts
|
|
1277
|
+
/**
 * Similarity of two strings in [0, 1], computed as
 * `1 - editDistance / lengthOfLongerString`.
 *
 * @returns 1 for identical inputs, 0 when exactly one side is empty/falsy.
 */
function similarity(a, b) {
  if (a === b) return 1;
  const eitherEmpty = !a || !b;
  if (eitherEmpty) return 0;
  const longest = Math.max(a.length, b.length);
  if (longest === 0) return 1;
  const distance = levenshtein(a, b);
  return 1 - distance / longest;
}
|
|
1284
|
+
/**
 * Similarity of two strings after whitespace normalisation of both sides.
 */
function normalizedSimilarity(a, b) {
  const left = normalize(a);
  const right = normalize(b);
  return similarity(left, right);
}
|
|
1287
|
+
/**
 * Collapses every run of whitespace into a single space and removes leading
 * and trailing whitespace.
 */
function normalize(s) {
  const words = s.split(/\s+/).filter(Boolean);
  return words.join(" ");
}
|
|
1290
|
+
/**
 * Edit distance (insertions, deletions, substitutions) between two strings,
 * compared element by element via index access.
 *
 * Uses a single rolling row sized to the shorter input.
 */
function levenshtein(a, b) {
  const [shorter, longer] = a.length > b.length ? [b, a] : [a, b];
  const width = shorter.length;
  const row = Array.from({ length: width + 1 }, (_, k) => k);
  for (let j = 1; j <= longer.length; j++) {
    // `diagonal` carries the previous row's value at column i - 1.
    let diagonal = row[0];
    row[0] = j;
    const target = longer[j - 1];
    for (let i = 1; i <= width; i++) {
      const above = row[i];
      row[i] = shorter[i - 1] === target
        ? diagonal
        : 1 + Math.min(diagonal, above, row[i - 1]);
      diagonal = above;
    }
  }
  return row[width];
}
|
|
1310
|
+
|
|
1311
|
+
// src/diff/compare.ts
|
|
1312
|
+
var SIMILARITY_THRESHOLD = 0.4;
|
|
1313
|
+
/**
 * Parses two documents concurrently and returns the block-level diff between them.
 *
 * @param bufferA First document.
 * @param bufferB Second document.
 * @param options Options forwarded to `parse` for both documents.
 * @returns Result of `diffBlocks` on the two block lists.
 * @throws Error when either document fails to parse.
 */
async function compare(bufferA, bufferB, options) {
  const parsed = await Promise.all(
    [bufferA, bufferB].map((buffer) => parse(buffer, options))
  );
  const [resultA, resultB] = parsed;
  if (!resultA.success) {
    throw new Error(`\uBB38\uC11CA \uD30C\uC2F1 \uC2E4\uD328: ${resultA.error}`);
  }
  if (!resultB.success) {
    throw new Error(`\uBB38\uC11CB \uD30C\uC2F1 \uC2E4\uD328: ${resultB.error}`);
  }
  return diffBlocks(resultA.blocks, resultB.blocks);
}
|
|
1322
|
+
/**
 * Aligns two block lists and labels every aligned pair as unchanged, modified,
 * removed or added.
 *
 * @returns `{ stats, diffs }` where `stats` counts each label and `diffs` lists
 *   the entries in alignment order. Modified table pairs also carry `cellDiffs`.
 */
function diffBlocks(blocksA, blocksB) {
  const aligned = alignBlocks(blocksA, blocksB);
  const stats = { added: 0, removed: 0, modified: 0, unchanged: 0 };
  const diffs = [];
  // Appends an entry and bumps the counter named after its type.
  const record = (entry) => {
    diffs.push(entry);
    stats[entry.type]++;
  };
  for (const [before, after] of aligned) {
    if (before && after) {
      const sim = blockSimilarity(before, after);
      if (sim >= 0.99) {
        record({ type: "unchanged", before, after, similarity: 1 });
        continue;
      }
      const entry = { type: "modified", before, after, similarity: sim };
      const bothTables = before.type === "table" && after.type === "table" && before.table && after.table;
      if (bothTables) {
        entry.cellDiffs = diffTableCells(before.table, after.table);
      }
      record(entry);
    } else if (before) {
      record({ type: "removed", before });
    } else if (after) {
      record({ type: "added", after });
    }
  }
  return { stats, diffs };
}
|
|
1350
|
+
/**
 * Pairs up blocks from `a` and `b` with a longest-common-subsequence pass in
 * which two blocks "match" when their similarity reaches SIMILARITY_THRESHOLD.
 * Unmatched blocks are emitted with `null` on the other side.
 *
 * Falls back to positional pairing when the comparison table would exceed
 * ten million cells.
 *
 * @returns Array of `[blockFromA | null, blockFromB | null]` pairs in document order.
 */
function alignBlocks(a, b) {
  const rows = a.length;
  const cols = b.length;
  if (rows * cols > 1e7) return fallbackAlign(a, b);

  // Similarities are memoised because the table fill and the backtrack both need them.
  const memo = /* @__PURE__ */ new Map();
  const isMatch = (ai, bi) => {
    const key = `${ai},${bi}`;
    if (!memo.has(key)) memo.set(key, blockSimilarity(a[ai], b[bi]));
    return memo.get(key) >= SIMILARITY_THRESHOLD;
  };

  // table[r][c] = number of matched pairs using the first r blocks of a and c of b.
  const table = Array.from({ length: rows + 1 }, () => new Array(cols + 1).fill(0));
  for (let r = 1; r <= rows; r++) {
    for (let c = 1; c <= cols; c++) {
      table[r][c] = isMatch(r - 1, c - 1)
        ? table[r - 1][c - 1] + 1
        : Math.max(table[r - 1][c], table[r][c - 1]);
    }
  }

  // Walk back from the bottom-right corner to recover the matched index pairs.
  const matched = [];
  let r = rows;
  let c = cols;
  while (r > 0 && c > 0) {
    if (isMatch(r - 1, c - 1) && table[r][c] === table[r - 1][c - 1] + 1) {
      matched.push([r - 1, c - 1]);
      r--;
      c--;
    } else if (table[r - 1][c] >= table[r][c - 1]) {
      r--;
    } else {
      c--;
    }
  }
  matched.reverse();

  // Emit unmatched blocks before each matched pair, then the leftovers.
  const aligned = [];
  let nextA = 0;
  let nextB = 0;
  for (const [pairA, pairB] of matched) {
    while (nextA < pairA) aligned.push([a[nextA++], null]);
    while (nextB < pairB) aligned.push([null, b[nextB++]]);
    aligned.push([a[nextA++], b[nextB++]]);
  }
  while (nextA < rows) aligned.push([a[nextA++], null]);
  while (nextB < cols) aligned.push([null, b[nextB++]]);
  return aligned;
}
|
|
1398
|
+
/**
 * Pairs blocks purely by position. The shorter list is padded with `null`,
 * and falsy elements are reported as `null` too.
 */
function fallbackAlign(a, b) {
  const pairCount = Math.max(a.length, b.length);
  return Array.from({ length: pairCount }, (_, index) => [
    a[index] || null,
    b[index] || null,
  ]);
}
|
|
1406
|
+
/**
 * Similarity of two blocks. Blocks of different types score 0; paragraphs are
 * compared by normalised text and tables by `tableSimilarity`. Any other type,
 * or a table block without table data, scores 0.
 */
function blockSimilarity(a, b) {
  if (a.type !== b.type) return 0;
  switch (a.type) {
    case "paragraph":
      return normalizedSimilarity(a.text || "", b.text || "");
    case "table":
      return a.table && b.table ? tableSimilarity(a.table, b.table) : 0;
    default:
      return 0;
  }
}
|
|
1416
|
+
/**
 * Similarity of two tables: 30% from how close their cell counts
 * (`rows * cols`) are and 70% from the normalised similarity of all cell
 * texts joined with spaces.
 */
function tableSimilarity(a, b) {
  const areaA = a.rows * a.cols;
  const areaB = b.rows * b.cols;
  const dimSim = 1 - Math.abs(areaA - areaB) / Math.max(areaA, areaB, 1);
  const joinCellTexts = (table) => table.cells.flat().map((cell) => cell.text).join(" ");
  const contentSim = normalizedSimilarity(joinCellTexts(a), joinCellTexts(b));
  return dimSim * 0.3 + contentSim * 0.7;
}
|
|
1423
|
+
/**
 * Compares two tables cell by cell over the union of their dimensions.
 *
 * @returns A grid of `{ type, before, after }` where `type` is "added" when the
 *   cell has no text in `a`, "removed" when it has none in `b`, and otherwise
 *   "unchanged" or "modified".
 */
function diffTableCells(a, b) {
  // Text at (r, c), or undefined when the position lies outside the table.
  const textAt = (table, r, c) =>
    r < table.rows && c < table.cols ? table.cells[r][c].text : void 0;
  const classify = (before, after) => {
    if (before === void 0) return "added";
    if (after === void 0) return "removed";
    return before === after ? "unchanged" : "modified";
  };
  const rowCount = Math.max(a.rows, b.rows);
  const colCount = Math.max(a.cols, b.cols);
  return Array.from({ length: rowCount }, (_, r) =>
    Array.from({ length: colCount }, (_unused, c) => {
      const before = textAt(a, r, c);
      const after = textAt(b, r, c);
      return { type: classify(before, after), before, after };
    })
  );
}
|
|
1443
|
+
|
|
1444
|
+
// src/form/recognize.ts
|
|
1445
|
+
var LABEL_KEYWORDS = /* @__PURE__ */ new Set([
|
|
1446
|
+
"\uC131\uBA85",
|
|
1447
|
+
"\uC774\uB984",
|
|
1448
|
+
"\uC8FC\uC18C",
|
|
1449
|
+
"\uC804\uD654",
|
|
1450
|
+
"\uC804\uD654\uBC88\uD638",
|
|
1451
|
+
"\uD734\uB300\uD3F0",
|
|
1452
|
+
"\uD578\uB4DC\uD3F0",
|
|
1453
|
+
"\uC5F0\uB77D\uCC98",
|
|
1454
|
+
"\uC0DD\uB144\uC6D4\uC77C",
|
|
1455
|
+
"\uC8FC\uBBFC\uB4F1\uB85D\uBC88\uD638",
|
|
1456
|
+
"\uC18C\uC18D",
|
|
1457
|
+
"\uC9C1\uC704",
|
|
1458
|
+
"\uC9C1\uAE09",
|
|
1459
|
+
"\uBD80\uC11C",
|
|
1460
|
+
"\uC774\uBA54\uC77C",
|
|
1461
|
+
"\uD329\uC2A4",
|
|
1462
|
+
"\uD559\uAD50",
|
|
1463
|
+
"\uD559\uB144",
|
|
1464
|
+
"\uBC18",
|
|
1465
|
+
"\uBC88\uD638",
|
|
1466
|
+
"\uC2E0\uCCAD\uC778",
|
|
1467
|
+
"\uB300\uD45C\uC790",
|
|
1468
|
+
"\uB2F4\uB2F9\uC790",
|
|
1469
|
+
"\uC791\uC131\uC790",
|
|
1470
|
+
"\uD655\uC778\uC790",
|
|
1471
|
+
"\uC2B9\uC778\uC790",
|
|
1472
|
+
"\uC77C\uC2DC",
|
|
1473
|
+
"\uB0A0\uC9DC",
|
|
1474
|
+
"\uAE30\uAC04",
|
|
1475
|
+
"\uC7A5\uC18C",
|
|
1476
|
+
"\uBAA9\uC801",
|
|
1477
|
+
"\uC0AC\uC720",
|
|
1478
|
+
"\uBE44\uACE0",
|
|
1479
|
+
"\uAE08\uC561",
|
|
1480
|
+
"\uC218\uB7C9",
|
|
1481
|
+
"\uB2E8\uAC00",
|
|
1482
|
+
"\uD569\uACC4",
|
|
1483
|
+
"\uACC4",
|
|
1484
|
+
"\uC18C\uACC4"
|
|
1485
|
+
]);
|
|
1486
|
+
/**
 * Heuristically decides whether a table cell's text is a form label.
 *
 * A non-empty text of at most 30 characters is a label when it contains one of
 * LABEL_KEYWORDS, is 2-8 characters of Hangul/whitespace/brackets/middle dot/colon
 * with no digits, or is Hangul/Latin letters and whitespace ending in a colon.
 */
function isLabelCell(text) {
  const candidate = text.trim();
  if (candidate.length === 0 || candidate.length > 30) return false;
  const hasKeyword = [...LABEL_KEYWORDS].some((kw) => candidate.includes(kw));
  if (hasKeyword) return true;
  const isShortHangul = /^[가-힣\s()·:]{2,8}$/.test(candidate) && !/\d/.test(candidate);
  return isShortHangul || /^[가-힣A-Za-z\s]+[::]$/.test(candidate);
}
|
|
1496
|
+
/**
 * Collects label/value fields from a list of blocks: first from every table
 * block, then from the text of every paragraph block.
 *
 * @param blocks Array of parsed blocks.
 * @returns `{ fields, confidence }`. With at least one table, confidence is the
 *   share of tables that produced fields; without tables it is 0.3 when any
 *   inline field was found and 0 otherwise.
 */
function extractFormFields(blocks) {
  const tableBlocks = blocks.filter((block) => block.type === "table" && block.table);
  const fieldsPerTable = tableBlocks.map((block) => extractFromTable(block.table));
  const formTables = fieldsPerTable.filter((found) => found.length > 0).length;
  const totalTables = tableBlocks.length;

  const fields = [];
  for (const found of fieldsPerTable) fields.push(...found);
  for (const block of blocks) {
    if (block.type !== "paragraph" || !block.text) continue;
    fields.push(...extractInlineFields(block.text));
  }

  let confidence = 0;
  if (totalTables > 0) {
    confidence = formTables / totalTables;
  } else if (fields.length > 0) {
    confidence = 0.3;
  }
  return { fields, confidence: Math.min(confidence, 1) };
}
|
|
1518
|
+
/**
 * Extracts label/value pairs from a single table.
 *
 * Strategy 1 (side-by-side): for every horizontally adjacent pair of cells,
 * if the left cell looks like a label (isLabelCell) and the right cell is
 * non-empty, the pair becomes a field. A trailing colon is stripped from the
 * label.
 *
 * Strategy 2 (header row): used only when strategy 1 found nothing and the
 * table is at least 2x2. If every cell of the first row holds 1-20 characters
 * of text, that row is taken as the header and every non-empty cell below it
 * becomes a field labelled by its column header.
 *
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} table
 * @returns {Array<{label: string, value: string, row: number, col: number}>}
 */
function extractFromTable(table) {
  const found = [];
  const rowCount = table.rows;
  const colCount = table.cols;

  // Strategy 1: label cell immediately followed by its value cell.
  if (colCount >= 2) {
    for (let rowIdx = 0; rowIdx < rowCount; rowIdx += 1) {
      for (let colIdx = 0; colIdx + 1 < colCount; colIdx += 1) {
        const candidate = table.cells[rowIdx][colIdx];
        const neighbor = table.cells[rowIdx][colIdx + 1];
        if (!isLabelCell(candidate.text)) {
          continue;
        }
        if (!neighbor.text.trim()) {
          continue;
        }
        found.push({
          label: candidate.text.trim().replace(/[::]\s*$/, ""),
          value: neighbor.text.trim(),
          row: rowIdx,
          col: colIdx
        });
      }
    }
  }

  // Strategy 2 only applies when nothing was found and the table is >= 2x2.
  if (found.length > 0 || rowCount < 2 || colCount < 2) {
    return found;
  }

  const header = table.cells[0];
  const headerIsUsable = header.every((cell) => {
    const caption = cell.text.trim();
    return caption.length > 0 && caption.length <= 20;
  });
  if (!headerIsUsable) {
    return found;
  }

  for (let rowIdx = 1; rowIdx < rowCount; rowIdx += 1) {
    for (let colIdx = 0; colIdx < colCount; colIdx += 1) {
      const label = header[colIdx].text.trim();
      const value = table.cells[rowIdx][colIdx].text.trim();
      if (label && value) {
        found.push({ label, value, row: rowIdx, col: colIdx });
      }
    }
  }
  return found;
}
|
|
1556
|
+
/**
 * Finds inline "label: value" pairs inside free-running paragraph text.
 *
 * A label is 2-10 Hangul or Latin letters; it must be followed by a colon
 * (optionally surrounded by whitespace) and then a value of up to 100
 * characters that stops at a newline, comma or semicolon.
 *
 * Both the ASCII colon and the full-width colon (U+FF1A) are accepted as the
 * separator. The previous character class listed the ASCII colon twice, so
 * full-width colons, which are common in Korean documents, were never matched.
 *
 * @param {string} text - Paragraph text to scan.
 * @returns {Array<{label: string, value: string, row: number, col: number}>}
 *   Fields in order of appearance; `row` and `col` are -1 because inline
 *   fields have no table position.
 */
function extractInlineFields(text) {
  const fields = [];
  // A fresh /g regex per call: exec() keeps state in lastIndex.
  const pattern = /([가-힣A-Za-z]{2,10})\s*[:\uFF1A]\s*([^\n,;]{1,100})/g;
  let match;
  while ((match = pattern.exec(text)) !== null) {
    const label = match[1].trim();
    const value = match[2].trim();
    // The value group can consist solely of whitespace; skip those.
    if (value) {
      fields.push({ label, value, row: -1, col: -1 });
    }
  }
  return fields;
}
|
|
1569
|
+
|
|
1570
|
+
// src/hwpx/generator.ts
|
|
1571
|
+
import JSZip2 from "jszip";
|
|
1572
|
+
// XML namespace URI that blocksToSectionXml binds to both the `hs:` and `hp:`
// prefixes of the generated section document.
// NOTE(review): the same URI is used for both prefixes — confirm against the
// HWPX schema that consumers accept this.
var HWPML_NS = "http://www.hancom.co.kr/hwpml/2016/HwpMl";
|
|
1573
|
+
/**
 * Converts a Markdown string into an HWPX archive.
 *
 * The Markdown is parsed into heading/paragraph/table blocks, rendered to a
 * single section XML document, and zipped together with a `mimetype` entry
 * (stored uncompressed) and a package manifest.
 *
 * @param {string} markdown - Markdown source text.
 * @returns {Promise<ArrayBuffer>} The zipped HWPX package.
 */
async function markdownToHwpx(markdown) {
  const sectionXml = blocksToSectionXml(parseMarkdownToBlocks(markdown));

  const archive = new JSZip2();
  // The mimetype entry is added first and left uncompressed.
  archive.file("mimetype", "application/hwp+zip", { compression: "STORE" });
  archive.file("Contents/content.hpf", generateManifest());
  archive.file("Contents/section0.xml", sectionXml);

  const packed = await archive.generateAsync({ type: "arraybuffer" });
  return packed;
}
|
|
1582
|
+
/**
 * Splits Markdown into a flat list of heading, table and paragraph blocks.
 *
 * Recognised constructs:
 *   - ATX headings (`#` .. `######`)  -> `{ type: "heading", text, level }`
 *   - pipe tables                      -> `{ type: "table", rows: string[][] }`
 *   - any other non-blank line         -> `{ type: "paragraph", text }`
 *
 * Fixes over the previous implementation:
 *   - CRLF input: lines are split on `\r?\n`, so a trailing `\r` no longer
 *     prevents the heading pattern from matching.
 *   - Separator rows (`|---|---|`) are detected on the trimmed line, so an
 *     indented table no longer gains a bogus `---` data row.
 *   - Rows without a trailing pipe (`| a | b`) keep their last cell.
 *
 * @param {string} md - Markdown source text.
 * @returns {Array<{type: string, text?: string, level?: number, rows?: string[][]}>}
 */
function parseMarkdownToBlocks(md) {
  const isTableLine = (line) => line.trimStart().startsWith("|");

  // A separator row contains "---" and nothing but pipes, dashes, colons and
  // whitespace. The "---" requirement is kept so that rows of single-dash
  // placeholder cells ("| - | - |") are still treated as data.
  const isSeparatorRow = (row) => {
    const trimmed = row.trim();
    return trimmed.includes("---") && /^\|[\s|:\-]*$/.test(trimmed);
  };

  // Splits one table line into trimmed cell texts. The leading pipe is
  // guaranteed by isTableLine; the trailing pipe is optional.
  const splitRow = (row) => {
    const trimmed = row.trim();
    if (trimmed === "|") {
      return [];
    }
    let inner = trimmed.slice(1);
    if (inner.endsWith("|")) {
      inner = inner.slice(0, -1);
    }
    return inner.split("|").map((cell) => cell.trim());
  };

  const lines = md.split(/\r?\n/);
  const blocks = [];
  let i = 0;
  while (i < lines.length) {
    const line = lines[i];

    if (!line.trim()) {
      i++;
      continue;
    }

    const headingMatch = line.match(/^(#{1,6})\s+(.+)$/);
    if (headingMatch) {
      blocks.push({ type: "heading", text: headingMatch[2].trim(), level: headingMatch[1].length });
      i++;
      continue;
    }

    if (isTableLine(line)) {
      // Consume the whole run of consecutive table lines as one table.
      const tableRows = [];
      while (i < lines.length && isTableLine(lines[i])) {
        const row = lines[i];
        i++;
        if (isSeparatorRow(row)) {
          continue;
        }
        const cells = splitRow(row);
        if (cells.length > 0) {
          tableRows.push(cells);
        }
      }
      if (tableRows.length > 0) {
        blocks.push({ type: "table", rows: tableRows });
      }
      continue;
    }

    blocks.push({ type: "paragraph", text: line.trim() });
    i++;
  }
  return blocks;
}
|
|
1620
|
+
/**
 * Escapes the characters that are significant in XML text and attribute
 * values: `&`, `<`, `>` and `"`.
 *
 * The previous body replaced each character with itself (and ended in an
 * unterminated string literal), so markup characters in user text would have
 * been written into the document unescaped.
 *
 * @param {string} text - Raw text.
 * @returns {string} Text safe to embed in XML content.
 */
function escapeXml(text) {
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    "\"": "&quot;"
  };
  // Single pass, so an ampersand produced by one replacement is never
  // escaped a second time.
  return text.replace(/[&<>"]/g, (ch) => entities[ch]);
}
|
|
1623
|
+
/**
 * Renders one paragraph element containing a single text run.
 *
 * @param {string} text - Paragraph text; it is XML-escaped before embedding.
 * @returns {string} `<hp:p>` markup.
 */
function generateParagraph(text) {
  const safeText = escapeXml(text);
  return `<hp:p><hp:run><hp:t>${safeText}</hp:t></hp:run></hp:p>`;
}
|
|
1626
|
+
/**
 * Renders a table element from a matrix of cell texts.
 *
 * Every cell is emitted with a 1x1 span and a single paragraph holding the
 * cell text.
 *
 * @param {string[][]} rows - Table rows, each an array of cell texts.
 * @returns {string} `<hp:tbl>` markup.
 */
function generateTable(rows) {
  const renderCell = (cellText) =>
    `<hp:tc><hp:cellSpan colSpan="1" rowSpan="1"/>${generateParagraph(cellText)}</hp:tc>`;

  const renderRow = (rowCells) => {
    const cellsXml = rowCells.map((cellText) => renderCell(cellText)).join("");
    return `<hp:tr>${cellsXml}</hp:tr>`;
  };

  const rowsXml = rows.map((rowCells) => renderRow(rowCells)).join("");
  return `<hp:tbl>${rowsXml}</hp:tbl>`;
}
|
|
1635
|
+
/**
 * Renders parsed blocks into a complete section XML document.
 *
 * Headings and paragraphs are both emitted as plain paragraphs (the heading
 * level is not represented); tables are emitted via generateTable; unknown
 * block types and tables without rows contribute an empty string.
 *
 * @param {Array<{type: string, text?: string, rows?: string[][]}>} blocks
 * @returns {string} The section XML, including the XML declaration.
 */
function blocksToSectionXml(blocks) {
  const renderBlock = (block) => {
    if (block.type === "heading" || block.type === "paragraph") {
      return generateParagraph(block.text || "");
    }
    if (block.type === "table") {
      return block.rows ? generateTable(block.rows) : "";
    }
    return "";
  };

  const body = blocks.map((block) => renderBlock(block)).join("\n ");

  const documentLines = [
    `<?xml version="1.0" encoding="UTF-8"?>`,
    `<hs:sec xmlns:hs="${HWPML_NS}" xmlns:hp="${HWPML_NS}">`,
    body,
    `</hs:sec>`
  ];
  return documentLines.join("\n");
}
|
|
1653
|
+
/**
 * Builds the package manifest written to `Contents/content.hpf`.
 *
 * The manifest declares a single item, `section0.xml`, and lists it as the
 * only spine entry.
 *
 * @returns {string} The manifest XML, including the XML declaration.
 */
function generateManifest() {
  const manifestLines = [
    `<?xml version="1.0" encoding="UTF-8"?>`,
    `<opf:package xmlns:opf="http://www.idpf.org/2007/opf">`,
    `<opf:manifest>`,
    `<opf:item id="s0" href="section0.xml" media-type="application/xml"/>`,
    `</opf:manifest>`,
    `<opf:spine>`,
    `<opf:itemref idref="s0"/>`,
    `</opf:spine>`,
    `</opf:package>`
  ];
  return manifestLines.join("\n");
}
|
|
1664
|
+
|
|
1061
1665
|
// src/index.ts
|
|
1062
|
-
/**
 * Parses a document buffer, dispatching on the detected file format.
 *
 * @param {ArrayBuffer} buffer - Raw file contents.
 * @param {object} [options] - Passed through unchanged to the format parser.
 * @returns {Promise<object>} The format parser's result, or a failure object
 *   with `code: "EMPTY_INPUT"` for a missing/zero-length buffer and
 *   `code: "UNSUPPORTED_FORMAT"` when the format is not hwpx, hwp or pdf.
 */
async function parse(buffer, options) {
  const isEmpty = !buffer || buffer.byteLength === 0;
  if (isEmpty) {
    return {
      success: false,
      fileType: "unknown",
      error: "\uBE48 \uBC84\uD37C\uC774\uAC70\uB098 \uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uC785\uB825\uC785\uB2C8\uB2E4.",
      code: "EMPTY_INPUT"
    };
  }

  const format = detectFormat(buffer);
  if (format === "hwpx") {
    return parseHwpx(buffer, options);
  }
  if (format === "hwp") {
    return parseHwp(buffer, options);
  }
  if (format === "pdf") {
    return parsePdf(buffer, options);
  }
  return {
    success: false,
    fileType: "unknown",
    error: "\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD30C\uC77C \uD615\uC2DD\uC785\uB2C8\uB2E4.",
    code: "UNSUPPORTED_FORMAT"
  };
}
|
|
1078
|
-
/**
 * Parses an HWPX document and wraps the outcome in a result object.
 *
 * @param {ArrayBuffer} buffer - Raw HWPX file contents.
 * @param {object} [options] - Passed through to parseHwpxDocument.
 * @returns {Promise<object>} On success: `{ success: true, fileType: "hwpx",
 *   markdown, blocks, metadata }`. On failure: `{ success: false, fileType:
 *   "hwpx", error, code }`; errors are never re-thrown.
 */
async function parseHwpx(buffer, options) {
  try {
    const parsed = await parseHwpxDocument(buffer, options);
    const { markdown, blocks, metadata } = parsed;
    return { success: true, fileType: "hwpx", markdown, blocks, metadata };
  } catch (err) {
    // Non-Error throwables carry no message, so fall back to a fixed text.
    const message = err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328";
    return { success: false, fileType: "hwpx", error: message, code: classifyError(err) };
  }
}
|
|
1086
|
-
/**
 * Parses a legacy HWP document and wraps the outcome in a result object.
 *
 * @param {ArrayBuffer} buffer - Raw HWP file contents; converted with
 *   `Buffer.from` before being handed to parseHwp5Document.
 * @param {object} [options] - Passed through to parseHwp5Document.
 * @returns {Promise<object>} On success: `{ success: true, fileType: "hwp",
 *   markdown, blocks, metadata }`. On failure: `{ success: false, fileType:
 *   "hwp", error, code }`; errors are never re-thrown.
 */
async function parseHwp(buffer, options) {
  try {
    const parsed = parseHwp5Document(Buffer.from(buffer), options);
    const { markdown, blocks, metadata } = parsed;
    return { success: true, fileType: "hwp", markdown, blocks, metadata };
  } catch (err) {
    // Non-Error throwables carry no message, so fall back to a fixed text.
    const message = err instanceof Error ? err.message : "HWP \uD30C\uC2F1 \uC2E4\uD328";
    return { success: false, fileType: "hwp", error: message, code: classifyError(err) };
  }
}
|
|
1094
|
-
/**
 * Parses a PDF document, converting any thrown error into a failure result.
 *
 * @param {ArrayBuffer} buffer - Raw PDF file contents.
 * @param {object} [options] - Passed through to parsePdfDocument.
 * @returns {Promise<object>} Whatever parsePdfDocument resolves to, or
 *   `{ success: false, fileType: "pdf", error, code }` when it throws.
 */
async function parsePdf(buffer, options) {
  try {
    // Awaited inside the try so that rejections are caught below.
    const result = await parsePdfDocument(buffer, options);
    return result;
  } catch (err) {
    // Non-Error throwables carry no message, so fall back to a fixed text.
    const message = err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328";
    return { success: false, fileType: "pdf", error: message, code: classifyError(err) };
  }
}
|
|
1101
1705
|
export {
|
|
1102
1706
|
VERSION,
|
|
1707
|
+
blocksToMarkdown,
|
|
1708
|
+
compare,
|
|
1103
1709
|
detectFormat,
|
|
1710
|
+
diffBlocks,
|
|
1711
|
+
extractFormFields,
|
|
1104
1712
|
isHwpxFile,
|
|
1105
1713
|
isOldHwpFile,
|
|
1106
1714
|
isPdfFile,
|
|
1715
|
+
markdownToHwpx,
|
|
1107
1716
|
parse,
|
|
1108
1717
|
parseHwp,
|
|
1109
1718
|
parseHwpx,
|