kordoc 1.3.0 → 1.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +111 -118
- package/dist/{chunk-KCGDEP7Q.js → chunk-FC5R5FMV.js} +575 -63
- package/dist/chunk-FC5R5FMV.js.map +1 -0
- package/dist/cli.js +15 -3
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +645 -35
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +163 -6
- package/dist/index.d.ts +163 -6
- package/dist/index.js +647 -35
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +216 -13
- package/dist/mcp.js.map +1 -1
- package/dist/provider-JB7SY74K.js +38 -0
- package/dist/provider-JB7SY74K.js.map +1 -0
- package/dist/watch-K2JXCS32.js +90 -0
- package/dist/watch-K2JXCS32.js.map +1 -0
- package/package.json +1 -1
- package/dist/chunk-KCGDEP7Q.js.map +0 -1
|
@@ -24,34 +24,6 @@ function detectFormat(buffer) {
|
|
|
24
24
|
return "unknown";
|
|
25
25
|
}
|
|
26
26
|
|
|
27
|
-
// src/utils.ts
|
|
28
|
-
var VERSION = true ? "1.3.0" : "0.0.0-dev";
|
|
29
|
-
function toArrayBuffer(buf) {
|
|
30
|
-
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
31
|
-
return buf.buffer;
|
|
32
|
-
}
|
|
33
|
-
return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
|
|
34
|
-
}
|
|
35
|
-
var KordocError = class extends Error {
|
|
36
|
-
constructor(message) {
|
|
37
|
-
super(message);
|
|
38
|
-
this.name = "KordocError";
|
|
39
|
-
}
|
|
40
|
-
};
|
|
41
|
-
function sanitizeError(err) {
|
|
42
|
-
if (err instanceof KordocError) return err.message;
|
|
43
|
-
return "\uBB38\uC11C \uCC98\uB9AC \uC911 \uC624\uB958\uAC00 \uBC1C\uC0DD\uD588\uC2B5\uB2C8\uB2E4";
|
|
44
|
-
}
|
|
45
|
-
function isPathTraversal(name) {
|
|
46
|
-
const normalized = name.replace(/\\/g, "/");
|
|
47
|
-
return normalized.includes("..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
|
|
48
|
-
}
|
|
49
|
-
|
|
50
|
-
// src/hwpx/parser.ts
|
|
51
|
-
import JSZip from "jszip";
|
|
52
|
-
import { inflateRawSync } from "zlib";
|
|
53
|
-
import { DOMParser } from "@xmldom/xmldom";
|
|
54
|
-
|
|
55
27
|
// src/table/builder.ts
|
|
56
28
|
var MAX_COLS = 200;
|
|
57
29
|
var MAX_ROWS = 1e4;
|
|
@@ -181,6 +153,75 @@ function tableToMarkdown(table) {
|
|
|
181
153
|
return md.join("\n");
|
|
182
154
|
}
|
|
183
155
|
|
|
156
|
+
// src/utils.ts
|
|
157
|
+
var VERSION = true ? "1.4.1" : "0.0.0-dev";
|
|
158
|
+
/**
 * Return an ArrayBuffer holding exactly the bytes viewed by `buf`.
 *
 * When the view spans its whole backing buffer, that backing buffer is
 * returned as-is (no copy); otherwise the viewed window is copied out with
 * `slice`.
 *
 * @param {{buffer: ArrayBuffer, byteOffset: number, byteLength: number}} buf
 *   A Buffer / typed-array style view.
 * @returns {ArrayBuffer} The backing buffer itself or a copy of the viewed range.
 */
function toArrayBuffer(buf) {
  const { buffer, byteOffset, byteLength } = buf;
  const coversWholeBuffer = byteOffset === 0 && byteLength === buffer.byteLength;
  return coversWholeBuffer ? buffer : buffer.slice(byteOffset, byteOffset + byteLength);
}
|
|
164
|
+
/**
 * Error type for failures whose message is safe to show to the caller.
 * `sanitizeError` in this file passes the message of a KordocError through
 * unchanged and replaces every other error with a generic message.
 */
var KordocError = class extends Error {
  /**
   * @param {string} message Human-readable description of the failure.
   * @param {{cause?: unknown}} [options] Standard Error options; lets a
   *   wrapping site keep the underlying error as `cause`. Optional, so
   *   existing single-argument callers are unaffected.
   */
  constructor(message, options) {
    super(message, options);
    this.name = "KordocError";
  }
};
|
|
170
|
+
/**
 * Turn an arbitrary thrown value into a message that is safe to surface.
 *
 * @param {unknown} err The caught value.
 * @returns {string} The original message for a KordocError, otherwise a
 *   fixed generic Korean message so internal details are not exposed.
 */
function sanitizeError(err) {
  return err instanceof KordocError
    ? err.message
    : "\uBB38\uC11C \uCC98\uB9AC \uC911 \uC624\uB958\uAC00 \uBC1C\uC0DD\uD588\uC2B5\uB2C8\uB2E4";
}
|
|
174
|
+
/**
 * Conservative check for entry names that could escape the intended root.
 *
 * Backslashes are treated as forward slashes. A name is flagged when it
 * contains ".." anywhere, starts with "/", or starts with a drive-letter
 * prefix such as "C:".
 *
 * @param {string} name Entry name to inspect.
 * @returns {boolean} True when the name looks unsafe.
 */
function isPathTraversal(name) {
  const unixStyle = name.split("\\").join("/");
  if (unixStyle.includes("..")) return true;
  if (unixStyle.startsWith("/")) return true;
  return /^[A-Za-z]:/.test(unixStyle);
}
|
|
178
|
+
/**
 * Map a thrown value to a coarse error code by inspecting its message.
 *
 * Rules are evaluated top to bottom and the first match wins, so the order
 * matters: the ZIP-specific bomb rule must come before the generic
 * decompression rule, which would otherwise also match those messages.
 *
 * @param {unknown} err The caught value.
 * @returns {string} One of "ENCRYPTED", "DRM_PROTECTED", "ZIP_BOMB",
 *   "DECOMPRESSION_BOMB", "IMAGE_BASED_PDF", "NO_SECTIONS", "CORRUPTED" or
 *   "PARSE_ERROR" (the fallback, also used for non-Error values).
 */
function classifyError(err) {
  if (!(err instanceof Error)) return "PARSE_ERROR";
  const text = err.message;
  const hasAny = (...needles) => needles.some((needle) => text.includes(needle));
  const rules = [
    ["ENCRYPTED", () => hasAny("\uC554\uD638\uD654")],
    ["DRM_PROTECTED", () => hasAny("DRM")],
    ["ZIP_BOMB", () => hasAny("ZIP bomb", "ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC", "ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")],
    ["DECOMPRESSION_BOMB", () => hasAny("bomb", "\uD06C\uAE30 \uCD08\uACFC", "\uC555\uCD95 \uD574\uC81C")],
    ["IMAGE_BASED_PDF", () => hasAny("\uC774\uBBF8\uC9C0 \uAE30\uBC18")],
    ["NO_SECTIONS", () => hasAny("\uC139\uC158") && hasAny("\uCC3E\uC744 \uC218 \uC5C6", "\uC5C6\uC74C")],
    ["CORRUPTED", () => hasAny("\uC2DC\uADF8\uB2C8\uCC98", "\uBCF5\uAD6C\uD560 \uC218 \uC5C6")],
  ];
  const hit = rules.find(([, matches]) => matches());
  return hit ? hit[0] : "PARSE_ERROR";
}
|
|
190
|
+
|
|
191
|
+
// src/hwpx/parser.ts
|
|
192
|
+
import JSZip from "jszip";
|
|
193
|
+
import { inflateRawSync } from "zlib";
|
|
194
|
+
import { DOMParser } from "@xmldom/xmldom";
|
|
195
|
+
|
|
196
|
+
// src/page-range.ts
|
|
197
|
+
/**
 * Expand a page selection into the set of 1-based page numbers it covers.
 *
 * @param {string | number[]} spec Either an array of page numbers (each is
 *   rounded to the nearest integer) or a comma-separated string of single
 *   pages and inclusive ranges, e.g. "1-3, 5".
 * @param {number} maxPages Highest valid page number; pages outside
 *   1..maxPages are dropped and ranges are clamped to it.
 * @returns {Set<number>} Selected pages. Empty when `maxPages` is not
 *   positive, when `spec` is neither an array nor a non-blank string, or
 *   when nothing valid was selected.
 */
function parsePageRange(spec, maxPages) {
  const result = /* @__PURE__ */ new Set();
  if (maxPages <= 0) return result;
  if (Array.isArray(spec)) {
    for (const n of spec) {
      const page = Math.round(n);
      if (page >= 1 && page <= maxPages) result.add(page);
    }
    return result;
  }
  if (typeof spec !== "string" || spec.trim() === "") return result;
  for (const part of spec.split(",")) {
    const token = part.trim();
    if (!token) continue;
    const range = /^(\d+)\s*-\s*(\d+)$/.exec(token);
    if (range) {
      const start = Math.max(1, Number.parseInt(range[1], 10));
      const end = Math.min(maxPages, Number.parseInt(range[2], 10));
      // A reversed range (start > end) selects nothing.
      for (let page = start; page <= end; page++) result.add(page);
      continue;
    }
    // Require the whole token to be digits: a lenient parseInt would turn
    // malformed input such as "3abc", "4-" or "1.5" into a page number.
    if (!/^\d+$/.test(token)) continue;
    const page = Number.parseInt(token, 10);
    if (page >= 1 && page <= maxPages) result.add(page);
  }
  return result;
}
|
|
224
|
+
|
|
184
225
|
// src/hwpx/parser.ts
|
|
185
226
|
var MAX_DECOMPRESS_SIZE = 100 * 1024 * 1024;
|
|
186
227
|
var MAX_ZIP_ENTRIES = 500;
|
|
@@ -190,7 +231,7 @@ function clampSpan(val, max) {
|
|
|
190
231
|
/**
 * Remove every `<!DOCTYPE ...>` declaration (including an optional
 * bracketed internal subset) from an XML string before it is parsed.
 *
 * @param {string} xml Raw XML text.
 * @returns {string} The XML with DOCTYPE declarations removed.
 */
function stripDtd(xml) {
  const doctypeDeclaration = /<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi;
  return xml.replace(doctypeDeclaration, "");
}
|
|
193
|
-
async function parseHwpxDocument(buffer) {
|
|
234
|
+
async function parseHwpxDocument(buffer, options) {
|
|
194
235
|
const precheck = precheckZipSize(buffer);
|
|
195
236
|
if (precheck.totalUncompressed > MAX_DECOMPRESS_SIZE) {
|
|
196
237
|
throw new KordocError("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
@@ -208,19 +249,75 @@ async function parseHwpxDocument(buffer) {
|
|
|
208
249
|
if (actualEntryCount > MAX_ZIP_ENTRIES) {
|
|
209
250
|
throw new KordocError("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
210
251
|
}
|
|
252
|
+
const metadata = {};
|
|
253
|
+
await extractHwpxMetadata(zip, metadata);
|
|
211
254
|
const sectionPaths = await resolveSectionPaths(zip);
|
|
212
255
|
if (sectionPaths.length === 0) throw new KordocError("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
256
|
+
metadata.pageCount = sectionPaths.length;
|
|
257
|
+
const pageFilter = options?.pages ? parsePageRange(options.pages, sectionPaths.length) : null;
|
|
213
258
|
let totalDecompressed = 0;
|
|
214
259
|
const blocks = [];
|
|
215
|
-
for (
|
|
216
|
-
|
|
260
|
+
for (let si = 0; si < sectionPaths.length; si++) {
|
|
261
|
+
if (pageFilter && !pageFilter.has(si + 1)) continue;
|
|
262
|
+
const file = zip.file(sectionPaths[si]);
|
|
217
263
|
if (!file) continue;
|
|
218
264
|
const xml = await file.async("text");
|
|
219
265
|
totalDecompressed += xml.length * 2;
|
|
220
266
|
if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
221
267
|
blocks.push(...parseSectionXml(xml));
|
|
222
268
|
}
|
|
223
|
-
|
|
269
|
+
const markdown = blocksToMarkdown(blocks);
|
|
270
|
+
return { markdown, blocks, metadata };
|
|
271
|
+
}
|
|
272
|
+
/**
 * Best-effort lookup of document metadata inside an opened HWPX archive.
 *
 * Tries a fixed list of candidate metadata paths (exact name first, then a
 * case-insensitive match over all entries), feeds each one found to
 * `parseDublinCoreMetadata`, and stops as soon as a title or author is set.
 * Any error is swallowed on purpose: missing or broken metadata must not
 * fail the parse.
 *
 * @param {object} zip Opened archive exposing `file(name)` and `files`.
 * @param {object} metadata Target object; filled in place.
 * @returns {Promise<void>}
 */
async function extractHwpxMetadata(zip, metadata) {
  const candidates = ["meta.xml", "META-INF/meta.xml", "docProps/core.xml"];
  try {
    for (const candidate of candidates) {
      let entry = zip.file(candidate);
      if (!entry) {
        const wanted = candidate.toLowerCase();
        entry = Object.values(zip.files).find((f) => f.name.toLowerCase() === wanted) || null;
      }
      if (!entry) continue;
      const xml = await entry.async("text");
      parseDublinCoreMetadata(xml, metadata);
      if (metadata.title || metadata.author) return;
    }
  } catch {
    // Metadata is optional; ignore any failure while reading it.
  }
}
|
|
285
|
+
/**
 * Fill `metadata` from a Dublin Core / core-properties style XML document.
 *
 * Each field is only looked up when it is not already set on `metadata`;
 * for every field the first listed tag that has non-blank text wins.
 * Keywords are split on "," or ";" and blank items are dropped.
 *
 * @param {string} xml Metadata XML text (any DOCTYPE is stripped first).
 * @param {object} metadata Target object; filled in place.
 * @returns {void}
 */
function parseDublinCoreMetadata(xml, metadata) {
  const doc = new DOMParser().parseFromString(stripDtd(xml), "text/xml");
  if (!doc.documentElement) return;

  // Trimmed text of the first element, in tag order, that has any text.
  const firstText = (tagNames) => {
    for (const tag of tagNames) {
      const text = doc.getElementsByTagName(tag)[0]?.textContent?.trim();
      if (text) return text;
    }
    return void 0;
  };

  const fieldTags = [
    ["title", ["dc:title", "title"]],
    ["author", ["dc:creator", "creator", "cp:lastModifiedBy"]],
    ["description", ["dc:description", "description", "dc:subject", "subject"]],
    ["createdAt", ["dcterms:created", "meta:creation-date"]],
    ["modifiedAt", ["dcterms:modified", "meta:date"]],
  ];
  for (const [field, tags] of fieldTags) {
    metadata[field] = metadata[field] || firstText(tags);
  }

  const rawKeywords = firstText(["dc:keyword", "cp:keywords", "meta:keyword"]);
  if (rawKeywords && !metadata.keywords) {
    metadata.keywords = rawKeywords.split(/[,;]/).map((k) => k.trim()).filter(Boolean);
  }
}
|
|
309
|
+
/**
 * Read only the metadata (plus section count as `pageCount`) of an HWPX
 * file, without parsing section content.
 *
 * Applies the same ZIP-bomb guards as `parseHwpxDocument` before any entry
 * is decompressed, because this path also reads archive entries from
 * untrusted input.
 *
 * @param {ArrayBuffer} buffer Raw HWPX (ZIP) bytes.
 * @returns {Promise<object>} Metadata object including `pageCount`.
 * @throws {KordocError} When the archive exceeds the size or entry limits,
 *   or cannot be opened as a ZIP.
 */
async function extractHwpxMetadataOnly(buffer) {
  const precheck = precheckZipSize(buffer);
  if (precheck.totalUncompressed > MAX_DECOMPRESS_SIZE) {
    throw new KordocError("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
  }
  let zip;
  try {
    zip = await JSZip.loadAsync(buffer);
  } catch {
    throw new KordocError("HWPX ZIP\uC744 \uC5F4 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
  }
  if (Object.keys(zip.files).length > MAX_ZIP_ENTRIES) {
    throw new KordocError("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
  }
  const metadata = {};
  await extractHwpxMetadata(zip, metadata);
  const sectionPaths = await resolveSectionPaths(zip);
  metadata.pageCount = sectionPaths.length;
  return metadata;
}
|
|
225
322
|
function precheckZipSize(buffer) {
|
|
226
323
|
try {
|
|
@@ -259,7 +356,7 @@ function extractFromBrokenZip(buffer) {
|
|
|
259
356
|
const data = new Uint8Array(buffer);
|
|
260
357
|
const view = new DataView(buffer);
|
|
261
358
|
let pos = 0;
|
|
262
|
-
const
|
|
359
|
+
const blocks = [];
|
|
263
360
|
let totalDecompressed = 0;
|
|
264
361
|
let entryCount = 0;
|
|
265
362
|
while (pos < data.length - 30) {
|
|
@@ -300,14 +397,14 @@ function extractFromBrokenZip(buffer) {
|
|
|
300
397
|
}
|
|
301
398
|
totalDecompressed += content.length * 2;
|
|
302
399
|
if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new KordocError("\uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC");
|
|
303
|
-
|
|
304
|
-
if (sectionText) texts.push(sectionText);
|
|
400
|
+
blocks.push(...parseSectionXml(content));
|
|
305
401
|
} catch {
|
|
306
402
|
continue;
|
|
307
403
|
}
|
|
308
404
|
}
|
|
309
|
-
if (
|
|
310
|
-
|
|
405
|
+
if (blocks.length === 0) throw new KordocError("\uC190\uC0C1\uB41C HWPX\uC5D0\uC11C \uC139\uC158 \uB370\uC774\uD130\uB97C \uBCF5\uAD6C\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
406
|
+
const markdown = blocksToMarkdown(blocks);
|
|
407
|
+
return { markdown, blocks };
|
|
311
408
|
}
|
|
312
409
|
async function resolveSectionPaths(zip) {
|
|
313
410
|
const manifestPaths = ["Contents/content.hpf", "content.hpf"];
|
|
@@ -579,7 +676,7 @@ var require2 = createRequire(import.meta.url);
|
|
|
579
676
|
var CFB = require2("cfb");
|
|
580
677
|
var MAX_SECTIONS = 100;
|
|
581
678
|
var MAX_TOTAL_DECOMPRESS = 100 * 1024 * 1024;
|
|
582
|
-
function parseHwp5Document(buffer) {
|
|
679
|
+
function parseHwp5Document(buffer, options) {
|
|
583
680
|
const cfb = CFB.parse(buffer);
|
|
584
681
|
const headerEntry = CFB.find(cfb, "/FileHeader");
|
|
585
682
|
if (!headerEntry?.content) throw new KordocError("FileHeader \uC2A4\uD2B8\uB9BC \uC5C6\uC74C");
|
|
@@ -588,18 +685,73 @@ function parseHwp5Document(buffer) {
|
|
|
588
685
|
if (header.flags & FLAG_ENCRYPTED) throw new KordocError("\uC554\uD638\uD654\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
|
|
589
686
|
if (header.flags & FLAG_DRM) throw new KordocError("DRM \uBCF4\uD638\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
|
|
590
687
|
const compressed = (header.flags & FLAG_COMPRESSED) !== 0;
|
|
688
|
+
const metadata = {
|
|
689
|
+
version: `${header.versionMajor}.x`
|
|
690
|
+
};
|
|
691
|
+
extractHwp5Metadata(cfb, metadata);
|
|
591
692
|
const sections = findSections(cfb);
|
|
592
693
|
if (sections.length === 0) throw new KordocError("\uC139\uC158 \uC2A4\uD2B8\uB9BC\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
694
|
+
metadata.pageCount = sections.length;
|
|
695
|
+
const pageFilter = options?.pages ? parsePageRange(options.pages, sections.length) : null;
|
|
593
696
|
const blocks = [];
|
|
594
697
|
let totalDecompressed = 0;
|
|
595
|
-
for (
|
|
698
|
+
for (let si = 0; si < sections.length; si++) {
|
|
699
|
+
if (pageFilter && !pageFilter.has(si + 1)) continue;
|
|
700
|
+
const sectionData = sections[si];
|
|
596
701
|
const data = compressed ? decompressStream(Buffer.from(sectionData)) : Buffer.from(sectionData);
|
|
597
702
|
totalDecompressed += data.length;
|
|
598
703
|
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
599
704
|
const records = readRecords(data);
|
|
600
705
|
blocks.push(...parseSection(records));
|
|
601
706
|
}
|
|
602
|
-
|
|
707
|
+
const markdown = blocksToMarkdown(blocks);
|
|
708
|
+
return { markdown, blocks, metadata };
|
|
709
|
+
}
|
|
710
|
+
/**
 * Best-effort extraction of title / author / description from the summary
 * information stream of an HWP5 compound file.
 *
 * Looks up "/HwpSummaryInformation" (falling back to "/SummaryInformation"),
 * walks the property table of the first property set and copies string
 * properties with ids 2, 4 and 6 into `metadata.title`, `metadata.author`
 * and `metadata.description`. Every offset is bounds-checked and any error
 * is swallowed on purpose so bad metadata never fails the parse.
 *
 * NOTE(review): only values with type code 30 are read and they are decoded
 * as UTF-8 — presumably this is the OLE VT_LPSTR type; confirm against the
 * property-set specification.
 *
 * @param {object} cfb Parsed compound file, as passed to `CFB.find`.
 * @param {object} metadata Target object; filled in place.
 * @returns {void}
 */
function extractHwp5Metadata(cfb, metadata) {
  const fieldByPropId = new Map([
    [2, "title"],
    [4, "author"],
    [6, "description"],
  ]);
  try {
    const entry = CFB.find(cfb, "/HwpSummaryInformation") || CFB.find(cfb, "/SummaryInformation");
    if (!entry?.content) return;
    const bytes = Buffer.from(entry.content);
    const size = bytes.length;
    if (size < 48) return;
    // Offset 24: number of property sets; offset 44: offset of the first set.
    if (bytes.readUInt32LE(24) === 0) return;
    const base = bytes.readUInt32LE(44);
    if (base >= size - 8) return;
    const propCount = bytes.readUInt32LE(base + 4);
    if (propCount === 0 || propCount > 100) return;
    for (let index = 0; index < propCount; index++) {
      // Each table slot is 8 bytes: property id, then offset relative to the set.
      const slot = base + 8 + index * 8;
      if (slot + 8 > size) break;
      const propId = bytes.readUInt32LE(slot);
      const valueAt = base + bytes.readUInt32LE(slot + 4);
      if (valueAt + 8 > size) continue;
      const field = fieldByPropId.get(propId);
      if (field === undefined) continue;
      if (bytes.readUInt32LE(valueAt) !== 30) continue;
      const byteCount = bytes.readUInt32LE(valueAt + 4);
      if (byteCount === 0 || byteCount > 1e4 || valueAt + 8 + byteCount > size) continue;
      const start = valueAt + 8;
      const text = bytes.subarray(start, start + byteCount).toString("utf8").replace(/\0+$/, "").trim();
      if (text) metadata[field] = text;
    }
  } catch {
    // Metadata is optional; ignore malformed summary streams.
  }
}
|
|
742
|
+
/**
 * Read only the metadata of an HWP5 file, without parsing section content.
 *
 * @param {Buffer} buffer Raw HWP5 compound-file bytes.
 * @returns {object} Metadata with `version` ("<major>.x"), whatever
 *   `extractHwp5Metadata` could recover, and `pageCount` set to the number
 *   of sections found.
 * @throws {KordocError} When the FileHeader stream is missing or its
 *   signature is not "HWP Document File".
 */
function extractHwp5MetadataOnly(buffer) {
  const cfb = CFB.parse(buffer);
  const rawHeader = CFB.find(cfb, "/FileHeader")?.content;
  if (!rawHeader) throw new KordocError("FileHeader \uC2A4\uD2B8\uB9BC \uC5C6\uC74C");

  const header = parseFileHeader(Buffer.from(rawHeader));
  if (header.signature !== "HWP Document File") {
    throw new KordocError("HWP \uC2DC\uADF8\uB2C8\uCC98 \uBD88\uC77C\uCE58");
  }

  const metadata = { version: `${header.versionMajor}.x` };
  extractHwp5Metadata(cfb, metadata);
  metadata.pageCount = findSections(cfb).length;
  return metadata;
}
|
|
604
756
|
function findSections(cfb) {
|
|
605
757
|
const sections = [];
|
|
@@ -761,7 +913,7 @@ import { getDocument, GlobalWorkerOptions } from "pdfjs-dist/legacy/build/pdf.mj
|
|
|
761
913
|
GlobalWorkerOptions.workerSrc = "";
|
|
762
914
|
var MAX_PAGES = 5e3;
|
|
763
915
|
var MAX_TOTAL_TEXT = 100 * 1024 * 1024;
|
|
764
|
-
async function parsePdfDocument(buffer) {
|
|
916
|
+
async function parsePdfDocument(buffer, options) {
|
|
765
917
|
const doc = await getDocument({
|
|
766
918
|
data: new Uint8Array(buffer),
|
|
767
919
|
useSystemFonts: true,
|
|
@@ -770,12 +922,17 @@ async function parsePdfDocument(buffer) {
|
|
|
770
922
|
}).promise;
|
|
771
923
|
try {
|
|
772
924
|
const pageCount = doc.numPages;
|
|
773
|
-
if (pageCount === 0) return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4." };
|
|
925
|
+
if (pageCount === 0) return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.", blocks: [] };
|
|
926
|
+
const metadata = { pageCount };
|
|
927
|
+
await extractPdfMetadata(doc, metadata);
|
|
774
928
|
const pageTexts = [];
|
|
929
|
+
const blocks = [];
|
|
775
930
|
let totalChars = 0;
|
|
776
931
|
let totalTextBytes = 0;
|
|
777
932
|
const effectivePageCount = Math.min(pageCount, MAX_PAGES);
|
|
933
|
+
const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
|
|
778
934
|
for (let i = 1; i <= effectivePageCount; i++) {
|
|
935
|
+
if (pageFilter && !pageFilter.has(i)) continue;
|
|
779
936
|
const page = await doc.getPage(i);
|
|
780
937
|
const tc = await page.getTextContent();
|
|
781
938
|
const pageText = extractPageContent(tc.items);
|
|
@@ -783,13 +940,65 @@ async function parsePdfDocument(buffer) {
|
|
|
783
940
|
totalTextBytes += pageText.length * 2;
|
|
784
941
|
if (totalTextBytes > MAX_TOTAL_TEXT) throw new KordocError("\uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uD06C\uAE30 \uCD08\uACFC");
|
|
785
942
|
pageTexts.push(pageText);
|
|
943
|
+
blocks.push({ type: "paragraph", text: pageText });
|
|
786
944
|
}
|
|
787
|
-
|
|
788
|
-
|
|
945
|
+
const parsedPageCount = pageFilter ? pageFilter.size : effectivePageCount;
|
|
946
|
+
if (totalChars / Math.max(parsedPageCount, 1) < 10) {
|
|
947
|
+
if (options?.ocr) {
|
|
948
|
+
try {
|
|
949
|
+
const { ocrPages } = await import("./provider-JB7SY74K.js");
|
|
950
|
+
const ocrBlocks = await ocrPages(doc, options.ocr, pageFilter, effectivePageCount);
|
|
951
|
+
if (ocrBlocks.length > 0) {
|
|
952
|
+
const ocrMarkdown = ocrBlocks.map((b) => b.text || "").filter(Boolean).join("\n\n");
|
|
953
|
+
return { success: true, fileType: "pdf", markdown: ocrMarkdown, pageCount: parsedPageCount, blocks: ocrBlocks, metadata, isImageBased: true };
|
|
954
|
+
}
|
|
955
|
+
} catch {
|
|
956
|
+
}
|
|
957
|
+
}
|
|
958
|
+
return { success: false, fileType: "pdf", pageCount, isImageBased: true, error: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`, code: "IMAGE_BASED_PDF" };
|
|
789
959
|
}
|
|
790
960
|
let markdown = pageTexts.filter((t) => t.trim()).join("\n\n");
|
|
791
961
|
markdown = cleanPdfText(markdown);
|
|
792
|
-
return { success: true, fileType: "pdf", markdown, pageCount:
|
|
962
|
+
return { success: true, fileType: "pdf", markdown, pageCount: parsedPageCount, blocks, metadata };
|
|
963
|
+
} finally {
|
|
964
|
+
await doc.destroy().catch(() => {
|
|
965
|
+
});
|
|
966
|
+
}
|
|
967
|
+
}
|
|
968
|
+
/**
 * Best-effort copy of the PDF info dictionary into `metadata`.
 *
 * Title, Author, Creator and Subject are copied (trimmed) only when they
 * are non-blank strings; Subject is stored as `description`. Keywords are
 * split on "," or ";". Date strings are converted with `parsePdfDate`.
 * Any error is swallowed on purpose so metadata never fails the parse.
 *
 * @param {object} doc Loaded PDF document exposing `getMetadata()`.
 * @param {object} metadata Target object; filled in place.
 * @returns {Promise<void>}
 */
async function extractPdfMetadata(doc, metadata) {
  // Trimmed string value, or "" when the entry is not a string.
  const trimmedString = (value) => (typeof value === "string" ? value.trim() : "");
  try {
    const info = (await doc.getMetadata())?.info;
    if (!info) return;

    const simpleFields = [
      ["Title", "title"],
      ["Author", "author"],
      ["Creator", "creator"],
      ["Subject", "description"],
    ];
    for (const [infoKey, field] of simpleFields) {
      const text = trimmedString(info[infoKey]);
      if (text) metadata[field] = text;
    }

    if (trimmedString(info.Keywords)) {
      metadata.keywords = info.Keywords.split(/[,;]/).map((k) => k.trim()).filter(Boolean);
    }
    if (typeof info.CreationDate === "string") metadata.createdAt = parsePdfDate(info.CreationDate);
    if (typeof info.ModDate === "string") metadata.modifiedAt = parsePdfDate(info.ModDate);
  } catch {
    // Metadata is optional; ignore any failure while reading it.
  }
}
|
|
985
|
+
/**
 * Convert a PDF date string ("D:YYYYMMDDHHmmSS" followed by an optional
 * "Z" or "+HH'mm'" / "-HH'mm'" offset) into an ISO 8601 style timestamp.
 *
 * Missing month/day default to "01" and missing time parts to "00". When
 * the string carries a UTC offset it is appended ("Z" or "+HH:mm") so the
 * instant is not silently reinterpreted as local time; without an offset
 * the result has no zone suffix.
 *
 * @param {string} dateStr PDF date string.
 * @returns {string | undefined} Timestamp, or undefined when no "D:YYYY"
 *   prefix is found.
 */
function parsePdfDate(dateStr) {
  const m = dateStr.match(
    /D:(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?(?:(Z)|([+-])(\d{2})'?(\d{2})?'?)?/
  );
  if (!m) return void 0;
  const [, year, month = "01", day = "01", hour = "00", min = "00", sec = "00", utc, sign, offHour, offMin = "00"] = m;
  let zone = "";
  if (utc) zone = "Z";
  else if (sign) zone = `${sign}${offHour}:${offMin}`;
  return `${year}-${month}-${day}T${hour}:${min}:${sec}${zone}`;
}
|
|
991
|
+
async function extractPdfMetadataOnly(buffer) {
|
|
992
|
+
const doc = await getDocument({
|
|
993
|
+
data: new Uint8Array(buffer),
|
|
994
|
+
useSystemFonts: true,
|
|
995
|
+
disableFontFace: true,
|
|
996
|
+
isEvalSupported: false
|
|
997
|
+
}).promise;
|
|
998
|
+
try {
|
|
999
|
+
const metadata = { pageCount: doc.numPages };
|
|
1000
|
+
await extractPdfMetadata(doc, metadata);
|
|
1001
|
+
return metadata;
|
|
793
1002
|
} finally {
|
|
794
1003
|
await doc.destroy().catch(() => {
|
|
795
1004
|
});
|
|
@@ -1067,53 +1276,356 @@ function mergeKoreanLines(text) {
|
|
|
1067
1276
|
return result.join("\n");
|
|
1068
1277
|
}
|
|
1069
1278
|
|
|
1279
|
+
// src/form/recognize.ts
|
|
1280
|
+
// Substrings that mark a table cell as a form label (common Korean form
// captions such as name, address, phone number, date, amount).
var LABEL_KEYWORDS = /* @__PURE__ */ new Set([
  "\uC131\uBA85",
  "\uC774\uB984",
  "\uC8FC\uC18C",
  "\uC804\uD654",
  "\uC804\uD654\uBC88\uD638",
  "\uD734\uB300\uD3F0",
  "\uD578\uB4DC\uD3F0",
  "\uC5F0\uB77D\uCC98",
  "\uC0DD\uB144\uC6D4\uC77C",
  "\uC8FC\uBBFC\uB4F1\uB85D\uBC88\uD638",
  "\uC18C\uC18D",
  "\uC9C1\uC704",
  "\uC9C1\uAE09",
  "\uBD80\uC11C",
  "\uC774\uBA54\uC77C",
  "\uD329\uC2A4",
  "\uD559\uAD50",
  "\uD559\uB144",
  "\uBC18",
  "\uBC88\uD638",
  "\uC2E0\uCCAD\uC778",
  "\uB300\uD45C\uC790",
  "\uB2F4\uB2F9\uC790",
  "\uC791\uC131\uC790",
  "\uD655\uC778\uC790",
  "\uC2B9\uC778\uC790",
  "\uC77C\uC2DC",
  "\uB0A0\uC9DC",
  "\uAE30\uAC04",
  "\uC7A5\uC18C",
  "\uBAA9\uC801",
  "\uC0AC\uC720",
  "\uBE44\uACE0",
  "\uAE08\uC561",
  "\uC218\uB7C9",
  "\uB2E8\uAC00",
  "\uD569\uACC4",
  "\uACC4",
  "\uC18C\uACC4"
]);

/**
 * Heuristically decide whether a cell's text is a form label.
 *
 * A non-blank text of at most 30 characters is a label when it contains a
 * known keyword, consists of 2-8 Hangul / space / punctuation characters,
 * or is letters followed by a colon (ASCII ":" or full-width U+FF1A).
 *
 * @param {string} text Cell text.
 * @returns {boolean} True when the text looks like a label.
 */
function isLabelCell(text) {
  const trimmed = text.trim();
  if (!trimmed || trimmed.length > 30) return false;
  for (const kw of LABEL_KEYWORDS) {
    if (trimmed.includes(kw)) return true;
  }
  // The character class contains no digits, so no separate digit check is needed.
  if (/^[가-힣\s()·:\uFF1A]{2,8}$/.test(trimmed)) return true;
  if (/^[가-힣A-Za-z\s]+[:\uFF1A]$/.test(trimmed)) return true;
  return false;
}

/**
 * Collect label/value pairs from table blocks and from "label: value"
 * patterns in paragraph blocks.
 *
 * @param {Array<object>} blocks Parsed blocks; tables are read from
 *   `block.table`, paragraphs from `block.text`.
 * @returns {{fields: Array<{label: string, value: string, row: number, col: number}>, confidence: number}}
 *   Table fields first, then inline fields (row/col -1). `confidence` is
 *   the share of tables that yielded fields; with no tables it is 0.3 when
 *   any inline field was found and 0 otherwise.
 */
function extractFormFields(blocks) {
  const fields = [];
  let totalTables = 0;
  let formTables = 0;
  for (const block of blocks) {
    if (block.type !== "table" || !block.table) continue;
    totalTables++;
    const tableFields = extractFromTable(block.table);
    if (tableFields.length > 0) {
      formTables++;
      fields.push(...tableFields);
    }
  }
  for (const block of blocks) {
    if (block.type === "paragraph" && block.text) {
      fields.push(...extractInlineFields(block.text));
    }
  }
  const confidence = totalTables > 0 ? formTables / totalTables : fields.length > 0 ? 0.3 : 0;
  return { fields, confidence: Math.min(confidence, 1) };
}

/**
 * Extract label/value pairs from a single table.
 *
 * First pass: every horizontally adjacent pair whose left cell looks like a
 * label and whose right cell is non-blank. If that finds nothing and the
 * table has at least 2 rows and 2 columns, the first row is treated as a
 * header (every header cell must be 1-20 characters) and each non-blank
 * body cell is paired with its column header.
 *
 * Rows or cells missing from `table.cells` (ragged grids) are skipped
 * instead of throwing.
 *
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} table
 * @returns {Array<{label: string, value: string, row: number, col: number}>}
 */
function extractFromTable(table) {
  const fields = [];
  // Text of a cell, or "" when the row or cell does not exist.
  const cellText = (r, c) => table.cells?.[r]?.[c]?.text ?? "";
  if (table.cols >= 2) {
    for (let r = 0; r < table.rows; r++) {
      for (let c = 0; c < table.cols - 1; c++) {
        const rawLabel = cellText(r, c);
        const value = cellText(r, c + 1).trim();
        if (value && isLabelCell(rawLabel)) {
          fields.push({
            label: rawLabel.trim().replace(/[:\uFF1A]\s*$/, ""),
            value,
            row: r,
            col: c
          });
        }
      }
    }
  }
  if (fields.length === 0 && table.rows >= 2 && table.cols >= 2) {
    const headerRow = table.cells?.[0] ?? [];
    const allLabels = headerRow.length > 0 && headerRow.every((cell) => {
      const t = (cell?.text ?? "").trim();
      return t.length > 0 && t.length <= 20;
    });
    if (allLabels) {
      for (let r = 1; r < table.rows; r++) {
        for (let c = 0; c < table.cols; c++) {
          const label = (headerRow[c]?.text ?? "").trim();
          const value = cellText(r, c).trim();
          if (label && value) {
            fields.push({ label, value, row: r, col: c });
          }
        }
      }
    }
  }
  return fields;
}

/**
 * Find "label: value" pairs inside free text.
 *
 * A label is 2-10 Hangul or Latin letters followed by a colon (ASCII or
 * full-width); the value runs up to the next newline, comma or semicolon
 * (max 100 characters). A colon immediately followed by "//" is ignored so
 * URL schemes such as "https://" are not reported as fields.
 *
 * @param {string} text Paragraph text.
 * @returns {Array<{label: string, value: string, row: number, col: number}>}
 *   Fields with `row` and `col` set to -1.
 */
function extractInlineFields(text) {
  const fields = [];
  const pattern = /([가-힣A-Za-z]{2,10})\s*[:\uFF1A](?!\/\/)\s*([^\n,;]{1,100})/g;
  for (const match of text.matchAll(pattern)) {
    const label = match[1].trim();
    const value = match[2].trim();
    if (value) {
      fields.push({ label, value, row: -1, col: -1 });
    }
  }
  return fields;
}
|
|
1404
|
+
|
|
1405
|
+
// src/hwpx/generator.ts
|
|
1406
|
+
import JSZip2 from "jszip";
|
|
1407
|
+
|
|
1070
1408
|
// src/index.ts
|
|
1071
|
-
async function parse(buffer) {
|
|
1409
|
+
async function parse(buffer, options) {
|
|
1072
1410
|
if (!buffer || buffer.byteLength === 0) {
|
|
1073
|
-
return { success: false, fileType: "unknown", error: "\uBE48 \uBC84\uD37C\uC774\uAC70\uB098 \uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uC785\uB825\uC785\uB2C8\uB2E4." };
|
|
1411
|
+
return { success: false, fileType: "unknown", error: "\uBE48 \uBC84\uD37C\uC774\uAC70\uB098 \uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uC785\uB825\uC785\uB2C8\uB2E4.", code: "EMPTY_INPUT" };
|
|
1074
1412
|
}
|
|
1075
1413
|
const format = detectFormat(buffer);
|
|
1076
1414
|
switch (format) {
|
|
1077
1415
|
case "hwpx":
|
|
1078
|
-
return parseHwpx(buffer);
|
|
1416
|
+
return parseHwpx(buffer, options);
|
|
1079
1417
|
case "hwp":
|
|
1080
|
-
return parseHwp(buffer);
|
|
1418
|
+
return parseHwp(buffer, options);
|
|
1081
1419
|
case "pdf":
|
|
1082
|
-
return parsePdf(buffer);
|
|
1420
|
+
return parsePdf(buffer, options);
|
|
1083
1421
|
default:
|
|
1084
|
-
return { success: false, fileType: "unknown", error: "\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD30C\uC77C \uD615\uC2DD\uC785\uB2C8\uB2E4." };
|
|
1422
|
+
return { success: false, fileType: "unknown", error: "\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD30C\uC77C \uD615\uC2DD\uC785\uB2C8\uB2E4.", code: "UNSUPPORTED_FORMAT" };
|
|
1085
1423
|
}
|
|
1086
1424
|
}
|
|
1087
|
-
async function parseHwpx(buffer) {
|
|
1425
|
+
async function parseHwpx(buffer, options) {
|
|
1088
1426
|
try {
|
|
1089
|
-
const markdown = await parseHwpxDocument(buffer);
|
|
1090
|
-
return { success: true, fileType: "hwpx", markdown };
|
|
1427
|
+
const { markdown, blocks, metadata } = await parseHwpxDocument(buffer, options);
|
|
1428
|
+
return { success: true, fileType: "hwpx", markdown, blocks, metadata };
|
|
1091
1429
|
} catch (err) {
|
|
1092
|
-
return { success: false, fileType: "hwpx", error: err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328" };
|
|
1430
|
+
return { success: false, fileType: "hwpx", error: err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
|
|
1093
1431
|
}
|
|
1094
1432
|
}
|
|
1095
|
-
async function parseHwp(buffer) {
|
|
1433
|
+
async function parseHwp(buffer, options) {
|
|
1096
1434
|
try {
|
|
1097
|
-
const markdown = parseHwp5Document(Buffer.from(buffer));
|
|
1098
|
-
return { success: true, fileType: "hwp", markdown };
|
|
1435
|
+
const { markdown, blocks, metadata } = parseHwp5Document(Buffer.from(buffer), options);
|
|
1436
|
+
return { success: true, fileType: "hwp", markdown, blocks, metadata };
|
|
1099
1437
|
} catch (err) {
|
|
1100
|
-
return { success: false, fileType: "hwp", error: err instanceof Error ? err.message : "HWP \uD30C\uC2F1 \uC2E4\uD328" };
|
|
1438
|
+
return { success: false, fileType: "hwp", error: err instanceof Error ? err.message : "HWP \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
|
|
1101
1439
|
}
|
|
1102
1440
|
}
|
|
1103
|
-
async function parsePdf(buffer) {
|
|
1441
|
+
async function parsePdf(buffer, options) {
|
|
1104
1442
|
try {
|
|
1105
|
-
return await parsePdfDocument(buffer);
|
|
1443
|
+
return await parsePdfDocument(buffer, options);
|
|
1106
1444
|
} catch (err) {
|
|
1107
|
-
return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328" };
|
|
1445
|
+
return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
|
|
1446
|
+
}
|
|
1447
|
+
}
|
|
1448
|
+
|
|
1449
|
+
// src/diff/text-diff.ts
|
|
1450
|
+
/**
 * Similarity of two strings in the range 0..1, based on edit distance
 * relative to the longer string.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} 1 for identical inputs, 0 when exactly one side is
 *   empty/falsy, otherwise 1 - distance / longerLength.
 */
function similarity(a, b) {
  if (a === b) return 1;
  if (!a || !b) return 0;
  const longest = a.length > b.length ? a.length : b.length;
  return longest === 0 ? 1 : 1 - levenshtein(a, b) / longest;
}

/**
 * Same as `similarity`, but compares whitespace-normalized copies.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number}
 */
function normalizedSimilarity(a, b) {
  const left = normalize(a);
  const right = normalize(b);
  return similarity(left, right);
}

/**
 * Collapse every run of whitespace to a single space and trim both ends.
 *
 * @param {string} s
 * @returns {string}
 */
function normalize(s) {
  return s.trim().split(/\s+/).join(" ");
}

/**
 * Levenshtein edit distance (insert / delete / substitute, cost 1 each),
 * computed with a single rolling row sized by the shorter input.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} Minimum number of single-unit edits turning `a` into `b`.
 */
function levenshtein(a, b) {
  const [shorter, longer] = a.length > b.length ? [b, a] : [a, b];
  const width = shorter.length;
  const row = Array.from({ length: width + 1 }, (_, k) => k);
  for (let j = 1; j <= longer.length; j++) {
    // `diagonal` carries the previous row's value one column to the left.
    let diagonal = row[0];
    row[0] = j;
    for (let i = 1; i <= width; i++) {
      const above = row[i];
      row[i] = shorter[i - 1] === longer[j - 1]
        ? diagonal
        : 1 + Math.min(diagonal, above, row[i - 1]);
      diagonal = above;
    }
  }
  return row[width];
}
|
|
1483
|
+
|
|
1484
|
+
// src/diff/compare.ts
|
|
1485
|
+
// Minimum blockSimilarity score at which alignBlocks treats two blocks as a matching pair.
var SIMILARITY_THRESHOLD = 0.4;
|
|
1486
|
+
/**
 * Parses two documents concurrently and diffs their blocks.
 *
 * @param bufferA Raw data of the first ("before") document.
 * @param bufferB Raw data of the second ("after") document.
 * @param options Parse options, passed to `parse` for both documents.
 * @returns The result of `diffBlocks` on the two parsed block lists.
 * @throws {Error} When either document's parse result is not successful.
 */
async function compare(bufferA, bufferB, options) {
  const pending = [bufferA, bufferB].map((buf) => parse(buf, options));
  const [resultA, resultB] = await Promise.all(pending);

  if (!resultA.success) {
    throw new Error(`\uBB38\uC11CA \uD30C\uC2F1 \uC2E4\uD328: ${resultA.error}`);
  }
  if (!resultB.success) {
    throw new Error(`\uBB38\uC11CB \uD30C\uC2F1 \uC2E4\uD328: ${resultB.error}`);
  }
  return diffBlocks(resultA.blocks, resultB.blocks);
}
|
|
1495
|
+
/**
 * Builds a block-level diff between two block lists.
 *
 * The lists are first aligned with `alignBlocks`; each aligned pair is then
 * classified as unchanged, modified, removed or added. Paired blocks scoring
 * at least 0.99 in `blockSimilarity` are reported as unchanged with
 * similarity 1. Modified table pairs also get a per-cell diff from
 * `diffTableCells`.
 *
 * @param blocksA Blocks of the "before" document.
 * @param blocksB Blocks of the "after" document.
 * @returns `{ stats, diffs }` where `stats` counts each diff type and `diffs`
 *          lists the entries in alignment order.
 */
function diffBlocks(blocksA, blocksB) {
  const diffs = [];
  const stats = { added: 0, removed: 0, modified: 0, unchanged: 0 };

  for (const [before, after] of alignBlocks(blocksA, blocksB)) {
    if (!before && !after) {
      continue;
    }
    if (!after) {
      diffs.push({ type: "removed", before });
      stats.removed += 1;
      continue;
    }
    if (!before) {
      diffs.push({ type: "added", after });
      stats.added += 1;
      continue;
    }

    // Both sides present: decide between unchanged and modified.
    const sim = blockSimilarity(before, after);
    if (sim >= 0.99) {
      diffs.push({ type: "unchanged", before, after, similarity: 1 });
      stats.unchanged += 1;
      continue;
    }

    const entry = { type: "modified", before, after, similarity: sim };
    const bothTables =
      before.type === "table" && after.type === "table" && before.table && after.table;
    if (bothTables) {
      entry.cellDiffs = diffTableCells(before.table, after.table);
    }
    diffs.push(entry);
    stats.modified += 1;
  }

  return { stats, diffs };
}
|
|
1523
|
+
/**
 * Aligns two block lists with a longest-common-subsequence style dynamic
 * program, where two blocks "match" when their `blockSimilarity` score is at
 * least SIMILARITY_THRESHOLD.
 *
 * When the table would exceed 1e7 cells (a.length * b.length), the work is
 * handed to `fallbackAlign` instead.
 *
 * @param a Blocks of the first document.
 * @param b Blocks of the second document.
 * @returns Array of `[blockFromA | null, blockFromB | null]` pairs, in
 *          document order; unmatched blocks are paired with `null`.
 */
function alignBlocks(a, b) {
  const rowCount = a.length;
  const colCount = b.length;
  if (rowCount * colCount > 1e7) {
    return fallbackAlign(a, b);
  }

  // Memoized similarity lookup, keyed by the flattened (row, col) position.
  const scores = new Map();
  const isMatch = (ai, bj) => {
    const slot = ai * colCount + bj;
    let score = scores.get(slot);
    if (score === undefined) {
      score = blockSimilarity(a[ai], b[bj]);
      scores.set(slot, score);
    }
    return score >= SIMILARITY_THRESHOLD;
  };

  // lcs[r][c] = number of matched pairs between a[0..r) and b[0..c).
  const lcs = [];
  for (let r = 0; r <= rowCount; r++) {
    lcs.push(new Array(colCount + 1).fill(0));
  }
  for (let r = 1; r <= rowCount; r++) {
    const above = lcs[r - 1];
    const here = lcs[r];
    for (let c = 1; c <= colCount; c++) {
      here[c] = isMatch(r - 1, c - 1)
        ? above[c - 1] + 1
        : Math.max(above[c], here[c - 1]);
    }
  }

  // Walk back from the bottom-right corner to recover the matched pairs.
  const matched = [];
  let r = rowCount;
  let c = colCount;
  while (r > 0 && c > 0) {
    if (isMatch(r - 1, c - 1) && lcs[r][c] === lcs[r - 1][c - 1] + 1) {
      matched.push([r - 1, c - 1]);
      r -= 1;
      c -= 1;
    } else if (lcs[r - 1][c] >= lcs[r][c - 1]) {
      r -= 1;
    } else {
      c -= 1;
    }
  }
  matched.reverse();

  // Emit unmatched blocks around each matched pair, a-side first.
  const aligned = [];
  let nextA = 0;
  let nextB = 0;
  for (const [pairA, pairB] of matched) {
    for (; nextA < pairA; nextA++) aligned.push([a[nextA], null]);
    for (; nextB < pairB; nextB++) aligned.push([null, b[nextB]]);
    aligned.push([a[nextA], b[nextB]]);
    nextA += 1;
    nextB += 1;
  }
  for (; nextA < rowCount; nextA++) aligned.push([a[nextA], null]);
  for (; nextB < colCount; nextB++) aligned.push([null, b[nextB]]);
  return aligned;
}
|
|
1571
|
+
/**
 * Pairs blocks purely by index, with no similarity check.
 *
 * @param a Blocks of the first document.
 * @param b Blocks of the second document.
 * @returns One `[a[i] || null, b[i] || null]` pair per index up to the
 *          length of the longer list.
 */
function fallbackAlign(a, b) {
  const pairCount = Math.max(a.length, b.length);
  return Array.from({ length: pairCount }, (_, idx) => [a[idx] || null, b[idx] || null]);
}
|
|
1579
|
+
/**
 * Scores how similar two blocks are.
 *
 * Blocks of different types score 0. Paragraphs are compared by their text
 * (missing text counts as ""), tables by `tableSimilarity` when both carry a
 * `table` payload. Every other case scores 0.
 *
 * @param a First block.
 * @param b Second block.
 * @returns The similarity score of the two blocks.
 */
function blockSimilarity(a, b) {
  if (a.type !== b.type) {
    return 0;
  }
  switch (a.type) {
    case "paragraph":
      return normalizedSimilarity(a.text || "", b.text || "");
    case "table":
      return a.table && b.table ? tableSimilarity(a.table, b.table) : 0;
    default:
      return 0;
  }
}
|
|
1589
|
+
/**
 * Scores how similar two tables are by blending a size score (weight 0.3)
 * with a text-content score (weight 0.7).
 *
 * The size score compares `rows * cols` of the two tables; the content score
 * is `normalizedSimilarity` of all cell texts joined with single spaces.
 *
 * @param a First table (`rows`, `cols`, `cells`).
 * @param b Second table (`rows`, `cols`, `cells`).
 * @returns The weighted similarity score.
 */
function tableSimilarity(a, b) {
  const areaA = a.rows * a.cols;
  const areaB = b.rows * b.cols;
  // The extra 1 in Math.max keeps the divisor non-zero for empty tables.
  const dimSim = 1 - Math.abs(areaA - areaB) / Math.max(areaA, areaB, 1);

  const joinedText = (table) => table.cells.flat().map((cell) => cell.text).join(" ");
  const contentSim = normalizedSimilarity(joinedText(a), joinedText(b));

  return dimSim * 0.3 + contentSim * 0.7;
}
|
|
1596
|
+
/**
 * Produces a cell-by-cell diff grid for two tables.
 *
 * The grid spans the larger row count and the larger column count of the two
 * tables. A position outside a table's `rows`/`cols` bounds has no text for
 * that side.
 *
 * @param a "Before" table (`rows`, `cols`, `cells[r][c].text`).
 * @param b "After" table (`rows`, `cols`, `cells[r][c].text`).
 * @returns A 2-D array of `{ type, before, after }` entries where `type` is
 *          "added", "removed", "unchanged" or "modified".
 */
function diffTableCells(a, b) {
  const rowCount = Math.max(a.rows, b.rows);
  const colCount = Math.max(a.cols, b.cols);

  // Text at (r, c), or undefined when the position lies outside the table.
  const textAt = (table, r, c) =>
    r < table.rows && c < table.cols ? table.cells[r][c].text : undefined;

  // A missing "before" wins over a missing "after", so a position absent
  // from both tables is labelled "added".
  const classify = (before, after) => {
    if (before === undefined) return "added";
    if (after === undefined) return "removed";
    return before === after ? "unchanged" : "modified";
  };

  return Array.from({ length: rowCount }, (_row, r) =>
    Array.from({ length: colCount }, (_col, c) => {
      const before = textAt(a, r, c);
      const after = textAt(b, r, c);
      return { type: classify(before, after), before, after };
    }),
  );
}
|
|
1110
1616
|
|
|
1111
1617
|
export {
|
|
1112
1618
|
detectFormat,
|
|
1619
|
+
blocksToMarkdown,
|
|
1113
1620
|
VERSION,
|
|
1114
1621
|
toArrayBuffer,
|
|
1115
1622
|
KordocError,
|
|
1116
1623
|
sanitizeError,
|
|
1624
|
+
extractHwpxMetadataOnly,
|
|
1625
|
+
extractHwp5MetadataOnly,
|
|
1626
|
+
extractPdfMetadataOnly,
|
|
1627
|
+
compare,
|
|
1628
|
+
extractFormFields,
|
|
1117
1629
|
parse
|
|
1118
1630
|
};
|
|
1119
|
-
//# sourceMappingURL=chunk-
|
|
1631
|
+
//# sourceMappingURL=chunk-FC5R5FMV.js.map
|