kordoc 1.4.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +18 -3
- package/dist/{chunk-BWZW234S.js → chunk-5SZWGBNL.js} +1083 -57
- package/dist/chunk-5SZWGBNL.js.map +1 -0
- package/dist/cli.js +2 -2
- package/dist/index.cjs +1085 -59
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +55 -2
- package/dist/index.d.ts +55 -2
- package/dist/index.js +1082 -56
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +18 -4
- package/dist/mcp.js.map +1 -1
- package/dist/{provider-JB7SY74K.js → provider-A4FHJSID.js} +2 -2
- package/dist/provider-A4FHJSID.js.map +1 -0
- package/dist/{watch-LIGKH3QS.js → watch-YCWNFYAW.js} +2 -2
- package/package.json +2 -1
- package/dist/chunk-BWZW234S.js.map +0 -1
- package/dist/provider-JB7SY74K.js.map +0 -1
- /package/dist/{watch-LIGKH3QS.js.map → watch-YCWNFYAW.js.map} +0 -0
package/dist/index.cjs
CHANGED
|
@@ -44,7 +44,7 @@ async function ocrPages(doc, provider, pageFilter, effectivePageCount) {
|
|
|
44
44
|
const imageData = await renderPageToPng(page);
|
|
45
45
|
const text = await provider(imageData, i, "image/png");
|
|
46
46
|
if (text.trim()) {
|
|
47
|
-
blocks.push({ type: "paragraph", text: text.trim() });
|
|
47
|
+
blocks.push({ type: "paragraph", text: text.trim(), pageNumber: i });
|
|
48
48
|
}
|
|
49
49
|
} catch {
|
|
50
50
|
}
|
|
@@ -182,8 +182,29 @@ function blocksToMarkdown(blocks) {
|
|
|
182
182
|
const lines = [];
|
|
183
183
|
for (let i = 0; i < blocks.length; i++) {
|
|
184
184
|
const block = blocks[i];
|
|
185
|
+
if (block.type === "heading" && block.text) {
|
|
186
|
+
const prefix = "#".repeat(Math.min(block.level || 2, 6));
|
|
187
|
+
lines.push("", `${prefix} ${block.text}`, "");
|
|
188
|
+
continue;
|
|
189
|
+
}
|
|
190
|
+
if (block.type === "separator") {
|
|
191
|
+
lines.push("", "---", "");
|
|
192
|
+
continue;
|
|
193
|
+
}
|
|
194
|
+
if (block.type === "list" && block.text) {
|
|
195
|
+
const alreadyNumbered = block.listType === "ordered" && /^\d+\.\s/.test(block.text);
|
|
196
|
+
const prefix = alreadyNumbered ? "" : block.listType === "ordered" ? "1. " : "- ";
|
|
197
|
+
lines.push(`${prefix}${block.text}`);
|
|
198
|
+
if (block.children) {
|
|
199
|
+
for (const child of block.children) {
|
|
200
|
+
const childPrefix = child.listType === "ordered" ? "1." : "-";
|
|
201
|
+
lines.push(` ${childPrefix} ${child.text || ""}`);
|
|
202
|
+
}
|
|
203
|
+
}
|
|
204
|
+
continue;
|
|
205
|
+
}
|
|
185
206
|
if (block.type === "paragraph" && block.text) {
|
|
186
|
-
|
|
207
|
+
let text = block.text;
|
|
187
208
|
if (/^\[별표\s*\d+/.test(text)) {
|
|
188
209
|
const nextBlock = blocks[i + 1];
|
|
189
210
|
if (nextBlock?.type === "paragraph" && nextBlock.text && /관련\)?$/.test(nextBlock.text)) {
|
|
@@ -198,9 +219,19 @@ function blocksToMarkdown(blocks) {
|
|
|
198
219
|
lines.push(`*${text}*`, "");
|
|
199
220
|
continue;
|
|
200
221
|
}
|
|
222
|
+
if (block.href) {
|
|
223
|
+
text = `[${text}](${block.href})`;
|
|
224
|
+
}
|
|
225
|
+
if (block.footnoteText) {
|
|
226
|
+
text += ` (\uC8FC: ${block.footnoteText})`;
|
|
227
|
+
}
|
|
201
228
|
lines.push(text);
|
|
202
229
|
} else if (block.type === "table" && block.table) {
|
|
230
|
+
if (lines.length > 0 && lines[lines.length - 1] !== "") {
|
|
231
|
+
lines.push("");
|
|
232
|
+
}
|
|
203
233
|
lines.push(tableToMarkdown(block.table));
|
|
234
|
+
lines.push("");
|
|
204
235
|
}
|
|
205
236
|
}
|
|
206
237
|
return lines.join("\n").trim();
|
|
@@ -251,7 +282,7 @@ function tableToMarkdown(table) {
|
|
|
251
282
|
}
|
|
252
283
|
|
|
253
284
|
// src/utils.ts
|
|
254
|
-
var VERSION = true ? "1.4.0" : "0.0.0-dev";
|
|
285
|
+
var VERSION = true ? "1.5.0" : "0.0.0-dev";
|
|
255
286
|
var KordocError = class extends Error {
|
|
256
287
|
constructor(message) {
|
|
257
288
|
super(message);
|
|
@@ -310,6 +341,75 @@ var MAX_ZIP_ENTRIES = 500;
|
|
|
310
341
|
function clampSpan(val, max) {
|
|
311
342
|
return Math.max(1, Math.min(val, max));
|
|
312
343
|
}
|
|
344
|
+
async function extractHwpxStyles(zip) {
|
|
345
|
+
const result = {
|
|
346
|
+
charProperties: /* @__PURE__ */ new Map(),
|
|
347
|
+
styles: /* @__PURE__ */ new Map()
|
|
348
|
+
};
|
|
349
|
+
const headerPaths = ["Contents/header.xml", "header.xml", "Contents/head.xml", "head.xml"];
|
|
350
|
+
for (const hp of headerPaths) {
|
|
351
|
+
const hpLower = hp.toLowerCase();
|
|
352
|
+
const file = zip.file(hp) || Object.values(zip.files).find((f) => f.name.toLowerCase() === hpLower) || null;
|
|
353
|
+
if (!file) continue;
|
|
354
|
+
try {
|
|
355
|
+
const xml = await file.async("text");
|
|
356
|
+
const parser = new import_xmldom.DOMParser();
|
|
357
|
+
const doc = parser.parseFromString(stripDtd(xml), "text/xml");
|
|
358
|
+
if (!doc.documentElement) continue;
|
|
359
|
+
parseCharProperties(doc, result.charProperties);
|
|
360
|
+
parseStyleElements(doc, result.styles);
|
|
361
|
+
break;
|
|
362
|
+
} catch {
|
|
363
|
+
continue;
|
|
364
|
+
}
|
|
365
|
+
}
|
|
366
|
+
return result;
|
|
367
|
+
}
|
|
368
|
+
function parseCharProperties(doc, map) {
|
|
369
|
+
const tagNames = ["hh:charPr", "charPr", "hp:charPr"];
|
|
370
|
+
for (const tagName of tagNames) {
|
|
371
|
+
const elements = doc.getElementsByTagName(tagName);
|
|
372
|
+
for (let i = 0; i < elements.length; i++) {
|
|
373
|
+
const el = elements[i];
|
|
374
|
+
const id = el.getAttribute("id") || el.getAttribute("IDRef") || "";
|
|
375
|
+
if (!id) continue;
|
|
376
|
+
const prop = {};
|
|
377
|
+
const height = el.getAttribute("height");
|
|
378
|
+
if (height) prop.fontSize = parseInt(height, 10) / 100;
|
|
379
|
+
const bold = el.getAttribute("bold");
|
|
380
|
+
if (bold === "true" || bold === "1") prop.bold = true;
|
|
381
|
+
const italic = el.getAttribute("italic");
|
|
382
|
+
if (italic === "true" || italic === "1") prop.italic = true;
|
|
383
|
+
const fontFaces = el.getElementsByTagName("*");
|
|
384
|
+
for (let j = 0; j < fontFaces.length; j++) {
|
|
385
|
+
const ff = fontFaces[j];
|
|
386
|
+
const localTag = (ff.tagName || "").replace(/^[^:]+:/, "");
|
|
387
|
+
if (localTag === "fontface" || localTag === "fontRef") {
|
|
388
|
+
const face = ff.getAttribute("face") || ff.getAttribute("FontFace");
|
|
389
|
+
if (face) {
|
|
390
|
+
prop.fontName = face;
|
|
391
|
+
break;
|
|
392
|
+
}
|
|
393
|
+
}
|
|
394
|
+
}
|
|
395
|
+
map.set(id, prop);
|
|
396
|
+
}
|
|
397
|
+
}
|
|
398
|
+
}
|
|
399
|
+
function parseStyleElements(doc, map) {
|
|
400
|
+
const tagNames = ["hh:style", "style", "hp:style"];
|
|
401
|
+
for (const tagName of tagNames) {
|
|
402
|
+
const elements = doc.getElementsByTagName(tagName);
|
|
403
|
+
for (let i = 0; i < elements.length; i++) {
|
|
404
|
+
const el = elements[i];
|
|
405
|
+
const id = el.getAttribute("id") || el.getAttribute("IDRef") || String(i);
|
|
406
|
+
const name = el.getAttribute("name") || el.getAttribute("engName") || "";
|
|
407
|
+
const charPrId = el.getAttribute("charPrIDRef") || void 0;
|
|
408
|
+
const paraPrId = el.getAttribute("paraPrIDRef") || void 0;
|
|
409
|
+
map.set(id, { name, charPrId, paraPrId });
|
|
410
|
+
}
|
|
411
|
+
}
|
|
412
|
+
}
|
|
313
413
|
function stripDtd(xml) {
|
|
314
414
|
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
315
415
|
}
|
|
@@ -333,6 +433,8 @@ async function parseHwpxDocument(buffer, options) {
|
|
|
333
433
|
}
|
|
334
434
|
const metadata = {};
|
|
335
435
|
await extractHwpxMetadata(zip, metadata);
|
|
436
|
+
const styleMap = await extractHwpxStyles(zip);
|
|
437
|
+
const warnings = [];
|
|
336
438
|
const sectionPaths = await resolveSectionPaths(zip);
|
|
337
439
|
if (sectionPaths.length === 0) throw new KordocError("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
338
440
|
metadata.pageCount = sectionPaths.length;
|
|
@@ -346,10 +448,12 @@ async function parseHwpxDocument(buffer, options) {
|
|
|
346
448
|
const xml = await file.async("text");
|
|
347
449
|
totalDecompressed += xml.length * 2;
|
|
348
450
|
if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
349
|
-
blocks.push(...parseSectionXml(xml));
|
|
451
|
+
blocks.push(...parseSectionXml(xml, styleMap, warnings, si + 1));
|
|
350
452
|
}
|
|
453
|
+
detectHwpxHeadings(blocks, styleMap);
|
|
454
|
+
const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
351
455
|
const markdown = blocksToMarkdown(blocks);
|
|
352
|
-
return { markdown, blocks, metadata };
|
|
456
|
+
return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0 };
|
|
353
457
|
}
|
|
354
458
|
async function extractHwpxMetadata(zip, metadata) {
|
|
355
459
|
try {
|
|
@@ -515,15 +619,50 @@ function parseSectionPathsFromManifest(xml) {
|
|
|
515
619
|
}
|
|
516
620
|
return Array.from(idToHref.entries()).filter(([id]) => isSectionId(id)).sort((a, b) => a[0].localeCompare(b[0])).map(([, href]) => href);
|
|
517
621
|
}
|
|
518
|
-
function parseSectionXml(xml) {
|
|
622
|
+
function detectHwpxHeadings(blocks, styleMap) {
|
|
623
|
+
let baseFontSize = 0;
|
|
624
|
+
const sizeFreq = /* @__PURE__ */ new Map();
|
|
625
|
+
for (const b of blocks) {
|
|
626
|
+
if (b.style?.fontSize) {
|
|
627
|
+
sizeFreq.set(b.style.fontSize, (sizeFreq.get(b.style.fontSize) || 0) + 1);
|
|
628
|
+
}
|
|
629
|
+
}
|
|
630
|
+
let maxCount = 0;
|
|
631
|
+
for (const [size, count] of sizeFreq) {
|
|
632
|
+
if (count > maxCount) {
|
|
633
|
+
maxCount = count;
|
|
634
|
+
baseFontSize = size;
|
|
635
|
+
}
|
|
636
|
+
}
|
|
637
|
+
for (const block of blocks) {
|
|
638
|
+
if (block.type !== "paragraph" || !block.text) continue;
|
|
639
|
+
const text = block.text.trim();
|
|
640
|
+
if (text.length === 0 || text.length > 200 || /^\d+$/.test(text)) continue;
|
|
641
|
+
let level = 0;
|
|
642
|
+
if (baseFontSize > 0 && block.style?.fontSize) {
|
|
643
|
+
const ratio = block.style.fontSize / baseFontSize;
|
|
644
|
+
if (ratio >= 1.5) level = 1;
|
|
645
|
+
else if (ratio >= 1.3) level = 2;
|
|
646
|
+
else if (ratio >= 1.15) level = 3;
|
|
647
|
+
}
|
|
648
|
+
if (/^제\d+[조장절편]/.test(text) && text.length <= 50) {
|
|
649
|
+
if (level === 0) level = 3;
|
|
650
|
+
}
|
|
651
|
+
if (level > 0) {
|
|
652
|
+
block.type = "heading";
|
|
653
|
+
block.level = level;
|
|
654
|
+
}
|
|
655
|
+
}
|
|
656
|
+
}
|
|
657
|
+
function parseSectionXml(xml, styleMap, warnings, sectionNum) {
|
|
519
658
|
const parser = new import_xmldom.DOMParser();
|
|
520
659
|
const doc = parser.parseFromString(stripDtd(xml), "text/xml");
|
|
521
660
|
if (!doc.documentElement) return [];
|
|
522
661
|
const blocks = [];
|
|
523
|
-
walkSection(doc.documentElement, blocks, null, []);
|
|
662
|
+
walkSection(doc.documentElement, blocks, null, [], styleMap, warnings, sectionNum);
|
|
524
663
|
return blocks;
|
|
525
664
|
}
|
|
526
|
-
function walkSection(node, blocks, tableCtx, tableStack) {
|
|
665
|
+
function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum) {
|
|
527
666
|
const children = node.childNodes;
|
|
528
667
|
if (!children) return;
|
|
529
668
|
for (let i = 0; i < children.length; i++) {
|
|
@@ -535,7 +674,7 @@ function walkSection(node, blocks, tableCtx, tableStack) {
|
|
|
535
674
|
case "tbl": {
|
|
536
675
|
if (tableCtx) tableStack.push(tableCtx);
|
|
537
676
|
const newTable = { rows: [], currentRow: [], cell: null };
|
|
538
|
-
walkSection(el, blocks, newTable, tableStack);
|
|
677
|
+
walkSection(el, blocks, newTable, tableStack, styleMap, warnings, sectionNum);
|
|
539
678
|
if (newTable.rows.length > 0) {
|
|
540
679
|
if (tableStack.length > 0) {
|
|
541
680
|
const parentTable = tableStack.pop();
|
|
@@ -545,7 +684,7 @@ function walkSection(node, blocks, tableCtx, tableStack) {
|
|
|
545
684
|
}
|
|
546
685
|
tableCtx = parentTable;
|
|
547
686
|
} else {
|
|
548
|
-
blocks.push({ type: "table", table: buildTable(newTable.rows) });
|
|
687
|
+
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
549
688
|
tableCtx = null;
|
|
550
689
|
}
|
|
551
690
|
} else {
|
|
@@ -556,7 +695,7 @@ function walkSection(node, blocks, tableCtx, tableStack) {
|
|
|
556
695
|
case "tr":
|
|
557
696
|
if (tableCtx) {
|
|
558
697
|
tableCtx.currentRow = [];
|
|
559
|
-
walkSection(el, blocks, tableCtx, tableStack);
|
|
698
|
+
walkSection(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum);
|
|
560
699
|
if (tableCtx.currentRow.length > 0) tableCtx.rows.push(tableCtx.currentRow);
|
|
561
700
|
tableCtx.currentRow = [];
|
|
562
701
|
}
|
|
@@ -564,7 +703,7 @@ function walkSection(node, blocks, tableCtx, tableStack) {
|
|
|
564
703
|
case "tc":
|
|
565
704
|
if (tableCtx) {
|
|
566
705
|
tableCtx.cell = { text: "", colSpan: 1, rowSpan: 1 };
|
|
567
|
-
walkSection(el, blocks, tableCtx, tableStack);
|
|
706
|
+
walkSection(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum);
|
|
568
707
|
if (tableCtx.cell) {
|
|
569
708
|
tableCtx.currentRow.push(tableCtx.cell);
|
|
570
709
|
tableCtx.cell = null;
|
|
@@ -580,25 +719,75 @@ function walkSection(node, blocks, tableCtx, tableStack) {
|
|
|
580
719
|
}
|
|
581
720
|
break;
|
|
582
721
|
case "p": {
|
|
583
|
-
const text = extractParagraphText(el);
|
|
722
|
+
const { text, href, footnote, style } = extractParagraphInfo(el, styleMap);
|
|
584
723
|
if (text) {
|
|
585
724
|
if (tableCtx?.cell) {
|
|
586
725
|
tableCtx.cell.text += (tableCtx.cell.text ? "\n" : "") + text;
|
|
587
726
|
} else if (!tableCtx) {
|
|
588
|
-
|
|
727
|
+
const block = { type: "paragraph", text, pageNumber: sectionNum };
|
|
728
|
+
if (style) block.style = style;
|
|
729
|
+
if (href) block.href = href;
|
|
730
|
+
if (footnote) block.footnoteText = footnote;
|
|
731
|
+
blocks.push(block);
|
|
589
732
|
}
|
|
590
733
|
}
|
|
591
|
-
|
|
734
|
+
tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum);
|
|
592
735
|
break;
|
|
593
736
|
}
|
|
737
|
+
// 이미지/그림 — 경고 수집
|
|
738
|
+
case "pic":
|
|
739
|
+
case "shape":
|
|
740
|
+
case "drawingObject":
|
|
741
|
+
if (warnings && sectionNum) {
|
|
742
|
+
warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
|
|
743
|
+
}
|
|
744
|
+
break;
|
|
594
745
|
default:
|
|
595
|
-
walkSection(el, blocks, tableCtx, tableStack);
|
|
746
|
+
walkSection(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum);
|
|
596
747
|
break;
|
|
597
748
|
}
|
|
598
749
|
}
|
|
599
750
|
}
|
|
600
|
-
function extractParagraphText(para) {
|
|
751
|
+
function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum) {
|
|
752
|
+
const children = node.childNodes;
|
|
753
|
+
if (!children) return tableCtx;
|
|
754
|
+
for (let i = 0; i < children.length; i++) {
|
|
755
|
+
const el = children[i];
|
|
756
|
+
if (el.nodeType !== 1) continue;
|
|
757
|
+
const tag = el.tagName || el.localName || "";
|
|
758
|
+
const localTag = tag.replace(/^[^:]+:/, "");
|
|
759
|
+
if (localTag === "tbl") {
|
|
760
|
+
if (tableCtx) tableStack.push(tableCtx);
|
|
761
|
+
const newTable = { rows: [], currentRow: [], cell: null };
|
|
762
|
+
walkSection(el, blocks, newTable, tableStack, styleMap, warnings, sectionNum);
|
|
763
|
+
if (newTable.rows.length > 0) {
|
|
764
|
+
if (tableStack.length > 0) {
|
|
765
|
+
const parentTable = tableStack.pop();
|
|
766
|
+
const nestedText = convertTableToText(newTable.rows);
|
|
767
|
+
if (parentTable.cell) {
|
|
768
|
+
parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
|
|
769
|
+
}
|
|
770
|
+
tableCtx = parentTable;
|
|
771
|
+
} else {
|
|
772
|
+
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
773
|
+
tableCtx = null;
|
|
774
|
+
}
|
|
775
|
+
} else {
|
|
776
|
+
tableCtx = tableStack.length > 0 ? tableStack.pop() : null;
|
|
777
|
+
}
|
|
778
|
+
} else if (localTag === "pic" || localTag === "shape" || localTag === "drawingObject") {
|
|
779
|
+
if (warnings && sectionNum) {
|
|
780
|
+
warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
|
|
781
|
+
}
|
|
782
|
+
}
|
|
783
|
+
}
|
|
784
|
+
return tableCtx;
|
|
785
|
+
}
|
|
786
|
+
function extractParagraphInfo(para, styleMap) {
|
|
601
787
|
let text = "";
|
|
788
|
+
let href;
|
|
789
|
+
let footnote;
|
|
790
|
+
let charPrId;
|
|
602
791
|
const walk = (node) => {
|
|
603
792
|
const children = node.childNodes;
|
|
604
793
|
if (!children) return;
|
|
@@ -627,6 +816,29 @@ function extractParagraphText(para) {
|
|
|
627
816
|
case "tbl":
|
|
628
817
|
break;
|
|
629
818
|
// 테이블은 walkSection에서 처리
|
|
819
|
+
// 하이퍼링크
|
|
820
|
+
case "hyperlink": {
|
|
821
|
+
const url = child.getAttribute("url") || child.getAttribute("href") || "";
|
|
822
|
+
if (url) href = url;
|
|
823
|
+
walk(child);
|
|
824
|
+
break;
|
|
825
|
+
}
|
|
826
|
+
// 각주/미주
|
|
827
|
+
case "footNote":
|
|
828
|
+
case "endNote":
|
|
829
|
+
case "fn":
|
|
830
|
+
case "en": {
|
|
831
|
+
const noteText = extractTextFromNode(child);
|
|
832
|
+
if (noteText) footnote = (footnote ? footnote + "; " : "") + noteText;
|
|
833
|
+
break;
|
|
834
|
+
}
|
|
835
|
+
// run 요소에서 charPrIDRef 추출
|
|
836
|
+
case "r": {
|
|
837
|
+
const runCharPr = child.getAttribute("charPrIDRef");
|
|
838
|
+
if (runCharPr && !charPrId) charPrId = runCharPr;
|
|
839
|
+
walk(child);
|
|
840
|
+
break;
|
|
841
|
+
}
|
|
630
842
|
default:
|
|
631
843
|
walk(child);
|
|
632
844
|
break;
|
|
@@ -634,16 +846,43 @@ function extractParagraphText(para) {
|
|
|
634
846
|
}
|
|
635
847
|
};
|
|
636
848
|
walk(para);
|
|
637
|
-
|
|
849
|
+
const cleanText = text.replace(/[ \t]+/g, " ").trim();
|
|
850
|
+
let style;
|
|
851
|
+
if (styleMap && charPrId) {
|
|
852
|
+
const charProp = styleMap.charProperties.get(charPrId);
|
|
853
|
+
if (charProp) {
|
|
854
|
+
style = {};
|
|
855
|
+
if (charProp.fontSize) style.fontSize = charProp.fontSize;
|
|
856
|
+
if (charProp.bold) style.bold = true;
|
|
857
|
+
if (charProp.italic) style.italic = true;
|
|
858
|
+
if (charProp.fontName) style.fontName = charProp.fontName;
|
|
859
|
+
if (!style.fontSize && !style.bold && !style.italic) style = void 0;
|
|
860
|
+
}
|
|
861
|
+
}
|
|
862
|
+
return { text: cleanText, href, footnote, style };
|
|
863
|
+
}
|
|
864
|
+
function extractTextFromNode(node) {
|
|
865
|
+
let result = "";
|
|
866
|
+
const children = node.childNodes;
|
|
867
|
+
if (!children) return result;
|
|
868
|
+
for (let i = 0; i < children.length; i++) {
|
|
869
|
+
const child = children[i];
|
|
870
|
+
if (child.nodeType === 3) result += child.textContent || "";
|
|
871
|
+
else if (child.nodeType === 1) result += extractTextFromNode(child);
|
|
872
|
+
}
|
|
873
|
+
return result.trim();
|
|
638
874
|
}
|
|
639
875
|
|
|
640
876
|
// src/hwp5/record.ts
|
|
641
877
|
var import_zlib2 = require("zlib");
|
|
642
878
|
var TAG_PARA_HEADER = 66;
|
|
643
879
|
var TAG_PARA_TEXT = 67;
|
|
880
|
+
var TAG_CHAR_SHAPE = 68;
|
|
644
881
|
var TAG_CTRL_HEADER = 71;
|
|
645
882
|
var TAG_LIST_HEADER = 72;
|
|
646
883
|
var TAG_TABLE = 77;
|
|
884
|
+
var TAG_DOC_CHAR_SHAPE = 55;
|
|
885
|
+
var TAG_DOC_STYLE = 58;
|
|
647
886
|
var CHAR_LINE = 0;
|
|
648
887
|
var CHAR_PARA = 13;
|
|
649
888
|
var CHAR_TAB = 9;
|
|
@@ -694,6 +933,51 @@ function parseFileHeader(data) {
|
|
|
694
933
|
flags: data.readUInt32LE(36)
|
|
695
934
|
};
|
|
696
935
|
}
|
|
936
|
+
function parseDocInfo(records) {
|
|
937
|
+
const charShapes = [];
|
|
938
|
+
const styles = [];
|
|
939
|
+
for (const rec of records) {
|
|
940
|
+
if (rec.tagId === TAG_DOC_CHAR_SHAPE && rec.data.length >= 18) {
|
|
941
|
+
if (rec.data.length >= 50) {
|
|
942
|
+
const fontSize = rec.data.readUInt32LE(42);
|
|
943
|
+
const attrFlags = rec.data.readUInt32LE(46);
|
|
944
|
+
charShapes.push({ fontSize, attrFlags });
|
|
945
|
+
} else {
|
|
946
|
+
charShapes.push({ fontSize: 0, attrFlags: 0 });
|
|
947
|
+
}
|
|
948
|
+
}
|
|
949
|
+
if (rec.tagId === TAG_DOC_STYLE && rec.data.length >= 8) {
|
|
950
|
+
try {
|
|
951
|
+
let offset = 0;
|
|
952
|
+
const nameLen = rec.data.readUInt16LE(offset);
|
|
953
|
+
offset += 2;
|
|
954
|
+
const nameBytes = nameLen * 2;
|
|
955
|
+
const name = nameBytes > 0 && offset + nameBytes <= rec.data.length ? rec.data.subarray(offset, offset + nameBytes).toString("utf16le") : "";
|
|
956
|
+
offset += nameBytes;
|
|
957
|
+
let nameKo = "";
|
|
958
|
+
if (offset + 2 <= rec.data.length) {
|
|
959
|
+
const nameKoLen = rec.data.readUInt16LE(offset);
|
|
960
|
+
offset += 2;
|
|
961
|
+
const nameKoBytes = nameKoLen * 2;
|
|
962
|
+
if (nameKoBytes > 0 && offset + nameKoBytes <= rec.data.length) {
|
|
963
|
+
nameKo = rec.data.subarray(offset, offset + nameKoBytes).toString("utf16le");
|
|
964
|
+
}
|
|
965
|
+
offset += nameKoBytes;
|
|
966
|
+
}
|
|
967
|
+
const type = offset < rec.data.length ? rec.data.readUInt8(offset) : 0;
|
|
968
|
+
offset += 1;
|
|
969
|
+
offset += 2;
|
|
970
|
+
offset += 2;
|
|
971
|
+
const paraShapeId = offset + 2 <= rec.data.length ? rec.data.readUInt16LE(offset) : 0;
|
|
972
|
+
offset += 2;
|
|
973
|
+
const charShapeId = offset + 2 <= rec.data.length ? rec.data.readUInt16LE(offset) : 0;
|
|
974
|
+
styles.push({ name, nameKo, charShapeId, paraShapeId, type });
|
|
975
|
+
} catch {
|
|
976
|
+
}
|
|
977
|
+
}
|
|
978
|
+
}
|
|
979
|
+
return { charShapes, styles };
|
|
980
|
+
}
|
|
697
981
|
function extractText(data) {
|
|
698
982
|
let result = "";
|
|
699
983
|
let i = 0;
|
|
@@ -759,6 +1043,8 @@ function parseHwp5Document(buffer, options) {
|
|
|
759
1043
|
version: `${header.versionMajor}.x`
|
|
760
1044
|
};
|
|
761
1045
|
extractHwp5Metadata(cfb, metadata);
|
|
1046
|
+
const docInfo = parseDocInfoStream(cfb, compressed);
|
|
1047
|
+
const warnings = [];
|
|
762
1048
|
const sections = findSections(cfb);
|
|
763
1049
|
if (sections.length === 0) throw new KordocError("\uC139\uC158 \uC2A4\uD2B8\uB9BC\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
764
1050
|
metadata.pageCount = sections.length;
|
|
@@ -772,10 +1058,73 @@ function parseHwp5Document(buffer, options) {
|
|
|
772
1058
|
totalDecompressed += data.length;
|
|
773
1059
|
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
774
1060
|
const records = readRecords(data);
|
|
775
|
-
|
|
1061
|
+
const sectionBlocks = parseSection(records, docInfo, warnings, si + 1);
|
|
1062
|
+
blocks.push(...sectionBlocks);
|
|
776
1063
|
}
|
|
1064
|
+
if (docInfo) {
|
|
1065
|
+
detectHwp5Headings(blocks, docInfo);
|
|
1066
|
+
}
|
|
1067
|
+
const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
777
1068
|
const markdown = blocksToMarkdown(blocks);
|
|
778
|
-
return { markdown, blocks, metadata };
|
|
1069
|
+
return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0 };
|
|
1070
|
+
}
|
|
1071
|
+
function parseDocInfoStream(cfb, compressed) {
|
|
1072
|
+
try {
|
|
1073
|
+
const entry = CFB.find(cfb, "/DocInfo");
|
|
1074
|
+
if (!entry?.content) return null;
|
|
1075
|
+
const data = compressed ? decompressStream(Buffer.from(entry.content)) : Buffer.from(entry.content);
|
|
1076
|
+
const records = readRecords(data);
|
|
1077
|
+
return parseDocInfo(records);
|
|
1078
|
+
} catch {
|
|
1079
|
+
return null;
|
|
1080
|
+
}
|
|
1081
|
+
}
|
|
1082
|
+
function detectHwp5Headings(blocks, docInfo) {
|
|
1083
|
+
let baseFontSize = 0;
|
|
1084
|
+
for (const style of docInfo.styles) {
|
|
1085
|
+
const name = (style.nameKo || style.name).toLowerCase();
|
|
1086
|
+
if (name.includes("\uBC14\uD0D5") || name.includes("\uBCF8\uBB38") || name === "normal" || name === "body") {
|
|
1087
|
+
const cs = docInfo.charShapes[style.charShapeId];
|
|
1088
|
+
if (cs?.fontSize > 0) {
|
|
1089
|
+
baseFontSize = cs.fontSize / 10;
|
|
1090
|
+
break;
|
|
1091
|
+
}
|
|
1092
|
+
}
|
|
1093
|
+
}
|
|
1094
|
+
if (baseFontSize === 0) {
|
|
1095
|
+
const sizeFreq = /* @__PURE__ */ new Map();
|
|
1096
|
+
for (const b of blocks) {
|
|
1097
|
+
if (b.style?.fontSize) {
|
|
1098
|
+
sizeFreq.set(b.style.fontSize, (sizeFreq.get(b.style.fontSize) || 0) + 1);
|
|
1099
|
+
}
|
|
1100
|
+
}
|
|
1101
|
+
let maxCount = 0;
|
|
1102
|
+
for (const [size, count] of sizeFreq) {
|
|
1103
|
+
if (count > maxCount) {
|
|
1104
|
+
maxCount = count;
|
|
1105
|
+
baseFontSize = size;
|
|
1106
|
+
}
|
|
1107
|
+
}
|
|
1108
|
+
}
|
|
1109
|
+
if (baseFontSize <= 0) return;
|
|
1110
|
+
for (const block of blocks) {
|
|
1111
|
+
if (block.type !== "paragraph" || !block.text || !block.style?.fontSize) continue;
|
|
1112
|
+
const text = block.text.trim();
|
|
1113
|
+
if (text.length === 0 || text.length > 200) continue;
|
|
1114
|
+
if (/^\d+$/.test(text)) continue;
|
|
1115
|
+
const ratio = block.style.fontSize / baseFontSize;
|
|
1116
|
+
let level = 0;
|
|
1117
|
+
if (ratio >= 1.5) level = 1;
|
|
1118
|
+
else if (ratio >= 1.3) level = 2;
|
|
1119
|
+
else if (ratio >= 1.15) level = 3;
|
|
1120
|
+
if (/^제\d+[조장절편]/.test(text) && text.length <= 50) {
|
|
1121
|
+
if (level === 0) level = 3;
|
|
1122
|
+
}
|
|
1123
|
+
if (level > 0) {
|
|
1124
|
+
block.type = "heading";
|
|
1125
|
+
block.level = level;
|
|
1126
|
+
}
|
|
1127
|
+
}
|
|
779
1128
|
}
|
|
780
1129
|
function extractHwp5Metadata(cfb, metadata) {
|
|
781
1130
|
try {
|
|
@@ -827,15 +1176,22 @@ function findSections(cfb) {
|
|
|
827
1176
|
}
|
|
828
1177
|
return sections.sort((a, b) => a.idx - b.idx).map((s) => s.content);
|
|
829
1178
|
}
|
|
830
|
-
function parseSection(records) {
|
|
1179
|
+
function parseSection(records, docInfo, warnings, sectionNum) {
|
|
831
1180
|
const blocks = [];
|
|
832
1181
|
let i = 0;
|
|
833
1182
|
while (i < records.length) {
|
|
834
1183
|
const rec = records[i];
|
|
835
1184
|
if (rec.tagId === TAG_PARA_HEADER && rec.level === 0) {
|
|
836
|
-
const { paragraph, tables, nextIdx } = parseParagraphWithTables(records, i);
|
|
837
|
-
if (paragraph)
|
|
838
|
-
|
|
1185
|
+
const { paragraph, tables, nextIdx, charShapeIds } = parseParagraphWithTables(records, i);
|
|
1186
|
+
if (paragraph) {
|
|
1187
|
+
const block = { type: "paragraph", text: paragraph, pageNumber: sectionNum };
|
|
1188
|
+
if (docInfo && charShapeIds.length > 0) {
|
|
1189
|
+
const style = resolveCharStyle(charShapeIds, docInfo);
|
|
1190
|
+
if (style) block.style = style;
|
|
1191
|
+
}
|
|
1192
|
+
blocks.push(block);
|
|
1193
|
+
}
|
|
1194
|
+
for (const t of tables) blocks.push({ type: "table", table: t, pageNumber: sectionNum });
|
|
839
1195
|
i = nextIdx;
|
|
840
1196
|
continue;
|
|
841
1197
|
}
|
|
@@ -843,19 +1199,43 @@ function parseSection(records) {
|
|
|
843
1199
|
const ctrlId = rec.data.subarray(0, 4).toString("ascii");
|
|
844
1200
|
if (ctrlId === " lbt" || ctrlId === "tbl ") {
|
|
845
1201
|
const { table, nextIdx } = parseTableBlock(records, i);
|
|
846
|
-
if (table) blocks.push({ type: "table", table });
|
|
1202
|
+
if (table) blocks.push({ type: "table", table, pageNumber: sectionNum });
|
|
847
1203
|
i = nextIdx;
|
|
848
1204
|
continue;
|
|
849
1205
|
}
|
|
1206
|
+
if (ctrlId === "gso " || ctrlId === " osg" || ctrlId === " elo" || ctrlId === "ole ") {
|
|
1207
|
+
warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC81C\uC5B4 \uC694\uC18C: ${ctrlId.trim()}`, code: "SKIPPED_IMAGE" });
|
|
1208
|
+
}
|
|
850
1209
|
}
|
|
851
1210
|
i++;
|
|
852
1211
|
}
|
|
853
1212
|
return blocks;
|
|
854
1213
|
}
|
|
1214
|
+
function resolveCharStyle(charShapeIds, docInfo) {
|
|
1215
|
+
if (charShapeIds.length === 0 || docInfo.charShapes.length === 0) return void 0;
|
|
1216
|
+
const freq = /* @__PURE__ */ new Map();
|
|
1217
|
+
let maxCount = 0, dominantId = charShapeIds[0];
|
|
1218
|
+
for (const id of charShapeIds) {
|
|
1219
|
+
const count = (freq.get(id) || 0) + 1;
|
|
1220
|
+
freq.set(id, count);
|
|
1221
|
+
if (count > maxCount) {
|
|
1222
|
+
maxCount = count;
|
|
1223
|
+
dominantId = id;
|
|
1224
|
+
}
|
|
1225
|
+
}
|
|
1226
|
+
const cs = docInfo.charShapes[dominantId];
|
|
1227
|
+
if (!cs) return void 0;
|
|
1228
|
+
const style = {};
|
|
1229
|
+
if (cs.fontSize > 0) style.fontSize = cs.fontSize / 10;
|
|
1230
|
+
if (cs.attrFlags & 1) style.italic = true;
|
|
1231
|
+
if (cs.attrFlags & 2) style.bold = true;
|
|
1232
|
+
return style.fontSize || style.bold || style.italic ? style : void 0;
|
|
1233
|
+
}
|
|
855
1234
|
function parseParagraphWithTables(records, startIdx) {
|
|
856
1235
|
const startLevel = records[startIdx].level;
|
|
857
1236
|
let text = "";
|
|
858
1237
|
const tables = [];
|
|
1238
|
+
const charShapeIds = [];
|
|
859
1239
|
let i = startIdx + 1;
|
|
860
1240
|
while (i < records.length) {
|
|
861
1241
|
const rec = records[i];
|
|
@@ -863,6 +1243,11 @@ function parseParagraphWithTables(records, startIdx) {
|
|
|
863
1243
|
if (rec.tagId === TAG_PARA_TEXT) {
|
|
864
1244
|
text = extractText(rec.data);
|
|
865
1245
|
}
|
|
1246
|
+
if (rec.tagId === TAG_CHAR_SHAPE && rec.data.length >= 8) {
|
|
1247
|
+
for (let offset = 0; offset + 7 < rec.data.length; offset += 8) {
|
|
1248
|
+
charShapeIds.push(rec.data.readUInt32LE(offset + 4));
|
|
1249
|
+
}
|
|
1250
|
+
}
|
|
866
1251
|
if (rec.tagId === TAG_CTRL_HEADER && rec.data.length >= 4) {
|
|
867
1252
|
const ctrlId = rec.data.subarray(0, 4).toString("ascii");
|
|
868
1253
|
if (ctrlId === " lbt" || ctrlId === "tbl ") {
|
|
@@ -875,7 +1260,7 @@ function parseParagraphWithTables(records, startIdx) {
|
|
|
875
1260
|
i++;
|
|
876
1261
|
}
|
|
877
1262
|
const trimmed = text.trim();
|
|
878
|
-
return { paragraph: trimmed || null, tables, nextIdx: i };
|
|
1263
|
+
return { paragraph: trimmed || null, tables, nextIdx: i, charShapeIds };
|
|
879
1264
|
}
|
|
880
1265
|
function parseTableBlock(records, startIdx) {
|
|
881
1266
|
const tableLevel = records[startIdx].level;
|
|
@@ -947,6 +1332,355 @@ function arrangeCells(rows, cols, cells) {
|
|
|
947
1332
|
return grid.map((row) => row.map((c) => c || { text: "", colSpan: 1, rowSpan: 1 }));
|
|
948
1333
|
}
|
|
949
1334
|
|
|
1335
|
+
// src/pdf/line-detector.ts
|
|
1336
|
+
var import_pdf = require("pdfjs-dist/legacy/build/pdf.mjs");
|
|
1337
|
+
var ORIENTATION_TOL = 2;
|
|
1338
|
+
var MIN_LINE_LENGTH = 10;
|
|
1339
|
+
var COORD_MERGE_TOL = 3;
|
|
1340
|
+
var CONNECT_TOL = 5;
|
|
1341
|
+
var CELL_PADDING = 2;
|
|
1342
|
+
/**
 * Walks a pdf.js operator list and collects the straight segments of every
 * painted path, split into horizontal and vertical lines.
 *
 * Segments accumulate across `constructPath` operators until a paint operator
 * (stroke or fill family) hands them to `classifyAndAdd`; `endPath` discards
 * them instead.
 *
 * @param {ArrayLike<number>} fnArray - Operator codes, compared against `import_pdf.OPS`.
 * @param {ArrayLike<any>} argsArray - Operator arguments, index-aligned with `fnArray`.
 * @returns {{horizontals: object[], verticals: object[]}} Lines accepted by `classifyAndAdd`.
 */
function extractLines(fnArray, argsArray) {
  const horizontals = [];
  const verticals = [];
  let lineWidth = 1;
  let currentPath = [];
  let pathStartX = 0;
  let pathStartY = 0;
  let curX = 0;
  let curY = 0;

  // Classifies the pending segments when the path is painted, then resets the
  // pending list in either case.
  function flushPath(isStroke) {
    if (isStroke) {
      for (const seg of currentPath) {
        classifyAndAdd(seg, lineWidth, horizontals, verticals);
      }
    }
    currentPath = [];
  }

  for (let i = 0; i < fnArray.length; i++) {
    const op = fnArray[i];
    const args = argsArray[i];
    switch (op) {
      case import_pdf.OPS.setLineWidth:
        // A zero/missing width falls back to 1.
        lineWidth = args[0] || 1;
        break;
      case import_pdf.OPS.constructPath: {
        const subOps = args[0];
        const coords = args[1];
        let ci = 0;
        for (const subOp of subOps) {
          if (subOp === import_pdf.OPS.moveTo) {
            curX = coords[ci++];
            curY = coords[ci++];
            pathStartX = curX;
            pathStartY = curY;
          } else if (subOp === import_pdf.OPS.lineTo) {
            const x2 = coords[ci++];
            const y2 = coords[ci++];
            currentPath.push({ x1: curX, y1: curY, x2, y2 });
            curX = x2;
            curY = y2;
          } else if (subOp === import_pdf.OPS.rectangle) {
            const rx = coords[ci++];
            const ry = coords[ci++];
            const rw = coords[ci++];
            const rh = coords[ci++];
            if (Math.abs(rh) < ORIENTATION_TOL * 2) {
              // Very flat rectangle: treat it as one horizontal rule through its middle.
              currentPath.push({ x1: rx, y1: ry + rh / 2, x2: rx + rw, y2: ry + rh / 2 });
            } else if (Math.abs(rw) < ORIENTATION_TOL * 2) {
              // Very narrow rectangle: treat it as one vertical rule through its middle.
              currentPath.push({ x1: rx + rw / 2, y1: ry, x2: rx + rw / 2, y2: ry + rh });
            } else {
              currentPath.push(
                { x1: rx, y1: ry, x2: rx + rw, y2: ry }, // bottom
                { x1: rx + rw, y1: ry, x2: rx + rw, y2: ry + rh }, // right
                { x1: rx + rw, y1: ry + rh, x2: rx, y2: ry + rh }, // top
                { x1: rx, y1: ry + rh, x2: rx, y2: ry } // left
              );
            }
            // A rectangle is its own closed subpath starting at (rx, ry); without
            // this, a following closePath/lineTo would connect to a stale point.
            curX = rx;
            curY = ry;
            pathStartX = rx;
            pathStartY = ry;
          } else if (subOp === import_pdf.OPS.closePath) {
            if (curX !== pathStartX || curY !== pathStartY) {
              currentPath.push({ x1: curX, y1: curY, x2: pathStartX, y2: pathStartY });
            }
            curX = pathStartX;
            curY = pathStartY;
          } else if (subOp === import_pdf.OPS.curveTo) {
            // Curves produce no segment, but the current point must advance to
            // the curve's end point (last coordinate pair of the six).
            curX = coords[ci + 4];
            curY = coords[ci + 5];
            ci += 6;
          } else if (subOp === import_pdf.OPS.curveTo2 || subOp === import_pdf.OPS.curveTo3) {
            // Four-coordinate curve variants also end at their last pair.
            curX = coords[ci + 2];
            curY = coords[ci + 3];
            ci += 4;
          }
        }
        break;
      }
      // Filled paths are flushed as well as stroked ones.
      case import_pdf.OPS.stroke:
      case import_pdf.OPS.closeStroke:
      case import_pdf.OPS.fill:
      case import_pdf.OPS.eoFill:
      case import_pdf.OPS.fillStroke:
      case import_pdf.OPS.eoFillStroke:
      case import_pdf.OPS.closeFillStroke:
      case import_pdf.OPS.closeEOFillStroke:
        flushPath(true);
        break;
      case import_pdf.OPS.endPath:
        flushPath(false);
        break;
    }
  }
  return { horizontals, verticals };
}
|
|
1433
|
+
/**
 * Files one segment under `horizontals` or `verticals`, or drops it.
 *
 * Segments shorter than MIN_LINE_LENGTH are ignored. A segment whose vertical
 * drift is within ORIENTATION_TOL is stored as a horizontal line at its mean
 * y; otherwise, if its horizontal drift is within ORIENTATION_TOL, it is
 * stored as a vertical line at its mean x. Stored endpoints are ordered
 * ascending. Anything else (a diagonal) is dropped.
 *
 * @param {{x1:number,y1:number,x2:number,y2:number}} seg - Raw segment.
 * @param {number} lineWidth - Stroke width recorded on the stored line.
 * @param {object[]} horizontals - Output list, mutated.
 * @param {object[]} verticals - Output list, mutated.
 */
function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
  const { x1: startX, y1: startY, x2: endX, y2: endY } = seg;
  const spanX = Math.abs(endX - startX);
  const spanY = Math.abs(endY - startY);
  if (Math.sqrt(spanX * spanX + spanY * spanY) < MIN_LINE_LENGTH) return;

  if (spanY <= ORIENTATION_TOL) {
    const meanY = (startY + endY) / 2;
    horizontals.push({
      x1: Math.min(startX, endX),
      y1: meanY,
      x2: Math.max(startX, endX),
      y2: meanY,
      lineWidth,
    });
    return;
  }

  if (spanX <= ORIENTATION_TOL) {
    const meanX = (startX + endX) / 2;
    verticals.push({
      x1: meanX,
      y1: Math.min(startY, endY),
      x2: meanX,
      y2: Math.max(startY, endY),
      lineWidth,
    });
  }
}
|
|
1450
|
+
/**
 * Removes lines that look like a frame drawn around the whole page.
 *
 * A horizontal line is removed when it lies within 5 units of y = 0 or
 * y = pageHeight and is not shorter than 90% of the page width; vertical
 * lines are handled symmetrically against x = 0 / x = pageWidth and the page
 * height.
 *
 * @param {object[]} horizontals - Lines with `x1`, `x2`, `y1`.
 * @param {object[]} verticals - Lines with `y1`, `y2`, `x1`.
 * @param {number} pageWidth
 * @param {number} pageHeight
 * @returns {{horizontals: object[], verticals: object[]}} Filtered copies.
 */
function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
  const margin = 5;
  const hugsEdge = (position, extent) =>
    Math.abs(position) < margin || Math.abs(position - extent) < margin;

  const keepHorizontal = (line) => {
    const isShort = line.x2 - line.x1 < pageWidth * 0.9;
    return !(hugsEdge(line.y1, pageHeight) && !isShort);
  };
  const keepVertical = (line) => {
    const isShort = line.y2 - line.y1 < pageHeight * 0.9;
    return !(hugsEdge(line.x1, pageWidth) && !isShort);
  };

  return {
    horizontals: horizontals.filter(keepHorizontal),
    verticals: verticals.filter(keepVertical),
  };
}
|
|
1461
|
+
/**
 * Turns ruling lines into table grids.
 *
 * Lines are tagged with their orientation, grouped with
 * `groupConnectedLines`, and each group that has at least two horizontal and
 * two vertical lines yields one grid: clustered row y-coordinates (sorted
 * descending), clustered column x-coordinates (sorted ascending) and the
 * bounding box spanned by them.
 *
 * @param {object[]} horizontals
 * @param {object[]} verticals
 * @returns {{rowYs:number[], colXs:number[], bbox:{x1:number,y1:number,x2:number,y2:number}}[]}
 */
function buildTableGrids(horizontals, verticals) {
  if (horizontals.length < 2 || verticals.length < 2) return [];

  const tagged = [];
  horizontals.forEach((line, idx) => {
    tagged.push({ ...line, type: "h", id: idx });
  });
  verticals.forEach((line, idx) => {
    tagged.push({ ...line, type: "v", id: idx + horizontals.length });
  });

  const grids = [];
  for (const component of groupConnectedLines(tagged)) {
    const rowSeeds = [];
    const colSeeds = [];
    for (const line of component) {
      if (line.type === "h") rowSeeds.push(line.y1);
      else if (line.type === "v") colSeeds.push(line.x1);
    }
    if (rowSeeds.length < 2 || colSeeds.length < 2) continue;

    const rowYs = clusterCoordinates(rowSeeds).sort((a, b) => b - a);
    const colXs = clusterCoordinates(colSeeds).sort((a, b) => a - b);
    if (rowYs.length < 2 || colXs.length < 2) continue;

    grids.push({
      rowYs,
      colXs,
      bbox: {
        x1: colXs[0],
        y1: rowYs[rowYs.length - 1],
        x2: colXs[colXs.length - 1],
        y2: rowYs[0],
      },
    });
  }
  return grids;
}
|
|
1488
|
+
/**
 * Collapses nearly equal coordinates into their averages.
 *
 * Values are visited in ascending order; a value joins the most recent
 * cluster when it lies within COORD_MERGE_TOL of that cluster's running
 * average, otherwise it opens a new cluster.
 *
 * @param {number[]} values - Coordinates in any order (not mutated).
 * @returns {number[]} Cluster averages in ascending order.
 */
function clusterCoordinates(values) {
  if (values.length === 0) return [];

  const ascending = Array.from(values).sort((a, b) => a - b);
  const buckets = [];
  for (const value of ascending) {
    const tail = buckets[buckets.length - 1];
    const joinsTail =
      tail !== undefined && Math.abs(value - tail.sum / tail.count) <= COORD_MERGE_TOL;
    if (joinsTail) {
      tail.sum += value;
      tail.count += 1;
    } else {
      buckets.push({ sum: value, count: 1 });
    }
  }
  return buckets.map((bucket) => bucket.sum / bucket.count);
}
|
|
1504
|
+
/**
 * Partitions lines into groups of transitively touching lines.
 *
 * Every pair is tested with `linesIntersect`, and touching pairs are merged
 * in a union-find structure. Groups are returned in order of their first
 * member, and members keep their input order.
 *
 * @param {object[]} lines
 * @returns {object[][]} One array per connected group.
 */
function groupConnectedLines(lines) {
  const total = lines.length;
  const parent = Array.from({ length: total }, (_, idx) => idx);

  // Finds the representative of `node`, halving the path on the way up.
  const rootOf = (node) => {
    let cursor = node;
    while (parent[cursor] !== cursor) {
      parent[cursor] = parent[parent[cursor]];
      cursor = parent[cursor];
    }
    return cursor;
  };

  for (let first = 0; first < total; first++) {
    for (let second = first + 1; second < total; second++) {
      if (!linesIntersect(lines[first], lines[second])) continue;
      const rootFirst = rootOf(first);
      const rootSecond = rootOf(second);
      if (rootFirst !== rootSecond) parent[rootFirst] = rootSecond;
    }
  }

  const byRoot = new Map();
  for (let idx = 0; idx < total; idx++) {
    const root = rootOf(idx);
    const bucket = byRoot.get(root);
    if (bucket) {
      bucket.push(lines[idx]);
    } else {
      byRoot.set(root, [lines[idx]]);
    }
  }
  return Array.from(byRoot.values());
}
|
|
1532
|
+
/**
 * Decides whether two tagged lines touch, allowing CONNECT_TOL of slack.
 *
 * Two lines of the same orientation touch when they sit on (nearly) the same
 * axis and their extents overlap or almost meet. A horizontal and a vertical
 * line touch when the vertical's x lies within the horizontal's extent and
 * the horizontal's y lies within the vertical's extent.
 *
 * @param {{type:string,x1:number,y1:number,x2:number,y2:number}} a
 * @param {{type:string,x1:number,y1:number,x2:number,y2:number}} b
 * @returns {boolean}
 */
function linesIntersect(a, b) {
  if (a.type !== b.type) {
    const [horiz, vert] = a.type === "h" ? [a, b] : [b, a];
    const xInside = vert.x1 >= horiz.x1 - CONNECT_TOL && vert.x1 <= horiz.x2 + CONNECT_TOL;
    const yInside = horiz.y1 >= vert.y1 - CONNECT_TOL && horiz.y1 <= vert.y2 + CONNECT_TOL;
    return xInside && yInside;
  }

  if (a.type === "h") {
    const sameRow = !(Math.abs(a.y1 - b.y1) > CONNECT_TOL);
    return sameRow && Math.min(a.x2, b.x2) >= Math.max(a.x1, b.x1) - CONNECT_TOL;
  }

  const sameColumn = !(Math.abs(a.x1 - b.x1) > CONNECT_TOL);
  return sameColumn && Math.min(a.y2, b.y2) >= Math.max(a.y1, b.y1) - CONNECT_TOL;
}
|
|
1547
|
+
/**
 * Derives table cells, including merged ones, from a grid and its rulings.
 *
 * Grid slots are visited row by row. From each free slot the cell grows to
 * the right while no vertical border separates it from the next column, and
 * then downwards while no horizontal border closes it. Every slot covered by
 * the cell is marked occupied so it is not emitted again.
 *
 * @param {{rowYs:number[], colXs:number[]}} grid - Row boundaries (descending
 *   y) and column boundaries (ascending x).
 * @param {object[]} horizontals - Horizontal rulings, checked via `hasHorizontalLine`.
 * @param {object[]} verticals - Vertical rulings, checked via `hasVerticalLine`.
 * @returns {{row:number,col:number,rowSpan:number,colSpan:number,bbox:object}[]}
 */
function extractCells(grid, horizontals, verticals) {
  const { rowYs, colXs } = grid;
  const numRows = rowYs.length - 1;
  const numCols = colXs.length - 1;
  if (numRows <= 0 || numCols <= 0) return [];

  const occupied = Array.from({ length: numRows }, () => Array(numCols).fill(false));
  const cells = [];
  for (let r = 0; r < numRows; r++) {
    for (let c = 0; c < numCols; c++) {
      if (occupied[r][c]) continue;
      let colSpan = 1;
      let rowSpan = 1;

      while (c + colSpan < numCols) {
        // Never grow into a slot already owned by a cell spanning down from
        // an earlier row; with a missing border the two cells would overlap.
        if (occupied[r][c + colSpan]) break;
        const borderX = colXs[c + colSpan];
        if (hasVerticalLine(verticals, borderX, rowYs[r], rowYs[r + 1])) break;
        colSpan++;
      }

      while (r + rowSpan < numRows) {
        const borderY = rowYs[r + rowSpan];
        if (hasHorizontalLine(horizontals, borderY, colXs[c], colXs[c + colSpan])) break;
        rowSpan++;
      }

      for (let dr = 0; dr < rowSpan; dr++) {
        for (let dc = 0; dc < colSpan; dc++) {
          occupied[r + dr][c + dc] = true;
        }
      }

      cells.push({
        row: r,
        col: c,
        rowSpan,
        colSpan,
        bbox: {
          x1: colXs[c],
          y1: rowYs[r + rowSpan],
          x2: colXs[c + colSpan],
          y2: rowYs[r],
        },
      });
    }
  }
  return cells;
}
|
|
1594
|
+
/**
 * Reports whether a vertical ruling sits on the border at `x` and covers at
 * least half of the band between `topY` and `botY`.
 *
 * A ruling matches the border when its x is within COORD_MERGE_TOL + 1 of `x`.
 *
 * @param {object[]} verticals - Lines with `x1`, `y1` (low) and `y2` (high).
 * @param {number} x - Border position.
 * @param {number} topY - Upper edge of the band.
 * @param {number} botY - Lower edge of the band.
 * @returns {boolean}
 */
function hasVerticalLine(verticals, x, topY, botY) {
  const tolerance = COORD_MERGE_TOL + 1;
  const required = Math.abs(topY - botY) * 0.5;
  return verticals.some((line) => {
    if (!(Math.abs(line.x1 - x) <= tolerance)) return false;
    const covered = Math.min(line.y2, topY) - Math.max(line.y1, botY);
    return covered >= required;
  });
}
|
|
1607
|
+
/**
 * Reports whether a horizontal ruling sits on the border at `y` and covers at
 * least half of the span between `leftX` and `rightX`.
 *
 * A ruling matches the border when its y is within COORD_MERGE_TOL + 1 of `y`.
 *
 * @param {object[]} horizontals - Lines with `y1`, `x1` (left) and `x2` (right).
 * @param {number} y - Border position.
 * @param {number} leftX - Left edge of the span.
 * @param {number} rightX - Right edge of the span.
 * @returns {boolean}
 */
function hasHorizontalLine(horizontals, y, leftX, rightX) {
  const tolerance = COORD_MERGE_TOL + 1;
  const required = Math.abs(rightX - leftX) * 0.5;
  return horizontals.some((line) => {
    if (!(Math.abs(line.y1 - y) <= tolerance)) return false;
    const covered = Math.min(line.x2, rightX) - Math.max(line.x1, leftX);
    return covered >= required;
  });
}
|
|
1620
|
+
/**
 * Assigns each text item to the table cell that contains it.
 *
 * An item is located by its horizontal centre (`x + w / 2`) and its `y`. It
 * may fall inside several cells once CELL_PADDING is added around each bbox;
 * the cell whose centre is nearest (Manhattan distance) wins, and the earlier
 * cell wins ties. Items inside no cell are left out.
 *
 * @param {{x:number,y:number,w:number}[]} items
 * @param {{bbox:{x1:number,y1:number,x2:number,y2:number}}[]} cells
 * @returns {Map<object, object[]>} Every cell mapped to its (possibly empty) item list.
 */
function mapTextToCells(items, cells) {
  const assignment = new Map();
  for (const cell of cells) {
    assignment.set(cell, []);
  }

  const pad = CELL_PADDING;
  for (const item of items) {
    const centreX = item.x + item.w / 2;
    const anchorY = item.y;

    let winner = null;
    let winnerDistance = Infinity;
    for (const cell of cells) {
      const { x1, y1, x2, y2 } = cell.bbox;
      const inside =
        centreX >= x1 - pad && centreX <= x2 + pad && anchorY >= y1 - pad && anchorY <= y2 + pad;
      if (!inside) continue;
      const distance = Math.abs(centreX - (x1 + x2) / 2) + Math.abs(anchorY - (y1 + y2) / 2);
      if (distance < winnerDistance) {
        winnerDistance = distance;
        winner = cell;
      }
    }

    if (winner) assignment.get(winner).push(item);
  }
  return assignment;
}
|
|
1648
|
+
/**
 * Joins the text items of one table cell into a string.
 *
 * Items are ordered top-to-bottom (descending y) then left-to-right, and
 * bucketed into visual lines: an item stays on the current line while its y
 * is within max(3, 0.6 x smaller font size) of the line's first item. Words
 * on a line are joined with a space and lines with "\n". A short line (at
 * most five characters) made only of Hangul is glued directly onto a
 * preceding line that ends in Hangul.
 *
 * @param {{text:string,x:number,y:number,fontSize:number}[]} items
 * @returns {string}
 */
function cellTextToString(items) {
  if (items.length === 0) return "";
  if (items.length === 1) return items[0].text;

  const ordered = Array.from(items).sort((a, b) => b.y - a.y || a.x - b.x);

  const rows = [];
  let row = [ordered[0]];
  let rowY = ordered[0].y;
  for (const item of ordered.slice(1)) {
    const tolerance = Math.max(3, Math.min(item.fontSize, row[0].fontSize) * 0.6);
    if (Math.abs(item.y - rowY) <= tolerance) {
      row.push(item);
      continue;
    }
    rows.push(row);
    row = [item];
    rowY = item.y;
  }
  rows.push(row);

  const rowTexts = rows.map((members) =>
    members
      .sort((a, b) => a.x - b.x)
      .map((member) => member.text)
      .join(" ")
  );
  if (rowTexts.length <= 1) return rowTexts[0] || "";

  const merged = [];
  for (const current of rowTexts) {
    const previous = merged[merged.length - 1];
    const gluesToPrevious =
      merged.length > 0 &&
      /[가-힣]$/.test(previous) &&
      /^[가-힣]+$/.test(current) &&
      current.length <= 5 &&
      !current.includes(" ");
    if (gluesToPrevious) {
      merged[merged.length - 1] = previous + current;
    } else {
      merged.push(current);
    }
  }
  return merged.join("\n");
}
|
|
1683
|
+
|
|
950
1684
|
// src/pdf/polyfill.ts
|
|
951
1685
|
var pdfjsWorker = __toESM(require("pdfjs-dist/legacy/build/pdf.worker.mjs"), 1);
|
|
952
1686
|
var g = globalThis;
|
|
@@ -965,12 +1699,12 @@ if (typeof g.Path2D === "undefined") {
|
|
|
965
1699
|
g.pdfjsWorker = pdfjsWorker;
|
|
966
1700
|
|
|
967
1701
|
// src/pdf/parser.ts
|
|
968
|
-
var
|
|
969
|
-
|
|
1702
|
+
var import_pdf2 = require("pdfjs-dist/legacy/build/pdf.mjs");
|
|
1703
|
+
import_pdf2.GlobalWorkerOptions.workerSrc = "";
|
|
970
1704
|
var MAX_PAGES = 5e3;
|
|
971
1705
|
var MAX_TOTAL_TEXT = 100 * 1024 * 1024;
|
|
972
1706
|
async function parsePdfDocument(buffer, options) {
|
|
973
|
-
const doc = await (0,
|
|
1707
|
+
const doc = await (0, import_pdf2.getDocument)({
|
|
974
1708
|
data: new Uint8Array(buffer),
|
|
975
1709
|
useSystemFonts: true,
|
|
976
1710
|
disableFontFace: true,
|
|
@@ -978,25 +1712,39 @@ async function parsePdfDocument(buffer, options) {
|
|
|
978
1712
|
}).promise;
|
|
979
1713
|
try {
|
|
980
1714
|
const pageCount = doc.numPages;
|
|
981
|
-
if (pageCount === 0) return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.",
|
|
1715
|
+
if (pageCount === 0) return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "PARSE_ERROR" };
|
|
982
1716
|
const metadata = { pageCount };
|
|
983
1717
|
await extractPdfMetadata(doc, metadata);
|
|
984
|
-
const pageTexts = [];
|
|
985
1718
|
const blocks = [];
|
|
1719
|
+
const warnings = [];
|
|
986
1720
|
let totalChars = 0;
|
|
987
1721
|
let totalTextBytes = 0;
|
|
988
1722
|
const effectivePageCount = Math.min(pageCount, MAX_PAGES);
|
|
989
1723
|
const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
|
|
1724
|
+
const allFontSizes = [];
|
|
990
1725
|
for (let i = 1; i <= effectivePageCount; i++) {
|
|
991
1726
|
if (pageFilter && !pageFilter.has(i)) continue;
|
|
992
1727
|
const page = await doc.getPage(i);
|
|
993
1728
|
const tc = await page.getTextContent();
|
|
994
|
-
const
|
|
995
|
-
|
|
996
|
-
|
|
1729
|
+
const viewport = page.getViewport({ scale: 1 });
|
|
1730
|
+
const rawItems = tc.items;
|
|
1731
|
+
const items = normalizeItems(rawItems);
|
|
1732
|
+
const { visible, hiddenCount } = filterHiddenText(items, viewport.width, viewport.height);
|
|
1733
|
+
if (hiddenCount > 0) {
|
|
1734
|
+
warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
|
|
1735
|
+
}
|
|
1736
|
+
for (const item of visible) {
|
|
1737
|
+
if (item.fontSize > 0) allFontSizes.push(item.fontSize);
|
|
1738
|
+
}
|
|
1739
|
+
const opList = await page.getOperatorList();
|
|
1740
|
+
const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
|
|
1741
|
+
for (const b of pageBlocks) blocks.push(b);
|
|
1742
|
+
for (const b of pageBlocks) {
|
|
1743
|
+
const t = b.text || "";
|
|
1744
|
+
totalChars += t.replace(/\s/g, "").length;
|
|
1745
|
+
totalTextBytes += t.length * 2;
|
|
1746
|
+
}
|
|
997
1747
|
if (totalTextBytes > MAX_TOTAL_TEXT) throw new KordocError("\uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uD06C\uAE30 \uCD08\uACFC");
|
|
998
|
-
pageTexts.push(pageText);
|
|
999
|
-
blocks.push({ type: "paragraph", text: pageText });
|
|
1000
1748
|
}
|
|
1001
1749
|
const parsedPageCount = pageFilter ? pageFilter.size : effectivePageCount;
|
|
1002
1750
|
if (totalChars / Math.max(parsedPageCount, 1) < 10) {
|
|
@@ -1006,16 +1754,20 @@ async function parsePdfDocument(buffer, options) {
|
|
|
1006
1754
|
const ocrBlocks = await ocrPages2(doc, options.ocr, pageFilter, effectivePageCount);
|
|
1007
1755
|
if (ocrBlocks.length > 0) {
|
|
1008
1756
|
const ocrMarkdown = ocrBlocks.map((b) => b.text || "").filter(Boolean).join("\n\n");
|
|
1009
|
-
return { success: true, fileType: "pdf", markdown: ocrMarkdown, pageCount: parsedPageCount, blocks: ocrBlocks, metadata, isImageBased: true };
|
|
1757
|
+
return { success: true, fileType: "pdf", markdown: ocrMarkdown, pageCount: parsedPageCount, blocks: ocrBlocks, metadata, isImageBased: true, warnings };
|
|
1010
1758
|
}
|
|
1011
1759
|
} catch {
|
|
1012
1760
|
}
|
|
1013
1761
|
}
|
|
1014
1762
|
return { success: false, fileType: "pdf", pageCount, isImageBased: true, error: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`, code: "IMAGE_BASED_PDF" };
|
|
1015
1763
|
}
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
|
|
1764
|
+
const medianFontSize = computeMedianFontSize(allFontSizes);
|
|
1765
|
+
if (medianFontSize > 0) {
|
|
1766
|
+
detectHeadings(blocks, medianFontSize);
|
|
1767
|
+
}
|
|
1768
|
+
const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
1769
|
+
let markdown = cleanPdfText(blocksToMarkdown(blocks));
|
|
1770
|
+
return { success: true, fileType: "pdf", markdown, pageCount: parsedPageCount, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0 };
|
|
1019
1771
|
} finally {
|
|
1020
1772
|
await doc.destroy().catch(() => {
|
|
1021
1773
|
});
|
|
@@ -1044,24 +1796,272 @@ function parsePdfDate(dateStr) {
|
|
|
1044
1796
|
const [, year, month = "01", day = "01", hour = "00", min = "00", sec = "00"] = m;
|
|
1045
1797
|
return `${year}-${month}-${day}T${hour}:${min}:${sec}`;
|
|
1046
1798
|
}
|
|
1047
|
-
function
|
|
1048
|
-
|
|
1049
|
-
|
|
1050
|
-
const
|
|
1051
|
-
|
|
1799
|
+
/**
 * Splits text items into visible ones and a count of suppressed ones.
 *
 * An item is suppressed when it is flagged `isHidden`, or when its origin
 * lies more than 10% of the larger page dimension outside the page rectangle.
 *
 * @param {{x:number,y:number,isHidden?:boolean}[]} items
 * @param {number} pageWidth
 * @param {number} pageHeight
 * @returns {{visible: object[], hiddenCount: number}}
 */
function filterHiddenText(items, pageWidth, pageHeight) {
  const margin = Math.max(pageWidth, pageHeight) * 0.1;
  const isOffPage = (item) =>
    item.x < -margin ||
    item.x > pageWidth + margin ||
    item.y < -margin ||
    item.y > pageHeight + margin;

  const visible = [];
  let hiddenCount = 0;
  for (const item of items) {
    if (item.isHidden || isOffPage(item)) {
      hiddenCount += 1;
    } else {
      visible.push(item);
    }
  }
  return { visible, hiddenCount };
}
|
|
1816
|
+
/**
 * Returns the median of a list of font sizes, or 0 for an empty list.
 * For an even count the two middle values are averaged. The input is not
 * modified.
 *
 * @param {number[]} sizes
 * @returns {number}
 */
function computeMedianFontSize(sizes) {
  const total = sizes.length;
  if (total === 0) return 0;

  const ascending = Array.from(sizes).sort((a, b) => a - b);
  const upperMiddle = Math.floor(total / 2);
  if (total % 2 !== 0) return ascending[upperMiddle];
  return (ascending[upperMiddle - 1] + ascending[upperMiddle]) / 2;
}
|
|
1822
|
+
/**
 * Promotes paragraphs set in a noticeably larger font to headings, in place.
 *
 * Only paragraph blocks with text and a styled font size are considered;
 * blank text, text longer than 200 characters and digit-only text are
 * skipped. The ratio of the block's font size to the median decides the
 * level: >= 1.5 -> 1, >= 1.3 -> 2, >= 1.15 -> 3.
 *
 * @param {object[]} blocks - Blocks to inspect; matching ones get `type` and `level` rewritten.
 * @param {number} medianFontSize - Reference body font size.
 * @returns {void}
 */
function detectHeadings(blocks, medianFontSize) {
  // [minimum ratio, heading level], strongest first.
  const tiers = [
    [1.5, 1],
    [1.3, 2],
    [1.15, 3],
  ];

  for (const block of blocks) {
    const isCandidate = block.type === "paragraph" && block.text && block.style?.fontSize;
    if (!isCandidate) continue;

    const trimmed = block.text.trim();
    if (trimmed.length === 0 || trimmed.length > 200) continue;
    if (/^\d+$/.test(trimmed)) continue;

    const ratio = block.style.fontSize / medianFontSize;
    const tier = tiers.find(([minRatio]) => ratio >= minRatio);
    if (tier === undefined) continue;

    block.type = "heading";
    block.level = tier[1];
  }
}
|
|
1839
|
+
// Recursion limit for xyCutOrder: at this depth the remaining items are returned as a single group.
var MAX_XYCUT_DEPTH = 50;
|
|
1840
|
+
/**
 * Orders text items into reading-order groups by recursive XY-cut.
 *
 * A horizontal cut (from `findYSplit`) is tried first: items above the cut
 * come before items on or below it. If no usable horizontal cut exists, a
 * vertical cut (from `findXSplit`) separates items by their horizontal
 * centre, left before right. Recursion stops for groups of at most two
 * items, at MAX_XYCUT_DEPTH, or when neither cut applies.
 *
 * @param {object[]} items - Text items with `x`, `y`, `w`.
 * @param {number} gapThreshold - Minimum whitespace gap that qualifies as a cut.
 * @param {number} [depth=0] - Current recursion depth.
 * @returns {object[][]} Groups of items in reading order.
 */
function xyCutOrder(items, gapThreshold, depth = 0) {
  if (items.length === 0) return [];
  if (items.length <= 2 || depth >= MAX_XYCUT_DEPTH) return [items];

  const region = computeRegion(items);
  const descend = (part) => xyCutOrder(part, gapThreshold, depth + 1);

  const ySplit = findYSplit(items, region, gapThreshold);
  if (ySplit !== null) {
    const upper = [];
    const lower = [];
    for (const item of items) {
      if (item.y > ySplit) upper.push(item);
      else if (item.y <= ySplit) lower.push(item);
    }
    // Both halves non-empty implies each is strictly smaller than `items`.
    if (upper.length > 0 && lower.length > 0) {
      return descend(upper).concat(descend(lower));
    }
  }

  const xSplit = findXSplit(items, region, gapThreshold);
  if (xSplit !== null) {
    const left = [];
    const right = [];
    for (const item of items) {
      const centre = item.x + item.w / 2;
      if (centre < xSplit) left.push(item);
      else if (centre >= xSplit) right.push(item);
    }
    if (left.length > 0 && right.length > 0) {
      return descend(left).concat(descend(right));
    }
  }

  return [items];
}
|
|
1862
|
+
/**
 * Computes the bounding extents of a set of text items.
 *
 * Each item is taken to span x..x+w horizontally and y..y+h vertically.
 *
 * @param {{x:number,y:number,w:number,h:number}[]} items
 * @returns {{items: object[], minX:number, minY:number, maxX:number, maxY:number}}
 *   The input list together with its extents (infinite sentinels when empty).
 */
function computeRegion(items) {
  const extent = { minX: Infinity, minY: Infinity, maxX: -Infinity, maxY: -Infinity };
  for (const { x, y, w, h } of items) {
    const right = x + w;
    const top = y + h;
    if (x < extent.minX) extent.minX = x;
    if (y < extent.minY) extent.minY = y;
    if (right > extent.maxX) extent.maxX = right;
    if (top > extent.maxY) extent.maxY = top;
  }
  return { items, ...extent };
}
|
|
1872
|
+
/**
 * Finds the widest horizontal whitespace band separating the items.
 *
 * Items are taken to occupy y..y+h (the same convention as `computeRegion`
 * and `computeBBox`; a non-positive `h` falls back to `fontSize`). With the
 * items ordered top to bottom, the band below item i-1 runs from that item's
 * bottom edge down to the highest top edge of ALL remaining items, so a tall
 * item further down cannot be cut through.
 *
 * @param {{y:number,h:number,fontSize?:number}[]} items
 * @param {object} region - Unused; kept for signature compatibility.
 * @param {number} gapThreshold - A band must be strictly taller than this.
 * @returns {number|null} The y of the middle of the widest band, or null.
 */
function findYSplit(items, region, gapThreshold) {
  const sorted = [...items].sort((a, b) => b.y - a.y);
  const count = sorted.length;

  // topFrom[i] = highest top edge among sorted[i..count-1].
  const topFrom = new Array(count);
  let runningTop = -Infinity;
  for (let i = count - 1; i >= 0; i--) {
    const item = sorted[i];
    const height = item.h > 0 ? item.h : item.fontSize > 0 ? item.fontSize : 0;
    runningTop = Math.max(runningTop, item.y + height);
    topFrom[i] = runningTop;
  }

  let bestGap = gapThreshold;
  let bestSplit = null;
  for (let i = 1; i < count; i++) {
    // Sorted by descending y, so this is the lowest bottom edge seen so far.
    const bandTop = sorted[i - 1].y;
    const bandBottom = topFrom[i];
    const gap = bandTop - bandBottom;
    if (gap > bestGap) {
      bestGap = gap;
      bestSplit = (bandTop + bandBottom) / 2;
    }
  }
  return bestSplit;
}
|
|
1887
|
+
/**
 * Finds the widest vertical whitespace band separating the items.
 *
 * Items are taken to occupy x..x+w. With the items ordered left to right,
 * the band before item i runs from the right-most edge of ALL earlier items
 * to item i's left edge, so a wide item starting further left cannot be cut
 * through.
 *
 * @param {{x:number,w:number}[]} items
 * @param {object} region - Unused; kept for signature compatibility.
 * @param {number} gapThreshold - A band must be strictly wider than this.
 * @returns {number|null} The x of the middle of the widest band, or null.
 */
function findXSplit(items, region, gapThreshold) {
  const sorted = [...items].sort((a, b) => a.x - b.x);
  let bestGap = gapThreshold;
  let bestSplit = null;
  let coveredRight = -Infinity;
  for (let i = 1; i < sorted.length; i++) {
    const prev = sorted[i - 1];
    const prevWidth = prev.w > 0 ? prev.w : 0;
    // Right-most edge reached by anything left of the current item.
    coveredRight = Math.max(coveredRight, prev.x + prevWidth);
    const currLeft = sorted[i].x;
    const gap = currLeft - coveredRight;
    if (gap > bestGap) {
      bestGap = gap;
      bestSplit = (coveredRight + currLeft) / 2;
    }
  }
  return bestSplit;
}
|
|
1902
|
+
/**
 * Builds the blocks of one page, using ruling lines to find tables.
 *
 * Ruling lines are read from the page's operator list and page-frame lines
 * are filtered out. If the remaining lines form at least one grid, blocks
 * come from `extractBlocksWithGrids`; otherwise from the text-only fallback.
 *
 * @param {object[]} items - Visible text items of the page.
 * @param {number} pageNum - Page number.
 * @param {{fnArray: any, argsArray: any}} opList - Operator list of the page.
 * @param {number} pageWidth
 * @param {number} pageHeight
 * @returns {object[]} Blocks for the page (empty when there are no items).
 */
function extractPageBlocksWithLines(items, pageNum, opList, pageWidth, pageHeight) {
  if (items.length === 0) return [];

  const drawn = extractLines(opList.fnArray, opList.argsArray);
  const { horizontals, verticals } = filterPageBorderLines(
    drawn.horizontals,
    drawn.verticals,
    pageWidth,
    pageHeight
  );
  const grids = buildTableGrids(horizontals, verticals);

  return grids.length > 0
    ? extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals)
    : extractPageBlocksFallback(items, pageNum);
}
|
|
1912
|
+
/**
 * Builds the blocks of a page that contains ruled tables.
 *
 * Grids are processed from the top of the page downwards. For each grid the
 * cells are derived first; only if the grid yields cells are the text items
 * inside its (slightly padded) bounding box claimed for the table, so text
 * inside a degenerate grid is still emitted as ordinary text. Unclaimed items
 * go through the text-only fallback plus list detection, and all blocks are
 * finally ordered by the top edge of their bounding boxes.
 *
 * @param {object[]} items - Visible text items of the page.
 * @param {number} pageNum - Page number recorded on each block.
 * @param {object[]} grids - Grids from `buildTableGrids`.
 * @param {object[]} horizontals - Horizontal rulings.
 * @param {object[]} verticals - Vertical rulings.
 * @returns {object[]} Table and text blocks for the page.
 */
function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
  const blocks = [];
  const usedItems = new Set();
  const pad = 3;
  const sortedGrids = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);

  for (const grid of sortedGrids) {
    // Derive cells before claiming any text: a grid without cells must not
    // swallow the items that happen to lie inside its bounding box.
    const cells = extractCells(grid, horizontals, verticals);
    if (cells.length === 0) continue;

    const tableItems = [];
    for (const item of items) {
      if (usedItems.has(item)) continue;
      if (
        item.x >= grid.bbox.x1 - pad &&
        item.x + item.w <= grid.bbox.x2 + pad &&
        item.y >= grid.bbox.y1 - pad &&
        item.y <= grid.bbox.y2 + pad
      ) {
        tableItems.push(item);
        usedItems.add(item);
      }
    }

    const textItems = tableItems.map((i) => ({
      text: i.text,
      x: i.x,
      y: i.y,
      w: i.w,
      h: i.h,
      fontSize: i.fontSize,
      fontName: i.fontName,
    }));
    const cellTextMap = mapTextToCells(textItems, cells);

    const numRows = grid.rowYs.length - 1;
    const numCols = grid.colXs.length - 1;
    const irGrid = Array.from({ length: numRows }, () =>
      Array.from({ length: numCols }, () => ({ text: "", colSpan: 1, rowSpan: 1 }))
    );
    for (const cell of cells) {
      const cellItems = cellTextMap.get(cell) || [];
      irGrid[cell.row][cell.col] = {
        text: cellTextToString(cellItems),
        colSpan: cell.colSpan,
        rowSpan: cell.rowSpan,
      };
    }

    blocks.push({
      type: "table",
      table: {
        rows: numRows,
        cols: numCols,
        cells: irGrid,
        hasHeader: numRows > 1,
      },
      pageNumber: pageNum,
      bbox: {
        page: pageNum,
        x: grid.bbox.x1,
        y: grid.bbox.y1,
        width: grid.bbox.x2 - grid.bbox.x1,
        height: grid.bbox.y2 - grid.bbox.y1,
      },
    });
  }

  const remaining = items.filter((i) => !usedItems.has(i));
  if (remaining.length > 0) {
    remaining.sort((a, b) => b.y - a.y || a.x - b.x);
    const textBlocks = detectListBlocks(extractPageBlocksFallback(remaining, pageNum));
    const allBlocks = [...blocks, ...textBlocks];
    // Order by the top edge (y + height), highest first; blocks without a
    // bbox sort as if their top edge were 0.
    allBlocks.sort((a, b) => {
      const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
      const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
      return by - ay;
    });
    return allBlocks;
  }
  return blocks;
}
|
|
1986
|
+
/**
 * Builds text blocks for a page (or page remainder) without ruling lines.
 *
 * If the whole item set forms at least three text columns it is emitted as
 * one column-aligned paragraph. Otherwise the items are split into
 * reading-order groups with `xyCutOrder`; each group is emitted either as a
 * column-aligned paragraph (three or more columns) or as one paragraph per
 * visual line.
 *
 * @param {object[]} items - Text items with `x`, `y`, `w`, `h`, `fontSize`.
 * @param {number} pageNum - Page number recorded on each block.
 * @returns {object[]} Paragraph blocks with `bbox` and `style`.
 */
function extractPageBlocksFallback(items, pageNum) {
  if (items.length === 0) return [];

  const blocks = [];
  const allYLines = groupByY(items);
  const columns = detectColumns(allYLines);
  if (columns && columns.length >= 3) {
    const tableText = extractWithColumns(allYLines, columns);
    const bbox = computeBBox(items, pageNum);
    blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(items) });
    return blocks;
  }

  // Vertical extent of the text. Computed with a loop rather than
  // Math.max(...ys): spreading one argument per item throws a RangeError on
  // pages with a very large number of text items.
  let lowestY = Infinity;
  let highestY = -Infinity;
  for (const item of items) {
    if (item.y < lowestY) lowestY = item.y;
    if (item.y > highestY) highestY = item.y;
  }
  const textExtent = highestY - lowestY;
  const gapThreshold = Math.max(15, textExtent * 0.03);

  for (const group of xyCutOrder(items, gapThreshold)) {
    if (group.length === 0) continue;
    const yLines = groupByY(group);
    const groupColumns = detectColumns(yLines);
    if (groupColumns && groupColumns.length >= 3) {
      const tableText = extractWithColumns(yLines, groupColumns);
      const bbox = computeBBox(group, pageNum);
      blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(group) });
      continue;
    }
    for (const line of yLines) {
      const text = mergeLineSimple(line);
      if (!text.trim()) continue;
      const bbox = computeBBox(line, pageNum);
      blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style: dominantStyle(line) });
    }
  }
  return blocks;
}
|
|
2020
|
+
/**
 * Computes the bounding box of a set of text items on a page.
 *
 * Each item spans x..x+w horizontally and y..y+height vertically, where the
 * height is `h` when positive and `fontSize` otherwise.
 *
 * @param {{x:number,y:number,w:number,h:number,fontSize:number}[]} items
 * @param {number} pageNum
 * @returns {{page:number,x:number,y:number,width:number,height:number}}
 */
function computeBBox(items, pageNum) {
  let left = Infinity;
  let bottom = Infinity;
  let right = -Infinity;
  let top = -Infinity;

  for (const item of items) {
    const itemRight = item.x + item.w;
    const itemTop = item.y + (item.h > 0 ? item.h : item.fontSize);
    if (item.x < left) left = item.x;
    if (item.y < bottom) bottom = item.y;
    if (itemRight > right) right = itemRight;
    if (itemTop > top) top = itemTop;
  }

  return { page: pageNum, x: left, y: bottom, width: right - left, height: top - bottom };
}
|
|
2031
|
+
/**
 * Picks the most frequent positive font size among the items.
 *
 * On a tie the size that reached the top count first wins. The font name is
 * taken from the first item using that size (an empty name becomes
 * undefined).
 *
 * @param {{fontSize:number,fontName?:string}[]} items
 * @returns {{fontSize:number,fontName:(string|undefined)}|undefined}
 *   Undefined when there are no items or no positive font size.
 */
function dominantStyle(items) {
  if (items.length === 0) return undefined;

  const tally = new Map();
  let leadingCount = 0;
  let leadingSize = 0;
  for (const { fontSize } of items) {
    if (fontSize <= 0) continue;
    const seen = (tally.get(fontSize) || 0) + 1;
    tally.set(fontSize, seen);
    if (seen > leadingCount) {
      leadingCount = seen;
      leadingSize = fontSize;
    }
  }
  if (leadingSize === 0) return undefined;

  const sample = items.find((item) => item.fontSize === leadingSize);
  return { fontSize: leadingSize, fontName: sample?.fontName || undefined };
}
|
|
1057
2048
|
/**
 * Converts raw text-content items into the internal item shape.
 *
 * Items without a non-blank string are dropped. Position comes from the
 * transform's translation components, the font size from the larger of the
 * two scale components, and all numbers are rounded. The result is sorted
 * top to bottom (descending y), then left to right.
 *
 * @param {{str:any,transform:number[],width:number,height:number,fontName?:string}[]} rawItems
 * @returns {{text:string,x:number,y:number,w:number,h:number,fontSize:number,fontName:string,isHidden:boolean}[]}
 */
function normalizeItems(rawItems) {
  const normalized = [];
  for (const raw of rawItems) {
    if (typeof raw.str !== "string") continue;
    const text = raw.str.trim();
    if (text === "") continue;

    const matrix = raw.transform;
    const fontSize = Math.round(Math.max(Math.abs(matrix[3]), Math.abs(matrix[0])));
    normalized.push({
      text,
      x: Math.round(matrix[4]),
      y: Math.round(matrix[5]),
      w: Math.round(raw.width),
      h: Math.round(raw.height),
      fontSize,
      fontName: raw.fontName || "",
      // A 0pt font or zero width marks hidden text (suspected prompt injection).
      isHidden: fontSize === 0 || (raw.width === 0 && text.length > 0),
    });
  }
  return normalized.sort((a, b) => b.y - a.y || a.x - b.x);
}
|
|
1066
2066
|
function groupByY(items) {
|
|
1067
2067
|
if (items.length === 0) return [];
|
|
@@ -1299,6 +2299,27 @@ function startsWithMarker(line) {
|
|
|
1299
2299
|
/**
 * Tests whether a line is a standalone Korean legal-style header: "제" + a
 * number + one of 조/항/호/장/절, an optional parenthesised title, and at most
 * seven further whitespace-separated words.
 *
 * @param {string} line
 * @returns {boolean}
 */
function isStandaloneHeader(line) {
  const headerPattern = /^제\d+[조항호장절](\([^)]*\))?(\s+\S+){0,7}$/;
  const candidate = line.trim();
  return headerPattern.test(candidate);
}
|
|
2302
|
+
/**
 * Re-tags paragraphs that start with "<digits>. " as ordered list blocks.
 *
 * Matching blocks are copied with `type: "list"` and `listType: "ordered"`;
 * every other block is passed through unchanged (same object).
 *
 * @param {object[]} blocks
 * @returns {object[]} A new array in the same order.
 */
function detectListBlocks(blocks) {
  const orderedMarker = /^(\d+)\.\s/;
  const result = [];
  for (const block of blocks) {
    const isNumbered =
      block.type === "paragraph" && Boolean(block.text) && orderedMarker.test(block.text);
    if (!isNumbered) {
      result.push(block);
      continue;
    }
    result.push({
      ...block,
      type: "list",
      listType: "ordered",
      // Keep the original numbering in the text (blocksToMarkdown emits it as is).
      text: block.text,
    });
  }
  return result;
}
|
|
1302
2323
|
function mergeKoreanLines(text) {
|
|
1303
2324
|
if (!text) return "";
|
|
1304
2325
|
const lines = text.split("\n");
|
|
@@ -1307,6 +2328,10 @@ function mergeKoreanLines(text) {
|
|
|
1307
2328
|
for (let i = 1; i < lines.length; i++) {
|
|
1308
2329
|
const prev = result[result.length - 1];
|
|
1309
2330
|
const curr = lines[i];
|
|
2331
|
+
if (/^#{1,6}\s/.test(prev) || /^#{1,6}\s/.test(curr)) {
|
|
2332
|
+
result.push(curr);
|
|
2333
|
+
continue;
|
|
2334
|
+
}
|
|
1310
2335
|
if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev)) {
|
|
1311
2336
|
result[result.length - 1] = prev + " " + curr;
|
|
1312
2337
|
} else {
|
|
@@ -1448,12 +2473,13 @@ function fallbackAlign(a, b) {
|
|
|
1448
2473
|
}
|
|
1449
2474
|
/**
 * Scores how similar two blocks are.
 *
 * - Different `type` values score 0.
 * - When both blocks define `text`, the score is delegated to
 *   `normalizedSimilarity` on the two texts (falsy text is treated as "").
 * - When both are tables carrying a `table` payload, the score is delegated
 *   to `tableSimilarity`.
 * - Any other pair of same-typed blocks scores 1.
 *
 * @param {object} a - First block (`type`, optional `text` / `table`).
 * @param {object} b - Second block (`type`, optional `text` / `table`).
 * @returns {number} Similarity score.
 */
function blockSimilarity(a, b) {
  if (a.type !== b.type) return 0;
  if (a.text !== void 0 && b.text !== void 0) {
    return normalizedSimilarity(a.text || "", b.text || "");
  }
  if (a.type === "table" && a.table && b.table) {
    return tableSimilarity(a.table, b.table);
  }
  // Types are known to be equal here, so the former `a.type === b.type`
  // re-check was always true and the `return 0` after it was unreachable.
  return 1;
}
|
|
1459
2485
|
function tableSimilarity(a, b) {
|
|
@@ -1724,16 +2750,16 @@ async function parse(buffer, options) {
|
|
|
1724
2750
|
}
|
|
1725
2751
|
/**
 * Parses an HWPX buffer and wraps the outcome in a result object.
 *
 * On success the markdown, blocks, metadata, outline and warnings produced by
 * `parseHwpxDocument` are returned with `success: true`. Any thrown error is
 * caught and reported as `success: false`, with the error message (or a
 * default message for non-Error values) and a code from `classifyError`.
 *
 * @param {*} buffer - Document data, passed through to `parseHwpxDocument`.
 * @param {*} options - Parse options, passed through to `parseHwpxDocument`.
 * @returns {Promise<object>} Success or failure result; never rejects for
 *   errors raised by `parseHwpxDocument`.
 */
async function parseHwpx(buffer, options) {
  const fileType = "hwpx";
  try {
    const { markdown, blocks, metadata, outline, warnings } = await parseHwpxDocument(buffer, options);
    return {
      success: true,
      fileType,
      markdown,
      blocks,
      metadata,
      outline,
      warnings,
    };
  } catch (err) {
    let error = "HWPX \uD30C\uC2F1 \uC2E4\uD328";
    if (err instanceof Error) {
      error = err.message;
    }
    return { success: false, fileType, error, code: classifyError(err) };
  }
}
|
|
1733
2759
|
async function parseHwp(buffer, options) {
|
|
1734
2760
|
try {
|
|
1735
|
-
const { markdown, blocks, metadata } = parseHwp5Document(Buffer.from(buffer), options);
|
|
1736
|
-
return { success: true, fileType: "hwp", markdown, blocks, metadata };
|
|
2761
|
+
const { markdown, blocks, metadata, outline, warnings } = parseHwp5Document(Buffer.from(buffer), options);
|
|
2762
|
+
return { success: true, fileType: "hwp", markdown, blocks, metadata, outline, warnings };
|
|
1737
2763
|
} catch (err) {
|
|
1738
2764
|
return { success: false, fileType: "hwp", error: err instanceof Error ? err.message : "HWP \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
|
|
1739
2765
|
}
|