kordoc 1.4.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +18 -3
- package/dist/{chunk-BWZW234S.js → chunk-5SZWGBNL.js} +1083 -57
- package/dist/chunk-5SZWGBNL.js.map +1 -0
- package/dist/cli.js +2 -2
- package/dist/index.cjs +1085 -59
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +55 -2
- package/dist/index.d.ts +55 -2
- package/dist/index.js +1082 -56
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +18 -4
- package/dist/mcp.js.map +1 -1
- package/dist/{provider-JB7SY74K.js → provider-A4FHJSID.js} +2 -2
- package/dist/provider-A4FHJSID.js.map +1 -0
- package/dist/{watch-LIGKH3QS.js → watch-YCWNFYAW.js} +2 -2
- package/package.json +2 -1
- package/dist/chunk-BWZW234S.js.map +0 -1
- package/dist/provider-JB7SY74K.js.map +0 -1
- /package/dist/{watch-LIGKH3QS.js.map → watch-YCWNFYAW.js.map} +0 -0
|
@@ -85,8 +85,29 @@ function blocksToMarkdown(blocks) {
|
|
|
85
85
|
const lines = [];
|
|
86
86
|
for (let i = 0; i < blocks.length; i++) {
|
|
87
87
|
const block = blocks[i];
|
|
88
|
+
if (block.type === "heading" && block.text) {
|
|
89
|
+
const prefix = "#".repeat(Math.min(block.level || 2, 6));
|
|
90
|
+
lines.push("", `${prefix} ${block.text}`, "");
|
|
91
|
+
continue;
|
|
92
|
+
}
|
|
93
|
+
if (block.type === "separator") {
|
|
94
|
+
lines.push("", "---", "");
|
|
95
|
+
continue;
|
|
96
|
+
}
|
|
97
|
+
if (block.type === "list" && block.text) {
|
|
98
|
+
const alreadyNumbered = block.listType === "ordered" && /^\d+\.\s/.test(block.text);
|
|
99
|
+
const prefix = alreadyNumbered ? "" : block.listType === "ordered" ? "1. " : "- ";
|
|
100
|
+
lines.push(`${prefix}${block.text}`);
|
|
101
|
+
if (block.children) {
|
|
102
|
+
for (const child of block.children) {
|
|
103
|
+
const childPrefix = child.listType === "ordered" ? "1." : "-";
|
|
104
|
+
lines.push(` ${childPrefix} ${child.text || ""}`);
|
|
105
|
+
}
|
|
106
|
+
}
|
|
107
|
+
continue;
|
|
108
|
+
}
|
|
88
109
|
if (block.type === "paragraph" && block.text) {
|
|
89
|
-
|
|
110
|
+
let text = block.text;
|
|
90
111
|
if (/^\[별표\s*\d+/.test(text)) {
|
|
91
112
|
const nextBlock = blocks[i + 1];
|
|
92
113
|
if (nextBlock?.type === "paragraph" && nextBlock.text && /관련\)?$/.test(nextBlock.text)) {
|
|
@@ -101,9 +122,19 @@ function blocksToMarkdown(blocks) {
|
|
|
101
122
|
lines.push(`*${text}*`, "");
|
|
102
123
|
continue;
|
|
103
124
|
}
|
|
125
|
+
if (block.href) {
|
|
126
|
+
text = `[${text}](${block.href})`;
|
|
127
|
+
}
|
|
128
|
+
if (block.footnoteText) {
|
|
129
|
+
text += ` (\uC8FC: ${block.footnoteText})`;
|
|
130
|
+
}
|
|
104
131
|
lines.push(text);
|
|
105
132
|
} else if (block.type === "table" && block.table) {
|
|
133
|
+
if (lines.length > 0 && lines[lines.length - 1] !== "") {
|
|
134
|
+
lines.push("");
|
|
135
|
+
}
|
|
106
136
|
lines.push(tableToMarkdown(block.table));
|
|
137
|
+
lines.push("");
|
|
107
138
|
}
|
|
108
139
|
}
|
|
109
140
|
return lines.join("\n").trim();
|
|
@@ -154,7 +185,7 @@ function tableToMarkdown(table) {
|
|
|
154
185
|
}
|
|
155
186
|
|
|
156
187
|
// src/utils.ts
|
|
157
|
-
var VERSION = true ? "1.
|
|
188
|
+
var VERSION = true ? "1.5.0" : "0.0.0-dev";
|
|
158
189
|
function toArrayBuffer(buf) {
|
|
159
190
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
160
191
|
return buf.buffer;
|
|
@@ -228,6 +259,75 @@ var MAX_ZIP_ENTRIES = 500;
|
|
|
228
259
|
/**
 * Clamp a table span value into the inclusive range [1, max].
 *
 * The lower bound is applied last, so the result is never below 1 even
 * when `max` itself is smaller than 1.
 *
 * @param {number} val - Requested span.
 * @param {number} max - Upper bound for the span.
 * @returns {number} The clamped span.
 */
function clampSpan(val, max) {
  const capped = Math.min(val, max);
  return Math.max(1, capped);
}
|
|
262
|
+
/**
 * Collect character properties and style definitions from the header XML
 * of an HWPX archive.
 *
 * Candidate header paths are tried in order. Each is looked up exactly first,
 * then case-insensitively across all archive entries. The first candidate
 * that parses into a document with a root element wins; candidates that are
 * missing, fail to read/parse, or have no root element are skipped.
 *
 * @param {object} zip - Archive object exposing `file(path)` and a `files` map
 *   whose entries have `name` and `async("text")`.
 * @returns {Promise<{charProperties: Map, styles: Map}>} Maps filled by
 *   `parseCharProperties` / `parseStyleElements`; both empty when no usable
 *   header was found.
 */
async function extractHwpxStyles(zip) {
  const charProperties = /* @__PURE__ */ new Map();
  const styles = /* @__PURE__ */ new Map();
  const candidates = ["Contents/header.xml", "header.xml", "Contents/head.xml", "head.xml"];

  for (const candidate of candidates) {
    let entry = zip.file(candidate);
    if (!entry) {
      // Fall back to a case-insensitive match over every archive entry.
      const wanted = candidate.toLowerCase();
      entry = Object.values(zip.files).find((f) => f.name.toLowerCase() === wanted);
    }
    if (!entry) continue;

    try {
      const xml = await entry.async("text");
      const doc = new DOMParser().parseFromString(stripDtd(xml), "text/xml");
      if (doc.documentElement) {
        parseCharProperties(doc, charProperties);
        parseStyleElements(doc, styles);
        break;
      }
    } catch {
      // Style extraction is best-effort: move on to the next candidate path.
    }
  }

  return { charProperties, styles };
}
|
|
286
|
+
/**
 * Read character-property elements from a parsed header document into `map`.
 *
 * Elements are looked up under the tag names `hh:charPr`, `charPr` and
 * `hp:charPr`. Each element is keyed by its `id` (or `IDRef`) attribute;
 * elements with neither are ignored.
 *
 * Recorded fields (all optional):
 *  - `fontSize`: the `height` attribute divided by 100 (presumably hundredths
 *    of a point — TODO confirm). Only finite, positive values are stored, so a
 *    malformed `height` never produces `NaN`.
 *  - `bold` / `italic`: set to `true` when the attribute is "true" or "1".
 *  - `fontName`: `face` (or `FontFace`) of the first descendant whose local
 *    tag is `fontface` or `fontRef` and that carries such an attribute.
 *
 * @param {object} doc - DOM-like document exposing `getElementsByTagName`.
 * @param {Map<string, object>} map - Destination map, mutated in place.
 * @returns {void}
 */
function parseCharProperties(doc, map) {
  const tagNames = ["hh:charPr", "charPr", "hp:charPr"];
  const isOn = (value) => value === "true" || value === "1";

  for (const tagName of tagNames) {
    const elements = doc.getElementsByTagName(tagName);
    for (let i = 0; i < elements.length; i++) {
      const el = elements[i];
      const id = el.getAttribute("id") || el.getAttribute("IDRef") || "";
      if (!id) continue;

      const prop = {};

      // Guard against non-numeric heights: parseInt would yield NaN.
      const height = Number.parseInt(el.getAttribute("height") ?? "", 10);
      if (Number.isFinite(height) && height > 0) prop.fontSize = height / 100;

      if (isOn(el.getAttribute("bold"))) prop.bold = true;
      if (isOn(el.getAttribute("italic"))) prop.italic = true;

      // Scan every descendant; the first font element with a face name wins.
      const descendants = el.getElementsByTagName("*");
      for (let j = 0; j < descendants.length; j++) {
        const ff = descendants[j];
        const localTag = (ff.tagName || "").replace(/^[^:]+:/, "");
        if (localTag !== "fontface" && localTag !== "fontRef") continue;
        const face = ff.getAttribute("face") || ff.getAttribute("FontFace");
        if (face) {
          prop.fontName = face;
          break;
        }
      }

      map.set(id, prop);
    }
  }
}
|
|
317
|
+
/**
 * Read style elements from a parsed header document into `map`.
 *
 * Elements are looked up under the tag names `hh:style`, `style` and
 * `hp:style`. The key is the `id` attribute, then `IDRef`, and finally the
 * element's index within its tag-name group when neither is present.
 *
 * @param {object} doc - DOM-like document exposing `getElementsByTagName`.
 * @param {Map<string, {name: string, charPrId?: string, paraPrId?: string}>} map
 *   Destination map, mutated in place.
 * @returns {void}
 */
function parseStyleElements(doc, map) {
  for (const tagName of ["hh:style", "style", "hp:style"]) {
    const nodes = doc.getElementsByTagName(tagName);
    let index = 0;
    while (index < nodes.length) {
      const node = nodes[index];
      const attr = (key) => node.getAttribute(key);
      const key = attr("id") || attr("IDRef") || String(index);
      map.set(key, {
        name: attr("name") || attr("engName") || "",
        // Empty/missing references are normalised to undefined.
        charPrId: attr("charPrIDRef") || void 0,
        paraPrId: attr("paraPrIDRef") || void 0,
      });
      index += 1;
    }
  }
}
|
|
231
331
|
/**
 * Remove every `<!DOCTYPE ...>` declaration (including an optional
 * bracketed internal subset) from an XML string before it is parsed.
 *
 * @param {string} xml - Raw XML text.
 * @returns {string} The XML with DOCTYPE declarations removed.
 */
function stripDtd(xml) {
  const doctypePattern = /<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi;
  return xml.replace(doctypePattern, "");
}
|
|
@@ -251,6 +351,8 @@ async function parseHwpxDocument(buffer, options) {
|
|
|
251
351
|
}
|
|
252
352
|
const metadata = {};
|
|
253
353
|
await extractHwpxMetadata(zip, metadata);
|
|
354
|
+
const styleMap = await extractHwpxStyles(zip);
|
|
355
|
+
const warnings = [];
|
|
254
356
|
const sectionPaths = await resolveSectionPaths(zip);
|
|
255
357
|
if (sectionPaths.length === 0) throw new KordocError("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
256
358
|
metadata.pageCount = sectionPaths.length;
|
|
@@ -264,10 +366,12 @@ async function parseHwpxDocument(buffer, options) {
|
|
|
264
366
|
const xml = await file.async("text");
|
|
265
367
|
totalDecompressed += xml.length * 2;
|
|
266
368
|
if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
267
|
-
blocks.push(...parseSectionXml(xml));
|
|
369
|
+
blocks.push(...parseSectionXml(xml, styleMap, warnings, si + 1));
|
|
268
370
|
}
|
|
371
|
+
detectHwpxHeadings(blocks, styleMap);
|
|
372
|
+
const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
269
373
|
const markdown = blocksToMarkdown(blocks);
|
|
270
|
-
return { markdown, blocks, metadata };
|
|
374
|
+
return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0 };
|
|
271
375
|
}
|
|
272
376
|
async function extractHwpxMetadata(zip, metadata) {
|
|
273
377
|
try {
|
|
@@ -446,15 +550,50 @@ function parseSectionPathsFromManifest(xml) {
|
|
|
446
550
|
}
|
|
447
551
|
return Array.from(idToHref.entries()).filter(([id]) => isSectionId(id)).sort((a, b) => a[0].localeCompare(b[0])).map(([, href]) => href);
|
|
448
552
|
}
|
|
449
|
-
function
|
|
553
|
+
/**
 * Promote paragraph blocks to headings in place.
 *
 * The most frequent `style.fontSize` among all blocks is taken as the body
 * size. A paragraph whose size is at least 1.5x / 1.3x / 1.15x the body size
 * becomes a level 1 / 2 / 3 heading. Independently of font size, a short
 * paragraph (<= 50 chars) starting with "제<digits>" followed by one of
 * 조/장/절/편 becomes a level-3 heading when no size-based level applied.
 *
 * Paragraphs that are empty, longer than 200 characters, or consist only of
 * digits are never promoted.
 *
 * @param {Array<object>} blocks - Parsed blocks; matching entries get
 *   `type = "heading"` and a `level`.
 * @param {object} styleMap - Not consulted by this function.
 * @returns {void}
 */
function detectHwpxHeadings(blocks, styleMap) {
  // Histogram of font sizes across all blocks that carry one.
  const tally = /* @__PURE__ */ new Map();
  for (const candidate of blocks) {
    const size = candidate.style?.fontSize;
    if (size) tally.set(size, (tally.get(size) || 0) + 1);
  }

  // Body size = most frequent size; the first one seen wins ties.
  let baseFontSize = 0;
  let bestCount = 0;
  tally.forEach((count, size) => {
    if (count > bestCount) {
      bestCount = count;
      baseFontSize = size;
    }
  });

  const levelForRatio = (ratio) => {
    if (ratio >= 1.5) return 1;
    if (ratio >= 1.3) return 2;
    if (ratio >= 1.15) return 3;
    return 0;
  };

  for (const block of blocks) {
    if (block.type !== "paragraph" || !block.text) continue;
    const text = block.text.trim();
    if (text.length === 0 || text.length > 200) continue;
    if (/^\d+$/.test(text)) continue;

    const size = block.style?.fontSize;
    let level = baseFontSize > 0 && size ? levelForRatio(size / baseFontSize) : 0;

    // Text-pattern fallback for article/chapter/section style titles.
    if (level === 0 && text.length <= 50 && /^제\d+[조장절편]/.test(text)) {
      level = 3;
    }

    if (level === 0) continue;
    block.type = "heading";
    block.level = level;
  }
}
|
|
588
|
+
/**
 * Parse one HWPX section XML string into a flat list of blocks.
 *
 * The DOCTYPE is stripped before parsing. When the parsed document has no
 * root element an empty list is returned; otherwise the tree is handed to
 * `walkSection`, which appends blocks (and warnings) as it traverses.
 *
 * @param {string} xml - Section XML text.
 * @param {object} styleMap - Style lookup forwarded to `walkSection`.
 * @param {Array<object>} warnings - Warning sink forwarded to `walkSection`.
 * @param {number} sectionNum - Section number forwarded to `walkSection`.
 * @returns {Array<object>} Blocks collected for this section.
 */
function parseSectionXml(xml, styleMap, warnings, sectionNum) {
  const doc = new DOMParser().parseFromString(stripDtd(xml), "text/xml");
  const root = doc.documentElement;
  if (!root) return [];

  const collected = [];
  // Start outside any table: no table context, empty table stack.
  walkSection(root, collected, null, [], styleMap, warnings, sectionNum);
  return collected;
}
|
|
457
|
-
function walkSection(node, blocks, tableCtx, tableStack) {
|
|
596
|
+
function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum) {
|
|
458
597
|
const children = node.childNodes;
|
|
459
598
|
if (!children) return;
|
|
460
599
|
for (let i = 0; i < children.length; i++) {
|
|
@@ -466,7 +605,7 @@ function walkSection(node, blocks, tableCtx, tableStack) {
|
|
|
466
605
|
case "tbl": {
|
|
467
606
|
if (tableCtx) tableStack.push(tableCtx);
|
|
468
607
|
const newTable = { rows: [], currentRow: [], cell: null };
|
|
469
|
-
walkSection(el, blocks, newTable, tableStack);
|
|
608
|
+
walkSection(el, blocks, newTable, tableStack, styleMap, warnings, sectionNum);
|
|
470
609
|
if (newTable.rows.length > 0) {
|
|
471
610
|
if (tableStack.length > 0) {
|
|
472
611
|
const parentTable = tableStack.pop();
|
|
@@ -476,7 +615,7 @@ function walkSection(node, blocks, tableCtx, tableStack) {
|
|
|
476
615
|
}
|
|
477
616
|
tableCtx = parentTable;
|
|
478
617
|
} else {
|
|
479
|
-
blocks.push({ type: "table", table: buildTable(newTable.rows) });
|
|
618
|
+
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
480
619
|
tableCtx = null;
|
|
481
620
|
}
|
|
482
621
|
} else {
|
|
@@ -487,7 +626,7 @@ function walkSection(node, blocks, tableCtx, tableStack) {
|
|
|
487
626
|
case "tr":
|
|
488
627
|
if (tableCtx) {
|
|
489
628
|
tableCtx.currentRow = [];
|
|
490
|
-
walkSection(el, blocks, tableCtx, tableStack);
|
|
629
|
+
walkSection(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum);
|
|
491
630
|
if (tableCtx.currentRow.length > 0) tableCtx.rows.push(tableCtx.currentRow);
|
|
492
631
|
tableCtx.currentRow = [];
|
|
493
632
|
}
|
|
@@ -495,7 +634,7 @@ function walkSection(node, blocks, tableCtx, tableStack) {
|
|
|
495
634
|
case "tc":
|
|
496
635
|
if (tableCtx) {
|
|
497
636
|
tableCtx.cell = { text: "", colSpan: 1, rowSpan: 1 };
|
|
498
|
-
walkSection(el, blocks, tableCtx, tableStack);
|
|
637
|
+
walkSection(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum);
|
|
499
638
|
if (tableCtx.cell) {
|
|
500
639
|
tableCtx.currentRow.push(tableCtx.cell);
|
|
501
640
|
tableCtx.cell = null;
|
|
@@ -511,25 +650,75 @@ function walkSection(node, blocks, tableCtx, tableStack) {
|
|
|
511
650
|
}
|
|
512
651
|
break;
|
|
513
652
|
case "p": {
|
|
514
|
-
const text =
|
|
653
|
+
const { text, href, footnote, style } = extractParagraphInfo(el, styleMap);
|
|
515
654
|
if (text) {
|
|
516
655
|
if (tableCtx?.cell) {
|
|
517
656
|
tableCtx.cell.text += (tableCtx.cell.text ? "\n" : "") + text;
|
|
518
657
|
} else if (!tableCtx) {
|
|
519
|
-
|
|
658
|
+
const block = { type: "paragraph", text, pageNumber: sectionNum };
|
|
659
|
+
if (style) block.style = style;
|
|
660
|
+
if (href) block.href = href;
|
|
661
|
+
if (footnote) block.footnoteText = footnote;
|
|
662
|
+
blocks.push(block);
|
|
520
663
|
}
|
|
521
664
|
}
|
|
522
|
-
|
|
665
|
+
tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum);
|
|
523
666
|
break;
|
|
524
667
|
}
|
|
668
|
+
// 이미지/그림 — 경고 수집
|
|
669
|
+
case "pic":
|
|
670
|
+
case "shape":
|
|
671
|
+
case "drawingObject":
|
|
672
|
+
if (warnings && sectionNum) {
|
|
673
|
+
warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
|
|
674
|
+
}
|
|
675
|
+
break;
|
|
525
676
|
default:
|
|
526
|
-
walkSection(el, blocks, tableCtx, tableStack);
|
|
677
|
+
walkSection(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum);
|
|
527
678
|
break;
|
|
528
679
|
}
|
|
529
680
|
}
|
|
530
681
|
}
|
|
531
|
-
function
|
|
682
|
+
/**
 * Handle tables and drawing objects that appear as direct element children
 * of a paragraph node.
 *
 * - `tbl`: the table is walked with a fresh table context. If it produced
 *   rows and an enclosing table is on the stack, its text form is appended
 *   to the enclosing table's current cell; with no enclosing table it is
 *   pushed to `blocks` as a table block. The enclosing context (or null)
 *   becomes the current context afterwards.
 * - `pic` / `shape` / `drawingObject`: a "SKIPPED_IMAGE" warning is recorded
 *   when both `warnings` and `sectionNum` are truthy.
 *
 * NOTE(review): only direct children are inspected here; confirm that tables
 * and pictures nested one level deeper (e.g. inside a run element) are
 * handled elsewhere.
 *
 * @param {object} node - Paragraph DOM node.
 * @param {Array<object>} blocks - Output block list (mutated).
 * @param {object|null} tableCtx - Current table context, if inside a table.
 * @param {Array<object>} tableStack - Stack of enclosing table contexts (mutated).
 * @param {object} styleMap - Forwarded to `walkSection`.
 * @param {Array<object>} warnings - Warning sink (mutated).
 * @param {number} sectionNum - Section number used as page number.
 * @returns {object|null} The table context that is current after processing.
 */
function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum) {
  const kids = node.childNodes;
  if (!kids) return tableCtx;

  for (let idx = 0; idx < kids.length; idx += 1) {
    const el = kids[idx];
    if (el.nodeType !== 1) continue;

    const qualified = el.tagName || el.localName || "";
    const localTag = qualified.replace(/^[^:]+:/, "");

    switch (localTag) {
      case "tbl": {
        if (tableCtx) tableStack.push(tableCtx);
        const nested = { rows: [], currentRow: [], cell: null };
        walkSection(el, blocks, nested, tableStack, styleMap, warnings, sectionNum);
        const hasRows = nested.rows.length > 0;

        if (tableStack.length > 0) {
          // Nested table: fold its text into the enclosing cell and resume there.
          const enclosing = tableStack.pop();
          if (hasRows) {
            const nestedText = convertTableToText(nested.rows);
            if (enclosing.cell) {
              enclosing.cell.text += (enclosing.cell.text ? "\n" : "") + nestedText;
            }
          }
          tableCtx = enclosing;
        } else {
          if (hasRows) {
            blocks.push({ type: "table", table: buildTable(nested.rows), pageNumber: sectionNum });
          }
          tableCtx = null;
        }
        break;
      }
      case "pic":
      case "shape":
      case "drawingObject":
        if (warnings && sectionNum) {
          warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
        }
        break;
      default:
        break;
    }
  }

  return tableCtx;
}
|
|
717
|
+
function extractParagraphInfo(para, styleMap) {
|
|
532
718
|
let text = "";
|
|
719
|
+
let href;
|
|
720
|
+
let footnote;
|
|
721
|
+
let charPrId;
|
|
533
722
|
const walk = (node) => {
|
|
534
723
|
const children = node.childNodes;
|
|
535
724
|
if (!children) return;
|
|
@@ -558,6 +747,29 @@ function extractParagraphText(para) {
|
|
|
558
747
|
case "tbl":
|
|
559
748
|
break;
|
|
560
749
|
// 테이블은 walkSection에서 처리
|
|
750
|
+
// 하이퍼링크
|
|
751
|
+
case "hyperlink": {
|
|
752
|
+
const url = child.getAttribute("url") || child.getAttribute("href") || "";
|
|
753
|
+
if (url) href = url;
|
|
754
|
+
walk(child);
|
|
755
|
+
break;
|
|
756
|
+
}
|
|
757
|
+
// 각주/미주
|
|
758
|
+
case "footNote":
|
|
759
|
+
case "endNote":
|
|
760
|
+
case "fn":
|
|
761
|
+
case "en": {
|
|
762
|
+
const noteText = extractTextFromNode(child);
|
|
763
|
+
if (noteText) footnote = (footnote ? footnote + "; " : "") + noteText;
|
|
764
|
+
break;
|
|
765
|
+
}
|
|
766
|
+
// run 요소에서 charPrIDRef 추출
|
|
767
|
+
case "r": {
|
|
768
|
+
const runCharPr = child.getAttribute("charPrIDRef");
|
|
769
|
+
if (runCharPr && !charPrId) charPrId = runCharPr;
|
|
770
|
+
walk(child);
|
|
771
|
+
break;
|
|
772
|
+
}
|
|
561
773
|
default:
|
|
562
774
|
walk(child);
|
|
563
775
|
break;
|
|
@@ -565,16 +777,43 @@ function extractParagraphText(para) {
|
|
|
565
777
|
}
|
|
566
778
|
};
|
|
567
779
|
walk(para);
|
|
568
|
-
|
|
780
|
+
const cleanText = text.replace(/[ \t]+/g, " ").trim();
|
|
781
|
+
let style;
|
|
782
|
+
if (styleMap && charPrId) {
|
|
783
|
+
const charProp = styleMap.charProperties.get(charPrId);
|
|
784
|
+
if (charProp) {
|
|
785
|
+
style = {};
|
|
786
|
+
if (charProp.fontSize) style.fontSize = charProp.fontSize;
|
|
787
|
+
if (charProp.bold) style.bold = true;
|
|
788
|
+
if (charProp.italic) style.italic = true;
|
|
789
|
+
if (charProp.fontName) style.fontName = charProp.fontName;
|
|
790
|
+
if (!style.fontSize && !style.bold && !style.italic) style = void 0;
|
|
791
|
+
}
|
|
792
|
+
}
|
|
793
|
+
return { text: cleanText, href, footnote, style };
|
|
794
|
+
}
|
|
795
|
+
/**
 * Concatenate all text beneath a DOM node, depth-first.
 *
 * Text nodes (nodeType 3) contribute their `textContent`; element nodes
 * (nodeType 1) contribute their own recursively extracted text. Other node
 * types are ignored. The result is trimmed at every level of recursion.
 *
 * @param {object} node - DOM-like node with an optional `childNodes` list.
 * @returns {string} Trimmed text content; "" when the node has no children.
 */
function extractTextFromNode(node) {
  const kids = node.childNodes;
  if (!kids) return "";

  const parts = [];
  for (let idx = 0; idx < kids.length; idx += 1) {
    const kid = kids[idx];
    switch (kid.nodeType) {
      case 3:
        parts.push(kid.textContent || "");
        break;
      case 1:
        parts.push(extractTextFromNode(kid));
        break;
      default:
        break;
    }
  }
  return parts.join("").trim();
}
|
|
570
806
|
|
|
571
807
|
// src/hwp5/record.ts
|
|
572
808
|
import { inflateRawSync as inflateRawSync2, inflateSync } from "zlib";
|
|
573
809
|
var TAG_PARA_HEADER = 66;
|
|
574
810
|
var TAG_PARA_TEXT = 67;
|
|
811
|
+
var TAG_CHAR_SHAPE = 68;
|
|
575
812
|
var TAG_CTRL_HEADER = 71;
|
|
576
813
|
var TAG_LIST_HEADER = 72;
|
|
577
814
|
var TAG_TABLE = 77;
|
|
815
|
+
var TAG_DOC_CHAR_SHAPE = 55;
|
|
816
|
+
var TAG_DOC_STYLE = 58;
|
|
578
817
|
var CHAR_LINE = 0;
|
|
579
818
|
var CHAR_PARA = 13;
|
|
580
819
|
var CHAR_TAB = 9;
|
|
@@ -625,6 +864,51 @@ function parseFileHeader(data) {
|
|
|
625
864
|
flags: data.readUInt32LE(36)
|
|
626
865
|
};
|
|
627
866
|
}
|
|
867
|
+
/**
 * Extract character shapes and styles from the DocInfo record list.
 *
 * Character shapes are collected in record order, and callers look them up
 * by position (`charShapes[id]`). Every char-shape record therefore occupies
 * exactly one slot: records too short to contain the size/attribute fields
 * (fewer than 50 bytes) contribute a zeroed placeholder instead of being
 * dropped, so later ids stay aligned.
 *
 * Style records (at least 8 bytes) are decoded as two length-prefixed
 * UTF-16LE strings followed by fixed-width fields. Every read is bounds
 * checked and falls back to ""/0 when the record is truncated.
 *
 * @param {Array<{tagId: number, data: Buffer}>} records - DocInfo records.
 * @returns {{charShapes: Array<{fontSize: number, attrFlags: number}>,
 *            styles: Array<{name: string, nameKo: string, charShapeId: number,
 *                           paraShapeId: number, type: number}>}}
 */
function parseDocInfo(records) {
  const charShapes = [];
  const styles = [];

  for (const rec of records) {
    if (rec.tagId === TAG_DOC_CHAR_SHAPE) {
      if (rec.data.length >= 50) {
        const fontSize = rec.data.readUInt32LE(42);
        const attrFlags = rec.data.readUInt32LE(46);
        charShapes.push({ fontSize, attrFlags });
      } else {
        // Keep positional ids aligned even for truncated records.
        charShapes.push({ fontSize: 0, attrFlags: 0 });
      }
      continue;
    }

    if (rec.tagId === TAG_DOC_STYLE && rec.data.length >= 8) {
      try {
        let offset = 0;

        // First name: 16-bit character count, then that many UTF-16LE units.
        const nameLen = rec.data.readUInt16LE(offset);
        offset += 2;
        const nameBytes = nameLen * 2;
        const name = nameBytes > 0 && offset + nameBytes <= rec.data.length
          ? rec.data.subarray(offset, offset + nameBytes).toString("utf16le")
          : "";
        offset += nameBytes;

        // Second name, same encoding (only if its length prefix is present).
        let nameKo = "";
        if (offset + 2 <= rec.data.length) {
          const nameKoLen = rec.data.readUInt16LE(offset);
          offset += 2;
          const nameKoBytes = nameKoLen * 2;
          if (nameKoBytes > 0 && offset + nameKoBytes <= rec.data.length) {
            nameKo = rec.data.subarray(offset, offset + nameKoBytes).toString("utf16le");
          }
          offset += nameKoBytes;
        }

        const type = offset < rec.data.length ? rec.data.readUInt8(offset) : 0;
        offset += 1;
        // Two 2-byte fields are skipped without being interpreted.
        // NOTE(review): verify these widths against the HWP 5.0 style record layout.
        offset += 2;
        offset += 2;

        const paraShapeId = offset + 2 <= rec.data.length ? rec.data.readUInt16LE(offset) : 0;
        offset += 2;
        const charShapeId = offset + 2 <= rec.data.length ? rec.data.readUInt16LE(offset) : 0;

        styles.push({ name, nameKo, charShapeId, paraShapeId, type });
      } catch {
        // Best-effort: a style record that cannot be decoded is skipped.
      }
    }
  }

  return { charShapes, styles };
}
|
|
628
912
|
function extractText(data) {
|
|
629
913
|
let result = "";
|
|
630
914
|
let i = 0;
|
|
@@ -689,6 +973,8 @@ function parseHwp5Document(buffer, options) {
|
|
|
689
973
|
version: `${header.versionMajor}.x`
|
|
690
974
|
};
|
|
691
975
|
extractHwp5Metadata(cfb, metadata);
|
|
976
|
+
const docInfo = parseDocInfoStream(cfb, compressed);
|
|
977
|
+
const warnings = [];
|
|
692
978
|
const sections = findSections(cfb);
|
|
693
979
|
if (sections.length === 0) throw new KordocError("\uC139\uC158 \uC2A4\uD2B8\uB9BC\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
694
980
|
metadata.pageCount = sections.length;
|
|
@@ -702,10 +988,73 @@ function parseHwp5Document(buffer, options) {
|
|
|
702
988
|
totalDecompressed += data.length;
|
|
703
989
|
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
704
990
|
const records = readRecords(data);
|
|
705
|
-
|
|
991
|
+
const sectionBlocks = parseSection(records, docInfo, warnings, si + 1);
|
|
992
|
+
blocks.push(...sectionBlocks);
|
|
706
993
|
}
|
|
994
|
+
if (docInfo) {
|
|
995
|
+
detectHwp5Headings(blocks, docInfo);
|
|
996
|
+
}
|
|
997
|
+
const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
707
998
|
const markdown = blocksToMarkdown(blocks);
|
|
708
|
-
return { markdown, blocks, metadata };
|
|
999
|
+
return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0 };
|
|
1000
|
+
}
|
|
1001
|
+
/**
 * Load and parse the "/DocInfo" stream of a compound file.
 *
 * @param {object} cfb - Compound-file container searched via `CFB.find`.
 * @param {boolean} compressed - When truthy the stream content is passed
 *   through `decompressStream` before record parsing.
 * @returns {object|null} The result of `parseDocInfo`, or null when the
 *   stream is missing/empty or any step throws.
 */
function parseDocInfoStream(cfb, compressed) {
  try {
    const entry = CFB.find(cfb, "/DocInfo");
    if (!entry?.content) return null;

    const raw = Buffer.from(entry.content);
    const data = compressed ? decompressStream(raw) : raw;
    return parseDocInfo(readRecords(data));
  } catch {
    // DocInfo is optional for text extraction; treat failures as "no info".
    return null;
  }
}
|
|
1012
|
+
/**
 * Promote paragraph blocks to headings in place (HWP5 documents).
 *
 * Body font size is taken from the first style whose name contains
 * "바탕" or "본문", or equals "normal"/"body" (case-insensitive), provided
 * its character shape has a positive size. Failing that, the most frequent
 * `style.fontSize` among the blocks is used.
 *
 * A paragraph whose size is at least 1.5x / 1.3x / 1.15x the body size
 * becomes a level 1 / 2 / 3 heading. In addition — and regardless of whether
 * any font information is available — a short paragraph (<= 50 chars)
 * starting with "제<digits>" followed by one of 조/장/절/편 becomes a level-3
 * heading when no size-based level applied. This matches the behaviour of
 * the HWPX heading detector.
 *
 * Paragraphs that are empty, longer than 200 characters, or digits-only are
 * never promoted.
 *
 * @param {Array<object>} blocks - Parsed blocks; matching entries get
 *   `type = "heading"` and a `level`.
 * @param {{styles: Array<object>, charShapes: Array<object>}} docInfo -
 *   Parsed DocInfo (styles and positional character shapes).
 * @returns {void}
 */
function detectHwp5Headings(blocks, docInfo) {
  let baseFontSize = 0;

  // Preferred source: the document's body/normal style.
  for (const style of docInfo.styles) {
    const name = (style.nameKo || style.name || "").toLowerCase();
    if (name.includes("\uBC14\uD0D5") || name.includes("\uBCF8\uBB38") || name === "normal" || name === "body") {
      const cs = docInfo.charShapes[style.charShapeId];
      if (cs?.fontSize > 0) {
        // Same scaling as the per-paragraph style sizes, so ratios are comparable.
        baseFontSize = cs.fontSize / 10;
        break;
      }
    }
  }

  // Fallback: most frequent paragraph font size (first seen wins ties).
  if (baseFontSize === 0) {
    const sizeFreq = /* @__PURE__ */ new Map();
    for (const b of blocks) {
      if (b.style?.fontSize) {
        sizeFreq.set(b.style.fontSize, (sizeFreq.get(b.style.fontSize) || 0) + 1);
      }
    }
    let maxCount = 0;
    for (const [size, count] of sizeFreq) {
      if (count > maxCount) {
        maxCount = count;
        baseFontSize = size;
      }
    }
  }

  for (const block of blocks) {
    if (block.type !== "paragraph" || !block.text) continue;
    const text = block.text.trim();
    if (text.length === 0 || text.length > 200 || /^\d+$/.test(text)) continue;

    let level = 0;
    const fontSize = block.style?.fontSize;
    if (baseFontSize > 0 && fontSize) {
      const ratio = fontSize / baseFontSize;
      if (ratio >= 1.5) level = 1;
      else if (ratio >= 1.3) level = 2;
      else if (ratio >= 1.15) level = 3;
    }

    // Pattern fallback must not depend on font information being present.
    if (level === 0 && text.length <= 50 && /^제\d+[조장절편]/.test(text)) {
      level = 3;
    }

    if (level > 0) {
      block.type = "heading";
      block.level = level;
    }
  }
}
|
|
710
1059
|
function extractHwp5Metadata(cfb, metadata) {
|
|
711
1060
|
try {
|
|
@@ -771,15 +1120,22 @@ function findSections(cfb) {
|
|
|
771
1120
|
}
|
|
772
1121
|
return sections.sort((a, b) => a.idx - b.idx).map((s) => s.content);
|
|
773
1122
|
}
|
|
774
|
-
function parseSection(records) {
|
|
1123
|
+
function parseSection(records, docInfo, warnings, sectionNum) {
|
|
775
1124
|
const blocks = [];
|
|
776
1125
|
let i = 0;
|
|
777
1126
|
while (i < records.length) {
|
|
778
1127
|
const rec = records[i];
|
|
779
1128
|
if (rec.tagId === TAG_PARA_HEADER && rec.level === 0) {
|
|
780
|
-
const { paragraph, tables, nextIdx } = parseParagraphWithTables(records, i);
|
|
781
|
-
if (paragraph)
|
|
782
|
-
|
|
1129
|
+
const { paragraph, tables, nextIdx, charShapeIds } = parseParagraphWithTables(records, i);
|
|
1130
|
+
if (paragraph) {
|
|
1131
|
+
const block = { type: "paragraph", text: paragraph, pageNumber: sectionNum };
|
|
1132
|
+
if (docInfo && charShapeIds.length > 0) {
|
|
1133
|
+
const style = resolveCharStyle(charShapeIds, docInfo);
|
|
1134
|
+
if (style) block.style = style;
|
|
1135
|
+
}
|
|
1136
|
+
blocks.push(block);
|
|
1137
|
+
}
|
|
1138
|
+
for (const t of tables) blocks.push({ type: "table", table: t, pageNumber: sectionNum });
|
|
783
1139
|
i = nextIdx;
|
|
784
1140
|
continue;
|
|
785
1141
|
}
|
|
@@ -787,19 +1143,43 @@ function parseSection(records) {
|
|
|
787
1143
|
const ctrlId = rec.data.subarray(0, 4).toString("ascii");
|
|
788
1144
|
if (ctrlId === " lbt" || ctrlId === "tbl ") {
|
|
789
1145
|
const { table, nextIdx } = parseTableBlock(records, i);
|
|
790
|
-
if (table) blocks.push({ type: "table", table });
|
|
1146
|
+
if (table) blocks.push({ type: "table", table, pageNumber: sectionNum });
|
|
791
1147
|
i = nextIdx;
|
|
792
1148
|
continue;
|
|
793
1149
|
}
|
|
1150
|
+
if (ctrlId === "gso " || ctrlId === " osg" || ctrlId === " elo" || ctrlId === "ole ") {
|
|
1151
|
+
warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC81C\uC5B4 \uC694\uC18C: ${ctrlId.trim()}`, code: "SKIPPED_IMAGE" });
|
|
1152
|
+
}
|
|
794
1153
|
}
|
|
795
1154
|
i++;
|
|
796
1155
|
}
|
|
797
1156
|
return blocks;
|
|
798
1157
|
}
|
|
1158
|
+
/**
 * Derive a paragraph-level style from the character shapes its runs use.
 *
 * The shape id that first reaches the highest occurrence count in
 * `charShapeIds` is treated as dominant and looked up by position in
 * `docInfo.charShapes`. From it: a positive `fontSize` is stored divided by
 * 10, bit 0 of `attrFlags` sets `italic`, bit 1 sets `bold`.
 *
 * @param {Array<number>} charShapeIds - Shape ids referenced by the paragraph.
 * @param {{charShapes: Array<{fontSize: number, attrFlags: number}>}} docInfo
 * @returns {{fontSize?: number, italic?: boolean, bold?: boolean}|undefined}
 *   The style, or undefined when there is nothing to look up, the dominant
 *   id has no shape, or the shape yields no size/bold/italic.
 */
function resolveCharStyle(charShapeIds, docInfo) {
  const shapes = docInfo.charShapes;
  if (charShapeIds.length === 0 || shapes.length === 0) return void 0;

  // Running tally; a later id only takes over when it strictly exceeds the lead.
  const seen = /* @__PURE__ */ new Map();
  let winner = charShapeIds[0];
  let winnerHits = 0;
  charShapeIds.forEach((shapeId) => {
    const hits = (seen.get(shapeId) || 0) + 1;
    seen.set(shapeId, hits);
    if (hits > winnerHits) {
      winnerHits = hits;
      winner = shapeId;
    }
  });

  const shape = shapes[winner];
  if (!shape) return void 0;

  const { fontSize, attrFlags } = shape;
  const style = {};
  if (fontSize > 0) style.fontSize = fontSize / 10;
  if (attrFlags & 1) style.italic = true;
  if (attrFlags & 2) style.bold = true;

  const meaningful = style.fontSize || style.bold || style.italic;
  return meaningful ? style : void 0;
}
|
|
799
1178
|
function parseParagraphWithTables(records, startIdx) {
|
|
800
1179
|
const startLevel = records[startIdx].level;
|
|
801
1180
|
let text = "";
|
|
802
1181
|
const tables = [];
|
|
1182
|
+
const charShapeIds = [];
|
|
803
1183
|
let i = startIdx + 1;
|
|
804
1184
|
while (i < records.length) {
|
|
805
1185
|
const rec = records[i];
|
|
@@ -807,6 +1187,11 @@ function parseParagraphWithTables(records, startIdx) {
|
|
|
807
1187
|
if (rec.tagId === TAG_PARA_TEXT) {
|
|
808
1188
|
text = extractText(rec.data);
|
|
809
1189
|
}
|
|
1190
|
+
if (rec.tagId === TAG_CHAR_SHAPE && rec.data.length >= 8) {
|
|
1191
|
+
for (let offset = 0; offset + 7 < rec.data.length; offset += 8) {
|
|
1192
|
+
charShapeIds.push(rec.data.readUInt32LE(offset + 4));
|
|
1193
|
+
}
|
|
1194
|
+
}
|
|
810
1195
|
if (rec.tagId === TAG_CTRL_HEADER && rec.data.length >= 4) {
|
|
811
1196
|
const ctrlId = rec.data.subarray(0, 4).toString("ascii");
|
|
812
1197
|
if (ctrlId === " lbt" || ctrlId === "tbl ") {
|
|
@@ -819,7 +1204,7 @@ function parseParagraphWithTables(records, startIdx) {
|
|
|
819
1204
|
i++;
|
|
820
1205
|
}
|
|
821
1206
|
const trimmed = text.trim();
|
|
822
|
-
return { paragraph: trimmed || null, tables, nextIdx: i };
|
|
1207
|
+
return { paragraph: trimmed || null, tables, nextIdx: i, charShapeIds };
|
|
823
1208
|
}
|
|
824
1209
|
function parseTableBlock(records, startIdx) {
|
|
825
1210
|
const tableLevel = records[startIdx].level;
|
|
@@ -891,6 +1276,355 @@ function arrangeCells(rows, cols, cells) {
|
|
|
891
1276
|
return grid.map((row) => row.map((c) => c || { text: "", colSpan: 1, rowSpan: 1 }));
|
|
892
1277
|
}
|
|
893
1278
|
|
|
1279
|
+
// src/pdf/line-detector.ts
|
|
1280
|
+
import { OPS } from "pdfjs-dist/legacy/build/pdf.mjs";
|
|
1281
|
+
var ORIENTATION_TOL = 2;
|
|
1282
|
+
var MIN_LINE_LENGTH = 10;
|
|
1283
|
+
var COORD_MERGE_TOL = 3;
|
|
1284
|
+
var CONNECT_TOL = 5;
|
|
1285
|
+
var CELL_PADDING = 2;
|
|
1286
|
+
/**
 * Scans a page operator list and collects the horizontal and vertical line
 * segments that were painted, for later table-grid reconstruction.
 *
 * Path segments are buffered until a painting operator arrives; `endPath`
 * discards the buffer, every stroke/fill operator hands the buffered segments
 * to `classifyAndAdd`.
 *
 * @param {number[]} fnArray - Operator codes (compared against `OPS.*`).
 * @param {Array} argsArray - Arguments for each operator, index-aligned with `fnArray`.
 * @returns {{horizontals: object[], verticals: object[]}} Classified segments.
 */
function extractLines(fnArray, argsArray) {
  const horizontals = [];
  const verticals = [];
  let lineWidth = 1;
  let currentPath = [];
  let pathStartX = 0;
  let pathStartY = 0;
  let curX = 0;
  let curY = 0;

  // Emits (or drops, when the path was not painted) the buffered segments.
  function flushPath(isStroke) {
    if (isStroke) {
      for (const seg of currentPath) {
        classifyAndAdd(seg, lineWidth, horizontals, verticals);
      }
    }
    currentPath = [];
  }

  for (let i = 0; i < fnArray.length; i++) {
    const op = fnArray[i];
    const args = argsArray[i];
    switch (op) {
      case OPS.setLineWidth:
        lineWidth = args[0] || 1;
        break;
      case OPS.constructPath: {
        const subOps = args[0];
        const coords = args[1];
        let ci = 0;
        for (const subOp of subOps) {
          if (subOp === OPS.moveTo) {
            curX = coords[ci++];
            curY = coords[ci++];
            pathStartX = curX;
            pathStartY = curY;
          } else if (subOp === OPS.lineTo) {
            const x2 = coords[ci++];
            const y2 = coords[ci++];
            currentPath.push({ x1: curX, y1: curY, x2, y2 });
            curX = x2;
            curY = y2;
          } else if (subOp === OPS.rectangle) {
            const rx = coords[ci++];
            const ry = coords[ci++];
            const rw = coords[ci++];
            const rh = coords[ci++];
            if (Math.abs(rh) < ORIENTATION_TOL * 2) {
              // Very flat rectangle: treat it as a horizontal rule through its middle.
              currentPath.push({ x1: rx, y1: ry + rh / 2, x2: rx + rw, y2: ry + rh / 2 });
            } else if (Math.abs(rw) < ORIENTATION_TOL * 2) {
              // Very narrow rectangle: treat it as a vertical rule through its middle.
              currentPath.push({ x1: rx + rw / 2, y1: ry, x2: rx + rw / 2, y2: ry + rh });
            } else {
              currentPath.push(
                { x1: rx, y1: ry, x2: rx + rw, y2: ry }, // bottom
                { x1: rx + rw, y1: ry, x2: rx + rw, y2: ry + rh }, // right
                { x1: rx + rw, y1: ry + rh, x2: rx, y2: ry + rh }, // top
                { x1: rx, y1: ry + rh, x2: rx, y2: ry } // left
              );
            }
            // A rectangle is a closed subpath starting at (rx, ry); the pen
            // ends there, so later relative segments must start from it.
            curX = rx;
            curY = ry;
            pathStartX = rx;
            pathStartY = ry;
          } else if (subOp === OPS.closePath) {
            if (curX !== pathStartX || curY !== pathStartY) {
              currentPath.push({ x1: curX, y1: curY, x2: pathStartX, y2: pathStartY });
            }
            curX = pathStartX;
            curY = pathStartY;
          } else if (subOp === OPS.curveTo) {
            // Curves are not line candidates, but the pen still moves to the
            // curve's end point (last coordinate pair).
            curX = coords[ci + 4];
            curY = coords[ci + 5];
            ci += 6;
          } else if (subOp === OPS.curveTo2 || subOp === OPS.curveTo3) {
            // Four-coordinate curve forms also end on their last pair.
            curX = coords[ci + 2];
            curY = coords[ci + 3];
            ci += 4;
          }
        }
        break;
      }
      case OPS.stroke:
      case OPS.closeStroke:
        flushPath(true);
        break;
      // Filled paths are flushed the same way as stroked ones, so thin filled
      // rectangles (reduced to centre lines above) are picked up as rules.
      case OPS.fill:
      case OPS.eoFill:
      case OPS.fillStroke:
      case OPS.eoFillStroke:
      case OPS.closeFillStroke:
      case OPS.closeEOFillStroke:
        flushPath(true);
        break;
      case OPS.endPath:
        flushPath(false);
        break;
    }
  }
  return { horizontals, verticals };
}
|
|
1377
|
+
/**
 * Sorts one raw segment into the horizontal or vertical bucket.
 *
 * Segments shorter than MIN_LINE_LENGTH, and segments that deviate more than
 * ORIENTATION_TOL on both axes, are ignored. Accepted segments are snapped to
 * the mean coordinate on their minor axis and stored with ordered end points.
 *
 * @param {{x1:number,y1:number,x2:number,y2:number}} seg - Raw segment.
 * @param {number} lineWidth - Stroke width recorded with the segment.
 * @param {object[]} horizontals - Output array, mutated.
 * @param {object[]} verticals - Output array, mutated.
 */
function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
  const { x1: ax, y1: ay, x2: bx, y2: by } = seg;
  const spanX = Math.abs(bx - ax);
  const spanY = Math.abs(by - ay);
  if (Math.sqrt(spanX * spanX + spanY * spanY) < MIN_LINE_LENGTH) return;

  if (spanY <= ORIENTATION_TOL) {
    const midY = (ay + by) / 2;
    horizontals.push({ x1: Math.min(ax, bx), y1: midY, x2: Math.max(ax, bx), y2: midY, lineWidth });
    return;
  }
  if (spanX <= ORIENTATION_TOL) {
    const midX = (ax + bx) / 2;
    verticals.push({ x1: midX, y1: Math.min(ay, by), x2: midX, y2: Math.max(ay, by), lineWidth });
  }
}
|
|
1394
|
+
/**
 * Removes lines that look like a page frame: lines lying within 5 units of a
 * page edge that also span at least 90% of the page in their direction.
 *
 * @param {object[]} horizontals - Horizontal segments (`x1`, `x2`, `y1`).
 * @param {object[]} verticals - Vertical segments (`y1`, `y2`, `x1`).
 * @param {number} pageWidth
 * @param {number} pageHeight
 * @returns {{horizontals: object[], verticals: object[]}} New filtered arrays.
 */
function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
  const edgeBand = 5;
  const hugsEdge = (coord, extent) => Math.abs(coord) < edgeBand || Math.abs(coord - extent) < edgeBand;

  const keepHorizontal = (line) => {
    const isShort = line.x2 - line.x1 < pageWidth * 0.9;
    return !hugsEdge(line.y1, pageHeight) || isShort;
  };
  const keepVertical = (line) => {
    const isShort = line.y2 - line.y1 < pageHeight * 0.9;
    return !hugsEdge(line.x1, pageWidth) || isShort;
  };

  return {
    horizontals: horizontals.filter(keepHorizontal),
    verticals: verticals.filter(keepVertical)
  };
}
|
|
1405
|
+
/**
 * Builds one grid per connected group of lines.
 *
 * Lines are tagged with their orientation, grouped by connectivity, and each
 * group with at least two distinct row and column positions becomes a grid.
 * `rowYs` is sorted descending, `colXs` ascending.
 *
 * @param {object[]} horizontals
 * @param {object[]} verticals
 * @returns {Array<{rowYs:number[], colXs:number[], bbox:{x1:number,y1:number,x2:number,y2:number}}>}
 */
function buildTableGrids(horizontals, verticals) {
  if (horizontals.length < 2 || verticals.length < 2) return [];

  const tagged = [];
  horizontals.forEach((line, idx) => {
    tagged.push({ ...line, type: "h", id: idx });
  });
  verticals.forEach((line, idx) => {
    tagged.push({ ...line, type: "v", id: idx + horizontals.length });
  });

  const grids = [];
  for (const component of groupConnectedLines(tagged)) {
    const ys = component.filter((line) => line.type === "h").map((line) => line.y1);
    const xs = component.filter((line) => line.type === "v").map((line) => line.x1);
    if (ys.length < 2 || xs.length < 2) continue;

    const rowYs = clusterCoordinates(ys).sort((p, q) => q - p);
    const colXs = clusterCoordinates(xs).sort((p, q) => p - q);
    if (rowYs.length < 2 || colXs.length < 2) continue;

    grids.push({
      rowYs,
      colXs,
      bbox: {
        x1: colXs[0],
        y1: rowYs[rowYs.length - 1],
        x2: colXs[colXs.length - 1],
        y2: rowYs[0]
      }
    });
  }
  return grids;
}
|
|
1432
|
+
/**
 * Collapses nearby coordinates into cluster means.
 *
 * Values are visited in ascending order; a value joins the current cluster
 * when it lies within COORD_MERGE_TOL of that cluster's running mean,
 * otherwise it starts a new cluster.
 *
 * @param {number[]} values
 * @returns {number[]} Cluster means in ascending order.
 */
function clusterCoordinates(values) {
  if (values.length === 0) return [];
  const ascending = values.slice().sort((p, q) => p - q);

  const means = [];
  let total = ascending[0];
  let members = 1;
  for (let k = 1; k < ascending.length; k++) {
    const value = ascending[k];
    if (Math.abs(value - total / members) <= COORD_MERGE_TOL) {
      total += value;
      members++;
    } else {
      means.push(total / members);
      total = value;
      members = 1;
    }
  }
  means.push(total / members);
  return means;
}
|
|
1448
|
+
/**
 * Partitions lines into groups of mutually connected lines.
 *
 * Every pair is tested with `linesIntersect`; connected pairs are merged in a
 * disjoint-set forest. Groups are returned in order of their first member's
 * index, and members keep their input order.
 *
 * @param {object[]} lines
 * @returns {object[][]}
 */
function groupConnectedLines(lines) {
  const total = lines.length;
  const parent = Array.from({ length: total }, (_, idx) => idx);

  // Root lookup with path halving.
  const rootOf = (node) => {
    let cur = node;
    while (parent[cur] !== cur) {
      parent[cur] = parent[parent[cur]];
      cur = parent[cur];
    }
    return cur;
  };

  for (let a = 0; a < total; a++) {
    for (let b = a + 1; b < total; b++) {
      if (!linesIntersect(lines[a], lines[b])) continue;
      const rootA = rootOf(a);
      const rootB = rootOf(b);
      if (rootA !== rootB) parent[rootA] = rootB;
    }
  }

  const byRoot = new Map();
  lines.forEach((line, idx) => {
    const root = rootOf(idx);
    const bucket = byRoot.get(root);
    if (bucket) {
      bucket.push(line);
    } else {
      byRoot.set(root, [line]);
    }
  });
  return Array.from(byRoot.values());
}
|
|
1476
|
+
/**
 * Decides whether two tagged lines (`type` "h" or "v") are connected.
 *
 * Same orientation: both must sit within CONNECT_TOL on the shared axis and
 * their ranges must overlap or nearly touch. Different orientation: the
 * vertical's x must fall inside the horizontal's x-range and the horizontal's
 * y inside the vertical's y-range, each widened by CONNECT_TOL.
 *
 * @param {object} a
 * @param {object} b
 * @returns {boolean}
 */
function linesIntersect(a, b) {
  if (a.type !== b.type) {
    const [horiz, vert] = a.type === "h" ? [a, b] : [b, a];
    const slack = CONNECT_TOL;
    const xInside = vert.x1 >= horiz.x1 - slack && vert.x1 <= horiz.x2 + slack;
    const yInside = horiz.y1 >= vert.y1 - slack && horiz.y1 <= vert.y2 + slack;
    return xInside && yInside;
  }
  if (a.type === "h") {
    const sameRow = !(Math.abs(a.y1 - b.y1) > CONNECT_TOL);
    return sameRow && Math.min(a.x2, b.x2) >= Math.max(a.x1, b.x1) - CONNECT_TOL;
  }
  const sameColumn = !(Math.abs(a.x1 - b.x1) > CONNECT_TOL);
  return sameColumn && Math.min(a.y2, b.y2) >= Math.max(a.y1, b.y1) - CONNECT_TOL;
}
|
|
1491
|
+
/**
 * Derives table cells, including merged ones, from a grid and the detected lines.
 *
 * For every unclaimed grid slot the cell is grown to the right while no
 * vertical line separates it from the next column, then downwards while no
 * horizontal line closes it across its full width.
 *
 * @param {{rowYs:number[], colXs:number[]}} grid - Row positions (descending, as produced
 *   by `buildTableGrids`) and column positions (ascending).
 * @param {object[]} horizontals
 * @param {object[]} verticals
 * @returns {Array<{row:number,col:number,rowSpan:number,colSpan:number,bbox:object}>}
 */
function extractCells(grid, horizontals, verticals) {
  const { rowYs, colXs } = grid;
  const numRows = rowYs.length - 1;
  const numCols = colXs.length - 1;
  if (numRows <= 0 || numCols <= 0) return [];

  const occupied = Array.from({ length: numRows }, () => Array(numCols).fill(false));
  const cells = [];
  for (let r = 0; r < numRows; r++) {
    for (let c = 0; c < numCols; c++) {
      if (occupied[r][c]) continue;
      let colSpan = 1;
      let rowSpan = 1;

      while (c + colSpan < numCols) {
        // Never grow into a slot already claimed by a row-spanning cell from
        // above, even when the separating vertical line was not detected.
        if (occupied[r][c + colSpan]) break;
        if (hasVerticalLine(verticals, colXs[c + colSpan], rowYs[r], rowYs[r + 1])) break;
        colSpan++;
      }
      while (r + rowSpan < numRows) {
        if (hasHorizontalLine(horizontals, rowYs[r + rowSpan], colXs[c], colXs[c + colSpan])) break;
        rowSpan++;
      }

      for (let dr = 0; dr < rowSpan; dr++) {
        for (let dc = 0; dc < colSpan; dc++) {
          occupied[r + dr][c + dc] = true;
        }
      }
      cells.push({
        row: r,
        col: c,
        rowSpan,
        colSpan,
        bbox: {
          x1: colXs[c],
          y1: rowYs[r + rowSpan],
          x2: colXs[c + colSpan],
          y2: rowYs[r]
        }
      });
    }
  }
  return cells;
}
|
|
1538
|
+
/**
 * Reports whether some vertical line sits at `x` (within COORD_MERGE_TOL + 1)
 * and covers at least half of the span between `topY` and `botY`.
 *
 * @param {object[]} verticals - Segments with `x1`, `y1` (lower) and `y2` (upper).
 * @param {number} x
 * @param {number} topY
 * @param {number} botY
 * @returns {boolean}
 */
function hasVerticalLine(verticals, x, topY, botY) {
  const slack = COORD_MERGE_TOL + 1;
  const requiredCover = Math.abs(topY - botY) * 0.5;
  return verticals.some((line) => {
    const covered = Math.min(line.y2, topY) - Math.max(line.y1, botY);
    return Math.abs(line.x1 - x) <= slack && covered >= requiredCover;
  });
}
|
|
1551
|
+
/**
 * Reports whether some horizontal line sits at `y` (within COORD_MERGE_TOL + 1)
 * and covers at least half of the span between `leftX` and `rightX`.
 *
 * @param {object[]} horizontals - Segments with `y1`, `x1` (left) and `x2` (right).
 * @param {number} y
 * @param {number} leftX
 * @param {number} rightX
 * @returns {boolean}
 */
function hasHorizontalLine(horizontals, y, leftX, rightX) {
  const slack = COORD_MERGE_TOL + 1;
  const requiredCover = Math.abs(rightX - leftX) * 0.5;
  return horizontals.some((line) => {
    const covered = Math.min(line.x2, rightX) - Math.max(line.x1, leftX);
    return Math.abs(line.y1 - y) <= slack && covered >= requiredCover;
  });
}
|
|
1564
|
+
/**
 * Assigns each text item to at most one cell.
 *
 * An item is located by its horizontal centre and its `y`; among the cells
 * whose bounding box (padded by CELL_PADDING) contains that point, the one
 * whose centre is nearest in Manhattan distance wins. Items matching no cell
 * are left out.
 *
 * @param {object[]} items - Text items with `x`, `y`, `w`.
 * @param {object[]} cells - Cells with a `bbox` of `x1`, `y1`, `x2`, `y2`.
 * @returns {Map<object, object[]>} Every cell mapped to its (possibly empty) item list.
 */
function mapTextToCells(items, cells) {
  const assignment = new Map(cells.map((cell) => [cell, []]));

  for (const item of items) {
    const px = item.x + item.w / 2;
    const py = item.y;
    let winner = null;
    let winnerDist = Infinity;

    for (const cell of cells) {
      const { x1, y1, x2, y2 } = cell.bbox;
      const inside =
        px >= x1 - CELL_PADDING && px <= x2 + CELL_PADDING &&
        py >= y1 - CELL_PADDING && py <= y2 + CELL_PADDING;
      if (!inside) continue;
      const dist = Math.abs(px - (x1 + x2) / 2) + Math.abs(py - (y1 + y2) / 2);
      if (dist < winnerDist) {
        winnerDist = dist;
        winner = cell;
      }
    }
    if (winner) {
      assignment.get(winner).push(item);
    }
  }
  return assignment;
}
|
|
1592
|
+
/**
 * Flattens the text items of one cell into a string.
 *
 * Items are ordered top-to-bottom then left-to-right, grouped into visual
 * lines by `y` proximity (tolerance: the larger of 3 and 60% of the smaller
 * font size), and joined with spaces within a line. A following line that is
 * a short run (at most 5 characters) of Hangul syllables is glued directly
 * onto a preceding line ending in a Hangul syllable; other lines are
 * separated by "\n".
 *
 * @param {object[]} items - Text items with `text`, `x`, `y`, `fontSize`.
 * @returns {string}
 */
function cellTextToString(items) {
  if (items.length === 0) return "";
  if (items.length === 1) return items[0].text;

  const ordered = items.slice().sort((p, q) => q.y - p.y || p.x - q.x);
  const rows = [[ordered[0]]];
  let rowY = ordered[0].y;
  for (let k = 1; k < ordered.length; k++) {
    const item = ordered[k];
    const row = rows[rows.length - 1];
    const tol = Math.max(3, Math.min(item.fontSize, row[0].fontSize) * 0.6);
    if (Math.abs(item.y - rowY) <= tol) {
      row.push(item);
    } else {
      rows.push([item]);
      rowY = item.y;
    }
  }

  const textLines = rows.map((row) =>
    row.sort((p, q) => p.x - q.x).map((entry) => entry.text).join(" ")
  );
  if (textLines.length <= 1) return textLines[0] || "";

  const merged = [];
  for (const line of textLines) {
    const lastIdx = merged.length - 1;
    const continuesWord =
      lastIdx >= 0 &&
      /[가-힣]$/.test(merged[lastIdx]) &&
      /^[가-힣]+$/.test(line) &&
      line.length <= 5 &&
      !line.includes(" ");
    if (continuesWord) {
      merged[lastIdx] = merged[lastIdx] + line;
    } else {
      merged.push(line);
    }
  }
  return merged.join("\n");
}
|
|
1627
|
+
|
|
894
1628
|
// src/pdf/polyfill.ts
|
|
895
1629
|
import * as pdfjsWorker from "pdfjs-dist/legacy/build/pdf.worker.mjs";
|
|
896
1630
|
var g = globalThis;
|
|
@@ -922,44 +1656,62 @@ async function parsePdfDocument(buffer, options) {
|
|
|
922
1656
|
}).promise;
|
|
923
1657
|
try {
|
|
924
1658
|
const pageCount = doc.numPages;
|
|
925
|
-
if (pageCount === 0) return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.",
|
|
1659
|
+
if (pageCount === 0) return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "PARSE_ERROR" };
|
|
926
1660
|
const metadata = { pageCount };
|
|
927
1661
|
await extractPdfMetadata(doc, metadata);
|
|
928
|
-
const pageTexts = [];
|
|
929
1662
|
const blocks = [];
|
|
1663
|
+
const warnings = [];
|
|
930
1664
|
let totalChars = 0;
|
|
931
1665
|
let totalTextBytes = 0;
|
|
932
1666
|
const effectivePageCount = Math.min(pageCount, MAX_PAGES);
|
|
933
1667
|
const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
|
|
1668
|
+
const allFontSizes = [];
|
|
934
1669
|
for (let i = 1; i <= effectivePageCount; i++) {
|
|
935
1670
|
if (pageFilter && !pageFilter.has(i)) continue;
|
|
936
1671
|
const page = await doc.getPage(i);
|
|
937
1672
|
const tc = await page.getTextContent();
|
|
938
|
-
const
|
|
939
|
-
|
|
940
|
-
|
|
1673
|
+
const viewport = page.getViewport({ scale: 1 });
|
|
1674
|
+
const rawItems = tc.items;
|
|
1675
|
+
const items = normalizeItems(rawItems);
|
|
1676
|
+
const { visible, hiddenCount } = filterHiddenText(items, viewport.width, viewport.height);
|
|
1677
|
+
if (hiddenCount > 0) {
|
|
1678
|
+
warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
|
|
1679
|
+
}
|
|
1680
|
+
for (const item of visible) {
|
|
1681
|
+
if (item.fontSize > 0) allFontSizes.push(item.fontSize);
|
|
1682
|
+
}
|
|
1683
|
+
const opList = await page.getOperatorList();
|
|
1684
|
+
const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
|
|
1685
|
+
for (const b of pageBlocks) blocks.push(b);
|
|
1686
|
+
for (const b of pageBlocks) {
|
|
1687
|
+
const t = b.text || "";
|
|
1688
|
+
totalChars += t.replace(/\s/g, "").length;
|
|
1689
|
+
totalTextBytes += t.length * 2;
|
|
1690
|
+
}
|
|
941
1691
|
if (totalTextBytes > MAX_TOTAL_TEXT) throw new KordocError("\uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uD06C\uAE30 \uCD08\uACFC");
|
|
942
|
-
pageTexts.push(pageText);
|
|
943
|
-
blocks.push({ type: "paragraph", text: pageText });
|
|
944
1692
|
}
|
|
945
1693
|
const parsedPageCount = pageFilter ? pageFilter.size : effectivePageCount;
|
|
946
1694
|
if (totalChars / Math.max(parsedPageCount, 1) < 10) {
|
|
947
1695
|
if (options?.ocr) {
|
|
948
1696
|
try {
|
|
949
|
-
const { ocrPages } = await import("./provider-
|
|
1697
|
+
const { ocrPages } = await import("./provider-A4FHJSID.js");
|
|
950
1698
|
const ocrBlocks = await ocrPages(doc, options.ocr, pageFilter, effectivePageCount);
|
|
951
1699
|
if (ocrBlocks.length > 0) {
|
|
952
1700
|
const ocrMarkdown = ocrBlocks.map((b) => b.text || "").filter(Boolean).join("\n\n");
|
|
953
|
-
return { success: true, fileType: "pdf", markdown: ocrMarkdown, pageCount: parsedPageCount, blocks: ocrBlocks, metadata, isImageBased: true };
|
|
1701
|
+
return { success: true, fileType: "pdf", markdown: ocrMarkdown, pageCount: parsedPageCount, blocks: ocrBlocks, metadata, isImageBased: true, warnings };
|
|
954
1702
|
}
|
|
955
1703
|
} catch {
|
|
956
1704
|
}
|
|
957
1705
|
}
|
|
958
1706
|
return { success: false, fileType: "pdf", pageCount, isImageBased: true, error: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`, code: "IMAGE_BASED_PDF" };
|
|
959
1707
|
}
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
1708
|
+
const medianFontSize = computeMedianFontSize(allFontSizes);
|
|
1709
|
+
if (medianFontSize > 0) {
|
|
1710
|
+
detectHeadings(blocks, medianFontSize);
|
|
1711
|
+
}
|
|
1712
|
+
const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
1713
|
+
let markdown = cleanPdfText(blocksToMarkdown(blocks));
|
|
1714
|
+
return { success: true, fileType: "pdf", markdown, pageCount: parsedPageCount, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0 };
|
|
963
1715
|
} finally {
|
|
964
1716
|
await doc.destroy().catch(() => {
|
|
965
1717
|
});
|
|
@@ -1004,24 +1756,272 @@ async function extractPdfMetadataOnly(buffer) {
|
|
|
1004
1756
|
});
|
|
1005
1757
|
}
|
|
1006
1758
|
}
|
|
1007
|
-
function
|
|
1008
|
-
|
|
1009
|
-
|
|
1010
|
-
const
|
|
1011
|
-
|
|
1759
|
+
/**
 * Splits text items into visible ones and a count of suppressed ones.
 *
 * An item is suppressed when it is flagged `isHidden` or when its `x`/`y`
 * lies outside the page by more than 10% of the larger page dimension.
 *
 * @param {object[]} items - Items with `x`, `y`, `isHidden`.
 * @param {number} pageWidth
 * @param {number} pageHeight
 * @returns {{visible: object[], hiddenCount: number}}
 */
function filterHiddenText(items, pageWidth, pageHeight) {
  const slack = Math.max(pageWidth, pageHeight) * 0.1;
  const isOffPage = (it) =>
    it.x < -slack || it.x > pageWidth + slack || it.y < -slack || it.y > pageHeight + slack;

  const visible = items.filter((it) => !it.isHidden && !isOffPage(it));
  return { visible, hiddenCount: items.length - visible.length };
}
|
|
1776
|
+
/**
 * Returns the median of the given font sizes, or 0 for an empty list.
 * For an even count the two middle values are averaged. The input is not modified.
 *
 * @param {number[]} sizes
 * @returns {number}
 */
function computeMedianFontSize(sizes) {
  const count = sizes.length;
  if (count === 0) return 0;

  const ascending = Array.from(sizes).sort((p, q) => p - q);
  const half = Math.floor(count / 2);
  if (count % 2 === 0) {
    return (ascending[half - 1] + ascending[half]) / 2;
  }
  return ascending[half];
}
|
|
1782
|
+
/**
 * Promotes paragraph blocks to headings, in place, based on font size.
 *
 * Only paragraphs with text and a `style.fontSize` are considered; text that
 * is empty after trimming, longer than 200 characters, or digits only is
 * skipped. The ratio of the block's font size to `medianFontSize` selects the
 * level: >= 1.5 → 1, >= 1.3 → 2, >= 1.15 → 3.
 *
 * @param {object[]} blocks - Mutated: `type` and `level` are set on promoted blocks.
 * @param {number} medianFontSize
 */
function detectHeadings(blocks, medianFontSize) {
  // [minimum ratio, heading level], checked from largest to smallest.
  const tiers = [[1.5, 1], [1.3, 2], [1.15, 3]];

  for (const block of blocks) {
    if (block.type !== "paragraph") continue;
    if (!block.text || !block.style?.fontSize) continue;

    const trimmed = block.text.trim();
    if (trimmed.length === 0 || trimmed.length > 200) continue;
    if (/^\d+$/.test(trimmed)) continue;

    const ratio = block.style.fontSize / medianFontSize;
    const tier = tiers.find(([minRatio]) => ratio >= minRatio);
    if (tier) {
      block.type = "heading";
      block.level = tier[1];
    }
  }
}
|
|
1799
|
+
// Recursion limit for xyCutOrder.
var MAX_XYCUT_DEPTH = 50;

/**
 * Recursively partitions text items into groups in reading order.
 *
 * A horizontal cut (upper part first) is tried before a vertical cut (left
 * part first). Recursion stops for groups of at most two items, at
 * MAX_XYCUT_DEPTH, or when no cut yields two non-empty parts.
 *
 * @param {object[]} items - Text items with `x`, `y`, `w`.
 * @param {number} gapThreshold - Minimum gap size for a cut.
 * @param {number} [depth=0] - Current recursion depth.
 * @returns {object[][]} Ordered groups of items.
 */
function xyCutOrder(items, gapThreshold, depth = 0) {
  if (items.length === 0) return [];
  if (items.length <= 2 || depth >= MAX_XYCUT_DEPTH) return [items];

  const region = computeRegion(items);
  const descend = (first, second) =>
    xyCutOrder(first, gapThreshold, depth + 1).concat(xyCutOrder(second, gapThreshold, depth + 1));

  const yCut = findYSplit(items, region, gapThreshold);
  if (yCut !== null) {
    const above = items.filter((it) => it.y > yCut);
    const below = items.filter((it) => it.y <= yCut);
    if (above.length > 0 && below.length > 0 && above.length < items.length) {
      return descend(above, below);
    }
  }

  const xCut = findXSplit(items, region, gapThreshold);
  if (xCut !== null) {
    // Items are assigned to a side by their horizontal centre.
    const leftSide = items.filter((it) => it.x + it.w / 2 < xCut);
    const rightSide = items.filter((it) => it.x + it.w / 2 >= xCut);
    if (leftSide.length > 0 && rightSide.length > 0 && leftSide.length < items.length) {
      return descend(leftSide, rightSide);
    }
  }
  return [items];
}
|
|
1822
|
+
/**
 * Computes the extent of a set of items: minimum `x`/`y` and maximum
 * `x + w` / `y + h`. For an empty list the minima stay Infinity and the
 * maxima -Infinity.
 *
 * @param {object[]} items - Items with `x`, `y`, `w`, `h`.
 * @returns {{items: object[], minX: number, minY: number, maxX: number, maxY: number}}
 */
function computeRegion(items) {
  const extent = { minX: Infinity, minY: Infinity, maxX: -Infinity, maxY: -Infinity };
  items.forEach((it) => {
    const right = it.x + it.w;
    const top = it.y + it.h;
    extent.minX = it.x < extent.minX ? it.x : extent.minX;
    extent.minY = it.y < extent.minY ? it.y : extent.minY;
    extent.maxX = right > extent.maxX ? right : extent.maxX;
    extent.maxY = top > extent.maxY ? top : extent.maxY;
  });
  return { items, minX: extent.minX, minY: extent.minY, maxX: extent.maxX, maxY: extent.maxY };
}
|
|
1832
|
+
/**
 * Looks for the largest vertical gap between y-adjacent items.
 *
 * Items are ordered by descending `y`; for each neighbouring pair the gap is
 * `(previous.y - previous.h) - current.y`. Only gaps strictly larger than
 * `gapThreshold` (and than any earlier candidate) qualify.
 *
 * @param {object[]} items - Items with `y`, `h`.
 * @param {object} region - Accepted for signature parity; not read here.
 * @param {number} gapThreshold
 * @returns {number|null} Midpoint of the winning gap, or null if none qualifies.
 */
function findYSplit(items, region, gapThreshold) {
  const topDown = items.slice().sort((p, q) => q.y - p.y);
  let widest = gapThreshold;
  let cut = null;

  topDown.forEach((item, idx) => {
    if (idx === 0) return;
    const above = topDown[idx - 1];
    const lowerEdge = above.y - above.h;
    const upperEdge = item.y;
    const gap = lowerEdge - upperEdge;
    if (gap > widest) {
      widest = gap;
      cut = (lowerEdge + upperEdge) / 2;
    }
  });
  return cut;
}
|
|
1847
|
+
/**
 * Looks for the largest horizontal gap that no item covers.
 *
 * Items are ordered by ascending `x`. The gap in front of each item is
 * measured against the furthest right edge of all items before it, not just
 * its immediate predecessor, so a wide item cannot be cut in two because a
 * narrower item starts inside it. Only gaps strictly larger than
 * `gapThreshold` (and than any earlier candidate) qualify.
 *
 * @param {object[]} items - Items with `x`, `w`.
 * @param {object} region - Accepted for signature parity; not read here.
 * @param {number} gapThreshold
 * @returns {number|null} Midpoint of the winning gap, or null if none qualifies.
 */
function findXSplit(items, region, gapThreshold) {
  const sorted = [...items].sort((a, b) => a.x - b.x);
  let bestGap = gapThreshold;
  let bestSplit = null;
  let coveredRight = -Infinity;
  for (let i = 1; i < sorted.length; i++) {
    const prevRight = sorted[i - 1].x + sorted[i - 1].w;
    // Plain comparison (not Math.max) so a NaN edge cannot poison the running value.
    if (prevRight > coveredRight) coveredRight = prevRight;
    const currLeft = sorted[i].x;
    const gap = currLeft - coveredRight;
    if (gap > bestGap) {
      bestGap = gap;
      bestSplit = (coveredRight + currLeft) / 2;
    }
  }
  return bestSplit;
}
|
|
1862
|
+
/**
 * Produces the blocks of one page, using drawn lines to find tables when possible.
 *
 * Lines are extracted from the operator list, page-frame lines are removed,
 * and grids are built from the rest. With at least one grid the grid-based
 * extractor is used; otherwise the text-only fallback.
 *
 * @param {object[]} items - Visible text items of the page.
 * @param {number} pageNum
 * @param {{fnArray: number[], argsArray: Array}} opList - Page operator list.
 * @param {number} pageWidth
 * @param {number} pageHeight
 * @returns {object[]} Blocks for the page (empty when there are no items).
 */
function extractPageBlocksWithLines(items, pageNum, opList, pageWidth, pageHeight) {
  if (items.length === 0) return [];

  const drawn = extractLines(opList.fnArray, opList.argsArray);
  const kept = filterPageBorderLines(drawn.horizontals, drawn.verticals, pageWidth, pageHeight);
  const grids = buildTableGrids(kept.horizontals, kept.verticals);

  if (grids.length === 0) {
    return extractPageBlocksFallback(items, pageNum);
  }
  return extractBlocksWithGrids(items, pageNum, grids, kept.horizontals, kept.verticals);
}
|
|
1872
|
+
/**
 * Builds table blocks from detected grids and paragraph/list blocks from the
 * text outside them.
 *
 * Grids are processed top-down. For each grid that yields cells, the text
 * items lying inside its bounding box (padded by 3) are claimed, mapped to
 * cells and rendered into a table block. Unclaimed items go through the
 * text-only fallback plus list detection, and all blocks are then ordered by
 * the top edge of their bounding boxes.
 *
 * @param {object[]} items - Visible text items of the page.
 * @param {number} pageNum
 * @param {object[]} grids - Grids with `rowYs`, `colXs`, `bbox`.
 * @param {object[]} horizontals
 * @param {object[]} verticals
 * @returns {object[]} Blocks for the page.
 */
function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
  const blocks = [];
  const usedItems = /* @__PURE__ */ new Set();
  const pad = 3;
  const sortedGrids = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);

  for (const grid of sortedGrids) {
    // Resolve the cells first: a grid without cells must not consume any text,
    // otherwise that text would vanish from the output.
    const cells = extractCells(grid, horizontals, verticals);
    if (cells.length === 0) continue;

    const tableItems = [];
    for (const item of items) {
      if (usedItems.has(item)) continue;
      const insideX = item.x >= grid.bbox.x1 - pad && item.x + item.w <= grid.bbox.x2 + pad;
      const insideY = item.y >= grid.bbox.y1 - pad && item.y <= grid.bbox.y2 + pad;
      if (insideX && insideY) {
        tableItems.push(item);
        usedItems.add(item);
      }
    }

    const textItems = tableItems.map((i) => ({
      text: i.text,
      x: i.x,
      y: i.y,
      w: i.w,
      h: i.h,
      fontSize: i.fontSize,
      fontName: i.fontName
    }));
    const cellTextMap = mapTextToCells(textItems, cells);

    const numRows = grid.rowYs.length - 1;
    const numCols = grid.colXs.length - 1;
    // Slots covered by a merged cell keep this empty placeholder.
    const irGrid = Array.from(
      { length: numRows },
      () => Array.from({ length: numCols }, () => ({ text: "", colSpan: 1, rowSpan: 1 }))
    );
    for (const cell of cells) {
      const cellItems = cellTextMap.get(cell) || [];
      irGrid[cell.row][cell.col] = {
        text: cellTextToString(cellItems),
        colSpan: cell.colSpan,
        rowSpan: cell.rowSpan
      };
    }

    blocks.push({
      type: "table",
      table: {
        rows: numRows,
        cols: numCols,
        cells: irGrid,
        hasHeader: numRows > 1
      },
      pageNumber: pageNum,
      bbox: {
        page: pageNum,
        x: grid.bbox.x1,
        y: grid.bbox.y1,
        width: grid.bbox.x2 - grid.bbox.x1,
        height: grid.bbox.y2 - grid.bbox.y1
      }
    });
  }

  const remaining = items.filter((i) => !usedItems.has(i));
  if (remaining.length > 0) {
    remaining.sort((a, b) => b.y - a.y || a.x - b.x);
    const textBlocks = detectListBlocks(extractPageBlocksFallback(remaining, pageNum));
    const allBlocks = [...blocks, ...textBlocks];
    // Order by top edge (y + height), highest first; blocks without bbox sort as 0.
    allBlocks.sort((a, b) => {
      const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
      const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
      return by - ay;
    });
    return allBlocks;
  }
  return blocks;
}
|
|
1946
|
+
/**
 * Text-only block extraction for pages (or page remainders) without a line grid.
 *
 * If the whole item set forms three or more columns it becomes one paragraph
 * block of column-aligned text. Otherwise the items are split into groups in
 * reading order; each group becomes either one column-aligned paragraph or
 * one paragraph per text line.
 *
 * @param {object[]} items - Text items with at least `y`.
 * @param {number} pageNum
 * @returns {object[]} Paragraph blocks with `pageNumber`, `bbox` and `style`.
 */
function extractPageBlocksFallback(items, pageNum) {
  if (items.length === 0) return [];
  const blocks = [];
  const allYLines = groupByY(items);
  const columns = detectColumns(allYLines);
  if (columns && columns.length >= 3) {
    const tableText = extractWithColumns(allYLines, columns);
    const bbox = computeBBox(items, pageNum);
    blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(items) });
    return blocks;
  }

  // Vertical extent of the items. Computed with a loop because spreading a
  // very large array into Math.max/Math.min can exceed the engine's argument limit.
  let highestY = -Infinity;
  let lowestY = Infinity;
  for (const item of items) {
    highestY = Math.max(highestY, item.y);
    lowestY = Math.min(lowestY, item.y);
  }
  const ySpan = highestY - lowestY;
  const gapThreshold = Math.max(15, ySpan * 0.03);

  const orderedGroups = xyCutOrder(items, gapThreshold);
  for (const group of orderedGroups) {
    if (group.length === 0) continue;
    const yLines = groupByY(group);
    const groupColumns = detectColumns(yLines);
    if (groupColumns && groupColumns.length >= 3) {
      const tableText = extractWithColumns(yLines, groupColumns);
      const bbox = computeBBox(group, pageNum);
      blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(group) });
      continue;
    }
    for (const line of yLines) {
      const text = mergeLineSimple(line);
      if (!text.trim()) continue;
      const bbox = computeBBox(line, pageNum);
      blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style: dominantStyle(line) });
    }
  }
  return blocks;
}
|
|
1980
|
+
/**
 * Computes the bounding box of a set of text items.
 *
 * `x`/`y` are the minimum item coordinates; the width reaches the furthest
 * `x + w`; the height reaches the furthest `y + h`, where an item whose `h`
 * is not positive contributes its `fontSize` instead.
 *
 * @param {object[]} items - Items with `x`, `y`, `w`, `h`, `fontSize`.
 * @param {number} pageNum
 * @returns {{page: number, x: number, y: number, width: number, height: number}}
 */
function computeBBox(items, pageNum) {
  let left = Infinity;
  let bottom = Infinity;
  let right = -Infinity;
  let top = -Infinity;
  items.forEach((it) => {
    const itemRight = it.x + it.w;
    const itemTop = it.y + (it.h > 0 ? it.h : it.fontSize);
    left = it.x < left ? it.x : left;
    bottom = it.y < bottom ? it.y : bottom;
    right = itemRight > right ? itemRight : right;
    top = itemTop > top ? itemTop : top;
  });
  return { page: pageNum, x: left, y: bottom, width: right - left, height: top - bottom };
}
|
|
1991
|
+
/**
 * Determines the most frequent positive font size among the items and the
 * font name of the first item using it. On a tie the size that reached the
 * top count first wins.
 *
 * @param {object[]} items - Items with `fontSize` and `fontName`.
 * @returns {{fontSize: number, fontName: (string|undefined)}|undefined}
 *   `undefined` when there are no items or no positive font size.
 */
function dominantStyle(items) {
  if (items.length === 0) return undefined;

  const tally = new Map();
  let winningSize = 0;
  let winningCount = 0;
  for (const { fontSize } of items) {
    if (fontSize <= 0) continue;
    const seen = (tally.get(fontSize) || 0) + 1;
    tally.set(fontSize, seen);
    if (seen > winningCount) {
      winningCount = seen;
      winningSize = fontSize;
    }
  }
  if (winningSize === 0) return undefined;

  const owner = items.find((it) => it.fontSize === winningSize);
  return { fontSize: winningSize, fontName: owner?.fontName || undefined };
}
|
|
1017
2008
|
/**
 * Converts raw text-content items into the internal text item shape.
 *
 * Items without a non-blank string are dropped. Position comes from the
 * translation part of `transform`; the font size is the larger of the two
 * axis scale magnitudes, taken as vector lengths so that rotated text (whose
 * diagonal transform entries are 0) keeps its real size instead of being
 * reported as 0pt. The result is sorted top-to-bottom, then left-to-right.
 *
 * @param {object[]} rawItems - Items with `str`, `transform` (6 numbers),
 *   `width`, `height`, and optionally `fontName`.
 * @returns {object[]} Items with `text`, `x`, `y`, `w`, `h`, `fontSize`,
 *   `fontName`, `isHidden`.
 */
function normalizeItems(rawItems) {
  return rawItems.filter((i) => typeof i.str === "string" && i.str.trim() !== "").map((i) => {
    const t = i.transform;
    // Length of each transformed axis vector; equals |t[0]| / |t[3]| for upright text.
    const scaleX = Math.hypot(t[0], t[1]);
    const scaleY = Math.hypot(t[2], t[3]);
    const fontSize = Math.round(Math.max(scaleY, scaleX));
    return {
      text: i.str.trim(),
      x: Math.round(t[4]),
      y: Math.round(t[5]),
      w: Math.round(i.width),
      h: Math.round(i.height),
      fontSize,
      fontName: i.fontName || "",
      // A 0pt font or zero width marks hidden text (suspected prompt injection).
      // The filter above already guarantees the trimmed text is non-empty.
      isHidden: fontSize === 0 || i.width === 0
    };
  }).sort((a, b) => b.y - a.y || a.x - b.x);
}
|
|
1026
2026
|
function groupByY(items) {
|
|
1027
2027
|
if (items.length === 0) return [];
|
|
@@ -1259,6 +2259,27 @@ function startsWithMarker(line) {
|
|
|
1259
2259
|
/**
 * Tests whether a line, once trimmed, matches the standalone header pattern:
 * "제" + digits + one of 조/항/호/장/절, an optional parenthesised part, and at
 * most seven further whitespace-separated words.
 *
 * @param {string} line
 * @returns {boolean}
 */
function isStandaloneHeader(line) {
  const candidate = line.trim();
  const headerPattern = /^제\d+[조항호장절](\([^)]*\))?(\s+\S+){0,7}$/;
  return headerPattern.test(candidate);
}
|
|
2262
|
+
/**
 * Re-tags paragraph blocks whose text starts with "<digits>. " as ordered
 * list blocks. Other blocks are passed through unchanged (same object).
 *
 * @param {object[]} blocks
 * @returns {object[]} New array; converted blocks are shallow copies.
 */
function detectListBlocks(blocks) {
  return blocks.map((block) => {
    const isNumbered =
      block.type === "paragraph" && block.text && /^(\d+)\.\s/.test(block.text);
    if (!isNumbered) return block;
    return {
      ...block,
      type: "list",
      listType: "ordered",
      // Keep the original number in the text (blocksToMarkdown prints it as-is).
      text: block.text
    };
  });
}
|
|
1262
2283
|
function mergeKoreanLines(text) {
|
|
1263
2284
|
if (!text) return "";
|
|
1264
2285
|
const lines = text.split("\n");
|
|
@@ -1267,6 +2288,10 @@ function mergeKoreanLines(text) {
|
|
|
1267
2288
|
for (let i = 1; i < lines.length; i++) {
|
|
1268
2289
|
const prev = result[result.length - 1];
|
|
1269
2290
|
const curr = lines[i];
|
|
2291
|
+
if (/^#{1,6}\s/.test(prev) || /^#{1,6}\s/.test(curr)) {
|
|
2292
|
+
result.push(curr);
|
|
2293
|
+
continue;
|
|
2294
|
+
}
|
|
1270
2295
|
if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev)) {
|
|
1271
2296
|
result[result.length - 1] = prev + " " + curr;
|
|
1272
2297
|
} else {
|
|
@@ -1424,16 +2449,16 @@ async function parse(buffer, options) {
|
|
|
1424
2449
|
}
|
|
1425
2450
|
/**
 * Parses an HWPX buffer and wraps the outcome in a result object.
 *
 * @param {*} buffer - Passed through to `parseHwpxDocument`.
 * @param {object} [options] - Passed through to `parseHwpxDocument`.
 * @returns {Promise<object>} On success: `success: true` with `markdown`,
 *   `blocks`, `metadata`, `outline`, `warnings`. On any thrown error:
 *   `success: false` with `error` and a `code` from `classifyError`.
 */
async function parseHwpx(buffer, options) {
  const fileType = "hwpx";
  try {
    const { markdown, blocks, metadata, outline, warnings } = await parseHwpxDocument(buffer, options);
    return { success: true, fileType, markdown, blocks, metadata, outline, warnings };
  } catch (err) {
    const error = err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328";
    return { success: false, fileType, error, code: classifyError(err) };
  }
}
|
|
1433
2458
|
async function parseHwp(buffer, options) {
|
|
1434
2459
|
try {
|
|
1435
|
-
const { markdown, blocks, metadata } = parseHwp5Document(Buffer.from(buffer), options);
|
|
1436
|
-
return { success: true, fileType: "hwp", markdown, blocks, metadata };
|
|
2460
|
+
const { markdown, blocks, metadata, outline, warnings } = parseHwp5Document(Buffer.from(buffer), options);
|
|
2461
|
+
return { success: true, fileType: "hwp", markdown, blocks, metadata, outline, warnings };
|
|
1437
2462
|
} catch (err) {
|
|
1438
2463
|
return { success: false, fileType: "hwp", error: err instanceof Error ? err.message : "HWP \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
|
|
1439
2464
|
}
|
|
@@ -1578,12 +2603,13 @@ function fallbackAlign(a, b) {
|
|
|
1578
2603
|
}
|
|
1579
2604
|
/**
 * Scores how similar two blocks are.
 *
 * Blocks of different types score 0. When both carry a `text` property the
 * score is the normalized text similarity; two table blocks that both carry
 * table data are compared cell-wise. Any other same-type pair (for example
 * blocks with no text at all) counts as identical.
 *
 * @param {object} a
 * @param {object} b
 * @returns {number} 0 for different types, otherwise the helper's score or 1.
 */
function blockSimilarity(a, b) {
  if (a.type !== b.type) return 0;
  if (a.text !== void 0 && b.text !== void 0) {
    return normalizedSimilarity(a.text || "", b.text || "");
  }
  if (a.type === "table" && a.table && b.table) {
    return tableSimilarity(a.table, b.table);
  }
  // Types are known to match here, so the pair is treated as identical.
  return 1;
}
|
|
1589
2615
|
function tableSimilarity(a, b) {
|
|
@@ -1628,4 +2654,4 @@ export {
|
|
|
1628
2654
|
extractFormFields,
|
|
1629
2655
|
parse
|
|
1630
2656
|
};
|
|
1631
|
-
//# sourceMappingURL=chunk-
|
|
2657
|
+
//# sourceMappingURL=chunk-5SZWGBNL.js.map
|