kordoc 1.5.0 → 1.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -2
- package/dist/{chunk-5SZWGBNL.js → chunk-DYUB34PO.js} +360 -28
- package/dist/chunk-DYUB34PO.js.map +1 -0
- package/dist/cli.js +2 -2
- package/dist/index.cjs +359 -27
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +4 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.js +359 -27
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +1 -1
- package/dist/{watch-YCWNFYAW.js → watch-3QVNEAVM.js} +2 -2
- package/package.json +1 -1
- package/dist/chunk-5SZWGBNL.js.map +0 -1
- package/dist/{watch-YCWNFYAW.js.map → watch-3QVNEAVM.js.map} +0 -0
package/README.md
CHANGED
|
@@ -14,7 +14,23 @@
|
|
|
14
14
|
|
|
15
15
|
---
|
|
16
16
|
|
|
17
|
-
## What's New in v1.
|
|
17
|
+
## What's New in v1.6.1
|
|
18
|
+
|
|
19
|
+
- **HWP5 Table Cell Offset Fix** — Fixed critical 2-byte offset misalignment in LIST_HEADER parsing. Row address was incorrectly read as colSpan, causing 3-column tables to explode into 6+ columns with misaligned content. Tables now use colAddr/rowAddr-based direct placement for accurate cell positioning.
|
|
20
|
+
- **HWP5 TAB Control Character Fix** — TAB (0x0009) inline control's 14-byte extension data was not skipped, producing garbage characters (`࣐Ā`) after every tab in the output. Fixed by adding the required 14-byte skip.
|
|
21
|
+
|
|
22
|
+
<details>
|
|
23
|
+
<summary>v1.6.0 features</summary>
|
|
24
|
+
|
|
25
|
+
- **Cluster-Based Table Detection (PDF)** — Detects borderless tables by analyzing text alignment patterns. Baseline grouping + X-coordinate clustering identifies 2+ column tables that line-based detection misses. Sort-and-split clustering for order-independent results.
|
|
26
|
+
- **Korean Special Table Detection** — Automatically detects `구분/항목/종류`-style key-value patterns common in Korean government documents and converts them to structured 2-column tables.
|
|
27
|
+
- **Korean Word-Break Recovery** — Improved merging of broken Korean words in PDF table cells. Handles character-level PDF rendering (micro-gaps between Hangul characters) and cell line-break artifacts up to 8 characters.
|
|
28
|
+
- **Empty Table Filtering** — Tables with all-empty cells (from line detection of decorative borders) are now automatically removed.
|
|
29
|
+
|
|
30
|
+
</details>
|
|
31
|
+
|
|
32
|
+
<details>
|
|
33
|
+
<summary>v1.5.0 features</summary>
|
|
18
34
|
|
|
19
35
|
- **Line-Based Table Detection (PDF)** — Ported from OpenDataLoader. Extracts horizontal/vertical lines from PDF graphics commands, builds grid via intersection vertices, maps text to cells by bbox overlap. Proper colspan/rowspan detection. Falls back to heuristic for line-free PDFs.
|
|
20
36
|
- **IRBlock v2** — 6 block types: `heading`, `paragraph`, `table`, `list`, `image`, `separator`. New fields: `bbox`, `style`, `pageNumber`, `level`, `href`, `footnoteText`.
|
|
@@ -25,6 +41,8 @@
|
|
|
25
41
|
- **List Detection** — Numbered paragraphs after tables auto-converted to ordered list blocks.
|
|
26
42
|
- **MCP Server** — Now returns `outline` and `warnings` in parse_document responses.
|
|
27
43
|
|
|
44
|
+
</details>
|
|
45
|
+
|
|
28
46
|
<details>
|
|
29
47
|
<summary>v1.4.x features</summary>
|
|
30
48
|
|
|
@@ -205,7 +223,7 @@ import type {
|
|
|
205
223
|
| Format | Engine | Features |
|
|
206
224
|
|--------|--------|----------|
|
|
207
225
|
| **HWPX** (한컴 2020+) | ZIP + XML DOM | Manifest, nested tables, merged cells, broken ZIP recovery |
|
|
208
|
-
| **HWP 5.x** (한컴 Legacy) | OLE2 + CFB | 21 control chars, zlib decompression, DRM detection |
|
|
226
|
+
| **HWP 5.x** (한컴 Legacy) | OLE2 + CFB | 21 control chars, zlib decompression, DRM detection, colAddr-based table cell placement |
|
|
209
227
|
| **PDF** | pdfjs-dist | Line-based table detection, XY-Cut reading order, heading detection, hidden text filter, OCR |
|
|
210
228
|
|
|
211
229
|
## Security
|
|
@@ -185,7 +185,7 @@ function tableToMarkdown(table) {
|
|
|
185
185
|
}
|
|
186
186
|
|
|
187
187
|
// src/utils.ts
|
|
188
|
-
var VERSION = true ? "1.
|
|
188
|
+
var VERSION = true ? "1.6.1" : "0.0.0-dev";
|
|
189
189
|
function toArrayBuffer(buf) {
|
|
190
190
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
191
191
|
return buf.buffer;
|
|
@@ -923,6 +923,7 @@ function extractText(data) {
|
|
|
923
923
|
break;
|
|
924
924
|
case CHAR_TAB:
|
|
925
925
|
result += " ";
|
|
926
|
+
if (i + 14 <= data.length) i += 14;
|
|
926
927
|
break;
|
|
927
928
|
case CHAR_HYPHEN:
|
|
928
929
|
result += "-";
|
|
@@ -1237,9 +1238,13 @@ function parseCellBlock(records, startIdx, tableLevel) {
|
|
|
1237
1238
|
const texts = [];
|
|
1238
1239
|
let colSpan = 1;
|
|
1239
1240
|
let rowSpan = 1;
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1241
|
+
let colAddr;
|
|
1242
|
+
let rowAddr;
|
|
1243
|
+
if (rec.data.length >= 16) {
|
|
1244
|
+
colAddr = rec.data.readUInt16LE(8);
|
|
1245
|
+
rowAddr = rec.data.readUInt16LE(10);
|
|
1246
|
+
const cs = rec.data.readUInt16LE(12);
|
|
1247
|
+
const rs = rec.data.readUInt16LE(14);
|
|
1243
1248
|
if (cs > 0) colSpan = Math.min(cs, MAX_COLS);
|
|
1244
1249
|
if (rs > 0) rowSpan = Math.min(rs, MAX_ROWS);
|
|
1245
1250
|
}
|
|
@@ -1254,15 +1259,16 @@ function parseCellBlock(records, startIdx, tableLevel) {
|
|
|
1254
1259
|
}
|
|
1255
1260
|
i++;
|
|
1256
1261
|
}
|
|
1257
|
-
return { cell: { text: texts.join("\n"), colSpan, rowSpan }, nextIdx: i };
|
|
1262
|
+
return { cell: { text: texts.join("\n"), colSpan, rowSpan, colAddr, rowAddr }, nextIdx: i };
|
|
1258
1263
|
}
|
|
1259
1264
|
function arrangeCells(rows, cols, cells) {
|
|
1260
1265
|
const grid = Array.from({ length: rows }, () => Array(cols).fill(null));
|
|
1261
|
-
|
|
1262
|
-
|
|
1263
|
-
for (
|
|
1264
|
-
|
|
1265
|
-
const
|
|
1266
|
+
const hasAddr = cells.some((c) => c.colAddr !== void 0 && c.rowAddr !== void 0);
|
|
1267
|
+
if (hasAddr) {
|
|
1268
|
+
for (const cell of cells) {
|
|
1269
|
+
const r = cell.rowAddr ?? 0;
|
|
1270
|
+
const c = cell.colAddr ?? 0;
|
|
1271
|
+
if (r >= rows || c >= cols) continue;
|
|
1266
1272
|
grid[r][c] = cell;
|
|
1267
1273
|
for (let dr = 0; dr < cell.rowSpan; dr++) {
|
|
1268
1274
|
for (let dc = 0; dc < cell.colSpan; dc++) {
|
|
@@ -1272,6 +1278,22 @@ function arrangeCells(rows, cols, cells) {
|
|
|
1272
1278
|
}
|
|
1273
1279
|
}
|
|
1274
1280
|
}
|
|
1281
|
+
} else {
|
|
1282
|
+
let cellIdx = 0;
|
|
1283
|
+
for (let r = 0; r < rows && cellIdx < cells.length; r++) {
|
|
1284
|
+
for (let c = 0; c < cols && cellIdx < cells.length; c++) {
|
|
1285
|
+
if (grid[r][c] !== null) continue;
|
|
1286
|
+
const cell = cells[cellIdx++];
|
|
1287
|
+
grid[r][c] = cell;
|
|
1288
|
+
for (let dr = 0; dr < cell.rowSpan; dr++) {
|
|
1289
|
+
for (let dc = 0; dc < cell.colSpan; dc++) {
|
|
1290
|
+
if (dr === 0 && dc === 0) continue;
|
|
1291
|
+
if (r + dr < rows && c + dc < cols)
|
|
1292
|
+
grid[r + dr][c + dc] = { text: "", colSpan: 1, rowSpan: 1 };
|
|
1293
|
+
}
|
|
1294
|
+
}
|
|
1295
|
+
}
|
|
1296
|
+
}
|
|
1275
1297
|
}
|
|
1276
1298
|
return grid.map((row) => row.map((c) => c || { text: "", colSpan: 1, rowSpan: 1 }));
|
|
1277
1299
|
}
|
|
@@ -1609,14 +1631,25 @@ function cellTextToString(items) {
|
|
|
1609
1631
|
lines.push(curLine);
|
|
1610
1632
|
const textLines = lines.map((line) => {
|
|
1611
1633
|
const s = line.sort((a, b) => a.x - b.x);
|
|
1612
|
-
|
|
1634
|
+
if (s.length === 1) return s[0].text;
|
|
1635
|
+
let result = s[0].text;
|
|
1636
|
+
for (let j = 1; j < s.length; j++) {
|
|
1637
|
+
const gap = s[j].x - (s[j - 1].x + s[j - 1].w);
|
|
1638
|
+
const avgFs = (s[j].fontSize + s[j - 1].fontSize) / 2;
|
|
1639
|
+
if (gap < avgFs * 0.3 && /[가-힣]$/.test(result) && /^[가-힣]/.test(s[j].text)) {
|
|
1640
|
+
result += s[j].text;
|
|
1641
|
+
} else {
|
|
1642
|
+
result += " " + s[j].text;
|
|
1643
|
+
}
|
|
1644
|
+
}
|
|
1645
|
+
return result;
|
|
1613
1646
|
});
|
|
1614
1647
|
if (textLines.length <= 1) return textLines[0] || "";
|
|
1615
1648
|
const merged = [textLines[0]];
|
|
1616
1649
|
for (let i = 1; i < textLines.length; i++) {
|
|
1617
1650
|
const prev = merged[merged.length - 1];
|
|
1618
1651
|
const curr = textLines[i];
|
|
1619
|
-
if (/[가-힣]$/.test(prev) && /^[가-힣]+$/.test(curr) && curr.length <=
|
|
1652
|
+
if (/[가-힣]$/.test(prev) && /^[가-힣]+$/.test(curr) && curr.length <= 8 && !curr.includes(" ")) {
|
|
1620
1653
|
merged[merged.length - 1] = prev + curr;
|
|
1621
1654
|
} else {
|
|
1622
1655
|
merged.push(curr);
|
|
@@ -1625,6 +1658,181 @@ function cellTextToString(items) {
|
|
|
1625
1658
|
return merged.join("\n");
|
|
1626
1659
|
}
|
|
1627
1660
|
|
|
1661
|
+
// src/pdf/cluster-detector.ts
|
|
1662
|
+
// Largest |y difference| between an item and the first item of the current
// row for both to be grouped into the same baseline row (groupByBaseline).
var Y_TOL = 3;
|
|
1663
|
+
// Adjacent sorted x values further apart than this start a new column
// cluster (extractColumnClusters). countMatchedColumns accepts items within
// 2x this distance of a column, assignToColumn within 3x.
var COL_CLUSTER_TOL = 15;
|
|
1664
|
+
// Minimum number of rows required before a region is accepted as a table.
var MIN_ROWS = 3;
|
|
1665
|
+
// Minimum number of columns required before a region is accepted as a table.
var MIN_COLS = 2;
|
|
1666
|
+
// A row is "suspicious" when some horizontal gap between neighbouring items
// is at least (average font size of the row) * this factor (hasSuspiciousGaps).
var MIN_GAP_FACTOR = 1.5;
|
|
1667
|
+
// A column cluster is kept only if it holds at least
// max(2, floor(rowCount * this ratio)) x values (extractColumnClusters).
var MIN_COL_FILL_RATIO = 0.3;
|
|
1668
|
+
/**
 * Detects tables on one page from text alignment alone.
 *
 * Pipeline: group items into baseline rows, keep the rows that contain a wide
 * horizontal gap, cluster the x positions of those rows into columns, find
 * runs of rows that line up with the columns, and build one table per run.
 *
 * @param {Array<{text: string, x: number, y: number, w: number, h: number, fontSize: number}>} items
 *   Text items of the page.
 * @param {number} pageNum Page number stored in each result's bbox.
 * @returns {Array<{table: object, bbox: object, usedItems: Set<object>}>}
 *   One entry per detected table; empty when nothing qualifies.
 */
function detectClusterTables(items, pageNum) {
  // Fewer items than the smallest acceptable grid can never form a table.
  const smallestGrid = MIN_ROWS * MIN_COLS;
  if (items.length < smallestGrid) {
    return [];
  }

  const baselineRows = groupByBaseline(items);
  if (baselineRows.length < MIN_ROWS) {
    return [];
  }

  // Only rows with a column-like gap are used to learn the column positions.
  const gappedRows = [];
  for (const candidate of baselineRows) {
    if (hasSuspiciousGaps(candidate)) {
      gappedRows.push(candidate);
    }
  }
  if (gappedRows.length < MIN_ROWS) {
    return [];
  }

  const columnClusters = extractColumnClusters(gappedRows);
  if (columnClusters.length < MIN_COLS) {
    return [];
  }

  // Regions are searched over ALL rows, not just the gapped ones.
  const detected = [];
  findTableRegions(baselineRows, columnClusters).forEach((region) => {
    const built = buildClusterTable(region.rows, columnClusters, pageNum);
    if (built) {
      detected.push(built);
    }
  });
  return detected;
}
|
|
1684
|
+
/**
 * Groups text items into rows that share (approximately) the same baseline.
 *
 * Items are ordered by descending y, then ascending x. A new row starts when
 * an item's y is not within Y_TOL of the y of the row's first item.
 *
 * @param {Array<{x: number, y: number}>} items Items to group (not mutated).
 * @returns {Array<{y: number, items: Array<object>}>} Rows in descending y
 *   order; each row's `y` is the y of its first item.
 */
function groupByBaseline(items) {
  if (items.length === 0) {
    return [];
  }

  const ordered = items.slice().sort((p, q) => q.y - p.y || p.x - q.x);
  const grouped = [];
  let anchorY = ordered[0].y;
  let bucket = [];

  for (const entry of ordered) {
    const sameBaseline = Math.abs(entry.y - anchorY) <= Y_TOL;
    if (!sameBaseline) {
      // Close the current row and anchor the next one on this item.
      grouped.push({ y: anchorY, items: bucket });
      bucket = [];
      anchorY = entry.y;
    }
    bucket.push(entry);
  }

  grouped.push({ y: anchorY, items: bucket });
  return grouped;
}
|
|
1702
|
+
/**
 * Tells whether a row contains a horizontal gap wide enough to look like a
 * column separator.
 *
 * The threshold is the row's average font size multiplied by MIN_GAP_FACTOR;
 * a gap is measured from the right edge (x + w) of one item to the x of the
 * next item in left-to-right order.
 *
 * @param {{items: Array<{x: number, w: number, fontSize: number}>}} row
 * @returns {boolean} true if any gap is at least the threshold; always false
 *   for rows with fewer than two items.
 */
function hasSuspiciousGaps(row) {
  const { items } = row;
  if (items.length < 2) {
    return false;
  }

  const leftToRight = items.slice().sort((p, q) => p.x - q.x);

  let fontTotal = 0;
  for (const entry of leftToRight) {
    fontTotal += entry.fontSize;
  }
  const threshold = (fontTotal / leftToRight.length) * MIN_GAP_FACTOR;

  return leftToRight.some((entry, idx) => {
    if (idx === 0) {
      return false;
    }
    const left = leftToRight[idx - 1];
    return entry.x - (left.x + left.w) >= threshold;
  });
}
|
|
1713
|
+
/**
 * Derives column positions from the x coordinates of all items in `rows`.
 *
 * All x values are sorted and split wherever two neighbours are more than
 * COL_CLUSTER_TOL apart. Each cluster is reported as its rounded mean x and
 * its size; clusters with fewer than max(2, floor(rows.length *
 * MIN_COL_FILL_RATIO)) members are dropped.
 *
 * @param {Array<{items: Array<{x: number}>}>} rows
 * @returns {Array<{x: number, count: number}>} Surviving clusters, ascending x.
 */
function extractColumnClusters(rows) {
  const xs = rows.flatMap((row) => row.items.map((item) => item.x));
  if (xs.length === 0) {
    return [];
  }
  xs.sort((p, q) => p - q);

  const found = [];
  const closeCluster = (members) => {
    let total = 0;
    for (const value of members) {
      total += value;
    }
    found.push({ x: Math.round(total / members.length), count: members.length });
  };

  let members = [xs[0]];
  for (let k = 1; k < xs.length; k++) {
    if (xs[k] - xs[k - 1] > COL_CLUSTER_TOL) {
      closeCluster(members);
      members = [];
    }
    members.push(xs[k]);
  }
  closeCluster(members);

  const required = Math.max(2, Math.floor(rows.length * MIN_COL_FILL_RATIO));
  return found
    .filter((cluster) => cluster.count >= required)
    .sort((p, q) => p.x - q.x);
}
|
|
1733
|
+
/**
 * Splits the page's rows into runs that look like table bodies.
 *
 * A row belongs to a run when its items match at least MIN_COLS of the
 * detected columns. A single-item row is tolerated inside an open run (it is
 * later rendered as a full-width cell); any other row ends the run. A run is
 * reported only if it has at least MIN_ROWS rows.
 *
 * Fix over the previous version: single-item rows that trail the last
 * column-matching row are trimmed before the run is reported. Previously
 * every single-item line following a table (typically ordinary paragraphs)
 * was absorbed into the table until a multi-item non-matching row or the end
 * of the page, and those lines also counted towards MIN_ROWS.
 *
 * @param {Array<{y: number, items: Array<{x: number}>}>} allRows Rows in reading order.
 * @param {Array<{x: number}>} columns Column clusters.
 * @returns {Array<{rows: Array<object>}>} Detected regions in input order.
 */
function findTableRegions(allRows, columns) {
  const regions = [];
  let currentRegion = [];
  // Length of currentRegion up to and including the last column-matching row.
  let matchedEnd = 0;

  const closeRegion = () => {
    const trimmed = currentRegion.slice(0, matchedEnd);
    if (trimmed.length >= MIN_ROWS) {
      regions.push({ rows: trimmed });
    }
    currentRegion = [];
    matchedEnd = 0;
  };

  for (const row of allRows) {
    if (countMatchedColumns(row, columns) >= MIN_COLS) {
      currentRegion.push(row);
      matchedEnd = currentRegion.length;
    } else if (row.items.length === 1) {
      // Possible merged/full-width row: keep it only while a run is open.
      if (currentRegion.length > 0) {
        currentRegion.push(row);
      }
    } else {
      closeRegion();
    }
  }
  closeRegion();

  return regions;
}
|
|
1756
|
+
/**
 * Counts how many distinct columns are hit by the items of a row.
 *
 * An item hits the first column (in array order) whose x lies within
 * 2 * COL_CLUSTER_TOL of the item's x; items that hit no column are ignored.
 *
 * @param {{items: Array<{x: number}>}} row
 * @param {Array<{x: number}>} columns
 * @returns {number} Number of distinct column indices hit.
 */
function countMatchedColumns(row, columns) {
  const reach = COL_CLUSTER_TOL * 2;
  const hitColumns = new Set();

  row.items.forEach((entry) => {
    const idx = columns.findIndex((col) => Math.abs(entry.x - col.x) <= reach);
    if (idx !== -1) {
      hitColumns.add(idx);
    }
  });

  return hitColumns.size;
}
|
|
1768
|
+
/**
 * Picks the column whose x is nearest to the item's x.
 *
 * Ties go to the earlier column. The match is rejected when the nearest
 * column is further than 3 * COL_CLUSTER_TOL away.
 *
 * @param {{x: number}} item
 * @param {Array<{x: number}>} columns
 * @returns {number} Index into `columns`, or -1 when no column is close enough
 *   (including when `columns` is empty).
 */
function assignToColumn(item, columns) {
  const limit = COL_CLUSTER_TOL * 3;

  const nearest = columns.reduce(
    (best, col, idx) => {
      const distance = Math.abs(item.x - col.x);
      return distance < best.distance ? { idx, distance } : best;
    },
    { idx: -1, distance: Infinity },
  );

  if (nearest.distance > limit) {
    return -1;
  }
  return nearest.distance <= limit ? nearest.idx : -1;
}
|
|
1781
|
+
/**
 * Builds a table from a run of baseline rows and the detected columns.
 *
 * Each item is placed in the column chosen by assignToColumn; several items
 * landing in the same cell are joined with a single space. A row consisting
 * of exactly one item becomes a cell in column 0 spanning all columns.
 *
 * Fix over the previous version: the items of a row are visited in ascending
 * x order. Rows produced by groupByBaseline are ordered by descending y
 * first, so two fragments of one cell whose baselines differ slightly could
 * previously be concatenated right-to-left.
 *
 * The table is rejected (null) when it is smaller than MIN_ROWS x MIN_COLS,
 * when more than half of its rows are entirely empty, or when any column has
 * no text at all.
 *
 * @param {Array<{y: number, items: Array<{text: string, x: number, y: number, w: number, h: number, fontSize: number}>}>} rows
 * @param {Array<{x: number}>} columns Column clusters, one per table column.
 * @param {number} pageNum Stored as `bbox.page`.
 * @returns {{table: {rows: number, cols: number, cells: Array<Array<object>>, hasHeader: boolean},
 *            bbox: {page: number, x: number, y: number, width: number, height: number},
 *            usedItems: Set<object>} | null}
 *   `usedItems` holds the item objects that were placed into cells.
 */
function buildClusterTable(rows, columns, pageNum) {
  const numCols = columns.length;
  const numRows = rows.length;
  if (numRows < MIN_ROWS || numCols < MIN_COLS) return null;

  const cells = Array.from(
    { length: numRows },
    () => Array.from({ length: numCols }, () => ({ text: "", colSpan: 1, rowSpan: 1 })),
  );
  const usedItems = new Set();

  rows.forEach((row, r) => {
    // A lone item is treated as a merged cell across the whole row.
    if (row.items.length === 1 && numCols > 1) {
      cells[r][0] = { text: row.items[0].text, colSpan: numCols, rowSpan: 1 };
      usedItems.add(row.items[0]);
      return;
    }
    // Left-to-right so that text sharing a cell is joined in reading order.
    const leftToRight = [...row.items].sort((a, b) => a.x - b.x);
    for (const item of leftToRight) {
      const col = assignToColumn(item, columns);
      if (col < 0) continue; // too far from every column: leave it for the caller
      const existing = cells[r][col].text;
      cells[r][col].text = existing ? `${existing} ${item.text}` : item.text;
      usedItems.add(item);
    }
  });

  const emptyRows = cells.filter((row) => row.every((c) => c.text === "")).length;
  if (emptyRows > numRows * 0.5) return null;

  for (let c = 0; c < numCols; c++) {
    const hasValue = cells.some((row) => row[c].text !== "");
    if (!hasValue) return null;
  }

  const irTable = {
    rows: numRows,
    cols: numCols,
    cells,
    hasHeader: numRows > 1,
  };

  // Bounding box over every item of the region, placed or not.
  let minX = Infinity;
  let minY = Infinity;
  let maxX = -Infinity;
  let maxY = -Infinity;
  for (const row of rows) {
    for (const item of row.items) {
      if (item.x < minX) minX = item.x;
      if (item.y < minY) minY = item.y;
      if (item.x + item.w > maxX) maxX = item.x + item.w;
      // Fall back to the font size when the item carries no usable height.
      const h = item.h > 0 ? item.h : item.fontSize;
      if (item.y + h > maxY) maxY = item.y + h;
    }
  }

  return {
    table: irTable,
    bbox: { page: pageNum, x: minX, y: minY, width: maxX - minX, height: maxY - minY },
    usedItems,
  };
}
|
|
1835
|
+
|
|
1628
1836
|
// src/pdf/polyfill.ts
|
|
1629
1837
|
import * as pdfjsWorker from "pdfjs-dist/legacy/build/pdf.worker.mjs";
|
|
1630
1838
|
var g = globalThis;
|
|
@@ -1916,6 +2124,8 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
1916
2124
|
cells: irGrid,
|
|
1917
2125
|
hasHeader: numRows > 1
|
|
1918
2126
|
};
|
|
2127
|
+
const hasContent = irGrid.some((row) => row.some((cell) => cell.text.trim() !== ""));
|
|
2128
|
+
if (!hasContent) continue;
|
|
1919
2129
|
blocks.push({
|
|
1920
2130
|
type: "table",
|
|
1921
2131
|
table: irTable,
|
|
@@ -1953,19 +2163,28 @@ function extractPageBlocksFallback(items, pageNum) {
|
|
|
1953
2163
|
const bbox = computeBBox(items, pageNum);
|
|
1954
2164
|
blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(items) });
|
|
1955
2165
|
} else {
|
|
1956
|
-
const
|
|
1957
|
-
|
|
1958
|
-
|
|
1959
|
-
|
|
1960
|
-
|
|
1961
|
-
|
|
1962
|
-
|
|
1963
|
-
|
|
1964
|
-
|
|
1965
|
-
|
|
1966
|
-
|
|
1967
|
-
|
|
1968
|
-
|
|
2166
|
+
const clusterItems = items.map((i) => ({
|
|
2167
|
+
text: i.text,
|
|
2168
|
+
x: i.x,
|
|
2169
|
+
y: i.y,
|
|
2170
|
+
w: i.w,
|
|
2171
|
+
h: i.h,
|
|
2172
|
+
fontSize: i.fontSize,
|
|
2173
|
+
fontName: i.fontName
|
|
2174
|
+
}));
|
|
2175
|
+
const clusterResults = detectClusterTables(clusterItems, pageNum);
|
|
2176
|
+
if (clusterResults.length > 0) {
|
|
2177
|
+
const usedIndices = /* @__PURE__ */ new Set();
|
|
2178
|
+
for (const cr of clusterResults) {
|
|
2179
|
+
for (const ci of cr.usedItems) {
|
|
2180
|
+
const idx = clusterItems.indexOf(ci);
|
|
2181
|
+
if (idx >= 0) usedIndices.add(idx);
|
|
2182
|
+
}
|
|
2183
|
+
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
2184
|
+
}
|
|
2185
|
+
const remaining = items.filter((_, idx) => !usedIndices.has(idx));
|
|
2186
|
+
if (remaining.length > 0) {
|
|
2187
|
+
const yLines = groupByY(remaining);
|
|
1969
2188
|
for (const line of yLines) {
|
|
1970
2189
|
const text = mergeLineSimple(line);
|
|
1971
2190
|
if (!text.trim()) continue;
|
|
@@ -1973,9 +2192,36 @@ function extractPageBlocksFallback(items, pageNum) {
|
|
|
1973
2192
|
blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style: dominantStyle(line) });
|
|
1974
2193
|
}
|
|
1975
2194
|
}
|
|
2195
|
+
blocks.sort((a, b) => {
|
|
2196
|
+
const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
|
|
2197
|
+
const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
|
|
2198
|
+
return by - ay;
|
|
2199
|
+
});
|
|
2200
|
+
} else {
|
|
2201
|
+
const allY = items.map((i) => i.y);
|
|
2202
|
+
const pageHeight = Math.max(...allY) - Math.min(...allY);
|
|
2203
|
+
const gapThreshold = Math.max(15, pageHeight * 0.03);
|
|
2204
|
+
const orderedGroups = xyCutOrder(items, gapThreshold);
|
|
2205
|
+
for (const group of orderedGroups) {
|
|
2206
|
+
if (group.length === 0) continue;
|
|
2207
|
+
const yLines = groupByY(group);
|
|
2208
|
+
const groupColumns = detectColumns(yLines);
|
|
2209
|
+
if (groupColumns && groupColumns.length >= 3) {
|
|
2210
|
+
const tableText = extractWithColumns(yLines, groupColumns);
|
|
2211
|
+
const bbox = computeBBox(group, pageNum);
|
|
2212
|
+
blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(group) });
|
|
2213
|
+
} else {
|
|
2214
|
+
for (const line of yLines) {
|
|
2215
|
+
const text = mergeLineSimple(line);
|
|
2216
|
+
if (!text.trim()) continue;
|
|
2217
|
+
const bbox = computeBBox(line, pageNum);
|
|
2218
|
+
blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style: dominantStyle(line) });
|
|
2219
|
+
}
|
|
2220
|
+
}
|
|
2221
|
+
}
|
|
1976
2222
|
}
|
|
1977
2223
|
}
|
|
1978
|
-
return blocks;
|
|
2224
|
+
return detectSpecialKoreanTables(blocks);
|
|
1979
2225
|
}
|
|
1980
2226
|
function computeBBox(items, pageNum) {
|
|
1981
2227
|
let minX = Infinity, minY = Infinity, maxX = -Infinity, maxY = -Infinity;
|
|
@@ -2241,8 +2487,10 @@ function mergeLineSimple(items) {
|
|
|
2241
2487
|
let result = sorted[0].text;
|
|
2242
2488
|
for (let i = 1; i < sorted.length; i++) {
|
|
2243
2489
|
const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
|
|
2490
|
+
const avgFs = (sorted[i].fontSize + sorted[i - 1].fontSize) / 2;
|
|
2244
2491
|
if (gap > 15) result += " ";
|
|
2245
|
-
else if (gap
|
|
2492
|
+
else if (gap < avgFs * 0.3 && /[가-힣]$/.test(result) && /^[가-힣]/.test(sorted[i].text)) {
|
|
2493
|
+
} else if (gap > 3) result += " ";
|
|
2246
2494
|
result += sorted[i].text;
|
|
2247
2495
|
}
|
|
2248
2496
|
return result;
|
|
@@ -2280,6 +2528,90 @@ function detectListBlocks(blocks) {
|
|
|
2280
2528
|
}
|
|
2281
2529
|
return result;
|
|
2282
2530
|
}
|
|
2531
|
+
// Paragraph openers that mark a key-value style line: one of the listed
// Korean label words, optionally parenthesised, followed by ":" or whitespace.
var KOREAN_TABLE_HEADER_RE = /^\(?(구분|항목|종류|분류|유형|대상|내용|기간|금액|비율|방법|절차|요건|조건|근거|목적|범위|기준)\)?[:\s]/;

/**
 * Returns the index of the first ":" that can act as a key/value delimiter,
 * i.e. one that is not sitting between two digits (as in "09:00" or "3:1").
 *
 * @param {string} text
 * @returns {number} Index of the delimiter colon, or -1 if there is none.
 */
function findKvDelimiterColon(text) {
  for (let idx = text.indexOf(":"); idx >= 0; idx = text.indexOf(":", idx + 1)) {
    const digitBefore = idx > 0 && /\d/.test(text[idx - 1]);
    const digitAfter = idx + 1 < text.length && /\d/.test(text[idx + 1]);
    if (!(digitBefore && digitAfter)) return idx;
  }
  return -1;
}

/**
 * Union of the bounding boxes of the given blocks.
 *
 * Falls back to the first block's bbox (the previous behaviour) when any
 * block lacks a bbox with finite numeric x / y / width / height, so that an
 * unexpected bbox shape can never produce NaN coordinates.
 *
 * @param {Array<{bbox?: object}>} kvBlocks Non-empty list of blocks.
 * @returns {object | undefined}
 */
function mergeKvBBoxes(kvBlocks) {
  const first = kvBlocks[0].bbox;
  const isBox = (b) => b != null && [b.x, b.y, b.width, b.height].every(Number.isFinite);
  if (!kvBlocks.every((blk) => isBox(blk.bbox))) return first;

  let minX = Infinity;
  let minY = Infinity;
  let maxX = -Infinity;
  let maxY = -Infinity;
  for (const { bbox } of kvBlocks) {
    minX = Math.min(minX, bbox.x);
    minY = Math.min(minY, bbox.y);
    maxX = Math.max(maxX, bbox.x + bbox.width);
    maxY = Math.max(maxY, bbox.y + bbox.height);
  }
  // Other fields (e.g. page) are taken from the first block.
  return { ...first, x: minX, y: minY, width: maxX - minX, height: maxY - minY };
}

/**
 * Converts runs of "label: value" style paragraphs into 2-column tables.
 *
 * A run starts at a paragraph matching KOREAN_TABLE_HEADER_RE and continues
 * over further matching paragraphs and over "key: value" paragraphs whose
 * key is 2-8 Hangul syllables and which contain no parentheses. Any other
 * block ends the run. Runs of fewer than two lines are emitted unchanged.
 *
 * Fixes over the previous version:
 *  - The key/value split used the first ":" anywhere in the line, so
 *    "기간 09:00~18:00" produced the key "기간 09". Colons between two digits
 *    are now ignored when looking for the delimiter, and the line falls back
 *    to splitting at the first whitespace.
 *  - The table's bbox was the bbox of the first line only; it is now the
 *    union of all merged lines (see mergeKvBBoxes for the fallback).
 *
 * @param {Array<{type: string, text?: string, pageNumber?: number, bbox?: object}>} blocks
 * @returns {Array<object>} Blocks in the original order, with each qualifying
 *   run replaced by one `type: "table"` block.
 */
function detectSpecialKoreanTables(blocks) {
  const result = [];
  let kvLines = [];

  const flushKvTable = () => {
    if (kvLines.length < 2) {
      // Not enough lines for a table: give the paragraphs back untouched.
      for (const kv of kvLines) result.push(kv.block);
      kvLines = [];
      return;
    }
    const cells = kvLines.map((kv) => {
      if (kv.value) {
        return [
          { text: kv.key, colSpan: 1, rowSpan: 1 },
          { text: kv.value, colSpan: 1, rowSpan: 1 },
        ];
      }
      // No value: the key spans both columns.
      return [
        { text: kv.key, colSpan: 2, rowSpan: 1 },
        { text: "", colSpan: 1, rowSpan: 1 },
      ];
    });
    const firstBlock = kvLines[0].block;
    result.push({
      type: "table",
      table: { rows: cells.length, cols: 2, cells, hasHeader: true },
      pageNumber: firstBlock.pageNumber,
      bbox: mergeKvBBoxes(kvLines.map((kv) => kv.block)),
    });
    kvLines = [];
  };

  for (const block of blocks) {
    if (block.type !== "paragraph" || !block.text) {
      flushKvTable();
      result.push(block);
      continue;
    }

    const text = block.text.trim();

    if (KOREAN_TABLE_HEADER_RE.test(text)) {
      const colonIdx = findKvDelimiterColon(text);
      const splitIdx = colonIdx >= 0 ? colonIdx : text.search(/\s/);
      if (splitIdx > 0) {
        kvLines.push({
          key: text.slice(0, splitIdx).trim(),
          value: text.slice(splitIdx + 1).trim(),
          block,
        });
      } else {
        kvLines.push({ key: text, value: "", block });
      }
      continue;
    }

    // Continuation lines: only inside an open run, and only "한글키: value".
    if (kvLines.length > 0 && text.includes(":") && !text.includes("(") && !text.includes(")")) {
      const colonIdx = text.indexOf(":");
      const key = text.slice(0, colonIdx).trim();
      if (/^[가-힣]+$/.test(key) && key.length >= 2 && key.length <= 8) {
        kvLines.push({
          key,
          value: text.slice(colonIdx + 1).trim(),
          block,
        });
        continue;
      }
    }

    flushKvTable();
    result.push(block);
  }

  flushKvTable();
  return result;
}
|
|
2283
2615
|
function mergeKoreanLines(text) {
|
|
2284
2616
|
if (!text) return "";
|
|
2285
2617
|
const lines = text.split("\n");
|
|
@@ -2654,4 +2986,4 @@ export {
|
|
|
2654
2986
|
extractFormFields,
|
|
2655
2987
|
parse
|
|
2656
2988
|
};
|
|
2657
|
-
//# sourceMappingURL=chunk-
|
|
2989
|
+
//# sourceMappingURL=chunk-DYUB34PO.js.map
|