kordoc 1.5.0 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -1
- package/dist/{chunk-5SZWGBNL.js → chunk-TFGOV2ML.js} +329 -19
- package/dist/chunk-TFGOV2ML.js.map +1 -0
- package/dist/cli.js +2 -2
- package/dist/index.cjs +328 -18
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +328 -18
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +1 -1
- package/dist/{watch-YCWNFYAW.js → watch-WMRLOFYY.js} +2 -2
- package/package.json +1 -1
- package/dist/chunk-5SZWGBNL.js.map +0 -1
- /package/dist/{watch-YCWNFYAW.js.map → watch-WMRLOFYY.js.map} +0 -0
package/README.md
CHANGED
|
@@ -14,7 +14,15 @@
|
|
|
14
14
|
|
|
15
15
|
---
|
|
16
16
|
|
|
17
|
-
## What's New in v1.5.0
|
|
17
|
+
## What's New in v1.6.0
|
|
18
|
+
|
|
19
|
+
- **Cluster-Based Table Detection (PDF)** — Detects borderless tables by analyzing text alignment patterns. Baseline grouping + X-coordinate clustering identifies 2+ column tables that line-based detection misses. Sort-and-split clustering for order-independent results.
|
|
20
|
+
- **Korean Special Table Detection** — Automatically detects `구분/항목/종류`-style key-value patterns common in Korean government documents and converts them to structured 2-column tables.
|
|
21
|
+
- **Korean Word-Break Recovery** — Improved merging of broken Korean words in PDF table cells. Handles character-level PDF rendering (micro-gaps between Hangul characters) and cell line-break artifacts up to 8 characters.
|
|
22
|
+
- **Empty Table Filtering** — Tables with all-empty cells (from line detection of decorative borders) are now automatically removed.
|
|
23
|
+
|
|
24
|
+
<details>
|
|
25
|
+
<summary>v1.5.0 features</summary>
|
|
18
26
|
|
|
19
27
|
- **Line-Based Table Detection (PDF)** — Ported from OpenDataLoader. Extracts horizontal/vertical lines from PDF graphics commands, builds grid via intersection vertices, maps text to cells by bbox overlap. Proper colspan/rowspan detection. Falls back to heuristic for line-free PDFs.
|
|
20
28
|
- **IRBlock v2** — 6 block types: `heading`, `paragraph`, `table`, `list`, `image`, `separator`. New fields: `bbox`, `style`, `pageNumber`, `level`, `href`, `footnoteText`.
|
|
@@ -25,6 +33,8 @@
|
|
|
25
33
|
- **List Detection** — Numbered paragraphs after tables auto-converted to ordered list blocks.
|
|
26
34
|
- **MCP Server** — Now returns `outline` and `warnings` in parse_document responses.
|
|
27
35
|
|
|
36
|
+
</details>
|
|
37
|
+
|
|
28
38
|
<details>
|
|
29
39
|
<summary>v1.4.x features</summary>
|
|
30
40
|
|
|
@@ -185,7 +185,7 @@ function tableToMarkdown(table) {
|
|
|
185
185
|
}
|
|
186
186
|
|
|
187
187
|
// src/utils.ts
|
|
188
|
-
var VERSION = true ? "1.
|
|
188
|
+
var VERSION = true ? "1.6.0" : "0.0.0-dev";
|
|
189
189
|
function toArrayBuffer(buf) {
|
|
190
190
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
191
191
|
return buf.buffer;
|
|
@@ -1609,14 +1609,25 @@ function cellTextToString(items) {
|
|
|
1609
1609
|
lines.push(curLine);
|
|
1610
1610
|
const textLines = lines.map((line) => {
|
|
1611
1611
|
const s = line.sort((a, b) => a.x - b.x);
|
|
1612
|
-
|
|
1612
|
+
if (s.length === 1) return s[0].text;
|
|
1613
|
+
let result = s[0].text;
|
|
1614
|
+
for (let j = 1; j < s.length; j++) {
|
|
1615
|
+
const gap = s[j].x - (s[j - 1].x + s[j - 1].w);
|
|
1616
|
+
const avgFs = (s[j].fontSize + s[j - 1].fontSize) / 2;
|
|
1617
|
+
if (gap < avgFs * 0.3 && /[가-힣]$/.test(result) && /^[가-힣]/.test(s[j].text)) {
|
|
1618
|
+
result += s[j].text;
|
|
1619
|
+
} else {
|
|
1620
|
+
result += " " + s[j].text;
|
|
1621
|
+
}
|
|
1622
|
+
}
|
|
1623
|
+
return result;
|
|
1613
1624
|
});
|
|
1614
1625
|
if (textLines.length <= 1) return textLines[0] || "";
|
|
1615
1626
|
const merged = [textLines[0]];
|
|
1616
1627
|
for (let i = 1; i < textLines.length; i++) {
|
|
1617
1628
|
const prev = merged[merged.length - 1];
|
|
1618
1629
|
const curr = textLines[i];
|
|
1619
|
-
if (/[가-힣]$/.test(prev) && /^[가-힣]+$/.test(curr) && curr.length <=
|
|
1630
|
+
if (/[가-힣]$/.test(prev) && /^[가-힣]+$/.test(curr) && curr.length <= 8 && !curr.includes(" ")) {
|
|
1620
1631
|
merged[merged.length - 1] = prev + curr;
|
|
1621
1632
|
} else {
|
|
1622
1633
|
merged.push(curr);
|
|
@@ -1625,6 +1636,181 @@ function cellTextToString(items) {
|
|
|
1625
1636
|
return merged.join("\n");
|
|
1626
1637
|
}
|
|
1627
1638
|
|
|
1639
|
+
// src/pdf/cluster-detector.ts
|
|
1640
|
+
var Y_TOL = 3; // max |y difference| from a row's first item for another item to join that baseline row
|
|
1641
|
+
var COL_CLUSTER_TOL = 15; // x gap between consecutive sorted x positions above which a new column cluster starts; matching/assignment use 2x and 3x this value
|
|
1642
|
+
var MIN_ROWS = 3; // fewest rows a region must have to be reported as a table
|
|
1643
|
+
var MIN_COLS = 2; // fewest column clusters (and matched columns per row) required
|
|
1644
|
+
var MIN_GAP_FACTOR = 1.5; // a horizontal gap counts as "suspicious" when >= average font size times this factor
|
|
1645
|
+
var MIN_COL_FILL_RATIO = 0.3; // a cluster is kept when it holds at least max(2, floor(rowCount * ratio)) x positions
|
|
1646
|
+
/**
 * Detects borderless tables on one page from text-item alignment alone.
 *
 * Pipeline: group items into baseline rows, keep rows that contain a wide
 * horizontal gap, cluster the x positions of those rows into columns, find
 * runs of rows that line up with the columns, and build one table per run.
 * Each stage bails out with an empty result when it falls below the
 * MIN_ROWS / MIN_COLS thresholds.
 *
 * @param {Array<object>} items - Text items; the helpers read `text`, `x`, `y`, `w`, `h` and `fontSize`.
 * @param {number} pageNum - Page number copied into each result's `bbox.page`.
 * @returns {Array<{table: object, bbox: object, usedItems: Set<object>}>} One entry per detected table.
 */
function detectClusterTables(items, pageNum) {
  // Not enough items to fill even the smallest acceptable grid.
  if (items.length < MIN_ROWS * MIN_COLS) return [];

  const baselineRows = groupByBaseline(items);
  if (baselineRows.length < MIN_ROWS) return [];

  const gappedRows = baselineRows.filter((candidate) => hasSuspiciousGaps(candidate));
  if (gappedRows.length < MIN_ROWS) return [];

  // Columns are learned from the gapped rows only, then applied to every row.
  const columns = extractColumnClusters(gappedRows);
  if (columns.length < MIN_COLS) return [];

  return findTableRegions(baselineRows, columns)
    .map((region) => buildClusterTable(region.rows, columns, pageNum))
    .filter((built) => Boolean(built));
}
|
|
1662
|
+
/**
 * Groups text items into rows that share (approximately) the same baseline.
 *
 * Items are visited from the largest `y` to the smallest. An item joins the
 * current row when its `y` is within `yTol` of the row's anchor, which is the
 * `y` of the first item that opened the row; otherwise it opens a new row.
 *
 * Each row's items are returned in left-to-right order. The initial sort only
 * breaks ties on `x` when two `y` values are exactly equal, so without the
 * per-row sort an item sitting slightly higher but further right would
 * precede its left neighbour, and cell text built from the row would read in
 * the wrong order.
 *
 * @param {Array<object>} items - Text items with numeric `x` and `y`. Not mutated.
 * @param {number} [yTol=Y_TOL] - Maximum |y - anchor| for an item to join a row.
 * @returns {Array<{y: number, items: Array<object>}>} Rows ordered by descending `y`.
 */
function groupByBaseline(items, yTol = Y_TOL) {
  if (items.length === 0) return [];

  const sorted = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
  const rows = [];
  let curItems = [sorted[0]];
  let curY = sorted[0].y;

  const closeRow = () => {
    // Array.prototype.sort is stable, so items with equal x keep their y order.
    curItems.sort((a, b) => a.x - b.x);
    rows.push({ y: curY, items: curItems });
  };

  for (let i = 1; i < sorted.length; i++) {
    const item = sorted[i];
    if (Math.abs(item.y - curY) <= yTol) {
      curItems.push(item);
    } else {
      closeRow();
      curItems = [item];
      curY = item.y;
    }
  }
  closeRow();
  return rows;
}
|
|
1680
|
+
/**
 * Tells whether a row contains at least one horizontal gap wide enough to
 * suggest a column boundary.
 *
 * The threshold is the row's average font size multiplied by MIN_GAP_FACTOR.
 * A gap is measured from the right edge (`x + w`) of one item to the left
 * edge (`x`) of the next item in left-to-right order.
 *
 * @param {{items: Array<object>}} row - Row whose items carry `x`, `w` and `fontSize`.
 * @returns {boolean} True when some adjacent pair is separated by at least the threshold.
 */
function hasSuspiciousGaps(row) {
  const { items } = row;
  if (items.length < 2) return false;

  const leftToRight = [...items].sort((p, q) => p.x - q.x);

  let fontTotal = 0;
  for (const entry of leftToRight) fontTotal += entry.fontSize;
  const threshold = (fontTotal / leftToRight.length) * MIN_GAP_FACTOR;

  return leftToRight.some((entry, k) => {
    if (k === 0) return false;
    const prev = leftToRight[k - 1];
    return entry.x - (prev.x + prev.w) >= threshold;
  });
}
|
|
1691
|
+
/**
 * Derives column positions by clustering the left edges (`x`) of all items in
 * the given rows.
 *
 * The x values are sorted and split wherever two neighbours are more than
 * COL_CLUSTER_TOL apart. Each cluster is reported as its rounded mean x and
 * its size. Clusters with fewer members than
 * max(2, floor(rows.length * MIN_COL_FILL_RATIO)) are dropped.
 *
 * @param {Array<{items: Array<object>}>} rows - Rows whose items carry a numeric `x`.
 * @returns {Array<{x: number, count: number}>} Surviving clusters, ordered by ascending x.
 */
function extractColumnClusters(rows) {
  const xs = rows.flatMap((row) => row.items.map((item) => item.x));
  if (xs.length === 0) return [];
  xs.sort((p, q) => p - q);

  const found = [];
  let runSum = 0;
  let runSize = 0;
  const closeRun = () => {
    found.push({ x: Math.round(runSum / runSize), count: runSize });
    runSum = 0;
    runSize = 0;
  };

  xs.forEach((value, k) => {
    // A jump larger than the tolerance ends the cluster in progress.
    if (k > 0 && value - xs[k - 1] > COL_CLUSTER_TOL) closeRun();
    runSum += value;
    runSize += 1;
  });
  closeRun();

  const required = Math.max(2, Math.floor(rows.length * MIN_COL_FILL_RATIO));
  const kept = found.filter((cluster) => cluster.count >= required);
  kept.sort((p, q) => p.x - q.x);
  return kept;
}
|
|
1711
|
+
/**
 * Splits the page's rows into runs of consecutive rows that line up with the
 * detected columns.
 *
 * Row handling:
 * - A row matching at least MIN_COLS columns extends (or opens) the region.
 * - A single-item row is kept only as an interior row of an already open
 *   region (it is later rendered as a full-width cell). It never opens one.
 * - Any other row closes the region.
 *
 * When a region is closed it is first cut back to its last column-matched
 * row. Without this, single-item lines that follow a table (for example
 * ordinary paragraph lines) would be appended indefinitely, count toward
 * MIN_ROWS, and be absorbed into the table as full-width rows. Trimmed rows
 * are simply not part of any region.
 *
 * @param {Array<{items: Array<object>}>} allRows - Baseline rows in page order.
 * @param {Array<{x: number}>} columns - Column clusters to match against.
 * @returns {Array<{rows: Array<object>}>} Regions with at least MIN_ROWS rows after trimming.
 */
function findTableRegions(allRows, columns) {
  const regions = [];
  let currentRegion = [];
  // Length of currentRegion up to and including its last column-matched row.
  let matchedEnd = 0;

  const closeRegion = () => {
    const trimmed = currentRegion.slice(0, matchedEnd);
    if (trimmed.length >= MIN_ROWS) {
      regions.push({ rows: trimmed });
    }
    currentRegion = [];
    matchedEnd = 0;
  };

  for (const row of allRows) {
    if (countMatchedColumns(row, columns) >= MIN_COLS) {
      currentRegion.push(row);
      matchedEnd = currentRegion.length;
    } else if (row.items.length === 1) {
      // Possible full-width row inside a table; only kept if a matched row follows.
      if (currentRegion.length > 0) {
        currentRegion.push(row);
      }
    } else {
      closeRegion();
    }
  }
  closeRegion();
  return regions;
}
|
|
1734
|
+
/**
 * Counts how many distinct columns are hit by the items of a row.
 *
 * Each item is credited to the first column (in array order) whose x lies
 * within COL_CLUSTER_TOL * 2 of the item's x; items near no column are ignored.
 *
 * @param {{items: Array<object>}} row - Row whose items carry a numeric `x`.
 * @param {Array<{x: number}>} columns - Column clusters.
 * @returns {number} Number of distinct column indices matched.
 */
function countMatchedColumns(row, columns) {
  const reach = COL_CLUSTER_TOL * 2;
  const hit = /* @__PURE__ */ new Set();
  for (const { x } of row.items) {
    const idx = columns.findIndex((col) => Math.abs(x - col.x) <= reach);
    if (idx !== -1) hit.add(idx);
  }
  return hit.size;
}
|
|
1746
|
+
/**
 * Picks the column whose x is closest to the item's x.
 *
 * Ties go to the earlier column. The item is left unassigned when even the
 * closest column is further away than COL_CLUSTER_TOL * 3.
 *
 * @param {{x: number}} item - Text item to place.
 * @param {Array<{x: number}>} columns - Column clusters.
 * @returns {number} Index into `columns`, or -1 when no column is close enough.
 */
function assignToColumn(item, columns) {
  const limit = COL_CLUSTER_TOL * 3;
  let nearest = -1;
  let nearestGap = Infinity;
  columns.forEach((col, idx) => {
    const gap = Math.abs(item.x - col.x);
    // Strict comparison keeps the first column on equal distance.
    if (gap < nearestGap) {
      nearestGap = gap;
      nearest = idx;
    }
  });
  if (nearestGap <= limit) return nearest;
  return -1;
}
|
|
1759
|
+
/**
 * Builds a table from a run of rows and the detected column clusters.
 *
 * A row holding exactly one item becomes a single cell spanning every column
 * (the row's other cells stay as empty placeholders). Otherwise each item is
 * placed in its nearest column via assignToColumn, and items sharing a cell
 * are joined with a space. Items with no column in range are skipped and not
 * marked as used.
 *
 * The candidate is rejected (null) when it is smaller than
 * MIN_ROWS x MIN_COLS, when more than half of its rows are entirely empty,
 * or when any column has no text at all.
 *
 * @param {Array<{items: Array<object>}>} rows - Rows of the region, in order.
 * @param {Array<{x: number}>} columns - Column clusters.
 * @param {number} pageNum - Page number stored in `bbox.page`.
 * @returns {{table: object, bbox: object, usedItems: Set<object>} | null}
 *   The table, the bounding box over every item of the rows, and the set of
 *   items placed into cells; or null when rejected.
 */
function buildClusterTable(rows, columns, pageNum) {
  const colCount = columns.length;
  const rowCount = rows.length;
  if (rowCount < MIN_ROWS || colCount < MIN_COLS) return null;

  // Start from a full grid of empty 1x1 cells.
  const cells = [];
  for (let r = 0; r < rowCount; r++) {
    const line = [];
    for (let c = 0; c < colCount; c++) {
      line.push({ text: "", colSpan: 1, rowSpan: 1 });
    }
    cells.push(line);
  }

  const usedItems = /* @__PURE__ */ new Set();
  rows.forEach((row, r) => {
    if (row.items.length === 1 && colCount > 1) {
      const only = row.items[0];
      cells[r][0] = { text: only.text, colSpan: colCount, rowSpan: 1 };
      usedItems.add(only);
      return;
    }
    for (const item of row.items) {
      const target = assignToColumn(item, columns);
      if (target < 0) continue;
      const cell = cells[r][target];
      cell.text = cell.text ? cell.text + " " + item.text : item.text;
      usedItems.add(item);
    }
  });

  // Mostly blank grids are treated as false positives.
  const blankRowCount = cells.filter((line) => line.every((cell) => cell.text === "")).length;
  if (blankRowCount > rowCount * 0.5) return null;

  // Every column must carry text in at least one row.
  for (let c = 0; c < colCount; c++) {
    if (!cells.some((line) => line[c].text !== "")) return null;
  }

  // Bounding box spans all items of the rows, including ones left out of cells.
  let left = Infinity;
  let bottom = Infinity;
  let right = -Infinity;
  let top = -Infinity;
  for (const row of rows) {
    for (const item of row.items) {
      if (item.x < left) left = item.x;
      if (item.y < bottom) bottom = item.y;
      if (item.x + item.w > right) right = item.x + item.w;
      // Fall back to the font size when the item reports no usable height.
      const height = item.h > 0 ? item.h : item.fontSize;
      if (item.y + height > top) top = item.y + height;
    }
  }

  return {
    table: {
      rows: rowCount,
      cols: colCount,
      cells,
      hasHeader: rowCount > 1
    },
    bbox: { page: pageNum, x: left, y: bottom, width: right - left, height: top - bottom },
    usedItems
  };
}
|
|
1813
|
+
|
|
1628
1814
|
// src/pdf/polyfill.ts
|
|
1629
1815
|
import * as pdfjsWorker from "pdfjs-dist/legacy/build/pdf.worker.mjs";
|
|
1630
1816
|
var g = globalThis;
|
|
@@ -1916,6 +2102,8 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
1916
2102
|
cells: irGrid,
|
|
1917
2103
|
hasHeader: numRows > 1
|
|
1918
2104
|
};
|
|
2105
|
+
const hasContent = irGrid.some((row) => row.some((cell) => cell.text.trim() !== ""));
|
|
2106
|
+
if (!hasContent) continue;
|
|
1919
2107
|
blocks.push({
|
|
1920
2108
|
type: "table",
|
|
1921
2109
|
table: irTable,
|
|
@@ -1953,19 +2141,28 @@ function extractPageBlocksFallback(items, pageNum) {
|
|
|
1953
2141
|
const bbox = computeBBox(items, pageNum);
|
|
1954
2142
|
blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(items) });
|
|
1955
2143
|
} else {
|
|
1956
|
-
const
|
|
1957
|
-
|
|
1958
|
-
|
|
1959
|
-
|
|
1960
|
-
|
|
1961
|
-
|
|
1962
|
-
|
|
1963
|
-
|
|
1964
|
-
|
|
1965
|
-
|
|
1966
|
-
|
|
1967
|
-
|
|
1968
|
-
|
|
2144
|
+
const clusterItems = items.map((i) => ({
|
|
2145
|
+
text: i.text,
|
|
2146
|
+
x: i.x,
|
|
2147
|
+
y: i.y,
|
|
2148
|
+
w: i.w,
|
|
2149
|
+
h: i.h,
|
|
2150
|
+
fontSize: i.fontSize,
|
|
2151
|
+
fontName: i.fontName
|
|
2152
|
+
}));
|
|
2153
|
+
const clusterResults = detectClusterTables(clusterItems, pageNum);
|
|
2154
|
+
if (clusterResults.length > 0) {
|
|
2155
|
+
const usedIndices = /* @__PURE__ */ new Set();
|
|
2156
|
+
for (const cr of clusterResults) {
|
|
2157
|
+
for (const ci of cr.usedItems) {
|
|
2158
|
+
const idx = clusterItems.indexOf(ci);
|
|
2159
|
+
if (idx >= 0) usedIndices.add(idx);
|
|
2160
|
+
}
|
|
2161
|
+
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
2162
|
+
}
|
|
2163
|
+
const remaining = items.filter((_, idx) => !usedIndices.has(idx));
|
|
2164
|
+
if (remaining.length > 0) {
|
|
2165
|
+
const yLines = groupByY(remaining);
|
|
1969
2166
|
for (const line of yLines) {
|
|
1970
2167
|
const text = mergeLineSimple(line);
|
|
1971
2168
|
if (!text.trim()) continue;
|
|
@@ -1973,9 +2170,36 @@ function extractPageBlocksFallback(items, pageNum) {
|
|
|
1973
2170
|
blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style: dominantStyle(line) });
|
|
1974
2171
|
}
|
|
1975
2172
|
}
|
|
2173
|
+
blocks.sort((a, b) => {
|
|
2174
|
+
const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
|
|
2175
|
+
const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
|
|
2176
|
+
return by - ay;
|
|
2177
|
+
});
|
|
2178
|
+
} else {
|
|
2179
|
+
const allY = items.map((i) => i.y);
|
|
2180
|
+
const pageHeight = Math.max(...allY) - Math.min(...allY);
|
|
2181
|
+
const gapThreshold = Math.max(15, pageHeight * 0.03);
|
|
2182
|
+
const orderedGroups = xyCutOrder(items, gapThreshold);
|
|
2183
|
+
for (const group of orderedGroups) {
|
|
2184
|
+
if (group.length === 0) continue;
|
|
2185
|
+
const yLines = groupByY(group);
|
|
2186
|
+
const groupColumns = detectColumns(yLines);
|
|
2187
|
+
if (groupColumns && groupColumns.length >= 3) {
|
|
2188
|
+
const tableText = extractWithColumns(yLines, groupColumns);
|
|
2189
|
+
const bbox = computeBBox(group, pageNum);
|
|
2190
|
+
blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(group) });
|
|
2191
|
+
} else {
|
|
2192
|
+
for (const line of yLines) {
|
|
2193
|
+
const text = mergeLineSimple(line);
|
|
2194
|
+
if (!text.trim()) continue;
|
|
2195
|
+
const bbox = computeBBox(line, pageNum);
|
|
2196
|
+
blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style: dominantStyle(line) });
|
|
2197
|
+
}
|
|
2198
|
+
}
|
|
2199
|
+
}
|
|
1976
2200
|
}
|
|
1977
2201
|
}
|
|
1978
|
-
return blocks;
|
|
2202
|
+
return detectSpecialKoreanTables(blocks);
|
|
1979
2203
|
}
|
|
1980
2204
|
function computeBBox(items, pageNum) {
|
|
1981
2205
|
let minX = Infinity, minY = Infinity, maxX = -Infinity, maxY = -Infinity;
|
|
@@ -2241,8 +2465,10 @@ function mergeLineSimple(items) {
|
|
|
2241
2465
|
let result = sorted[0].text;
|
|
2242
2466
|
for (let i = 1; i < sorted.length; i++) {
|
|
2243
2467
|
const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
|
|
2468
|
+
const avgFs = (sorted[i].fontSize + sorted[i - 1].fontSize) / 2;
|
|
2244
2469
|
if (gap > 15) result += " ";
|
|
2245
|
-
else if (gap
|
|
2470
|
+
else if (gap < avgFs * 0.3 && /[가-힣]$/.test(result) && /^[가-힣]/.test(sorted[i].text)) {
|
|
2471
|
+
} else if (gap > 3) result += " ";
|
|
2246
2472
|
result += sorted[i].text;
|
|
2247
2473
|
}
|
|
2248
2474
|
return result;
|
|
@@ -2280,6 +2506,90 @@ function detectListBlocks(blocks) {
|
|
|
2280
2506
|
}
|
|
2281
2507
|
return result;
|
|
2282
2508
|
}
|
|
2509
|
+
// Matches a paragraph that starts with one of the listed Korean label words,
// optionally wrapped in parentheses, and followed by a colon or whitespace.
var KOREAN_TABLE_HEADER_RE = /^\(?(구분|항목|종류|분류|유형|대상|내용|기간|금액|비율|방법|절차|요건|조건|근거|목적|범위|기준)\)?[:\s]/;

/**
 * Converts runs of consecutive "key: value" style paragraphs into 2-column
 * table blocks.
 *
 * A run starts at a paragraph whose trimmed text matches
 * KOREAN_TABLE_HEADER_RE. It continues with further such paragraphs, or with
 * paragraphs of the form "<key>:<value>" that contain no parentheses and
 * whose key is 2 to 8 Hangul syllables. Any other block ends the run. Runs of
 * fewer than two lines are emitted unchanged as their original blocks.
 *
 * Splitting: at the first ":" when the text contains one, otherwise at the
 * first whitespace character. A line with an empty value becomes a key cell
 * spanning both columns.
 *
 * The table block's `pageNumber` comes from the first merged paragraph. Its
 * `bbox` is the union of the boxes of all merged paragraphs, because the
 * table replaces all of them, not just the first line.
 *
 * @param {Array<object>} blocks - Blocks in reading order. Not mutated.
 * @returns {Array<object>} New block list with qualifying runs replaced by table blocks.
 */
function detectSpecialKoreanTables(blocks) {
  const result = [];
  let kvLines = [];

  // Smallest box covering every merged paragraph that has a bbox.
  const unionBBox = (entries) => {
    const boxes = entries.map((kv) => kv.block.bbox).filter(Boolean);
    if (boxes.length === 0) return undefined;
    let minX = Infinity;
    let minY = Infinity;
    let maxX = -Infinity;
    let maxY = -Infinity;
    for (const box of boxes) {
      if (box.x < minX) minX = box.x;
      if (box.y < minY) minY = box.y;
      if (box.x + box.width > maxX) maxX = box.x + box.width;
      if (box.y + box.height > maxY) maxY = box.y + box.height;
    }
    // Spread the first box so fields such as `page` are carried over.
    return { ...boxes[0], x: minX, y: minY, width: maxX - minX, height: maxY - minY };
  };

  const flushKvTable = () => {
    if (kvLines.length < 2) {
      // A lone key/value line is not a table; give the paragraph back untouched.
      for (const kv of kvLines) result.push(kv.block);
      kvLines = [];
      return;
    }
    const cells = kvLines.map((kv) => {
      if (kv.value) {
        return [
          { text: kv.key, colSpan: 1, rowSpan: 1 },
          { text: kv.value, colSpan: 1, rowSpan: 1 }
        ];
      }
      return [
        { text: kv.key, colSpan: 2, rowSpan: 1 },
        { text: "", colSpan: 1, rowSpan: 1 }
      ];
    });
    result.push({
      type: "table",
      table: {
        rows: cells.length,
        cols: 2,
        cells,
        hasHeader: true
      },
      pageNumber: kvLines[0].block.pageNumber,
      bbox: unionBBox(kvLines)
    });
    kvLines = [];
  };

  for (const block of blocks) {
    if (block.type !== "paragraph" || !block.text) {
      flushKvTable();
      result.push(block);
      continue;
    }

    const text = block.text.trim();

    if (KOREAN_TABLE_HEADER_RE.test(text)) {
      const colonIdx = text.indexOf(":");
      const splitIdx = colonIdx >= 0 ? colonIdx : text.search(/\s/);
      if (splitIdx > 0) {
        kvLines.push({
          key: text.slice(0, splitIdx).trim(),
          value: text.slice(splitIdx + 1).trim(),
          block
        });
      } else {
        // Defensive fallback: the pattern guarantees a delimiter after the label.
        kvLines.push({ key: text, value: "", block });
      }
      continue;
    }

    // Inside a run, accept other short all-Hangul "key: value" lines as well.
    if (kvLines.length > 0 && text.includes(":") && !text.includes("(") && !text.includes(")")) {
      const colonIdx = text.indexOf(":");
      const key = text.slice(0, colonIdx).trim();
      if (/^[가-힣]+$/.test(key) && key.length >= 2 && key.length <= 8) {
        kvLines.push({
          key,
          value: text.slice(colonIdx + 1).trim(),
          block
        });
        continue;
      }
    }

    flushKvTable();
    result.push(block);
  }

  flushKvTable();
  return result;
}
|
|
2283
2593
|
function mergeKoreanLines(text) {
|
|
2284
2594
|
if (!text) return "";
|
|
2285
2595
|
const lines = text.split("\n");
|
|
@@ -2654,4 +2964,4 @@ export {
|
|
|
2654
2964
|
extractFormFields,
|
|
2655
2965
|
parse
|
|
2656
2966
|
};
|
|
2657
|
-
//# sourceMappingURL=chunk-5SZWGBNL.js.map
|
|
2967
|
+
//# sourceMappingURL=chunk-TFGOV2ML.js.map
|