kordoc 1.5.0 → 1.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -14,7 +14,23 @@
14
14
 
15
15
  ---
16
16
 
17
- ## What's New in v1.5.0
17
+ ## What's New in v1.6.1
18
+
19
+ - **HWP5 Table Cell Offset Fix** — Fixed critical 2-byte offset misalignment in LIST_HEADER parsing. Row address was incorrectly read as colSpan, causing 3-column tables to explode into 6+ columns with misaligned content. Tables now use colAddr/rowAddr-based direct placement for accurate cell positioning.
20
+ - **HWP5 TAB Control Character Fix** — TAB (0x0009) inline control's 14-byte extension data was not skipped, producing garbage characters (`࣐Ā`) after every tab in the output. Fixed by adding the required 14-byte skip.
21
+
22
+ <details>
23
+ <summary>v1.6.0 features</summary>
24
+
25
+ - **Cluster-Based Table Detection (PDF)** — Detects borderless tables by analyzing text alignment patterns. Baseline grouping + X-coordinate clustering identifies 2+ column tables that line-based detection misses. Sort-and-split clustering for order-independent results.
26
+ - **Korean Special Table Detection** — Automatically detects `구분/항목/종류`-style key-value patterns common in Korean government documents and converts them to structured 2-column tables.
27
+ - **Korean Word-Break Recovery** — Improved merging of broken Korean words in PDF table cells. Handles character-level PDF rendering (micro-gaps between Hangul characters) and cell line-break artifacts up to 8 characters.
28
+ - **Empty Table Filtering** — Tables with all-empty cells (from line detection of decorative borders) are now automatically removed.
29
+
30
+ </details>
31
+
32
+ <details>
33
+ <summary>v1.5.0 features</summary>
18
34
 
19
35
  - **Line-Based Table Detection (PDF)** — Ported from OpenDataLoader. Extracts horizontal/vertical lines from PDF graphics commands, builds grid via intersection vertices, maps text to cells by bbox overlap. Proper colspan/rowspan detection. Falls back to heuristic for line-free PDFs.
20
36
  - **IRBlock v2** — 6 block types: `heading`, `paragraph`, `table`, `list`, `image`, `separator`. New fields: `bbox`, `style`, `pageNumber`, `level`, `href`, `footnoteText`.
@@ -25,6 +41,8 @@
25
41
  - **List Detection** — Numbered paragraphs after tables auto-converted to ordered list blocks.
26
42
  - **MCP Server** — Now returns `outline` and `warnings` in parse_document responses.
27
43
 
44
+ </details>
45
+
28
46
  <details>
29
47
  <summary>v1.4.x features</summary>
30
48
 
@@ -205,7 +223,7 @@ import type {
205
223
  | Format | Engine | Features |
206
224
  |--------|--------|----------|
207
225
  | **HWPX** (한컴 2020+) | ZIP + XML DOM | Manifest, nested tables, merged cells, broken ZIP recovery |
208
- | **HWP 5.x** (한컴 Legacy) | OLE2 + CFB | 21 control chars, zlib decompression, DRM detection |
226
+ | **HWP 5.x** (한컴 Legacy) | OLE2 + CFB | 21 control chars, zlib decompression, DRM detection, colAddr-based table cell placement |
209
227
  | **PDF** | pdfjs-dist | Line-based table detection, XY-Cut reading order, heading detection, hidden text filter, OCR |
210
228
 
211
229
  ## Security
@@ -185,7 +185,7 @@ function tableToMarkdown(table) {
185
185
  }
186
186
 
187
187
  // src/utils.ts
188
- var VERSION = true ? "1.5.0" : "0.0.0-dev";
188
+ var VERSION = true ? "1.6.1" : "0.0.0-dev";
189
189
  function toArrayBuffer(buf) {
190
190
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
191
191
  return buf.buffer;
@@ -923,6 +923,7 @@ function extractText(data) {
923
923
  break;
924
924
  case CHAR_TAB:
925
925
  result += " ";
926
+ if (i + 14 <= data.length) i += 14;
926
927
  break;
927
928
  case CHAR_HYPHEN:
928
929
  result += "-";
@@ -1237,9 +1238,13 @@ function parseCellBlock(records, startIdx, tableLevel) {
1237
1238
  const texts = [];
1238
1239
  let colSpan = 1;
1239
1240
  let rowSpan = 1;
1240
- if (rec.data.length >= 14) {
1241
- const cs = rec.data.readUInt16LE(10);
1242
- const rs = rec.data.readUInt16LE(12);
1241
+ let colAddr;
1242
+ let rowAddr;
1243
+ if (rec.data.length >= 16) {
1244
+ colAddr = rec.data.readUInt16LE(8);
1245
+ rowAddr = rec.data.readUInt16LE(10);
1246
+ const cs = rec.data.readUInt16LE(12);
1247
+ const rs = rec.data.readUInt16LE(14);
1243
1248
  if (cs > 0) colSpan = Math.min(cs, MAX_COLS);
1244
1249
  if (rs > 0) rowSpan = Math.min(rs, MAX_ROWS);
1245
1250
  }
@@ -1254,15 +1259,16 @@ function parseCellBlock(records, startIdx, tableLevel) {
1254
1259
  }
1255
1260
  i++;
1256
1261
  }
1257
- return { cell: { text: texts.join("\n"), colSpan, rowSpan }, nextIdx: i };
1262
+ return { cell: { text: texts.join("\n"), colSpan, rowSpan, colAddr, rowAddr }, nextIdx: i };
1258
1263
  }
1259
1264
  function arrangeCells(rows, cols, cells) {
1260
1265
  const grid = Array.from({ length: rows }, () => Array(cols).fill(null));
1261
- let cellIdx = 0;
1262
- for (let r = 0; r < rows && cellIdx < cells.length; r++) {
1263
- for (let c = 0; c < cols && cellIdx < cells.length; c++) {
1264
- if (grid[r][c] !== null) continue;
1265
- const cell = cells[cellIdx++];
1266
+ const hasAddr = cells.some((c) => c.colAddr !== void 0 && c.rowAddr !== void 0);
1267
+ if (hasAddr) {
1268
+ for (const cell of cells) {
1269
+ const r = cell.rowAddr ?? 0;
1270
+ const c = cell.colAddr ?? 0;
1271
+ if (r >= rows || c >= cols) continue;
1266
1272
  grid[r][c] = cell;
1267
1273
  for (let dr = 0; dr < cell.rowSpan; dr++) {
1268
1274
  for (let dc = 0; dc < cell.colSpan; dc++) {
@@ -1272,6 +1278,22 @@ function arrangeCells(rows, cols, cells) {
1272
1278
  }
1273
1279
  }
1274
1280
  }
1281
+ } else {
1282
+ let cellIdx = 0;
1283
+ for (let r = 0; r < rows && cellIdx < cells.length; r++) {
1284
+ for (let c = 0; c < cols && cellIdx < cells.length; c++) {
1285
+ if (grid[r][c] !== null) continue;
1286
+ const cell = cells[cellIdx++];
1287
+ grid[r][c] = cell;
1288
+ for (let dr = 0; dr < cell.rowSpan; dr++) {
1289
+ for (let dc = 0; dc < cell.colSpan; dc++) {
1290
+ if (dr === 0 && dc === 0) continue;
1291
+ if (r + dr < rows && c + dc < cols)
1292
+ grid[r + dr][c + dc] = { text: "", colSpan: 1, rowSpan: 1 };
1293
+ }
1294
+ }
1295
+ }
1296
+ }
1275
1297
  }
1276
1298
  return grid.map((row) => row.map((c) => c || { text: "", colSpan: 1, rowSpan: 1 }));
1277
1299
  }
@@ -1609,14 +1631,25 @@ function cellTextToString(items) {
1609
1631
  lines.push(curLine);
1610
1632
  const textLines = lines.map((line) => {
1611
1633
  const s = line.sort((a, b) => a.x - b.x);
1612
- return s.map((i) => i.text).join(" ");
1634
+ if (s.length === 1) return s[0].text;
1635
+ let result = s[0].text;
1636
+ for (let j = 1; j < s.length; j++) {
1637
+ const gap = s[j].x - (s[j - 1].x + s[j - 1].w);
1638
+ const avgFs = (s[j].fontSize + s[j - 1].fontSize) / 2;
1639
+ if (gap < avgFs * 0.3 && /[가-힣]$/.test(result) && /^[가-힣]/.test(s[j].text)) {
1640
+ result += s[j].text;
1641
+ } else {
1642
+ result += " " + s[j].text;
1643
+ }
1644
+ }
1645
+ return result;
1613
1646
  });
1614
1647
  if (textLines.length <= 1) return textLines[0] || "";
1615
1648
  const merged = [textLines[0]];
1616
1649
  for (let i = 1; i < textLines.length; i++) {
1617
1650
  const prev = merged[merged.length - 1];
1618
1651
  const curr = textLines[i];
1619
- if (/[가-힣]$/.test(prev) && /^[가-힣]+$/.test(curr) && curr.length <= 5 && !curr.includes(" ")) {
1652
+ if (/[가-힣]$/.test(prev) && /^[가-힣]+$/.test(curr) && curr.length <= 8 && !curr.includes(" ")) {
1620
1653
  merged[merged.length - 1] = prev + curr;
1621
1654
  } else {
1622
1655
  merged.push(curr);
@@ -1625,6 +1658,181 @@ function cellTextToString(items) {
1625
1658
  return merged.join("\n");
1626
1659
  }
1627
1660
 
1661
+ // src/pdf/cluster-detector.ts
1662
/** Largest |y| difference from a row's anchor item for another item to join that baseline row. */
var Y_TOL = 3;

/**
 * Largest gap between neighbouring sorted x values inside one column cluster.
 * Column matching and cell assignment use 2x and 3x this value respectively.
 */
var COL_CLUSTER_TOL = 15;

/** Fewest rows a region must have before it is reported as a table. */
var MIN_ROWS = 3;

/** Fewest columns a row must hit (and a table must have) to count as tabular. */
var MIN_COLS = 2;

/** A horizontal gap is "suspicious" once it reaches this multiple of the row's average font size. */
var MIN_GAP_FACTOR = 1.5;

/** Share of the sampled rows a column cluster must appear in (the computed count is never below 2). */
var MIN_COL_FILL_RATIO = 0.3;
1668
/**
 * Detects borderless tables on a page from the alignment of its text items.
 *
 * Pipeline: group items into baseline rows, keep the rows that contain a wide
 * horizontal gap, derive column positions from those rows, split the page into
 * candidate regions and build one table per region. Every stage bails out with
 * an empty result as soon as there is too little material for a table.
 *
 * @param {Array<object>} items - Text items of one page.
 * @param {number} pageNum - Page number forwarded to buildClusterTable.
 * @returns {Array<object>} The non-null results of buildClusterTable, in region order.
 */
function detectClusterTables(items, pageNum) {
  const minItems = MIN_ROWS * MIN_COLS;
  if (items.length < minItems) {
    return [];
  }

  const baselineRows = groupByBaseline(items);
  if (baselineRows.length < MIN_ROWS) {
    return [];
  }

  // Only rows with a wide gap are used to learn where the columns are.
  const gappedRows = baselineRows.filter((row) => hasSuspiciousGaps(row));
  if (gappedRows.length < MIN_ROWS) {
    return [];
  }

  const columns = extractColumnClusters(gappedRows);
  if (columns.length < MIN_COLS) {
    return [];
  }

  // Regions are searched over ALL rows, not just the gapped ones.
  return findTableRegions(baselineRows, columns)
    .map((region) => buildClusterTable(region.rows, columns, pageNum))
    .filter(Boolean);
}
1684
/**
 * Groups text items into rows that share (approximately) the same baseline.
 *
 * Items are visited in descending y, then ascending x. An item joins the row
 * currently being built when its y is within Y_TOL of that row's anchor (the
 * y of the first item placed in the row); otherwise it opens a new row.
 * The input array is not modified.
 *
 * @param {Array<{x: number, y: number}>} items - Items to group.
 * @returns {Array<{y: number, items: Array<object>}>} Rows in visiting order.
 */
function groupByBaseline(items) {
  const ordered = items.slice().sort((p, q) => q.y - p.y || p.x - q.x);
  const rows = [];

  for (const item of ordered) {
    const open = rows[rows.length - 1];
    const joinsOpenRow = open !== undefined && Math.abs(item.y - open.y) <= Y_TOL;
    if (joinsOpenRow) {
      open.items.push(item);
    } else {
      // The first item of a row fixes the row's anchor y.
      rows.push({ y: item.y, items: [item] });
    }
  }

  return rows;
}
1702
/**
 * Tells whether a row contains at least one horizontal gap wide enough to
 * suggest a column boundary.
 *
 * The threshold is the row's average font size times MIN_GAP_FACTOR; a gap is
 * measured from the right edge (x + w) of one item to the x of the next item
 * after sorting by x.
 *
 * @param {{items: Array<{x: number, w: number, fontSize: number}>}} row - Baseline row.
 * @returns {boolean} True when some gap reaches the threshold; false for rows with fewer than two items.
 */
function hasSuspiciousGaps(row) {
  const count = row.items.length;
  if (count < 2) {
    return false;
  }

  const byX = row.items.slice().sort((left, right) => left.x - right.x);

  let fontTotal = 0;
  for (const item of byX) {
    fontTotal += item.fontSize;
  }
  const threshold = (fontTotal / count) * MIN_GAP_FACTOR;

  return byX.some((item, idx) => {
    if (idx === 0) return false;
    const prev = byX[idx - 1];
    return item.x - (prev.x + prev.w) >= threshold;
  });
}
1713
/**
 * Derives column x positions from the left edges of all items in the given rows.
 *
 * All x values are sorted and split wherever two neighbours are more than
 * COL_CLUSTER_TOL apart. Each resulting cluster is reported with its rounded
 * mean x and its size. Clusters smaller than
 * max(2, floor(rows.length * MIN_COL_FILL_RATIO)) are discarded.
 *
 * @param {Array<{items: Array<{x: number}>}>} rows - Rows to sample.
 * @returns {Array<{x: number, count: number}>} Surviving clusters, ascending by x.
 */
function extractColumnClusters(rows) {
  const xs = rows
    .flatMap((row) => row.items.map((item) => item.x))
    .sort((a, b) => a - b);
  if (xs.length === 0) {
    return [];
  }

  const clusters = [];
  let bucket = [xs[0]];
  const closeBucket = () => {
    const total = bucket.reduce((sum, value) => sum + value, 0);
    clusters.push({ x: Math.round(total / bucket.length), count: bucket.length });
  };

  for (let k = 1; k < xs.length; k++) {
    // A jump larger than the tolerance ends the current cluster.
    if (xs[k] - xs[k - 1] > COL_CLUSTER_TOL) {
      closeBucket();
      bucket = [];
    }
    bucket.push(xs[k]);
  }
  closeBucket();

  const minCount = Math.max(2, Math.floor(rows.length * MIN_COL_FILL_RATIO));
  return clusters
    .filter((cluster) => cluster.count >= minCount)
    .sort((a, b) => a.x - b.x);
}
1733
/**
 * Splits the page's rows into runs that look like tables.
 *
 * A row that hits at least MIN_COLS columns extends (or opens) the current
 * region. A single-item row is tolerated inside an already open region. Any
 * other row closes the region.
 *
 * Single-item rows are only kept when a column-matching row follows them in
 * the same region. Trailing ones are trimmed before the region is validated,
 * so paragraph lines after a table are not absorbed as full-width rows and
 * cannot lift a too-short region over MIN_ROWS.
 *
 * @param {Array<{items: Array<object>}>} allRows - All baseline rows of the page, in order.
 * @param {Array<{x: number}>} columns - Column clusters passed on to countMatchedColumns.
 * @returns {Array<{rows: Array<object>}>} Regions with at least MIN_ROWS rows.
 */
function findTableRegions(allRows, columns) {
  const regions = [];
  let currentRegion = [];
  // Length of currentRegion up to and including its last column-matching row.
  let matchedLength = 0;

  const closeRegion = () => {
    const trimmed = currentRegion.slice(0, matchedLength);
    if (trimmed.length >= MIN_ROWS) {
      regions.push({ rows: trimmed });
    }
    currentRegion = [];
    matchedLength = 0;
  };

  for (const row of allRows) {
    if (countMatchedColumns(row, columns) >= MIN_COLS) {
      currentRegion.push(row);
      matchedLength = currentRegion.length;
    } else if (row.items.length === 1) {
      // Tentatively keep it; it only survives if another table row follows.
      if (currentRegion.length > 0) {
        currentRegion.push(row);
      }
    } else {
      closeRegion();
    }
  }
  closeRegion();

  return regions;
}
1756
/**
 * Counts how many distinct columns are hit by the items of a row.
 *
 * Each item is credited to the first column (in array order) whose x lies
 * within 2 * COL_CLUSTER_TOL of the item's x; items with no such column are
 * ignored.
 *
 * @param {{items: Array<{x: number}>}} row - Baseline row.
 * @param {Array<{x: number}>} columns - Column clusters.
 * @returns {number} Number of distinct column indices hit.
 */
function countMatchedColumns(row, columns) {
  const reach = COL_CLUSTER_TOL * 2;
  const hit = new Set();

  for (const { x } of row.items) {
    const idx = columns.findIndex((col) => Math.abs(x - col.x) <= reach);
    if (idx !== -1) {
      hit.add(idx);
    }
  }

  return hit.size;
}
1768
/**
 * Picks the column whose x is closest to the item's x.
 *
 * On a tie the earlier column wins. The item is left unassigned when even the
 * closest column is more than 3 * COL_CLUSTER_TOL away.
 *
 * @param {{x: number}} item - Text item.
 * @param {Array<{x: number}>} columns - Column clusters.
 * @returns {number} Index into columns, or -1 when no column is close enough.
 */
function assignToColumn(item, columns) {
  const nearest = columns.reduce(
    (best, col, idx) => {
      const dist = Math.abs(item.x - col.x);
      // Strict comparison keeps the first column on equal distance.
      return dist < best.dist ? { idx, dist } : best;
    },
    { idx: -1, dist: Infinity },
  );

  return nearest.dist > COL_CLUSTER_TOL * 3 ? -1 : nearest.idx;
}
1781
/**
 * Builds a table (plus bounding box and the set of consumed items) from a run
 * of baseline rows and the detected column positions.
 *
 * A row holding exactly one item becomes a cell spanning every column. In all
 * other rows each item is dropped into the column chosen by assignToColumn
 * (items with no column are skipped and stay out of usedItems). Fragments that
 * land in the same cell are joined with a single space.
 *
 * Items of a row are processed left to right. Rows are not guaranteed to be
 * x-ordered (baseline grouping orders by y first), so without this, fragments
 * with slightly different baselines would be joined out of reading order.
 *
 * The candidate is rejected (null) when it has fewer than MIN_ROWS rows or
 * MIN_COLS columns, when more than half of its rows are completely empty, or
 * when any column has no text at all.
 *
 * @param {Array<{items: Array<object>}>} rows - Rows of one region; items expose text, x, y, w, h, fontSize.
 * @param {Array<{x: number}>} columns - Column clusters.
 * @param {number} pageNum - Page number stored in the bounding box.
 * @returns {{table: object, bbox: object, usedItems: Set<object>} | null} Table result or null.
 */
function buildClusterTable(rows, columns, pageNum) {
  const numCols = columns.length;
  const numRows = rows.length;
  if (numRows < MIN_ROWS || numCols < MIN_COLS) return null;

  const cells = Array.from(
    { length: numRows },
    () => Array.from({ length: numCols }, () => ({ text: "", colSpan: 1, rowSpan: 1 }))
  );
  const usedItems = /* @__PURE__ */ new Set();

  for (let r = 0; r < numRows; r++) {
    const row = rows[r];
    if (row.items.length === 1 && numCols > 1) {
      // A lone item is treated as a full-width row.
      cells[r][0] = { text: row.items[0].text, colSpan: numCols, rowSpan: 1 };
      usedItems.add(row.items[0]);
      continue;
    }
    // Copy before sorting so the caller's row is left untouched.
    const leftToRight = [...row.items].sort((a, b) => a.x - b.x);
    for (const item of leftToRight) {
      const col = assignToColumn(item, columns);
      if (col < 0) continue;
      const existing = cells[r][col].text;
      cells[r][col].text = existing ? existing + " " + item.text : item.text;
      usedItems.add(item);
    }
  }

  let emptyRows = 0;
  for (const row of cells) {
    if (row.every((c) => c.text === "")) emptyRows++;
  }
  if (emptyRows > numRows * 0.5) return null;

  for (let c = 0; c < numCols; c++) {
    const hasValue = cells.some((row) => row[c].text !== "");
    if (!hasValue) return null;
  }

  const irTable = {
    rows: numRows,
    cols: numCols,
    cells,
    hasHeader: numRows > 1
  };

  // The bounding box covers every item of the region, including unassigned ones.
  let minX = Infinity;
  let minY = Infinity;
  let maxX = -Infinity;
  let maxY = -Infinity;
  for (const item of rows.flatMap((row) => row.items)) {
    if (item.x < minX) minX = item.x;
    if (item.y < minY) minY = item.y;
    if (item.x + item.w > maxX) maxX = item.x + item.w;
    // Fall back to the font size when the item reports no usable height.
    const height = item.h > 0 ? item.h : item.fontSize;
    if (item.y + height > maxY) maxY = item.y + height;
  }

  return {
    table: irTable,
    bbox: { page: pageNum, x: minX, y: minY, width: maxX - minX, height: maxY - minY },
    usedItems
  };
}
1835
+
1628
1836
  // src/pdf/polyfill.ts
1629
1837
  import * as pdfjsWorker from "pdfjs-dist/legacy/build/pdf.worker.mjs";
1630
1838
  var g = globalThis;
@@ -1916,6 +2124,8 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
1916
2124
  cells: irGrid,
1917
2125
  hasHeader: numRows > 1
1918
2126
  };
2127
+ const hasContent = irGrid.some((row) => row.some((cell) => cell.text.trim() !== ""));
2128
+ if (!hasContent) continue;
1919
2129
  blocks.push({
1920
2130
  type: "table",
1921
2131
  table: irTable,
@@ -1953,19 +2163,28 @@ function extractPageBlocksFallback(items, pageNum) {
1953
2163
  const bbox = computeBBox(items, pageNum);
1954
2164
  blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(items) });
1955
2165
  } else {
1956
- const allY = items.map((i) => i.y);
1957
- const pageHeight = Math.max(...allY) - Math.min(...allY);
1958
- const gapThreshold = Math.max(15, pageHeight * 0.03);
1959
- const orderedGroups = xyCutOrder(items, gapThreshold);
1960
- for (const group of orderedGroups) {
1961
- if (group.length === 0) continue;
1962
- const yLines = groupByY(group);
1963
- const groupColumns = detectColumns(yLines);
1964
- if (groupColumns && groupColumns.length >= 3) {
1965
- const tableText = extractWithColumns(yLines, groupColumns);
1966
- const bbox = computeBBox(group, pageNum);
1967
- blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(group) });
1968
- } else {
2166
+ const clusterItems = items.map((i) => ({
2167
+ text: i.text,
2168
+ x: i.x,
2169
+ y: i.y,
2170
+ w: i.w,
2171
+ h: i.h,
2172
+ fontSize: i.fontSize,
2173
+ fontName: i.fontName
2174
+ }));
2175
+ const clusterResults = detectClusterTables(clusterItems, pageNum);
2176
+ if (clusterResults.length > 0) {
2177
+ const usedIndices = /* @__PURE__ */ new Set();
2178
+ for (const cr of clusterResults) {
2179
+ for (const ci of cr.usedItems) {
2180
+ const idx = clusterItems.indexOf(ci);
2181
+ if (idx >= 0) usedIndices.add(idx);
2182
+ }
2183
+ blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
2184
+ }
2185
+ const remaining = items.filter((_, idx) => !usedIndices.has(idx));
2186
+ if (remaining.length > 0) {
2187
+ const yLines = groupByY(remaining);
1969
2188
  for (const line of yLines) {
1970
2189
  const text = mergeLineSimple(line);
1971
2190
  if (!text.trim()) continue;
@@ -1973,9 +2192,36 @@ function extractPageBlocksFallback(items, pageNum) {
1973
2192
  blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style: dominantStyle(line) });
1974
2193
  }
1975
2194
  }
2195
+ blocks.sort((a, b) => {
2196
+ const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
2197
+ const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
2198
+ return by - ay;
2199
+ });
2200
+ } else {
2201
+ const allY = items.map((i) => i.y);
2202
+ const pageHeight = Math.max(...allY) - Math.min(...allY);
2203
+ const gapThreshold = Math.max(15, pageHeight * 0.03);
2204
+ const orderedGroups = xyCutOrder(items, gapThreshold);
2205
+ for (const group of orderedGroups) {
2206
+ if (group.length === 0) continue;
2207
+ const yLines = groupByY(group);
2208
+ const groupColumns = detectColumns(yLines);
2209
+ if (groupColumns && groupColumns.length >= 3) {
2210
+ const tableText = extractWithColumns(yLines, groupColumns);
2211
+ const bbox = computeBBox(group, pageNum);
2212
+ blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(group) });
2213
+ } else {
2214
+ for (const line of yLines) {
2215
+ const text = mergeLineSimple(line);
2216
+ if (!text.trim()) continue;
2217
+ const bbox = computeBBox(line, pageNum);
2218
+ blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style: dominantStyle(line) });
2219
+ }
2220
+ }
2221
+ }
1976
2222
  }
1977
2223
  }
1978
- return blocks;
2224
+ return detectSpecialKoreanTables(blocks);
1979
2225
  }
1980
2226
  function computeBBox(items, pageNum) {
1981
2227
  let minX = Infinity, minY = Infinity, maxX = -Infinity, maxY = -Infinity;
@@ -2241,8 +2487,10 @@ function mergeLineSimple(items) {
2241
2487
  let result = sorted[0].text;
2242
2488
  for (let i = 1; i < sorted.length; i++) {
2243
2489
  const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
2490
+ const avgFs = (sorted[i].fontSize + sorted[i - 1].fontSize) / 2;
2244
2491
  if (gap > 15) result += " ";
2245
- else if (gap > 3) result += " ";
2492
+ else if (gap < avgFs * 0.3 && /[가-힣]$/.test(result) && /^[가-힣]/.test(sorted[i].text)) {
2493
+ } else if (gap > 3) result += " ";
2246
2494
  result += sorted[i].text;
2247
2495
  }
2248
2496
  return result;
@@ -2280,6 +2528,90 @@ function detectListBlocks(blocks) {
2280
2528
  }
2281
2529
  return result;
2282
2530
  }
2531
/**
 * Matches a paragraph that starts with one of the known Korean label words
 * (optionally wrapped in parentheses) followed by a colon or whitespace.
 */
var KOREAN_TABLE_HEADER_RE = /^\(?(구분|항목|종류|분류|유형|대상|내용|기간|금액|비율|방법|절차|요건|조건|근거|목적|범위|기준)\)?[:\s]/;

/**
 * Converts runs of "label: value" paragraphs into 2-column table blocks.
 *
 * A run starts at a paragraph matching KOREAN_TABLE_HEADER_RE and continues
 * over further header-style paragraphs and over "key: value" paragraphs whose
 * key is 2-8 Hangul characters and whose text contains no parentheses. Any
 * other block ends the run. Runs of fewer than two lines are emitted unchanged
 * as their original paragraphs.
 *
 * Differences from a naive first-colon split:
 * - A colon sitting between two digits (times, ratios such as `09:00`, `1:1`)
 *   is not treated as the key/value separator, so `기간 09:00~18:00` yields the
 *   key `기간` rather than `기간 09`.
 * - The emitted table's bbox is the union of all merged paragraphs' bboxes on
 *   the first bbox's page, not just the first paragraph's bbox.
 *
 * @param {Array<object>} blocks - Blocks in reading order; paragraphs expose text, pageNumber and bbox.
 * @returns {Array<object>} New block list with qualifying runs replaced by table blocks.
 */
function detectSpecialKoreanTables(blocks) {
  const result = [];
  let kvLines = [];

  // Index of the first colon that is not flanked by digits on both sides, or -1.
  const findDelimiterColon = (text) => {
    for (let at = text.indexOf(":"); at >= 0; at = text.indexOf(":", at + 1)) {
      const digitBefore = /\d/.test(text.charAt(at - 1));
      const digitAfter = /\d/.test(text.charAt(at + 1));
      if (!(digitBefore && digitAfter)) return at;
    }
    return -1;
  };

  // Smallest box covering the bboxes of all given lines; undefined when none has one.
  // Assumes bbox is { page, x, y, width, height }, as produced by the sibling table builders.
  const unionBBox = (lines) => {
    let merged;
    for (const { block } of lines) {
      const box = block.bbox;
      if (!box) continue;
      if (!merged) {
        merged = { ...box };
        continue;
      }
      // A box on another page cannot be merged meaningfully.
      if (box.page !== merged.page) continue;
      const right = Math.max(merged.x + merged.width, box.x + box.width);
      const top = Math.max(merged.y + merged.height, box.y + box.height);
      merged.x = Math.min(merged.x, box.x);
      merged.y = Math.min(merged.y, box.y);
      merged.width = right - merged.x;
      merged.height = top - merged.y;
    }
    return merged;
  };

  const flushKvTable = () => {
    if (kvLines.length < 2) {
      // Too short for a table: hand the paragraph(s) back untouched.
      for (const kv of kvLines) result.push(kv.block);
      kvLines = [];
      return;
    }
    const cells = kvLines.map((kv) => {
      if (kv.value) {
        return [
          { text: kv.key, colSpan: 1, rowSpan: 1 },
          { text: kv.value, colSpan: 1, rowSpan: 1 }
        ];
      }
      // A key without a value spans both columns.
      return [
        { text: kv.key, colSpan: 2, rowSpan: 1 },
        { text: "", colSpan: 1, rowSpan: 1 }
      ];
    });
    const irTable = {
      rows: cells.length,
      cols: 2,
      cells,
      hasHeader: true
    };
    result.push({
      type: "table",
      table: irTable,
      pageNumber: kvLines[0].block.pageNumber,
      bbox: unionBBox(kvLines)
    });
    kvLines = [];
  };

  for (const block of blocks) {
    if (block.type !== "paragraph" || !block.text) {
      flushKvTable();
      result.push(block);
      continue;
    }
    const text = block.text.trim();

    if (KOREAN_TABLE_HEADER_RE.test(text)) {
      // Prefer a real colon separator; otherwise split at the first whitespace.
      const colonIdx = findDelimiterColon(text);
      const splitAt = colonIdx >= 0 ? colonIdx : text.search(/\s/);
      if (splitAt > 0) {
        kvLines.push({
          key: text.slice(0, splitAt).trim(),
          value: text.slice(splitAt + 1).trim(),
          block
        });
      } else {
        kvLines.push({ key: text, value: "", block });
      }
      continue;
    }

    // Inside a run, also accept short pure-Hangul "key: value" lines.
    if (kvLines.length > 0 && text.includes(":") && !text.includes("(") && !text.includes(")")) {
      const colonIdx = text.indexOf(":");
      const key = text.slice(0, colonIdx).trim();
      if (/^[가-힣]+$/.test(key) && key.length >= 2 && key.length <= 8) {
        kvLines.push({
          key,
          value: text.slice(colonIdx + 1).trim(),
          block
        });
        continue;
      }
    }

    flushKvTable();
    result.push(block);
  }
  flushKvTable();
  return result;
}
2283
2615
  function mergeKoreanLines(text) {
2284
2616
  if (!text) return "";
2285
2617
  const lines = text.split("\n");
@@ -2654,4 +2986,4 @@ export {
2654
2986
  extractFormFields,
2655
2987
  parse
2656
2988
  };
2657
- //# sourceMappingURL=chunk-5SZWGBNL.js.map
2989
+ //# sourceMappingURL=chunk-DYUB34PO.js.map