kordoc 2.1.0 → 2.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,53 +1,105 @@
1
1
  #!/usr/bin/env node
2
2
  import {
3
- KordocError,
4
- classifyError,
5
- isPathTraversal,
6
- precheckZipSize,
7
- sanitizeHref,
8
- toArrayBuffer
9
- } from "./chunk-PKIJLEV6.js";
3
+ detectFormat,
4
+ detectZipFormat
5
+ } from "./chunk-MUAWCQDY.js";
10
6
  import {
11
7
  parsePageRange
12
- } from "./chunk-MOL7MDBG.js";
8
+ } from "./chunk-3TBUDJDE.js";
13
9
 
14
- // src/detect.ts
15
- import JSZip from "jszip";
16
- function magicBytes(buffer) {
17
- return new Uint8Array(buffer, 0, Math.min(4, buffer.byteLength));
18
- }
19
- function isZipFile(buffer) {
20
- const b = magicBytes(buffer);
21
- return b[0] === 80 && b[1] === 75 && b[2] === 3 && b[3] === 4;
22
- }
23
- function isOldHwpFile(buffer) {
24
- const b = magicBytes(buffer);
25
- return b[0] === 208 && b[1] === 207 && b[2] === 17 && b[3] === 224;
10
+ // src/utils.ts
11
+ var VERSION = true ? "2.2.1" : "0.0.0-dev";
12
+ function toArrayBuffer(buf) {
13
+ if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
14
+ return buf.buffer;
15
+ }
16
+ return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
26
17
  }
27
- function isPdfFile(buffer) {
28
- const b = magicBytes(buffer);
29
- return b[0] === 37 && b[1] === 80 && b[2] === 68 && b[3] === 70;
18
+ var KordocError = class extends Error {
19
+ constructor(message) {
20
+ super(message);
21
+ this.name = "KordocError";
22
+ }
23
+ };
24
+ function sanitizeError(err) {
25
+ if (err instanceof KordocError) return err.message;
26
+ return "\uBB38\uC11C \uCC98\uB9AC \uC911 \uC624\uB958\uAC00 \uBC1C\uC0DD\uD588\uC2B5\uB2C8\uB2E4";
30
27
  }
31
- function detectFormat(buffer) {
32
- if (buffer.byteLength < 4) return "unknown";
33
- if (isZipFile(buffer)) return "hwpx";
34
- if (isOldHwpFile(buffer)) return "hwp";
35
- if (isPdfFile(buffer)) return "pdf";
36
- return "unknown";
28
+ function isPathTraversal(name) {
29
+ if (name.includes("\0")) return true;
30
+ const normalized = name.replace(/\\/g, "/");
31
+ const segments = normalized.split("/");
32
+ return segments.some((s) => s === "..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
37
33
  }
38
- async function detectZipFormat(buffer) {
34
+ function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
39
35
  try {
40
- const zip = await JSZip.loadAsync(buffer);
41
- if (zip.file("xl/workbook.xml")) return "xlsx";
42
- if (zip.file("word/document.xml")) return "docx";
43
- if (zip.file("Contents/content.hpf") || zip.file("mimetype")) return "hwpx";
44
- const hasSection = Object.keys(zip.files).some((f) => f.startsWith("Contents/"));
45
- if (hasSection) return "hwpx";
46
- return "unknown";
47
- } catch {
48
- return "unknown";
36
+ const data = new DataView(buffer);
37
+ const len = buffer.byteLength;
38
+ let eocdOffset = -1;
39
+ for (let i = len - 22; i >= Math.max(0, len - 65557); i--) {
40
+ if (data.getUint32(i, true) === 101010256) {
41
+ eocdOffset = i;
42
+ break;
43
+ }
44
+ }
45
+ if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 };
46
+ const entryCount = data.getUint16(eocdOffset + 10, true);
47
+ if (entryCount > maxEntries) {
48
+ throw new KordocError(`ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC: ${entryCount} (\uCD5C\uB300 ${maxEntries})`);
49
+ }
50
+ const cdSize = data.getUint32(eocdOffset + 12, true);
51
+ const cdOffset = data.getUint32(eocdOffset + 16, true);
52
+ if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount };
53
+ let totalUncompressed = 0;
54
+ let pos = cdOffset;
55
+ for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {
56
+ if (data.getUint32(pos, true) !== 33639248) break;
57
+ totalUncompressed += data.getUint32(pos + 24, true);
58
+ const nameLen = data.getUint16(pos + 28, true);
59
+ const extraLen = data.getUint16(pos + 30, true);
60
+ const commentLen = data.getUint16(pos + 32, true);
61
+ pos += 46 + nameLen + extraLen + commentLen;
62
+ }
63
+ if (totalUncompressed > maxUncompressedSize) {
64
+ throw new KordocError(`ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 ${maxUncompressedSize / 1024 / 1024}MB)`);
65
+ }
66
+ return { totalUncompressed, entryCount };
67
+ } catch (err) {
68
+ if (err instanceof KordocError) throw err;
69
+ return { totalUncompressed: 0, entryCount: 0 };
49
70
  }
50
71
  }
72
+ function stripDtd(xml) {
73
+ return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
74
+ }
75
+ var SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
76
+ function sanitizeHref(href) {
77
+ const trimmed = href.trim();
78
+ if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
79
+ return trimmed;
80
+ }
81
+ function safeMin(arr) {
82
+ let min = Infinity;
83
+ for (let i = 0; i < arr.length; i++) if (arr[i] < min) min = arr[i];
84
+ return min;
85
+ }
86
+ function safeMax(arr) {
87
+ let max = -Infinity;
88
+ for (let i = 0; i < arr.length; i++) if (arr[i] > max) max = arr[i];
89
+ return max;
90
+ }
91
+ function classifyError(err) {
92
+ if (!(err instanceof Error)) return "PARSE_ERROR";
93
+ const msg = err.message;
94
+ if (msg.includes("\uC554\uD638\uD654")) return "ENCRYPTED";
95
+ if (msg.includes("DRM")) return "DRM_PROTECTED";
96
+ if (msg.includes("ZIP bomb") || msg.includes("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || msg.includes("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")) return "ZIP_BOMB";
97
+ if (msg.includes("bomb") || msg.includes("\uD06C\uAE30 \uCD08\uACFC") || msg.includes("\uC555\uCD95 \uD574\uC81C")) return "DECOMPRESSION_BOMB";
98
+ if (msg.includes("\uC774\uBBF8\uC9C0 \uAE30\uBC18")) return "IMAGE_BASED_PDF";
99
+ if (msg.includes("\uC139\uC158") && (msg.includes("\uCC3E\uC744 \uC218 \uC5C6") || msg.includes("\uC5C6\uC74C"))) return "NO_SECTIONS";
100
+ if (msg.includes("\uC2DC\uADF8\uB2C8\uCC98") || msg.includes("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")) return "CORRUPTED";
101
+ return "PARSE_ERROR";
102
+ }
51
103
 
52
104
  // src/table/builder.ts
53
105
  var MAX_COLS = 200;
@@ -110,6 +162,7 @@ function buildTableDirect(rows, numRows) {
110
162
  if (end > maxCols) maxCols = end;
111
163
  }
112
164
  }
165
+ if (maxCols > MAX_COLS) maxCols = MAX_COLS;
113
166
  if (maxCols === 0) return { rows: 0, cols: 0, cells: [], hasHeader: false };
114
167
  const grid = Array.from(
115
168
  { length: numRows },
@@ -119,7 +172,7 @@ function buildTableDirect(rows, numRows) {
119
172
  for (const cell of row) {
120
173
  const r = cell.rowAddr ?? 0;
121
174
  const c = cell.colAddr ?? 0;
122
- if (r >= numRows || c >= maxCols) continue;
175
+ if (r >= numRows || c >= maxCols || r < 0 || c < 0) continue;
123
176
  grid[r][c] = { text: cell.text.trim(), colSpan: cell.colSpan, rowSpan: cell.rowSpan };
124
177
  for (let dr = 0; dr < cell.rowSpan; dr++) {
125
178
  for (let dc = 0; dc < cell.colSpan; dc++) {
@@ -148,9 +201,12 @@ function trimAndReturn(grid, numRows, maxCols) {
148
201
  }
149
202
  function convertTableToText(rows) {
150
203
  return rows.map(
151
- (row) => row.map((c) => c.text.trim().replace(/\n/g, " ")).filter(Boolean).join(" | ")
204
+ (row) => row.map((c) => c.text.trim().replace(/\n/g, " ").replace(/\|/g, "\\|")).filter(Boolean).join(" / ")
152
205
  ).filter(Boolean).join("\n");
153
206
  }
207
+ function escapeGfm(text) {
208
+ return text.replace(/~/g, "\\~");
209
+ }
154
210
  var HWP_SHAPE_ALT_TEXT_RE = /(?:모서리가 둥근 |둥근 )?(?:사각형|직사각형|정사각형|원|타원|삼각형|이등변 삼각형|직각 삼각형|선|직선|곡선|화살표|굵은 화살표|이중 화살표|오각형|육각형|팔각형|별|[4-8]점별|십자|십자형|구름|구름형|마름모|도넛|평행사변형|사다리꼴|부채꼴|호|반원|물결|번개|하트|빗금|블록 화살표|수식|표|그림|개체|그리기\s?개체|묶음\s?개체|글상자|수식\s?개체|OLE\s?개체)\s?입니다\.?/g;
155
211
  function sanitizeText(text) {
156
212
  let result = text.replace(/[\u{F0000}-\u{FFFFD}]/gu, "").replace(HWP_SHAPE_ALT_TEXT_RE, "").replace(/ +/g, " ").trim();
@@ -260,7 +316,7 @@ function blocksToMarkdown(blocks) {
260
316
  if (block.footnoteText) {
261
317
  text += ` (\uC8FC: ${block.footnoteText})`;
262
318
  }
263
- lines.push(text);
319
+ lines.push(escapeGfm(text), "");
264
320
  } else if (block.type === "table" && block.table) {
265
321
  if (lines.length > 0 && lines[lines.length - 1] !== "") {
266
322
  lines.push("");
@@ -283,13 +339,13 @@ function tableToMarkdown(table) {
283
339
  return content.split(/\n/).map((line) => {
284
340
  const trimmed = line.trim();
285
341
  if (!trimmed) return "";
286
- if (/^\d+\.\s/.test(trimmed)) return `**${trimmed}**`;
287
- if (/^[가-힣]\.\s/.test(trimmed)) return ` ${trimmed}`;
288
- return trimmed;
342
+ if (/^\d+\.\s/.test(trimmed)) return `**${escapeGfm(trimmed)}**`;
343
+ if (/^[가-힣]\.\s/.test(trimmed)) return ` ${escapeGfm(trimmed)}`;
344
+ return escapeGfm(trimmed);
289
345
  }).filter(Boolean).join("\n");
290
346
  }
291
347
  if (numCols === 1 && numRows >= 2) {
292
- return cells.map((row) => sanitizeText(row[0].text).replace(/\n/g, " ")).filter(Boolean).join("\n");
348
+ return cells.map((row) => escapeGfm(sanitizeText(row[0].text)).replace(/\n/g, " ")).filter(Boolean).join("\n");
293
349
  }
294
350
  const display = Array.from({ length: numRows }, () => Array(numCols).fill(""));
295
351
  const skip = /* @__PURE__ */ new Set();
@@ -298,15 +354,12 @@ function tableToMarkdown(table) {
298
354
  if (skip.has(`${r},${c}`)) continue;
299
355
  const cell = cells[r]?.[c];
300
356
  if (!cell) continue;
301
- display[r][c] = sanitizeText(cell.text).replace(/\n/g, "<br>");
357
+ display[r][c] = escapeGfm(sanitizeText(cell.text)).replace(/\|/g, "\\|").replace(/\n/g, "<br>");
302
358
  for (let dr = 0; dr < cell.rowSpan; dr++) {
303
359
  for (let dc = 0; dc < cell.colSpan; dc++) {
304
360
  if (dr === 0 && dc === 0) continue;
305
361
  if (r + dr < numRows && c + dc < numCols) {
306
362
  skip.add(`${r + dr},${c + dc}`);
307
- if (dr === 0) {
308
- display[r][c + dc] = cell.text.replace(/\n/g, "<br>");
309
- }
310
363
  }
311
364
  }
312
365
  }
@@ -344,7 +397,7 @@ function tableToMarkdown(table) {
344
397
  }
345
398
 
346
399
  // src/hwpx/parser.ts
347
- import JSZip2 from "jszip";
400
+ import JSZip from "jszip";
348
401
  import { inflateRawSync } from "zlib";
349
402
  import { DOMParser } from "@xmldom/xmldom";
350
403
 
@@ -446,14 +499,11 @@ function parseStyleElements(doc, map) {
446
499
  }
447
500
  }
448
501
  }
449
- function stripDtd(xml) {
450
- return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
451
- }
452
502
  async function parseHwpxDocument(buffer, options) {
453
503
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
454
504
  let zip;
455
505
  try {
456
- zip = await JSZip2.loadAsync(buffer);
506
+ zip = await JSZip.loadAsync(buffer);
457
507
  } catch {
458
508
  return extractFromBrokenZip(buffer);
459
509
  }
@@ -616,7 +666,7 @@ function parseDublinCoreMetadata(xml, metadata) {
616
666
  async function extractHwpxMetadataOnly(buffer) {
617
667
  let zip;
618
668
  try {
619
- zip = await JSZip2.loadAsync(buffer);
669
+ zip = await JSZip.loadAsync(buffer);
620
670
  } catch {
621
671
  throw new KordocError("HWPX ZIP\uC744 \uC5F4 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
622
672
  }
@@ -811,7 +861,8 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
811
861
  if (newTable.rows.length > 0) {
812
862
  if (tableStack.length > 0) {
813
863
  const parentTable = tableStack.pop();
814
- const nestedCols = Math.max(...newTable.rows.map((r) => r.length));
864
+ let nestedCols = 0;
865
+ for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
815
866
  if (newTable.rows.length >= 3 && nestedCols >= 2) {
816
867
  blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
817
868
  } else {
@@ -920,7 +971,8 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
920
971
  if (newTable.rows.length > 0) {
921
972
  if (tableStack.length > 0) {
922
973
  const parentTable = tableStack.pop();
923
- const nestedCols = Math.max(...newTable.rows.map((r) => r.length));
974
+ let nestedCols = 0;
975
+ for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
924
976
  if (newTable.rows.length >= 3 && nestedCols >= 2) {
925
977
  blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
926
978
  } else {
@@ -2018,6 +2070,7 @@ function parseLenientCfb(data) {
2018
2070
  if (miniSectorSizeShift > 16) throw new Error("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uBBF8\uB2C8 \uC139\uD130 \uD06C\uAE30 \uC2DC\uD504\uD2B8: " + miniSectorSizeShift);
2019
2071
  const miniSectorSize = 1 << miniSectorSizeShift;
2020
2072
  const fatSectorCount = data.readUInt32LE(44);
2073
+ if (fatSectorCount > 1e4) throw new Error("FAT \uC139\uD130 \uC218\uAC00 \uB108\uBB34 \uB9CE\uC2B5\uB2C8\uB2E4: " + fatSectorCount);
2021
2074
  const firstDirSector = data.readUInt32LE(48);
2022
2075
  const miniStreamCutoff = data.readUInt32LE(56);
2023
2076
  const firstMiniFatSector = data.readUInt32LE(60);
@@ -2406,10 +2459,14 @@ function findSections(cfb) {
2406
2459
  }
2407
2460
  function findSectionsLenient(lcfb, compressed) {
2408
2461
  const sections = [];
2462
+ let totalDecompressed = 0;
2409
2463
  for (let i = 0; i < MAX_SECTIONS; i++) {
2410
2464
  const raw = lcfb.findStream(`/BodyText/Section${i}`) ?? lcfb.findStream(`Section${i}`);
2411
2465
  if (!raw) break;
2412
- sections.push({ idx: i, content: compressed ? decompressStream(raw) : raw });
2466
+ const content = compressed ? decompressStream(raw) : raw;
2467
+ totalDecompressed += content.length;
2468
+ if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
2469
+ sections.push({ idx: i, content });
2413
2470
  }
2414
2471
  if (sections.length === 0) {
2415
2472
  for (const e of lcfb.entries()) {
@@ -2417,7 +2474,12 @@ function findSectionsLenient(lcfb, compressed) {
2417
2474
  if (e.name.startsWith("Section")) {
2418
2475
  const idx = parseInt(e.name.replace("Section", ""), 10) || 0;
2419
2476
  const raw = lcfb.findStream(e.name);
2420
- if (raw) sections.push({ idx, content: compressed ? decompressStream(raw) : raw });
2477
+ if (raw) {
2478
+ const content = compressed ? decompressStream(raw) : raw;
2479
+ totalDecompressed += content.length;
2480
+ if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
2481
+ sections.push({ idx, content });
2482
+ }
2421
2483
  }
2422
2484
  }
2423
2485
  }
@@ -2425,11 +2487,15 @@ function findSectionsLenient(lcfb, compressed) {
2425
2487
  }
2426
2488
  function findViewTextSectionsLenient(lcfb, compressed) {
2427
2489
  const sections = [];
2490
+ let totalDecompressed = 0;
2428
2491
  for (let i = 0; i < MAX_SECTIONS; i++) {
2429
2492
  const raw = lcfb.findStream(`/ViewText/Section${i}`) ?? lcfb.findStream(`Section${i}`);
2430
2493
  if (!raw) break;
2431
2494
  try {
2432
- sections.push({ idx: i, content: decryptViewText(raw, compressed) });
2495
+ const content = decryptViewText(raw, compressed);
2496
+ totalDecompressed += content.length;
2497
+ if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
2498
+ sections.push({ idx: i, content });
2433
2499
  } catch {
2434
2500
  break;
2435
2501
  }
@@ -2828,37 +2894,18 @@ function arrangeCells(rows, cols, cells) {
2828
2894
  // src/pdf/line-detector.ts
2829
2895
  import { OPS } from "pdfjs-dist/legacy/build/pdf.mjs";
2830
2896
  var ORIENTATION_TOL = 2;
2831
- var MIN_LINE_LENGTH = 10;
2832
- var COORD_MERGE_TOL = 3;
2897
+ var MIN_LINE_LENGTH = 15;
2898
+ var MAX_LINE_WIDTH = 5;
2833
2899
  var CONNECT_TOL = 5;
2834
2900
  var CELL_PADDING = 2;
2835
- var MAX_LINE_WIDTH = 5;
2836
- var IDENTITY = [1, 0, 0, 1, 0, 0];
2837
- function matMultiply(m1, m2) {
2838
- return [
2839
- m1[0] * m2[0] + m1[2] * m2[1],
2840
- m1[1] * m2[0] + m1[3] * m2[1],
2841
- m1[0] * m2[2] + m1[2] * m2[3],
2842
- m1[1] * m2[2] + m1[3] * m2[3],
2843
- m1[0] * m2[4] + m1[2] * m2[5] + m1[4],
2844
- m1[1] * m2[4] + m1[3] * m2[5] + m1[5]
2845
- ];
2846
- }
2847
- function matTransformPoint(m, x, y) {
2848
- return [m[0] * x + m[2] * y + m[4], m[1] * x + m[3] * y + m[5]];
2849
- }
2850
- function matScale(m) {
2851
- return Math.max(
2852
- Math.sqrt(m[1] * m[1] + m[3] * m[3]),
2853
- Math.sqrt(m[0] * m[0] + m[2] * m[2])
2854
- );
2855
- }
2901
+ var MIN_COL_WIDTH = 15;
2902
+ var MIN_ROW_HEIGHT = 6;
2903
+ var VERTEX_MERGE_FACTOR = 4;
2904
+ var MIN_COORD_MERGE_TOL = 8;
2856
2905
  function extractLines(fnArray, argsArray) {
2857
2906
  const horizontals = [];
2858
2907
  const verticals = [];
2859
- let ctm = [...IDENTITY];
2860
2908
  let lineWidth = 1;
2861
- const stateStack = [];
2862
2909
  let currentPath = [];
2863
2910
  let pathStartX = 0, pathStartY = 0;
2864
2911
  let curX = 0, curY = 0;
@@ -2876,53 +2923,13 @@ function extractLines(fnArray, argsArray) {
2876
2923
  );
2877
2924
  }
2878
2925
  }
2879
- function tryConvertLinesToRectangle(path) {
2880
- if (path.length < 3 || path.length > 5) return false;
2881
- const first = path[0], last = path[path.length - 1];
2882
- const closed = Math.abs(first.x1 - last.x2) < 1 && Math.abs(first.y1 - last.y2) < 1;
2883
- if (!closed) return false;
2884
- let minX = Infinity, minY = Infinity, maxX = -Infinity, maxY = -Infinity;
2885
- for (const seg of path) {
2886
- minX = Math.min(minX, seg.x1, seg.x2);
2887
- minY = Math.min(minY, seg.y1, seg.y2);
2888
- maxX = Math.max(maxX, seg.x1, seg.x2);
2889
- maxY = Math.max(maxY, seg.y1, seg.y2);
2890
- }
2891
- const w = maxX - minX, h = maxY - minY;
2892
- if (w < MIN_LINE_LENGTH && h < MIN_LINE_LENGTH) return false;
2893
- path.length = 0;
2894
- if (h < ORIENTATION_TOL * 2 || w > MIN_LINE_LENGTH && h <= MAX_LINE_WIDTH) {
2895
- path.push({ x1: minX, y1: (minY + maxY) / 2, x2: maxX, y2: (minY + maxY) / 2 });
2896
- } else if (w < ORIENTATION_TOL * 2 || h > MIN_LINE_LENGTH && w <= MAX_LINE_WIDTH) {
2897
- path.push({ x1: (minX + maxX) / 2, y1: minY, x2: (minX + maxX) / 2, y2: maxY });
2898
- } else {
2899
- pushRectangle(path, minX, minY, w, h);
2900
- }
2901
- return true;
2902
- }
2903
- function flushPath(isStroke, isFill) {
2904
- if (!isStroke && !isFill) {
2905
- currentPath = [];
2906
- return;
2907
- }
2908
- if (isFill && !isStroke && currentPath.length >= 3) {
2909
- tryConvertLinesToRectangle(currentPath);
2910
- }
2911
- const scale = matScale(ctm);
2912
- const effectiveLW = lineWidth * scale;
2913
- if (effectiveLW > MAX_LINE_WIDTH && isStroke && !isFill) {
2926
+ function flushPath(isStroke) {
2927
+ if (!isStroke) {
2914
2928
  currentPath = [];
2915
2929
  return;
2916
2930
  }
2917
2931
  for (const seg of currentPath) {
2918
- const [px1, py1] = matTransformPoint(ctm, seg.x1, seg.y1);
2919
- const [px2, py2] = matTransformPoint(ctm, seg.x2, seg.y2);
2920
- classifyAndAdd(
2921
- { x1: px1, y1: py1, x2: px2, y2: py2 },
2922
- effectiveLW,
2923
- horizontals,
2924
- verticals
2925
- );
2932
+ classifyAndAdd(seg, lineWidth, horizontals, verticals);
2926
2933
  }
2927
2934
  currentPath = [];
2928
2935
  }
@@ -2930,28 +2937,9 @@ function extractLines(fnArray, argsArray) {
2930
2937
  const op = fnArray[i];
2931
2938
  const args = argsArray[i];
2932
2939
  switch (op) {
2933
- // ── Graphics State ──
2934
- case OPS.save:
2935
- stateStack.push({ ctm: [...ctm], lineWidth });
2936
- break;
2937
- case OPS.restore:
2938
- if (stateStack.length > 0) {
2939
- const state = stateStack.pop();
2940
- ctm = state.ctm;
2941
- lineWidth = state.lineWidth;
2942
- }
2943
- break;
2944
- case OPS.transform: {
2945
- const m = args;
2946
- if (m.length >= 6) {
2947
- ctm = matMultiply(ctm, [m[0], m[1], m[2], m[3], m[4], m[5]]);
2948
- }
2949
- break;
2950
- }
2951
2940
  case OPS.setLineWidth:
2952
2941
  lineWidth = args[0] || 1;
2953
2942
  break;
2954
- // ── Path Construction ──
2955
2943
  case OPS.constructPath: {
2956
2944
  const arg0 = args[0];
2957
2945
  if (Array.isArray(arg0)) {
@@ -3019,60 +3007,34 @@ function extractLines(fnArray, argsArray) {
3019
3007
  }
3020
3008
  }
3021
3009
  }
3022
- const isStroke5 = afterOp === OPS.stroke || afterOp === OPS.closeStroke;
3023
- const isFill5 = afterOp === OPS.fill || afterOp === OPS.eoFill;
3024
- const isBoth5 = afterOp === OPS.fillStroke || afterOp === OPS.eoFillStroke || afterOp === OPS.closeFillStroke || afterOp === OPS.closeEOFillStroke;
3025
- if (isStroke5 || isFill5 || isBoth5) {
3026
- flushPath(isStroke5 || isBoth5, isFill5 || isBoth5);
3010
+ if (afterOp === OPS.stroke || afterOp === OPS.closeStroke) {
3011
+ flushPath(true);
3012
+ } else if (afterOp === OPS.fill || afterOp === OPS.eoFill || afterOp === OPS.fillStroke || afterOp === OPS.eoFillStroke || afterOp === OPS.closeFillStroke || afterOp === OPS.closeEOFillStroke) {
3013
+ flushPath(true);
3027
3014
  } else if (afterOp === OPS.endPath) {
3028
- flushPath(false, false);
3015
+ flushPath(false);
3029
3016
  }
3030
3017
  }
3031
3018
  break;
3032
3019
  }
3033
- // ── Paint Operations ──
3034
3020
  case OPS.stroke:
3035
3021
  case OPS.closeStroke:
3036
- flushPath(true, false);
3022
+ flushPath(true);
3037
3023
  break;
3038
3024
  case OPS.fill:
3039
3025
  case OPS.eoFill:
3040
- flushPath(false, true);
3041
- break;
3042
3026
  case OPS.fillStroke:
3043
3027
  case OPS.eoFillStroke:
3044
3028
  case OPS.closeFillStroke:
3045
3029
  case OPS.closeEOFillStroke:
3046
- flushPath(true, true);
3030
+ flushPath(true);
3047
3031
  break;
3048
3032
  case OPS.endPath:
3049
- flushPath(false, false);
3050
- break;
3051
- }
3052
- }
3053
- return {
3054
- horizontals: deduplicateLines(horizontals),
3055
- verticals: deduplicateLines(verticals)
3056
- };
3057
- }
3058
- function deduplicateLines(lines) {
3059
- if (lines.length <= 1) return lines;
3060
- const result = [];
3061
- const tol = COORD_MERGE_TOL;
3062
- for (const line of lines) {
3063
- let isDuplicate = false;
3064
- for (const existing of result) {
3065
- if (Math.abs(line.y1 - existing.y1) <= tol && Math.abs(line.y2 - existing.y2) <= tol && Math.abs(line.x1 - existing.x1) <= tol && Math.abs(line.x2 - existing.x2) <= tol) {
3066
- if (line.lineWidth > existing.lineWidth) {
3067
- existing.lineWidth = line.lineWidth;
3068
- }
3069
- isDuplicate = true;
3033
+ flushPath(false);
3070
3034
  break;
3071
- }
3072
3035
  }
3073
- if (!isDuplicate) result.push(line);
3074
3036
  }
3075
- return result;
3037
+ return { horizontals, verticals };
3076
3038
  }
3077
3039
  function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
3078
3040
  const dx = Math.abs(seg.x2 - seg.x1);
@@ -3091,6 +3053,55 @@ function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
3091
3053
  verticals.push({ x1: x, y1, x2: x, y2, lineWidth });
3092
3054
  }
3093
3055
  }
3056
+ function preprocessLines(horizontals, verticals) {
3057
+ let h = horizontals.filter((l) => l.lineWidth <= MAX_LINE_WIDTH);
3058
+ let v = verticals.filter((l) => l.lineWidth <= MAX_LINE_WIDTH);
3059
+ h = mergeParallelLines(h, "h");
3060
+ v = mergeParallelLines(v, "v");
3061
+ return { horizontals: h, verticals: v };
3062
+ }
3063
+ function mergeParallelLines(lines, dir) {
3064
+ if (lines.length <= 1) return lines;
3065
+ const sorted = [...lines].sort((a, b) => {
3066
+ const posA = dir === "h" ? a.y1 : a.x1;
3067
+ const posB = dir === "h" ? b.y1 : b.x1;
3068
+ if (Math.abs(posA - posB) > 0.1) return posA - posB;
3069
+ return dir === "h" ? a.x1 - b.x1 : a.y1 - b.y1;
3070
+ });
3071
+ const MERGE_TOL = 3;
3072
+ const result = [sorted[0]];
3073
+ for (let i = 1; i < sorted.length; i++) {
3074
+ const prev = result[result.length - 1];
3075
+ const curr = sorted[i];
3076
+ const prevPos = dir === "h" ? prev.y1 : prev.x1;
3077
+ const currPos = dir === "h" ? curr.y1 : curr.x1;
3078
+ if (Math.abs(prevPos - currPos) <= MERGE_TOL) {
3079
+ const prevStart = dir === "h" ? prev.x1 : prev.y1;
3080
+ const prevEnd = dir === "h" ? prev.x2 : prev.y2;
3081
+ const currStart = dir === "h" ? curr.x1 : curr.y1;
3082
+ const currEnd = dir === "h" ? curr.x2 : curr.y2;
3083
+ const overlap = Math.min(prevEnd, currEnd) - Math.max(prevStart, currStart);
3084
+ const minLen = Math.min(prevEnd - prevStart, currEnd - currStart);
3085
+ if (overlap > minLen * 0.3) {
3086
+ if (dir === "h") {
3087
+ prev.x1 = Math.min(prev.x1, curr.x1);
3088
+ prev.x2 = Math.max(prev.x2, curr.x2);
3089
+ prev.y1 = (prev.y1 + curr.y1) / 2;
3090
+ prev.y2 = prev.y1;
3091
+ } else {
3092
+ prev.y1 = Math.min(prev.y1, curr.y1);
3093
+ prev.y2 = Math.max(prev.y2, curr.y2);
3094
+ prev.x1 = (prev.x1 + curr.x1) / 2;
3095
+ prev.x2 = prev.x1;
3096
+ }
3097
+ prev.lineWidth = Math.max(prev.lineWidth, curr.lineWidth);
3098
+ continue;
3099
+ }
3100
+ }
3101
+ result.push(curr);
3102
+ }
3103
+ return result;
3104
+ }
3094
3105
  function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
3095
3106
  const margin = 5;
3096
3107
  return {
@@ -3102,8 +3113,49 @@ function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
3102
3113
  )
3103
3114
  };
3104
3115
  }
3116
+ function buildVertices(horizontals, verticals) {
3117
+ const vertices = [];
3118
+ const tol = CONNECT_TOL;
3119
+ for (const h of horizontals) {
3120
+ for (const v of verticals) {
3121
+ if (v.x1 >= h.x1 - tol && v.x1 <= h.x2 + tol && h.y1 >= v.y1 - tol && h.y1 <= v.y2 + tol) {
3122
+ const radius = Math.max(h.lineWidth, v.lineWidth, 1);
3123
+ vertices.push({ x: v.x1, y: h.y1, radius });
3124
+ }
3125
+ }
3126
+ }
3127
+ return vertices;
3128
+ }
3129
+ function mergeVertices(vertices) {
3130
+ if (vertices.length <= 1) return vertices;
3131
+ const merged = [];
3132
+ const used = new Array(vertices.length).fill(false);
3133
+ for (let i = 0; i < vertices.length; i++) {
3134
+ if (used[i]) continue;
3135
+ let sumX = vertices[i].x, sumY = vertices[i].y;
3136
+ let maxRadius = vertices[i].radius;
3137
+ let count = 1;
3138
+ for (let j = i + 1; j < vertices.length; j++) {
3139
+ if (used[j]) continue;
3140
+ const mergeTol = VERTEX_MERGE_FACTOR * Math.max(maxRadius, vertices[j].radius);
3141
+ if (Math.abs(vertices[i].x - vertices[j].x) <= mergeTol && Math.abs(vertices[i].y - vertices[j].y) <= mergeTol) {
3142
+ sumX += vertices[j].x;
3143
+ sumY += vertices[j].y;
3144
+ maxRadius = Math.max(maxRadius, vertices[j].radius);
3145
+ count++;
3146
+ used[j] = true;
3147
+ }
3148
+ }
3149
+ merged.push({ x: sumX / count, y: sumY / count, radius: maxRadius });
3150
+ }
3151
+ return merged;
3152
+ }
3105
3153
  function buildTableGrids(horizontals, verticals) {
3106
3154
  if (horizontals.length < 2 || verticals.length < 2) return [];
3155
+ const allVertices = buildVertices(horizontals, verticals);
3156
+ const vertices = mergeVertices(allVertices);
3157
+ if (vertices.length < 4) return [];
3158
+ const globalRadius = vertices.reduce((max, v) => Math.max(max, v.radius), 1);
3107
3159
  const allLines = [
3108
3160
  ...horizontals.map((l, i) => ({ ...l, type: "h", id: i })),
3109
3161
  ...verticals.map((l, i) => ({ ...l, type: "v", id: i + horizontals.length }))
@@ -3114,21 +3166,74 @@ function buildTableGrids(horizontals, verticals) {
3114
3166
  const hLines = group.filter((l) => l.type === "h");
3115
3167
  const vLines = group.filter((l) => l.type === "v");
3116
3168
  if (hLines.length < 2 || vLines.length < 2) continue;
3117
- const rawYs = hLines.map((l) => l.y1);
3118
- const rowYs = clusterCoordinates(rawYs).sort((a, b) => b - a);
3119
- const rawXs = vLines.map((l) => l.x1);
3120
- const colXs = clusterCoordinates(rawXs).sort((a, b) => a - b);
3169
+ let gx1 = Infinity, gy1 = Infinity, gx2 = -Infinity, gy2 = -Infinity;
3170
+ for (const l of vLines) {
3171
+ if (l.x1 < gx1) gx1 = l.x1;
3172
+ if (l.x1 > gx2) gx2 = l.x1;
3173
+ }
3174
+ for (const l of hLines) {
3175
+ if (l.y1 < gy1) gy1 = l.y1;
3176
+ if (l.y1 > gy2) gy2 = l.y1;
3177
+ }
3178
+ const groupBbox = {
3179
+ x1: gx1 - CONNECT_TOL,
3180
+ y1: gy1 - CONNECT_TOL,
3181
+ x2: gx2 + CONNECT_TOL,
3182
+ y2: gy2 + CONNECT_TOL
3183
+ };
3184
+ const groupVertices = vertices.filter(
3185
+ (v) => v.x >= groupBbox.x1 && v.x <= groupBbox.x2 && v.y >= groupBbox.y1 && v.y <= groupBbox.y2
3186
+ );
3187
+ const groupRadius = groupVertices.length > 0 ? groupVertices.reduce((max, v) => Math.max(max, v.radius), 1) : globalRadius;
3188
+ const coordMergeTol = Math.max(VERTEX_MERGE_FACTOR * groupRadius, MIN_COORD_MERGE_TOL);
3189
+ const rawYs = [
3190
+ ...hLines.map((l) => l.y1),
3191
+ ...groupVertices.map((v) => v.y)
3192
+ ];
3193
+ const rowYs = clusterCoordinates(rawYs, coordMergeTol).sort((a, b) => b - a);
3194
+ const rawXs = [
3195
+ ...vLines.map((l) => l.x1),
3196
+ ...groupVertices.map((v) => v.x)
3197
+ ];
3198
+ const colXs = clusterCoordinates(rawXs, coordMergeTol).sort((a, b) => a - b);
3121
3199
  if (rowYs.length < 2 || colXs.length < 2) continue;
3200
+ const validColXs = enforceMinWidth(colXs, MIN_COL_WIDTH);
3201
+ const validRowYs = enforceMinHeight(rowYs, MIN_ROW_HEIGHT);
3202
+ if (validRowYs.length < 2 || validColXs.length < 2) continue;
3122
3203
  const bbox = {
3123
- x1: colXs[0],
3124
- y1: rowYs[rowYs.length - 1],
3125
- x2: colXs[colXs.length - 1],
3126
- y2: rowYs[0]
3204
+ x1: validColXs[0],
3205
+ y1: validRowYs[validRowYs.length - 1],
3206
+ x2: validColXs[validColXs.length - 1],
3207
+ y2: validRowYs[0]
3127
3208
  };
3128
- grids.push({ rowYs, colXs, bbox });
3209
+ grids.push({ rowYs: validRowYs, colXs: validColXs, bbox, vertexRadius: groupRadius });
3129
3210
  }
3130
3211
  return mergeAdjacentGrids(grids);
3131
3212
  }
3213
+ function enforceMinWidth(colXs, minWidth) {
3214
+ if (colXs.length <= 2) return colXs;
3215
+ const result = [colXs[0]];
3216
+ for (let i = 1; i < colXs.length; i++) {
3217
+ const prevX = result[result.length - 1];
3218
+ if (colXs[i] - prevX < minWidth && i < colXs.length - 1) {
3219
+ continue;
3220
+ }
3221
+ result.push(colXs[i]);
3222
+ }
3223
+ return result;
3224
+ }
3225
+ function enforceMinHeight(rowYs, minHeight) {
3226
+ if (rowYs.length <= 2) return rowYs;
3227
+ const result = [rowYs[0]];
3228
+ for (let i = 1; i < rowYs.length; i++) {
3229
+ const prevY = result[result.length - 1];
3230
+ if (prevY - rowYs[i] < minHeight && i < rowYs.length - 1) {
3231
+ continue;
3232
+ }
3233
+ result.push(rowYs[i]);
3234
+ }
3235
+ return result;
3236
+ }
3132
3237
  function mergeAdjacentGrids(grids) {
3133
3238
  if (grids.length <= 1) return grids;
3134
3239
  const sorted = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
@@ -3137,9 +3242,10 @@ function mergeAdjacentGrids(grids) {
3137
3242
  const prev = merged[merged.length - 1];
3138
3243
  const curr = sorted[i];
3139
3244
  if (prev.colXs.length === curr.colXs.length) {
3140
- const colMatch = prev.colXs.every((x, ci) => Math.abs(x - curr.colXs[ci]) <= COORD_MERGE_TOL * 3);
3245
+ const mergeTol = Math.max(VERTEX_MERGE_FACTOR * Math.max(prev.vertexRadius, curr.vertexRadius), 6) * 3;
3246
+ const colMatch = prev.colXs.every((x, ci) => Math.abs(x - curr.colXs[ci]) <= mergeTol);
3141
3247
  const verticalGap = prev.bbox.y1 - curr.bbox.y2;
3142
- if (colMatch && verticalGap >= -COORD_MERGE_TOL && verticalGap <= 20) {
3248
+ if (colMatch && verticalGap >= -CONNECT_TOL && verticalGap <= 20) {
3143
3249
  const allRowYs = [.../* @__PURE__ */ new Set([...prev.rowYs, ...curr.rowYs])].sort((a, b) => b - a);
3144
3250
  merged[merged.length - 1] = {
3145
3251
  rowYs: allRowYs,
@@ -3149,7 +3255,8 @@ function mergeAdjacentGrids(grids) {
3149
3255
  y1: Math.min(prev.bbox.y1, curr.bbox.y1),
3150
3256
  x2: Math.max(prev.bbox.x2, curr.bbox.x2),
3151
3257
  y2: Math.max(prev.bbox.y2, curr.bbox.y2)
3152
- }
3258
+ },
3259
+ vertexRadius: Math.max(prev.vertexRadius, curr.vertexRadius)
3153
3260
  };
3154
3261
  continue;
3155
3262
  }
@@ -3158,14 +3265,14 @@ function mergeAdjacentGrids(grids) {
3158
3265
  }
3159
3266
  return merged;
3160
3267
  }
3161
- function clusterCoordinates(values) {
3268
+ function clusterCoordinates(values, tolerance) {
3162
3269
  if (values.length === 0) return [];
3163
3270
  const sorted = [...values].sort((a, b) => a - b);
3164
3271
  const clusters = [{ sum: sorted[0], count: 1 }];
3165
3272
  for (let i = 1; i < sorted.length; i++) {
3166
3273
  const last = clusters[clusters.length - 1];
3167
3274
  const avg = last.sum / last.count;
3168
- if (Math.abs(sorted[i] - avg) <= COORD_MERGE_TOL) {
3275
+ if (Math.abs(sorted[i] - avg) <= tolerance) {
3169
3276
  last.sum += sorted[i];
3170
3277
  last.count++;
3171
3278
  } else {
@@ -3222,6 +3329,20 @@ function extractCells(grid, horizontals, verticals) {
3222
3329
  const numRows = rowYs.length - 1;
3223
3330
  const numCols = colXs.length - 1;
3224
3331
  if (numRows <= 0 || numCols <= 0) return [];
3332
+ const vBorders = Array.from(
3333
+ { length: numRows },
3334
+ (_, r) => Array.from(
3335
+ { length: numCols + 1 },
3336
+ (_2, c) => hasVerticalLine(verticals, colXs[c], rowYs[r], rowYs[r + 1], grid.vertexRadius)
3337
+ )
3338
+ );
3339
+ const hBorders = Array.from(
3340
+ { length: numRows + 1 },
3341
+ (_, r) => Array.from(
3342
+ { length: numCols },
3343
+ (_2, c) => hasHorizontalLine(horizontals, rowYs[r], colXs[c], colXs[c + 1], grid.vertexRadius)
3344
+ )
3345
+ );
3225
3346
  const occupied = Array.from({ length: numRows }, () => Array(numCols).fill(false));
3226
3347
  const cells = [];
3227
3348
  for (let r = 0; r < numRows; r++) {
@@ -3229,18 +3350,26 @@ function extractCells(grid, horizontals, verticals) {
3229
3350
  if (occupied[r][c]) continue;
3230
3351
  let colSpan = 1;
3231
3352
  let rowSpan = 1;
3232
- while (c + colSpan < numCols) {
3233
- const borderX = colXs[c + colSpan];
3234
- const topY = rowYs[r];
3235
- const botY = rowYs[r + 1];
3236
- if (hasVerticalLine(verticals, borderX, topY, botY)) break;
3353
+ while (c + colSpan < numCols && !vBorders[r][c + colSpan]) {
3354
+ let canExpand = true;
3355
+ for (let dr = 0; dr < rowSpan; dr++) {
3356
+ if (vBorders[r + dr][c + colSpan]) {
3357
+ canExpand = false;
3358
+ break;
3359
+ }
3360
+ }
3361
+ if (!canExpand) break;
3237
3362
  colSpan++;
3238
3363
  }
3239
3364
  while (r + rowSpan < numRows) {
3240
- const borderY = rowYs[r + rowSpan];
3241
- const leftX = colXs[c];
3242
- const rightX = colXs[c + colSpan];
3243
- if (hasHorizontalLine(horizontals, borderY, leftX, rightX)) break;
3365
+ let hasLine = false;
3366
+ for (let dc = 0; dc < colSpan; dc++) {
3367
+ if (hBorders[r + rowSpan][c + dc]) {
3368
+ hasLine = true;
3369
+ break;
3370
+ }
3371
+ }
3372
+ if (hasLine) break;
3244
3373
  rowSpan++;
3245
3374
  }
3246
3375
  for (let dr = 0; dr < rowSpan; dr++) {
@@ -3264,28 +3393,30 @@ function extractCells(grid, horizontals, verticals) {
3264
3393
  }
3265
3394
  return cells;
3266
3395
  }
3267
- function hasVerticalLine(verticals, x, topY, botY) {
3268
- const tol = COORD_MERGE_TOL + 1;
3396
/**
 * Reports whether some vertical line sits on the boundary `x` and covers at
 * least 75% of the span between `topY` and `botY`.
 *
 * The x tolerance is `VERTEX_MERGE_FACTOR * vertexRadius`, floored at 4.
 * The overlap arithmetic presumably expects `v.y1 <= v.y2` and
 * `topY >= botY` — TODO confirm.
 *
 * @param verticals - Iterable of lines exposing `x1`, `y1`, `y2`.
 * @param x - Boundary x coordinate to probe.
 * @param topY - One end of the vertical span.
 * @param botY - The other end of the vertical span.
 * @param vertexRadius - Radius used to scale the x tolerance.
 * @returns {boolean} True when a sufficiently overlapping line is found.
 */
function hasVerticalLine(verticals, x, topY, botY, vertexRadius) {
  const xTolerance = Math.max(VERTEX_MERGE_FACTOR * vertexRadius, 4);
  const spanHeight = Math.abs(topY - botY);
  for (const line of verticals) {
    if (!(Math.abs(line.x1 - x) <= xTolerance)) continue;
    // A degenerate span can never be "covered"; skip rather than divide hairs.
    if (spanHeight < 0.1) continue;
    const upper = Math.min(line.y2, topY);
    const lower = Math.max(line.y1, botY);
    if (upper - lower >= spanHeight * 0.75) return true;
  }
  return false;
}
3280
- function hasHorizontalLine(horizontals, y, leftX, rightX) {
3281
- const tol = COORD_MERGE_TOL + 1;
3410
+ function hasHorizontalLine(horizontals, y, leftX, rightX, vertexRadius) {
3411
+ const tol = Math.max(VERTEX_MERGE_FACTOR * vertexRadius, 4);
3282
3412
  for (const h of horizontals) {
3283
3413
  if (Math.abs(h.y1 - y) <= tol) {
3284
3414
  const cellW = Math.abs(rightX - leftX);
3415
+ if (cellW < 0.1) continue;
3285
3416
  const overlapLeft = Math.max(h.x1, leftX);
3286
3417
  const overlapRight = Math.min(h.x2, rightX);
3287
3418
  const overlap = overlapRight - overlapLeft;
3288
- if (overlap >= cellW * 0.5) return true;
3419
+ if (overlap >= cellW * 0.75) return true;
3289
3420
  }
3290
3421
  }
3291
3422
  return false;
@@ -3296,23 +3427,24 @@ function mapTextToCells(items, cells) {
3296
3427
  result.set(cell, []);
3297
3428
  }
3298
3429
  for (const item of items) {
3299
- const cx = item.x + item.w / 2;
3300
- const cy = item.y;
3301
3430
  const pad = CELL_PADDING;
3302
3431
  let bestCell = null;
3303
- let bestDist = Infinity;
3432
+ let bestScore = 0;
3304
3433
  for (const cell of cells) {
3305
- if (cx >= cell.bbox.x1 - pad && cx <= cell.bbox.x2 + pad && cy >= cell.bbox.y1 - pad && cy <= cell.bbox.y2 + pad) {
3306
- const cellCx = (cell.bbox.x1 + cell.bbox.x2) / 2;
3307
- const cellCy = (cell.bbox.y1 + cell.bbox.y2) / 2;
3308
- const dist = Math.abs(cx - cellCx) + Math.abs(cy - cellCy);
3309
- if (dist < bestDist) {
3310
- bestDist = dist;
3311
- bestCell = cell;
3312
- }
3434
+ const ix1 = Math.max(item.x, cell.bbox.x1 - pad);
3435
+ const ix2 = Math.min(item.x + item.w, cell.bbox.x2 + pad);
3436
+ const iy1 = Math.max(item.y, cell.bbox.y1 - pad);
3437
+ const iy2 = Math.min(item.y + (item.h || item.fontSize), cell.bbox.y2 + pad);
3438
+ if (ix1 >= ix2 || iy1 >= iy2) continue;
3439
+ const intersectArea = (ix2 - ix1) * (iy2 - iy1);
3440
+ const itemArea = Math.max(item.w, 1) * Math.max(item.h || item.fontSize, 1);
3441
+ const score = intersectArea / itemArea;
3442
+ if (score > bestScore) {
3443
+ bestScore = score;
3444
+ bestCell = cell;
3313
3445
  }
3314
3446
  }
3315
- if (bestCell) {
3447
+ if (bestCell && bestScore > 0.3) {
3316
3448
  result.get(bestCell).push(item);
3317
3449
  }
3318
3450
  }
@@ -3339,8 +3471,13 @@ function cellTextToString(items) {
3339
3471
  const textLines = lines.map((line) => {
3340
3472
  const s = line.sort((a, b) => a.x - b.x);
3341
3473
  if (s.length === 1) return s[0].text;
3474
+ const evenSpaced = detectEvenSpacedItems(s);
3342
3475
  let result = s[0].text;
3343
3476
  for (let j = 1; j < s.length; j++) {
3477
+ if (evenSpaced[j]) {
3478
+ result += s[j].text;
3479
+ continue;
3480
+ }
3344
3481
  const gap = s[j].x - (s[j - 1].x + s[j - 1].w);
3345
3482
  const avgFs = (s[j].fontSize + s[j - 1].fontSize) / 2;
3346
3483
  const prevIsKorean = /[가-힣]$/.test(result);
@@ -3355,6 +3492,57 @@ function cellTextToString(items) {
3355
3492
  }
3356
3493
  return result;
3357
3494
  });
3495
+ return mergeCellTextLines(textLines);
3496
+ }
3497
/**
 * Flags items that belong to a run of single-character tokens (one Hangul
 * syllable or one digit) so the caller can join them without separators.
 *
 * A run is a maximal stretch of such tokens; it is also split whenever the
 * gap to the previous item exceeds `max(fontSize * 3, 30)`. Runs of three or
 * more items are handed to `markEvenRun`, which decides whether to set flags.
 *
 * @param items - Items with `text`, `x`, `w`, `fontSize`; the gap arithmetic
 *   presumably expects them sorted by `x` — TODO confirm.
 * @returns {boolean[]} One flag per item, all false for fewer than 3 items.
 */
function detectEvenSpacedItems(items) {
  const flags = new Array(items.length).fill(false);
  if (items.length < 3) return flags;

  const isSingleGlyph = (text) => /^[가-힣]{1}$/.test(text) || /^[\d]{1}$/.test(text);
  // Only runs with at least three members are worth evaluating.
  const closeRun = (from, to) => {
    if (to - from >= 3) markEvenRun(items, flags, from, to);
  };

  let runStart = -1;
  for (let idx = 0; idx < items.length; idx++) {
    const current = items[idx];
    if (!isSingleGlyph(current.text)) {
      if (runStart >= 0) closeRun(runStart, idx);
      runStart = -1;
      continue;
    }
    if (runStart < 0) {
      runStart = idx;
      continue;
    }
    // Inside a run: an oversized gap ends it and starts a new one here.
    const previous = items[idx - 1];
    const gap = current.x - (previous.x + previous.w);
    const maxRunGap = Math.max(current.fontSize * 3, 30);
    if (gap > maxRunGap) {
      closeRun(runStart, idx);
      runStart = idx;
    }
  }
  if (runStart >= 0) closeRun(runStart, items.length);
  return flags;
}
3526
/**
 * Marks `result[start + 1 .. end - 1]` as true when the items in
 * `[start, end)` are spaced evenly enough to be treated as one word.
 *
 * Only strictly positive gaps are considered. The run qualifies when there
 * are at least two such gaps, the smallest is at least 10% of the font size,
 * the largest is at most 3x the font size, and largest/smallest is at most 3.
 * The font size is taken from the first item of the run.
 *
 * @param items - Items with `x`, `w`, `fontSize`.
 * @param result - Flag array mutated in place.
 * @param start - Index of the first item in the run.
 * @param end - Index one past the last item in the run.
 */
function markEvenRun(items, result, start, end) {
  const positiveGaps = [];
  for (let idx = start + 1; idx < end; idx++) {
    const before = items[idx - 1];
    const gap = items[idx].x - (before.x + before.w);
    if (gap > 0) positiveGaps.push(gap);
  }
  if (positiveGaps.length < 2) return;

  const smallest = positiveGaps.reduce((acc, gap) => (gap < acc ? gap : acc), Infinity);
  const largest = positiveGaps.reduce((acc, gap) => (gap > acc ? gap : acc), -Infinity);
  const fontSize = items[start].fontSize;

  const evenlySpaced =
    smallest >= fontSize * 0.1 &&
    largest <= fontSize * 3 &&
    largest / Math.max(smallest, 0.1) <= 3;
  if (!evenlySpaced) return;

  // The run's first item keeps its flag; only the followers are glued to it.
  for (let idx = start + 1; idx < end; idx++) {
    result[idx] = true;
  }
}
3545
+ function mergeCellTextLines(textLines) {
3358
3546
  if (textLines.length <= 1) return textLines[0] || "";
3359
3547
  const merged = [textLines[0]];
3360
3548
  for (let i = 1; i < textLines.length; i++) {
@@ -3380,24 +3568,172 @@ var Y_TOL = 3;
3380
3568
  var COL_CLUSTER_TOL = 15;
3381
3569
  var MIN_ROWS = 3;
3382
3570
  var MIN_COLS = 2;
3383
- var MIN_GAP_FACTOR = 1.5;
3384
- var MIN_COL_FILL_RATIO = 0.3;
3571
+ var MIN_GAP_FACTOR = 2;
3572
+ var MIN_GAP_ABSOLUTE = 20;
3573
+ var MIN_COL_FILL_RATIO = 0.4;
3385
3574
  function detectClusterTables(items, pageNum) {
3386
3575
  if (items.length < MIN_ROWS * MIN_COLS) return [];
3387
- const rows = groupByBaseline(items);
3576
+ const { merged, originMap } = mergeEvenSpacedClusters(items);
3577
+ const rows = groupByBaseline(merged);
3388
3578
  if (rows.length < MIN_ROWS) return [];
3389
- const suspiciousRows = rows.filter((row) => hasSuspiciousGaps(row));
3390
- if (suspiciousRows.length < MIN_ROWS) return [];
3391
- const columns = extractColumnClusters(suspiciousRows);
3392
- if (columns.length < MIN_COLS) return [];
3393
- const tableRegions = findTableRegions(rows, columns);
3394
3579
  const results = [];
3395
- for (const region of tableRegions) {
3396
- const table = buildClusterTable(region.rows, columns, pageNum);
3397
- if (table) results.push(table);
3580
+ const headerResult = detectHeaderRow(rows);
3581
+ if (headerResult) {
3582
+ const { columns, headerIdx } = headerResult;
3583
+ const headerRow = rows[headerIdx];
3584
+ const headerItems = [...headerRow.items].sort((a, b) => a.x - b.x);
3585
+ const headerAndBelow = rows.slice(headerIdx);
3586
+ const mergedRows = mergeMultiLineRows(headerAndBelow, columns);
3587
+ const tableRegions = findTableRegionsByHeader(mergedRows, columns, headerItems);
3588
+ for (const region of tableRegions) {
3589
+ const table = buildClusterTable(region.rows, columns, pageNum);
3590
+ if (table) {
3591
+ expandUsedItems(table.usedItems, originMap);
3592
+ results.push(table);
3593
+ }
3594
+ }
3595
+ }
3596
+ if (results.length === 0) {
3597
+ const suspiciousRows = rows.filter((row) => hasSuspiciousGaps(row));
3598
+ if (suspiciousRows.length >= MIN_ROWS) {
3599
+ const columns = extractColumnClusters(suspiciousRows);
3600
+ if (columns.length >= MIN_COLS) {
3601
+ const tableRegions = findTableRegions(rows, columns);
3602
+ for (const region of tableRegions) {
3603
+ const mergedRows = mergeMultiLineRows(region.rows, columns);
3604
+ const table = buildClusterTable(mergedRows, columns, pageNum);
3605
+ if (table) {
3606
+ expandUsedItems(table.usedItems, originMap);
3607
+ results.push(table);
3608
+ }
3609
+ }
3610
+ }
3611
+ }
3398
3612
  }
3399
3613
  return results;
3400
3614
  }
3615
/**
 * Collapses runs of evenly spaced single-character items (one Hangul
 * syllable or one digit) on the same baseline into one synthetic item.
 *
 * A run grows while each gap lies within [10%, 300%] of the next item's font
 * size. It is merged only when it has at least three items, every gap is
 * positive, and the largest gap is at most 3x the smallest.
 *
 * @param items - Text items with `text`, `x`, `y`, `w`, `h`, `fontSize`, `fontName`.
 * @returns {{merged: Array, originMap: Map}} `merged` holds the row-by-row
 *   item list with runs replaced; `originMap` maps each synthetic item to the
 *   original items it replaced.
 */
function mergeEvenSpacedClusters(items) {
  const singleGlyph = /^[가-힣\d]$/;
  const gapBefore = (list, idx) => list[idx].x - (list[idx - 1].x + list[idx - 1].w);
  // True when all gaps inside [from, to) are positive and within a 3x ratio.
  const hasEvenGaps = (list, from, to) => {
    let smallest = Infinity;
    let largest = -Infinity;
    for (let idx = from + 1; idx < to; idx++) {
      const gap = gapBefore(list, idx);
      if (gap < smallest) smallest = gap;
      if (gap > largest) largest = gap;
    }
    return smallest > 0 && largest / smallest <= 3;
  };

  const originMap = /* @__PURE__ */ new Map();
  const merged = [];
  for (const row of groupByBaseline(items)) {
    const ordered = [...row.items].sort((a, b) => a.x - b.x);
    let cursor = 0;
    while (cursor < ordered.length) {
      // Find how far a single-glyph run starting at `cursor` extends.
      let runEnd = cursor;
      if (singleGlyph.test(ordered[cursor].text)) {
        runEnd = cursor + 1;
        while (runEnd < ordered.length && singleGlyph.test(ordered[runEnd].text)) {
          const gap = gapBefore(ordered, runEnd);
          const fs = ordered[runEnd].fontSize;
          if (gap < fs * 0.1 || gap > fs * 3) break;
          runEnd++;
        }
      }

      if (runEnd - cursor >= 3 && hasEvenGaps(ordered, cursor, runEnd)) {
        const run = ordered.slice(cursor, runEnd);
        const head = run[0];
        const tail = run[run.length - 1];
        const combined = {
          text: run.map((part) => part.text).join(""),
          x: head.x,
          y: head.y,
          w: tail.x + tail.w - head.x,
          h: head.h,
          fontSize: head.fontSize,
          fontName: head.fontName
        };
        originMap.set(combined, run);
        merged.push(combined);
        cursor = runEnd;
      } else {
        merged.push(ordered[cursor]);
        cursor++;
      }
    }
  }
  return { merged, originMap };
}
3667
/**
 * Adds to `usedItems` the original items behind every entry that has a
 * mapping in `originMap`. Entries without a mapping are left untouched.
 *
 * @param {Set} usedItems - Set mutated in place.
 * @param {Map} originMap - Maps a synthetic item to the items it replaced.
 */
function expandUsedItems(usedItems, originMap) {
  // Iterate over a snapshot so items added below are not themselves expanded.
  const snapshot = [...usedItems];
  for (const entry of snapshot) {
    const sources = originMap.get(entry);
    if (!sources) continue;
    for (const source of sources) {
      usedItems.add(source);
    }
  }
}
3675
/**
 * Finds the first row that looks like a table header and derives columns
 * from it.
 *
 * A candidate row must have between MIN_COLS and 6 items, no item longer
 * than 8 characters, at least one Hangul character, no item starting with a
 * bullet-like marker, span at least 40% of the overall text width, and have
 * at least one gap of 2.5x the average font size. It is accepted when at
 * least MIN_ROWS of the following rows match MIN_COLS or more header columns
 * (counting stops once MIN_ROWS + 2 matches are found).
 *
 * @param rows - Rows with an `items` array, in reading order.
 * @returns {{columns: Array<{x: number, count: number}>, headerIdx: number}|null}
 */
function detectHeaderRow(rows) {
  const everyItem = rows.flatMap((row) => row.items);
  if (everyItem.length === 0) return null;

  let leftMost = Infinity;
  let rightMost = -Infinity;
  for (const item of everyItem) {
    if (item.x < leftMost) leftMost = item.x;
    const rightEdge = item.x + item.w;
    if (rightEdge > rightMost) rightMost = rightEdge;
  }
  const pageSpan = rightMost - leftMost;
  if (pageSpan <= 0) return null;

  for (const [headerIdx, row] of rows.entries()) {
    const cells = row.items;
    if (cells.length < MIN_COLS || cells.length > 6) continue;
    if (cells.some((cell) => cell.text.length > 8)) continue;
    if (!cells.some((cell) => /[가-힣]/.test(cell.text))) continue;
    if (cells.some((cell) => /^[□■○●·※▶▷◆◇\-]/.test(cell.text))) continue;

    const sorted = [...cells].sort((a, b) => a.x - b.x);
    const rightCell = sorted[sorted.length - 1];
    const xSpan = rightCell.x + rightCell.w - sorted[0].x;
    if (xSpan / pageSpan < 0.4) continue;

    const avgFs = sorted.reduce((sum, cell) => sum + cell.fontSize, 0) / sorted.length;
    const hasLargeGap = sorted.some(
      (cell, k) => k > 0 && cell.x - (sorted[k - 1].x + sorted[k - 1].w) >= avgFs * 2.5
    );
    if (!hasLargeGap) continue;

    const columns = sorted.map((cell) => ({ x: cell.x, count: 0 }));
    // Look ahead only as far as needed to confirm the header.
    let matchCount = 0;
    for (let next = headerIdx + 1; next < rows.length && matchCount < MIN_ROWS + 2; next++) {
      if (countMatchedColumnsRange(rows[next], columns, sorted) >= MIN_COLS) matchCount++;
    }
    if (matchCount < MIN_ROWS) continue;
    return { columns, headerIdx };
  }
  return null;
}
3716
/**
 * Folds continuation lines into the row above them.
 *
 * A row is treated as a continuation when it is vertically within 1.8x the
 * average font size of the previous kept row, has at most two items, and
 * either matches fewer than MIN_COLS columns or has exactly one item. The
 * merged row keeps the previous row's `y` and concatenates both item lists.
 *
 * @param rows - Rows with `y` and `items`.
 * @param columns - Column descriptors forwarded to `countMatchedColumns`.
 * @returns {Array} The input array itself when it has one row or fewer,
 *   otherwise a new array of rows.
 */
function mergeMultiLineRows(rows, columns) {
  if (rows.length <= 1) return rows;

  // Average font size over every item; 12 is the fallback for item-less rows.
  let fontSizeSum = 0;
  let fontSizeCount = 0;
  for (const row of rows) {
    for (const item of row.items) {
      fontSizeSum += item.fontSize;
      fontSizeCount++;
    }
  }
  const avgFontSize = fontSizeCount > 0 ? fontSizeSum / fontSizeCount : 12;

  const result = [rows[0]];
  for (const curr of rows.slice(1)) {
    const prev = result[result.length - 1];
    const yGap = Math.abs(prev.y - curr.y);
    const matchedCols = countMatchedColumns(curr, columns);
    const isClose = yGap < avgFontSize * 1.8;
    const isSparse = curr.items.length <= 2;
    const looksLikeContinuation = matchedCols < MIN_COLS || curr.items.length === 1;
    if (isClose && isSparse && looksLikeContinuation) {
      result[result.length - 1] = { y: prev.y, items: [...prev.items, ...curr.items] };
    } else {
      result.push(curr);
    }
  }
  return result;
}
3401
3737
  function groupByBaseline(items) {
3402
3738
  if (items.length === 0) return [];
3403
3739
  const sorted = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
@@ -3419,8 +3755,9 @@ function groupByBaseline(items) {
3419
3755
  function hasSuspiciousGaps(row) {
3420
3756
  if (row.items.length < 2) return false;
3421
3757
  const sorted = [...row.items].sort((a, b) => a.x - b.x);
3758
+ if (sorted.length === 2 && sorted[1].text.length > 20) return false;
3422
3759
  const avgFontSize = sorted.reduce((s, i) => s + i.fontSize, 0) / sorted.length;
3423
- const minGap = avgFontSize * MIN_GAP_FACTOR;
3760
+ const minGap = Math.max(avgFontSize * MIN_GAP_FACTOR, MIN_GAP_ABSOLUTE);
3424
3761
  for (let i = 1; i < sorted.length; i++) {
3425
3762
  const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
3426
3763
  if (gap >= minGap) return true;
@@ -3447,6 +3784,41 @@ function extractColumnClusters(rows) {
3447
3784
  const minCount = Math.max(2, Math.floor(rows.length * MIN_COL_FILL_RATIO));
3448
3785
  return clusters.filter((c) => c.count >= minCount).sort((a, b) => a.x - b.x);
3449
3786
  }
3787
/**
 * Splits rows into table regions using header-derived column ranges.
 *
 * Rows matching at least MIN_COLS header columns extend the current region.
 * A non-matching row is tolerated inside a region when it has at most two
 * items or directly follows a matching row. Any other row closes the region
 * and is itself discarded. Closed regions have trailing non-matching rows
 * trimmed and are kept only if at least MIN_ROWS rows remain.
 *
 * @param allRows - Rows with an `items` array.
 * @param columns - Column descriptors forwarded to `countMatchedColumnsRange`.
 * @param headerItems - Header items forwarded to `countMatchedColumnsRange`.
 * @returns {Array<{rows: Array}>} Detected regions in input order.
 */
function findTableRegionsByHeader(allRows, columns, headerItems) {
  const regions = [];
  const matchesHeader = (row) => countMatchedColumnsRange(row, columns, headerItems) >= MIN_COLS;

  let pending = [];
  let missStreak = 0;
  // Trim tolerated-but-unmatched rows off the tail, then emit if large enough.
  const flush = () => {
    while (pending.length > 0 && !matchesHeader(pending[pending.length - 1])) {
      pending.pop();
    }
    if (pending.length >= MIN_ROWS) {
      regions.push({ rows: pending });
    }
    pending = [];
    missStreak = 0;
  };

  for (const row of allRows) {
    if (matchesHeader(row)) {
      pending.push(row);
      missStreak = 0;
      continue;
    }
    const tolerated = pending.length > 0 && (row.items.length <= 2 || missStreak === 0);
    if (tolerated) {
      pending.push(row);
      missStreak++;
    } else {
      flush();
    }
  }
  flush();
  return regions;
}
3450
3822
  function findTableRegions(allRows, columns) {
3451
3823
  const regions = [];
3452
3824
  let currentRegion = [];
@@ -3482,18 +3854,81 @@ function countMatchedColumns(row, columns) {
3482
3854
  }
3483
3855
  return matched.size;
3484
3856
  }
3485
- function assignToColumn(item, columns) {
3486
- const MAX_DIST = COL_CLUSTER_TOL * 3;
3487
- let bestCol = -1;
3488
- let bestDist = Infinity;
3489
- for (let ci = 0; ci < columns.length; ci++) {
3490
- const dist = Math.abs(item.x - columns[ci].x);
3491
- if (dist < bestDist) {
3492
- bestDist = dist;
3493
- bestCol = ci;
3857
/**
 * Counts how many distinct header columns contain at least one item of `row`.
 *
 * Column ranges are derived from `headerItems`: the boundary between two
 * neighbouring headers is the midpoint between the left header's right edge
 * and the right header's left edge. The first column is open to the left and
 * the last column is open to the right, so items positioned outside the
 * header's extent (including negative x) still fall into an outer column.
 * An item is placed by its left edge `x`.
 *
 * @param row - Row with an `items` array; each item exposes `x`.
 * @param columns - Unused; kept so the signature stays compatible with callers.
 * @param headerItems - Header items with `x` and `w`; presumably sorted by
 *   `x` ascending — TODO confirm against callers.
 * @returns {number} Number of distinct columns hit by the row's items.
 */
function countMatchedColumnsRange(row, columns, headerItems) {
  const lastIdx = headerItems.length - 1;
  const boundaries = headerItems.map((header, ci) => ({
    // -Infinity mirrors the open-ended right bound of the last column.
    left: ci === 0 ? -Infinity : (headerItems[ci - 1].x + headerItems[ci - 1].w + header.x) / 2,
    right: ci === lastIdx ? Infinity : (header.x + header.w + headerItems[ci + 1].x) / 2
  }));
  const matched = /* @__PURE__ */ new Set();
  for (const item of row.items) {
    const ci = boundaries.findIndex((range) => item.x >= range.left && item.x < range.right);
    if (ci >= 0) matched.add(ci);
  }
  return matched.size;
}
3875
/**
 * Splits a row's items into groups at its widest gaps and assigns each
 * group to a column.
 *
 * Items are sorted by `x`. Gaps of at least the threshold (12, or
 * `max(2.5 * median gap, 12)` when the row has more than `numCols + 1`
 * items) are split candidates; the `numCols - 1` widest are used. Groups are
 * matched to columns greedily by ascending distance between the group's
 * horizontal midpoint and the column's `x`, one group per column. Any group
 * left over goes to its nearest column.
 *
 * @param items - Row items with `x` and `w`.
 * @param columns - Column descriptors with `x`.
 * @param numCols - Number of columns to assign into.
 * @returns {Array<{col: number, items: Array}>} Assignments in the order
 *   they were made (closest pair first), not in column order.
 */
function assignRowItems(items, columns, numCols) {
  if (items.length === 0) return [];
  const ordered = [...items].sort((a, b) => a.x - b.x);
  const columnXs = columns.map((column) => column.x);

  // Gap k sits immediately before ordered[k + 1].
  const gaps = ordered.slice(1).map((item, k) => ({
    idx: k + 1,
    size: item.x - (ordered[k].x + ordered[k].w)
  }));
  const ascendingSizes = gaps.map((gap) => gap.size).sort((a, b) => a - b);
  const medianGap = ascendingSizes.length > 0 ? ascendingSizes[Math.floor(ascendingSizes.length / 2)] : 0;
  const gapThreshold = ordered.length <= numCols + 1 ? 12 : Math.max(medianGap * 2.5, 12);
  const splitPoints = gaps
    .filter((gap) => gap.size >= gapThreshold)
    .sort((a, b) => b.size - a.size)
    .slice(0, numCols - 1)
    .sort((a, b) => a.idx - b.idx)
    .map((gap) => gap.idx);

  const groups = [];
  let from = 0;
  for (const splitAt of splitPoints) {
    groups.push(ordered.slice(from, splitAt));
    from = splitAt;
  }
  groups.push(ordered.slice(from));

  const groupCenters = groups.map((group) => {
    let left = Infinity;
    let right = -Infinity;
    for (const member of group) {
      if (member.x < left) left = member.x;
      const edge = member.x + member.w;
      if (edge > right) right = edge;
    }
    return (left + right) / 2;
  });
  const distance = (gi, ci) => Math.abs(groupCenters[gi] - columnXs[ci]);

  // Every (group, column) pairing, closest first.
  const candidates = [];
  for (let gi = 0; gi < groups.length; gi++) {
    for (let ci = 0; ci < numCols; ci++) {
      candidates.push({ gi, ci, dist: distance(gi, ci) });
    }
  }
  candidates.sort((a, b) => a.dist - b.dist);

  const result = [];
  const takenCols = /* @__PURE__ */ new Set();
  const placedGroups = /* @__PURE__ */ new Set();
  for (const { gi, ci } of candidates) {
    if (placedGroups.has(gi) || takenCols.has(ci)) continue;
    result.push({ col: ci, items: groups[gi] });
    placedGroups.add(gi);
    takenCols.add(ci);
  }

  // Groups that found no free column share the nearest one.
  groups.forEach((group, gi) => {
    if (placedGroups.has(gi)) return;
    let bestCol = 0;
    let bestDist = Infinity;
    for (let ci = 0; ci < numCols; ci++) {
      const d = distance(gi, ci);
      if (d < bestDist) {
        bestDist = d;
        bestCol = ci;
      }
    }
    result.push({ col: bestCol, items: group });
  });
  return result;
}
3498
3933
  function buildClusterTable(rows, columns, pageNum) {
3499
3934
  const numCols = columns.length;
@@ -3511,12 +3946,12 @@ function buildClusterTable(rows, columns, pageNum) {
3511
3946
  usedItems.add(row.items[0]);
3512
3947
  continue;
3513
3948
  }
3514
- for (const item of row.items) {
3515
- const col = assignToColumn(item, columns);
3516
- if (col < 0) continue;
3949
+ const assignments = assignRowItems(row.items, columns, numCols);
3950
+ for (const { col, items } of assignments) {
3951
+ const text = items.map((i) => i.text).join(" ");
3517
3952
  const existing = cells[r][col].text;
3518
- cells[r][col].text = existing ? existing + " " + item.text : item.text;
3519
- usedItems.add(item);
3953
+ cells[r][col].text = existing ? existing + " " + text : text;
3954
+ for (const item of items) usedItems.add(item);
3520
3955
  }
3521
3956
  }
3522
3957
  let emptyRows = 0;
@@ -3528,11 +3963,48 @@ function buildClusterTable(rows, columns, pageNum) {
3528
3963
  const hasValue = cells.some((row) => row[c].text !== "");
3529
3964
  if (!hasValue) return null;
3530
3965
  }
3966
+ for (let r = numRows - 1; r >= 1; r--) {
3967
+ const nonEmptyCols = cells[r].filter((c) => c.text.trim()).length;
3968
+ if (nonEmptyCols !== 1) continue;
3969
+ if (cells[r][0].text.trim() !== "") continue;
3970
+ const contentText = cells[r].find((c) => c.text.trim())?.text.trim() || "";
3971
+ if (/^[○●▶\-·]/.test(contentText)) continue;
3972
+ for (let pr = r - 1; pr >= 0; pr--) {
3973
+ if (cells[pr].some((c) => c.text.trim())) {
3974
+ for (let c = 0; c < numCols; c++) {
3975
+ const prev = cells[pr][c].text.trim();
3976
+ const curr = cells[r][c].text.trim();
3977
+ if (curr) cells[pr][c].text = prev ? prev + " " + curr : curr;
3978
+ }
3979
+ for (let c = 0; c < numCols; c++) cells[r][c].text = "";
3980
+ break;
3981
+ }
3982
+ }
3983
+ }
3984
+ for (let r = 0; r < cells.length - 1; r++) {
3985
+ const row = cells[r];
3986
+ const hasCol0 = row[0].text.trim() !== "";
3987
+ const hasColLast = numCols > 1 && row[numCols - 1].text.trim() !== "";
3988
+ const midEmpty = row.slice(1, numCols - 1).every((c) => c.text.trim() === "");
3989
+ if (hasCol0 && hasColLast && midEmpty) {
3990
+ const next = cells[r + 1];
3991
+ if (next[0].text.trim() === "" && next.some((c) => c.text.trim())) {
3992
+ for (let c = 1; c < numCols; c++) {
3993
+ const curr = next[c].text.trim();
3994
+ if (curr) row[c].text = row[c].text.trim() ? row[c].text.trim() + " " + curr : curr;
3995
+ }
3996
+ for (let c = 0; c < numCols; c++) next[c].text = "";
3997
+ }
3998
+ }
3999
+ }
4000
+ const filteredCells = cells.filter((row) => row.some((c) => c.text.trim()));
4001
+ const finalRowCount = filteredCells.length;
4002
+ if (finalRowCount < MIN_ROWS) return null;
3531
4003
  const irTable = {
3532
- rows: numRows,
4004
+ rows: finalRowCount,
3533
4005
  cols: numCols,
3534
- cells,
3535
- hasHeader: numRows > 1
4006
+ cells: filteredCells,
4007
+ hasHeader: finalRowCount > 1
3536
4008
  };
3537
4009
  const allItems = rows.flatMap((r) => r.items);
3538
4010
  let minX = Infinity, minY = Infinity, maxX = -Infinity, maxY = -Infinity;
@@ -3609,7 +4081,7 @@ async function parsePdfDocument(buffer, options) {
3609
4081
  const effectivePageCount = Math.min(pageCount, MAX_PAGES);
3610
4082
  const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
3611
4083
  const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
3612
- const allFontSizes = [];
4084
+ const fontSizeFreq = /* @__PURE__ */ new Map();
3613
4085
  const pageHeights = /* @__PURE__ */ new Map();
3614
4086
  let parsedPages = 0;
3615
4087
  for (let i = 1; i <= effectivePageCount; i++) {
@@ -3626,7 +4098,7 @@ async function parsePdfDocument(buffer, options) {
3626
4098
  warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
3627
4099
  }
3628
4100
  for (const item of visible) {
3629
- if (item.fontSize > 0) allFontSizes.push(item.fontSize);
4101
+ if (item.fontSize > 0) fontSizeFreq.set(item.fontSize, (fontSizeFreq.get(item.fontSize) || 0) + 1);
3630
4102
  }
3631
4103
  const opList = await page.getOperatorList();
3632
4104
  const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
@@ -3665,10 +4137,9 @@ async function parsePdfDocument(buffer, options) {
3665
4137
  blocks.splice(removed[ri], 1);
3666
4138
  }
3667
4139
  }
3668
- const medianFontSize = computeMedianFontSize(allFontSizes);
4140
+ const medianFontSize = computeMedianFontSizeFromFreq(fontSizeFreq);
3669
4141
  if (medianFontSize > 0) {
3670
4142
  detectHeadings(blocks, medianFontSize);
3671
- mergeAdjacentHeadings(blocks);
3672
4143
  }
3673
4144
  detectMarkerHeadings(blocks);
3674
4145
  const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
@@ -3730,11 +4201,18 @@ function filterHiddenText(items, pageWidth, pageHeight) {
3730
4201
  }
3731
4202
  return { visible, hiddenCount };
3732
4203
  }
3733
- function computeMedianFontSize(sizes) {
3734
- if (sizes.length === 0) return 0;
3735
- const sorted = [...sizes].sort((a, b) => a - b);
3736
- const mid = Math.floor(sorted.length / 2);
3737
- return sorted.length % 2 === 0 ? (sorted[mid - 1] + sorted[mid]) / 2 : sorted[mid];
4204
/**
 * Computes the median font size from a frequency table.
 *
 * For an odd number of samples this is the middle sample. For an even number
 * it is the mean of the two middle samples, which is what the array-based
 * median gives for the same data.
 *
 * @param {Map<number, number>} freq - Maps a font size to its occurrence count.
 * @returns {number} The median, or 0 when `freq` is empty. If the counts do
 *   not add up to a positive total, the largest size is returned.
 */
function computeMedianFontSizeFromFreq(freq) {
  if (freq.size === 0) return 0;
  let total = 0;
  for (const count of freq.values()) total += count;
  const sorted = [...freq.entries()].sort((a, b) => a[0] - b[0]);
  // Zero-based ranks of the two middle samples; they coincide for odd totals.
  const lowerRank = Math.floor((total - 1) / 2);
  const upperRank = Math.floor(total / 2);
  let lowerSize;
  let cumulative = 0;
  for (const [size, count] of sorted) {
    cumulative += count;
    if (lowerSize === undefined && cumulative > lowerRank) lowerSize = size;
    if (cumulative > upperRank) return (lowerSize + size) / 2;
  }
  return sorted[sorted.length - 1][0];
}
3739
4217
  function detectHeadings(blocks, medianFontSize) {
3740
4218
  for (const block of blocks) {
@@ -3754,220 +4232,27 @@ function detectHeadings(blocks, medianFontSize) {
3754
4232
  }
3755
4233
  }
3756
4234
  }
3757
- function mergeAdjacentHeadings(blocks) {
3758
- let i = 0;
3759
- while (i < blocks.length - 1) {
3760
- const curr = blocks[i];
3761
- const next = blocks[i + 1];
3762
- if (curr.type !== "heading" || next.type !== "heading") {
3763
- i++;
3764
- continue;
3765
- }
3766
- if (!curr.bbox || !next.bbox || !curr.text || !next.text) {
3767
- i++;
3768
- continue;
3769
- }
3770
- const currBaseline = curr.bbox.y + (curr.style?.fontSize || curr.bbox.height);
3771
- const nextBaseline = next.bbox.y + (next.style?.fontSize || next.bbox.height);
3772
- const yDiff = Math.abs(currBaseline - nextBaseline);
3773
- const maxFs = Math.max(curr.style?.fontSize || 12, next.style?.fontSize || 12);
3774
- const sameY = curr.bbox.page === next.bbox.page && yDiff < maxFs * 1.5;
3775
- const sameLevel = curr.level === next.level;
3776
- if (sameY && sameLevel) {
3777
- const currX = curr.bbox.x;
3778
- const nextX = next.bbox.x;
3779
- if (currX <= nextX) {
3780
- curr.text = curr.text + " " + next.text;
3781
- } else {
3782
- curr.text = next.text + " " + curr.text;
3783
- }
3784
- curr.bbox = {
3785
- page: curr.bbox.page,
3786
- x: Math.min(curr.bbox.x, next.bbox.x),
3787
- y: Math.min(curr.bbox.y, next.bbox.y),
3788
- width: Math.max(curr.bbox.x + curr.bbox.width, next.bbox.x + next.bbox.width) - Math.min(curr.bbox.x, next.bbox.x),
3789
- height: Math.max(curr.bbox.height, next.bbox.height)
3790
- };
3791
- blocks.splice(i + 1, 1);
3792
- } else {
3793
- i++;
3794
- }
3795
- }
3796
- }
3797
4235
  function collapseEvenSpacing(text) {
3798
4236
  const tokens = text.split(" ");
3799
4237
  const singleCharCount = tokens.filter((t) => t.length === 1).length;
3800
4238
  if (tokens.length >= 3 && singleCharCount / tokens.length >= 0.7) {
3801
4239
  return tokens.join("");
3802
4240
  }
3803
- return text;
3804
- }
3805
- function buildXyCutBlocks(items, pageNum) {
3806
- const allY = items.map((i) => i.y);
3807
- const pageHeight = Math.max(...allY) - Math.min(...allY);
3808
- const gapThreshold = Math.max(15, pageHeight * 0.03);
3809
- const orderedGroups = xyCutOrder(items, gapThreshold);
3810
- const blocks = [];
3811
- for (const group of orderedGroups) {
3812
- if (group.length === 0) continue;
3813
- const yLines = groupByY(group);
3814
- for (const line of yLines) {
3815
- const text = mergeLineSimple(line);
3816
- if (!text.trim()) continue;
3817
- blocks.push({
3818
- type: "paragraph",
3819
- text,
3820
- pageNumber: pageNum,
3821
- bbox: computeBBox(line, pageNum),
3822
- style: dominantStyle(line)
3823
- });
3824
- }
3825
- }
3826
- return blocks.length > 0 ? blocks : null;
3827
- }
3828
- function normalizeUnderSegmentedTable(table, items, pageNum, bbox) {
3829
- const totalCells = table.cells.reduce((sum, row) => sum + row.filter((c) => c.text.trim()).length, 0);
3830
- const totalTextLines = table.cells.reduce((sum, row) => sum + row.reduce((s, c) => s + (c.text.trim() ? c.text.split("\n").length : 0), 0), 0);
3831
- const isUnderSegmented = table.rows === 1 && table.cols === 1 || totalCells <= 2 && totalTextLines >= 8 || totalCells <= 2 && items.length >= 6;
3832
- if (!isUnderSegmented) return null;
3833
- if (hasMultiColumnLayout(items)) return buildXyCutBlocks(items, pageNum);
3834
- const directTable = buildTableFromTextLayout(items, pageNum, bbox);
3835
- if (directTable) return directTable;
3836
- const clusterItems = items.map((i) => ({
3837
- text: i.text,
3838
- x: i.x,
3839
- y: i.y,
3840
- w: i.w,
3841
- h: i.h,
3842
- fontSize: i.fontSize,
3843
- fontName: i.fontName
3844
- }));
3845
- const clusterResults = detectClusterTables(clusterItems, pageNum);
3846
- if (clusterResults.length > 0) {
3847
- const blocks = [];
3848
- const ciToIdx = /* @__PURE__ */ new Map();
3849
- for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
3850
- const usedIndices = /* @__PURE__ */ new Set();
3851
- for (const cr of clusterResults) {
3852
- for (const ci of cr.usedItems) {
3853
- const idx = ciToIdx.get(ci);
3854
- if (idx !== void 0) usedIndices.add(idx);
3855
- }
3856
- blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
3857
- }
3858
- const remaining = items.filter((_, idx) => !usedIndices.has(idx));
3859
- for (const item of remaining) {
3860
- if (!item.text.trim()) continue;
3861
- blocks.push({
3862
- type: "paragraph",
3863
- text: item.text,
3864
- pageNumber: pageNum,
3865
- bbox: computeBBox([item], pageNum),
3866
- style: { fontSize: item.fontSize, fontName: item.fontName }
3867
- });
3868
- }
3869
- blocks.sort((a, b) => {
3870
- const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
3871
- const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
3872
- return by - ay;
3873
- });
3874
- return blocks.length > 0 ? blocks : null;
3875
- }
3876
- return null;
3877
- }
3878
- function buildTableFromTextLayout(items, pageNum, bbox) {
3879
- if (items.length < 4) return null;
3880
- const sorted = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
3881
- const yTol = 3;
3882
- const rows = [];
3883
- let curRow = [sorted[0]];
3884
- let curY = sorted[0].y;
3885
- for (let i = 1; i < sorted.length; i++) {
3886
- if (Math.abs(sorted[i].y - curY) <= yTol) {
3887
- curRow.push(sorted[i]);
3888
- } else {
3889
- rows.push(curRow);
3890
- curRow = [sorted[i]];
3891
- curY = sorted[i].y;
3892
- }
3893
- }
3894
- rows.push(curRow);
3895
- if (rows.length < 2) return null;
3896
- const gapPositions = [];
3897
- for (const row of rows) {
3898
- if (row.length < 2) continue;
3899
- const sortedX = [...row].sort((a, b) => a.x - b.x);
3900
- const avgFs = sortedX.reduce((s, i) => s + i.fontSize, 0) / sortedX.length;
3901
- for (let j = 1; j < sortedX.length; j++) {
3902
- const gap = sortedX[j].x - (sortedX[j - 1].x + sortedX[j - 1].w);
3903
- if (gap >= avgFs * 1.5) {
3904
- gapPositions.push(sortedX[j - 1].x + sortedX[j - 1].w + gap / 2);
3905
- }
3906
- }
3907
- }
3908
- if (gapPositions.length < 2) return null;
3909
- gapPositions.sort((a, b) => a - b);
3910
- const colBoundaries = [];
3911
- let clusterSum = gapPositions[0], clusterCount = 1;
3912
- for (let i = 1; i < gapPositions.length; i++) {
3913
- const avg = clusterSum / clusterCount;
3914
- if (Math.abs(gapPositions[i] - avg) <= 15) {
3915
- clusterSum += gapPositions[i];
3916
- clusterCount++;
3917
- } else {
3918
- if (clusterCount >= 2) colBoundaries.push(clusterSum / clusterCount);
3919
- clusterSum = gapPositions[i];
3920
- clusterCount = 1;
3921
- }
3922
- }
3923
- if (clusterCount >= 2) colBoundaries.push(clusterSum / clusterCount);
3924
- if (colBoundaries.length === 0) return null;
3925
- const numCols = colBoundaries.length + 1;
3926
- const tableRows = [];
3927
- for (const row of rows) {
3928
- const cells = Array(numCols).fill("");
3929
- const sortedX = [...row].sort((a, b) => a.x - b.x);
3930
- for (const item of sortedX) {
3931
- const cx = item.x + item.w / 2;
3932
- let col = 0;
3933
- for (let b = 0; b < colBoundaries.length; b++) {
3934
- if (cx > colBoundaries[b]) col = b + 1;
3935
- }
3936
- cells[col] = cells[col] ? cells[col] + " " + item.text : item.text;
3937
- }
3938
- if (cells[0].trim() === "" && tableRows.length > 0) {
3939
- const prevCells = tableRows[tableRows.length - 1].cells;
3940
- for (let c = 0; c < numCols; c++) {
3941
- if (cells[c].trim()) {
3942
- prevCells[c] = prevCells[c] ? prevCells[c] + " " + cells[c].trim() : cells[c].trim();
3943
- }
3944
- }
3945
- } else {
3946
- tableRows.push({ cells });
3947
- }
3948
- }
3949
- if (tableRows.length < 2) return null;
3950
- const nonEmptyCount = tableRows.reduce((sum, r) => sum + r.cells.filter((c) => c.trim()).length, 0);
3951
- const totalCount = tableRows.length * numCols;
3952
- if (nonEmptyCount < totalCount * 0.3) return null;
3953
- const irCells = tableRows.map(
3954
- (r) => r.cells.map((text, colIdx) => {
3955
- let cleaned = text.trim();
3956
- if (colIdx > 0) cleaned = cleaned.replace(/^[•○·\-]\s*/, "");
3957
- return { text: cleaned, colSpan: 1, rowSpan: 1 };
3958
- })
4241
+ return text.replace(
4242
+ /(?<![가-힣])[가-힣](?: [가-힣\d]){2,}(?![가-힣])/g,
4243
+ (match) => match.replace(/ /g, "")
3959
4244
  );
3960
- const irTable = {
3961
- rows: tableRows.length,
3962
- cols: numCols,
3963
- cells: irCells,
3964
- hasHeader: tableRows.length > 1
3965
- };
3966
- return [{ type: "table", table: irTable, pageNumber: pageNum, bbox }];
3967
4245
  }
3968
4246
  function shouldDemoteTable(table) {
3969
4247
  const allCells = table.cells.flatMap((row) => row.map((c) => c.text.trim())).filter(Boolean);
3970
4248
  const allText = allCells.join(" ");
4249
+ if (table.rows <= 3 && table.cols <= 3) {
4250
+ const totalCells2 = table.rows * table.cols;
4251
+ const emptyCells2 = totalCells2 - allCells.length;
4252
+ if (emptyCells2 >= totalCells2 * 0.3) return true;
4253
+ if (/[□■◆○●▶ㅇ]/.test(allText)) return true;
4254
+ if (/<[^>]+>/.test(allText)) return true;
4255
+ }
3971
4256
  if (allText.length > 200) return false;
3972
4257
  if (/[□■◆○●▶]/.test(allText) && table.rows <= 3) return true;
3973
4258
  const totalCells = table.rows * table.cols;
@@ -4011,32 +4296,6 @@ function detectMarkerHeadings(blocks) {
4011
4296
  }
4012
4297
  }
4013
4298
  }
4014
- function hasMultiColumnLayout(items) {
4015
- if (items.length < 30) return false;
4016
- const sorted = [...items].sort((a, b) => a.x - b.x);
4017
- const minX = sorted[0].x;
4018
- let maxX = minX;
4019
- for (const i of sorted) if (i.x + i.w > maxX) maxX = i.x + i.w;
4020
- const pageWidth = maxX - minX;
4021
- if (pageWidth < 200) return false;
4022
- let bestGap = 0;
4023
- let bestSplit = 0;
4024
- for (let j = 1; j < sorted.length; j++) {
4025
- const gap = sorted[j].x - (sorted[j - 1].x + sorted[j - 1].w);
4026
- if (gap > bestGap) {
4027
- bestGap = gap;
4028
- bestSplit = (sorted[j - 1].x + sorted[j - 1].w + sorted[j].x) / 2;
4029
- }
4030
- }
4031
- if (bestGap < 20) return false;
4032
- const splitRatio = (bestSplit - minX) / pageWidth;
4033
- if (splitRatio < 0.35 || splitRatio > 0.65) return false;
4034
- const leftCount = items.filter((i) => i.x + i.w / 2 < bestSplit).length;
4035
- const rightCount = items.filter((i) => i.x + i.w / 2 >= bestSplit).length;
4036
- if (leftCount < 15 || rightCount < 15) return false;
4037
- if (Math.min(leftCount, rightCount) / Math.max(leftCount, rightCount) < 0.35) return false;
4038
- return true;
4039
- }
4040
4299
  var MAX_XYCUT_DEPTH = 50;
4041
4300
  function xyCutOrder(items, gapThreshold, depth = 0) {
4042
4301
  if (items.length === 0) return [];
@@ -4104,6 +4363,7 @@ function extractPageBlocksWithLines(items, pageNum, opList, pageWidth, pageHeigh
4104
4363
  if (items.length === 0) return [];
4105
4364
  let { horizontals, verticals } = extractLines(opList.fnArray, opList.argsArray);
4106
4365
  ({ horizontals, verticals } = filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight));
4366
+ ({ horizontals, verticals } = preprocessLines(horizontals, verticals));
4107
4367
  const grids = buildTableGrids(horizontals, verticals);
4108
4368
  if (grids.length > 0) {
4109
4369
  return extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals);
@@ -4115,14 +4375,19 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
4115
4375
  const usedItems = /* @__PURE__ */ new Set();
4116
4376
  const sortedGrids = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
4117
4377
  for (const grid of sortedGrids) {
4378
+ const numGridRows = grid.rowYs.length - 1;
4379
+ const numGridCols = grid.colXs.length - 1;
4380
+ if (numGridRows === 1 && numGridCols >= 2) continue;
4118
4381
  const tableItems = [];
4119
4382
  const pad = 3;
4383
+ const gridW = grid.bbox.x2 - grid.bbox.x1;
4120
4384
  for (const item of items) {
4121
4385
  if (usedItems.has(item)) continue;
4122
- if (item.x >= grid.bbox.x1 - pad && item.x + item.w <= grid.bbox.x2 + pad && item.y >= grid.bbox.y1 - pad && item.y <= grid.bbox.y2 + pad) {
4123
- tableItems.push(item);
4124
- usedItems.add(item);
4125
- }
4386
+ if (item.y < grid.bbox.y1 - pad || item.y > grid.bbox.y2 + pad) continue;
4387
+ if (item.x < grid.bbox.x1 - pad || item.x + item.w > grid.bbox.x2 + pad) continue;
4388
+ if (gridW < 120 && item.x + item.w > grid.bbox.x2 - 2) continue;
4389
+ tableItems.push(item);
4390
+ usedItems.add(item);
4126
4391
  }
4127
4392
  const cells = extractCells(grid, horizontals, verticals);
4128
4393
  if (cells.length === 0) continue;
@@ -4146,6 +4411,7 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
4146
4411
  const cellItems = cellTextMap.get(cell) || [];
4147
4412
  let text = cellTextToString(cellItems);
4148
4413
  text = text.replace(/^[\s]*[-–—]\s*\d+\s*[-–—][\s]*$/gm, "").trim();
4414
+ text = text.split("\n").map((line) => collapseEvenSpacing(line)).join("\n");
4149
4415
  irGrid[cell.row][cell.col] = {
4150
4416
  text,
4151
4417
  colSpan: cell.colSpan,
@@ -4167,31 +4433,61 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
4167
4433
  width: grid.bbox.x2 - grid.bbox.x1,
4168
4434
  height: grid.bbox.y2 - grid.bbox.y1
4169
4435
  };
4170
- const normalized = normalizeUnderSegmentedTable(irTable, tableItems, pageNum, tableBbox);
4171
- if (normalized) {
4172
- blocks.push(...normalized);
4173
- continue;
4174
- }
4175
4436
  if (shouldDemoteTable(irTable)) {
4176
4437
  const demoted = demoteTableToText(irTable);
4177
4438
  if (demoted) {
4178
- blocks.push({ type: "paragraph", text: demoted, pageNumber: pageNum, bbox: tableBbox, style: dominantStyle(tableItems) });
4439
+ const text = numGridRows === 1 ? "\n" + demoted + "\n" : demoted;
4440
+ blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox: tableBbox, style: dominantStyle(tableItems) });
4179
4441
  }
4180
4442
  continue;
4181
4443
  }
4182
4444
  blocks.push({ type: "table", table: irTable, pageNumber: pageNum, bbox: tableBbox });
4183
4445
  }
4184
- const remaining = items.filter((i) => !usedItems.has(i));
4446
+ let remaining = items.filter((i) => !usedItems.has(i));
4185
4447
  if (remaining.length > 0) {
4186
4448
  remaining.sort((a, b) => b.y - a.y || a.x - b.x);
4187
- const textBlocks = detectListBlocks(extractPageBlocksFallback(remaining, pageNum));
4188
- const allBlocks = [...blocks, ...textBlocks];
4189
- allBlocks.sort((a, b) => {
4449
+ const clusterItems = remaining.map((i) => ({
4450
+ text: i.text,
4451
+ x: i.x,
4452
+ y: i.y,
4453
+ w: i.w,
4454
+ h: i.h,
4455
+ fontSize: i.fontSize,
4456
+ fontName: i.fontName
4457
+ }));
4458
+ const clusterResults = detectClusterTables(clusterItems, pageNum);
4459
+ if (clusterResults.length > 0) {
4460
+ const ciToIdx = /* @__PURE__ */ new Map();
4461
+ for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
4462
+ const usedClusterIndices = /* @__PURE__ */ new Set();
4463
+ for (const cr of clusterResults) {
4464
+ for (const ci of cr.usedItems) {
4465
+ const idx = ciToIdx.get(ci);
4466
+ if (idx !== void 0) usedClusterIndices.add(idx);
4467
+ }
4468
+ blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
4469
+ }
4470
+ remaining = remaining.filter((_, idx) => !usedClusterIndices.has(idx));
4471
+ }
4472
+ if (remaining.length > 0) {
4473
+ const allY = remaining.map((i) => i.y);
4474
+ const pageH = safeMax(allY) - safeMin(allY);
4475
+ const groups = xyCutOrder(remaining, Math.max(15, pageH * 0.03));
4476
+ const textBlocks = [];
4477
+ for (const group of groups) {
4478
+ if (group.length === 0) continue;
4479
+ const groupBlocks = extractPageBlocksFallback(group, pageNum);
4480
+ for (const b of groupBlocks) textBlocks.push(b);
4481
+ }
4482
+ const finalTextBlocks = detectListBlocks(textBlocks);
4483
+ for (const b of finalTextBlocks) blocks.push(b);
4484
+ }
4485
+ blocks.sort((a, b) => {
4190
4486
  const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
4191
4487
  const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
4192
4488
  return by - ay;
4193
4489
  });
4194
- return mergeAdjacentTableBlocks(allBlocks);
4490
+ return mergeAdjacentTableBlocks(blocks);
4195
4491
  }
4196
4492
  return mergeAdjacentTableBlocks(blocks);
4197
4493
  }
@@ -4217,57 +4513,53 @@ function mergeAdjacentTableBlocks(blocks) {
4217
4513
  }
4218
4514
  function extractPageBlocksFallback(items, pageNum) {
4219
4515
  if (items.length === 0) return [];
4220
- if (hasMultiColumnLayout(items)) {
4221
- const xyBlocks = buildXyCutBlocks(items, pageNum) || [];
4222
- return detectSpecialKoreanTables(detectListBlocks(xyBlocks));
4223
- }
4224
4516
  const blocks = [];
4225
- const allYLines = groupByY(items);
4226
- const columns = detectColumns(allYLines);
4227
- if (columns && columns.length >= 3) {
4228
- const tableText = extractWithColumns(allYLines, columns);
4229
- const bbox = computeBBox(items, pageNum);
4230
- blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(items) });
4231
- } else {
4232
- const clusterItems = items.map((i) => ({
4233
- text: i.text,
4234
- x: i.x,
4235
- y: i.y,
4236
- w: i.w,
4237
- h: i.h,
4238
- fontSize: i.fontSize,
4239
- fontName: i.fontName
4240
- }));
4241
- const clusterResults = hasMultiColumnLayout(items) ? [] : detectClusterTables(clusterItems, pageNum);
4242
- if (clusterResults.length > 0) {
4243
- const ciToIdx = /* @__PURE__ */ new Map();
4244
- for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
4245
- const usedIndices = /* @__PURE__ */ new Set();
4246
- for (const cr of clusterResults) {
4247
- for (const ci of cr.usedItems) {
4248
- const idx = ciToIdx.get(ci);
4249
- if (idx !== void 0) usedIndices.add(idx);
4250
- }
4251
- blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
4517
+ const clusterItems = items.map((i) => ({
4518
+ text: i.text,
4519
+ x: i.x,
4520
+ y: i.y,
4521
+ w: i.w,
4522
+ h: i.h,
4523
+ fontSize: i.fontSize,
4524
+ fontName: i.fontName
4525
+ }));
4526
+ const clusterResults = detectClusterTables(clusterItems, pageNum);
4527
+ if (clusterResults.length > 0) {
4528
+ const ciToIdx = /* @__PURE__ */ new Map();
4529
+ for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
4530
+ const usedIndices = /* @__PURE__ */ new Set();
4531
+ for (const cr of clusterResults) {
4532
+ for (const ci of cr.usedItems) {
4533
+ const idx = ciToIdx.get(ci);
4534
+ if (idx !== void 0) usedIndices.add(idx);
4252
4535
  }
4253
- const remaining = items.filter((_, idx) => !usedIndices.has(idx));
4254
- if (remaining.length > 0) {
4255
- const yLines = groupByY(remaining);
4256
- for (const line of yLines) {
4257
- const text = mergeLineSimple(line);
4258
- if (!text.trim()) continue;
4259
- const bbox = computeBBox(line, pageNum);
4260
- blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style: dominantStyle(line) });
4261
- }
4536
+ blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
4537
+ }
4538
+ const remaining = items.filter((_, idx) => !usedIndices.has(idx));
4539
+ if (remaining.length > 0) {
4540
+ const yLines = groupByY(remaining);
4541
+ for (const line of yLines) {
4542
+ const text = mergeLineSimple(line);
4543
+ if (!text.trim()) continue;
4544
+ const bbox = computeBBox(line, pageNum);
4545
+ blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style: dominantStyle(line) });
4262
4546
  }
4263
- blocks.sort((a, b) => {
4264
- const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
4265
- const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
4266
- return by - ay;
4267
- });
4547
+ }
4548
+ blocks.sort((a, b) => {
4549
+ const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
4550
+ const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
4551
+ return by - ay;
4552
+ });
4553
+ } else {
4554
+ const allYLines = groupByY(items);
4555
+ const columns = detectColumns(allYLines);
4556
+ if (columns && columns.length >= 3) {
4557
+ const tableText = extractWithColumns(allYLines, columns);
4558
+ const bbox = computeBBox(items, pageNum);
4559
+ blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(items) });
4268
4560
  } else {
4269
4561
  const allY = items.map((i) => i.y);
4270
- const pageHeight = Math.max(...allY) - Math.min(...allY);
4562
+ const pageHeight = safeMax(allY) - safeMin(allY);
4271
4563
  const gapThreshold = Math.max(15, pageHeight * 0.03);
4272
4564
  const orderedGroups = xyCutOrder(items, gapThreshold);
4273
4565
  for (const group of orderedGroups) {
@@ -4320,22 +4612,76 @@ function dominantStyle(items) {
4320
4612
  return { fontSize: dominantSize, fontName };
4321
4613
  }
4322
4614
  function normalizeItems(rawItems) {
4323
- return rawItems.filter((i) => typeof i.str === "string" && i.str.trim() !== "").map((i) => {
4615
+ const items = [];
4616
+ const spacePositions = [];
4617
+ for (const i of rawItems) {
4618
+ if (typeof i.str !== "string") continue;
4619
+ const x = Math.round(i.transform[4]);
4620
+ const y = Math.round(i.transform[5]);
4621
+ if (!i.str.trim()) {
4622
+ spacePositions.push({ x, y });
4623
+ continue;
4624
+ }
4324
4625
  const scaleY = Math.abs(i.transform[3]);
4325
4626
  const scaleX = Math.abs(i.transform[0]);
4326
4627
  const fontSize = Math.round(Math.max(scaleY, scaleX));
4327
- return {
4328
- text: i.str.trim(),
4329
- x: Math.round(i.transform[4]),
4330
- y: Math.round(i.transform[5]),
4331
- w: Math.round(i.width),
4332
- h: Math.round(i.height),
4333
- fontSize,
4334
- fontName: i.fontName || "",
4335
- // 0pt 폰트이거나 너비 0 → hidden text (prompt injection 의심)
4336
- isHidden: fontSize === 0 || i.width === 0 && i.str.trim().length > 0
4337
- };
4338
- }).sort((a, b) => b.y - a.y || a.x - b.x);
4628
+ const w = Math.round(i.width);
4629
+ const h = Math.round(i.height);
4630
+ const isHidden = fontSize === 0 || i.width === 0 && i.str.trim().length > 0;
4631
+ let text = i.str.trim();
4632
+ if (/^[\d\s\-().·,☎]+$/.test(text) && /\d/.test(text) && / /.test(text)) {
4633
+ text = text.replace(/ /g, "");
4634
+ }
4635
+ const split = splitEvenSpacedItem(text, x, w, fontSize);
4636
+ if (split) {
4637
+ for (const s of split) {
4638
+ items.push({ text: s.text, x: s.x, y, w: s.w, h, fontSize, fontName: i.fontName || "", isHidden });
4639
+ }
4640
+ } else {
4641
+ items.push({ text, x, y, w, h, fontSize, fontName: i.fontName || "", isHidden });
4642
+ }
4643
+ }
4644
+ const sorted = items.sort((a, b) => b.y - a.y || a.x - b.x);
4645
+ const deduped = [];
4646
+ for (let i = 0; i < sorted.length; i++) {
4647
+ let isDup = false;
4648
+ for (let j = deduped.length - 1; j >= 0; j--) {
4649
+ const prev = deduped[j];
4650
+ if (prev.y - sorted[i].y > 3) break;
4651
+ if (Math.abs(prev.y - sorted[i].y) <= 3 && prev.text === sorted[i].text && Math.abs(prev.x - sorted[i].x) <= 3) {
4652
+ isDup = true;
4653
+ break;
4654
+ }
4655
+ }
4656
+ if (!isDup) deduped.push(sorted[i]);
4657
+ }
4658
+ if (spacePositions.length > 0) {
4659
+ for (const item of deduped) {
4660
+ for (const sp of spacePositions) {
4661
+ if (Math.abs(sp.y - item.y) <= 3) {
4662
+ const dist = item.x - sp.x;
4663
+ if (dist >= 0 && dist <= 20) {
4664
+ item.hasSpaceBefore = true;
4665
+ break;
4666
+ }
4667
+ }
4668
+ }
4669
+ }
4670
+ }
4671
+ return deduped;
4672
+ }
4673
+ function splitEvenSpacedItem(text, itemX, itemW, fontSize) {
4674
+ if (!/^[가-힣\d](?: [가-힣\d]){2,}$/.test(text)) return null;
4675
+ const chars = text.split(" ");
4676
+ if (chars.length < 3) return null;
4677
+ const charW = itemW / chars.length;
4678
+ if (charW > fontSize * 2) return null;
4679
+ return chars.map((ch, idx) => ({
4680
+ text: ch,
4681
+ x: Math.round(itemX + idx * charW),
4682
+ w: Math.round(charW * 0.8)
4683
+ // 실제 글자 폭은 간격보다 좁음
4684
+ }));
4339
4685
  }
4340
4686
  function groupByY(items) {
4341
4687
  if (items.length === 0) return [];
@@ -4360,14 +4706,14 @@ function isProseSpread(items) {
4360
4706
  for (let i = 1; i < sorted.length; i++) {
4361
4707
  gaps.push(sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w));
4362
4708
  }
4363
- const maxGap = Math.max(...gaps);
4709
+ const maxGap = safeMax(gaps);
4364
4710
  const avgLen = items.reduce((s, i) => s + i.text.length, 0) / items.length;
4365
4711
  return maxGap < 40 && avgLen < 5;
4366
4712
  }
4367
4713
  function detectColumns(yLines) {
4368
4714
  const allItems = yLines.flat();
4369
4715
  if (allItems.length === 0) return null;
4370
- const pageWidth = Math.max(...allItems.map((i) => i.x + i.w)) - Math.min(...allItems.map((i) => i.x));
4716
+ const pageWidth = safeMax(allItems.map((i) => i.x + i.w)) - safeMin(allItems.map((i) => i.x));
4371
4717
  if (pageWidth < 100) return null;
4372
4718
  let bigoLineIdx = -1;
4373
4719
  for (let i = 0; i < yLines.length; i++) {
@@ -4399,7 +4745,7 @@ function detectColumns(yLines) {
4399
4745
  }
4400
4746
  const peaks = xClusters.filter((c) => c.count >= 3).sort((a, b) => a.minX - b.minX);
4401
4747
  if (peaks.length < 3) return null;
4402
- const MERGE_TOL = 30;
4748
+ const MERGE_TOL = 40;
4403
4749
  const merged = [peaks[0]];
4404
4750
  for (let i = 1; i < peaks.length; i++) {
4405
4751
  const prev = merged[merged.length - 1];
@@ -4413,7 +4759,14 @@ function detectColumns(yLines) {
4413
4759
  merged.push({ ...peaks[i] });
4414
4760
  }
4415
4761
  }
4416
- const columns = merged.filter((c) => c.count >= 3).map((c) => c.minX);
4762
+ const rawColumns = merged.filter((c) => c.count >= 3).map((c) => c.minX);
4763
+ if (rawColumns.length < 3) return null;
4764
+ const MIN_DETECT_COL_WIDTH = 30;
4765
+ const columns = [rawColumns[0]];
4766
+ for (let i = 1; i < rawColumns.length; i++) {
4767
+ if (rawColumns[i] - columns[columns.length - 1] < MIN_DETECT_COL_WIDTH) continue;
4768
+ columns.push(rawColumns[i]);
4769
+ }
4417
4770
  return columns.length >= 3 ? columns : null;
4418
4771
  }
4419
4772
  function findColumn(x, columns) {
@@ -4541,6 +4894,16 @@ function buildGridTable(lines, columns) {
4541
4894
  }
4542
4895
  merged.splice(0, headerEnd, headerRow);
4543
4896
  }
4897
+ for (const row of merged) {
4898
+ for (let c = 0; c < row.length; c++) {
4899
+ if (row[c]) row[c] = collapseEvenSpacing(row[c]);
4900
+ }
4901
+ }
4902
+ const totalCells = merged.length * numCols;
4903
+ const filledCells = merged.reduce((s, row) => s + row.filter((c) => c).length, 0);
4904
+ if (filledCells < totalCells * 0.35 || merged.length < 2 || merged.length <= 3 && numCols >= 7) {
4905
+ return merged.map((r) => r.filter((c) => c).join(" ")).join("\n");
4906
+ }
4544
4907
  const md = [];
4545
4908
  md.push("| " + merged[0].join(" | ") + " |");
4546
4909
  md.push("| " + merged[0].map(() => "---").join(" | ") + " |");
@@ -4552,12 +4915,32 @@ function buildGridTable(lines, columns) {
4552
4915
  function mergeLineSimple(items) {
4553
4916
  if (items.length <= 1) return items[0]?.text || "";
4554
4917
  const sorted = [...items].sort((a, b) => a.x - b.x);
4918
+ const isEvenSpaced = detectEvenSpacedItems(sorted);
4555
4919
  let result = sorted[0].text;
4556
4920
  for (let i = 1; i < sorted.length; i++) {
4557
4921
  const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
4558
4922
  const avgFs = (sorted[i].fontSize + sorted[i - 1].fontSize) / 2;
4559
- if (gap > 15) result += " ";
4560
- else if (gap < avgFs * 0.15) {
4923
+ const tabThreshold = Math.max(avgFs * 2, 30);
4924
+ if (gap > tabThreshold) {
4925
+ result += " ";
4926
+ result += sorted[i].text;
4927
+ continue;
4928
+ }
4929
+ if (isEvenSpaced[i]) {
4930
+ result += sorted[i].text;
4931
+ continue;
4932
+ }
4933
+ if (sorted[i].hasSpaceBefore && gap >= avgFs * 0.05) {
4934
+ result += " ";
4935
+ result += sorted[i].text;
4936
+ continue;
4937
+ }
4938
+ if (/[□■○●▶◆◇ㅇ]$/.test(sorted[i - 1].text) && /^[가-힣]/.test(sorted[i].text) && gap > 1) {
4939
+ result += " ";
4940
+ result += sorted[i].text;
4941
+ continue;
4942
+ }
4943
+ if (gap < avgFs * 0.15) {
4561
4944
  } else if (gap < avgFs * 0.35 && (/[가-힣]$/.test(result) || /^[가-힣]/.test(sorted[i].text))) {
4562
4945
  } else if (gap > 3) result += " ";
4563
4946
  result += sorted[i].text;
@@ -4566,8 +4949,8 @@ function mergeLineSimple(items) {
4566
4949
  }
4567
4950
  function cleanPdfText(text) {
4568
4951
  return mergeKoreanLines(
4569
- text.replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "")
4570
- ).replace(/^(?!\|).{3,30}$/gm, (line) => collapseEvenSpacing(line)).replace(/\n{3,}/g, "\n\n").trim();
4952
+ text.replace(/^\d{1,4}\n/, "").replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "").replace(/^#{1,6}\s*\d{1,4}\s*$/gm, "")
4953
+ ).replace(/^(?!\| ---).*$/gm, (line) => collapseEvenSpacing(line)).replace(/([□■◆○●▶ㅇ])\s+([가-힣])\s+([가-힣])/g, "$1 $2$3").replace(/\n{3,}/g, "\n\n").trim();
4571
4954
  }
4572
4955
  function startsWithMarker(line) {
4573
4956
  const t = line.trimStart();
@@ -4759,7 +5142,7 @@ function mergeKoreanLines(text) {
4759
5142
  result[result.length - 1] = prev + " " + currTrimmed;
4760
5143
  continue;
4761
5144
  }
4762
- if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev)) {
5145
+ if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev) && !startsWithMarker(prev)) {
4763
5146
  result[result.length - 1] = prev + " " + curr;
4764
5147
  } else {
4765
5148
  result.push(curr);
@@ -4772,7 +5155,7 @@ function mergeKoreanLines(text) {
4772
5155
  import { readFile } from "fs/promises";
4773
5156
 
4774
5157
  // src/xlsx/parser.ts
4775
- import JSZip3 from "jszip";
5158
+ import JSZip2 from "jszip";
4776
5159
  import { DOMParser as DOMParser2 } from "@xmldom/xmldom";
4777
5160
  var MAX_SHEETS = 100;
4778
5161
  var MAX_DECOMPRESS_SIZE3 = 100 * 1024 * 1024;
@@ -4810,7 +5193,7 @@ function getTextContent(el) {
4810
5193
  return el.textContent?.trim() ?? "";
4811
5194
  }
4812
5195
  function parseXml(text) {
4813
- return new DOMParser2().parseFromString(text, "text/xml");
5196
+ return new DOMParser2().parseFromString(stripDtd(text), "text/xml");
4814
5197
  }
4815
5198
  function parseSharedStrings(xml) {
4816
5199
  const doc = parseXml(xml);
@@ -4963,7 +5346,7 @@ function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
4963
5346
  }
4964
5347
  async function parseXlsxDocument(buffer, options) {
4965
5348
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE3);
4966
- const zip = await JSZip3.loadAsync(buffer);
5349
+ const zip = await JSZip2.loadAsync(buffer);
4967
5350
  const warnings = [];
4968
5351
  const workbookFile = zip.file("xl/workbook.xml");
4969
5352
  if (!workbookFile) {
@@ -4985,7 +5368,7 @@ async function parseXlsxDocument(buffer, options) {
4985
5368
  }
4986
5369
  let pageFilter = null;
4987
5370
  if (options?.pages) {
4988
- const { parsePageRange: parsePageRange2 } = await import("./page-range-737B4EZW.js");
5371
+ const { parsePageRange: parsePageRange2 } = await import("./page-range-OF5I4PQY.js");
4989
5372
  pageFilter = parsePageRange2(options.pages, sheets.length);
4990
5373
  }
4991
5374
  const blocks = [];
@@ -5053,7 +5436,7 @@ async function parseXlsxDocument(buffer, options) {
5053
5436
  }
5054
5437
 
5055
5438
  // src/docx/parser.ts
5056
- import JSZip4 from "jszip";
5439
+ import JSZip3 from "jszip";
5057
5440
  import { DOMParser as DOMParser3 } from "@xmldom/xmldom";
5058
5441
  var MAX_DECOMPRESS_SIZE4 = 100 * 1024 * 1024;
5059
5442
  function getChildElements(parent, localName) {
@@ -5097,7 +5480,7 @@ function getAttr(el, localName) {
5097
5480
  return null;
5098
5481
  }
5099
5482
  function parseXml2(text) {
5100
- return new DOMParser3().parseFromString(text, "text/xml");
5483
+ return new DOMParser3().parseFromString(stripDtd(text), "text/xml");
5101
5484
  }
5102
5485
  function parseStyles(xml) {
5103
5486
  const doc = parseXml2(xml);
@@ -5391,7 +5774,7 @@ async function extractImages(zip, rels, doc) {
5391
5774
  }
5392
5775
  async function parseDocxDocument(buffer, options) {
5393
5776
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE4);
5394
- const zip = await JSZip4.loadAsync(buffer);
5777
+ const zip = await JSZip3.loadAsync(buffer);
5395
5778
  const warnings = [];
5396
5779
  const docFile = zip.file("word/document.xml");
5397
5780
  if (!docFile) {
@@ -5608,7 +5991,7 @@ function extractInlineFields(text) {
5608
5991
  }
5609
5992
 
5610
5993
  // src/hwpx/generator.ts
5611
- import JSZip5 from "jszip";
5994
+ import JSZip4 from "jszip";
5612
5995
 
5613
5996
  // src/index.ts
5614
5997
  async function parse(input, options) {
@@ -5703,7 +6086,13 @@ function normalize(s) {
5703
6086
  }
5704
6087
  var MAX_LEVENSHTEIN_LEN = 1e4;
5705
6088
  function levenshtein(a, b) {
5706
- if (a.length + b.length > MAX_LEVENSHTEIN_LEN) return Math.abs(a.length - b.length);
6089
+ if (a.length + b.length > MAX_LEVENSHTEIN_LEN) {
6090
+ const sampleLen = Math.min(500, a.length, b.length);
6091
+ let diffs = 0;
6092
+ for (let i = 0; i < sampleLen; i++) if (a[i] !== b[i]) diffs++;
6093
+ const sampleRate = sampleLen > 0 ? diffs / sampleLen : 1;
6094
+ return Math.abs(a.length - b.length) + Math.round(Math.min(a.length, b.length) * sampleRate);
6095
+ }
5707
6096
  if (a.length > b.length) [a, b] = [b, a];
5708
6097
  const m = a.length;
5709
6098
  const n = b.length;
@@ -5859,7 +6248,10 @@ function diffTableCells(a, b) {
5859
6248
  }
5860
6249
 
5861
6250
  export {
5862
- detectFormat,
6251
+ VERSION,
6252
+ toArrayBuffer,
6253
+ KordocError,
6254
+ sanitizeError,
5863
6255
  blocksToMarkdown,
5864
6256
  extractHwpxMetadataOnly,
5865
6257
  extractHwp5MetadataOnly,
@@ -5868,4 +6260,4 @@ export {
5868
6260
  extractFormFields,
5869
6261
  parse
5870
6262
  };
5871
- //# sourceMappingURL=chunk-GJ2S6IMC.js.map
6263
+ //# sourceMappingURL=chunk-FINXMRCH.js.map