kordoc 2.0.3 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -63,6 +63,7 @@ async function ocrPages(doc, provider, pageFilter, effectivePageCount) {
63
63
  blocks.push({ type: "paragraph", text: text.trim(), pageNumber: i });
64
64
  }
65
65
  } catch {
66
+ blocks.push({ type: "paragraph", text: `[OCR \uC2E4\uD328: \uD398\uC774\uC9C0 ${i}]` });
66
67
  }
67
68
  }
68
69
  return blocks;
@@ -138,7 +139,7 @@ import { inflateRawSync } from "zlib";
138
139
  import { DOMParser } from "@xmldom/xmldom";
139
140
 
140
141
  // src/utils.ts
141
- var VERSION = true ? "2.0.3" : "0.0.0-dev";
142
+ var VERSION = true ? "2.2.0" : "0.0.0-dev";
142
143
  function toArrayBuffer(buf) {
143
144
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
144
145
  return buf.buffer;
@@ -154,7 +155,8 @@ var KordocError = class extends Error {
154
155
  function isPathTraversal(name) {
155
156
  if (name.includes("\0")) return true;
156
157
  const normalized = name.replace(/\\/g, "/");
157
- return normalized.includes("..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
158
+ const segments = normalized.split("/");
159
+ return segments.some((s) => s === "..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
158
160
  }
159
161
  function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
160
162
  try {
@@ -194,12 +196,25 @@ function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEnt
194
196
  return { totalUncompressed: 0, entryCount: 0 };
195
197
  }
196
198
  }
199
/**
 * Remove every DOCTYPE declaration from an XML string.
 *
 * The match is case-insensitive and global, and it covers an optional
 * internal subset written between square brackets.
 *
 * @param {string} xml - Raw XML text.
 * @returns {string} The XML text with all DOCTYPE declarations deleted.
 */
function stripDtd(xml) {
  // "<!DOCTYPE", a name / external id, an optional "[ ... ]" subset, then ">".
  const doctypeDeclaration = /<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi;
  const withoutDoctype = xml.replace(doctypeDeclaration, "");
  return withoutDoctype;
}
197
202
  var SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
198
203
  function sanitizeHref(href) {
199
204
  const trimmed = href.trim();
200
205
  if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
201
206
  return trimmed;
202
207
  }
208
/**
 * Smallest element of an array, found with a plain forward scan.
 *
 * NOTE(review): presumably exists so large arrays are not spread into
 * Math.min(...arr) — confirm against callers.
 *
 * @param {number[]} arr - Values to scan.
 * @returns {number} The minimum, or Infinity when nothing compares smaller
 *   (e.g. an empty array).
 */
function safeMin(arr) {
  let smallest = Infinity;
  let idx = 0;
  while (idx < arr.length) {
    const candidate = arr[idx];
    if (candidate < smallest) {
      smallest = candidate;
    }
    idx += 1;
  }
  return smallest;
}
213
/**
 * Largest element of an array, found with a plain forward scan.
 *
 * NOTE(review): presumably exists so large arrays are not spread into
 * Math.max(...arr) — confirm against callers.
 *
 * @param {number[]} arr - Values to scan.
 * @returns {number} The maximum, or -Infinity when nothing compares larger
 *   (e.g. an empty array).
 */
function safeMax(arr) {
  let largest = -Infinity;
  let idx = 0;
  while (idx < arr.length) {
    const candidate = arr[idx];
    if (candidate > largest) {
      largest = candidate;
    }
    idx += 1;
  }
  return largest;
}
203
218
  function classifyError(err) {
204
219
  if (!(err instanceof Error)) return "PARSE_ERROR";
205
220
  const msg = err.message;
@@ -274,6 +289,7 @@ function buildTableDirect(rows, numRows) {
274
289
  if (end > maxCols) maxCols = end;
275
290
  }
276
291
  }
292
+ if (maxCols > MAX_COLS) maxCols = MAX_COLS;
277
293
  if (maxCols === 0) return { rows: 0, cols: 0, cells: [], hasHeader: false };
278
294
  const grid = Array.from(
279
295
  { length: numRows },
@@ -283,7 +299,7 @@ function buildTableDirect(rows, numRows) {
283
299
  for (const cell of row) {
284
300
  const r = cell.rowAddr ?? 0;
285
301
  const c = cell.colAddr ?? 0;
286
- if (r >= numRows || c >= maxCols) continue;
302
+ if (r >= numRows || c >= maxCols || r < 0 || c < 0) continue;
287
303
  grid[r][c] = { text: cell.text.trim(), colSpan: cell.colSpan, rowSpan: cell.rowSpan };
288
304
  for (let dr = 0; dr < cell.rowSpan; dr++) {
289
305
  for (let dc = 0; dc < cell.colSpan; dc++) {
@@ -563,7 +579,12 @@ function parseCharProperties(doc, map) {
563
579
  if (!id) continue;
564
580
  const prop = {};
565
581
  const height = el.getAttribute("height");
566
- if (height) prop.fontSize = parseInt(height, 10) / 100;
582
+ if (height) {
583
+ const parsedHeight = parseInt(height, 10);
584
+ if (!isNaN(parsedHeight) && parsedHeight > 0) {
585
+ prop.fontSize = parsedHeight / 100;
586
+ }
587
+ }
567
588
  const bold = el.getAttribute("bold");
568
589
  if (bold === "true" || bold === "1") prop.bold = true;
569
590
  const italic = el.getAttribute("italic");
@@ -598,9 +619,6 @@ function parseStyleElements(doc, map) {
598
619
  }
599
620
  }
600
621
  }
601
- function stripDtd(xml) {
602
- return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
603
- }
604
622
  async function parseHwpxDocument(buffer, options) {
605
623
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
606
624
  let zip;
@@ -703,7 +721,7 @@ async function extractImagesFromZip(zip, blocks, decompressed, warnings) {
703
721
  const data = await file.async("uint8array");
704
722
  decompressed.total += data.length;
705
723
  if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
706
- const ext = ref.includes(".") ? ref.split(".").pop() : "png";
724
+ const ext = ref.includes(".") ? ref.split(".").pop() || "png" : "png";
707
725
  const mimeType = imageExtToMime(ext);
708
726
  imageIndex++;
709
727
  const filename = `image_${String(imageIndex).padStart(3, "0")}.${mimeToExt(mimeType)}`;
@@ -950,7 +968,8 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
950
968
  if (newTable.rows.length > 0) {
951
969
  if (tableStack.length > 0) {
952
970
  const parentTable = tableStack.pop();
953
- const nestedCols = Math.max(...newTable.rows.map((r) => r.length));
971
+ let nestedCols = 0;
972
+ for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
954
973
  if (newTable.rows.length >= 3 && nestedCols >= 2) {
955
974
  blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
956
975
  } else {
@@ -997,8 +1016,10 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
997
1016
  break;
998
1017
  case "cellSpan":
999
1018
  if (tableCtx?.cell) {
1000
- const cs = parseInt(el.getAttribute("colSpan") || "1", 10);
1001
- const rs = parseInt(el.getAttribute("rowSpan") || "1", 10);
1019
+ const rawCs = parseInt(el.getAttribute("colSpan") || "1", 10);
1020
+ const cs = isNaN(rawCs) ? 1 : rawCs;
1021
+ const rawRs = parseInt(el.getAttribute("rowSpan") || "1", 10);
1022
+ const rs = isNaN(rawRs) ? 1 : rawRs;
1002
1023
  tableCtx.cell.colSpan = clampSpan(cs, MAX_COLS);
1003
1024
  tableCtx.cell.rowSpan = clampSpan(rs, MAX_ROWS);
1004
1025
  }
@@ -1057,7 +1078,8 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
1057
1078
  if (newTable.rows.length > 0) {
1058
1079
  if (tableStack.length > 0) {
1059
1080
  const parentTable = tableStack.pop();
1060
- const nestedCols = Math.max(...newTable.rows.map((r) => r.length));
1081
+ let nestedCols = 0;
1082
+ for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
1061
1083
  if (newTable.rows.length >= 3 && nestedCols >= 2) {
1062
1084
  blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
1063
1085
  } else {
@@ -1090,6 +1112,8 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
1090
1112
  extractDrawTextBlocks(el, blocks, styleMap, sectionNum);
1091
1113
  } else if (localTag === "r" || localTag === "run" || localTag === "ctrl" || localTag === "rect" || localTag === "ellipse" || localTag === "polygon" || localTag === "line" || localTag === "arc" || localTag === "curve" || localTag === "connectLine" || localTag === "container") {
1092
1114
  walkChildren(el, d + 1);
1115
+ } else if (localTag === "run") {
1116
+ tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum, depth + 1);
1093
1117
  }
1094
1118
  }
1095
1119
  };
@@ -2153,6 +2177,7 @@ function parseLenientCfb(data) {
2153
2177
  if (miniSectorSizeShift > 16) throw new Error("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uBBF8\uB2C8 \uC139\uD130 \uD06C\uAE30 \uC2DC\uD504\uD2B8: " + miniSectorSizeShift);
2154
2178
  const miniSectorSize = 1 << miniSectorSizeShift;
2155
2179
  const fatSectorCount = data.readUInt32LE(44);
2180
+ if (fatSectorCount > 1e4) throw new Error("FAT \uC139\uD130 \uC218\uAC00 \uB108\uBB34 \uB9CE\uC2B5\uB2C8\uB2E4: " + fatSectorCount);
2156
2181
  const firstDirSector = data.readUInt32LE(48);
2157
2182
  const miniStreamCutoff = data.readUInt32LE(56);
2158
2183
  const firstMiniFatSector = data.readUInt32LE(60);
@@ -2528,10 +2553,14 @@ function findSections(cfb) {
2528
2553
  }
2529
2554
  function findSectionsLenient(lcfb, compressed) {
2530
2555
  const sections = [];
2556
+ let totalDecompressed = 0;
2531
2557
  for (let i = 0; i < MAX_SECTIONS; i++) {
2532
2558
  const raw = lcfb.findStream(`/BodyText/Section${i}`) ?? lcfb.findStream(`Section${i}`);
2533
2559
  if (!raw) break;
2534
- sections.push({ idx: i, content: compressed ? decompressStream(raw) : raw });
2560
+ const content = compressed ? decompressStream(raw) : raw;
2561
+ totalDecompressed += content.length;
2562
+ if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
2563
+ sections.push({ idx: i, content });
2535
2564
  }
2536
2565
  if (sections.length === 0) {
2537
2566
  for (const e of lcfb.entries()) {
@@ -2539,7 +2568,12 @@ function findSectionsLenient(lcfb, compressed) {
2539
2568
  if (e.name.startsWith("Section")) {
2540
2569
  const idx = parseInt(e.name.replace("Section", ""), 10) || 0;
2541
2570
  const raw = lcfb.findStream(e.name);
2542
- if (raw) sections.push({ idx, content: compressed ? decompressStream(raw) : raw });
2571
+ if (raw) {
2572
+ const content = compressed ? decompressStream(raw) : raw;
2573
+ totalDecompressed += content.length;
2574
+ if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
2575
+ sections.push({ idx, content });
2576
+ }
2543
2577
  }
2544
2578
  }
2545
2579
  }
@@ -2547,11 +2581,15 @@ function findSectionsLenient(lcfb, compressed) {
2547
2581
  }
2548
2582
  function findViewTextSectionsLenient(lcfb, compressed) {
2549
2583
  const sections = [];
2584
+ let totalDecompressed = 0;
2550
2585
  for (let i = 0; i < MAX_SECTIONS; i++) {
2551
2586
  const raw = lcfb.findStream(`/ViewText/Section${i}`) ?? lcfb.findStream(`Section${i}`);
2552
2587
  if (!raw) break;
2553
2588
  try {
2554
- sections.push({ idx: i, content: decryptViewText(raw, compressed) });
2589
+ const content = decryptViewText(raw, compressed);
2590
+ totalDecompressed += content.length;
2591
+ if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
2592
+ sections.push({ idx: i, content });
2555
2593
  } catch {
2556
2594
  break;
2557
2595
  }
@@ -2953,10 +2991,14 @@ init_page_range();
2953
2991
  // src/pdf/line-detector.ts
2954
2992
  import { OPS } from "pdfjs-dist/legacy/build/pdf.mjs";
2955
2993
  var ORIENTATION_TOL = 2;
2956
- var MIN_LINE_LENGTH = 10;
2957
- var COORD_MERGE_TOL = 3;
2994
+ var MIN_LINE_LENGTH = 15;
2995
+ var MAX_LINE_WIDTH = 5;
2958
2996
  var CONNECT_TOL = 5;
2959
2997
  var CELL_PADDING = 2;
2998
+ var MIN_COL_WIDTH = 15;
2999
+ var MIN_ROW_HEIGHT = 6;
3000
+ var VERTEX_MERGE_FACTOR = 4;
3001
+ var MIN_COORD_MERGE_TOL = 8;
2960
3002
  function extractLines(fnArray, argsArray) {
2961
3003
  const horizontals = [];
2962
3004
  const verticals = [];
@@ -3108,6 +3150,55 @@ function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
3108
3150
  verticals.push({ x1: x, y1, x2: x, y2, lineWidth });
3109
3151
  }
3110
3152
  }
3153
/**
 * Clean up detected line segments before grid construction.
 *
 * Lines wider than MAX_LINE_WIDTH are discarded, then the survivors of each
 * orientation are passed through mergeParallelLines.
 *
 * @param {Array<{lineWidth: number}>} horizontals - Horizontal line segments.
 * @param {Array<{lineWidth: number}>} verticals - Vertical line segments.
 * @returns {{horizontals: Array, verticals: Array}} Filtered and merged lines.
 */
function preprocessLines(horizontals, verticals) {
  const isThinEnough = (line) => line.lineWidth <= MAX_LINE_WIDTH;
  const thinHorizontals = horizontals.filter(isThinEnough);
  const thinVerticals = verticals.filter(isThinEnough);
  return {
    horizontals: mergeParallelLines(thinHorizontals, "h"),
    verticals: mergeParallelLines(thinVerticals, "v")
  };
}
3160
/**
 * Collapse near-duplicate parallel line segments into single segments.
 *
 * Lines are ordered by their fixed coordinate (y for "h", x for "v"), then by
 * their start along the running axis. A line is folded into the previously
 * kept line when the fixed coordinates differ by at most 3 units and the two
 * extents overlap by more than 30% of the shorter one. The merged line spans
 * both extents, sits at the mean of the two fixed coordinates and keeps the
 * larger lineWidth.
 *
 * The input line objects are never modified: merging happens on copies, so a
 * caller that still holds the original segments sees them unchanged.
 *
 * @param {Array<{x1: number, y1: number, x2: number, y2: number, lineWidth: number}>} lines
 *   Segments of a single orientation.
 * @param {"h"|"v"} dir - "h" for horizontal lines; any other value is treated as vertical.
 * @returns {Array} Merged segments; the input array itself when it has at most one line.
 */
function mergeParallelLines(lines, dir) {
  if (lines.length <= 1) return lines;
  const isHorizontal = dir === "h";
  // Fixed coordinate (perpendicular to the line) and extent along the line.
  const posOf = (l) => (isHorizontal ? l.y1 : l.x1);
  const startOf = (l) => (isHorizontal ? l.x1 : l.y1);
  const endOf = (l) => (isHorizontal ? l.x2 : l.y2);
  const sorted = [...lines].sort((a, b) => {
    const posA = posOf(a);
    const posB = posOf(b);
    if (Math.abs(posA - posB) > 0.1) return posA - posB;
    return startOf(a) - startOf(b);
  });
  const MERGE_TOL = 3;
  const MIN_OVERLAP_RATIO = 0.3;
  // Copy before accumulating so the caller's objects are left untouched.
  const result = [{ ...sorted[0] }];
  for (let i = 1; i < sorted.length; i++) {
    const prev = result[result.length - 1];
    const curr = sorted[i];
    if (Math.abs(posOf(prev) - posOf(curr)) <= MERGE_TOL) {
      const overlap = Math.min(endOf(prev), endOf(curr)) - Math.max(startOf(prev), startOf(curr));
      const minLen = Math.min(endOf(prev) - startOf(prev), endOf(curr) - startOf(curr));
      if (overlap > minLen * MIN_OVERLAP_RATIO) {
        if (isHorizontal) {
          prev.x1 = Math.min(prev.x1, curr.x1);
          prev.x2 = Math.max(prev.x2, curr.x2);
          prev.y1 = (prev.y1 + curr.y1) / 2;
          prev.y2 = prev.y1;
        } else {
          prev.y1 = Math.min(prev.y1, curr.y1);
          prev.y2 = Math.max(prev.y2, curr.y2);
          prev.x1 = (prev.x1 + curr.x1) / 2;
          prev.x2 = prev.x1;
        }
        prev.lineWidth = Math.max(prev.lineWidth, curr.lineWidth);
        continue;
      }
    }
    result.push({ ...curr });
  }
  return result;
}
3111
3202
  function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
3112
3203
  const margin = 5;
3113
3204
  return {
@@ -3119,8 +3210,49 @@ function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
3119
3210
  )
3120
3211
  };
3121
3212
  }
3213
/**
 * Collect the crossing points between horizontal and vertical lines.
 *
 * A pair counts as crossing when the vertical's x lies within the horizontal's
 * x-extent and the horizontal's y lies within the vertical's y-extent, each
 * widened by CONNECT_TOL on both sides.
 *
 * @param {Array<{x1: number, x2: number, y1: number, lineWidth: number}>} horizontals
 * @param {Array<{x1: number, y1: number, y2: number, lineWidth: number}>} verticals
 * @returns {Array<{x: number, y: number, radius: number}>} One vertex per crossing
 *   pair; radius is the larger of the two line widths, never below 1.
 */
function buildVertices(horizontals, verticals) {
  const reach = CONNECT_TOL;
  const crosses = (h, v) =>
    v.x1 >= h.x1 - reach &&
    v.x1 <= h.x2 + reach &&
    h.y1 >= v.y1 - reach &&
    h.y1 <= v.y2 + reach;
  return horizontals.flatMap((h) =>
    verticals
      .filter((v) => crosses(h, v))
      .map((v) => ({ x: v.x1, y: h.y1, radius: Math.max(h.lineWidth, v.lineWidth, 1) }))
  );
}
3226
/**
 * Greedily cluster nearby vertices and replace each cluster by its centroid.
 *
 * Every not-yet-absorbed vertex seeds a cluster. Later vertices join it when
 * both their x and y distance to the seed are within
 * VERTEX_MERGE_FACTOR * max(cluster radius so far, candidate radius).
 *
 * @param {Array<{x: number, y: number, radius: number}>} vertices
 * @returns {Array<{x: number, y: number, radius: number}>} Cluster centroids with
 *   the largest member radius; the input array itself when it has at most one vertex.
 */
function mergeVertices(vertices) {
  if (vertices.length <= 1) return vertices;
  const clusters = [];
  const absorbed = new Set();
  vertices.forEach((seed, i) => {
    if (absorbed.has(i)) return;
    let totalX = seed.x;
    let totalY = seed.y;
    let clusterRadius = seed.radius;
    let members = 1;
    for (let j = i + 1; j < vertices.length; j++) {
      if (absorbed.has(j)) continue;
      const other = vertices[j];
      // Tolerance grows with the thickest line seen in this cluster so far.
      const reach = VERTEX_MERGE_FACTOR * Math.max(clusterRadius, other.radius);
      const isNear = Math.abs(seed.x - other.x) <= reach && Math.abs(seed.y - other.y) <= reach;
      if (!isNear) continue;
      totalX += other.x;
      totalY += other.y;
      clusterRadius = Math.max(clusterRadius, other.radius);
      members++;
      absorbed.add(j);
    }
    clusters.push({ x: totalX / members, y: totalY / members, radius: clusterRadius });
  });
  return clusters;
}
3122
3250
  function buildTableGrids(horizontals, verticals) {
3123
3251
  if (horizontals.length < 2 || verticals.length < 2) return [];
3252
+ const allVertices = buildVertices(horizontals, verticals);
3253
+ const vertices = mergeVertices(allVertices);
3254
+ if (vertices.length < 4) return [];
3255
+ const globalRadius = vertices.reduce((max, v) => Math.max(max, v.radius), 1);
3124
3256
  const allLines = [
3125
3257
  ...horizontals.map((l, i) => ({ ...l, type: "h", id: i })),
3126
3258
  ...verticals.map((l, i) => ({ ...l, type: "v", id: i + horizontals.length }))
@@ -3131,21 +3263,74 @@ function buildTableGrids(horizontals, verticals) {
3131
3263
  const hLines = group.filter((l) => l.type === "h");
3132
3264
  const vLines = group.filter((l) => l.type === "v");
3133
3265
  if (hLines.length < 2 || vLines.length < 2) continue;
3134
- const rawYs = hLines.map((l) => l.y1);
3135
- const rowYs = clusterCoordinates(rawYs).sort((a, b) => b - a);
3136
- const rawXs = vLines.map((l) => l.x1);
3137
- const colXs = clusterCoordinates(rawXs).sort((a, b) => a - b);
3266
+ let gx1 = Infinity, gy1 = Infinity, gx2 = -Infinity, gy2 = -Infinity;
3267
+ for (const l of vLines) {
3268
+ if (l.x1 < gx1) gx1 = l.x1;
3269
+ if (l.x1 > gx2) gx2 = l.x1;
3270
+ }
3271
+ for (const l of hLines) {
3272
+ if (l.y1 < gy1) gy1 = l.y1;
3273
+ if (l.y1 > gy2) gy2 = l.y1;
3274
+ }
3275
+ const groupBbox = {
3276
+ x1: gx1 - CONNECT_TOL,
3277
+ y1: gy1 - CONNECT_TOL,
3278
+ x2: gx2 + CONNECT_TOL,
3279
+ y2: gy2 + CONNECT_TOL
3280
+ };
3281
+ const groupVertices = vertices.filter(
3282
+ (v) => v.x >= groupBbox.x1 && v.x <= groupBbox.x2 && v.y >= groupBbox.y1 && v.y <= groupBbox.y2
3283
+ );
3284
+ const groupRadius = groupVertices.length > 0 ? groupVertices.reduce((max, v) => Math.max(max, v.radius), 1) : globalRadius;
3285
+ const coordMergeTol = Math.max(VERTEX_MERGE_FACTOR * groupRadius, MIN_COORD_MERGE_TOL);
3286
+ const rawYs = [
3287
+ ...hLines.map((l) => l.y1),
3288
+ ...groupVertices.map((v) => v.y)
3289
+ ];
3290
+ const rowYs = clusterCoordinates(rawYs, coordMergeTol).sort((a, b) => b - a);
3291
+ const rawXs = [
3292
+ ...vLines.map((l) => l.x1),
3293
+ ...groupVertices.map((v) => v.x)
3294
+ ];
3295
+ const colXs = clusterCoordinates(rawXs, coordMergeTol).sort((a, b) => a - b);
3138
3296
  if (rowYs.length < 2 || colXs.length < 2) continue;
3297
+ const validColXs = enforceMinWidth(colXs, MIN_COL_WIDTH);
3298
+ const validRowYs = enforceMinHeight(rowYs, MIN_ROW_HEIGHT);
3299
+ if (validRowYs.length < 2 || validColXs.length < 2) continue;
3139
3300
  const bbox = {
3140
- x1: colXs[0],
3141
- y1: rowYs[rowYs.length - 1],
3142
- x2: colXs[colXs.length - 1],
3143
- y2: rowYs[0]
3301
+ x1: validColXs[0],
3302
+ y1: validRowYs[validRowYs.length - 1],
3303
+ x2: validColXs[validColXs.length - 1],
3304
+ y2: validRowYs[0]
3144
3305
  };
3145
- grids.push({ rowYs, colXs, bbox });
3306
+ grids.push({ rowYs: validRowYs, colXs: validColXs, bbox, vertexRadius: groupRadius });
3146
3307
  }
3147
3308
  return mergeAdjacentGrids(grids);
3148
3309
  }
3310
/**
 * Drop column boundaries that would create columns narrower than minWidth.
 *
 * The first and last boundaries (the table's outer edges) are always kept.
 * An interior boundary is skipped when it lies closer than minWidth to the
 * previously kept boundary. If the last boundary would itself end a sliver
 * column, the interior boundary before it is removed instead, so a narrow
 * strip at the right edge is handled the same way as one at the left edge.
 *
 * @param {number[]} colXs - Column boundary x-coordinates in ascending order.
 * @param {number} minWidth - Minimum allowed distance between kept boundaries.
 * @returns {number[]} Filtered boundaries; the input array itself when it has
 *   two or fewer entries.
 */
function enforceMinWidth(colXs, minWidth) {
  if (colXs.length <= 2) return colXs;
  const lastIdx = colXs.length - 1;
  const result = [colXs[0]];
  for (let i = 1; i < lastIdx; i++) {
    const prevX = result[result.length - 1];
    if (colXs[i] - prevX < minWidth) {
      continue;
    }
    result.push(colXs[i]);
  }
  const lastX = colXs[lastIdx];
  // Keep the outer edge; sacrifice the interior boundary that makes a sliver.
  if (result.length > 1 && lastX - result[result.length - 1] < minWidth) {
    result.pop();
  }
  result.push(lastX);
  return result;
}
3322
/**
 * Drop row boundaries that would create rows shorter than minHeight.
 *
 * The first and last boundaries (the table's outer edges) are always kept.
 * An interior boundary is skipped when it lies closer than minHeight to the
 * previously kept boundary. If the last boundary would itself end a sliver
 * row, the interior boundary before it is removed instead, so a thin strip at
 * the final edge is handled the same way as one at the first edge.
 *
 * @param {number[]} rowYs - Row boundary y-coordinates in descending order.
 * @param {number} minHeight - Minimum allowed distance between kept boundaries.
 * @returns {number[]} Filtered boundaries; the input array itself when it has
 *   two or fewer entries.
 */
function enforceMinHeight(rowYs, minHeight) {
  if (rowYs.length <= 2) return rowYs;
  const lastIdx = rowYs.length - 1;
  const result = [rowYs[0]];
  for (let i = 1; i < lastIdx; i++) {
    const prevY = result[result.length - 1];
    if (prevY - rowYs[i] < minHeight) {
      continue;
    }
    result.push(rowYs[i]);
  }
  const lastY = rowYs[lastIdx];
  // Keep the outer edge; sacrifice the interior boundary that makes a sliver.
  if (result.length > 1 && result[result.length - 1] - lastY < minHeight) {
    result.pop();
  }
  result.push(lastY);
  return result;
}
3149
3334
  function mergeAdjacentGrids(grids) {
3150
3335
  if (grids.length <= 1) return grids;
3151
3336
  const sorted = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
@@ -3154,9 +3339,10 @@ function mergeAdjacentGrids(grids) {
3154
3339
  const prev = merged[merged.length - 1];
3155
3340
  const curr = sorted[i];
3156
3341
  if (prev.colXs.length === curr.colXs.length) {
3157
- const colMatch = prev.colXs.every((x, ci) => Math.abs(x - curr.colXs[ci]) <= COORD_MERGE_TOL * 3);
3342
+ const mergeTol = Math.max(VERTEX_MERGE_FACTOR * Math.max(prev.vertexRadius, curr.vertexRadius), 6) * 3;
3343
+ const colMatch = prev.colXs.every((x, ci) => Math.abs(x - curr.colXs[ci]) <= mergeTol);
3158
3344
  const verticalGap = prev.bbox.y1 - curr.bbox.y2;
3159
- if (colMatch && verticalGap >= -COORD_MERGE_TOL && verticalGap <= 20) {
3345
+ if (colMatch && verticalGap >= -CONNECT_TOL && verticalGap <= 20) {
3160
3346
  const allRowYs = [.../* @__PURE__ */ new Set([...prev.rowYs, ...curr.rowYs])].sort((a, b) => b - a);
3161
3347
  merged[merged.length - 1] = {
3162
3348
  rowYs: allRowYs,
@@ -3166,7 +3352,8 @@ function mergeAdjacentGrids(grids) {
3166
3352
  y1: Math.min(prev.bbox.y1, curr.bbox.y1),
3167
3353
  x2: Math.max(prev.bbox.x2, curr.bbox.x2),
3168
3354
  y2: Math.max(prev.bbox.y2, curr.bbox.y2)
3169
- }
3355
+ },
3356
+ vertexRadius: Math.max(prev.vertexRadius, curr.vertexRadius)
3170
3357
  };
3171
3358
  continue;
3172
3359
  }
@@ -3175,14 +3362,14 @@ function mergeAdjacentGrids(grids) {
3175
3362
  }
3176
3363
  return merged;
3177
3364
  }
3178
- function clusterCoordinates(values) {
3365
+ function clusterCoordinates(values, tolerance) {
3179
3366
  if (values.length === 0) return [];
3180
3367
  const sorted = [...values].sort((a, b) => a - b);
3181
3368
  const clusters = [{ sum: sorted[0], count: 1 }];
3182
3369
  for (let i = 1; i < sorted.length; i++) {
3183
3370
  const last = clusters[clusters.length - 1];
3184
3371
  const avg = last.sum / last.count;
3185
- if (Math.abs(sorted[i] - avg) <= COORD_MERGE_TOL) {
3372
+ if (Math.abs(sorted[i] - avg) <= tolerance) {
3186
3373
  last.sum += sorted[i];
3187
3374
  last.count++;
3188
3375
  } else {
@@ -3239,6 +3426,20 @@ function extractCells(grid, horizontals, verticals) {
3239
3426
  const numRows = rowYs.length - 1;
3240
3427
  const numCols = colXs.length - 1;
3241
3428
  if (numRows <= 0 || numCols <= 0) return [];
3429
+ const vBorders = Array.from(
3430
+ { length: numRows },
3431
+ (_, r) => Array.from(
3432
+ { length: numCols + 1 },
3433
+ (_2, c) => hasVerticalLine(verticals, colXs[c], rowYs[r], rowYs[r + 1], grid.vertexRadius)
3434
+ )
3435
+ );
3436
+ const hBorders = Array.from(
3437
+ { length: numRows + 1 },
3438
+ (_, r) => Array.from(
3439
+ { length: numCols },
3440
+ (_2, c) => hasHorizontalLine(horizontals, rowYs[r], colXs[c], colXs[c + 1], grid.vertexRadius)
3441
+ )
3442
+ );
3242
3443
  const occupied = Array.from({ length: numRows }, () => Array(numCols).fill(false));
3243
3444
  const cells = [];
3244
3445
  for (let r = 0; r < numRows; r++) {
@@ -3246,18 +3447,26 @@ function extractCells(grid, horizontals, verticals) {
3246
3447
  if (occupied[r][c]) continue;
3247
3448
  let colSpan = 1;
3248
3449
  let rowSpan = 1;
3249
- while (c + colSpan < numCols) {
3250
- const borderX = colXs[c + colSpan];
3251
- const topY = rowYs[r];
3252
- const botY = rowYs[r + 1];
3253
- if (hasVerticalLine(verticals, borderX, topY, botY)) break;
3450
+ while (c + colSpan < numCols && !vBorders[r][c + colSpan]) {
3451
+ let canExpand = true;
3452
+ for (let dr = 0; dr < rowSpan; dr++) {
3453
+ if (vBorders[r + dr][c + colSpan]) {
3454
+ canExpand = false;
3455
+ break;
3456
+ }
3457
+ }
3458
+ if (!canExpand) break;
3254
3459
  colSpan++;
3255
3460
  }
3256
3461
  while (r + rowSpan < numRows) {
3257
- const borderY = rowYs[r + rowSpan];
3258
- const leftX = colXs[c];
3259
- const rightX = colXs[c + colSpan];
3260
- if (hasHorizontalLine(horizontals, borderY, leftX, rightX)) break;
3462
+ let hasLine = false;
3463
+ for (let dc = 0; dc < colSpan; dc++) {
3464
+ if (hBorders[r + rowSpan][c + dc]) {
3465
+ hasLine = true;
3466
+ break;
3467
+ }
3468
+ }
3469
+ if (hasLine) break;
3261
3470
  rowSpan++;
3262
3471
  }
3263
3472
  for (let dr = 0; dr < rowSpan; dr++) {
@@ -3281,28 +3490,30 @@ function extractCells(grid, horizontals, verticals) {
3281
3490
  }
3282
3491
  return cells;
3283
3492
  }
3284
- function hasVerticalLine(verticals, x, topY, botY) {
3285
- const tol = COORD_MERGE_TOL + 1;
3493
+ function hasVerticalLine(verticals, x, topY, botY, vertexRadius) {
3494
+ const tol = Math.max(VERTEX_MERGE_FACTOR * vertexRadius, 4);
3286
3495
  for (const v of verticals) {
3287
3496
  if (Math.abs(v.x1 - x) <= tol) {
3288
3497
  const cellH = Math.abs(topY - botY);
3498
+ if (cellH < 0.1) continue;
3289
3499
  const overlapTop = Math.min(v.y2, topY);
3290
3500
  const overlapBot = Math.max(v.y1, botY);
3291
3501
  const overlap = overlapTop - overlapBot;
3292
- if (overlap >= cellH * 0.5) return true;
3502
+ if (overlap >= cellH * 0.75) return true;
3293
3503
  }
3294
3504
  }
3295
3505
  return false;
3296
3506
  }
3297
- function hasHorizontalLine(horizontals, y, leftX, rightX) {
3298
- const tol = COORD_MERGE_TOL + 1;
3507
+ function hasHorizontalLine(horizontals, y, leftX, rightX, vertexRadius) {
3508
+ const tol = Math.max(VERTEX_MERGE_FACTOR * vertexRadius, 4);
3299
3509
  for (const h of horizontals) {
3300
3510
  if (Math.abs(h.y1 - y) <= tol) {
3301
3511
  const cellW = Math.abs(rightX - leftX);
3512
+ if (cellW < 0.1) continue;
3302
3513
  const overlapLeft = Math.max(h.x1, leftX);
3303
3514
  const overlapRight = Math.min(h.x2, rightX);
3304
3515
  const overlap = overlapRight - overlapLeft;
3305
- if (overlap >= cellW * 0.5) return true;
3516
+ if (overlap >= cellW * 0.75) return true;
3306
3517
  }
3307
3518
  }
3308
3519
  return false;
@@ -3313,23 +3524,24 @@ function mapTextToCells(items, cells) {
3313
3524
  result.set(cell, []);
3314
3525
  }
3315
3526
  for (const item of items) {
3316
- const cx = item.x + item.w / 2;
3317
- const cy = item.y;
3318
3527
  const pad = CELL_PADDING;
3319
3528
  let bestCell = null;
3320
- let bestDist = Infinity;
3529
+ let bestScore = 0;
3321
3530
  for (const cell of cells) {
3322
- if (cx >= cell.bbox.x1 - pad && cx <= cell.bbox.x2 + pad && cy >= cell.bbox.y1 - pad && cy <= cell.bbox.y2 + pad) {
3323
- const cellCx = (cell.bbox.x1 + cell.bbox.x2) / 2;
3324
- const cellCy = (cell.bbox.y1 + cell.bbox.y2) / 2;
3325
- const dist = Math.abs(cx - cellCx) + Math.abs(cy - cellCy);
3326
- if (dist < bestDist) {
3327
- bestDist = dist;
3328
- bestCell = cell;
3329
- }
3531
+ const ix1 = Math.max(item.x, cell.bbox.x1 - pad);
3532
+ const ix2 = Math.min(item.x + item.w, cell.bbox.x2 + pad);
3533
+ const iy1 = Math.max(item.y, cell.bbox.y1 - pad);
3534
+ const iy2 = Math.min(item.y + (item.h || item.fontSize), cell.bbox.y2 + pad);
3535
+ if (ix1 >= ix2 || iy1 >= iy2) continue;
3536
+ const intersectArea = (ix2 - ix1) * (iy2 - iy1);
3537
+ const itemArea = Math.max(item.w, 1) * Math.max(item.h || item.fontSize, 1);
3538
+ const score = intersectArea / itemArea;
3539
+ if (score > bestScore) {
3540
+ bestScore = score;
3541
+ bestCell = cell;
3330
3542
  }
3331
3543
  }
3332
- if (bestCell) {
3544
+ if (bestCell && bestScore > 0.3) {
3333
3545
  result.get(bestCell).push(item);
3334
3546
  }
3335
3547
  }
@@ -3356,8 +3568,13 @@ function cellTextToString(items) {
3356
3568
  const textLines = lines.map((line) => {
3357
3569
  const s = line.sort((a, b) => a.x - b.x);
3358
3570
  if (s.length === 1) return s[0].text;
3571
+ const evenSpaced = detectEvenSpacedItems(s);
3359
3572
  let result = s[0].text;
3360
3573
  for (let j = 1; j < s.length; j++) {
3574
+ if (evenSpaced[j]) {
3575
+ result += s[j].text;
3576
+ continue;
3577
+ }
3361
3578
  const gap = s[j].x - (s[j - 1].x + s[j - 1].w);
3362
3579
  const avgFs = (s[j].fontSize + s[j - 1].fontSize) / 2;
3363
3580
  const prevIsKorean = /[가-힣]$/.test(result);
@@ -3372,6 +3589,57 @@ function cellTextToString(items) {
3372
3589
  }
3373
3590
  return result;
3374
3591
  });
3592
+ return mergeCellTextLines(textLines);
3593
+ }
3594
/**
 * Flag items that belong to a run of single-character items, so the caller
 * can treat them as evenly spaced text.
 *
 * A run is a sequence of consecutive items whose text is exactly one Hangul
 * syllable or one digit. A run is cut when the horizontal gap to the previous
 * item exceeds max(3 * fontSize, 30), or when an item is not a single such
 * character. Runs of three or more items are handed to markEvenRun, which
 * decides whether to set the flags.
 *
 * @param {Array<{text: string, x: number, w: number, fontSize: number}>} items
 *   Items of one text line, expected in left-to-right order.
 * @returns {boolean[]} One flag per item, as filled in by markEvenRun.
 */
function detectEvenSpacedItems(items) {
  const flags = new Array(items.length).fill(false);
  if (items.length < 3) return flags;
  const MIN_RUN_LENGTH = 3;
  let runStart = -1;
  // Hand the currently open run [runStart, endExclusive) over if it is long enough.
  const flushRun = (endExclusive) => {
    if (runStart >= 0 && endExclusive - runStart >= MIN_RUN_LENGTH) {
      markEvenRun(items, flags, runStart, endExclusive);
    }
  };
  for (const [i, item] of items.entries()) {
    const isSingleGlyph = /^[가-힣]{1}$/.test(item.text) || /^[\d]{1}$/.test(item.text);
    if (!isSingleGlyph) {
      flushRun(i);
      runStart = -1;
      continue;
    }
    if (runStart < 0) {
      runStart = i;
      continue;
    }
    // Inside a run: a very wide gap closes it and opens a new one at this item.
    const before = items[i - 1];
    const gap = item.x - (before.x + before.w);
    const maxRunGap = Math.max(item.fontSize * 3, 30);
    if (gap > maxRunGap) {
      flushRun(i);
      runStart = i;
    }
  }
  flushRun(items.length);
  return flags;
}
3623
/**
 * Set the flags for items[start+1 .. end-1] when the run is evenly spaced.
 *
 * Only positive gaps between neighbouring items are evaluated, and at least
 * two of them are required. With fs = items[start].fontSize the run qualifies
 * when the smallest gap is >= 0.1 * fs, the largest gap is <= 3 * fs, and
 * largest / max(smallest, 0.1) is <= 3.
 *
 * @param {Array<{x: number, w: number, fontSize: number}>} items - Line items.
 * @param {boolean[]} result - Flag array, modified in place.
 * @param {number} start - Index of the first item of the run.
 * @param {number} end - Index one past the last item of the run.
 * @returns {void}
 */
function markEvenRun(items, result, start, end) {
  const positiveGaps = [];
  for (let i = start + 1; i < end; i++) {
    const before = items[i - 1];
    const gap = items[i].x - (before.x + before.w);
    if (gap > 0) positiveGaps.push(gap);
  }
  if (positiveGaps.length < 2) return;
  const narrowest = positiveGaps.reduce((lo, g) => (g < lo ? g : lo), Infinity);
  const widest = positiveGaps.reduce((hi, g) => (g > hi ? g : hi), -Infinity);
  // Font size of the first item of the run is the reference for all thresholds.
  const refFontSize = items[start].fontSize;
  const isEvenlySpaced =
    narrowest >= refFontSize * 0.1 &&
    widest <= refFontSize * 3 &&
    widest / Math.max(narrowest, 0.1) <= 3;
  if (!isEvenlySpaced) return;
  for (let i = start + 1; i < end; i++) {
    result[i] = true;
  }
}
3642
+ function mergeCellTextLines(textLines) {
3375
3643
  if (textLines.length <= 1) return textLines[0] || "";
3376
3644
  const merged = [textLines[0]];
3377
3645
  for (let i = 1; i < textLines.length; i++) {
@@ -3397,24 +3665,172 @@ var Y_TOL = 3;
3397
3665
  var COL_CLUSTER_TOL = 15;
3398
3666
  var MIN_ROWS = 3;
3399
3667
  var MIN_COLS = 2;
3400
- var MIN_GAP_FACTOR = 1.5;
3401
- var MIN_COL_FILL_RATIO = 0.3;
3668
+ var MIN_GAP_FACTOR = 2;
3669
+ var MIN_GAP_ABSOLUTE = 20;
3670
+ var MIN_COL_FILL_RATIO = 0.4;
3402
3671
  function detectClusterTables(items, pageNum) {
3403
3672
  if (items.length < MIN_ROWS * MIN_COLS) return [];
3404
- const rows = groupByBaseline(items);
3673
+ const { merged, originMap } = mergeEvenSpacedClusters(items);
3674
+ const rows = groupByBaseline(merged);
3405
3675
  if (rows.length < MIN_ROWS) return [];
3406
- const suspiciousRows = rows.filter((row) => hasSuspiciousGaps(row));
3407
- if (suspiciousRows.length < MIN_ROWS) return [];
3408
- const columns = extractColumnClusters(suspiciousRows);
3409
- if (columns.length < MIN_COLS) return [];
3410
- const tableRegions = findTableRegions(rows, columns);
3411
3676
  const results = [];
3412
- for (const region of tableRegions) {
3413
- const table = buildClusterTable(region.rows, columns, pageNum);
3414
- if (table) results.push(table);
3677
+ const headerResult = detectHeaderRow(rows);
3678
+ if (headerResult) {
3679
+ const { columns, headerIdx } = headerResult;
3680
+ const headerRow = rows[headerIdx];
3681
+ const headerItems = [...headerRow.items].sort((a, b) => a.x - b.x);
3682
+ const headerAndBelow = rows.slice(headerIdx);
3683
+ const mergedRows = mergeMultiLineRows(headerAndBelow, columns);
3684
+ const tableRegions = findTableRegionsByHeader(mergedRows, columns, headerItems);
3685
+ for (const region of tableRegions) {
3686
+ const table = buildClusterTable(region.rows, columns, pageNum);
3687
+ if (table) {
3688
+ expandUsedItems(table.usedItems, originMap);
3689
+ results.push(table);
3690
+ }
3691
+ }
3692
+ }
3693
+ if (results.length === 0) {
3694
+ const suspiciousRows = rows.filter((row) => hasSuspiciousGaps(row));
3695
+ if (suspiciousRows.length >= MIN_ROWS) {
3696
+ const columns = extractColumnClusters(suspiciousRows);
3697
+ if (columns.length >= MIN_COLS) {
3698
+ const tableRegions = findTableRegions(rows, columns);
3699
+ for (const region of tableRegions) {
3700
+ const mergedRows = mergeMultiLineRows(region.rows, columns);
3701
+ const table = buildClusterTable(mergedRows, columns, pageNum);
3702
+ if (table) {
3703
+ expandUsedItems(table.usedItems, originMap);
3704
+ results.push(table);
3705
+ }
3706
+ }
3707
+ }
3708
+ }
3415
3709
  }
3416
3710
  return results;
3417
3711
  }
3712
/**
 * Collapses runs of evenly spaced single-character items into one item each.
 *
 * Items are grouped into baseline rows and scanned left to right. A run is a
 * sequence of items whose text is a single Hangul syllable or digit, where
 * every gap lies within [fontSize * 0.1, fontSize * 3]. A run of three or more
 * items is merged when its smallest gap is positive and the largest/smallest
 * gap ratio is at most 3. All other items are passed through unchanged.
 *
 * @param {Array<object>} items Positioned text items.
 * @returns {{merged: Array<object>, originMap: Map<object, Array<object>>}}
 *   `merged` is the resulting item list; `originMap` maps each synthesized
 *   item to the original items it replaced.
 */
function mergeEvenSpacedClusters(items) {
  const originMap = new Map();
  const merged = [];
  const isSingleGlyph = (entry) => /^[가-힣\d]$/.test(entry.text);

  // Returns the items of an acceptable run starting at `from`, or null.
  const evenRunFrom = (line, from) => {
    const gaps = [];
    let to = from + 1;
    while (to < line.length && isSingleGlyph(line[to])) {
      const gap = line[to].x - (line[to - 1].x + line[to - 1].w);
      const size = line[to].fontSize;
      if (gap < size * 0.1 || gap > size * 3) break;
      gaps.push(gap);
      to += 1;
    }
    if (to - from < 3) return null;
    let narrowest = Infinity;
    let widest = -Infinity;
    for (const gap of gaps) {
      if (gap < narrowest) narrowest = gap;
      if (gap > widest) widest = gap;
    }
    if (!(narrowest > 0 && widest / narrowest <= 3)) return null;
    return line.slice(from, to);
  };

  for (const row of groupByBaseline(items)) {
    const line = [...row.items].sort((a, b) => a.x - b.x);
    let pos = 0;
    while (pos < line.length) {
      const run = isSingleGlyph(line[pos]) ? evenRunFrom(line, pos) : null;
      if (run === null) {
        // Rejected runs only advance by one item, so later items are rescanned.
        merged.push(line[pos]);
        pos += 1;
        continue;
      }
      const first = run[0];
      const last = run[run.length - 1];
      const item = {
        text: run.map((r) => r.text).join(""),
        x: first.x,
        y: first.y,
        w: last.x + last.w - first.x,
        h: first.h,
        fontSize: first.fontSize,
        fontName: first.fontName
      };
      originMap.set(item, run);
      merged.push(item);
      pos += run.length;
    }
  }
  return { merged, originMap };
}
3764
/**
 * Adds, for every item in `usedItems` that has an entry in `originMap`, the
 * original items recorded for it. Existing members are kept.
 *
 * @param {Set<object>} usedItems Set mutated in place.
 * @param {Map<object, Iterable<object>>} originMap Lookup from an item to the
 *   items it stands for.
 * @returns {void}
 */
function expandUsedItems(usedItems, originMap) {
  // Collect first, add afterwards, so the set is not grown while it is read.
  const additions = [...usedItems].flatMap((entry) => {
    const origins = originMap.get(entry);
    return origins ? [...origins] : [];
  });
  additions.forEach((origin) => usedItems.add(origin));
}
3772
/**
 * Searches the rows, top to bottom, for the first row that looks like a table
 * header and is followed by enough rows matching its columns.
 *
 * A row is a header candidate when it has between MIN_COLS and 6 items, no
 * item text longer than 8 characters, at least one item containing Hangul, no
 * item starting with a bullet-like symbol, spans at least 40% of the overall
 * horizontal extent, and has at least one gap of 2.5x the average font size.
 * The candidate is accepted when at least MIN_ROWS later rows match MIN_COLS
 * or more of its columns (counting stops after MIN_ROWS + 2 matches).
 *
 * @param {Array<{items: Array<object>}>} rows Baseline rows.
 * @returns {{columns: Array<{x: number, count: number}>, headerIdx: number}|null}
 */
function detectHeaderRow(rows) {
  const everyItem = rows.flatMap((r) => r.items);
  if (everyItem.length === 0) return null;

  let leftEdge = Infinity;
  let rightEdge = -Infinity;
  for (const entry of everyItem) {
    if (entry.x < leftEdge) leftEdge = entry.x;
    const entryRight = entry.x + entry.w;
    if (entryRight > rightEdge) rightEdge = entryRight;
  }
  const pageSpan = rightEdge - leftEdge;
  if (pageSpan <= 0) return null;

  // Returns the row's items sorted by x when it qualifies as a header, else null.
  const headerCellsOf = (row) => {
    const cells = row.items;
    if (cells.length < MIN_COLS || cells.length > 6) return null;
    if (cells.some((c) => c.text.length > 8)) return null;
    if (!cells.some((c) => /[가-힣]/.test(c.text))) return null;
    if (cells.some((c) => /^[□■○●·※▶▷◆◇\-]/.test(c.text))) return null;

    const ordered = [...cells].sort((a, b) => a.x - b.x);
    const xSpan = ordered[ordered.length - 1].x + ordered[ordered.length - 1].w - ordered[0].x;
    if (xSpan / pageSpan < 0.4) return null;

    const meanFontSize = ordered.reduce((sum, c) => sum + c.fontSize, 0) / ordered.length;
    const hasWideGap = ordered.some(
      (c, k) => k > 0 && c.x - (ordered[k - 1].x + ordered[k - 1].w) >= meanFontSize * 2.5
    );
    return hasWideGap ? ordered : null;
  };

  for (let ri = 0; ri < rows.length; ri++) {
    const ordered = headerCellsOf(rows[ri]);
    if (!ordered) continue;

    const columns = ordered.map((item) => ({ x: item.x, count: 0 }));
    let followers = 0;
    for (let j = ri + 1; j < rows.length && followers < MIN_ROWS + 2; j++) {
      if (countMatchedColumnsRange(rows[j], columns, ordered) >= MIN_COLS) followers++;
    }
    if (followers >= MIN_ROWS) return { columns, headerIdx: ri };
  }
  return null;
}
3813
/**
 * Folds continuation lines into the row above them.
 *
 * A row is treated as a continuation when it lies within 1.8x the average
 * font size (vertically) of the previously kept row, has at most two items,
 * and either matches fewer than MIN_COLS columns or has exactly one item.
 * The merged row keeps the y of the earlier row and concatenates the items.
 *
 * @param {Array<{y: number, items: Array<object>}>} rows
 * @param {Array<object>} columns Column descriptors passed to `countMatchedColumns`.
 * @returns {Array<{y: number, items: Array<object>}>} The input array itself
 *   when it has at most one row, otherwise a new array.
 */
function mergeMultiLineRows(rows, columns) {
  if (rows.length <= 1) return rows;

  let fontSizeSum = 0;
  let fontSizeCount = 0;
  for (const row of rows) {
    for (const entry of row.items) {
      fontSizeSum += entry.fontSize;
      fontSizeCount += 1;
    }
  }
  // 12 is the fallback when no row carries any item.
  const avgFontSize = fontSizeCount > 0 ? fontSizeSum / fontSizeCount : 12;

  return rows.slice(1).reduce(
    (kept, curr) => {
      const prev = kept[kept.length - 1];
      const yGap = Math.abs(prev.y - curr.y);
      const matchedCols = countMatchedColumns(curr, columns);
      const isContinuation =
        yGap < avgFontSize * 1.8 &&
        curr.items.length <= 2 &&
        (matchedCols < MIN_COLS || curr.items.length === 1);
      if (isContinuation) {
        kept[kept.length - 1] = { y: prev.y, items: [...prev.items, ...curr.items] };
      } else {
        kept.push(curr);
      }
      return kept;
    },
    [rows[0]]
  );
}
3418
3834
  function groupByBaseline(items) {
3419
3835
  if (items.length === 0) return [];
3420
3836
  const sorted = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
@@ -3436,8 +3852,9 @@ function groupByBaseline(items) {
3436
3852
  function hasSuspiciousGaps(row) {
3437
3853
  if (row.items.length < 2) return false;
3438
3854
  const sorted = [...row.items].sort((a, b) => a.x - b.x);
3855
+ if (sorted.length === 2 && sorted[1].text.length > 20) return false;
3439
3856
  const avgFontSize = sorted.reduce((s, i) => s + i.fontSize, 0) / sorted.length;
3440
- const minGap = avgFontSize * MIN_GAP_FACTOR;
3857
+ const minGap = Math.max(avgFontSize * MIN_GAP_FACTOR, MIN_GAP_ABSOLUTE);
3441
3858
  for (let i = 1; i < sorted.length; i++) {
3442
3859
  const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
3443
3860
  if (gap >= minGap) return true;
@@ -3464,6 +3881,41 @@ function extractColumnClusters(rows) {
3464
3881
  const minCount = Math.max(2, Math.floor(rows.length * MIN_COL_FILL_RATIO));
3465
3882
  return clusters.filter((c) => c.count >= minCount).sort((a, b) => a.x - b.x);
3466
3883
  }
3884
/**
 * Splits rows into table regions using the header's column ranges.
 *
 * Rows matching at least MIN_COLS header columns extend the current region.
 * A non-matching row is still tolerated inside a region when it has at most
 * two items or directly follows a matching row. Any other row closes the
 * region. Before a region is emitted, trailing non-matching rows are dropped,
 * and it is kept only if at least MIN_ROWS rows remain.
 *
 * @param {Array<{items: Array<object>}>} allRows Rows to scan, in order.
 * @param {Array<object>} columns Column descriptors forwarded to the matcher.
 * @param {Array<{x: number, w: number}>} headerItems Header cells sorted by x.
 * @returns {Array<{rows: Array<object>}>} Detected regions.
 */
function findTableRegionsByHeader(allRows, columns, headerItems) {
  const fitsHeader = (row) => countMatchedColumnsRange(row, columns, headerItems) >= MIN_COLS;
  const regions = [];
  let pending = [];
  let missStreak = 0;

  // Trim tolerated-but-unmatched rows off the end, then emit if long enough.
  const closePending = () => {
    while (pending.length > 0 && !fitsHeader(pending[pending.length - 1])) {
      pending.pop();
    }
    if (pending.length >= MIN_ROWS) {
      regions.push({ rows: pending });
    }
    pending = [];
    missStreak = 0;
  };

  for (const row of allRows) {
    if (fitsHeader(row)) {
      pending.push(row);
      missStreak = 0;
      continue;
    }
    const tolerated = pending.length > 0 && (row.items.length <= 2 || missStreak === 0);
    if (tolerated) {
      pending.push(row);
      missStreak++;
      continue;
    }
    closePending();
  }
  closePending();
  return regions;
}
3467
3919
  function findTableRegions(allRows, columns) {
3468
3920
  const regions = [];
3469
3921
  let currentRegion = [];
@@ -3499,18 +3951,81 @@ function countMatchedColumns(row, columns) {
3499
3951
  }
3500
3952
  return matched.size;
3501
3953
  }
3502
- function assignToColumn(item, columns) {
3503
- const MAX_DIST = COL_CLUSTER_TOL * 3;
3504
- let bestCol = -1;
3505
- let bestDist = Infinity;
3506
- for (let ci = 0; ci < columns.length; ci++) {
3507
- const dist = Math.abs(item.x - columns[ci].x);
3508
- if (dist < bestDist) {
3509
- bestDist = dist;
3510
- bestCol = ci;
3954
/**
 * Counts how many distinct header columns are hit by the items of a row.
 *
 * Each header cell owns a horizontal range: it starts halfway between the
 * previous header's right edge and this header's left edge, and ends halfway
 * between this header's right edge and the next header's left edge. The first
 * range is open to the left and the last range is open to the right, so every
 * item falls into exactly one range regardless of the coordinate origin.
 * (The left bound used to be 0, which silently dropped items with negative x.)
 *
 * @param {{items: Array<{x: number}>}} row Row whose items are classified by x.
 * @param {Array<object>} columns Unused; kept for signature compatibility.
 * @param {Array<{x: number, w: number}>} headerItems Header cells sorted by x.
 * @returns {number} Number of distinct header ranges containing at least one item.
 */
function countMatchedColumnsRange(row, columns, headerItems) {
  const lastIdx = headerItems.length - 1;
  const boundaries = headerItems.map((header, ci) => {
    const left = ci === 0
      ? -Infinity
      : (headerItems[ci - 1].x + headerItems[ci - 1].w + header.x) / 2;
    const right = ci === lastIdx
      ? Infinity
      : (header.x + header.w + headerItems[ci + 1].x) / 2;
    return { left, right };
  });

  const matched = new Set();
  for (const item of row.items) {
    // Ranges do not overlap, so the first hit is the only hit.
    const ci = boundaries.findIndex((b) => item.x >= b.left && item.x < b.right);
    if (ci >= 0) matched.add(ci);
  }
  return matched.size;
}
3972
/**
 * Splits the items of one row into groups and assigns each group to a column.
 *
 * The row is cut at its largest horizontal gaps (at most numCols - 1 cuts,
 * each at least the gap threshold). The threshold is 12 when the row has at
 * most numCols + 1 items, otherwise max(2.5 x median gap, 12). Groups are
 * then paired with columns greedily by ascending distance between the
 * group's horizontal centre and the column's x, each column being used once.
 * Any group left without a column is appended with its nearest column.
 *
 * @param {Array<{x: number, w: number}>} items Items of a single row.
 * @param {Array<{x: number}>} columns Column descriptors.
 * @param {number} numCols Number of columns to consider.
 * @returns {Array<{col: number, items: Array<object>}>} Assignments, in the
 *   order they were made (closest pair first).
 */
function assignRowItems(items, columns, numCols) {
  if (items.length === 0) return [];

  const ordered = [...items].sort((a, b) => a.x - b.x);
  const columnXs = columns.map((c) => c.x);

  // gap.idx is the index of the item that follows the gap.
  const gaps = ordered.slice(1).map((entry, k) => ({
    idx: k + 1,
    size: entry.x - (ordered[k].x + ordered[k].w)
  }));
  const ascendingSizes = gaps.map((g) => g.size).sort((a, b) => a - b);
  const medianGap = ascendingSizes.length > 0 ? ascendingSizes[Math.floor(ascendingSizes.length / 2)] : 0;

  let gapThreshold;
  if (ordered.length <= numCols + 1) {
    gapThreshold = 12;
  } else {
    gapThreshold = Math.max(medianGap * 2.5, 12);
  }

  const largestFirst = gaps.filter((g) => g.size >= gapThreshold).sort((a, b) => b.size - a.size);
  const cutPoints = largestFirst.slice(0, numCols - 1).sort((a, b) => a.idx - b.idx);

  const groups = [];
  let from = 0;
  for (const cut of cutPoints) {
    groups.push(ordered.slice(from, cut.idx));
    from = cut.idx;
  }
  groups.push(ordered.slice(from));

  const centres = groups.map((group) => {
    let lo = Infinity;
    let hi = -Infinity;
    for (const entry of group) {
      if (entry.x < lo) lo = entry.x;
      const entryRight = entry.x + entry.w;
      if (entryRight > hi) hi = entryRight;
    }
    return (lo + hi) / 2;
  });

  const candidates = [];
  for (let gi = 0; gi < groups.length; gi++) {
    for (let ci = 0; ci < numCols; ci++) {
      candidates.push({ gi, ci, dist: Math.abs(centres[gi] - columnXs[ci]) });
    }
  }
  candidates.sort((a, b) => a.dist - b.dist);

  const result = [];
  const takenCols = new Set();
  const placedGroups = new Set();
  for (const { gi, ci } of candidates) {
    if (placedGroups.has(gi) || takenCols.has(ci)) continue;
    result.push({ col: ci, items: groups[gi] });
    placedGroups.add(gi);
    takenCols.add(ci);
  }

  // Leftover groups share the nearest column (first one wins on ties).
  groups.forEach((group, gi) => {
    if (placedGroups.has(gi)) return;
    let nearestCol = 0;
    let nearestDist = Infinity;
    for (let ci = 0; ci < numCols; ci++) {
      const d = Math.abs(centres[gi] - columnXs[ci]);
      if (d < nearestDist) {
        nearestDist = d;
        nearestCol = ci;
      }
    }
    result.push({ col: nearestCol, items: group });
  });
  return result;
}
3515
4030
  function buildClusterTable(rows, columns, pageNum) {
3516
4031
  const numCols = columns.length;
@@ -3528,12 +4043,12 @@ function buildClusterTable(rows, columns, pageNum) {
3528
4043
  usedItems.add(row.items[0]);
3529
4044
  continue;
3530
4045
  }
3531
- for (const item of row.items) {
3532
- const col = assignToColumn(item, columns);
3533
- if (col < 0) continue;
4046
+ const assignments = assignRowItems(row.items, columns, numCols);
4047
+ for (const { col, items } of assignments) {
4048
+ const text = items.map((i) => i.text).join(" ");
3534
4049
  const existing = cells[r][col].text;
3535
- cells[r][col].text = existing ? existing + " " + item.text : item.text;
3536
- usedItems.add(item);
4050
+ cells[r][col].text = existing ? existing + " " + text : text;
4051
+ for (const item of items) usedItems.add(item);
3537
4052
  }
3538
4053
  }
3539
4054
  let emptyRows = 0;
@@ -3545,11 +4060,48 @@ function buildClusterTable(rows, columns, pageNum) {
3545
4060
  const hasValue = cells.some((row) => row[c].text !== "");
3546
4061
  if (!hasValue) return null;
3547
4062
  }
4063
+ for (let r = numRows - 1; r >= 1; r--) {
4064
+ const nonEmptyCols = cells[r].filter((c) => c.text.trim()).length;
4065
+ if (nonEmptyCols !== 1) continue;
4066
+ if (cells[r][0].text.trim() !== "") continue;
4067
+ const contentText = cells[r].find((c) => c.text.trim())?.text.trim() || "";
4068
+ if (/^[○●▶\-·]/.test(contentText)) continue;
4069
+ for (let pr = r - 1; pr >= 0; pr--) {
4070
+ if (cells[pr].some((c) => c.text.trim())) {
4071
+ for (let c = 0; c < numCols; c++) {
4072
+ const prev = cells[pr][c].text.trim();
4073
+ const curr = cells[r][c].text.trim();
4074
+ if (curr) cells[pr][c].text = prev ? prev + " " + curr : curr;
4075
+ }
4076
+ for (let c = 0; c < numCols; c++) cells[r][c].text = "";
4077
+ break;
4078
+ }
4079
+ }
4080
+ }
4081
+ for (let r = 0; r < cells.length - 1; r++) {
4082
+ const row = cells[r];
4083
+ const hasCol0 = row[0].text.trim() !== "";
4084
+ const hasColLast = numCols > 1 && row[numCols - 1].text.trim() !== "";
4085
+ const midEmpty = row.slice(1, numCols - 1).every((c) => c.text.trim() === "");
4086
+ if (hasCol0 && hasColLast && midEmpty) {
4087
+ const next = cells[r + 1];
4088
+ if (next[0].text.trim() === "" && next.some((c) => c.text.trim())) {
4089
+ for (let c = 1; c < numCols; c++) {
4090
+ const curr = next[c].text.trim();
4091
+ if (curr) row[c].text = row[c].text.trim() ? row[c].text.trim() + " " + curr : curr;
4092
+ }
4093
+ for (let c = 0; c < numCols; c++) next[c].text = "";
4094
+ }
4095
+ }
4096
+ }
4097
+ const filteredCells = cells.filter((row) => row.some((c) => c.text.trim()));
4098
+ const finalRowCount = filteredCells.length;
4099
+ if (finalRowCount < MIN_ROWS) return null;
3548
4100
  const irTable = {
3549
- rows: numRows,
4101
+ rows: finalRowCount,
3550
4102
  cols: numCols,
3551
- cells,
3552
- hasHeader: numRows > 1
4103
+ cells: filteredCells,
4104
+ hasHeader: finalRowCount > 1
3553
4105
  };
3554
4106
  const allItems = rows.flatMap((r) => r.items);
3555
4107
  let minX = Infinity, minY = Infinity, maxX = -Infinity, maxY = -Infinity;
@@ -3626,7 +4178,7 @@ async function parsePdfDocument(buffer, options) {
3626
4178
  const effectivePageCount = Math.min(pageCount, MAX_PAGES);
3627
4179
  const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
3628
4180
  const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
3629
- const allFontSizes = [];
4181
+ const fontSizeFreq = /* @__PURE__ */ new Map();
3630
4182
  const pageHeights = /* @__PURE__ */ new Map();
3631
4183
  let parsedPages = 0;
3632
4184
  for (let i = 1; i <= effectivePageCount; i++) {
@@ -3643,7 +4195,7 @@ async function parsePdfDocument(buffer, options) {
3643
4195
  warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
3644
4196
  }
3645
4197
  for (const item of visible) {
3646
- if (item.fontSize > 0) allFontSizes.push(item.fontSize);
4198
+ if (item.fontSize > 0) fontSizeFreq.set(item.fontSize, (fontSizeFreq.get(item.fontSize) || 0) + 1);
3647
4199
  }
3648
4200
  const opList = await page.getOperatorList();
3649
4201
  const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
@@ -3682,7 +4234,7 @@ async function parsePdfDocument(buffer, options) {
3682
4234
  blocks.splice(removed[ri], 1);
3683
4235
  }
3684
4236
  }
3685
- const medianFontSize = computeMedianFontSize(allFontSizes);
4237
+ const medianFontSize = computeMedianFontSizeFromFreq(fontSizeFreq);
3686
4238
  if (medianFontSize > 0) {
3687
4239
  detectHeadings(blocks, medianFontSize);
3688
4240
  }
@@ -3735,11 +4287,18 @@ function filterHiddenText(items, pageWidth, pageHeight) {
3735
4287
  }
3736
4288
  return { visible, hiddenCount };
3737
4289
  }
3738
- function computeMedianFontSize(sizes) {
3739
- if (sizes.length === 0) return 0;
3740
- const sorted = [...sizes].sort((a, b) => a - b);
3741
- const mid = Math.floor(sorted.length / 2);
3742
- return sorted.length % 2 === 0 ? (sorted[mid - 1] + sorted[mid]) / 2 : sorted[mid];
4290
/**
 * Computes the median font size from a size -> occurrence-count map.
 *
 * With an odd number of samples the middle value is returned. With an even
 * number the two middle values are averaged, which matches the list-based
 * median this helper replaces (previously the upper middle value was
 * returned instead).
 *
 * @param {Map<number, number>} freq Font size mapped to how often it occurred.
 * @returns {number} The median font size, or 0 when `freq` is empty.
 */
function computeMedianFontSizeFromFreq(freq) {
  if (freq.size === 0) return 0;
  let total = 0;
  for (const count of freq.values()) total += count;

  const sorted = [...freq.entries()].sort((a, b) => a[0] - b[0]);
  // Zero-based positions of the two middle samples; equal when total is odd.
  const lowerPos = Math.floor((total - 1) / 2);
  const upperPos = Math.floor(total / 2);

  let cumulative = 0;
  let lowerValue;
  for (const [size, count] of sorted) {
    cumulative += count;
    if (lowerValue === undefined && cumulative > lowerPos) lowerValue = size;
    if (cumulative > upperPos) return (lowerValue + size) / 2;
  }
  // Only reachable with degenerate counts (e.g. all zero).
  return sorted[sorted.length - 1][0];
}
3744
4303
  function detectHeadings(blocks, medianFontSize) {
3745
4304
  for (const block of blocks) {
@@ -3765,11 +4324,21 @@ function collapseEvenSpacing(text) {
3765
4324
  if (tokens.length >= 3 && singleCharCount / tokens.length >= 0.7) {
3766
4325
  return tokens.join("");
3767
4326
  }
3768
- return text;
4327
+ return text.replace(
4328
+ /(?<![가-힣])[가-힣](?: [가-힣\d]){2,}(?![가-힣])/g,
4329
+ (match) => match.replace(/ /g, "")
4330
+ );
3769
4331
  }
3770
4332
  function shouldDemoteTable(table) {
3771
4333
  const allCells = table.cells.flatMap((row) => row.map((c) => c.text.trim())).filter(Boolean);
3772
4334
  const allText = allCells.join(" ");
4335
+ if (table.rows <= 3 && table.cols <= 3) {
4336
+ const totalCells2 = table.rows * table.cols;
4337
+ const emptyCells2 = totalCells2 - allCells.length;
4338
+ if (emptyCells2 >= totalCells2 * 0.3) return true;
4339
+ if (/[□■◆○●▶ㅇ]/.test(allText)) return true;
4340
+ if (/<[^>]+>/.test(allText)) return true;
4341
+ }
3773
4342
  if (allText.length > 200) return false;
3774
4343
  if (/[□■◆○●▶]/.test(allText) && table.rows <= 3) return true;
3775
4344
  const totalCells = table.rows * table.cols;
@@ -3880,6 +4449,7 @@ function extractPageBlocksWithLines(items, pageNum, opList, pageWidth, pageHeigh
3880
4449
  if (items.length === 0) return [];
3881
4450
  let { horizontals, verticals } = extractLines(opList.fnArray, opList.argsArray);
3882
4451
  ({ horizontals, verticals } = filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight));
4452
+ ({ horizontals, verticals } = preprocessLines(horizontals, verticals));
3883
4453
  const grids = buildTableGrids(horizontals, verticals);
3884
4454
  if (grids.length > 0) {
3885
4455
  return extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals);
@@ -3891,14 +4461,19 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
3891
4461
  const usedItems = /* @__PURE__ */ new Set();
3892
4462
  const sortedGrids = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
3893
4463
  for (const grid of sortedGrids) {
4464
+ const numGridRows = grid.rowYs.length - 1;
4465
+ const numGridCols = grid.colXs.length - 1;
4466
+ if (numGridRows === 1 && numGridCols >= 2) continue;
3894
4467
  const tableItems = [];
3895
4468
  const pad = 3;
4469
+ const gridW = grid.bbox.x2 - grid.bbox.x1;
3896
4470
  for (const item of items) {
3897
4471
  if (usedItems.has(item)) continue;
3898
- if (item.x >= grid.bbox.x1 - pad && item.x + item.w <= grid.bbox.x2 + pad && item.y >= grid.bbox.y1 - pad && item.y <= grid.bbox.y2 + pad) {
3899
- tableItems.push(item);
3900
- usedItems.add(item);
3901
- }
4472
+ if (item.y < grid.bbox.y1 - pad || item.y > grid.bbox.y2 + pad) continue;
4473
+ if (item.x < grid.bbox.x1 - pad || item.x + item.w > grid.bbox.x2 + pad) continue;
4474
+ if (gridW < 120 && item.x + item.w > grid.bbox.x2 - 2) continue;
4475
+ tableItems.push(item);
4476
+ usedItems.add(item);
3902
4477
  }
3903
4478
  const cells = extractCells(grid, horizontals, verticals);
3904
4479
  if (cells.length === 0) continue;
@@ -3922,6 +4497,7 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
3922
4497
  const cellItems = cellTextMap.get(cell) || [];
3923
4498
  let text = cellTextToString(cellItems);
3924
4499
  text = text.replace(/^[\s]*[-–—]\s*\d+\s*[-–—][\s]*$/gm, "").trim();
4500
+ text = text.split("\n").map((line) => collapseEvenSpacing(line)).join("\n");
3925
4501
  irGrid[cell.row][cell.col] = {
3926
4502
  text,
3927
4503
  colSpan: cell.colSpan,
@@ -3946,23 +4522,58 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
3946
4522
  if (shouldDemoteTable(irTable)) {
3947
4523
  const demoted = demoteTableToText(irTable);
3948
4524
  if (demoted) {
3949
- blocks.push({ type: "paragraph", text: demoted, pageNumber: pageNum, bbox: tableBbox, style: dominantStyle(tableItems) });
4525
+ const text = numGridRows === 1 ? "\n" + demoted + "\n" : demoted;
4526
+ blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox: tableBbox, style: dominantStyle(tableItems) });
3950
4527
  }
3951
4528
  continue;
3952
4529
  }
3953
4530
  blocks.push({ type: "table", table: irTable, pageNumber: pageNum, bbox: tableBbox });
3954
4531
  }
3955
- const remaining = items.filter((i) => !usedItems.has(i));
4532
+ let remaining = items.filter((i) => !usedItems.has(i));
3956
4533
  if (remaining.length > 0) {
3957
4534
  remaining.sort((a, b) => b.y - a.y || a.x - b.x);
3958
- const textBlocks = detectListBlocks(extractPageBlocksFallback(remaining, pageNum));
3959
- const allBlocks = [...blocks, ...textBlocks];
3960
- allBlocks.sort((a, b) => {
4535
+ const clusterItems = remaining.map((i) => ({
4536
+ text: i.text,
4537
+ x: i.x,
4538
+ y: i.y,
4539
+ w: i.w,
4540
+ h: i.h,
4541
+ fontSize: i.fontSize,
4542
+ fontName: i.fontName
4543
+ }));
4544
+ const clusterResults = detectClusterTables(clusterItems, pageNum);
4545
+ if (clusterResults.length > 0) {
4546
+ const ciToIdx = /* @__PURE__ */ new Map();
4547
+ for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
4548
+ const usedClusterIndices = /* @__PURE__ */ new Set();
4549
+ for (const cr of clusterResults) {
4550
+ for (const ci of cr.usedItems) {
4551
+ const idx = ciToIdx.get(ci);
4552
+ if (idx !== void 0) usedClusterIndices.add(idx);
4553
+ }
4554
+ blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
4555
+ }
4556
+ remaining = remaining.filter((_, idx) => !usedClusterIndices.has(idx));
4557
+ }
4558
+ if (remaining.length > 0) {
4559
+ const allY = remaining.map((i) => i.y);
4560
+ const pageH = safeMax(allY) - safeMin(allY);
4561
+ const groups = xyCutOrder(remaining, Math.max(15, pageH * 0.03));
4562
+ const textBlocks = [];
4563
+ for (const group of groups) {
4564
+ if (group.length === 0) continue;
4565
+ const groupBlocks = extractPageBlocksFallback(group, pageNum);
4566
+ for (const b of groupBlocks) textBlocks.push(b);
4567
+ }
4568
+ const finalTextBlocks = detectListBlocks(textBlocks);
4569
+ for (const b of finalTextBlocks) blocks.push(b);
4570
+ }
4571
+ blocks.sort((a, b) => {
3961
4572
  const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
3962
4573
  const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
3963
4574
  return by - ay;
3964
4575
  });
3965
- return mergeAdjacentTableBlocks(allBlocks);
4576
+ return mergeAdjacentTableBlocks(blocks);
3966
4577
  }
3967
4578
  return mergeAdjacentTableBlocks(blocks);
3968
4579
  }
@@ -3989,52 +4600,52 @@ function mergeAdjacentTableBlocks(blocks) {
3989
4600
  function extractPageBlocksFallback(items, pageNum) {
3990
4601
  if (items.length === 0) return [];
3991
4602
  const blocks = [];
3992
- const allYLines = groupByY(items);
3993
- const columns = detectColumns(allYLines);
3994
- if (columns && columns.length >= 3) {
3995
- const tableText = extractWithColumns(allYLines, columns);
3996
- const bbox = computeBBox(items, pageNum);
3997
- blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(items) });
3998
- } else {
3999
- const clusterItems = items.map((i) => ({
4000
- text: i.text,
4001
- x: i.x,
4002
- y: i.y,
4003
- w: i.w,
4004
- h: i.h,
4005
- fontSize: i.fontSize,
4006
- fontName: i.fontName
4007
- }));
4008
- const clusterResults = detectClusterTables(clusterItems, pageNum);
4009
- if (clusterResults.length > 0) {
4010
- const ciToIdx = /* @__PURE__ */ new Map();
4011
- for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
4012
- const usedIndices = /* @__PURE__ */ new Set();
4013
- for (const cr of clusterResults) {
4014
- for (const ci of cr.usedItems) {
4015
- const idx = ciToIdx.get(ci);
4016
- if (idx !== void 0) usedIndices.add(idx);
4017
- }
4018
- blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
4603
+ const clusterItems = items.map((i) => ({
4604
+ text: i.text,
4605
+ x: i.x,
4606
+ y: i.y,
4607
+ w: i.w,
4608
+ h: i.h,
4609
+ fontSize: i.fontSize,
4610
+ fontName: i.fontName
4611
+ }));
4612
+ const clusterResults = detectClusterTables(clusterItems, pageNum);
4613
+ if (clusterResults.length > 0) {
4614
+ const ciToIdx = /* @__PURE__ */ new Map();
4615
+ for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
4616
+ const usedIndices = /* @__PURE__ */ new Set();
4617
+ for (const cr of clusterResults) {
4618
+ for (const ci of cr.usedItems) {
4619
+ const idx = ciToIdx.get(ci);
4620
+ if (idx !== void 0) usedIndices.add(idx);
4019
4621
  }
4020
- const remaining = items.filter((_, idx) => !usedIndices.has(idx));
4021
- if (remaining.length > 0) {
4022
- const yLines = groupByY(remaining);
4023
- for (const line of yLines) {
4024
- const text = mergeLineSimple(line);
4025
- if (!text.trim()) continue;
4026
- const bbox = computeBBox(line, pageNum);
4027
- blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style: dominantStyle(line) });
4028
- }
4622
+ blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
4623
+ }
4624
+ const remaining = items.filter((_, idx) => !usedIndices.has(idx));
4625
+ if (remaining.length > 0) {
4626
+ const yLines = groupByY(remaining);
4627
+ for (const line of yLines) {
4628
+ const text = mergeLineSimple(line);
4629
+ if (!text.trim()) continue;
4630
+ const bbox = computeBBox(line, pageNum);
4631
+ blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style: dominantStyle(line) });
4029
4632
  }
4030
- blocks.sort((a, b) => {
4031
- const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
4032
- const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
4033
- return by - ay;
4034
- });
4633
+ }
4634
+ blocks.sort((a, b) => {
4635
+ const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
4636
+ const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
4637
+ return by - ay;
4638
+ });
4639
+ } else {
4640
+ const allYLines = groupByY(items);
4641
+ const columns = detectColumns(allYLines);
4642
+ if (columns && columns.length >= 3) {
4643
+ const tableText = extractWithColumns(allYLines, columns);
4644
+ const bbox = computeBBox(items, pageNum);
4645
+ blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(items) });
4035
4646
  } else {
4036
4647
  const allY = items.map((i) => i.y);
4037
- const pageHeight = Math.max(...allY) - Math.min(...allY);
4648
+ const pageHeight = safeMax(allY) - safeMin(allY);
4038
4649
  const gapThreshold = Math.max(15, pageHeight * 0.03);
4039
4650
  const orderedGroups = xyCutOrder(items, gapThreshold);
4040
4651
  for (const group of orderedGroups) {
@@ -4087,22 +4698,76 @@ function dominantStyle(items) {
4087
4698
  return { fontSize: dominantSize, fontName };
4088
4699
  }
4089
4700
  function normalizeItems(rawItems) {
4090
- return rawItems.filter((i) => typeof i.str === "string" && i.str.trim() !== "").map((i) => {
4701
+ const items = [];
4702
+ const spacePositions = [];
4703
+ for (const i of rawItems) {
4704
+ if (typeof i.str !== "string") continue;
4705
+ const x = Math.round(i.transform[4]);
4706
+ const y = Math.round(i.transform[5]);
4707
+ if (!i.str.trim()) {
4708
+ spacePositions.push({ x, y });
4709
+ continue;
4710
+ }
4091
4711
  const scaleY = Math.abs(i.transform[3]);
4092
4712
  const scaleX = Math.abs(i.transform[0]);
4093
4713
  const fontSize = Math.round(Math.max(scaleY, scaleX));
4094
- return {
4095
- text: i.str.trim(),
4096
- x: Math.round(i.transform[4]),
4097
- y: Math.round(i.transform[5]),
4098
- w: Math.round(i.width),
4099
- h: Math.round(i.height),
4100
- fontSize,
4101
- fontName: i.fontName || "",
4102
- // 0pt 폰트이거나 너비 0 → hidden text (prompt injection 의심)
4103
- isHidden: fontSize === 0 || i.width === 0 && i.str.trim().length > 0
4104
- };
4105
- }).sort((a, b) => b.y - a.y || a.x - b.x);
4714
+ const w = Math.round(i.width);
4715
+ const h = Math.round(i.height);
4716
+ const isHidden = fontSize === 0 || i.width === 0 && i.str.trim().length > 0;
4717
+ let text = i.str.trim();
4718
+ if (/^[\d\s\-().·,☎]+$/.test(text) && /\d/.test(text) && / /.test(text)) {
4719
+ text = text.replace(/ /g, "");
4720
+ }
4721
+ const split = splitEvenSpacedItem(text, x, w, fontSize);
4722
+ if (split) {
4723
+ for (const s of split) {
4724
+ items.push({ text: s.text, x: s.x, y, w: s.w, h, fontSize, fontName: i.fontName || "", isHidden });
4725
+ }
4726
+ } else {
4727
+ items.push({ text, x, y, w, h, fontSize, fontName: i.fontName || "", isHidden });
4728
+ }
4729
+ }
4730
+ const sorted = items.sort((a, b) => b.y - a.y || a.x - b.x);
4731
+ const deduped = [];
4732
+ for (let i = 0; i < sorted.length; i++) {
4733
+ let isDup = false;
4734
+ for (let j = deduped.length - 1; j >= 0; j--) {
4735
+ const prev = deduped[j];
4736
+ if (prev.y - sorted[i].y > 3) break;
4737
+ if (Math.abs(prev.y - sorted[i].y) <= 3 && prev.text === sorted[i].text && Math.abs(prev.x - sorted[i].x) <= 3) {
4738
+ isDup = true;
4739
+ break;
4740
+ }
4741
+ }
4742
+ if (!isDup) deduped.push(sorted[i]);
4743
+ }
4744
+ if (spacePositions.length > 0) {
4745
+ for (const item of deduped) {
4746
+ for (const sp of spacePositions) {
4747
+ if (Math.abs(sp.y - item.y) <= 3) {
4748
+ const dist = item.x - sp.x;
4749
+ if (dist >= 0 && dist <= 20) {
4750
+ item.hasSpaceBefore = true;
4751
+ break;
4752
+ }
4753
+ }
4754
+ }
4755
+ }
4756
+ }
4757
+ return deduped;
4758
+ }
4759
/**
 * Splits a text item of space-separated single characters into one entry per
 * character, spreading them evenly across the item's width.
 *
 * The text must consist of at least three single Hangul syllables or digits
 * separated by single spaces. The split is rejected when the per-character
 * pitch (itemW / character count) exceeds twice the font size.
 *
 * @param {string} text Item text.
 * @param {number} itemX Left x of the item.
 * @param {number} itemW Width of the item.
 * @param {number} fontSize Font size of the item.
 * @returns {Array<{text: string, x: number, w: number}>|null} Per-character
 *   entries, or null when the item should not be split.
 */
function splitEvenSpacedItem(text, itemX, itemW, fontSize) {
  const isSpacedOut = /^[가-힣\d](?: [가-힣\d]){2,}$/.test(text);
  if (!isSpacedOut) return null;

  const glyphs = text.split(" ");
  if (glyphs.length < 3) return null;

  const pitch = itemW / glyphs.length;
  if (pitch > fontSize * 2) return null;

  // The actual glyph is narrower than the pitch between glyphs.
  const glyphWidth = Math.round(pitch * 0.8);
  const pieces = [];
  for (let k = 0; k < glyphs.length; k++) {
    pieces.push({ text: glyphs[k], x: Math.round(itemX + k * pitch), w: glyphWidth });
  }
  return pieces;
}
4107
4772
  function groupByY(items) {
4108
4773
  if (items.length === 0) return [];
@@ -4127,14 +4792,14 @@ function isProseSpread(items) {
4127
4792
  for (let i = 1; i < sorted.length; i++) {
4128
4793
  gaps.push(sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w));
4129
4794
  }
4130
- const maxGap = Math.max(...gaps);
4795
+ const maxGap = safeMax(gaps);
4131
4796
  const avgLen = items.reduce((s, i) => s + i.text.length, 0) / items.length;
4132
4797
  return maxGap < 40 && avgLen < 5;
4133
4798
  }
4134
4799
  function detectColumns(yLines) {
4135
4800
  const allItems = yLines.flat();
4136
4801
  if (allItems.length === 0) return null;
4137
- const pageWidth = Math.max(...allItems.map((i) => i.x + i.w)) - Math.min(...allItems.map((i) => i.x));
4802
+ const pageWidth = safeMax(allItems.map((i) => i.x + i.w)) - safeMin(allItems.map((i) => i.x));
4138
4803
  if (pageWidth < 100) return null;
4139
4804
  let bigoLineIdx = -1;
4140
4805
  for (let i = 0; i < yLines.length; i++) {
@@ -4166,7 +4831,7 @@ function detectColumns(yLines) {
4166
4831
  }
4167
4832
  const peaks = xClusters.filter((c) => c.count >= 3).sort((a, b) => a.minX - b.minX);
4168
4833
  if (peaks.length < 3) return null;
4169
- const MERGE_TOL = 30;
4834
+ const MERGE_TOL = 40;
4170
4835
  const merged = [peaks[0]];
4171
4836
  for (let i = 1; i < peaks.length; i++) {
4172
4837
  const prev = merged[merged.length - 1];
@@ -4180,7 +4845,14 @@ function detectColumns(yLines) {
4180
4845
  merged.push({ ...peaks[i] });
4181
4846
  }
4182
4847
  }
4183
- const columns = merged.filter((c) => c.count >= 3).map((c) => c.minX);
4848
+ const rawColumns = merged.filter((c) => c.count >= 3).map((c) => c.minX);
4849
+ if (rawColumns.length < 3) return null;
4850
+ const MIN_DETECT_COL_WIDTH = 30;
4851
+ const columns = [rawColumns[0]];
4852
+ for (let i = 1; i < rawColumns.length; i++) {
4853
+ if (rawColumns[i] - columns[columns.length - 1] < MIN_DETECT_COL_WIDTH) continue;
4854
+ columns.push(rawColumns[i]);
4855
+ }
4184
4856
  return columns.length >= 3 ? columns : null;
4185
4857
  }
4186
4858
  function findColumn(x, columns) {
@@ -4308,6 +4980,16 @@ function buildGridTable(lines, columns) {
4308
4980
  }
4309
4981
  merged.splice(0, headerEnd, headerRow);
4310
4982
  }
4983
+ for (const row of merged) {
4984
+ for (let c = 0; c < row.length; c++) {
4985
+ if (row[c]) row[c] = collapseEvenSpacing(row[c]);
4986
+ }
4987
+ }
4988
+ const totalCells = merged.length * numCols;
4989
+ const filledCells = merged.reduce((s, row) => s + row.filter((c) => c).length, 0);
4990
+ if (filledCells < totalCells * 0.35 || merged.length < 2 || merged.length <= 3 && numCols >= 7) {
4991
+ return merged.map((r) => r.filter((c) => c).join(" ")).join("\n");
4992
+ }
4311
4993
  const md = [];
4312
4994
  md.push("| " + merged[0].join(" | ") + " |");
4313
4995
  md.push("| " + merged[0].map(() => "---").join(" | ") + " |");
@@ -4319,12 +5001,32 @@ function buildGridTable(lines, columns) {
4319
5001
  function mergeLineSimple(items) {
4320
5002
  if (items.length <= 1) return items[0]?.text || "";
4321
5003
  const sorted = [...items].sort((a, b) => a.x - b.x);
5004
+ const isEvenSpaced = detectEvenSpacedItems(sorted);
4322
5005
  let result = sorted[0].text;
4323
5006
  for (let i = 1; i < sorted.length; i++) {
4324
5007
  const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
4325
5008
  const avgFs = (sorted[i].fontSize + sorted[i - 1].fontSize) / 2;
4326
- if (gap > 15) result += " ";
4327
- else if (gap < avgFs * 0.15) {
5009
+ const tabThreshold = Math.max(avgFs * 2, 30);
5010
+ if (gap > tabThreshold) {
5011
+ result += " ";
5012
+ result += sorted[i].text;
5013
+ continue;
5014
+ }
5015
+ if (isEvenSpaced[i]) {
5016
+ result += sorted[i].text;
5017
+ continue;
5018
+ }
5019
+ if (sorted[i].hasSpaceBefore && gap >= avgFs * 0.05) {
5020
+ result += " ";
5021
+ result += sorted[i].text;
5022
+ continue;
5023
+ }
5024
+ if (/[□■○●▶◆◇ㅇ]$/.test(sorted[i - 1].text) && /^[가-힣]/.test(sorted[i].text) && gap > 1) {
5025
+ result += " ";
5026
+ result += sorted[i].text;
5027
+ continue;
5028
+ }
5029
+ if (gap < avgFs * 0.15) {
4328
5030
  } else if (gap < avgFs * 0.35 && (/[가-힣]$/.test(result) || /^[가-힣]/.test(sorted[i].text))) {
4329
5031
  } else if (gap > 3) result += " ";
4330
5032
  result += sorted[i].text;
@@ -4333,8 +5035,8 @@ function mergeLineSimple(items) {
4333
5035
  }
4334
5036
  function cleanPdfText(text) {
4335
5037
  return mergeKoreanLines(
4336
- text.replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "")
4337
- ).replace(/^(?!\|).{3,30}$/gm, (line) => collapseEvenSpacing(line)).replace(/\n{3,}/g, "\n\n").trim();
5038
+ text.replace(/^\d{1,4}\n/, "").replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "").replace(/^#{1,6}\s*\d{1,4}\s*$/gm, "")
5039
+ ).replace(/^(?!\| ---).*$/gm, (line) => collapseEvenSpacing(line)).replace(/([□■◆○●▶ㅇ])\s+([가-힣])\s+([가-힣])/g, "$1 $2$3").replace(/\n{3,}/g, "\n\n").trim();
4338
5040
  }
4339
5041
  function startsWithMarker(line) {
4340
5042
  const t = line.trimStart();
@@ -4526,7 +5228,7 @@ function mergeKoreanLines(text) {
4526
5228
  result[result.length - 1] = prev + " " + currTrimmed;
4527
5229
  continue;
4528
5230
  }
4529
- if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev)) {
5231
+ if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev) && !startsWithMarker(prev)) {
4530
5232
  result[result.length - 1] = prev + " " + curr;
4531
5233
  } else {
4532
5234
  result.push(curr);
@@ -4574,7 +5276,7 @@ function getTextContent(el) {
4574
5276
  return el.textContent?.trim() ?? "";
4575
5277
  }
4576
5278
  function parseXml(text) {
4577
- return new DOMParser2().parseFromString(text, "text/xml");
5279
+ return new DOMParser2().parseFromString(stripDtd(text), "text/xml");
4578
5280
  }
4579
5281
  function parseSharedStrings(xml) {
4580
5282
  const doc = parseXml(xml);
@@ -4861,7 +5563,7 @@ function getAttr(el, localName) {
4861
5563
  return null;
4862
5564
  }
4863
5565
  function parseXml2(text) {
4864
- return new DOMParser3().parseFromString(text, "text/xml");
5566
+ return new DOMParser3().parseFromString(stripDtd(text), "text/xml");
4865
5567
  }
4866
5568
  function parseStyles(xml) {
4867
5569
  const doc = parseXml2(xml);
@@ -5261,7 +5963,13 @@ function normalize(s) {
5261
5963
  }
5262
5964
  var MAX_LEVENSHTEIN_LEN = 1e4;
5263
5965
  function levenshtein(a, b) {
5264
- if (a.length + b.length > MAX_LEVENSHTEIN_LEN) return Math.abs(a.length - b.length);
5966
+ if (a.length + b.length > MAX_LEVENSHTEIN_LEN) {
5967
+ const sampleLen = Math.min(500, a.length, b.length);
5968
+ let diffs = 0;
5969
+ for (let i = 0; i < sampleLen; i++) if (a[i] !== b[i]) diffs++;
5970
+ const sampleRate = sampleLen > 0 ? diffs / sampleLen : 1;
5971
+ return Math.abs(a.length - b.length) + Math.round(Math.min(a.length, b.length) * sampleRate);
5972
+ }
5265
5973
  if (a.length > b.length) [a, b] = [b, a];
5266
5974
  const m = a.length;
5267
5975
  const n = b.length;
@@ -5544,13 +6252,20 @@ function extractInlineFields(text) {
5544
6252
 
5545
6253
  // src/hwpx/generator.ts
5546
6254
  import JSZip5 from "jszip";
5547
- var HWPML_NS = "http://www.hancom.co.kr/hwpml/2016/HwpMl";
6255
+ var NS_SECTION = "http://www.hancom.co.kr/hwpml/2011/section";
6256
+ var NS_PARA = "http://www.hancom.co.kr/hwpml/2011/paragraph";
6257
+ var NS_HEAD = "http://www.hancom.co.kr/hwpml/2011/head";
6258
+ var NS_OPF = "http://www.idpf.org/2007/opf/";
6259
+ var NS_HPF = "http://www.hancom.co.kr/schema/2011/hpf";
6260
+ var NS_OCF = "urn:oasis:names:tc:opendocument:xmlns:container";
5548
6261
  async function markdownToHwpx(markdown) {
5549
6262
  const blocks = parseMarkdownToBlocks(markdown);
5550
6263
  const sectionXml = blocksToSectionXml(blocks);
5551
6264
  const zip = new JSZip5();
5552
6265
  zip.file("mimetype", "application/hwp+zip", { compression: "STORE" });
6266
+ zip.file("META-INF/container.xml", generateContainerXml());
5553
6267
  zip.file("Contents/content.hpf", generateManifest());
6268
+ zip.file("Contents/header.xml", generateHeaderXml());
5554
6269
  zip.file("Contents/section0.xml", sectionXml);
5555
6270
  return await zip.generateAsync({ type: "arraybuffer" });
5556
6271
  }
@@ -5595,8 +6310,111 @@ function parseMarkdownToBlocks(md) {
5595
6310
  function escapeXml(text) {
5596
6311
  return text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;");
5597
6312
  }
6313
/**
 * Builds the content of `META-INF/container.xml`, which declares
 * `Contents/content.hpf` as the package root file.
 *
 * @returns {string} XML document text.
 */
function generateContainerXml() {
  const rootfile =
    '<ocf:rootfile full-path="Contents/content.hpf" media-type="application/hwpml-package+xml"/>';
  const lines = [
    '<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>',
    `<ocf:container xmlns:ocf="${NS_OCF}" xmlns:hpf="${NS_HPF}">`,
    "<ocf:rootfiles>",
    rootfile,
    "</ocf:rootfiles>",
    "</ocf:container>"
  ];
  return lines.join("\n");
}
6321
/**
 * Builds the content of `Contents/content.hpf`: an OPF package listing the
 * header and the single section in both the manifest and the spine.
 *
 * @returns {string} XML document text.
 */
function generateManifest() {
  // Each part is stored as Contents/<id>.xml; only the section is linear.
  const parts = [
    { id: "header", linear: "no" },
    { id: "section0", linear: "yes" }
  ];
  const manifestItems = parts.map(
    ({ id }) => `<opf:item id="${id}" href="Contents/${id}.xml" media-type="application/xml"/>`
  );
  const spineRefs = parts.map(
    ({ id, linear }) => `<opf:itemref idref="${id}" linear="${linear}"/>`
  );
  return [
    '<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>',
    `<opf:package xmlns:opf="${NS_OPF}" xmlns:hpf="${NS_HPF}" xmlns:hh="${NS_HEAD}">`,
    "<opf:manifest>",
    ...manifestItems,
    "</opf:manifest>",
    "<opf:spine>",
    ...spineRefs,
    "</opf:spine>",
    "</opf:package>"
  ].join("\n");
}
6334
/**
 * Builds the content of `Contents/header.xml`: seven font faces (one per
 * script), one border fill, one character property set, one paragraph
 * property set and one paragraph style, all with id 0.
 *
 * @returns {string} XML document text (80 lines joined by "\n").
 */
function generateHeaderXml() {
  // typeInfo attribute sets shared between font faces.
  const GOTHIC_P4 =
    'familyType="FCAT_GOTHIC" weight="6" proportion="4" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"';
  const GOTHIC_P0 =
    'familyType="FCAT_GOTHIC" weight="6" proportion="0" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"';
  const OLDSTYLE =
    'familyType="FCAT_OLDSTYLE" weight="5" proportion="4" contrast="2" strokeVariation="0" armStyle="0" letterform="0" midline="0" xHeight="4"';

  // [lang, font face name, typeInfo attributes], in output order.
  const fontTable = [
    ["HANGUL", "\uD568\uCD08\uB86C\uBC14\uD0D5", GOTHIC_P4],
    ["LATIN", "Times New Roman", OLDSTYLE],
    ["HANJA", "\uD568\uCD08\uB86C\uBC14\uD0D5", GOTHIC_P4],
    ["JAPANESE", "\uAD74\uB9BC", GOTHIC_P0],
    ["OTHER", "\uAD74\uB9BC", GOTHIC_P0],
    ["SYMBOL", "Symbol", GOTHIC_P0],
    ["USER", "\uAD74\uB9BC", GOTHIC_P0]
  ];
  const fontfaceLines = fontTable.flatMap(([lang, face, typeInfo]) => [
    `<hh:fontface lang="${lang}" fontCnt="1">`,
    `<hh:font id="0" face="${face}" type="TTF" isEmbedded="0">`,
    `<hh:typeInfo ${typeInfo}/>`,
    "</hh:font>",
    "</hh:fontface>"
  ]);

  // All five border elements share the same attribute values.
  const borderLines = ["leftBorder", "rightBorder", "topBorder", "bottomBorder", "diagonal"].map(
    (side) => `<hh:${side} type="NONE" width="0.1mm" color="0"/>`
  );

  // Elements that repeat one value for each of the seven scripts.
  const scripts = ["hangul", "latin", "hanja", "japanese", "other", "symbol", "user"];
  const perScript = (tag, value) =>
    `<hh:${tag} ${scripts.map((s) => `${s}="${value}"`).join(" ")}/>`;

  const lines = [
    '<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>',
    `<hh:head xmlns:hh="${NS_HEAD}" xmlns:hp="${NS_PARA}" version="1.4" secCnt="1">`,
    '<hh:beginNum page="1" footnote="1" endnote="1" pic="1" tbl="1" equation="1"/>',
    "<hh:refList>",
    '<hh:fontfaces itemCnt="7">',
    ...fontfaceLines,
    "</hh:fontfaces>",
    '<hh:borderFills itemCnt="1">',
    '<hh:borderFill id="0" threeD="0" shadow="0" centerLine="0" breakCellSeparateLine="0">',
    '<hh:slash type="NONE" Crooked="0" isCounter="0"/>',
    '<hh:backSlash type="NONE" Crooked="0" isCounter="0"/>',
    ...borderLines,
    "<hh:fillInfo/>",
    "</hh:borderFill>",
    "</hh:borderFills>",
    '<hh:charProperties itemCnt="1">',
    '<hh:charPr id="0" height="1000" textColor="0" shadeColor="-1" useFontSpace="0" useKerning="0" symMark="NONE" borderFillIDRef="0">',
    perScript("fontRef", "0"),
    perScript("ratio", "100"),
    perScript("spacing", "0"),
    perScript("relSz", "100"),
    perScript("offset", "0"),
    "</hh:charPr>",
    "</hh:charProperties>",
    '<hh:tabProperties itemCnt="0"/>',
    '<hh:numberings itemCnt="0"/>',
    '<hh:bullets itemCnt="0"/>',
    '<hh:paraProperties itemCnt="1">',
    '<hh:paraPr id="0" tabIDRef="0" condense="0" fontLineHeight="0" snapToGrid="0" suppressOverlap="0" checked="0">',
    '<hh:parLineBreak lineBreak="BREAK_LINE" wordBreak="BREAK_WORD" breakLatinWord="BREAK_WORD" breakNonLatinWord="BREAK_WORD"/>',
    '<hh:parMargin left="0" right="0" prev="0" next="0" indent="0"/>',
    '<hh:parBorder borderFillIDRef="0" offsetLeft="0" offsetRight="0" offsetTop="0" offsetBottom="0" connect="0" ignoreMargin="0"/>',
    '<hh:parShade borderFillIDRef="0"/>',
    "<hh:parTabList/>",
    "</hh:paraPr>",
    "</hh:paraProperties>",
    '<hh:styles itemCnt="1">',
    '<hh:style id="0" type="PARA" name="\uBC14\uD0D5\uAE00" engName="Normal" paraPrIDRef="0" charPrIDRef="0" nextStyleIDRef="0" langIDRef="1042" lockForm="0"/>',
    "</hh:styles>",
    "</hh:refList>",
    '<hh:compatibleDocument targetProgram="HWP2018"/>',
    "</hh:head>"
  ];
  return lines.join("\n");
}
5598
6416
  function generateParagraph(text) {
5599
- return `<hp:p><hp:run><hp:t>${escapeXml(text)}</hp:t></hp:run></hp:p>`;
6417
+ return `<hp:p paraPrIDRef="0" styleIDRef="0"><hp:run charPrIDRef="0"><hp:t>${escapeXml(text)}</hp:t></hp:run></hp:p>`;
5600
6418
  }
5601
6419
  function generateTable(rows) {
5602
6420
  const trElements = rows.map((row) => {
@@ -5620,22 +6438,11 @@ function blocksToSectionXml(blocks) {
5620
6438
  return "";
5621
6439
  }
5622
6440
  }).join("\n ");
5623
- return `<?xml version="1.0" encoding="UTF-8"?>
5624
- <hs:sec xmlns:hs="${HWPML_NS}" xmlns:hp="${HWPML_NS}">
6441
+ return `<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
6442
+ <hs:sec xmlns:hs="${NS_SECTION}" xmlns:hp="${NS_PARA}">
5625
6443
  ${body}
5626
6444
  </hs:sec>`;
5627
6445
  }
5628
- function generateManifest() {
5629
- return `<?xml version="1.0" encoding="UTF-8"?>
5630
- <opf:package xmlns:opf="http://www.idpf.org/2007/opf">
5631
- <opf:manifest>
5632
- <opf:item id="s0" href="section0.xml" media-type="application/xml"/>
5633
- </opf:manifest>
5634
- <opf:spine>
5635
- <opf:itemref idref="s0"/>
5636
- </opf:spine>
5637
- </opf:package>`;
5638
- }
5639
6446
 
5640
6447
  // src/index.ts
5641
6448
  async function parse(input, options) {