kordoc 1.6.0 → 1.6.1

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -14,13 +14,21 @@
14
14
 
15
15
  ---
16
16
 
17
- ## What's New in v1.6.0
17
+ ## What's New in v1.6.1
18
+
19
+ - **HWP5 Table Cell Offset Fix** — Fixed critical 2-byte offset misalignment in LIST_HEADER parsing. Row address was incorrectly read as colSpan, causing 3-column tables to explode into 6+ columns with misaligned content. Tables now use colAddr/rowAddr-based direct placement for accurate cell positioning.
20
+ - **HWP5 TAB Control Character Fix** — TAB (0x0009) inline control's 14-byte extension data was not skipped, producing garbage characters (`࣐Ā`) after every tab in the output. Fixed by adding the required 14-byte skip.
21
+
22
+ <details>
23
+ <summary>v1.6.0 features</summary>
18
24
 
19
25
  - **Cluster-Based Table Detection (PDF)** — Detects borderless tables by analyzing text alignment patterns. Baseline grouping + X-coordinate clustering identifies 2+ column tables that line-based detection misses. Sort-and-split clustering for order-independent results.
20
26
  - **Korean Special Table Detection** — Automatically detects `구분/항목/종류`-style key-value patterns common in Korean government documents and converts them to structured 2-column tables.
21
27
  - **Korean Word-Break Recovery** — Improved merging of broken Korean words in PDF table cells. Handles character-level PDF rendering (micro-gaps between Hangul characters) and cell line-break artifacts up to 8 characters.
22
28
  - **Empty Table Filtering** — Tables with all-empty cells (from line detection of decorative borders) are now automatically removed.
23
29
 
30
+ </details>
31
+
24
32
  <details>
25
33
  <summary>v1.5.0 features</summary>
26
34
 
@@ -215,7 +223,7 @@ import type {
215
223
  | Format | Engine | Features |
216
224
  |--------|--------|----------|
217
225
  | **HWPX** (한컴 2020+) | ZIP + XML DOM | Manifest, nested tables, merged cells, broken ZIP recovery |
218
- | **HWP 5.x** (한컴 Legacy) | OLE2 + CFB | 21 control chars, zlib decompression, DRM detection |
226
+ | **HWP 5.x** (한컴 Legacy) | OLE2 + CFB | 21 control chars, zlib decompression, DRM detection, colAddr-based table cell placement |
219
227
  | **PDF** | pdfjs-dist | Line-based table detection, XY-Cut reading order, heading detection, hidden text filter, OCR |
220
228
 
221
229
  ## Security
@@ -185,7 +185,7 @@ function tableToMarkdown(table) {
185
185
  }
186
186
 
187
187
  // src/utils.ts
188
- var VERSION = true ? "1.6.0" : "0.0.0-dev";
188
+ var VERSION = true ? "1.6.1" : "0.0.0-dev";
189
189
  function toArrayBuffer(buf) {
190
190
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
191
191
  return buf.buffer;
@@ -923,6 +923,7 @@ function extractText(data) {
923
923
  break;
924
924
  case CHAR_TAB:
925
925
  result += " ";
926
+ if (i + 14 <= data.length) i += 14;
926
927
  break;
927
928
  case CHAR_HYPHEN:
928
929
  result += "-";
@@ -1237,9 +1238,13 @@ function parseCellBlock(records, startIdx, tableLevel) {
1237
1238
  const texts = [];
1238
1239
  let colSpan = 1;
1239
1240
  let rowSpan = 1;
1240
- if (rec.data.length >= 14) {
1241
- const cs = rec.data.readUInt16LE(10);
1242
- const rs = rec.data.readUInt16LE(12);
1241
+ let colAddr;
1242
+ let rowAddr;
1243
+ if (rec.data.length >= 16) {
1244
+ colAddr = rec.data.readUInt16LE(8);
1245
+ rowAddr = rec.data.readUInt16LE(10);
1246
+ const cs = rec.data.readUInt16LE(12);
1247
+ const rs = rec.data.readUInt16LE(14);
1243
1248
  if (cs > 0) colSpan = Math.min(cs, MAX_COLS);
1244
1249
  if (rs > 0) rowSpan = Math.min(rs, MAX_ROWS);
1245
1250
  }
@@ -1254,15 +1259,16 @@ function parseCellBlock(records, startIdx, tableLevel) {
1254
1259
  }
1255
1260
  i++;
1256
1261
  }
1257
- return { cell: { text: texts.join("\n"), colSpan, rowSpan }, nextIdx: i };
1262
+ return { cell: { text: texts.join("\n"), colSpan, rowSpan, colAddr, rowAddr }, nextIdx: i };
1258
1263
  }
1259
1264
  function arrangeCells(rows, cols, cells) {
1260
1265
  const grid = Array.from({ length: rows }, () => Array(cols).fill(null));
1261
- let cellIdx = 0;
1262
- for (let r = 0; r < rows && cellIdx < cells.length; r++) {
1263
- for (let c = 0; c < cols && cellIdx < cells.length; c++) {
1264
- if (grid[r][c] !== null) continue;
1265
- const cell = cells[cellIdx++];
1266
+ const hasAddr = cells.some((c) => c.colAddr !== void 0 && c.rowAddr !== void 0);
1267
+ if (hasAddr) {
1268
+ for (const cell of cells) {
1269
+ const r = cell.rowAddr ?? 0;
1270
+ const c = cell.colAddr ?? 0;
1271
+ if (r >= rows || c >= cols) continue;
1266
1272
  grid[r][c] = cell;
1267
1273
  for (let dr = 0; dr < cell.rowSpan; dr++) {
1268
1274
  for (let dc = 0; dc < cell.colSpan; dc++) {
@@ -1272,6 +1278,22 @@ function arrangeCells(rows, cols, cells) {
1272
1278
  }
1273
1279
  }
1274
1280
  }
1281
+ } else {
1282
+ let cellIdx = 0;
1283
+ for (let r = 0; r < rows && cellIdx < cells.length; r++) {
1284
+ for (let c = 0; c < cols && cellIdx < cells.length; c++) {
1285
+ if (grid[r][c] !== null) continue;
1286
+ const cell = cells[cellIdx++];
1287
+ grid[r][c] = cell;
1288
+ for (let dr = 0; dr < cell.rowSpan; dr++) {
1289
+ for (let dc = 0; dc < cell.colSpan; dc++) {
1290
+ if (dr === 0 && dc === 0) continue;
1291
+ if (r + dr < rows && c + dc < cols)
1292
+ grid[r + dr][c + dc] = { text: "", colSpan: 1, rowSpan: 1 };
1293
+ }
1294
+ }
1295
+ }
1296
+ }
1275
1297
  }
1276
1298
  return grid.map((row) => row.map((c) => c || { text: "", colSpan: 1, rowSpan: 1 }));
1277
1299
  }
@@ -2964,4 +2986,4 @@ export {
2964
2986
  extractFormFields,
2965
2987
  parse
2966
2988
  };
2967
- //# sourceMappingURL=chunk-TFGOV2ML.js.map
2989
+ //# sourceMappingURL=chunk-DYUB34PO.js.map