kordoc 1.6.0 → 1.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -2
- package/dist/{chunk-TFGOV2ML.js → chunk-DYUB34PO.js} +33 -11
- package/dist/chunk-DYUB34PO.js.map +1 -0
- package/dist/cli.js +2 -2
- package/dist/index.cjs +32 -10
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +4 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.js +32 -10
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +1 -1
- package/dist/{watch-WMRLOFYY.js → watch-3QVNEAVM.js} +2 -2
- package/package.json +1 -1
- package/dist/chunk-TFGOV2ML.js.map +0 -1
- package/dist/{watch-WMRLOFYY.js.map → watch-3QVNEAVM.js.map} +0 -0
package/README.md
CHANGED
|
@@ -14,13 +14,21 @@
|
|
|
14
14
|
|
|
15
15
|
---
|
|
16
16
|
|
|
17
|
-
## What's New in v1.6.0
|
|
17
|
+
## What's New in v1.6.1
|
|
18
|
+
|
|
19
|
+
- **HWP5 Table Cell Offset Fix** — Fixed critical 2-byte offset misalignment in LIST_HEADER parsing. Row address was incorrectly read as colSpan, causing 3-column tables to explode into 6+ columns with misaligned content. Tables now use colAddr/rowAddr-based direct placement for accurate cell positioning.
|
|
20
|
+
- **HWP5 TAB Control Character Fix** — TAB (0x0009) inline control's 14-byte extension data was not skipped, producing garbage characters (`࣐Ā`) after every tab in the output. Fixed by adding the required 14-byte skip.
|
|
21
|
+
|
|
22
|
+
<details>
|
|
23
|
+
<summary>v1.6.0 features</summary>
|
|
18
24
|
|
|
19
25
|
- **Cluster-Based Table Detection (PDF)** — Detects borderless tables by analyzing text alignment patterns. Baseline grouping + X-coordinate clustering identifies 2+ column tables that line-based detection misses. Sort-and-split clustering for order-independent results.
|
|
20
26
|
- **Korean Special Table Detection** — Automatically detects `구분/항목/종류`-style key-value patterns common in Korean government documents and converts them to structured 2-column tables.
|
|
21
27
|
- **Korean Word-Break Recovery** — Improved merging of broken Korean words in PDF table cells. Handles character-level PDF rendering (micro-gaps between Hangul characters) and cell line-break artifacts up to 8 characters.
|
|
22
28
|
- **Empty Table Filtering** — Tables with all-empty cells (from line detection of decorative borders) are now automatically removed.
|
|
23
29
|
|
|
30
|
+
</details>
|
|
31
|
+
|
|
24
32
|
<details>
|
|
25
33
|
<summary>v1.5.0 features</summary>
|
|
26
34
|
|
|
@@ -215,7 +223,7 @@ import type {
|
|
|
215
223
|
| Format | Engine | Features |
|
|
216
224
|
|--------|--------|----------|
|
|
217
225
|
| **HWPX** (한컴 2020+) | ZIP + XML DOM | Manifest, nested tables, merged cells, broken ZIP recovery |
|
|
218
|
-
| **HWP 5.x** (한컴 Legacy) | OLE2 + CFB | 21 control chars, zlib decompression, DRM detection |
|
|
226
|
+
| **HWP 5.x** (한컴 Legacy) | OLE2 + CFB | 21 control chars, zlib decompression, DRM detection, colAddr-based table cell placement |
|
|
219
227
|
| **PDF** | pdfjs-dist | Line-based table detection, XY-Cut reading order, heading detection, hidden text filter, OCR |
|
|
220
228
|
|
|
221
229
|
## Security
|
|
@@ -185,7 +185,7 @@ function tableToMarkdown(table) {
|
|
|
185
185
|
}
|
|
186
186
|
|
|
187
187
|
// src/utils.ts
|
|
188
|
-
var VERSION = true ? "1.6.0" : "0.0.0-dev";
|
|
188
|
+
var VERSION = true ? "1.6.1" : "0.0.0-dev";
|
|
189
189
|
function toArrayBuffer(buf) {
|
|
190
190
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
191
191
|
return buf.buffer;
|
|
@@ -923,6 +923,7 @@ function extractText(data) {
|
|
|
923
923
|
break;
|
|
924
924
|
case CHAR_TAB:
|
|
925
925
|
result += " ";
|
|
926
|
+
if (i + 14 <= data.length) i += 14;
|
|
926
927
|
break;
|
|
927
928
|
case CHAR_HYPHEN:
|
|
928
929
|
result += "-";
|
|
@@ -1237,9 +1238,13 @@ function parseCellBlock(records, startIdx, tableLevel) {
|
|
|
1237
1238
|
const texts = [];
|
|
1238
1239
|
let colSpan = 1;
|
|
1239
1240
|
let rowSpan = 1;
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1241
|
+
let colAddr;
|
|
1242
|
+
let rowAddr;
|
|
1243
|
+
if (rec.data.length >= 16) {
|
|
1244
|
+
colAddr = rec.data.readUInt16LE(8);
|
|
1245
|
+
rowAddr = rec.data.readUInt16LE(10);
|
|
1246
|
+
const cs = rec.data.readUInt16LE(12);
|
|
1247
|
+
const rs = rec.data.readUInt16LE(14);
|
|
1243
1248
|
if (cs > 0) colSpan = Math.min(cs, MAX_COLS);
|
|
1244
1249
|
if (rs > 0) rowSpan = Math.min(rs, MAX_ROWS);
|
|
1245
1250
|
}
|
|
@@ -1254,15 +1259,16 @@ function parseCellBlock(records, startIdx, tableLevel) {
|
|
|
1254
1259
|
}
|
|
1255
1260
|
i++;
|
|
1256
1261
|
}
|
|
1257
|
-
return { cell: { text: texts.join("\n"), colSpan, rowSpan }, nextIdx: i };
|
|
1262
|
+
return { cell: { text: texts.join("\n"), colSpan, rowSpan, colAddr, rowAddr }, nextIdx: i };
|
|
1258
1263
|
}
|
|
1259
1264
|
function arrangeCells(rows, cols, cells) {
|
|
1260
1265
|
const grid = Array.from({ length: rows }, () => Array(cols).fill(null));
|
|
1261
|
-
|
|
1262
|
-
|
|
1263
|
-
for (
|
|
1264
|
-
|
|
1265
|
-
const
|
|
1266
|
+
const hasAddr = cells.some((c) => c.colAddr !== void 0 && c.rowAddr !== void 0);
|
|
1267
|
+
if (hasAddr) {
|
|
1268
|
+
for (const cell of cells) {
|
|
1269
|
+
const r = cell.rowAddr ?? 0;
|
|
1270
|
+
const c = cell.colAddr ?? 0;
|
|
1271
|
+
if (r >= rows || c >= cols) continue;
|
|
1266
1272
|
grid[r][c] = cell;
|
|
1267
1273
|
for (let dr = 0; dr < cell.rowSpan; dr++) {
|
|
1268
1274
|
for (let dc = 0; dc < cell.colSpan; dc++) {
|
|
@@ -1272,6 +1278,22 @@ function arrangeCells(rows, cols, cells) {
|
|
|
1272
1278
|
}
|
|
1273
1279
|
}
|
|
1274
1280
|
}
|
|
1281
|
+
} else {
|
|
1282
|
+
let cellIdx = 0;
|
|
1283
|
+
for (let r = 0; r < rows && cellIdx < cells.length; r++) {
|
|
1284
|
+
for (let c = 0; c < cols && cellIdx < cells.length; c++) {
|
|
1285
|
+
if (grid[r][c] !== null) continue;
|
|
1286
|
+
const cell = cells[cellIdx++];
|
|
1287
|
+
grid[r][c] = cell;
|
|
1288
|
+
for (let dr = 0; dr < cell.rowSpan; dr++) {
|
|
1289
|
+
for (let dc = 0; dc < cell.colSpan; dc++) {
|
|
1290
|
+
if (dr === 0 && dc === 0) continue;
|
|
1291
|
+
if (r + dr < rows && c + dc < cols)
|
|
1292
|
+
grid[r + dr][c + dc] = { text: "", colSpan: 1, rowSpan: 1 };
|
|
1293
|
+
}
|
|
1294
|
+
}
|
|
1295
|
+
}
|
|
1296
|
+
}
|
|
1275
1297
|
}
|
|
1276
1298
|
return grid.map((row) => row.map((c) => c || { text: "", colSpan: 1, rowSpan: 1 }));
|
|
1277
1299
|
}
|
|
@@ -2964,4 +2986,4 @@ export {
|
|
|
2964
2986
|
extractFormFields,
|
|
2965
2987
|
parse
|
|
2966
2988
|
};
|
|
2967
|
-
//# sourceMappingURL=chunk-TFGOV2ML.js.map
|
|
2989
|
+
//# sourceMappingURL=chunk-DYUB34PO.js.map
|