markit-ai 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/convert.d.ts +1 -0
- package/dist/commands/convert.js +7 -3
- package/dist/commands/formats.js +5 -0
- package/dist/converters/docx.d.ts +1 -1
- package/dist/converters/docx.js +35 -3
- package/dist/converters/epub.js +1 -0
- package/dist/converters/github.d.ts +18 -0
- package/dist/converters/github.js +148 -0
- package/dist/converters/pdf/columns.d.ts +35 -0
- package/dist/converters/pdf/columns.js +93 -0
- package/dist/converters/pdf/extract.d.ts +19 -0
- package/dist/converters/pdf/extract.js +513 -0
- package/dist/converters/pdf/grid.d.ts +25 -0
- package/dist/converters/pdf/grid.js +654 -0
- package/dist/converters/pdf/headers.d.ts +24 -0
- package/dist/converters/pdf/headers.js +108 -0
- package/dist/converters/pdf/index.d.ts +19 -0
- package/dist/converters/pdf/index.js +116 -0
- package/dist/converters/pdf/render.d.ts +24 -0
- package/dist/converters/pdf/render.js +513 -0
- package/dist/converters/pdf/types.d.ts +75 -0
- package/dist/converters/pdf/types.js +1 -0
- package/dist/converters/pptx.d.ts +1 -1
- package/dist/converters/pptx.js +74 -1
- package/dist/converters/xlsx.js +1 -0
- package/dist/index.d.ts +2 -1
- package/dist/index.js +2 -1
- package/dist/main.js +4 -1
- package/dist/markit.d.ts +1 -1
- package/dist/markit.js +19 -4
- package/dist/types.d.ts +8 -0
- package/package.json +3 -3
- package/dist/converters/pdf.d.ts +0 -6
- package/dist/converters/pdf.js +0 -29
|
@@ -0,0 +1,513 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Markdown rendering for PDF pages.
|
|
3
|
+
*
|
|
4
|
+
* Converts table grids and free text boxes into markdown, handling:
|
|
5
|
+
* - Table grid → markdown table (`| col | col |`)
|
|
6
|
+
* - Free text → paragraphs with heading detection (by font size)
|
|
7
|
+
* - Content ordering (top-to-bottom via Y coordinate)
|
|
8
|
+
* - Paragraph wrap merging (lines broken across PDF line boundaries)
|
|
9
|
+
* - Page number removal
|
|
10
|
+
*
|
|
11
|
+
* Ported from @oharato/pdf2md-ts, stripped of CJK/TDnet-specific logic.
|
|
12
|
+
*/
|
|
13
|
+
// ---------------------------------------------------------------------------
|
|
14
|
+
// Utility
|
|
15
|
+
// ---------------------------------------------------------------------------
|
|
16
|
+
/**
 * Convert full-width ASCII variants (U+FF01–U+FF5E, e.g. U+FF21 → "A",
 * U+FF01 → "!") to their plain ASCII counterparts.
 *
 * Every other character — including ordinary ASCII — is returned unchanged.
 *
 * @param text Input string.
 * @returns The string with full-width forms folded to ASCII.
 */
function normalizeFullWidthAscii(text) {
    // The range is spelled with escapes so it cannot be confused with (or be
    // silently normalized into) the plain ASCII range "!" to "~". Each
    // full-width form sits exactly 0xFEE0 above its ASCII counterpart.
    return text.replace(/[\uFF01-\uFF5E]/g, (ch) => String.fromCharCode(ch.charCodeAt(0) - 0xfee0));
}
|
|
20
|
+
/**
 * Make text safe to embed in a markdown table cell.
 *
 * Full-width ASCII is normalized first (so a full-width pipe is escaped too),
 * then every pipe is backslash-escaped and every newline becomes `<br>`.
 */
function escapePipes(text) {
    const ascii = normalizeFullWidthAscii(text);
    const pipesEscaped = ascii.split("|").join("\\|");
    return pipesEscaped.split("\n").join("<br>");
}
|
|
25
|
+
/**
 * Parse a markdown pipe-delimited row into trimmed cell strings.
 *
 * The row must start and end with a pipe (after trimming); otherwise an empty
 * array is returned. Pipes preceded by a backslash are cell content (that is
 * how `escapePipes` writes a literal pipe) and do not split the row; the
 * escape sequence is kept verbatim in the returned cell.
 *
 * @param line One line of markdown table text.
 * @returns The cell texts, or `[]` when the line is not a pipe row.
 */
function parsePipeRow(line) {
    const trimmed = line.trim();
    if (!trimmed.startsWith("|") || !trimmed.endsWith("|"))
        return [];
    return trimmed
        .slice(1, -1)
        // Split on unescaped pipes only, so "a \| b" stays a single cell.
        .split(/(?<!\\)\|/)
        .map((cell) => cell.trim());
}
|
|
35
|
+
// ---------------------------------------------------------------------------
|
|
36
|
+
// Table rendering
|
|
37
|
+
// ---------------------------------------------------------------------------
|
|
38
|
+
/**
 * Turn a TableGrid into markdown table text.
 *
 * Cells are written into a rows × cols matrix (cells whose row or column is
 * not below the declared size are ignored), passed through the sparse-column
 * and sub-header normalizers, and printed with the first row as the header.
 *
 * @param table Grid with `rows`, `cols` and `cells` (`row`, `col`, `text`).
 * @returns The markdown table, or "" when the grid has no rows or no columns.
 */
export function renderTableToMarkdown(table) {
    if (table.rows === 0 || table.cols === 0)
        return "";
    const blankRow = () => Array.from({ length: table.cols }, () => "");
    const grid = Array.from({ length: table.rows }, blankRow);
    for (const cell of table.cells) {
        const inRange = cell.row < table.rows && cell.col < table.cols;
        if (!inRange)
            continue;
        grid[cell.row][cell.col] = escapePipes(cell.text.trim());
    }
    const finalRows = promoteSubHeaderPrefixes(normalizeShiftedSparseColumns(grid));
    const toLine = (cells) => `| ${cells.join(" | ")} |`;
    const lines = finalRows.map((row) => toLine(row));
    // The divider goes right after the header row, one `---` per header cell.
    lines.splice(1, 0, toLine(finalRows[0].map(() => "---")));
    return lines.join("\n");
}
|
|
60
|
+
/**
 * Repair wide tables (≥5 columns) in which single-value columns are shifted
 * one position to the left of the dense column they belong to.
 *
 * A column is "dense" when it is column 0 or holds text in at least two rows,
 * and "sparse" when it is an interior column with text in exactly one row.
 * The fix is applied only when there are at least two sparse and four dense
 * columns, and every sparse value can move into an empty cell of a dense
 * column immediately to its right. Columns left without text are removed.
 *
 * @param matrix Table cells as rows of strings.
 * @returns `matrix` itself when no fix applies, otherwise a new matrix.
 */
function normalizeShiftedSparseColumns(matrix) {
    if (matrix.length === 0 || matrix[0].length < 5)
        return matrix;
    const width = matrix[0].length;
    const hasText = (value) => value.trim().length > 0;
    // filled[c] = number of rows with text in column c.
    const filled = new Array(width).fill(0);
    for (const row of matrix) {
        for (let c = 0; c < width; c++) {
            if (hasText(row[c]))
                filled[c] += 1;
        }
    }
    const dense = new Set();
    const sparse = [];
    for (let c = 0; c < width; c++) {
        if (c === 0 || filled[c] >= 2)
            dense.add(c);
        if (c > 0 && c < width - 1 && filled[c] === 1)
            sparse.push(c);
    }
    if (sparse.length < 2 || dense.size < 4)
        return matrix;
    // Plan every move first; one impossible move cancels the whole fix.
    const plan = [];
    for (const from of sparse) {
        const row = matrix.findIndex((r) => hasText(r[from]));
        const to = from + 1;
        const movable = row >= 0 && dense.has(to) && !hasText(matrix[row][to]);
        if (!movable)
            return matrix;
        plan.push([row, from, to]);
    }
    const shifted = matrix.map((row) => row.slice());
    for (const [row, from, to] of plan) {
        // The target cell was verified empty above, so this is a plain move.
        shifted[row][to] = shifted[row][from];
        shifted[row][from] = "";
    }
    const kept = [];
    for (let c = 0; c < width; c++) {
        if (shifted.some((row) => hasText(row[c])))
            kept.push(c);
    }
    return kept.length === width
        ? shifted
        : shifted.map((row) => kept.map((c) => row[c]));
}
|
|
106
|
+
/**
 * Lift parenthesized qualifiers such as "(USD)" out of data rows and append
 * them to the matching header cells.
 *
 * A row qualifies when at least two non-first columns either consist solely
 * of a qualifier, or start with one followed by `<br>` and more text. Rows
 * containing a whole-cell qualifier are only promoted when their first column
 * is empty. Rows left completely empty afterwards are removed.
 *
 * @param matrix Table cells as rows of strings; row 0 is the header.
 * @returns `matrix` itself when it has fewer than two rows, otherwise a copy.
 */
function promoteSubHeaderPrefixes(matrix) {
    if (matrix.length < 2)
        return matrix;
    const QUALIFIER_RE = /^\([^)]{1,40}\)$/;
    const LINE_BREAK = "<br>";
    const width = matrix[0].length;
    const out = matrix.map((row) => row.slice());
    const header = out[0];
    const emptied = new Set();
    out.forEach((row, r) => {
        if (r === 0)
            return;
        // Collect the qualifier candidates of this row.
        const found = [];
        for (let c = 1; c < width; c++) {
            const text = (row[c] ?? "").trim();
            if (text === "")
                continue;
            const segments = text.split(LINE_BREAK);
            if (segments.length === 1) {
                if (QUALIFIER_RE.test(text))
                    found.push({ c, label: text, whole: true });
            }
            else if (QUALIFIER_RE.test(segments[0].trim())) {
                found.push({ c, label: segments[0].trim(), whole: false });
            }
        }
        if (found.length < 2)
            return;
        if (found.some((f) => f.whole) && row[0].trim().length > 0)
            return;
        for (const { c, label, whole } of found) {
            header[c] = header[c].trim() ? `${header[c]} ${label}` : label;
            // Whole-cell qualifiers vanish; prefixed cells keep the rest.
            row[c] = whole ? "" : row[c].split(LINE_BREAK).slice(1).join(LINE_BREAK);
        }
        if (row.every((cell) => cell.trim().length === 0))
            emptied.add(r);
    });
    return out.filter((_, r) => !emptied.has(r));
}
|
|
159
|
+
// ---------------------------------------------------------------------------
|
|
160
|
+
// Free text rendering
|
|
161
|
+
// ---------------------------------------------------------------------------
|
|
162
|
+
/**
 * Y tolerance for grouping text boxes onto the same visual line.
 * Applied to the difference between the boxes' vertical centres, both when
 * sorting boxes and when deciding whether a box still belongs to the current
 * line.
 * NOTE(review): presumably expressed in PDF points — confirm.
 */
|
|
163
|
+
const TEXT_LINE_Y_TOLERANCE = 3;
|
|
164
|
+
/**
 * Minimum X gap between adjacent boxes to mark line as tabular.
 * The gap is measured from the previous box's right edge to the next box's
 * left edge and must be strictly greater than this value.
 */
|
|
165
|
+
const TABULAR_X_GAP = 30;
|
|
166
|
+
/**
 * Smallest font size (pts) that takes part in the body-font vote.
 * Anything below it — diagram labels, footnote markers, superscripts — is
 * left out so it cannot drag the modal size down.
 */
const MIN_BODY_FONT_SIZE = 7;
/**
 * Find the font size used by the largest number of text boxes.
 *
 * Sizes are rounded to one decimal place, and a missing size counts as 0.
 * Sizes under MIN_BODY_FONT_SIZE are ignored. On a tie the size that was
 * encountered first wins.
 *
 * @param textBoxes Boxes carrying an optional numeric `fontSize`.
 * @returns The modal font size, or 0 when no box qualifies.
 */
function modalFontSize(textBoxes) {
    const tally = new Map();
    for (const { fontSize } of textBoxes) {
        const rounded = Math.round((fontSize ?? 0) * 10) / 10;
        const tooSmall = rounded < MIN_BODY_FONT_SIZE;
        if (!tooSmall)
            tally.set(rounded, (tally.get(rounded) ?? 0) + 1);
    }
    let winner = 0;
    let winnerVotes = 0;
    // Map iteration follows insertion order, which settles ties.
    tally.forEach((votes, size) => {
        if (votes > winnerVotes) {
            winnerVotes = votes;
            winner = size;
        }
    });
    return winner;
}
|
|
194
|
+
/**
 * Cluster free text boxes into visual lines, ordered top-to-bottom.
 *
 * Boxes are sorted by vertical centre (higher first) and, within
 * TEXT_LINE_Y_TOLERANCE, by left edge. A box joins the current line while its
 * centre stays within the tolerance of the line's first box. Each line gets
 * the space-joined text, the first box's centre as `topY`, the largest font
 * size, a bold flag if any box is bold, and `isTabular` when two adjacent
 * boxes are more than TABULAR_X_GAP apart.
 *
 * @param textBoxes Boxes with `text`, `bounds`, `fontSize` and `isBold`.
 * @returns One descriptor per line: text, topY, fontSize, isBold, isTabular.
 */
function groupFreeTextIntoLines(textBoxes) {
    if (textBoxes.length === 0)
        return [];
    const centerY = (box) => (box.bounds.top + box.bounds.bottom) / 2;
    const ordered = textBoxes.slice().sort((a, b) => {
        const dy = centerY(b) - centerY(a);
        return Math.abs(dy) > TEXT_LINE_Y_TOLERANCE
            ? dy
            : a.bounds.left - b.bounds.left;
    });
    // Pass 1: split the ordered boxes into rows anchored on their first box.
    const rows = [];
    for (const box of ordered) {
        const y = centerY(box);
        const open = rows[rows.length - 1];
        if (open !== undefined && Math.abs(y - open.y) <= TEXT_LINE_Y_TOLERANCE) {
            open.boxes.push(box);
        }
        else {
            rows.push({ y, boxes: [box] });
        }
    }
    // Pass 2: summarize each row.
    return rows.map(({ y, boxes }) => {
        const rest = boxes.slice(1);
        return {
            text: boxes.map((b) => b.text).join(" "),
            topY: y,
            fontSize: rest.reduce((max, b) => Math.max(max, b.fontSize), boxes[0].fontSize),
            isBold: rest.reduce((bold, b) => bold || b.isBold, boxes[0].isBold),
            isTabular: boxes.some((b, i) => i > 0 && b.bounds.left - boxes[i - 1].bounds.right > TABULAR_X_GAP),
        };
    });
}
|
|
252
|
+
/**
 * Pick the markdown heading marker for a line from its font size relative to
 * the body font size.
 *
 * @param fontSize Font size of the line.
 * @param bodyFontSize Modal body font size; values <= 0 disable headings.
 * @param isBold Whether the line is bold (only matters for the "###" level).
 * @returns "# " at >= 2.0× body size, "## " at >= 1.4×, "### " at >= 1.1×
 *          when bold, otherwise "".
 */
function headingPrefix(fontSize, bodyFontSize, isBold) {
    if (bodyFontSize <= 0)
        return "";
    const scale = fontSize / bodyFontSize;
    // Ordered from the largest heading down; the first applicable level wins.
    const levels = [
        { marker: "# ", applies: scale >= 2.0 },
        { marker: "## ", applies: scale >= 1.4 },
        { marker: "### ", applies: scale >= 1.1 && isBold },
    ];
    const hit = levels.find((level) => level.applies);
    return hit ? hit.marker : "";
}
|
|
268
|
+
// ---------------------------------------------------------------------------
|
|
269
|
+
// Block merging
|
|
270
|
+
// ---------------------------------------------------------------------------
|
|
271
|
+
/**
 * Join adjacent heading blocks that carry the same `#` marker, i.e. a heading
 * that wrapped onto several lines.
 *
 * Two neighbours merge when both start with an identical heading marker and
 * the vertical distance from the merged heading's first line to the next
 * block is at most max(3 × bodyFS, 30). The continuation's marker is dropped
 * and the texts are joined with a space.
 *
 * @param blocks Content blocks (`topY`, `content`, optional `isTabular`).
 * @param bodyFS Modal body font size.
 * @returns A new array of (copied or merged) blocks.
 */
function mergeConsecutiveHeadings(blocks, bodyFS) {
    if (blocks.length === 0)
        return [];
    const HEADING_RE = /^(#{1,6} )/;
    const maxGap = Math.max(bodyFS * 3, 30);
    const out = [];
    for (const block of blocks) {
        const open = out[out.length - 1];
        if (open !== undefined) {
            const openMarker = open.content.match(HEADING_RE);
            const blockMarker = block.content.match(HEADING_RE);
            const gap = open.topY - block.topY;
            const continues = openMarker &&
                blockMarker &&
                openMarker[1] === blockMarker[1] &&
                gap <= maxGap;
            if (continues) {
                // Replace the open heading with the merged one; topY stays
                // that of the heading's first line.
                out[out.length - 1] = {
                    topY: open.topY,
                    content: `${open.content} ${block.content.slice(blockMarker[1].length)}`,
                    isTabular: open.isTabular || block.isTabular,
                };
                continue;
            }
        }
        out.push({ ...block });
    }
    return out;
}
|
|
302
|
+
/**
 * Re-join body-text blocks that are consecutive wrapped lines of one
 * paragraph.
 *
 * The next block is appended to the open paragraph when both are plain body
 * text (no heading marker, not starting with a pipe), neither is tabular, the
 * next line sits below the paragraph's last line by at most 2 × bodyFS, and
 * the paragraph so far is longer than 25 characters and does not end in
 * sentence-final punctuation.
 *
 * @param blocks Content blocks (`topY`, `content`, optional `isTabular`).
 * @param bodyFS Modal body font size.
 * @returns `blocks` itself when it is empty or bodyFS <= 0; otherwise a new
 *          array of `{ topY, content }` objects.
 */
function mergeParagraphWraps(blocks, bodyFS) {
    if (blocks.length === 0 || bodyFS <= 0)
        return blocks;
    const HEADING_RE = /^#{1,6} /;
    const SENTENCE_END_RE = /[.!?…)\]]\s*$/;
    const MIN_WRAP_LENGTH = 25;
    const maxGap = bodyFS * 2.0;
    const isBodyText = (text) => !HEADING_RE.test(text) && !text.startsWith("|");
    const paragraphs = [];
    let open = null;
    for (const block of blocks) {
        if (open !== null) {
            // Distance is measured from the last line merged into the paragraph.
            const gap = open.lastTopY - block.topY;
            const wraps = isBodyText(open.content) &&
                isBodyText(block.content) &&
                !open.isTabular &&
                !block.isTabular &&
                gap > 0 &&
                gap <= maxGap &&
                open.content.length > MIN_WRAP_LENGTH &&
                !SENTENCE_END_RE.test(open.content);
            if (wraps) {
                open = {
                    topY: open.topY,
                    lastTopY: block.topY,
                    content: `${open.content.trimEnd()} ${block.content.trimStart()}`,
                    isTabular: false,
                };
                continue;
            }
            paragraphs.push({ topY: open.topY, content: open.content });
        }
        open = {
            topY: block.topY,
            lastTopY: block.topY,
            content: block.content,
            isTabular: block.isTabular,
        };
    }
    paragraphs.push({ topY: open.topY, content: open.content });
    return paragraphs;
}
|
|
343
|
+
/**
 * Drop blocks that look like a page number at the foot of the page.
 *
 * A block is removed only when all three hold: it is one of the last three
 * blocks, its `topY` is at most 120, and its trimmed content is just digits
 * (optionally preceded by a heading marker).
 *
 * @param blocks Content blocks (`topY`, `content`) in page order.
 * @returns A new array without the page-number blocks.
 */
function removePageNumbers(blocks) {
    const PAGE_NUM_RE = /^(?:#{1,6}\s*)?\d+\s*$/;
    const BOTTOM_Y = 120;
    const tailStart = blocks.length - 3;
    const kept = [];
    blocks.forEach((block, idx) => {
        const inTail = idx >= tailStart;
        const nearBottom = block.topY <= BOTTOM_Y;
        const digitsOnly = PAGE_NUM_RE.test(block.content.trim());
        if (inTail && nearBottom && digitsOnly)
            return;
        kept.push(block);
    });
    return kept;
}
|
|
354
|
+
// ---------------------------------------------------------------------------
|
|
355
|
+
// Detached first-column table reconstruction
|
|
356
|
+
// ---------------------------------------------------------------------------
|
|
357
|
+
/**
 * Rebuild tables whose first column was emitted as free-text blocks around a
 * markdown table that only contains the right-hand columns.
 *
 * For every table block with N (>= 2) equally wide rows the function needs:
 *   1. a plain-text header line among the four blocks above the table that
 *      splits into exactly N + 1 whitespace-separated tokens, none of which
 *      contains a digit (the closest such line wins);
 *   2. short (1–40 characters) plain-text label lines directly above the
 *      table (after the header) and directly below it, whose total count
 *      equals the table's logical row count — its rows expanded by the
 *      number of `<br>` segments in their cells.
 * When both are found the table block is replaced by an (N + 1)-column table
 * and the header and label blocks are removed.
 *
 * @param blocks Content blocks (`topY`, `content`) in page order.
 * @returns `blocks` itself when nothing was rebuilt, otherwise a new array.
 */
function normalizeDetachedFirstColumnTables(blocks) {
    const HEADING_RE = /^#{1,6}\s/;
    // A delimiter row consists solely of `---` cells (optional alignment
    // colons), which is what renderTableToMarkdown emits. Matching the whole
    // row, and requiring three dashes, keeps data rows whose first cell is
    // empty ("|  | x |") or a dash placeholder ("| - | x |") from being
    // mistaken for the delimiter and silently dropped.
    const DIVIDER_ROW_RE = /^\|(?:\s*:?-{3,}:?\s*\|)+$/;
    const isTableBlock = (text) => text.trimStart().startsWith("|");
    const isPlainBlock = (text) => !HEADING_RE.test(text) && !isTableBlock(text);
    const isShortLabel = (text) => {
        const t = text.trim();
        return t.length > 0 && t.length <= 40;
    };
    const splitTokens = (text) => text
        .trim()
        .split(/[ \t]+/)
        .filter(Boolean);
    const replacements = new Map();
    const remove = new Set();
    for (let tableIdx = 0; tableIdx < blocks.length; tableIdx++) {
        if (remove.has(tableIdx))
            continue;
        const tableBlock = blocks[tableIdx];
        if (!isTableBlock(tableBlock.content))
            continue;
        const tableLines = tableBlock.content
            .split("\n")
            .map((line) => line.trim())
            .filter((line) => line.startsWith("|"));
        const dataRows = tableLines
            .filter((line) => !DIVIDER_ROW_RE.test(line))
            .map(parsePipeRow)
            .filter((row) => row.length > 0);
        if (dataRows.length === 0)
            continue;
        const cols = dataRows[0].length;
        if (cols < 2 || dataRows.some((row) => row.length !== cols))
            continue;
        // Expand by <br> count to get logical row count
        const logicalRows = [];
        for (const row of dataRows) {
            const splitCells = row.map((cell) => cell.split("<br>").map((p) => p.trim()));
            const rowSpan = Math.max(...splitCells.map((parts) => parts.length));
            for (let k = 0; k < rowSpan; k++) {
                logicalRows.push(splitCells.map((parts) => parts[k] ?? ""));
            }
        }
        if (logicalRows.length < 2)
            continue;
        // Find header with (cols + 1) non-numeric tokens; later (closer)
        // candidates overwrite earlier ones.
        let headerIdx = -1;
        let headerTokens = [];
        for (let i = Math.max(0, tableIdx - 4); i <= tableIdx - 1; i++) {
            const text = normalizeFullWidthAscii(blocks[i].content).trim();
            if (!isPlainBlock(text))
                continue;
            const tokens = splitTokens(text);
            if (tokens.length === cols + 1 &&
                tokens.every((tok) => !/[0-9]/.test(tok))) {
                headerIdx = i;
                headerTokens = tokens;
            }
        }
        if (headerIdx < 0)
            continue;
        // Collect short label lines above/below table; each run stops at the
        // first block that is not a short plain label.
        const aboveLabels = [];
        for (let i = tableIdx - 1; i > headerIdx; i--) {
            const text = normalizeFullWidthAscii(blocks[i].content).trim();
            if (!isPlainBlock(text) || !isShortLabel(text))
                break;
            aboveLabels.push({ idx: i, text });
        }
        aboveLabels.reverse();
        const belowLabels = [];
        for (let i = tableIdx + 1; i < blocks.length; i++) {
            const text = normalizeFullWidthAscii(blocks[i].content).trim();
            if (!isPlainBlock(text) || !isShortLabel(text))
                break;
            belowLabels.push({ idx: i, text });
        }
        const labels = [...aboveLabels, ...belowLabels];
        if (labels.length !== logicalRows.length)
            continue;
        // Reconstruct the full table
        const normalizedLines = [];
        normalizedLines.push(`| ${headerTokens.join(" | ")} |`);
        normalizedLines.push(`| ${Array.from({ length: cols + 1 }, () => "---").join(" | ")} |`);
        for (let r = 0; r < logicalRows.length; r++) {
            normalizedLines.push(`| ${labels[r].text} | ${logicalRows[r].join(" | ")} |`);
        }
        replacements.set(tableIdx, normalizedLines.join("\n"));
        remove.add(headerIdx);
        for (const label of labels)
            remove.add(label.idx);
    }
    if (replacements.size === 0 && remove.size === 0)
        return blocks;
    const out = [];
    for (let i = 0; i < blocks.length; i++) {
        if (remove.has(i))
            continue;
        const replaced = replacements.get(i);
        if (replaced) {
            out.push({ topY: blocks[i].topY, content: replaced });
        }
        else {
            out.push(blocks[i]);
        }
    }
    return out;
}
|
|
472
|
+
// ---------------------------------------------------------------------------
|
|
473
|
+
// Public API
|
|
474
|
+
// ---------------------------------------------------------------------------
|
|
475
|
+
/**
 * Render a single page as markdown, interleaving free text, tables and
 * images from top to bottom.
 *
 * @param freeTextBoxes Text boxes that are not part of a table.
 * @param tables Table grids detected on the page.
 * @param imageBlocks Pre-rendered image entries (`topY`, `markdown`).
 * @param allTextBoxes Every text box on the page; when given it is used for
 *        the body font size so that stray labels released as free text do
 *        not skew it. Falls back to `freeTextBoxes`.
 * @returns The page's markdown with blocks separated by blank lines.
 */
export function renderPageContent(freeTextBoxes, tables, imageBlocks = [], allTextBoxes) {
    const bodyFS = modalFontSize(allTextBoxes ?? freeTextBoxes);
    const textBlocks = groupFreeTextIntoLines(freeTextBoxes).map((line) => {
        const prefix = headingPrefix(line.fontSize, bodyFS, line.isBold);
        return {
            topY: line.topY,
            content: prefix + line.text,
            // Heading lines are never treated as tabular.
            isTabular: prefix === "" && line.isTabular,
        };
    });
    const tableBlocks = [];
    for (const table of tables) {
        const markdown = renderTableToMarkdown(table);
        if (markdown.length > 0)
            tableBlocks.push({ topY: table.topY, content: markdown });
    }
    const imageItems = [];
    for (const img of imageBlocks) {
        imageItems.push({ topY: img.topY, content: img.markdown });
    }
    // Higher Y = higher on the page, so descending Y is reading order.
    const ordered = [...textBlocks, ...tableBlocks, ...imageItems]
        .sort((a, b) => b.topY - a.topY);
    const withoutPageNumbers = removePageNumbers(ordered);
    const withMergedHeadings = mergeConsecutiveHeadings(withoutPageNumbers, bodyFS);
    const withMergedParagraphs = mergeParagraphWraps(withMergedHeadings, bodyFS);
    const finalBlocks = normalizeDetachedFirstColumnTables(withMergedParagraphs);
    return finalBlocks
        .map((block) => block.content)
        .join("\n\n")
        .trim();
}
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
/**
 * Bounding box in PDF coordinate space (origin = bottom-left).
 * `left`/`right` give the horizontal extent, `top`/`bottom` the vertical one.
 * NOTE(review): presumably left <= right and bottom <= top — confirm.
 */
|
|
2
|
+
export type Bounds = {
|
|
3
|
+
left: number;
|
|
4
|
+
right: number;
|
|
5
|
+
/** Higher value = higher on the page. */
|
|
6
|
+
top: number;
|
|
7
|
+
bottom: number;
|
|
8
|
+
};
|
|
9
|
+
/**
 * A text fragment with position and font metadata.
 * NOTE(review): `pageNumber` is presumably 1-based — confirm against the
 * extractor.
 */
|
|
10
|
+
export type TextBox = {
|
|
11
|
+
id: string;
|
|
12
|
+
text: string;
|
|
13
|
+
bounds: Bounds;
|
|
14
|
+
pageNumber: number;
|
|
15
|
+
/** Dominant font size in points. */
|
|
16
|
+
fontSize: number;
|
|
17
|
+
/** True if rendered bold (font name or rendering mode). */
|
|
18
|
+
isBold: boolean;
|
|
19
|
+
};
|
|
20
|
+
/**
 * A horizontal or vertical line segment extracted from vector graphics,
 * described by its two end points (x1, y1) and (x2, y2).
 * NOTE(review): coordinates presumably use the same PDF space as `Bounds` —
 * confirm.
 */
|
|
21
|
+
export type Segment = {
|
|
22
|
+
id: string;
|
|
23
|
+
x1: number;
|
|
24
|
+
y1: number;
|
|
25
|
+
x2: number;
|
|
26
|
+
y2: number;
|
|
27
|
+
};
|
|
28
|
+
/**
 * A single cell in a resolved table grid.
 * `row` and `col` are zero-based indices into the grid: the markdown renderer
 * writes the cell at matrix[row][col] and ignores cells whose indices are not
 * below the grid's `rows`/`cols`.
 * The markdown renderer reads only `row`, `col` and `text`; it does not
 * expand `rowSpan`/`colSpan`.
 */
|
|
29
|
+
export type TableCell = {
|
|
30
|
+
row: number;
|
|
31
|
+
col: number;
|
|
32
|
+
text: string;
|
|
33
|
+
rowSpan: number;
|
|
34
|
+
colSpan: number;
|
|
35
|
+
};
|
|
36
|
+
/**
 * A resolved table grid ready for markdown rendering.
 * `rows` and `cols` are the grid dimensions; a grid with either equal to 0
 * renders as an empty string. `topY` is used to order the table among the
 * other content blocks of the page.
 */
|
|
37
|
+
export type TableGrid = {
|
|
38
|
+
pageNumber: number;
|
|
39
|
+
rows: number;
|
|
40
|
+
cols: number;
|
|
41
|
+
cells: TableCell[];
|
|
42
|
+
warnings: string[];
|
|
43
|
+
/** Top Y coordinate (PDF space: larger = higher on page). */
|
|
44
|
+
topY: number;
|
|
45
|
+
/** True for tables detected without vector borders. */
|
|
46
|
+
isBorderless: boolean;
|
|
47
|
+
};
|
|
48
|
+
/**
 * An image/diagram region detected on a page.
 * Note the two coordinate systems: `bbox` uses a top-left origin, while
 * `topY` uses the bottom-left PDF origin shared with text and tables.
 */
|
|
49
|
+
export type ImageRegion = {
|
|
50
|
+
id: string;
|
|
51
|
+
pageNumber: number;
|
|
52
|
+
/** Bounding box in mupdf coordinates (top-left origin). */
|
|
53
|
+
bbox: {
|
|
54
|
+
x: number;
|
|
55
|
+
y: number;
|
|
56
|
+
w: number;
|
|
57
|
+
h: number;
|
|
58
|
+
};
|
|
59
|
+
/** Y position in PDF coordinates (bottom-left) for ordering. */
|
|
60
|
+
topY: number;
|
|
61
|
+
};
|
|
62
|
+
/**
 * Result of extracting content from a single PDF page: its text boxes,
 * vector line segments and image regions.
 */
|
|
63
|
+
export type PageContent = {
|
|
64
|
+
pageNumber: number;
|
|
65
|
+
textBoxes: TextBox[];
|
|
66
|
+
segments: Segment[];
|
|
67
|
+
images: ImageRegion[];
|
|
68
|
+
};
|
|
69
|
+
/**
 * A block of rendered content: a free-text line (possibly with a heading
 * marker), a markdown table, or an image's markdown.
 * `topY` orders blocks on the page; larger values come first.
 */
|
|
70
|
+
export type ContentBlock = {
|
|
71
|
+
topY: number;
|
|
72
|
+
content: string;
|
|
73
|
+
/**
 * True if this line has wide gaps between text boxes (column headers).
 * The page renderer sets it only for free-text lines without a heading
 * marker, and tabular blocks are never merged into paragraphs.
 */
|
|
74
|
+
isTabular?: boolean;
|
|
75
|
+
};
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -2,7 +2,7 @@ import type { ConversionResult, Converter, StreamInfo } from "../types.js";
|
|
|
2
2
|
export declare class PptxConverter implements Converter {
|
|
3
3
|
name: string;
|
|
4
4
|
accepts(streamInfo: StreamInfo): boolean;
|
|
5
|
-
convert(input: Buffer,
|
|
5
|
+
convert(input: Buffer, streamInfo: StreamInfo): Promise<ConversionResult>;
|
|
6
6
|
private extractText;
|
|
7
7
|
private extractTable;
|
|
8
8
|
}
|