markit-ai 0.4.0 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/formats.js +15 -0
- package/dist/converters/iwork.d.ts +20 -0
- package/dist/converters/iwork.js +391 -0
- package/dist/converters/pdf/grid.js +144 -3
- package/dist/converters/pdf/index.js +21 -1
- package/dist/index.d.ts +1 -0
- package/dist/index.js +1 -0
- package/dist/markit.js +2 -0
- package/dist/utils/turndown.js +8 -2
- package/package.json +1 -1
package/dist/commands/formats.js
CHANGED
|
@@ -23,6 +23,21 @@ const BUILTIN_FORMATS = [
|
|
|
23
23
|
extensions: [".mp3", ".wav", ".m4a", ".flac"],
|
|
24
24
|
builtin: true,
|
|
25
25
|
},
|
|
26
|
+
{
|
|
27
|
+
name: "Pages",
|
|
28
|
+
extensions: [".pages"],
|
|
29
|
+
builtin: true,
|
|
30
|
+
},
|
|
31
|
+
{
|
|
32
|
+
name: "Keynote",
|
|
33
|
+
extensions: [".key"],
|
|
34
|
+
builtin: true,
|
|
35
|
+
},
|
|
36
|
+
{
|
|
37
|
+
name: "Numbers",
|
|
38
|
+
extensions: [".numbers"],
|
|
39
|
+
builtin: true,
|
|
40
|
+
},
|
|
26
41
|
{
|
|
27
42
|
name: "GitHub",
|
|
28
43
|
extensions: ["github.com/*", "gist.github.com/*"],
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import type { ConversionResult, Converter, StreamInfo } from "../types.js";
|
|
2
|
+
/**
 * Converts Apple iWork files (Pages, Keynote, Numbers) to markdown.
 *
 * All three formats are ZIP archives containing an XML file:
 * - Pages: index.xml (sf:p paragraphs with named styles)
 * - Keynote: index.apxl (key:slide elements with sf:p text)
 * - Numbers: index.xml (sf:t text cells + sf:n number cells)
 */
export declare class IWorkConverter implements Converter {
    /** Converter identifier. */
    name: string;
    /** Reports whether this converter handles the stream described by `streamInfo`. */
    accepts(streamInfo: StreamInfo): boolean;
    /** Converts the raw file bytes in `input` and resolves with the conversion result. */
    convert(input: Buffer, streamInfo: StreamInfo): Promise<ConversionResult>;
    // Per-format conversion routines.
    private convertPages;
    private convertKeynote;
    private convertNumbers;
    // Numbers helpers: table extraction and the path used when no grid is found.
    private extractGrid;
    private convertNumbersFallback;
    // Reads the main XML document out of the archive.
    private readIndex;
}
|
|
@@ -0,0 +1,391 @@
|
|
|
1
|
+
import { mkdirSync, writeFileSync } from "node:fs";
|
|
2
|
+
import { join } from "node:path";
|
|
3
|
+
import JSZip from "jszip";
|
|
4
|
+
// File extensions (with leading dot) that IWorkConverter.accepts() recognises.
const EXTENSIONS = [".pages", ".key", ".numbers"];
// Apple XML namespace URIs. expandTag() rewrites a prefixed tag such as
// "sf:p" to "<uri>:p", and iterAll()/findFirst() match on that expanded form.
const SF = "http://developer.apple.com/namespaces/sf";
const SFA = "http://developer.apple.com/namespaces/sfa";
const KEY = "http://developer.apple.com/namespaces/keynote2";
|
|
8
|
+
/**
 * Converts Apple iWork files (Pages, Keynote, Numbers) to markdown.
 *
 * All three formats are ZIP archives containing an XML file:
 * - Pages: index.xml (sf:p paragraphs with named styles)
 * - Keynote: index.apxl (key:slide elements with sf:p text)
 * - Numbers: index.xml (sf:t text cells + sf:n number cells)
 */
export class IWorkConverter {
    name = "iwork";

    /**
     * Reports whether the stream's extension is one of the iWork extensions.
     *
     * @param streamInfo Stream description; only `extension` is read.
     * @returns `true` when `extension` is listed in EXTENSIONS.
     */
    accepts(streamInfo) {
        return Boolean(streamInfo.extension) && EXTENSIONS.includes(streamInfo.extension);
    }

    /**
     * Opens the archive and dispatches on the file extension.
     *
     * @param input Raw bytes of the iWork file (a ZIP archive).
     * @param streamInfo Stream description; `extension` selects the format and
     *   `imageDir`, when set, is where embedded images are written.
     * @returns An object with `markdown` and, for Pages/Keynote, `title`.
     * @throws Error when the extension is not an iWork extension, or when the
     *   archive lacks its index document.
     */
    async convert(input, streamInfo) {
        const zip = await JSZip.loadAsync(input);
        const ext = streamInfo.extension;
        switch (ext) {
            case ".pages":
                return this.convertPages(zip, streamInfo);
            case ".key":
                return this.convertKeynote(zip, streamInfo);
            case ".numbers":
                return this.convertNumbers(zip);
            default:
                throw new Error(`Unsupported iWork format: ${ext}`);
        }
    }

    // ---------------------------------------------------------------------------
    // Pages
    // ---------------------------------------------------------------------------

    /**
     * Emits every non-empty sf:p paragraph, prefixed according to its
     * `sf:style`, followed by one line per embedded image. The first
     * non-empty paragraph becomes the title.
     */
    async convertPages(zip, streamInfo) {
        const root = parseXml(await this.readIndex(zip, "index.xml"));
        const lines = [];
        let title;
        for (const p of iterAll(root, SF, "p")) {
            const text = collectText(p).trim();
            if (!text)
                continue;
            const prefix = paragraphPrefix(p.getAttribute("sf:style") || "");
            if (!title)
                title = text;
            // paragraphPrefix() returns a bare "*" for captions; close the
            // emphasis so the output is "*caption*" rather than "*caption".
            const suffix = prefix === "*" ? "*" : "";
            lines.push(`${prefix}${text}${suffix}`);
        }
        const imageLines = await extractIWorkImages(zip, streamInfo.imageDir);
        return { markdown: lines.concat(imageLines).join("\n\n"), title };
    }

    // ---------------------------------------------------------------------------
    // Keynote
    // ---------------------------------------------------------------------------

    /**
     * Emits one section per key:slide: a `<!-- Slide N -->` marker, the first
     * non-empty paragraph as a `#` heading, and the remaining paragraphs as
     * plain lines. Embedded images follow as separate sections. The first
     * heading found becomes the title.
     */
    async convertKeynote(zip, streamInfo) {
        const root = parseXml(await this.readIndex(zip, "index.apxl"));
        const sections = [];
        let title;
        let slideNumber = 0;
        for (const slide of iterAll(root, KEY, "slide")) {
            slideNumber++;
            const slideLines = [`<!-- Slide ${slideNumber} -->`];
            let headingPending = true;
            for (const p of iterAll(slide, SF, "p")) {
                const text = collectText(p).trim();
                if (!text)
                    continue;
                if (headingPending) {
                    slideLines.push(`# ${text}`);
                    if (!title)
                        title = text;
                    headingPending = false;
                }
                else {
                    slideLines.push(text);
                }
            }
            sections.push(slideLines.join("\n"));
        }
        const imageLines = await extractIWorkImages(zip, streamInfo.imageDir);
        return { markdown: sections.concat(imageLines).join("\n\n"), title };
    }

    // ---------------------------------------------------------------------------
    // Numbers
    // ---------------------------------------------------------------------------

    /**
     * Renders every sf:grid as a markdown table whose first row is the header.
     * Falls back to a flat list of values when the document has no grid.
     */
    async convertNumbers(zip) {
        const root = parseXml(await this.readIndex(zip, "index.xml"));
        const grids = [...iterAll(root, SF, "grid")];
        if (grids.length === 0) {
            // Fallback: extract all text and number cells
            return this.convertNumbersFallback(root);
        }
        const toLine = (cells) => `| ${cells.join(" | ")} |`;
        const sections = [];
        for (const grid of grids) {
            const rows = this.extractGrid(grid);
            if (rows.length === 0)
                continue;
            // reduce() instead of Math.max(...spread): a spread passes one
            // argument per row and can exceed the engine's argument limit.
            const maxCols = rows.reduce((widest, row) => Math.max(widest, row.length), 0);
            const [header, ...body] = rows.map((row) => {
                const cells = row.map(escapeTableCell);
                while (cells.length < maxCols)
                    cells.push("");
                return cells;
            });
            const lines = [toLine(header), toLine(header.map(() => "---"))];
            for (const row of body) {
                lines.push(toLine(row));
            }
            sections.push(lines.join("\n"));
        }
        return { markdown: sections.join("\n\n") };
    }

    /**
     * Reads the cells under the grid's sf:datasource and lays them out in rows.
     *
     * @param grid An sf:grid element.
     * @returns Rows of cell strings; empty when there is no datasource or no
     *   recognised cell.
     */
    extractGrid(grid) {
        const datasource = findFirst(grid, SF, "datasource");
        if (!datasource)
            return [];
        // Get column count from grid attributes (raw attribute names)
        const numCols = Number.parseInt(grid.getAttribute("sf:numcols") || "0", 10);
        const allValues = [];
        for (const child of datasource.children) {
            const value = readIWorkCell(child);
            if (value !== null)
                allValues.push(value);
        }
        const chunk = (width) => {
            const out = [];
            for (let i = 0; i < allValues.length; i += width) {
                out.push(allValues.slice(i, i + width));
            }
            return out;
        };
        // If the grid used default dimensions and produced only one row,
        // the data probably doesn't fill the full grid width. Fall back to
        // treating the cells as a 2-column key/value list or single column.
        if (allValues.length > 0 && allValues.length < numCols) {
            return chunk(allValues.length % 2 === 0 ? 2 : 1);
        }
        // Without a usable column count every cell lands in a single row.
        return chunk(numCols > 0 ? numCols : allValues.length);
    }

    /**
     * Lists every non-empty text value (sf:t → sf:ct `sfa:s`) followed by
     * every non-empty number value (sf:n `sf:v`), one per line.
     */
    convertNumbersFallback(root) {
        const values = [];
        for (const t of iterAll(root, SF, "t")) {
            const val = findFirst(t, SF, "ct")?.getAttribute("sfa:s") || "";
            if (val)
                values.push(val);
        }
        for (const n of iterAll(root, SF, "n")) {
            const val = n.getAttribute("sf:v") || "";
            if (val)
                values.push(val);
        }
        return { markdown: values.join("\n") };
    }

    // ---------------------------------------------------------------------------
    // Helpers
    // ---------------------------------------------------------------------------

    /**
     * Returns the named archive entry as a string.
     *
     * @throws Error when the entry is missing.
     */
    async readIndex(zip, filename) {
        const file = zip.file(filename);
        if (!file) {
            throw new Error(`Invalid iWork file: missing ${filename}`);
        }
        return file.async("string");
    }
}

/** Archive entries with one of these extensions are treated as images. */
const IWORK_IMAGE_RE = /\.(png|jpg|jpeg|gif|webp|tiff|bmp)$/i;

/**
 * Produces one markdown line per image entry in the archive, skipping
 * everything under "QuickLook/".
 *
 * With `imageDir` set, the directory is created, each image is written into
 * it under its base name, and the line is a markdown image link. Without it,
 * nothing is written and the line is an HTML comment naming the image.
 */
async function extractIWorkImages(zip, imageDir) {
    if (imageDir)
        mkdirSync(imageDir, { recursive: true });
    const lines = [];
    let imageCount = 0;
    for (const name of Object.keys(zip.files)) {
        if (!IWORK_IMAGE_RE.test(name) || name.startsWith("QuickLook/"))
            continue;
        imageCount++;
        // Split on both separators so an entry name that uses backslashes
        // cannot carry directory components into the output path.
        const imgName = name.split(/[\\/]/).pop() || `image_${imageCount}`;
        if (!imageDir) {
            lines.push(`<!-- image: ${imgName} -->`);
            continue;
        }
        const file = zip.file(name);
        if (!file)
            continue;
        const filepath = join(imageDir, imgName);
        writeFileSync(filepath, await file.async("nodebuffer"));
        lines.push(`![${imgName}](${filepath})`);
    }
    return lines;
}

/**
 * Returns the display string for one datasource child, or `null` when the
 * element is not a recognised cell type.
 */
function readIWorkCell(cell) {
    switch (cell.tagName) {
        case `${SF}:t`: {
            // Text cell
            const ct = findFirst(cell, SF, "ct");
            return ct?.getAttribute("sfa:s") || collectText(cell).trim();
        }
        case `${SF}:n`: // Number cell
        case `${SF}:d`: // Date cell
        case `${SF}:du`: // Duration cell
            return cell.getAttribute("sf:v") || "";
        case `${SF}:b`:
            // Boolean cell
            return cell.getAttribute("sf:v") === "1" ? "TRUE" : "FALSE";
        case `${SF}:e`:
            // Empty cell
            return "";
        default:
            return null;
    }
}

/** Makes a value safe inside a markdown table cell: escapes `|`, flattens line breaks. */
function escapeTableCell(value) {
    return value.replace(/\|/g, "\\|").replace(/[\r\n]+/g, " ");
}
|
|
264
|
+
/** Replacement text for the five predefined XML entities. */
const XML_NAMED_ENTITIES = { amp: "&", lt: "<", gt: ">", quot: '"', apos: "'" };

/**
 * Replaces predefined entities (`&amp;` …) and numeric character references
 * (`&#65;`, `&#x41;`) with the characters they stand for. Unknown references
 * are left untouched.
 */
function decodeXmlEntities(value) {
    if (!value.includes("&"))
        return value;
    return value.replace(/&(#x[0-9a-fA-F]+|#[0-9]+|amp|lt|gt|quot|apos);/g, (whole, ref) => {
        if (ref[0] !== "#")
            return XML_NAMED_ENTITIES[ref];
        const codePoint = ref[1] === "x"
            ? Number.parseInt(ref.slice(2), 16)
            : Number.parseInt(ref.slice(1), 10);
        // String.fromCodePoint throws above U+10FFFF; keep such references verbatim.
        return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
    });
}

/**
 * Minimal XML parser that expands known namespace prefixes in tag names
 * and extracts text content and attributes.
 *
 * Returns a synthetic "root" element whose children are the top-level
 * elements. Text that precedes the first child of an element is stored in
 * that element's `text`; text that follows a child is stored in that
 * child's `tail`. Whitespace-only text is dropped; other text keeps a single
 * space at an edge that had whitespace, so words on either side of an inline
 * element do not run together. Entities are decoded in text and attributes.
 *
 * NOTE: comments, processing instructions and CDATA sections are not
 * recognised as markup, and text after the last tag is ignored.
 *
 * @param xml XML source text.
 * @returns The synthetic root element.
 */
function parseXml(xml) {
    const root = createElement("root");
    const stack = [root];
    // Match opening, closing and self-closing tags; attribute values may be
    // double- or single-quoted.
    const tagRe = /<(\/?)([a-zA-Z0-9_:.-]+)((?:\s+[a-zA-Z0-9_:.-]+\s*=\s*(?:"[^"]*"|'[^']*'))*)\s*(\/?)>/g;
    let lastIndex = 0;
    for (let match = tagRe.exec(xml); match !== null; match = tagRe.exec(xml)) {
        const [fullMatch, isClose, tagName, attrs, isSelfClose] = match;
        const textBefore = xml.slice(lastIndex, match.index);
        lastIndex = match.index + fullMatch.length;
        // Add text to current element
        if (textBefore.trim()) {
            const text = decodeXmlEntities(textBefore.replace(/^\s+/, " ").replace(/\s+$/, " "));
            const current = stack[stack.length - 1];
            const lastChild = current.children[current.children.length - 1];
            if (lastChild) {
                lastChild.tail += text;
            }
            else {
                current.text += text;
            }
        }
        if (isClose) {
            // Closing tag. Never pop the synthetic root: an unbalanced
            // closing tag would otherwise empty the stack and make the next
            // tag fail on an undefined parent.
            if (stack.length > 1)
                stack.pop();
        }
        else {
            // Opening tag
            const el = createElement(expandTag(tagName, xml));
            parseAttributes(attrs, el);
            stack[stack.length - 1].children.push(el);
            if (!isSelfClose) {
                stack.push(el);
            }
        }
    }
    return root;
}

/**
 * Creates an empty element node.
 *
 * @param tagName Tag name to store on the node (already expanded by the caller).
 * @returns A node with `tagName`, `children`, `text`, `tail`, `attributes`
 *   and a `getAttribute(name)` accessor that yields `null` for absent names.
 */
function createElement(tagName) {
    return {
        tagName,
        children: [],
        text: "",
        tail: "",
        // Prototype-less so names like "constructor" are not found by accident.
        attributes: Object.create(null),
        getAttribute(name) {
            return this.attributes[name] ?? null;
        },
    };
}

/**
 * Parses `name="value"` / `name='value'` pairs out of a tag's attribute text
 * and stores them, entity-decoded, on `el.attributes` under the raw
 * (prefixed) attribute name.
 */
function parseAttributes(attrStr, el) {
    const attrRe = /([a-zA-Z0-9_:.-]+)\s*=\s*(?:"([^"]*)"|'([^']*)')/g;
    for (const m of attrStr.matchAll(attrRe)) {
        el.attributes[m[1]] = decodeXmlEntities(m[2] ?? m[3]);
    }
}

/**
 * Expand a known namespace prefix in a tag name to its full URI.
 * e.g. "sf:p" → "http://developer.apple.com/namespaces/sf:p"
 * For simplicity, only the known Apple prefixes (sf, sfa, sl, key) are
 * mapped; tags without a prefix or with an unknown prefix are returned
 * unchanged. The second parameter is unused.
 */
function expandTag(tag, _xml) {
    const colon = tag.indexOf(":");
    if (colon === -1)
        return tag;
    const local = tag.slice(colon + 1);
    switch (tag.slice(0, colon)) {
        case "sf":
            return `${SF}:${local}`;
        case "sfa":
            return `${SFA}:${local}`;
        case "sl":
            return `http://developer.apple.com/namespaces/sl:${local}`;
        case "key":
            return `${KEY}:${local}`;
        default:
            return tag;
    }
}
|
|
347
|
+
/**
 * Concatenates an element's own text with, for each child in order, the
 * child's collected text followed by the child's tail.
 */
function collectText(el) {
    return el.children.reduce((acc, child) => acc + collectText(child) + child.tail, el.text);
}
|
|
355
|
+
/**
 * Yields, in document (pre-order) sequence, `el` itself and every descendant
 * whose tag name equals `${ns}:${localName}`.
 */
function* iterAll(el, ns, localName) {
    const wanted = `${ns}:${localName}`;
    // Explicit stack; children are pushed in reverse so they pop in order.
    const pending = [el];
    while (pending.length > 0) {
        const node = pending.pop();
        if (node.tagName === wanted)
            yield node;
        for (let i = node.children.length - 1; i >= 0; i--) {
            pending.push(node.children[i]);
        }
    }
}
|
|
363
|
+
/**
 * Returns the first element produced by iterAll() for the given namespace
 * and local name, or `null` when there is none.
 */
function findFirst(el, ns, localName) {
    const { value, done } = iterAll(el, ns, localName).next();
    return done ? null : value;
}
|
|
369
|
+
/**
 * Map iWork paragraph style names to markdown prefixes.
 *
 * Matching is case-insensitive and by substring:
 * - "subtitle" → "## " (tested before "title", which it contains)
 * - "title" → "# "
 * - "heading-N" / "heading N" for N = 1..4 → N + 1 hashes
 * - "caption" → "*" (opening emphasis marker only)
 * - anything else, or an empty style → ""
 *
 * @param style Paragraph style name; may be empty.
 * @returns The prefix to put in front of the paragraph text.
 */
function paragraphPrefix(style) {
    if (!style)
        return "";
    const lower = style.toLowerCase();
    // "subtitle" contains "title", so it has to be checked first.
    if (lower.includes("subtitle"))
        return "## ";
    if (lower.includes("title"))
        return "# ";
    for (let level = 1; level <= 4; level++) {
        if (lower.includes(`heading-${level}`) || lower.includes(`heading ${level}`)) {
            return `${"#".repeat(level + 1)} `;
        }
    }
    if (lower.includes("caption"))
        return "*";
    return "";
}
|
|
@@ -258,6 +258,122 @@ function expandSubRowsByYClusters(originalRows, cols, cells, cellBoxes) {
|
|
|
258
258
|
return originalRows + addedRows;
|
|
259
259
|
}
|
|
260
260
|
// ---------------------------------------------------------------------------
|
|
261
|
+
// Cross-column text box splitting
|
|
262
|
+
// ---------------------------------------------------------------------------
|
|
263
|
+
/**
 * Find which column a horizontal position falls into.
 * Column i spans xLines[i]..xLines[i + 1], both ends inclusive; a position
 * exactly on a shared boundary resolves to the left column.
 * Returns -1 if outside the grid.
 */
function findCol(x, xLines) {
    for (let i = 0; i + 1 < xLines.length; i++) {
        if (x >= xLines[i] && x <= xLines[i + 1])
            return i;
    }
    return -1;
}
/**
 * When a text box spans across one or more vertical column boundaries,
 * split it into multiple virtual text boxes — one per column — with the
 * text divided proportionally by width.
 *
 * We split at word boundaries closest to the proportional split point
 * so we don't chop words in half.
 *
 * Boxes are returned unchanged (same object) when they lie outside the
 * grid, sit within a single column, have no positive width, or contain at
 * most one word. Split pieces copy the original box and override `id`
 * (`<id>-split<col>`), `text`, `bounds.left` and `bounds.right`.
 *
 * @param textBoxes Boxes with `id`, `text` and `bounds.left`/`bounds.right`.
 * @param xLines Column boundary x positions, in the order findCol() scans them.
 * @returns A new array of boxes in input order, with spanning boxes replaced
 *   by their pieces.
 */
function splitCrossColumnBoxes(textBoxes, xLines) {
    const MARGIN = 5; // allow small overlap before considering it cross-column
    // Number of leading words whose joined length stays within `budget`
    // characters; always at least one.
    const countWordsThatFit = (words, budget) => {
        let used = 0;
        let count = 0;
        for (let w = 0; w < words.length; w++) {
            const next = used + words[w].length + (w > 0 ? 1 : 0);
            if (next > budget && count > 0)
                break;
            used = next;
            count = w + 1;
        }
        return Math.max(count, 1);
    };
    const result = [];
    for (const tb of textBoxes) {
        const { left, right } = tb.bounds;
        const leftCol = findCol(left + MARGIN, xLines);
        const rightCol = findCol(right - MARGIN, xLines);
        const totalWidth = right - left;
        // Not spanning columns, or outside grid — keep as-is. `>=` also covers
        // boxes narrower than 2 * MARGIN that straddle a boundary: their
        // probes cross over (leftCol > rightCol) and the split loop below
        // would emit nothing, silently dropping the box.
        if (leftCol < 0 || rightCol < 0 || leftCol >= rightCol || totalWidth <= 0) {
            result.push(tb);
            continue;
        }
        // Trim first so edge whitespace does not produce empty "words".
        const text = tb.text.trim();
        const words = text.split(/\s+/);
        if (words.length <= 1) {
            // Single word spanning columns — leave the box whole.
            result.push(tb);
            continue;
        }
        // For each column boundary crossing, find the best word-boundary split
        let remaining = words;
        let currentLeft = left;
        for (let col = leftCol; col <= rightCol && remaining.length > 0; col++) {
            const colRight = col < xLines.length - 1 ? xLines[col + 1] : right;
            const segmentRight = Math.min(colRight, right);
            const isLastCol = col === rightCol;
            // Last column takes everything left; earlier columns take the
            // share of characters proportional to their width.
            const take = isLastCol
                ? remaining.length
                : countWordsThatFit(remaining, Math.round(((segmentRight - currentLeft) / totalWidth) * text.length));
            result.push({
                ...tb,
                id: `${tb.id}-split${col}`,
                text: remaining.slice(0, take).join(" "),
                bounds: {
                    ...tb.bounds,
                    left: currentLeft,
                    right: isLastCol ? right : segmentRight,
                },
            });
            remaining = remaining.slice(take);
            currentLeft = segmentRight;
        }
    }
    return result;
}
|
|
376
|
+
// ---------------------------------------------------------------------------
|
|
261
377
|
// Full grid table (H + V lines)
|
|
262
378
|
// ---------------------------------------------------------------------------
|
|
263
379
|
function buildCells(rows, cols) {
|
|
@@ -278,11 +394,26 @@ function buildTableGrid(pageNumber, yLines, xLines, filteredSegments, textBoxes)
|
|
|
278
394
|
const yMax = yLines[0];
|
|
279
395
|
const xMin = xLines[0];
|
|
280
396
|
const xMax = xLines[xLines.length - 1];
|
|
281
|
-
//
|
|
397
|
+
// Split text boxes that span multiple columns before placement
|
|
398
|
+
const splitBoxes = splitCrossColumnBoxes(textBoxes, xLines);
|
|
399
|
+
// Track which split piece IDs get placed in cells, so we can consume
|
|
400
|
+
// the original (unsplit) text box IDs too.
|
|
401
|
+
const placedSplitIds = new Set();
|
|
402
|
+
// Look for header text boxes just above the grid.
|
|
403
|
+
// Use the ORIGINAL (unsplit) text boxes for header detection so that
|
|
404
|
+
// wide paragraph text isn't falsely split into column-sized header chunks.
|
|
405
|
+
// Reject boxes wider than 1.5 columns — those are paragraph text, not headers.
|
|
406
|
+
const avgColWidth = (xMax - xMin) / cols;
|
|
407
|
+
const maxHeaderBoxWidth = avgColWidth * 1.5;
|
|
282
408
|
const headerBoxes = textBoxes.filter((tb) => {
|
|
283
409
|
const cy = (tb.bounds.top + tb.bounds.bottom) / 2;
|
|
284
410
|
const cx = (tb.bounds.left + tb.bounds.right) / 2;
|
|
285
|
-
|
|
411
|
+
const boxWidth = tb.bounds.right - tb.bounds.left;
|
|
412
|
+
return (cy > yMax &&
|
|
413
|
+
cy <= yMax + 20 &&
|
|
414
|
+
cx >= xMin &&
|
|
415
|
+
cx <= xMax &&
|
|
416
|
+
boxWidth <= maxHeaderBoxWidth);
|
|
286
417
|
});
|
|
287
418
|
if (headerBoxes.length > 0) {
|
|
288
419
|
rows += 1;
|
|
@@ -308,7 +439,7 @@ function buildTableGrid(pageNumber, yLines, xLines, filteredSegments, textBoxes)
|
|
|
308
439
|
}
|
|
309
440
|
}
|
|
310
441
|
const cellBoxes = new Map();
|
|
311
|
-
for (const tb of
|
|
442
|
+
for (const tb of splitBoxes) {
|
|
312
443
|
const cx = (tb.bounds.left + tb.bounds.right) / 2;
|
|
313
444
|
const cy = (tb.bounds.top + tb.bounds.bottom) / 2;
|
|
314
445
|
if (cy < yMin || cy > yMax || cx < xMin || cx > xMax)
|
|
@@ -338,6 +469,8 @@ function buildTableGrid(pageNumber, yLines, xLines, filteredSegments, textBoxes)
|
|
|
338
469
|
cellBoxes.set(cell, []);
|
|
339
470
|
cellBoxes.get(cell)?.push(tb);
|
|
340
471
|
consumedIds.push(tb.id);
|
|
472
|
+
if (tb.id.includes("-split"))
|
|
473
|
+
placedSplitIds.add(tb.id);
|
|
341
474
|
}
|
|
342
475
|
rows = expandSubRowsByYClusters(rows, cols, cells, cellBoxes);
|
|
343
476
|
// Merge text boxes within each cell into cell text
|
|
@@ -369,6 +502,14 @@ function buildTableGrid(pageNumber, yLines, xLines, filteredSegments, textBoxes)
|
|
|
369
502
|
topY: yLines[0],
|
|
370
503
|
isBorderless: false,
|
|
371
504
|
});
|
|
505
|
+
// Also consume the original (unsplit) text box IDs when any of their
|
|
506
|
+
// split pieces were placed in a cell.
|
|
507
|
+
for (const splitId of placedSplitIds) {
|
|
508
|
+
const origId = splitId.replace(/-split\d+$/, "");
|
|
509
|
+
if (!consumedIds.includes(origId)) {
|
|
510
|
+
consumedIds.push(origId);
|
|
511
|
+
}
|
|
512
|
+
}
|
|
372
513
|
return { grid, consumedIds };
|
|
373
514
|
}
|
|
374
515
|
// ---------------------------------------------------------------------------
|
|
@@ -80,8 +80,28 @@ export class PdfConverter {
|
|
|
80
80
|
});
|
|
81
81
|
}
|
|
82
82
|
}
|
|
83
|
-
// Detect column layout
|
|
83
|
+
// Detect column layout.
|
|
84
|
+
// If the page has vertical segments (tables), suppress column detection
|
|
85
|
+
// when one detected column is very narrow — that's a table's first column,
|
|
86
|
+
// not a page layout column.
|
|
84
87
|
const layout = detectColumns(page.textBoxes);
|
|
88
|
+
if (layout.columnCount > 1 &&
|
|
89
|
+
page.segments.some((s) => Math.abs(s.x1 - s.x2) <= 0.8)) {
|
|
90
|
+
const pageXMin = Math.min(...page.textBoxes.map((tb) => tb.bounds.left));
|
|
91
|
+
const pageXMax = Math.max(...page.textBoxes.map((tb) => tb.bounds.right));
|
|
92
|
+
const pageWidth = pageXMax - pageXMin;
|
|
93
|
+
const minColFraction = 0.3;
|
|
94
|
+
const tooNarrow = layout.columns.some((col) => {
|
|
95
|
+
const colXMin = Math.min(...col.map((tb) => tb.bounds.left));
|
|
96
|
+
const colXMax = Math.max(...col.map((tb) => tb.bounds.right));
|
|
97
|
+
return (colXMax - colXMin) / pageWidth < minColFraction;
|
|
98
|
+
});
|
|
99
|
+
if (tooNarrow) {
|
|
100
|
+
layout.columnCount = 1;
|
|
101
|
+
layout.columns = [page.textBoxes];
|
|
102
|
+
layout.boundaries = [];
|
|
103
|
+
}
|
|
104
|
+
}
|
|
85
105
|
if (layout.columnCount === 1) {
|
|
86
106
|
// Single column — process normally
|
|
87
107
|
const md = processColumn(page.pageNumber, page.textBoxes, page.segments, imageBlocks);
|
package/dist/index.d.ts
CHANGED
|
@@ -7,6 +7,7 @@ export { GitHubConverter } from "./converters/github.js";
|
|
|
7
7
|
export { HtmlConverter } from "./converters/html.js";
|
|
8
8
|
export { ImageConverter } from "./converters/image.js";
|
|
9
9
|
export { IpynbConverter } from "./converters/ipynb.js";
|
|
10
|
+
export { IWorkConverter } from "./converters/iwork.js";
|
|
10
11
|
export { JsonConverter } from "./converters/json.js";
|
|
11
12
|
export { PdfConverter } from "./converters/pdf/index.js";
|
|
12
13
|
export { PlainTextConverter } from "./converters/plain-text.js";
|
package/dist/index.js
CHANGED
|
@@ -6,6 +6,7 @@ export { GitHubConverter } from "./converters/github.js";
|
|
|
6
6
|
export { HtmlConverter } from "./converters/html.js";
|
|
7
7
|
export { ImageConverter } from "./converters/image.js";
|
|
8
8
|
export { IpynbConverter } from "./converters/ipynb.js";
|
|
9
|
+
export { IWorkConverter } from "./converters/iwork.js";
|
|
9
10
|
export { JsonConverter } from "./converters/json.js";
|
|
10
11
|
export { PdfConverter } from "./converters/pdf/index.js";
|
|
11
12
|
export { PlainTextConverter } from "./converters/plain-text.js";
|
package/dist/markit.js
CHANGED
|
@@ -8,6 +8,7 @@ import { GitHubConverter } from "./converters/github.js";
|
|
|
8
8
|
import { HtmlConverter } from "./converters/html.js";
|
|
9
9
|
import { ImageConverter } from "./converters/image.js";
|
|
10
10
|
import { IpynbConverter } from "./converters/ipynb.js";
|
|
11
|
+
import { IWorkConverter } from "./converters/iwork.js";
|
|
11
12
|
import { JsonConverter } from "./converters/json.js";
|
|
12
13
|
import { PdfConverter } from "./converters/pdf/index.js";
|
|
13
14
|
import { PlainTextConverter } from "./converters/plain-text.js";
|
|
@@ -33,6 +34,7 @@ export class Markit {
|
|
|
33
34
|
new XlsxConverter(),
|
|
34
35
|
new EpubConverter(),
|
|
35
36
|
new IpynbConverter(),
|
|
37
|
+
new IWorkConverter(),
|
|
36
38
|
new GitHubConverter(),
|
|
37
39
|
new WikipediaConverter(),
|
|
38
40
|
new RssConverter(),
|
package/dist/utils/turndown.js
CHANGED
|
@@ -51,8 +51,14 @@ export function createTurndown() {
|
|
|
51
51
|
* - Strip <p> tags inside <td>/<th> cells
|
|
52
52
|
*/
|
|
53
53
|
export function normalizeTablesHtml(html) {
|
|
54
|
-
// Strip <p> tags inside table cells
|
|
55
|
-
let result = html.replace(/<(td|th)([^>]*)
|
|
54
|
+
// Strip <p> tags inside table cells, joining multiple paragraphs with <br>
|
|
55
|
+
let result = html.replace(/<(td|th)([^>]*)>([\s\S]*?)<\/(td|th)>/gi, (_match, tag, attrs, inner, closeTag) => {
|
|
56
|
+
const stripped = inner
|
|
57
|
+
.replace(/^\s*<p>/i, "")
|
|
58
|
+
.replace(/<\/p>\s*$/i, "")
|
|
59
|
+
.replace(/<\/p>\s*<p>/gi, " ");
|
|
60
|
+
return `<${tag}${attrs}>${stripped}</${closeTag}>`;
|
|
61
|
+
});
|
|
56
62
|
// Add thead to tables that lack it
|
|
57
63
|
result = result.replace(/<table([^>]*)>\s*(?:<tbody>\s*)?(<tr[\s\S]*?<\/tr>)([\s\S]*?)<\/(?:tbody>\s*<\/)?table>/gi, (_match, attrs, firstRow, rest) => {
|
|
58
64
|
const theadRow = firstRow
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "markit-ai",
|
|
3
|
-
"version": "0.4.0",
|
|
3
|
+
"version": "0.5.1",
|
|
4
4
|
"description": "Convert anything to markdown. PDF, DOCX, PPTX, XLSX, HTML, EPUB, Jupyter, RSS, images, audio, URLs, and more. Pluggable converters, built-in LLM providers for image description and audio transcription. Works as a CLI and as a library.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|