markit-ai 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -23,6 +23,21 @@ const BUILTIN_FORMATS = [
23
23
  extensions: [".mp3", ".wav", ".m4a", ".flac"],
24
24
  builtin: true,
25
25
  },
26
+ {
27
+ name: "Pages",
28
+ extensions: [".pages"],
29
+ builtin: true,
30
+ },
31
+ {
32
+ name: "Keynote",
33
+ extensions: [".key"],
34
+ builtin: true,
35
+ },
36
+ {
37
+ name: "Numbers",
38
+ extensions: [".numbers"],
39
+ builtin: true,
40
+ },
26
41
  {
27
42
  name: "GitHub",
28
43
  extensions: ["github.com/*", "gist.github.com/*"],
@@ -0,0 +1,20 @@
1
+ import type { ConversionResult, Converter, StreamInfo } from "../types.js";
2
/**
 * Converter for Apple iWork documents (Pages, Keynote, Numbers) that
 * produces markdown.
 *
 * All three formats are ZIP archives containing an XML file:
 * - Pages: index.xml (sf:p paragraphs with named styles)
 * - Keynote: index.apxl (key:slide elements with sf:p text)
 * - Numbers: index.xml (sf:t text cells + sf:n number cells)
 */
export declare class IWorkConverter implements Converter {
    /** Converter identifier; the implementation initialises it to "iwork". */
    name: string;
    /**
     * Reports whether this converter handles the stream, based solely on
     * `streamInfo.extension` being `.pages`, `.key` or `.numbers`.
     */
    accepts(streamInfo: StreamInfo): boolean;
    /**
     * Opens `input` as a ZIP archive and converts it according to
     * `streamInfo.extension`. Rejects when the extension is not an iWork
     * extension or when the archive lacks the expected index file.
     */
    convert(input: Buffer, streamInfo: StreamInfo): Promise<ConversionResult>;
    private readIndex;
    private convertPages;
    private convertKeynote;
    private convertNumbers;
    private convertNumbersFallback;
    private extractGrid;
}
@@ -0,0 +1,391 @@
1
+ import { mkdirSync, writeFileSync } from "node:fs";
2
+ import { join } from "node:path";
3
+ import JSZip from "jszip";
4
/** File extensions (lower-case, with leading dot) handled by the iWork converter. */
const EXTENSIONS = [
    ".pages",
    ".key",
    ".numbers",
];
/** XML namespace URI that the `sf:` prefix is expanded to. */
const SF = "http://developer.apple.com/namespaces/sf";
/** XML namespace URI that the `sfa:` prefix is expanded to. */
const SFA = "http://developer.apple.com/namespaces/sfa";
/** XML namespace URI that the `key:` prefix (Keynote elements) is expanded to. */
const KEY = "http://developer.apple.com/namespaces/keynote2";
8
/**
 * Converts Apple iWork files (Pages, Keynote, Numbers) to markdown.
 *
 * Each format is read as a ZIP archive containing an XML index file:
 * - Pages: index.xml (sf:p paragraphs with named styles)
 * - Keynote: index.apxl (key:slide elements with sf:p text)
 * - Numbers: index.xml (sf:grid tables of sf:t / sf:n / sf:b / sf:d / sf:du / sf:e cells)
 */
export class IWorkConverter {
    name = "iwork";
    /**
     * @param streamInfo Stream description; only `extension` is inspected.
     * @returns true when the extension is `.pages`, `.key` or `.numbers`.
     */
    accepts(streamInfo) {
        if (streamInfo.extension && EXTENSIONS.includes(streamInfo.extension)) {
            return true;
        }
        return false;
    }
    /**
     * Loads `input` as a ZIP archive and dispatches on `streamInfo.extension`.
     *
     * @param input Raw file contents.
     * @param streamInfo Stream description (`extension`, optional `imageDir`).
     * @returns The conversion result (`markdown`, and `title` for Pages/Keynote).
     * @throws Error when the extension is not an iWork extension, or when the
     *   archive does not contain the expected index file.
     */
    async convert(input, streamInfo) {
        const zip = await JSZip.loadAsync(input);
        const ext = streamInfo.extension;
        if (ext === ".pages")
            return this.convertPages(zip, streamInfo);
        if (ext === ".key")
            return this.convertKeynote(zip, streamInfo);
        if (ext === ".numbers")
            return this.convertNumbers(zip);
        throw new Error(`Unsupported iWork format: ${ext}`);
    }
    // ---------------------------------------------------------------------------
    // Pages
    // ---------------------------------------------------------------------------
    /**
     * Emits one markdown block per non-empty sf:p paragraph, prefixed
     * according to its sf:style, followed by the archive's images.
     * The first non-empty paragraph becomes the document title.
     */
    async convertPages(zip, streamInfo) {
        const xml = await this.readIndex(zip, "index.xml");
        const root = parseXml(xml);
        const lines = [];
        let title;
        for (const p of iterAll(root, SF, "p")) {
            const text = collectText(p).trim();
            if (!text)
                continue;
            const prefix = paragraphPrefix(p.getAttribute("sf:style") || "");
            if (!title)
                title = text;
            // The caption prefix opens markdown emphasis; close it so the
            // output is `*caption*` rather than a dangling `*caption`.
            const suffix = prefix === "*" ? "*" : "";
            lines.push(`${prefix}${text}${suffix}`);
        }
        for (const imageLine of await extractImages(zip, streamInfo.imageDir)) {
            lines.push(imageLine);
        }
        return { markdown: lines.join("\n\n"), title };
    }
    // ---------------------------------------------------------------------------
    // Keynote
    // ---------------------------------------------------------------------------
    /**
     * Emits one section per key:slide: a `<!-- Slide N -->` marker, the first
     * non-empty paragraph as a `# ` heading and the rest as plain lines,
     * followed by the archive's images. The first heading found becomes the
     * document title.
     */
    async convertKeynote(zip, streamInfo) {
        const xml = await this.readIndex(zip, "index.apxl");
        const root = parseXml(xml);
        const sections = [];
        let title;
        const slides = [...iterAll(root, KEY, "slide")];
        for (let i = 0; i < slides.length; i++) {
            const slideLines = [`<!-- Slide ${i + 1} -->`];
            let isTitle = true;
            for (const p of iterAll(slides[i], SF, "p")) {
                const text = collectText(p).trim();
                if (!text)
                    continue;
                if (isTitle) {
                    slideLines.push(`# ${text}`);
                    if (!title)
                        title = text;
                    isTitle = false;
                }
                else {
                    slideLines.push(text);
                }
            }
            sections.push(slideLines.join("\n"));
        }
        for (const imageLine of await extractImages(zip, streamInfo.imageDir)) {
            sections.push(imageLine);
        }
        return { markdown: sections.join("\n\n"), title };
    }
    // ---------------------------------------------------------------------------
    // Numbers
    // ---------------------------------------------------------------------------
    /**
     * Renders every sf:grid as a markdown table (first row is the header).
     * When the document contains no sf:grid at all, falls back to a flat
     * list of text and number values.
     */
    async convertNumbers(zip) {
        const xml = await this.readIndex(zip, "index.xml");
        const root = parseXml(xml);
        // Find grid elements (tables)
        const grids = [...iterAll(root, SF, "grid")];
        if (grids.length === 0) {
            // Fallback: extract all text and number cells
            return this.convertNumbersFallback(root);
        }
        const sections = [];
        for (const grid of grids) {
            const rows = this.extractGrid(grid);
            if (rows.length === 0)
                continue;
            sections.push(renderMarkdownTable(rows));
        }
        return { markdown: sections.join("\n\n") };
    }
    /**
     * Reads the cells of the grid's first sf:datasource in document order and
     * lays them out in rows of `sf:numcols` cells.
     *
     * @returns Rows of cell strings; empty when there is no datasource or no
     *   recognised cell.
     */
    extractGrid(grid) {
        const datasource = findFirst(grid, SF, "datasource");
        if (!datasource)
            return [];
        // Get column count from grid attributes (raw attribute names)
        const numCols = Number.parseInt(grid.getAttribute("sf:numcols") || "0", 10);
        const allValues = [];
        for (const child of datasource.children) {
            const value = readCellValue(child);
            if (value !== null)
                allValues.push(value);
        }
        const totalCells = allValues.length;
        const rows = [];
        if (numCols > 0) {
            for (let i = 0; i < totalCells; i += numCols) {
                rows.push(allValues.slice(i, i + numCols));
            }
        }
        else if (totalCells > 0) {
            // Unknown width: everything lands in a single row.
            rows.push(allValues.slice());
        }
        // If the grid used default dimensions and produced only one row,
        // the data probably doesn't fill the full grid width. Fall back to
        // treating the cells as a 2-column key/value list or single column.
        if (rows.length <= 1 && totalCells > 0 && totalCells < numCols) {
            // Re-layout: try 2 columns if even, otherwise single column
            const cols = totalCells % 2 === 0 ? 2 : 1;
            const relaid = [];
            for (let i = 0; i < totalCells; i += cols) {
                relaid.push(allValues.slice(i, i + cols));
            }
            return relaid;
        }
        return rows;
    }
    /**
     * Flat fallback used when no sf:grid exists: all non-empty sf:t string
     * values first, then all non-empty sf:n values, one per line.
     */
    convertNumbersFallback(root) {
        const values = [];
        for (const t of iterAll(root, SF, "t")) {
            const ct = findFirst(t, SF, "ct");
            const val = ct?.getAttribute("sfa:s") || "";
            if (val)
                values.push(val);
        }
        for (const n of iterAll(root, SF, "n")) {
            const val = n.getAttribute("sf:v") || "";
            if (val)
                values.push(val);
        }
        return { markdown: values.join("\n") };
    }
    // ---------------------------------------------------------------------------
    // Helpers
    // ---------------------------------------------------------------------------
    /**
     * Reads an archive entry as a string.
     *
     * @throws Error when `filename` is not present in the archive.
     */
    async readIndex(zip, filename) {
        const file = zip.file(filename);
        if (!file) {
            throw new Error(`Invalid iWork file: missing ${filename}`);
        }
        return file.async("string");
    }
}
/** Archive entries with one of these extensions are treated as images. */
const IMAGE_ENTRY_RE = /\.(png|jpg|jpeg|gif|webp|tiff|bmp)$/i;
/**
 * Returns the last path segment of a ZIP entry name. Both `/` and `\` are
 * treated as separators so a backslash-separated entry name cannot make the
 * joined output path leave the image directory on Windows.
 */
function imageBaseName(entryName, index) {
    return entryName.split(/[\\/]/).pop() || `image_${index}`;
}
/**
 * Returns `baseName`, or `stem_2.ext`, `stem_3.ext`, ... when that name was
 * already handed out. Comparison is case-insensitive because the target
 * file system may be. The chosen name is recorded in `usedNames`.
 */
function uniqueImageName(baseName, usedNames) {
    let candidate = baseName;
    if (usedNames.has(candidate.toLowerCase())) {
        const dot = baseName.lastIndexOf(".");
        const stem = dot > 0 ? baseName.slice(0, dot) : baseName;
        const ext = dot > 0 ? baseName.slice(dot) : "";
        let n = 2;
        do {
            candidate = `${stem}_${n}${ext}`;
            n++;
        } while (usedNames.has(candidate.toLowerCase()));
    }
    usedNames.add(candidate.toLowerCase());
    return candidate;
}
/**
 * Collects the archive's images (everything outside `QuickLook/`).
 *
 * With an `imageDir`, the directory is created, every image is written
 * into it under a collision-free name and an `![name](path)` line is
 * returned per image. Without one, nothing is written and an
 * `<!-- image: name -->` placeholder is returned per image.
 */
async function extractImages(zip, imageDir) {
    if (imageDir)
        mkdirSync(imageDir, { recursive: true });
    const markdown = [];
    const usedNames = new Set();
    let imageCount = 0;
    for (const entryName of Object.keys(zip.files)) {
        if (!IMAGE_ENTRY_RE.test(entryName))
            continue;
        if (entryName.startsWith("QuickLook/"))
            continue;
        imageCount++;
        const baseName = imageBaseName(entryName, imageCount);
        if (!imageDir) {
            markdown.push(`<!-- image: ${baseName} -->`);
            continue;
        }
        const file = zip.file(entryName);
        if (!file)
            continue;
        // Two entries in different folders may share a basename; give the
        // later one a suffix instead of overwriting the earlier file.
        const imgName = uniqueImageName(baseName, usedNames);
        const filepath = join(imageDir, imgName);
        writeFileSync(filepath, await file.async("nodebuffer"));
        markdown.push(`![${imgName}](${filepath})`);
    }
    return markdown;
}
/**
 * Makes a cell value safe inside a markdown table row: `|` would end the
 * cell and a line break would end the row, so they become `\|` and `<br>`.
 */
function escapeTableCell(value) {
    return String(value)
        .replace(/\|/g, "\\|")
        .replace(/\r\n|\r|\n/g, "<br>");
}
/**
 * Renders rows as a markdown table. The first row is the header and
 * shorter rows are padded with empty cells to the widest row.
 */
function renderMarkdownTable(rows) {
    const width = rows.reduce((max, row) => Math.max(max, row.length), 0);
    const toLine = (row) => {
        const cells = row.map(escapeTableCell);
        while (cells.length < width)
            cells.push("");
        return `| ${cells.join(" | ")} |`;
    };
    const [header, ...body] = rows;
    const separator = `| ${Array.from({ length: width }, () => "---").join(" | ")} |`;
    return [toLine(header), separator, ...body.map(toLine)].join("\n");
}
/**
 * Returns the display string for one datasource child, or null when the
 * element is not a recognised cell type and must not occupy a column.
 */
function readCellValue(cell) {
    switch (cell.tagName) {
        case `${SF}:t`: {
            // Text cell
            const ct = findFirst(cell, SF, "ct");
            return ct?.getAttribute("sfa:s") || collectText(cell).trim();
        }
        case `${SF}:n`: // Number cell
        case `${SF}:d`: // Date cell
        case `${SF}:du`: // Duration cell
            return cell.getAttribute("sf:v") || "";
        case `${SF}:b`:
            // Boolean cell
            return cell.getAttribute("sf:v") === "1" ? "TRUE" : "FALSE";
        case `${SF}:e`:
            // Empty cell
            return "";
        default:
            return null;
    }
}
264
/** The five predefined XML entities. */
const XML_NAMED_ENTITIES = new Map([
    ["amp", "&"],
    ["lt", "<"],
    ["gt", ">"],
    ["quot", '"'],
    ["apos", "'"],
]);
/**
 * Replaces predefined entities (`&amp;` ...) and numeric character
 * references (`&#65;`, `&#x41;`) with the characters they stand for.
 * Unknown or out-of-range references are left untouched.
 */
function decodeXmlEntities(text) {
    if (!text.includes("&"))
        return text;
    return text.replace(/&(#x[0-9a-fA-F]+|#[0-9]+|[a-zA-Z]+);/g, (whole, ref) => {
        if (ref[0] !== "#") {
            return XML_NAMED_ENTITIES.get(ref) ?? whole;
        }
        const code = ref[1] === "x"
            ? Number.parseInt(ref.slice(2), 16)
            : Number.parseInt(ref.slice(1), 10);
        return code > 0 && code <= 0x10ffff ? String.fromCodePoint(code) : whole;
    });
}
/**
 * Decides whether a run of character data between two tags is kept.
 * Runs with visible characters always are. Whitespace-only runs are kept
 * only when they contain no line break, i.e. when they look like spacing
 * between inline elements rather than pretty-print indentation.
 */
function isSignificantText(text) {
    if (text === "")
        return false;
    return /\S/.test(text) || !/[\r\n]/.test(text);
}
/**
 * Minimal XML parser that expands the known Apple namespace prefixes in
 * tag names and extracts text content and attributes.
 *
 * Only element tags with double-quoted attributes are recognised; anything
 * else between tags (declarations, comments, CDATA markers) is treated as
 * character data. Text keeps its own leading and trailing whitespace so
 * words on either side of an inline element stay separated; callers trim
 * the collected result.
 *
 * @param xml XML source text.
 * @returns A synthetic element named "root" whose children are the
 *   document's top-level elements.
 */
function parseXml(xml) {
    const root = createElement("root");
    const stack = [root];
    // Match tags; text is whatever lies between two consecutive matches
    const tagRe = /<(\/?)([a-zA-Z0-9_:.-]+)((?:\s+[a-zA-Z0-9_:.-]+\s*=\s*"[^"]*")*)\s*(\/?)>/g;
    let lastIndex = 0;
    for (let match = tagRe.exec(xml); match !== null; match = tagRe.exec(xml)) {
        const [fullMatch, isClose, tagName, attrs, isSelfClose] = match;
        const textBefore = xml.slice(lastIndex, match.index);
        lastIndex = match.index + fullMatch.length;
        // Text before the first child is the element's `text`; text after a
        // child is that child's `tail`.
        if (isSignificantText(textBefore)) {
            const text = decodeXmlEntities(textBefore);
            const current = stack[stack.length - 1];
            const lastChild = current.children[current.children.length - 1];
            if (lastChild) {
                lastChild.tail += text;
            }
            else {
                current.text += text;
            }
        }
        if (isClose) {
            // Never pop the synthetic root: an unbalanced closing tag would
            // otherwise leave the stack empty and crash on the next tag.
            if (stack.length > 1)
                stack.pop();
        }
        else {
            const el = createElement(expandTag(tagName, xml));
            parseAttributes(attrs, el);
            stack[stack.length - 1].children.push(el);
            if (!isSelfClose) {
                stack.push(el);
            }
        }
    }
    return root;
}
/**
 * Creates an empty element node.
 *
 * @param tagName Tag name stored on the node as given.
 * @returns Node with `children`, `text` (content before the first child),
 *   `tail` (content following the node inside its parent), `attributes`
 *   and a `getAttribute(name)` accessor returning the value or null.
 */
function createElement(tagName) {
    return {
        tagName,
        children: [],
        text: "",
        tail: "",
        // Prototype-less map: attribute names such as "toString" or
        // "__proto__" must not resolve to Object.prototype members.
        attributes: Object.create(null),
        getAttribute(name) {
            return this.attributes[name] ?? null;
        },
    };
}
/**
 * Parses `name="value"` pairs out of a tag's attribute string and stores
 * them, entity-decoded, on `el.attributes` under their raw (prefixed) names.
 *
 * @param attrStr Attribute portion of a tag, e.g. ` sf:style="Title"`.
 * @param el Element whose `attributes` map receives the pairs.
 */
function parseAttributes(attrStr, el) {
    const attrRe = /([a-zA-Z0-9_:.-]+)\s*=\s*"([^"]*)"/g;
    for (let m = attrRe.exec(attrStr); m !== null; m = attrRe.exec(attrStr)) {
        el.attributes[m[1]] = decodeXmlEntities(m[2]);
    }
}
/**
 * Expand namespace prefix in tag name to full URI.
 * e.g. "sf:p" → "http://developer.apple.com/namespaces/sf:p"
 * For simplicity, only the known Apple namespaces are expanded; tags with
 * no prefix or an unknown prefix are returned unchanged.
 *
 * @param tag Tag name as written in the document.
 * @param _xml Unused; kept for signature compatibility.
 */
function expandTag(tag, _xml) {
    const colon = tag.indexOf(":");
    if (colon === -1)
        return tag;
    const nsMap = {
        sf: SF,
        sfa: SFA,
        sl: "http://developer.apple.com/namespaces/sl",
        key: KEY,
    };
    const prefix = tag.slice(0, colon);
    const local = tag.slice(colon + 1);
    // Own-property check so a prefix like "constructor" is not "found".
    const uri = Object.prototype.hasOwnProperty.call(nsMap, prefix) ? nsMap[prefix] : undefined;
    return uri ? `${uri}:${local}` : tag;
}
347
/**
 * Concatenates, in document order, the element's own text with the
 * collected text and tail of each of its children.
 */
function collectText(el) {
    return el.children.reduce((acc, child) => acc + collectText(child) + child.tail, el.text);
}
355
/**
 * Yields `el` and every descendant whose tag name equals
 * `${ns}:${localName}`, in document (pre-order) order.
 */
function* iterAll(el, ns, localName) {
    const wanted = `${ns}:${localName}`;
    // Explicit stack; children are pushed in reverse so they pop in order.
    const pending = [el];
    while (pending.length > 0) {
        const node = pending.pop();
        if (node.tagName === wanted)
            yield node;
        for (let i = node.children.length - 1; i >= 0; i--) {
            pending.push(node.children[i]);
        }
    }
}
363
/**
 * Returns the first element (in document order, `el` included) whose tag
 * name is `${ns}:${localName}`, or null when there is none.
 */
function findFirst(el, ns, localName) {
    const first = iterAll(el, ns, localName).next();
    return first.done ? null : first.value;
}
369
/**
 * Map iWork paragraph style names to markdown prefixes.
 *
 * Matching is case-insensitive and by substring, so the order of the
 * checks matters: "subtitle" contains "title" and must be tested first.
 *
 * @param style Value of the paragraph's style attribute (may be empty).
 * @returns "# " for titles, "## " for subtitles and heading 1, "### " to
 *   "##### " for headings 2-4, "*" for captions (opening emphasis marker
 *   only), and "" for everything else.
 */
function paragraphPrefix(style) {
    if (!style)
        return "";
    const lower = style.toLowerCase();
    // Must precede the "title" check, which would otherwise swallow it.
    if (lower.includes("subtitle"))
        return "## ";
    if (lower.includes("title"))
        return "# ";
    if (lower.includes("heading-1") || lower.includes("heading 1"))
        return "## ";
    if (lower.includes("heading-2") || lower.includes("heading 2"))
        return "### ";
    if (lower.includes("heading-3") || lower.includes("heading 3"))
        return "#### ";
    if (lower.includes("heading-4") || lower.includes("heading 4"))
        return "##### ";
    if (lower.includes("caption"))
        return "*";
    return "";
}
package/dist/index.d.ts CHANGED
@@ -7,6 +7,7 @@ export { GitHubConverter } from "./converters/github.js";
7
7
  export { HtmlConverter } from "./converters/html.js";
8
8
  export { ImageConverter } from "./converters/image.js";
9
9
  export { IpynbConverter } from "./converters/ipynb.js";
10
+ export { IWorkConverter } from "./converters/iwork.js";
10
11
  export { JsonConverter } from "./converters/json.js";
11
12
  export { PdfConverter } from "./converters/pdf/index.js";
12
13
  export { PlainTextConverter } from "./converters/plain-text.js";
package/dist/index.js CHANGED
@@ -6,6 +6,7 @@ export { GitHubConverter } from "./converters/github.js";
6
6
  export { HtmlConverter } from "./converters/html.js";
7
7
  export { ImageConverter } from "./converters/image.js";
8
8
  export { IpynbConverter } from "./converters/ipynb.js";
9
+ export { IWorkConverter } from "./converters/iwork.js";
9
10
  export { JsonConverter } from "./converters/json.js";
10
11
  export { PdfConverter } from "./converters/pdf/index.js";
11
12
  export { PlainTextConverter } from "./converters/plain-text.js";
package/dist/markit.js CHANGED
@@ -8,6 +8,7 @@ import { GitHubConverter } from "./converters/github.js";
8
8
  import { HtmlConverter } from "./converters/html.js";
9
9
  import { ImageConverter } from "./converters/image.js";
10
10
  import { IpynbConverter } from "./converters/ipynb.js";
11
+ import { IWorkConverter } from "./converters/iwork.js";
11
12
  import { JsonConverter } from "./converters/json.js";
12
13
  import { PdfConverter } from "./converters/pdf/index.js";
13
14
  import { PlainTextConverter } from "./converters/plain-text.js";
@@ -33,6 +34,7 @@ export class Markit {
33
34
  new XlsxConverter(),
34
35
  new EpubConverter(),
35
36
  new IpynbConverter(),
37
+ new IWorkConverter(),
36
38
  new GitHubConverter(),
37
39
  new WikipediaConverter(),
38
40
  new RssConverter(),
@@ -51,8 +51,14 @@ export function createTurndown() {
51
51
  * - Strip <p> tags inside <td>/<th> cells
52
52
  */
53
53
  export function normalizeTablesHtml(html) {
54
- // Strip <p> tags inside table cells
55
- let result = html.replace(/<(td|th)([^>]*)>\s*<p>([\s\S]*?)<\/p>\s*<\/(td|th)>/gi, "<$1$2>$3</$4>");
54
+ // Strip <p> tags inside table cells, joining multiple paragraphs with a space
55
+ let result = html.replace(/<(td|th)([^>]*)>([\s\S]*?)<\/(td|th)>/gi, (_match, tag, attrs, inner, closeTag) => {
56
+ const stripped = inner
57
+ .replace(/^\s*<p>/i, "")
58
+ .replace(/<\/p>\s*$/i, "")
59
+ .replace(/<\/p>\s*<p>/gi, " ");
60
+ return `<${tag}${attrs}>${stripped}</${closeTag}>`;
61
+ });
56
62
  // Add thead to tables that lack it
57
63
  result = result.replace(/<table([^>]*)>\s*(?:<tbody>\s*)?(<tr[\s\S]*?<\/tr>)([\s\S]*?)<\/(?:tbody>\s*<\/)?table>/gi, (_match, attrs, firstRow, rest) => {
58
64
  const theadRow = firstRow
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "markit-ai",
3
- "version": "0.4.0",
3
+ "version": "0.5.0",
4
4
  "description": "Convert anything to markdown. PDF, DOCX, PPTX, XLSX, HTML, EPUB, Jupyter, RSS, images, audio, URLs, and more. Pluggable converters, built-in LLM providers for image description and audio transcription. Works as a CLI and as a library.",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",