markit-ai 0.3.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/convert.d.ts +1 -0
- package/dist/commands/convert.js +7 -3
- package/dist/commands/formats.js +20 -0
- package/dist/converters/docx.d.ts +1 -1
- package/dist/converters/docx.js +35 -3
- package/dist/converters/epub.js +1 -0
- package/dist/converters/github.d.ts +18 -0
- package/dist/converters/github.js +148 -0
- package/dist/converters/iwork.d.ts +20 -0
- package/dist/converters/iwork.js +391 -0
- package/dist/converters/pptx.d.ts +1 -1
- package/dist/converters/pptx.js +74 -1
- package/dist/converters/xlsx.js +1 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +2 -0
- package/dist/main.js +4 -1
- package/dist/markit.d.ts +1 -1
- package/dist/markit.js +20 -3
- package/dist/types.d.ts +6 -0
- package/dist/utils/turndown.js +8 -2
- package/package.json +1 -1
package/dist/commands/convert.js
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
|
-
import { writeFileSync } from "node:fs";
|
|
1
|
+
import { mkdtempSync, writeFileSync } from "node:fs";
|
|
2
|
+
import { tmpdir } from "node:os";
|
|
3
|
+
import { join } from "node:path";
|
|
2
4
|
import { loadConfig } from "../config.js";
|
|
3
5
|
import { Markit } from "../markit.js";
|
|
4
6
|
import { loadAllPlugins } from "../plugins/loader.js";
|
|
@@ -23,6 +25,8 @@ export async function convert(source, options) {
|
|
|
23
25
|
}
|
|
24
26
|
const llmFunctions = createLlmFunctions(config, options.prompt);
|
|
25
27
|
const markit = new Markit(llmFunctions, plugins);
|
|
28
|
+
// Auto-create a temp dir for images if not explicitly provided
|
|
29
|
+
const imageDir = options.imageDir || mkdtempSync(join(tmpdir(), "markit-images-"));
|
|
26
30
|
try {
|
|
27
31
|
let result;
|
|
28
32
|
const isStdin = source === "-";
|
|
@@ -36,7 +40,7 @@ export async function convert(source, options) {
|
|
|
36
40
|
process.exit(EXIT_ERROR);
|
|
37
41
|
}
|
|
38
42
|
const buffer = await readStdin();
|
|
39
|
-
result = await markit.convert(buffer, {});
|
|
43
|
+
result = await markit.convert(buffer, { imageDir });
|
|
40
44
|
}
|
|
41
45
|
else if (isUrl) {
|
|
42
46
|
// Progress hint for URL fetches (stderr so it doesn't pollute piped output)
|
|
@@ -46,7 +50,7 @@ export async function convert(source, options) {
|
|
|
46
50
|
result = await markit.convertUrl(source);
|
|
47
51
|
}
|
|
48
52
|
else {
|
|
49
|
-
result = await markit.convertFile(source);
|
|
53
|
+
result = await markit.convertFile(source, { imageDir });
|
|
50
54
|
}
|
|
51
55
|
const label = isStdin ? "stdin" : source;
|
|
52
56
|
// Write to file or stdout
|
package/dist/commands/formats.js
CHANGED
|
@@ -23,6 +23,26 @@ const BUILTIN_FORMATS = [
|
|
|
23
23
|
extensions: [".mp3", ".wav", ".m4a", ".flac"],
|
|
24
24
|
builtin: true,
|
|
25
25
|
},
|
|
26
|
+
{
|
|
27
|
+
name: "Pages",
|
|
28
|
+
extensions: [".pages"],
|
|
29
|
+
builtin: true,
|
|
30
|
+
},
|
|
31
|
+
{
|
|
32
|
+
name: "Keynote",
|
|
33
|
+
extensions: [".key"],
|
|
34
|
+
builtin: true,
|
|
35
|
+
},
|
|
36
|
+
{
|
|
37
|
+
name: "Numbers",
|
|
38
|
+
extensions: [".numbers"],
|
|
39
|
+
builtin: true,
|
|
40
|
+
},
|
|
41
|
+
{
|
|
42
|
+
name: "GitHub",
|
|
43
|
+
extensions: ["github.com/*", "gist.github.com/*"],
|
|
44
|
+
builtin: true,
|
|
45
|
+
},
|
|
26
46
|
{ name: "ZIP", extensions: [".zip"], builtin: true },
|
|
27
47
|
{
|
|
28
48
|
name: "Plain text",
|
|
@@ -2,5 +2,5 @@ import type { ConversionResult, Converter, StreamInfo } from "../types.js";
|
|
|
2
2
|
export declare class DocxConverter implements Converter {
|
|
3
3
|
name: string;
|
|
4
4
|
accepts(streamInfo: StreamInfo): boolean;
|
|
5
|
-
convert(input: Buffer,
|
|
5
|
+
convert(input: Buffer, streamInfo: StreamInfo): Promise<ConversionResult>;
|
|
6
6
|
}
|
package/dist/converters/docx.js
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import { mkdirSync, writeFileSync } from "node:fs";
|
|
2
|
+
import { join } from "node:path";
|
|
1
3
|
import mammoth from "mammoth";
|
|
2
4
|
import { createTurndown, normalizeTablesHtml } from "../utils/turndown.js";
|
|
3
5
|
const EXTENSIONS = [".docx"];
|
|
@@ -16,10 +18,40 @@ export class DocxConverter {
|
|
|
16
18
|
}
|
|
17
19
|
return false;
|
|
18
20
|
}
|
|
19
|
-
async convert(input,
|
|
20
|
-
const
|
|
21
|
+
async convert(input, streamInfo) {
|
|
22
|
+
const imageDir = streamInfo.imageDir;
|
|
23
|
+
if (imageDir) {
|
|
24
|
+
mkdirSync(imageDir, { recursive: true });
|
|
25
|
+
}
|
|
26
|
+
let imageCount = 0;
|
|
27
|
+
const convertImage = imageDir
|
|
28
|
+
? mammoth.images.imgElement((image) => {
|
|
29
|
+
imageCount++;
|
|
30
|
+
const ext = (image.contentType?.split("/")[1] || "png").replace("jpeg", "jpg");
|
|
31
|
+
const filename = `image_${imageCount}.${ext}`;
|
|
32
|
+
const filepath = join(imageDir, filename);
|
|
33
|
+
return image.read("base64").then((base64) => {
|
|
34
|
+
writeFileSync(filepath, Buffer.from(base64, "base64"));
|
|
35
|
+
return { src: filepath, alt: `image_${imageCount}` };
|
|
36
|
+
});
|
|
37
|
+
})
|
|
38
|
+
: mammoth.images.imgElement((image) => {
|
|
39
|
+
imageCount++;
|
|
40
|
+
const contentType = image.contentType || "image/png";
|
|
41
|
+
return image.read("base64").then((base64) => {
|
|
42
|
+
return {
|
|
43
|
+
src: `data:${contentType};base64,${base64.slice(0, 0)}`,
|
|
44
|
+
alt: `image_${imageCount}`,
|
|
45
|
+
};
|
|
46
|
+
});
|
|
47
|
+
});
|
|
48
|
+
const { value: html } = await mammoth.convertToHtml({ buffer: input }, { convertImage });
|
|
21
49
|
const turndown = createTurndown();
|
|
22
|
-
|
|
50
|
+
let markdown = turndown.turndown(normalizeTablesHtml(html));
|
|
51
|
+
// Replace data URI images with comment placeholders when no imageDir
|
|
52
|
+
if (!imageDir) {
|
|
53
|
+
markdown = markdown.replace(/!\[([^\]]*)\]\(data:[^)]*\)/g, "<!-- image: $1 -->");
|
|
54
|
+
}
|
|
23
55
|
return { markdown: markdown.trim() };
|
|
24
56
|
}
|
|
25
57
|
}
|
package/dist/converters/epub.js
CHANGED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import type { ConversionResult, Converter, MarkitOptions, StreamInfo } from "../types.js";
|
|
2
|
+
/**
|
|
3
|
+
* Matches GitHub URLs and fetches clean markdown content directly
|
|
4
|
+
* from raw endpoints or the GitHub API — no HTML scraping needed.
|
|
5
|
+
*
|
|
6
|
+
* Supported patterns:
|
|
7
|
+
* - Repos: github.com/owner/repo → raw README.md
|
|
8
|
+
* - Files: github.com/owner/repo/blob/… → raw file content
|
|
9
|
+
* - Gists: gist.github.com/owner/id → raw gist content
|
|
10
|
+
* - Issues: github.com/owner/repo/issues/N → API (title + body)
|
|
11
|
+
* - PRs: github.com/owner/repo/pull/N → API (title + body)
|
|
12
|
+
*/
|
|
13
|
+
export declare class GitHubConverter implements Converter {
|
|
14
|
+
/** Converter identifier; the implementation sets it to "github". */
name: string;
|
|
15
|
+
/**
 * Returns true when `streamInfo.url` parses as a URL whose hostname is
 * github.com, www.github.com or gist.github.com; false otherwise
 * (including a missing or unparsable URL).
 */
accepts(streamInfo: StreamInfo): boolean;
|
|
16
|
+
/**
 * Fetches markdown for a gist, a `blob` file, an issue/PR, or a bare
 * `owner/repo` URL (README). Rejects with an Error for any other GitHub
 * URL shape. `_options` is accepted but not used by the implementation.
 */
convertUrl(url: string, _options?: MarkitOptions): Promise<ConversionResult>;
|
|
17
|
+
/**
 * Ignores `_input` and delegates to `convertUrl(streamInfo.url)`;
 * rejects when `streamInfo.url` is not set.
 */
convert(_input: Buffer, streamInfo: StreamInfo): Promise<ConversionResult>;
|
|
18
|
+
}
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
const GITHUB_HOSTS = new Set([
|
|
2
|
+
"github.com",
|
|
3
|
+
"www.github.com",
|
|
4
|
+
"gist.github.com",
|
|
5
|
+
]);
|
|
6
|
+
/**
|
|
7
|
+
* Matches GitHub URLs and fetches clean markdown content directly
|
|
8
|
+
* from raw endpoints or the GitHub API — no HTML scraping needed.
|
|
9
|
+
*
|
|
10
|
+
* Supported patterns:
|
|
11
|
+
* - Repos: github.com/owner/repo → raw README.md
|
|
12
|
+
* - Files: github.com/owner/repo/blob/… → raw file content
|
|
13
|
+
* - Gists: gist.github.com/owner/id → raw gist content
|
|
14
|
+
* - Issues: github.com/owner/repo/issues/N → API (title + body)
|
|
15
|
+
* - PRs: github.com/owner/repo/pull/N → API (title + body)
|
|
16
|
+
*/
|
|
17
|
+
export class GitHubConverter {
|
|
18
|
+
name = "github";
|
|
19
|
+
accepts(streamInfo) {
|
|
20
|
+
if (!streamInfo.url)
|
|
21
|
+
return false;
|
|
22
|
+
try {
|
|
23
|
+
const { hostname } = new URL(streamInfo.url);
|
|
24
|
+
return GITHUB_HOSTS.has(hostname);
|
|
25
|
+
}
|
|
26
|
+
catch {
|
|
27
|
+
return false;
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
async convertUrl(url, _options) {
|
|
31
|
+
const parsed = new URL(url);
|
|
32
|
+
if (parsed.hostname === "gist.github.com") {
|
|
33
|
+
return fetchGist(parsed);
|
|
34
|
+
}
|
|
35
|
+
const segments = parsed.pathname.split("/").filter(Boolean);
|
|
36
|
+
// Need at least owner/repo
|
|
37
|
+
if (segments.length < 2) {
|
|
38
|
+
throw new Error(`Unsupported GitHub URL: ${url}`);
|
|
39
|
+
}
|
|
40
|
+
const [owner, repo, type, ...rest] = segments;
|
|
41
|
+
// github.com/owner/repo/blob/ref/path → raw file
|
|
42
|
+
if (type === "blob" && rest.length >= 2) {
|
|
43
|
+
const ref = rest[0];
|
|
44
|
+
const filePath = rest.slice(1).join("/");
|
|
45
|
+
return fetchRawFile(owner, repo, ref, filePath);
|
|
46
|
+
}
|
|
47
|
+
// github.com/owner/repo/issues/N or /pull/N
|
|
48
|
+
if ((type === "issues" || type === "pull") && rest[0]) {
|
|
49
|
+
const number = Number.parseInt(rest[0], 10);
|
|
50
|
+
if (!Number.isNaN(number)) {
|
|
51
|
+
return fetchIssueOrPr(owner, repo, number);
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
// github.com/owner/repo (no subpath or tree/wiki/etc) → README
|
|
55
|
+
if (!type) {
|
|
56
|
+
return fetchReadme(owner, repo);
|
|
57
|
+
}
|
|
58
|
+
throw new Error(`Unsupported GitHub URL pattern: ${url}`);
|
|
59
|
+
}
|
|
60
|
+
async convert(_input, streamInfo) {
|
|
61
|
+
// GitHub URLs are handled entirely via convertUrl.
|
|
62
|
+
// If we end up here, the URL was already fetched by the default path —
|
|
63
|
+
// just delegate to convertUrl.
|
|
64
|
+
if (streamInfo.url) {
|
|
65
|
+
return this.convertUrl(streamInfo.url);
|
|
66
|
+
}
|
|
67
|
+
throw new Error("GitHub converter requires a URL");
|
|
68
|
+
}
|
|
69
|
+
}
|
|
70
|
+
// ---------------------------------------------------------------------------
|
|
71
|
+
// Fetchers
|
|
72
|
+
// ---------------------------------------------------------------------------
|
|
73
|
+
async function fetchReadme(owner, repo) {
|
|
74
|
+
const url = `https://raw.githubusercontent.com/${owner}/${repo}/HEAD/README.md`;
|
|
75
|
+
const res = await fetch(url);
|
|
76
|
+
if (!res.ok) {
|
|
77
|
+
throw new Error(`Failed to fetch README: ${res.status} ${res.statusText}`);
|
|
78
|
+
}
|
|
79
|
+
const markdown = (await res.text()).trim();
|
|
80
|
+
const title = extractFirstHeading(markdown) ?? `${owner}/${repo}`;
|
|
81
|
+
return { markdown, title };
|
|
82
|
+
}
|
|
83
|
+
async function fetchRawFile(owner, repo, ref, filePath) {
|
|
84
|
+
const url = `https://raw.githubusercontent.com/${owner}/${repo}/${ref}/${filePath}`;
|
|
85
|
+
const res = await fetch(url);
|
|
86
|
+
if (!res.ok) {
|
|
87
|
+
throw new Error(`Failed to fetch file: ${res.status} ${res.statusText}`);
|
|
88
|
+
}
|
|
89
|
+
const content = (await res.text()).trim();
|
|
90
|
+
const filename = filePath.split("/").pop() ?? filePath;
|
|
91
|
+
// If it's markdown, return as-is. Otherwise wrap in a code block.
|
|
92
|
+
if (filePath.endsWith(".md") || filePath.endsWith(".mdx")) {
|
|
93
|
+
const title = extractFirstHeading(content) ?? filename;
|
|
94
|
+
return { markdown: content, title };
|
|
95
|
+
}
|
|
96
|
+
const ext = filename.includes(".") ? filename.split(".").pop() : "";
|
|
97
|
+
const markdown = `# ${filename}\n\n\`\`\`${ext}\n${content}\n\`\`\``;
|
|
98
|
+
return { markdown, title: filename };
|
|
99
|
+
}
|
|
100
|
+
async function fetchGist(parsed) {
|
|
101
|
+
const segments = parsed.pathname.split("/").filter(Boolean);
|
|
102
|
+
// gist.github.com/owner/id
|
|
103
|
+
const [owner, id] = segments;
|
|
104
|
+
if (!owner || !id) {
|
|
105
|
+
throw new Error(`Unsupported gist URL: ${parsed.href}`);
|
|
106
|
+
}
|
|
107
|
+
const url = `https://gist.githubusercontent.com/${owner}/${id}/raw`;
|
|
108
|
+
const res = await fetch(url);
|
|
109
|
+
if (!res.ok) {
|
|
110
|
+
throw new Error(`Failed to fetch gist: ${res.status} ${res.statusText}`);
|
|
111
|
+
}
|
|
112
|
+
const content = (await res.text()).trim();
|
|
113
|
+
const title = `gist:${id}`;
|
|
114
|
+
return { markdown: content, title };
|
|
115
|
+
}
|
|
116
|
+
async function fetchIssueOrPr(owner, repo, number) {
|
|
117
|
+
const url = `https://api.github.com/repos/${owner}/${repo}/issues/${number}`;
|
|
118
|
+
const res = await fetch(url, {
|
|
119
|
+
headers: { Accept: "application/vnd.github.v3+json" },
|
|
120
|
+
});
|
|
121
|
+
if (!res.ok) {
|
|
122
|
+
throw new Error(`Failed to fetch issue/PR: ${res.status} ${res.statusText}`);
|
|
123
|
+
}
|
|
124
|
+
const data = (await res.json());
|
|
125
|
+
const title = data.title ?? `#${number}`;
|
|
126
|
+
const parts = [`# ${title}`];
|
|
127
|
+
// Metadata line
|
|
128
|
+
const meta = [];
|
|
129
|
+
if (data.user?.login)
|
|
130
|
+
meta.push(`@${data.user.login}`);
|
|
131
|
+
if (data.state)
|
|
132
|
+
meta.push(data.state);
|
|
133
|
+
if (data.labels?.length) {
|
|
134
|
+
meta.push(data.labels.map((l) => l.name).join(", "));
|
|
135
|
+
}
|
|
136
|
+
if (meta.length > 0)
|
|
137
|
+
parts.push(meta.join(" · "));
|
|
138
|
+
if (data.body?.trim())
|
|
139
|
+
parts.push(data.body.trim());
|
|
140
|
+
return { markdown: parts.join("\n\n"), title };
|
|
141
|
+
}
|
|
142
|
+
// ---------------------------------------------------------------------------
|
|
143
|
+
// Helpers
|
|
144
|
+
// ---------------------------------------------------------------------------
|
|
145
|
+
/**
 * Return the trimmed text of the first `# ` (level-one) heading line in the
 * given markdown, or undefined when no such line exists.
 */
function extractFirstHeading(markdown) {
    const found = markdown.match(/^#\s+(.+)$/m);
    if (!found) {
        return undefined;
    }
    const [, headingText] = found;
    return headingText?.trim();
}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import type { ConversionResult, Converter, StreamInfo } from "../types.js";
|
|
2
|
+
/**
|
|
3
|
+
* Converts Apple iWork files (Pages, Keynote, Numbers) to markdown.
|
|
4
|
+
*
|
|
5
|
+
* All three formats are ZIP archives containing an XML file:
|
|
6
|
+
* - Pages: index.xml (sf:p paragraphs with named styles)
|
|
7
|
+
* - Keynote: index.apxl (key:slide elements with sf:p text)
|
|
8
|
+
* - Numbers: index.xml (sf:t text cells + sf:n number cells)
|
|
9
|
+
*/
|
|
10
|
+
export declare class IWorkConverter implements Converter {
|
|
11
|
+
/** Converter identifier; the implementation sets it to "iwork". */
name: string;
|
|
12
|
+
/** Returns true when `streamInfo.extension` is ".pages", ".key" or ".numbers". */
accepts(streamInfo: StreamInfo): boolean;
|
|
13
|
+
/**
 * Opens `input` as a ZIP archive and dispatches on `streamInfo.extension`;
 * rejects with an Error for any other extension.
 */
convert(input: Buffer, streamInfo: StreamInfo): Promise<ConversionResult>;
|
|
14
|
+
/** Pages: renders the sf:p paragraphs of index.xml, then lists archive images. */
private convertPages;
|
|
15
|
+
/** Keynote: renders each key:slide of index.apxl, then lists archive images. */
private convertKeynote;
|
|
16
|
+
/** Numbers: renders each sf:grid of index.xml as a markdown table. */
private convertNumbers;
|
|
17
|
+
/** Reads one grid's datasource cells into rows of strings. */
private extractGrid;
|
|
18
|
+
/** Used when no sf:grid exists: lists text cell values, then number cell values. */
private convertNumbersFallback;
|
|
19
|
+
/** Reads a named entry from the archive as a string; throws when it is missing. */
private readIndex;
|
|
20
|
+
}
|
|
@@ -0,0 +1,391 @@
|
|
|
1
|
+
import { mkdirSync, writeFileSync } from "node:fs";
|
|
2
|
+
import { join } from "node:path";
|
|
3
|
+
import JSZip from "jszip";
|
|
4
|
+
const EXTENSIONS = [".pages", ".key", ".numbers"];
|
|
5
|
+
const SF = "http://developer.apple.com/namespaces/sf";
|
|
6
|
+
const SFA = "http://developer.apple.com/namespaces/sfa";
|
|
7
|
+
const KEY = "http://developer.apple.com/namespaces/keynote2";
|
|
8
|
+
/** Archive entries with these extensions are treated as embedded images. */
const IWORK_IMAGE_RE = /\.(png|jpg|jpeg|gif|webp|tiff|bmp)$/i;
/**
 * Produce one markdown snippet per image entry in an iWork archive.
 *
 * Entries under `QuickLook/` are skipped. With an `imageDir`, each image is
 * written there under its base name and referenced as `![name](path)`;
 * without one, an `<!-- image: name -->` placeholder is emitted instead.
 *
 * @param zip - Loaded JSZip archive.
 * @param imageDir - Target directory (must already exist) or undefined.
 * @returns Markdown snippets in archive order.
 */
async function extractArchiveImages(zip, imageDir) {
    const snippets = [];
    let imageCount = 0;
    for (const name of Object.keys(zip.files)) {
        if (!IWORK_IMAGE_RE.test(name) || name.startsWith("QuickLook/"))
            continue;
        imageCount++;
        const imgName = name.split("/").pop() || `image_${imageCount}`;
        if (!imageDir) {
            snippets.push(`<!-- image: ${imgName} -->`);
            continue;
        }
        const file = zip.file(name);
        if (!file)
            continue;
        const filepath = join(imageDir, imgName);
        writeFileSync(filepath, await file.async("nodebuffer"));
        // Reference the written file; previously an empty string was pushed,
        // leaving blank entries in the output instead of an image link.
        snippets.push(`![${imgName}](${filepath})`);
    }
    return snippets;
}
/**
 * Read the display value of one grid datasource child.
 *
 * @returns The cell text, or undefined for elements that are not cells.
 */
function readGridCell(cell) {
    switch (cell.tagName) {
        case `${SF}:t`: {
            // Text cell: prefer the sfa:s attribute of its sf:ct child.
            const ct = findFirst(cell, SF, "ct");
            return ct?.getAttribute("sfa:s") || collectText(cell).trim();
        }
        case `${SF}:n`: // Number cell
        case `${SF}:d`: // Date cell
        case `${SF}:du`: // Duration cell
            return cell.getAttribute("sf:v") || "";
        case `${SF}:b`: // Boolean cell
            return cell.getAttribute("sf:v") === "1" ? "TRUE" : "FALSE";
        case `${SF}:e`: // Empty cell
            return "";
        default:
            return undefined;
    }
}
/** Keep a value inside one table cell: escape pipes and flatten newlines. */
function escapeTableCell(value) {
    return value.replace(/\|/g, "\\|").replace(/\r?\n/g, " ");
}
/**
 * Converts Apple iWork files (Pages, Keynote, Numbers) to markdown.
 *
 * All three formats are ZIP archives containing an XML file:
 *   - Pages:   index.xml   (sf:p paragraphs with named styles)
 *   - Keynote: index.apxl  (key:slide elements with sf:p text)
 *   - Numbers: index.xml   (sf:t text cells + sf:n number cells)
 */
export class IWorkConverter {
    name = "iwork";
    /** True when the stream's extension is one of the iWork extensions. */
    accepts(streamInfo) {
        if (streamInfo.extension && EXTENSIONS.includes(streamInfo.extension)) {
            return true;
        }
        return false;
    }
    /**
     * Load the archive and dispatch on the file extension.
     *
     * @param input - Raw archive bytes.
     * @param streamInfo - Supplies `extension` and, optionally, `imageDir`.
     * @throws Error for an extension other than .pages, .key or .numbers.
     */
    async convert(input, streamInfo) {
        const zip = await JSZip.loadAsync(input);
        const ext = streamInfo.extension;
        if (ext === ".pages")
            return this.convertPages(zip, streamInfo);
        if (ext === ".key")
            return this.convertKeynote(zip, streamInfo);
        if (ext === ".numbers")
            return this.convertNumbers(zip);
        throw new Error(`Unsupported iWork format: ${ext}`);
    }
    // ---------------------------------------------------------------------------
    // Pages
    // ---------------------------------------------------------------------------
    /**
     * Render every non-empty sf:p paragraph, prefixed according to its
     * `sf:style`, followed by the archive's images. The first non-empty
     * paragraph becomes the title.
     */
    async convertPages(zip, streamInfo) {
        const xml = await this.readIndex(zip, "index.xml");
        const root = parseXml(xml);
        const imageDir = streamInfo.imageDir;
        if (imageDir)
            mkdirSync(imageDir, { recursive: true });
        const lines = [];
        let title;
        for (const p of iterAll(root, SF, "p")) {
            const text = collectText(p).trim();
            if (!text)
                continue;
            const style = p.getAttribute("sf:style") || "";
            if (!title)
                title = text;
            lines.push(`${paragraphPrefix(style)}${text}`);
        }
        lines.push(...(await extractArchiveImages(zip, imageDir)));
        return { markdown: lines.join("\n\n"), title };
    }
    // ---------------------------------------------------------------------------
    // Keynote
    // ---------------------------------------------------------------------------
    /**
     * Render each key:slide as a section: a `<!-- Slide N -->` marker, the
     * first non-empty paragraph as a heading, remaining paragraphs as plain
     * lines. Archive images are appended after all slides. The first slide
     * heading becomes the title.
     */
    async convertKeynote(zip, streamInfo) {
        const xml = await this.readIndex(zip, "index.apxl");
        const root = parseXml(xml);
        const imageDir = streamInfo.imageDir;
        if (imageDir)
            mkdirSync(imageDir, { recursive: true });
        const sections = [];
        let title;
        let slideNumber = 0;
        for (const slide of iterAll(root, KEY, "slide")) {
            slideNumber++;
            const slideLines = [`<!-- Slide ${slideNumber} -->`];
            let isTitle = true;
            for (const p of iterAll(slide, SF, "p")) {
                const text = collectText(p).trim();
                if (!text)
                    continue;
                if (!isTitle) {
                    slideLines.push(text);
                    continue;
                }
                slideLines.push(`# ${text}`);
                if (!title)
                    title = text;
                isTitle = false;
            }
            sections.push(slideLines.join("\n"));
        }
        sections.push(...(await extractArchiveImages(zip, imageDir)));
        return { markdown: sections.join("\n\n"), title };
    }
    // ---------------------------------------------------------------------------
    // Numbers
    // ---------------------------------------------------------------------------
    /**
     * Render each sf:grid as a markdown table whose first row is the header.
     * Short rows are padded with empty cells. When the document has no grid
     * at all, falls back to a flat list of cell values.
     */
    async convertNumbers(zip) {
        const xml = await this.readIndex(zip, "index.xml");
        const root = parseXml(xml);
        const grids = [...iterAll(root, SF, "grid")];
        if (grids.length === 0) {
            // Fallback: extract all text and number cells
            return this.convertNumbersFallback(root);
        }
        const renderRow = (cells) => `| ${cells.map(escapeTableCell).join(" | ")} |`;
        const sections = [];
        for (const grid of grids) {
            const rows = this.extractGrid(grid);
            if (rows.length === 0)
                continue;
            // reduce instead of Math.max(...spread): safe for very long sheets.
            const maxCols = rows.reduce((widest, row) => Math.max(widest, row.length), 0);
            for (const row of rows) {
                while (row.length < maxCols)
                    row.push("");
            }
            const [header, ...body] = rows;
            const lines = [
                renderRow(header),
                `| ${header.map(() => "---").join(" | ")} |`,
                ...body.map(renderRow),
            ];
            sections.push(lines.join("\n"));
        }
        return { markdown: sections.join("\n\n") };
    }
    /**
     * Read a grid's datasource into rows, breaking after every
     * `sf:numcols` cells.
     *
     * If that yields at most one row and fewer cells than the declared
     * width, the data probably does not fill the grid; the cells are then
     * re-laid out as two columns (even count) or one column (odd count).
     *
     * @returns Rows of cell strings; empty when the grid has no datasource.
     */
    extractGrid(grid) {
        const datasource = findFirst(grid, SF, "datasource");
        if (!datasource)
            return [];
        // Get column count from grid attributes (raw attribute names)
        const numCols = Number.parseInt(grid.getAttribute("sf:numcols") || "0", 10);
        const rows = [];
        const allValues = [];
        let currentRow = [];
        for (const child of datasource.children) {
            const value = readGridCell(child);
            if (value === undefined)
                continue;
            currentRow.push(value);
            allValues.push(value);
            if (numCols > 0 && currentRow.length >= numCols) {
                rows.push(currentRow);
                currentRow = [];
            }
        }
        if (currentRow.length > 0)
            rows.push(currentRow);
        const totalCells = allValues.length;
        if (rows.length <= 1 && totalCells > 0 && totalCells < numCols) {
            // Re-layout: try 2 columns if even, otherwise single column
            const cols = totalCells % 2 === 0 ? 2 : 1;
            const relaid = [];
            for (let i = 0; i < totalCells; i += cols) {
                relaid.push(allValues.slice(i, i + cols));
            }
            return relaid;
        }
        return rows;
    }
    /**
     * Gridless fallback: one line per non-empty text cell (sfa:s of its
     * sf:ct child), followed by one line per non-empty number cell (sf:v).
     */
    convertNumbersFallback(root) {
        const values = [];
        for (const t of iterAll(root, SF, "t")) {
            const val = findFirst(t, SF, "ct")?.getAttribute("sfa:s") || "";
            if (val)
                values.push(val);
        }
        for (const n of iterAll(root, SF, "n")) {
            const val = n.getAttribute("sf:v") || "";
            if (val)
                values.push(val);
        }
        return { markdown: values.join("\n") };
    }
    // ---------------------------------------------------------------------------
    // Helpers
    // ---------------------------------------------------------------------------
    /**
     * Read an archive entry as a string.
     *
     * @throws Error when the entry does not exist.
     */
    async readIndex(zip, filename) {
        const file = zip.file(filename);
        if (!file) {
            throw new Error(`Invalid iWork file: missing ${filename}`);
        }
        return file.async("string");
    }
}
|
|
264
|
+
/**
|
|
265
|
+
* Minimal XML parser that preserves namespace prefixes in tag names
|
|
266
|
+
* and extracts text content and attributes.
|
|
267
|
+
*/
|
|
268
|
+
function parseXml(xml) {
|
|
269
|
+
// Use a simple recursive descent approach
|
|
270
|
+
const root = createElement("root");
|
|
271
|
+
const stack = [root];
|
|
272
|
+
// Match tags and text
|
|
273
|
+
const tagRe = /<(\/?)([a-zA-Z0-9_:.-]+)((?:\s+[a-zA-Z0-9_:.-]+\s*=\s*"[^"]*")*)\s*(\/?)>/g;
|
|
274
|
+
let lastIndex = 0;
|
|
275
|
+
let match = tagRe.exec(xml);
|
|
276
|
+
while (match !== null) {
|
|
277
|
+
const [fullMatch, isClose, tagName, attrs, isSelfClose] = match;
|
|
278
|
+
const textBefore = xml.slice(lastIndex, match.index);
|
|
279
|
+
lastIndex = match.index + fullMatch.length;
|
|
280
|
+
// Add text to current element
|
|
281
|
+
if (textBefore.trim()) {
|
|
282
|
+
const current = stack[stack.length - 1];
|
|
283
|
+
if (current.children.length > 0) {
|
|
284
|
+
current.children[current.children.length - 1].tail += textBefore.trim();
|
|
285
|
+
}
|
|
286
|
+
else {
|
|
287
|
+
current.text += textBefore.trim();
|
|
288
|
+
}
|
|
289
|
+
}
|
|
290
|
+
if (isClose) {
|
|
291
|
+
// Closing tag
|
|
292
|
+
stack.pop();
|
|
293
|
+
}
|
|
294
|
+
else {
|
|
295
|
+
// Opening tag
|
|
296
|
+
const el = createElement(expandTag(tagName, xml));
|
|
297
|
+
parseAttributes(attrs, el);
|
|
298
|
+
stack[stack.length - 1].children.push(el);
|
|
299
|
+
if (!isSelfClose) {
|
|
300
|
+
stack.push(el);
|
|
301
|
+
}
|
|
302
|
+
}
|
|
303
|
+
match = tagRe.exec(xml);
|
|
304
|
+
}
|
|
305
|
+
return root;
|
|
306
|
+
}
|
|
307
|
+
function createElement(tagName) {
|
|
308
|
+
return {
|
|
309
|
+
tagName,
|
|
310
|
+
children: [],
|
|
311
|
+
text: "",
|
|
312
|
+
tail: "",
|
|
313
|
+
attributes: {},
|
|
314
|
+
getAttribute(name) {
|
|
315
|
+
return this.attributes[name] ?? null;
|
|
316
|
+
},
|
|
317
|
+
};
|
|
318
|
+
}
|
|
319
|
+
function parseAttributes(attrStr, el) {
|
|
320
|
+
const attrRe = /([a-zA-Z0-9_:.-]+)\s*=\s*"([^"]*)"/g;
|
|
321
|
+
let m = attrRe.exec(attrStr);
|
|
322
|
+
while (m !== null) {
|
|
323
|
+
el.attributes[m[1]] = m[2];
|
|
324
|
+
m = attrRe.exec(attrStr);
|
|
325
|
+
}
|
|
326
|
+
}
|
|
327
|
+
/**
|
|
328
|
+
* Expand namespace prefix in tag name to full URI.
|
|
329
|
+
* e.g. "sf:p" with xmlns:sf="..." → "{uri}p"
|
|
330
|
+
* For simplicity, we use the known Apple namespaces.
|
|
331
|
+
*/
|
|
332
|
+
function expandTag(tag, _xml) {
|
|
333
|
+
const nsMap = {
|
|
334
|
+
sf: SF,
|
|
335
|
+
sfa: SFA,
|
|
336
|
+
sl: "http://developer.apple.com/namespaces/sl",
|
|
337
|
+
key: KEY,
|
|
338
|
+
};
|
|
339
|
+
const colon = tag.indexOf(":");
|
|
340
|
+
if (colon === -1)
|
|
341
|
+
return tag;
|
|
342
|
+
const prefix = tag.slice(0, colon);
|
|
343
|
+
const local = tag.slice(colon + 1);
|
|
344
|
+
const uri = nsMap[prefix];
|
|
345
|
+
return uri ? `${uri}:${local}` : tag;
|
|
346
|
+
}
|
|
347
|
+
function collectText(el) {
|
|
348
|
+
let result = el.text;
|
|
349
|
+
for (const child of el.children) {
|
|
350
|
+
result += collectText(child);
|
|
351
|
+
result += child.tail;
|
|
352
|
+
}
|
|
353
|
+
return result;
|
|
354
|
+
}
|
|
355
|
+
function* iterAll(el, ns, localName) {
|
|
356
|
+
const fullTag = `${ns}:${localName}`;
|
|
357
|
+
if (el.tagName === fullTag)
|
|
358
|
+
yield el;
|
|
359
|
+
for (const child of el.children) {
|
|
360
|
+
yield* iterAll(child, ns, localName);
|
|
361
|
+
}
|
|
362
|
+
}
|
|
363
|
+
function findFirst(el, ns, localName) {
|
|
364
|
+
for (const found of iterAll(el, ns, localName)) {
|
|
365
|
+
return found;
|
|
366
|
+
}
|
|
367
|
+
return null;
|
|
368
|
+
}
|
|
369
|
+
/**
 * Map an iWork paragraph style name to the markdown prefix placed in front
 * of the paragraph text.
 *
 * Matching is case-insensitive and substring-based; the first matching rule
 * wins. "subtitle" is tested before "title" because every subtitle style
 * name also contains "title" and would otherwise be rendered as a top-level
 * heading.
 *
 * @param style - Value of the paragraph's style attribute; may be empty.
 * @returns A heading prefix ("# " … "##### "), "*" for captions, or "" when
 *   the style is empty or unrecognised.
 */
function paragraphPrefix(style) {
    if (!style)
        return "";
    const lower = style.toLowerCase();
    // Ordered rules: [substrings to look for, prefix to emit].
    const rules = [
        [["subtitle"], "## "],
        [["title"], "# "],
        [["heading-1", "heading 1"], "## "],
        [["heading-2", "heading 2"], "### "],
        [["heading-3", "heading 3"], "#### "],
        [["heading-4", "heading 4"], "##### "],
        // NOTE(review): only an opening "*" is produced; the caller appends
        // the text without a closing marker.
        [["caption"], "*"],
    ];
    for (const [needles, prefix] of rules) {
        if (needles.some((needle) => lower.includes(needle))) {
            return prefix;
        }
    }
    return "";
}
|
|
@@ -2,7 +2,7 @@ import type { ConversionResult, Converter, StreamInfo } from "../types.js";
|
|
|
2
2
|
export declare class PptxConverter implements Converter {
|
|
3
3
|
name: string;
|
|
4
4
|
accepts(streamInfo: StreamInfo): boolean;
|
|
5
|
-
convert(input: Buffer,
|
|
5
|
+
convert(input: Buffer, streamInfo: StreamInfo): Promise<ConversionResult>;
|
|
6
6
|
private extractText;
|
|
7
7
|
private extractTable;
|
|
8
8
|
}
|
package/dist/converters/pptx.js
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import { mkdirSync, writeFileSync } from "node:fs";
|
|
2
|
+
import { join } from "node:path";
|
|
1
3
|
import { XMLParser } from "fast-xml-parser";
|
|
2
4
|
import JSZip from "jszip";
|
|
3
5
|
const EXTENSIONS = [".pptx"];
|
|
@@ -14,12 +16,13 @@ export class PptxConverter {
|
|
|
14
16
|
return true;
|
|
15
17
|
return false;
|
|
16
18
|
}
|
|
17
|
-
async convert(input,
|
|
19
|
+
async convert(input, streamInfo) {
|
|
18
20
|
const zip = await JSZip.loadAsync(input);
|
|
19
21
|
const parser = new XMLParser({
|
|
20
22
|
ignoreAttributes: false,
|
|
21
23
|
attributeNamePrefix: "@_",
|
|
22
24
|
textNodeName: "#text",
|
|
25
|
+
processEntities: { maxTotalExpansions: 1_000_000 },
|
|
23
26
|
});
|
|
24
27
|
// Get slide order from presentation.xml
|
|
25
28
|
const presXml = await zip.file("ppt/presentation.xml")?.async("string");
|
|
@@ -66,7 +69,12 @@ export class PptxConverter {
|
|
|
66
69
|
});
|
|
67
70
|
slidePaths.push(...slideFiles);
|
|
68
71
|
}
|
|
72
|
+
const imageDir = streamInfo.imageDir;
|
|
73
|
+
if (imageDir) {
|
|
74
|
+
mkdirSync(imageDir, { recursive: true });
|
|
75
|
+
}
|
|
69
76
|
const sections = [];
|
|
77
|
+
let imageCount = 0;
|
|
70
78
|
for (let i = 0; i < slidePaths.length; i++) {
|
|
71
79
|
const slideXml = await zip.file(slidePaths[i])?.async("string");
|
|
72
80
|
if (!slideXml)
|
|
@@ -75,6 +83,17 @@ export class PptxConverter {
|
|
|
75
83
|
const spTree = slide["p:sld"]?.["p:cSld"]?.["p:spTree"];
|
|
76
84
|
if (!spTree)
|
|
77
85
|
continue;
|
|
86
|
+
// Parse slide-level rels for image references
|
|
87
|
+
const slideRelsPath = `${slidePaths[i].replace("slides/slide", "slides/_rels/slide")}.rels`;
|
|
88
|
+
const slideRelsXml = await zip.file(slideRelsPath)?.async("string");
|
|
89
|
+
const slideRelMap = new Map();
|
|
90
|
+
if (slideRelsXml) {
|
|
91
|
+
const slideRels = parser.parse(slideRelsXml);
|
|
92
|
+
const relItems = toList(slideRels?.Relationships?.Relationship);
|
|
93
|
+
for (const r of relItems) {
|
|
94
|
+
slideRelMap.set(r["@_Id"], r["@_Target"]);
|
|
95
|
+
}
|
|
96
|
+
}
|
|
78
97
|
const slideLines = [`<!-- Slide ${i + 1} -->`];
|
|
79
98
|
const shapes = spTree["p:sp"];
|
|
80
99
|
const shapeList = Array.isArray(shapes) ? shapes : shapes ? [shapes] : [];
|
|
@@ -91,6 +110,55 @@ export class PptxConverter {
|
|
|
91
110
|
slideLines.push(text);
|
|
92
111
|
}
|
|
93
112
|
}
|
|
113
|
+
// Extract embedded images
|
|
114
|
+
const pics = toList(spTree["p:pic"]);
|
|
115
|
+
for (const pic of pics) {
|
|
116
|
+
const blipFill = pic["p:blipFill"];
|
|
117
|
+
const rEmbed = blipFill?.["a:blip"]?.["@_r:embed"];
|
|
118
|
+
if (!rEmbed)
|
|
119
|
+
continue;
|
|
120
|
+
const target = slideRelMap.get(rEmbed);
|
|
121
|
+
if (!target)
|
|
122
|
+
continue;
|
|
123
|
+
// Resolve relative target against slide directory
|
|
124
|
+
const imagePath = target.startsWith("/")
|
|
125
|
+
? target.slice(1)
|
|
126
|
+
: `ppt/slides/${target}`;
|
|
127
|
+
// Normalize path (e.g. ppt/slides/../media/image1.png → ppt/media/image1.png)
|
|
128
|
+
const normalizedPath = imagePath
|
|
129
|
+
.split("/")
|
|
130
|
+
.reduce((parts, seg) => {
|
|
131
|
+
if (seg === "..")
|
|
132
|
+
parts.pop();
|
|
133
|
+
else
|
|
134
|
+
parts.push(seg);
|
|
135
|
+
return parts;
|
|
136
|
+
}, [])
|
|
137
|
+
.join("/");
|
|
138
|
+
const imageFile = zip.file(normalizedPath);
|
|
139
|
+
if (!imageFile)
|
|
140
|
+
continue;
|
|
141
|
+
imageCount++;
|
|
142
|
+
const name = pic["p:nvSpPr"]?.["p:cNvPr"]?.["@_name"] ||
|
|
143
|
+
pic["p:nvPicPr"]?.["p:cNvPr"]?.["@_name"] ||
|
|
144
|
+
`image_${imageCount}`;
|
|
145
|
+
if (imageDir) {
|
|
146
|
+
try {
|
|
147
|
+
const ext = normalizedPath.split(".").pop() || "png";
|
|
148
|
+
const filename = `slide${i + 1}_${imageCount}.${ext}`;
|
|
149
|
+
const filepath = join(imageDir, filename);
|
|
150
|
+
const buf = await imageFile.async("nodebuffer");
|
|
151
|
+
writeFileSync(filepath, buf);
|
|
152
|
+
slideLines.push(`![${name}](${filepath})`);
|
|
153
|
+
}
|
|
154
|
+
catch {
|
|
155
|
+
slideLines.push(`<!-- image: ${name} (slide ${i + 1}) -->`);
|
|
156
|
+
}
|
|
157
|
+
}
|
|
158
|
+
else {
|
|
159
|
+
slideLines.push(`<!-- image: ${name} (slide ${i + 1}) -->`);
|
|
160
|
+
}
|
|
161
|
+
}
|
|
94
162
|
// Tables
|
|
95
163
|
const graphicFrames = spTree["p:graphicFrame"];
|
|
96
164
|
const gfList = Array.isArray(graphicFrames)
|
|
@@ -214,3 +282,8 @@ export class PptxConverter {
|
|
|
214
282
|
return lines.join("\n");
|
|
215
283
|
}
|
|
216
284
|
}
|
|
285
|
+
function toList(val) {
|
|
286
|
+
if (!val)
|
|
287
|
+
return [];
|
|
288
|
+
return Array.isArray(val) ? val : [val];
|
|
289
|
+
}
|
package/dist/converters/xlsx.js
CHANGED
|
@@ -20,6 +20,7 @@ export class XlsxConverter {
|
|
|
20
20
|
ignoreAttributes: false,
|
|
21
21
|
attributeNamePrefix: "@_",
|
|
22
22
|
textNodeName: "#text",
|
|
23
|
+
processEntities: { maxTotalExpansions: 1_000_000 },
|
|
23
24
|
});
|
|
24
25
|
// Parse shared strings
|
|
25
26
|
const ssXml = await zip.file("xl/sharedStrings.xml")?.async("string");
|
package/dist/index.d.ts
CHANGED
|
@@ -3,9 +3,11 @@ export { AudioConverter } from "./converters/audio.js";
|
|
|
3
3
|
export { CsvConverter } from "./converters/csv.js";
|
|
4
4
|
export { DocxConverter } from "./converters/docx.js";
|
|
5
5
|
export { EpubConverter } from "./converters/epub.js";
|
|
6
|
+
export { GitHubConverter } from "./converters/github.js";
|
|
6
7
|
export { HtmlConverter } from "./converters/html.js";
|
|
7
8
|
export { ImageConverter } from "./converters/image.js";
|
|
8
9
|
export { IpynbConverter } from "./converters/ipynb.js";
|
|
10
|
+
export { IWorkConverter } from "./converters/iwork.js";
|
|
9
11
|
export { JsonConverter } from "./converters/json.js";
|
|
10
12
|
export { PdfConverter } from "./converters/pdf/index.js";
|
|
11
13
|
export { PlainTextConverter } from "./converters/plain-text.js";
|
package/dist/index.js
CHANGED
|
@@ -2,9 +2,11 @@ export { AudioConverter } from "./converters/audio.js";
|
|
|
2
2
|
export { CsvConverter } from "./converters/csv.js";
|
|
3
3
|
export { DocxConverter } from "./converters/docx.js";
|
|
4
4
|
export { EpubConverter } from "./converters/epub.js";
|
|
5
|
+
export { GitHubConverter } from "./converters/github.js";
|
|
5
6
|
export { HtmlConverter } from "./converters/html.js";
|
|
6
7
|
export { ImageConverter } from "./converters/image.js";
|
|
7
8
|
export { IpynbConverter } from "./converters/ipynb.js";
|
|
9
|
+
export { IWorkConverter } from "./converters/iwork.js";
|
|
8
10
|
export { JsonConverter } from "./converters/json.js";
|
|
9
11
|
export { PdfConverter } from "./converters/pdf/index.js";
|
|
10
12
|
export { PlainTextConverter } from "./converters/plain-text.js";
|
package/dist/main.js
CHANGED
|
@@ -18,6 +18,7 @@ program
|
|
|
18
18
|
.option("-q, --quiet", "Raw markdown only, no decoration")
|
|
19
19
|
.option("-p, --prompt <text>", "Extra instructions for image description")
|
|
20
20
|
.option("-o, --output <file>", "Write to file instead of stdout")
|
|
21
|
+
.option("-i, --image-dir <dir>", "Extract images to this directory")
|
|
21
22
|
.addHelpText("after", `
|
|
22
23
|
Examples:
|
|
23
24
|
$ markit report.pdf Convert a PDF to markdown
|
|
@@ -42,7 +43,8 @@ program
|
|
|
42
43
|
json: globals.json,
|
|
43
44
|
quiet: globals.quiet,
|
|
44
45
|
output: opts.output,
|
|
45
|
-
prompt:
|
|
46
|
+
prompt: globals.prompt,
|
|
47
|
+
imageDir: globals.imageDir,
|
|
46
48
|
});
|
|
47
49
|
});
|
|
48
50
|
program
|
|
@@ -144,6 +146,7 @@ program.on("command:*", async (args) => {
|
|
|
144
146
|
quiet: globals.quiet,
|
|
145
147
|
output: globals.output,
|
|
146
148
|
prompt: globals.prompt,
|
|
149
|
+
imageDir: globals.imageDir,
|
|
147
150
|
});
|
|
148
151
|
});
|
|
149
152
|
// No args → show concise help
|
package/dist/markit.d.ts
CHANGED
|
@@ -7,7 +7,7 @@ export declare class Markit {
|
|
|
7
7
|
/**
|
|
8
8
|
* Convert a local file to markdown.
|
|
9
9
|
*/
|
|
10
|
-
convertFile(path: string): Promise<ConversionResult>;
|
|
10
|
+
convertFile(path: string, extra?: Partial<StreamInfo>): Promise<ConversionResult>;
|
|
11
11
|
/**
|
|
12
12
|
* Convert a URL to markdown.
|
|
13
13
|
*/
|
package/dist/markit.js
CHANGED
|
@@ -4,9 +4,11 @@ import { AudioConverter } from "./converters/audio.js";
|
|
|
4
4
|
import { CsvConverter } from "./converters/csv.js";
|
|
5
5
|
import { DocxConverter } from "./converters/docx.js";
|
|
6
6
|
import { EpubConverter } from "./converters/epub.js";
|
|
7
|
+
import { GitHubConverter } from "./converters/github.js";
|
|
7
8
|
import { HtmlConverter } from "./converters/html.js";
|
|
8
9
|
import { ImageConverter } from "./converters/image.js";
|
|
9
10
|
import { IpynbConverter } from "./converters/ipynb.js";
|
|
11
|
+
import { IWorkConverter } from "./converters/iwork.js";
|
|
10
12
|
import { JsonConverter } from "./converters/json.js";
|
|
11
13
|
import { PdfConverter } from "./converters/pdf/index.js";
|
|
12
14
|
import { PlainTextConverter } from "./converters/plain-text.js";
|
|
@@ -32,6 +34,8 @@ export class Markit {
|
|
|
32
34
|
new XlsxConverter(),
|
|
33
35
|
new EpubConverter(),
|
|
34
36
|
new IpynbConverter(),
|
|
37
|
+
new IWorkConverter(),
|
|
38
|
+
new GitHubConverter(),
|
|
35
39
|
new WikipediaConverter(),
|
|
36
40
|
new RssConverter(),
|
|
37
41
|
new CsvConverter(),
|
|
@@ -56,12 +60,13 @@ export class Markit {
|
|
|
56
60
|
/**
|
|
57
61
|
* Convert a local file to markdown.
|
|
58
62
|
*/
|
|
59
|
-
async convertFile(path) {
|
|
63
|
+
async convertFile(path, extra) {
|
|
60
64
|
const buffer = readFileSync(path);
|
|
61
65
|
const streamInfo = {
|
|
62
66
|
localPath: path,
|
|
63
67
|
extension: extname(path).toLowerCase(),
|
|
64
68
|
filename: basename(path),
|
|
69
|
+
...extra,
|
|
65
70
|
};
|
|
66
71
|
return this.convert(buffer, streamInfo);
|
|
67
72
|
}
|
|
@@ -69,6 +74,18 @@ export class Markit {
|
|
|
69
74
|
* Convert a URL to markdown.
|
|
70
75
|
*/
|
|
71
76
|
async convertUrl(url) {
|
|
77
|
+
// Let converters with a URL-specific hook handle it first
|
|
78
|
+
const streamInfo = { url };
|
|
79
|
+
for (const converter of this.converters) {
|
|
80
|
+
if (!converter.convertUrl || !converter.accepts(streamInfo))
|
|
81
|
+
continue;
|
|
82
|
+
try {
|
|
83
|
+
return await converter.convertUrl(url, this.options);
|
|
84
|
+
}
|
|
85
|
+
catch {
|
|
86
|
+
// Fall through to default fetch path
|
|
87
|
+
}
|
|
88
|
+
}
|
|
72
89
|
const response = await fetch(url, {
|
|
73
90
|
headers: {
|
|
74
91
|
Accept: "text/markdown, text/html;q=0.9, text/plain;q=0.8, */*;q=0.1",
|
|
@@ -84,13 +101,13 @@ export class Markit {
|
|
|
84
101
|
const urlPath = new URL(url).pathname;
|
|
85
102
|
const ext = extname(urlPath).toLowerCase();
|
|
86
103
|
const buffer = Buffer.from(await response.arrayBuffer());
|
|
87
|
-
const streamInfo = {
|
|
104
|
+
const fetchedInfo = {
|
|
88
105
|
url,
|
|
89
106
|
mimetype: mimetype.trim(),
|
|
90
107
|
extension: ext || undefined,
|
|
91
108
|
filename: basename(urlPath) || undefined,
|
|
92
109
|
};
|
|
93
|
-
return this.convert(buffer, streamInfo);
|
|
110
|
+
return this.convert(buffer, fetchedInfo);
|
|
94
111
|
}
|
|
95
112
|
/**
|
|
96
113
|
* Convert a buffer with stream info to markdown.
|
package/dist/types.d.ts
CHANGED
|
@@ -25,6 +25,12 @@ export interface Converter {
|
|
|
25
25
|
name: string;
|
|
26
26
|
/** Quick check: can this converter handle the given stream? */
|
|
27
27
|
accepts(streamInfo: StreamInfo): boolean;
|
|
28
|
+
/**
|
|
29
|
+
* Optional URL-first hook. When present, called before the default fetch
|
|
30
|
+
* so the converter can handle URL fetching itself (e.g. rewrite to a raw
|
|
31
|
+
* content URL or call an API).
|
|
32
|
+
*/
|
|
33
|
+
convertUrl?(url: string, options?: MarkitOptions): Promise<ConversionResult>;
|
|
28
34
|
/** Convert the source to markdown */
|
|
29
35
|
convert(input: Buffer, streamInfo: StreamInfo, options?: MarkitOptions): Promise<ConversionResult>;
|
|
30
36
|
}
|
package/dist/utils/turndown.js
CHANGED
|
@@ -51,8 +51,14 @@ export function createTurndown() {
|
|
|
51
51
|
* - Strip <p> tags inside <td>/<th> cells
|
|
52
52
|
*/
|
|
53
53
|
export function normalizeTablesHtml(html) {
|
|
54
|
-
// Strip <p> tags inside table cells
|
|
55
|
-
let result = html.replace(/<(td|th)([^>]*)
|
|
54
|
+
// Strip <p> tags inside table cells, joining multiple paragraphs with <br>
|
|
55
|
+
let result = html.replace(/<(td|th)([^>]*)>([\s\S]*?)<\/(td|th)>/gi, (_match, tag, attrs, inner, closeTag) => {
|
|
56
|
+
const stripped = inner
|
|
57
|
+
.replace(/^\s*<p>/i, "")
|
|
58
|
+
.replace(/<\/p>\s*$/i, "")
|
|
59
|
+
.replace(/<\/p>\s*<p>/gi, " ");
|
|
60
|
+
return `<${tag}${attrs}>${stripped}</${closeTag}>`;
|
|
61
|
+
});
|
|
56
62
|
// Add thead to tables that lack it
|
|
57
63
|
result = result.replace(/<table([^>]*)>\s*(?:<tbody>\s*)?(<tr[\s\S]*?<\/tr>)([\s\S]*?)<\/(?:tbody>\s*<\/)?table>/gi, (_match, attrs, firstRow, rest) => {
|
|
58
64
|
const theadRow = firstRow
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "markit-ai",
|
|
3
|
-
"version": "0.3.0",
|
|
3
|
+
"version": "0.5.0",
|
|
4
4
|
"description": "Convert anything to markdown. PDF, DOCX, PPTX, XLSX, HTML, EPUB, Jupyter, RSS, images, audio, URLs, and more. Pluggable converters, built-in LLM providers for image description and audio transcription. Works as a CLI and as a library.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|