markit-ai 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/convert.d.ts +1 -0
- package/dist/commands/convert.js +7 -3
- package/dist/commands/formats.js +5 -0
- package/dist/converters/docx.d.ts +1 -1
- package/dist/converters/docx.js +35 -3
- package/dist/converters/epub.js +1 -0
- package/dist/converters/github.d.ts +18 -0
- package/dist/converters/github.js +148 -0
- package/dist/converters/pdf/columns.d.ts +35 -0
- package/dist/converters/pdf/columns.js +93 -0
- package/dist/converters/pdf/extract.d.ts +19 -0
- package/dist/converters/pdf/extract.js +513 -0
- package/dist/converters/pdf/grid.d.ts +25 -0
- package/dist/converters/pdf/grid.js +654 -0
- package/dist/converters/pdf/headers.d.ts +24 -0
- package/dist/converters/pdf/headers.js +108 -0
- package/dist/converters/pdf/index.d.ts +19 -0
- package/dist/converters/pdf/index.js +116 -0
- package/dist/converters/pdf/render.d.ts +24 -0
- package/dist/converters/pdf/render.js +513 -0
- package/dist/converters/pdf/types.d.ts +75 -0
- package/dist/converters/pdf/types.js +1 -0
- package/dist/converters/pptx.d.ts +1 -1
- package/dist/converters/pptx.js +74 -1
- package/dist/converters/xlsx.js +1 -0
- package/dist/index.d.ts +2 -1
- package/dist/index.js +2 -1
- package/dist/main.js +4 -1
- package/dist/markit.d.ts +1 -1
- package/dist/markit.js +19 -4
- package/dist/types.d.ts +8 -0
- package/package.json +3 -3
- package/dist/converters/pdf.d.ts +0 -6
- package/dist/converters/pdf.js +0 -29
package/dist/converters/pptx.js
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import { mkdirSync, writeFileSync } from "node:fs";
|
|
2
|
+
import { join } from "node:path";
|
|
1
3
|
import { XMLParser } from "fast-xml-parser";
|
|
2
4
|
import JSZip from "jszip";
|
|
3
5
|
const EXTENSIONS = [".pptx"];
|
|
@@ -14,12 +16,13 @@ export class PptxConverter {
|
|
|
14
16
|
return true;
|
|
15
17
|
return false;
|
|
16
18
|
}
|
|
17
|
-
async convert(input,
|
|
19
|
+
async convert(input, streamInfo) {
|
|
18
20
|
const zip = await JSZip.loadAsync(input);
|
|
19
21
|
const parser = new XMLParser({
|
|
20
22
|
ignoreAttributes: false,
|
|
21
23
|
attributeNamePrefix: "@_",
|
|
22
24
|
textNodeName: "#text",
|
|
25
|
+
processEntities: { maxTotalExpansions: 1_000_000 },
|
|
23
26
|
});
|
|
24
27
|
// Get slide order from presentation.xml
|
|
25
28
|
const presXml = await zip.file("ppt/presentation.xml")?.async("string");
|
|
@@ -66,7 +69,12 @@ export class PptxConverter {
|
|
|
66
69
|
});
|
|
67
70
|
slidePaths.push(...slideFiles);
|
|
68
71
|
}
|
|
72
|
+
const imageDir = streamInfo.imageDir;
|
|
73
|
+
if (imageDir) {
|
|
74
|
+
mkdirSync(imageDir, { recursive: true });
|
|
75
|
+
}
|
|
69
76
|
const sections = [];
|
|
77
|
+
let imageCount = 0;
|
|
70
78
|
for (let i = 0; i < slidePaths.length; i++) {
|
|
71
79
|
const slideXml = await zip.file(slidePaths[i])?.async("string");
|
|
72
80
|
if (!slideXml)
|
|
@@ -75,6 +83,17 @@ export class PptxConverter {
|
|
|
75
83
|
const spTree = slide["p:sld"]?.["p:cSld"]?.["p:spTree"];
|
|
76
84
|
if (!spTree)
|
|
77
85
|
continue;
|
|
86
|
+
// Parse slide-level rels for image references
|
|
87
|
+
const slideRelsPath = `${slidePaths[i].replace("slides/slide", "slides/_rels/slide")}.rels`;
|
|
88
|
+
const slideRelsXml = await zip.file(slideRelsPath)?.async("string");
|
|
89
|
+
const slideRelMap = new Map();
|
|
90
|
+
if (slideRelsXml) {
|
|
91
|
+
const slideRels = parser.parse(slideRelsXml);
|
|
92
|
+
const relItems = toList(slideRels?.Relationships?.Relationship);
|
|
93
|
+
for (const r of relItems) {
|
|
94
|
+
slideRelMap.set(r["@_Id"], r["@_Target"]);
|
|
95
|
+
}
|
|
96
|
+
}
|
|
78
97
|
const slideLines = [`<!-- Slide ${i + 1} -->`];
|
|
79
98
|
const shapes = spTree["p:sp"];
|
|
80
99
|
const shapeList = Array.isArray(shapes) ? shapes : shapes ? [shapes] : [];
|
|
@@ -91,6 +110,55 @@ export class PptxConverter {
|
|
|
91
110
|
slideLines.push(text);
|
|
92
111
|
}
|
|
93
112
|
}
|
|
113
|
+
// Extract embedded images
|
|
114
|
+
const pics = toList(spTree["p:pic"]);
|
|
115
|
+
for (const pic of pics) {
|
|
116
|
+
const blipFill = pic["p:blipFill"];
|
|
117
|
+
const rEmbed = blipFill?.["a:blip"]?.["@_r:embed"];
|
|
118
|
+
if (!rEmbed)
|
|
119
|
+
continue;
|
|
120
|
+
const target = slideRelMap.get(rEmbed);
|
|
121
|
+
if (!target)
|
|
122
|
+
continue;
|
|
123
|
+
// Resolve relative target against slide directory
|
|
124
|
+
const imagePath = target.startsWith("/")
|
|
125
|
+
? target.slice(1)
|
|
126
|
+
: `ppt/slides/${target}`;
|
|
127
|
+
// Normalize path (e.g. ppt/slides/../media/image1.png → ppt/media/image1.png)
|
|
128
|
+
const normalizedPath = imagePath
|
|
129
|
+
.split("/")
|
|
130
|
+
.reduce((parts, seg) => {
|
|
131
|
+
if (seg === "..")
|
|
132
|
+
parts.pop();
|
|
133
|
+
else
|
|
134
|
+
parts.push(seg);
|
|
135
|
+
return parts;
|
|
136
|
+
}, [])
|
|
137
|
+
.join("/");
|
|
138
|
+
const imageFile = zip.file(normalizedPath);
|
|
139
|
+
if (!imageFile)
|
|
140
|
+
continue;
|
|
141
|
+
imageCount++;
|
|
142
|
+
const name = pic["p:nvSpPr"]?.["p:cNvPr"]?.["@_name"] ||
|
|
143
|
+
pic["p:nvPicPr"]?.["p:cNvPr"]?.["@_name"] ||
|
|
144
|
+
`image_${imageCount}`;
|
|
145
|
+
if (imageDir) {
|
|
146
|
+
try {
|
|
147
|
+
const ext = normalizedPath.split(".").pop() || "png";
|
|
148
|
+
const filename = `slide${i + 1}_${imageCount}.${ext}`;
|
|
149
|
+
const filepath = join(imageDir, filename);
|
|
150
|
+
const buf = await imageFile.async("nodebuffer");
|
|
151
|
+
writeFileSync(filepath, buf);
|
|
152
|
+
slideLines.push(``);
|
|
153
|
+
}
|
|
154
|
+
catch {
|
|
155
|
+
slideLines.push(`<!-- image: ${name} (slide ${i + 1}) -->`);
|
|
156
|
+
}
|
|
157
|
+
}
|
|
158
|
+
else {
|
|
159
|
+
slideLines.push(`<!-- image: ${name} (slide ${i + 1}) -->`);
|
|
160
|
+
}
|
|
161
|
+
}
|
|
94
162
|
// Tables
|
|
95
163
|
const graphicFrames = spTree["p:graphicFrame"];
|
|
96
164
|
const gfList = Array.isArray(graphicFrames)
|
|
@@ -214,3 +282,8 @@ export class PptxConverter {
|
|
|
214
282
|
return lines.join("\n");
|
|
215
283
|
}
|
|
216
284
|
}
|
|
285
|
+
function toList(val) {
|
|
286
|
+
if (!val)
|
|
287
|
+
return [];
|
|
288
|
+
return Array.isArray(val) ? val : [val];
|
|
289
|
+
}
|
package/dist/converters/xlsx.js
CHANGED
|
@@ -20,6 +20,7 @@ export class XlsxConverter {
|
|
|
20
20
|
ignoreAttributes: false,
|
|
21
21
|
attributeNamePrefix: "@_",
|
|
22
22
|
textNodeName: "#text",
|
|
23
|
+
processEntities: { maxTotalExpansions: 1_000_000 },
|
|
23
24
|
});
|
|
24
25
|
// Parse shared strings
|
|
25
26
|
const ssXml = await zip.file("xl/sharedStrings.xml")?.async("string");
|
package/dist/index.d.ts
CHANGED
|
@@ -3,11 +3,12 @@ export { AudioConverter } from "./converters/audio.js";
|
|
|
3
3
|
export { CsvConverter } from "./converters/csv.js";
|
|
4
4
|
export { DocxConverter } from "./converters/docx.js";
|
|
5
5
|
export { EpubConverter } from "./converters/epub.js";
|
|
6
|
+
export { GitHubConverter } from "./converters/github.js";
|
|
6
7
|
export { HtmlConverter } from "./converters/html.js";
|
|
7
8
|
export { ImageConverter } from "./converters/image.js";
|
|
8
9
|
export { IpynbConverter } from "./converters/ipynb.js";
|
|
9
10
|
export { JsonConverter } from "./converters/json.js";
|
|
10
|
-
export { PdfConverter } from "./converters/pdf.js";
|
|
11
|
+
export { PdfConverter } from "./converters/pdf/index.js";
|
|
11
12
|
export { PlainTextConverter } from "./converters/plain-text.js";
|
|
12
13
|
export { PptxConverter } from "./converters/pptx.js";
|
|
13
14
|
export { RssConverter } from "./converters/rss.js";
|
package/dist/index.js
CHANGED
|
@@ -2,11 +2,12 @@ export { AudioConverter } from "./converters/audio.js";
|
|
|
2
2
|
export { CsvConverter } from "./converters/csv.js";
|
|
3
3
|
export { DocxConverter } from "./converters/docx.js";
|
|
4
4
|
export { EpubConverter } from "./converters/epub.js";
|
|
5
|
+
export { GitHubConverter } from "./converters/github.js";
|
|
5
6
|
export { HtmlConverter } from "./converters/html.js";
|
|
6
7
|
export { ImageConverter } from "./converters/image.js";
|
|
7
8
|
export { IpynbConverter } from "./converters/ipynb.js";
|
|
8
9
|
export { JsonConverter } from "./converters/json.js";
|
|
9
|
-
export { PdfConverter } from "./converters/pdf.js";
|
|
10
|
+
export { PdfConverter } from "./converters/pdf/index.js";
|
|
10
11
|
export { PlainTextConverter } from "./converters/plain-text.js";
|
|
11
12
|
export { PptxConverter } from "./converters/pptx.js";
|
|
12
13
|
export { RssConverter } from "./converters/rss.js";
|
package/dist/main.js
CHANGED
|
@@ -18,6 +18,7 @@ program
|
|
|
18
18
|
.option("-q, --quiet", "Raw markdown only, no decoration")
|
|
19
19
|
.option("-p, --prompt <text>", "Extra instructions for image description")
|
|
20
20
|
.option("-o, --output <file>", "Write to file instead of stdout")
|
|
21
|
+
.option("-i, --image-dir <dir>", "Extract images to this directory")
|
|
21
22
|
.addHelpText("after", `
|
|
22
23
|
Examples:
|
|
23
24
|
$ markit report.pdf Convert a PDF to markdown
|
|
@@ -42,7 +43,8 @@ program
|
|
|
42
43
|
json: globals.json,
|
|
43
44
|
quiet: globals.quiet,
|
|
44
45
|
output: opts.output,
|
|
45
|
-
prompt:
|
|
46
|
+
prompt: globals.prompt,
|
|
47
|
+
imageDir: globals.imageDir,
|
|
46
48
|
});
|
|
47
49
|
});
|
|
48
50
|
program
|
|
@@ -144,6 +146,7 @@ program.on("command:*", async (args) => {
|
|
|
144
146
|
quiet: globals.quiet,
|
|
145
147
|
output: globals.output,
|
|
146
148
|
prompt: globals.prompt,
|
|
149
|
+
imageDir: globals.imageDir,
|
|
147
150
|
});
|
|
148
151
|
});
|
|
149
152
|
// No args → show concise help
|
package/dist/markit.d.ts
CHANGED
|
@@ -7,7 +7,7 @@ export declare class Markit {
|
|
|
7
7
|
/**
|
|
8
8
|
* Convert a local file to markdown.
|
|
9
9
|
*/
|
|
10
|
-
convertFile(path: string): Promise<ConversionResult>;
|
|
10
|
+
convertFile(path: string, extra?: Partial<StreamInfo>): Promise<ConversionResult>;
|
|
11
11
|
/**
|
|
12
12
|
* Convert a URL to markdown.
|
|
13
13
|
*/
|
package/dist/markit.js
CHANGED
|
@@ -4,11 +4,12 @@ import { AudioConverter } from "./converters/audio.js";
|
|
|
4
4
|
import { CsvConverter } from "./converters/csv.js";
|
|
5
5
|
import { DocxConverter } from "./converters/docx.js";
|
|
6
6
|
import { EpubConverter } from "./converters/epub.js";
|
|
7
|
+
import { GitHubConverter } from "./converters/github.js";
|
|
7
8
|
import { HtmlConverter } from "./converters/html.js";
|
|
8
9
|
import { ImageConverter } from "./converters/image.js";
|
|
9
10
|
import { IpynbConverter } from "./converters/ipynb.js";
|
|
10
11
|
import { JsonConverter } from "./converters/json.js";
|
|
11
|
-
import { PdfConverter } from "./converters/pdf.js";
|
|
12
|
+
import { PdfConverter } from "./converters/pdf/index.js";
|
|
12
13
|
import { PlainTextConverter } from "./converters/plain-text.js";
|
|
13
14
|
import { PptxConverter } from "./converters/pptx.js";
|
|
14
15
|
import { RssConverter } from "./converters/rss.js";
|
|
@@ -32,6 +33,7 @@ export class Markit {
|
|
|
32
33
|
new XlsxConverter(),
|
|
33
34
|
new EpubConverter(),
|
|
34
35
|
new IpynbConverter(),
|
|
36
|
+
new GitHubConverter(),
|
|
35
37
|
new WikipediaConverter(),
|
|
36
38
|
new RssConverter(),
|
|
37
39
|
new CsvConverter(),
|
|
@@ -56,12 +58,13 @@ export class Markit {
|
|
|
56
58
|
/**
|
|
57
59
|
* Convert a local file to markdown.
|
|
58
60
|
*/
|
|
59
|
-
async convertFile(path) {
|
|
61
|
+
async convertFile(path, extra) {
|
|
60
62
|
const buffer = readFileSync(path);
|
|
61
63
|
const streamInfo = {
|
|
62
64
|
localPath: path,
|
|
63
65
|
extension: extname(path).toLowerCase(),
|
|
64
66
|
filename: basename(path),
|
|
67
|
+
...extra,
|
|
65
68
|
};
|
|
66
69
|
return this.convert(buffer, streamInfo);
|
|
67
70
|
}
|
|
@@ -69,6 +72,18 @@ export class Markit {
|
|
|
69
72
|
* Convert a URL to markdown.
|
|
70
73
|
*/
|
|
71
74
|
async convertUrl(url) {
|
|
75
|
+
// Let converters with a URL-specific hook handle it first
|
|
76
|
+
const streamInfo = { url };
|
|
77
|
+
for (const converter of this.converters) {
|
|
78
|
+
if (!converter.convertUrl || !converter.accepts(streamInfo))
|
|
79
|
+
continue;
|
|
80
|
+
try {
|
|
81
|
+
return await converter.convertUrl(url, this.options);
|
|
82
|
+
}
|
|
83
|
+
catch {
|
|
84
|
+
// Fall through to default fetch path
|
|
85
|
+
}
|
|
86
|
+
}
|
|
72
87
|
const response = await fetch(url, {
|
|
73
88
|
headers: {
|
|
74
89
|
Accept: "text/markdown, text/html;q=0.9, text/plain;q=0.8, */*;q=0.1",
|
|
@@ -84,13 +99,13 @@ export class Markit {
|
|
|
84
99
|
const urlPath = new URL(url).pathname;
|
|
85
100
|
const ext = extname(urlPath).toLowerCase();
|
|
86
101
|
const buffer = Buffer.from(await response.arrayBuffer());
|
|
87
|
-
const streamInfo = {
|
|
102
|
+
const fetchedInfo = {
|
|
88
103
|
url,
|
|
89
104
|
mimetype: mimetype.trim(),
|
|
90
105
|
extension: ext || undefined,
|
|
91
106
|
filename: basename(urlPath) || undefined,
|
|
92
107
|
};
|
|
93
|
-
return this.convert(buffer, streamInfo);
|
|
108
|
+
return this.convert(buffer, fetchedInfo);
|
|
94
109
|
}
|
|
95
110
|
/**
|
|
96
111
|
* Convert a buffer with stream info to markdown.
|
package/dist/types.d.ts
CHANGED
|
@@ -5,6 +5,8 @@ export interface StreamInfo {
|
|
|
5
5
|
filename?: string;
|
|
6
6
|
localPath?: string;
|
|
7
7
|
url?: string;
|
|
8
|
+
/** Directory to write extracted images/diagrams. */
|
|
9
|
+
imageDir?: string;
|
|
8
10
|
}
|
|
9
11
|
export interface ConversionResult {
|
|
10
12
|
markdown: string;
|
|
@@ -23,6 +25,12 @@ export interface Converter {
|
|
|
23
25
|
name: string;
|
|
24
26
|
/** Quick check: can this converter handle the given stream? */
|
|
25
27
|
accepts(streamInfo: StreamInfo): boolean;
|
|
28
|
+
/**
|
|
29
|
+
* Optional URL-first hook. When present, called before the default fetch
|
|
30
|
+
* so the converter can handle URL fetching itself (e.g. rewrite to a raw
|
|
31
|
+
* content URL or call an API).
|
|
32
|
+
*/
|
|
33
|
+
convertUrl?(url: string, options?: MarkitOptions): Promise<ConversionResult>;
|
|
26
34
|
/** Convert the source to markdown */
|
|
27
35
|
convert(input: Buffer, streamInfo: StreamInfo, options?: MarkitOptions): Promise<ConversionResult>;
|
|
28
36
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "markit-ai",
|
|
3
|
-
"version": "0.2.0",
|
|
3
|
+
"version": "0.4.0",
|
|
4
4
|
"description": "Convert anything to markdown. PDF, DOCX, PPTX, XLSX, HTML, EPUB, Jupyter, RSS, images, audio, URLs, and more. Pluggable converters, built-in LLM providers for image description and audio transcription. Works as a CLI and as a library.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|
|
@@ -65,10 +65,10 @@
|
|
|
65
65
|
"fast-xml-parser": "^5.5.9",
|
|
66
66
|
"jszip": "^3.10.1",
|
|
67
67
|
"mammoth": "^1.9.0",
|
|
68
|
+
"mupdf": "^1.27.0",
|
|
68
69
|
"music-metadata": "^11.12.3",
|
|
69
70
|
"rss-parser": "^3.13.0",
|
|
70
71
|
"turndown": "^7.2.0",
|
|
71
|
-
"turndown-plugin-gfm": "^1.0.2",
|
|
72
|
-
"unpdf": "^1.4.0"
|
|
72
|
+
"turndown-plugin-gfm": "^1.0.2"
|
|
73
73
|
}
|
|
74
74
|
}
|
package/dist/converters/pdf.d.ts
DELETED
|
@@ -1,6 +0,0 @@
|
|
|
1
|
-
import type { ConversionResult, Converter, StreamInfo } from "../types.js";
|
|
2
|
-
export declare class PdfConverter implements Converter {
|
|
3
|
-
name: string;
|
|
4
|
-
accepts(streamInfo: StreamInfo): boolean;
|
|
5
|
-
convert(input: Buffer, _streamInfo: StreamInfo): Promise<ConversionResult>;
|
|
6
|
-
}
|
package/dist/converters/pdf.js
DELETED
|
@@ -1,29 +0,0 @@
|
|
|
1
|
-
const EXTENSIONS = [".pdf"];
|
|
2
|
-
const MIMETYPES = ["application/pdf", "application/x-pdf"];
|
|
3
|
-
export class PdfConverter {
|
|
4
|
-
name = "pdf";
|
|
5
|
-
accepts(streamInfo) {
|
|
6
|
-
if (streamInfo.extension && EXTENSIONS.includes(streamInfo.extension)) {
|
|
7
|
-
return true;
|
|
8
|
-
}
|
|
9
|
-
if (streamInfo.mimetype &&
|
|
10
|
-
MIMETYPES.some((m) => streamInfo.mimetype?.startsWith(m))) {
|
|
11
|
-
return true;
|
|
12
|
-
}
|
|
13
|
-
return false;
|
|
14
|
-
}
|
|
15
|
-
async convert(input, _streamInfo) {
|
|
16
|
-
let extractText;
|
|
17
|
-
try {
|
|
18
|
-
({ extractText } = await import("unpdf"));
|
|
19
|
-
}
|
|
20
|
-
catch {
|
|
21
|
-
throw new Error("PDF support requires 'unpdf'. Install it: npm install unpdf");
|
|
22
|
-
}
|
|
23
|
-
const result = await extractText(new Uint8Array(input));
|
|
24
|
-
const text = Array.isArray(result.text)
|
|
25
|
-
? result.text.join("\n\n")
|
|
26
|
-
: String(result.text);
|
|
27
|
-
return { markdown: text.trim() };
|
|
28
|
-
}
|
|
29
|
-
}
|