markit-ai 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +333 -0
- package/dist/commands/config.d.ts +4 -0
- package/dist/commands/config.js +133 -0
- package/dist/commands/convert.d.ts +5 -0
- package/dist/commands/convert.js +110 -0
- package/dist/commands/formats.d.ts +2 -0
- package/dist/commands/formats.js +56 -0
- package/dist/commands/init.d.ts +2 -0
- package/dist/commands/init.js +29 -0
- package/dist/commands/onboard.d.ts +2 -0
- package/dist/commands/onboard.js +61 -0
- package/dist/commands/plugin.d.ts +4 -0
- package/dist/commands/plugin.js +58 -0
- package/dist/config.d.ts +26 -0
- package/dist/config.js +42 -0
- package/dist/converters/audio.d.ts +7 -0
- package/dist/converters/audio.js +87 -0
- package/dist/converters/csv.d.ts +7 -0
- package/dist/converters/csv.js +83 -0
- package/dist/converters/docx.d.ts +6 -0
- package/dist/converters/docx.js +28 -0
- package/dist/converters/epub.d.ts +8 -0
- package/dist/converters/epub.js +110 -0
- package/dist/converters/html.d.ts +6 -0
- package/dist/converters/html.js +33 -0
- package/dist/converters/image.d.ts +6 -0
- package/dist/converters/image.js +94 -0
- package/dist/converters/ipynb.d.ts +6 -0
- package/dist/converters/ipynb.js +72 -0
- package/dist/converters/json.d.ts +6 -0
- package/dist/converters/json.js +21 -0
- package/dist/converters/pdf.d.ts +6 -0
- package/dist/converters/pdf.js +29 -0
- package/dist/converters/plain-text.d.ts +6 -0
- package/dist/converters/plain-text.js +41 -0
- package/dist/converters/pptx.d.ts +8 -0
- package/dist/converters/pptx.js +189 -0
- package/dist/converters/rss.d.ts +11 -0
- package/dist/converters/rss.js +134 -0
- package/dist/converters/wikipedia.d.ts +6 -0
- package/dist/converters/wikipedia.js +35 -0
- package/dist/converters/xlsx.d.ts +8 -0
- package/dist/converters/xlsx.js +139 -0
- package/dist/converters/xml.d.ts +6 -0
- package/dist/converters/xml.js +17 -0
- package/dist/converters/yaml.d.ts +6 -0
- package/dist/converters/yaml.js +16 -0
- package/dist/converters/zip.d.ts +8 -0
- package/dist/converters/zip.js +56 -0
- package/dist/index.d.ts +28 -0
- package/dist/index.js +24 -0
- package/dist/llm.d.ts +10 -0
- package/dist/llm.js +139 -0
- package/dist/main.d.ts +2 -0
- package/dist/main.js +182 -0
- package/dist/markit.d.ts +19 -0
- package/dist/markit.js +124 -0
- package/dist/mill.d.ts +18 -0
- package/dist/mill.js +123 -0
- package/dist/plugins/api.d.ts +7 -0
- package/dist/plugins/api.js +44 -0
- package/dist/plugins/index.d.ts +4 -0
- package/dist/plugins/index.js +3 -0
- package/dist/plugins/installer.d.ts +25 -0
- package/dist/plugins/installer.js +176 -0
- package/dist/plugins/loader.d.ts +6 -0
- package/dist/plugins/loader.js +61 -0
- package/dist/plugins/types.d.ts +25 -0
- package/dist/plugins/types.js +1 -0
- package/dist/providers/anthropic.d.ts +2 -0
- package/dist/providers/anthropic.js +47 -0
- package/dist/providers/index.d.ts +21 -0
- package/dist/providers/index.js +58 -0
- package/dist/providers/openai.d.ts +2 -0
- package/dist/providers/openai.js +65 -0
- package/dist/providers/types.d.ts +26 -0
- package/dist/providers/types.js +1 -0
- package/dist/types.d.ts +28 -0
- package/dist/types.js +1 -0
- package/dist/utils/exit-codes.d.ts +4 -0
- package/dist/utils/exit-codes.js +4 -0
- package/dist/utils/output.d.ts +22 -0
- package/dist/utils/output.js +31 -0
- package/package.json +70 -0
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
const EXTENSIONS = [".jpg", ".jpeg", ".png", ".gif", ".webp", ".tiff", ".tif", ".bmp", ".svg"];
const MIMETYPES = ["image/"];
/**
 * Converter for image files.
 *
 * Produces Markdown with up to two sections:
 *  - "## Metadata" — selected EXIF fields, read via the optional `exifr`
 *    package (loaded lazily; silently skipped if unavailable or the image
 *    has no EXIF data), and
 *  - "## Description" — the result of an optional `options.describe`
 *    callback (e.g. an AI vision model).
 * When neither yields anything, an `*[image: name]*` placeholder is returned.
 */
export class ImageConverter {
    name = "image";
    /** Accepts by known image extension, or any mimetype under "image/". */
    accepts(streamInfo) {
        if (streamInfo.extension && EXTENSIONS.includes(streamInfo.extension))
            return true;
        if (streamInfo.mimetype && MIMETYPES.some((m) => streamInfo.mimetype.startsWith(m)))
            return true;
        return false;
    }
    /**
     * @param input Raw image bytes.
     * @param streamInfo Extension / mimetype / filename hints for the stream.
     * @param options Optional `{ describe(input, mimetype) }` callback.
     * @returns `{ markdown }` built from the sections described above.
     */
    async convert(input, streamInfo, options) {
        const sections = [];
        // Extract EXIF metadata
        try {
            const exifr = await import("exifr");
            const metadata = await exifr.parse(input, {
                pick: [
                    "ImageWidth", "ImageHeight", "Make", "Model",
                    "DateTimeOriginal", "CreateDate", "GPSLatitude", "GPSLongitude",
                    "Artist", "Copyright", "Description", "Title",
                    "Keywords", "Software", "ExposureTime", "FNumber",
                    "ISO", "FocalLength",
                ],
            });
            if (metadata && Object.keys(metadata).length > 0) {
                sections.push("## Metadata\n");
                if (metadata.ImageWidth && metadata.ImageHeight) {
                    sections.push(`ImageSize: ${metadata.ImageWidth}x${metadata.ImageHeight}`);
                }
                // Map of human-readable field name -> formatted value; undefined
                // values are skipped by the loop below.
                const fields = {
                    Title: metadata.Title,
                    Description: metadata.Description || metadata.ImageDescription,
                    Keywords: Array.isArray(metadata.Keywords)
                        ? metadata.Keywords.join(", ")
                        : metadata.Keywords,
                    Artist: metadata.Artist,
                    Copyright: metadata.Copyright,
                    Camera: [metadata.Make, metadata.Model].filter(Boolean).join(" "),
                    DateTimeOriginal: metadata.DateTimeOriginal
                        ? String(metadata.DateTimeOriginal)
                        : undefined,
                    CreateDate: metadata.CreateDate
                        ? String(metadata.CreateDate)
                        : undefined,
                    GPS: metadata.GPSLatitude && metadata.GPSLongitude
                        ? `${metadata.GPSLatitude}, ${metadata.GPSLongitude}`
                        : undefined,
                    // BUGFIX: exposures of 1s or longer used to render as a bogus
                    // reciprocal (e.g. 2 -> "1/1s"); show long exposures directly.
                    ExposureTime: metadata.ExposureTime
                        ? (metadata.ExposureTime >= 1
                            ? `${metadata.ExposureTime}s`
                            : `1/${Math.round(1 / metadata.ExposureTime)}s`)
                        : undefined,
                    FNumber: metadata.FNumber ? `f/${metadata.FNumber}` : undefined,
                    ISO: metadata.ISO ? String(metadata.ISO) : undefined,
                    FocalLength: metadata.FocalLength
                        ? `${metadata.FocalLength}mm`
                        : undefined,
                    Software: metadata.Software,
                };
                for (const [key, value] of Object.entries(fields)) {
                    if (value)
                        sections.push(`${key}: ${value}`);
                }
            }
        }
        catch {
            // EXIF parsing failed — not all images have EXIF
        }
        // AI description
        if (options?.describe) {
            try {
                const mimetype = streamInfo.mimetype || guessMimetype(streamInfo.extension);
                const description = await options.describe(input, mimetype);
                if (description) {
                    sections.push(`\n## Description\n\n${description}`);
                }
            }
            catch {
                // Description failed — continue without it
            }
        }
        if (sections.length === 0) {
            return { markdown: `*[image: ${streamInfo.filename || "unknown"}]*` };
        }
        return { markdown: sections.join("\n").trim() };
    }
}
|
|
87
|
+
/**
 * Best-effort mimetype lookup from a file extension.
 * Unknown or missing extensions default to "image/png".
 */
function guessMimetype(ext) {
    switch (ext) {
        case ".jpg":
        case ".jpeg":
            return "image/jpeg";
        case ".png":
            return "image/png";
        case ".gif":
            return "image/gif";
        case ".webp":
            return "image/webp";
        case ".tiff":
        case ".tif":
            return "image/tiff";
        case ".bmp":
            return "image/bmp";
        case ".svg":
            return "image/svg+xml";
        default:
            return "image/png";
    }
}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import type { Converter, ConversionResult, StreamInfo } from "../types.js";
|
|
2
|
+
/**
 * Jupyter notebook (.ipynb) to Markdown converter.
 * Markdown cells pass through verbatim; code cells become fenced blocks
 * tagged with the kernel language, followed by their textual outputs.
 */
export declare class IpynbConverter implements Converter {
    /** Converter registry name ("ipynb"). */
    name: string;
    /** True when the stream has a `.ipynb` extension. */
    accepts(streamInfo: StreamInfo): boolean;
    /** Parses the notebook JSON and renders cells as Markdown. */
    convert(input: Buffer, _streamInfo: StreamInfo): Promise<ConversionResult>;
}
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
const EXTENSIONS = [".ipynb"];
/**
 * Converts Jupyter notebooks (.ipynb) to Markdown.
 *
 * Markdown cells are emitted as-is; code cells become fenced blocks in the
 * kernel's language, followed by a plain fenced block collecting textual
 * outputs (streams, text/plain results, error name/value). Raw cells are
 * emitted as untagged fenced blocks. The title is the notebook metadata
 * title, or else the first level-1 Markdown heading.
 */
export class IpynbConverter {
    name = "ipynb";
    /** Accepts only by the .ipynb extension. */
    accepts(streamInfo) {
        return Boolean(streamInfo.extension && EXTENSIONS.includes(streamInfo.extension));
    }
    async convert(input, _streamInfo) {
        const notebook = JSON.parse(new TextDecoder("utf-8").decode(input));
        // Notebook cell sources/outputs may be a string or a list of strings.
        const joined = (src) => (Array.isArray(src) ? src.join("") : src ?? "");
        // Kernel language for code fences; Python is the conventional default.
        const fenceLang = notebook.metadata?.kernelspec?.language ||
            notebook.metadata?.language_info?.name ||
            "python";
        const blocks = [];
        let title;
        for (const cell of notebook.cells ?? []) {
            const source = joined(cell.source);
            switch (cell.cell_type) {
                case "markdown": {
                    blocks.push(source);
                    // First "# …" heading seen becomes the fallback title.
                    if (!title) {
                        const heading = source.match(/^# (.+)$/m);
                        if (heading)
                            title = heading[1].trim();
                    }
                    break;
                }
                case "code": {
                    blocks.push(`\`\`\`${fenceLang}\n${source}\n\`\`\``);
                    // Collect the cell's textual outputs into one fenced block.
                    const collected = [];
                    for (const out of cell.outputs ?? []) {
                        if (out.output_type === "stream") {
                            const streamText = joined(out.text);
                            if (streamText.trim())
                                collected.push(streamText.trim());
                        }
                        else if (out.output_type === "execute_result" || out.output_type === "display_data") {
                            const plain = out.data?.["text/plain"];
                            if (plain) {
                                const plainText = Array.isArray(plain) ? plain.join("") : plain;
                                if (plainText.trim())
                                    collected.push(plainText.trim());
                            }
                        }
                        else if (out.output_type === "error") {
                            // Only report the error when a traceback is present.
                            if ((out.traceback ?? []).join("\n").trim())
                                collected.push(`Error: ${out.ename}: ${out.evalue}`);
                        }
                    }
                    if (collected.length > 0) {
                        blocks.push(`\`\`\`\n${collected.join("\n")}\n\`\`\``);
                    }
                    break;
                }
                case "raw":
                    blocks.push(`\`\`\`\n${source}\n\`\`\``);
                    break;
            }
        }
        return {
            markdown: blocks.join("\n\n").trim(),
            // Explicit metadata title wins over the extracted heading.
            title: notebook.metadata?.title ?? title,
        };
    }
}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import type { Converter, ConversionResult, StreamInfo } from "../types.js";
|
|
2
|
+
/**
 * JSON to Markdown converter: pretty-prints the document inside a
 * fenced ```json code block.
 */
export declare class JsonConverter implements Converter {
    /** Converter registry name ("json"). */
    name: string;
    /** True for `.json` files or `application/json` mimetypes. */
    accepts(streamInfo: StreamInfo): boolean;
    /** Parses and re-serializes the JSON with 2-space indentation. */
    convert(input: Buffer, _streamInfo: StreamInfo): Promise<ConversionResult>;
}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
const EXTENSIONS = [".json"];
const MIMETYPES = ["application/json"];
/**
 * Pretty-prints JSON input inside a fenced ```json code block.
 * Invalid JSON propagates the `JSON.parse` error to the caller.
 */
export class JsonConverter {
    name = "json";
    /** Accepts .json files, or any mimetype starting with application/json. */
    accepts(streamInfo) {
        const byExtension = Boolean(streamInfo.extension && EXTENSIONS.includes(streamInfo.extension));
        const byMimetype = Boolean(streamInfo.mimetype &&
            MIMETYPES.some((m) => streamInfo.mimetype.startsWith(m)));
        return byExtension || byMimetype;
    }
    /** Re-serializes the parsed document with 2-space indentation. */
    async convert(input, _streamInfo) {
        const document = JSON.parse(new TextDecoder("utf-8").decode(input));
        const pretty = JSON.stringify(document, null, 2);
        return { markdown: ["```json", pretty, "```"].join("\n") };
    }
}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import type { Converter, ConversionResult, StreamInfo } from "../types.js";
|
|
2
|
+
/**
 * PDF to Markdown converter backed by the optional `unpdf` package.
 * Extracted page text is joined with blank lines; no layout is preserved.
 */
export declare class PdfConverter implements Converter {
    /** Converter registry name ("pdf"). */
    name: string;
    /** True for `.pdf` files or PDF mimetypes. */
    accepts(streamInfo: StreamInfo): boolean;
    /**
     * Extracts the text content of the PDF.
     * Throws with install instructions when `unpdf` is not installed.
     */
    convert(input: Buffer, _streamInfo: StreamInfo): Promise<ConversionResult>;
}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
const EXTENSIONS = [".pdf"];
const MIMETYPES = ["application/pdf", "application/x-pdf"];
/**
 * PDF-to-Markdown converter backed by the optional `unpdf` package,
 * which is imported lazily so the dependency stays optional.
 */
export class PdfConverter {
    name = "pdf";
    /** Accepts .pdf files or PDF mimetypes. */
    accepts(streamInfo) {
        const { extension, mimetype } = streamInfo;
        if (extension && EXTENSIONS.includes(extension))
            return true;
        return Boolean(mimetype && MIMETYPES.some((m) => mimetype.startsWith(m)));
    }
    /**
     * Extracts plain text from the PDF.
     * Throws a descriptive error when `unpdf` is not installed.
     */
    async convert(input, _streamInfo) {
        let extractText;
        try {
            ({ extractText } = await import("unpdf"));
        }
        catch {
            throw new Error("PDF support requires 'unpdf'. Install it: npm install unpdf");
        }
        const extracted = await extractText(new Uint8Array(input));
        // unpdf may return one string or an array of per-page strings.
        const pages = extracted.text;
        const markdown = Array.isArray(pages) ? pages.join("\n\n") : String(pages);
        return { markdown: markdown.trim() };
    }
}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import type { Converter, ConversionResult, StreamInfo } from "../types.js";
|
|
2
|
+
/**
 * Fallback converter for plain-text and source-code files.
 * Markdown passes through unchanged; recognized code extensions are
 * wrapped in fenced blocks tagged with the extension.
 */
export declare class PlainTextConverter implements Converter {
    /** Converter registry name ("plain-text"). */
    name: string;
    /**
     * True for known text/code extensions, any `text/*` mimetype, or —
     * as a last resort — streams with neither extension nor mimetype.
     */
    accepts(streamInfo: StreamInfo): boolean;
    /** Decodes the bytes (honoring `streamInfo.charset`) and renders them. */
    convert(input: Buffer, streamInfo: StreamInfo): Promise<ConversionResult>;
}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
const TEXT_EXTENSIONS = [
    ".txt", ".md", ".markdown", ".rst", ".log", ".cfg", ".ini", ".yaml", ".yml",
    ".toml", ".xml", ".svg", ".env", ".sh", ".bash", ".zsh", ".fish",
    ".py", ".js", ".ts", ".jsx", ".tsx", ".go", ".rs", ".rb", ".java",
    ".c", ".cpp", ".h", ".hpp", ".cs", ".swift", ".kt", ".scala",
    ".sql", ".r", ".m", ".lua", ".pl", ".php", ".ex", ".exs",
    ".zig", ".nim", ".v", ".d", ".hs", ".ml", ".clj",
    ".makefile", ".dockerfile",
];
const TEXT_MIMETYPES = ["text/"];
/**
 * Fallback converter for plain-text and source-code files.
 * Markdown passes through unchanged; code files are wrapped in a fenced
 * block tagged with their extension; everything else is returned verbatim.
 */
export class PlainTextConverter {
    name = "plain-text";
    /**
     * Accepts known text/code extensions, any text/* mimetype, and —
     * as a last resort — streams with no extension and no mimetype.
     */
    accepts(streamInfo) {
        if (streamInfo.extension && TEXT_EXTENSIONS.includes(streamInfo.extension)) {
            return true;
        }
        if (streamInfo.mimetype &&
            TEXT_MIMETYPES.some((m) => streamInfo.mimetype.startsWith(m))) {
            return true;
        }
        // If nothing else matched and there's no extension, try to decode as text
        if (!streamInfo.extension && !streamInfo.mimetype) {
            return true;
        }
        return false;
    }
    /**
     * @param input Raw bytes of the text file.
     * @param streamInfo Extension/charset hints for decoding and fencing.
     */
    async convert(input, streamInfo) {
        // BUGFIX: the TextDecoder constructor throws a RangeError for unknown
        // charset labels; fall back to UTF-8 instead of failing the conversion.
        let decoder;
        try {
            decoder = new TextDecoder(streamInfo.charset || "utf-8");
        }
        catch {
            decoder = new TextDecoder("utf-8");
        }
        const text = decoder.decode(input);
        // If it's already markdown, return as-is
        if (streamInfo.extension === ".md" || streamInfo.extension === ".markdown") {
            return { markdown: text };
        }
        // For code files, wrap in a fenced code block
        const ext = streamInfo.extension?.slice(1);
        if (ext && !["txt", "log", "rst"].includes(ext)) {
            return { markdown: `\`\`\`${ext}\n${text}\n\`\`\`` };
        }
        return { markdown: text };
    }
}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import type { Converter, ConversionResult, StreamInfo } from "../types.js";
|
|
2
|
+
/**
 * PowerPoint (.pptx) to Markdown converter.
 * Reads the OOXML zip directly: slides are emitted in presentation order
 * with the first text shape as a `#` title, followed by body text, tables
 * (as Markdown tables), and speaker notes.
 */
export declare class PptxConverter implements Converter {
    /** Converter registry name ("pptx"). */
    name: string;
    /** True for `.pptx` files or the PresentationML mimetype. */
    accepts(streamInfo: StreamInfo): boolean;
    /** Unzips and parses the deck, rendering one section per slide. */
    convert(input: Buffer, _streamInfo: StreamInfo): Promise<ConversionResult>;
    /** Joins all text runs of one shape into newline-separated lines. */
    private extractText;
    /** Renders an `a:tbl` graphic frame as a Markdown table, if present. */
    private extractTable;
}
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
import JSZip from "jszip";
|
|
2
|
+
import { XMLParser } from "fast-xml-parser";
|
|
3
|
+
const EXTENSIONS = [".pptx"];
const MIMETYPES = [
    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
];
/**
 * Converts PowerPoint decks (.pptx) to Markdown by reading the OOXML zip
 * with JSZip and parsing slide XML with fast-xml-parser. Each slide becomes
 * a section headed by an HTML comment marker, with the first non-empty text
 * shape rendered as a `#` title, then body text, tables, and speaker notes.
 */
export class PptxConverter {
    name = "pptx";
    /** Accepts by .pptx extension or the PresentationML mimetype. */
    accepts(streamInfo) {
        if (streamInfo.extension && EXTENSIONS.includes(streamInfo.extension))
            return true;
        if (streamInfo.mimetype && MIMETYPES.some((m) => streamInfo.mimetype.startsWith(m)))
            return true;
        return false;
    }
    /**
     * @param input Raw .pptx bytes (a zip archive).
     * @returns `{ markdown }` — one blank-line-separated section per slide.
     * @throws When the archive lacks ppt/presentation.xml.
     */
    async convert(input, _streamInfo) {
        const zip = await JSZip.loadAsync(input);
        // Keep attributes (prefixed "@_") — slide ordering relies on r:id attrs.
        const parser = new XMLParser({
            ignoreAttributes: false,
            attributeNamePrefix: "@_",
            textNodeName: "#text",
        });
        // Get slide order from presentation.xml
        const presXml = await zip.file("ppt/presentation.xml")?.async("string");
        if (!presXml)
            throw new Error("Invalid PPTX: missing presentation.xml");
        const pres = parser.parse(presXml);
        // fast-xml-parser returns a single object (not an array) when there is
        // exactly one child element, hence the Array.isArray normalization here
        // and throughout this class.
        const sldIdList = pres["p:presentation"]?.["p:sldIdLst"]?.["p:sldId"];
        const sldIds = Array.isArray(sldIdList) ? sldIdList : sldIdList ? [sldIdList] : [];
        // Get relationship mappings (r:id -> part path, relative to ppt/)
        const relsXml = await zip.file("ppt/_rels/presentation.xml.rels")?.async("string");
        const rels = relsXml ? parser.parse(relsXml) : null;
        const relList = rels?.["Relationships"]?.["Relationship"];
        const relArray = Array.isArray(relList) ? relList : relList ? [relList] : [];
        const relMap = new Map();
        for (const r of relArray) {
            relMap.set(r["@_Id"], r["@_Target"]);
        }
        // Map slide IDs to file paths in order
        const slidePaths = [];
        for (const sld of sldIds) {
            const rId = sld["@_r:id"];
            const target = relMap.get(rId);
            if (target)
                slidePaths.push(`ppt/${target}`);
        }
        // If we couldn't resolve from rels, fall back to finding slide files
        // directly and sorting them by their numeric suffix (slide1, slide2, …).
        if (slidePaths.length === 0) {
            const slideFiles = Object.keys(zip.files)
                .filter((f) => /^ppt\/slides\/slide\d+\.xml$/.test(f))
                .sort((a, b) => {
                    const na = parseInt(a.match(/slide(\d+)/)?.[1] || "0");
                    const nb = parseInt(b.match(/slide(\d+)/)?.[1] || "0");
                    return na - nb;
                });
            slidePaths.push(...slideFiles);
        }
        const sections = [];
        for (let i = 0; i < slidePaths.length; i++) {
            const slideXml = await zip.file(slidePaths[i])?.async("string");
            if (!slideXml)
                continue;
            const slide = parser.parse(slideXml);
            const spTree = slide["p:sld"]?.["p:cSld"]?.["p:spTree"];
            if (!spTree)
                continue;
            const slideLines = [`<!-- Slide ${i + 1} -->`];
            const shapes = spTree["p:sp"];
            const shapeList = Array.isArray(shapes) ? shapes : shapes ? [shapes] : [];
            // Heuristic: the first shape with text is treated as the slide title.
            let isTitle = true;
            for (const shape of shapeList) {
                const text = this.extractText(shape);
                if (!text)
                    continue;
                if (isTitle) {
                    slideLines.push(`# ${text}`);
                    isTitle = false;
                }
                else {
                    slideLines.push(text);
                }
            }
            // Tables
            const graphicFrames = spTree["p:graphicFrame"];
            const gfList = Array.isArray(graphicFrames) ? graphicFrames : graphicFrames ? [graphicFrames] : [];
            for (const gf of gfList) {
                const table = this.extractTable(gf);
                if (table)
                    slideLines.push(table);
            }
            // Slide notes live at a parallel path: slides/slideN.xml ->
            // notesSlides/notesSlideN.xml.
            const noteFile = slidePaths[i].replace("slides/slide", "notesSlides/notesSlide");
            const noteXml = await zip.file(noteFile)?.async("string");
            if (noteXml) {
                const note = parser.parse(noteXml);
                const noteSpTree = note["p:notes"]?.["p:cSld"]?.["p:spTree"];
                if (noteSpTree) {
                    const noteShapes = noteSpTree["p:sp"];
                    const noteList = Array.isArray(noteShapes) ? noteShapes : noteShapes ? [noteShapes] : [];
                    const noteTexts = [];
                    for (const ns of noteList) {
                        // Skip slide image placeholder
                        const phType = ns["p:nvSpPr"]?.["p:nvPr"]?.["p:ph"]?.["@_type"];
                        if (phType === "sldImg")
                            continue;
                        const t = this.extractText(ns);
                        if (t)
                            noteTexts.push(t);
                    }
                    if (noteTexts.length > 0) {
                        slideLines.push("\n### Notes:");
                        slideLines.push(noteTexts.join("\n"));
                    }
                }
            }
            sections.push(slideLines.join("\n"));
        }
        return { markdown: sections.join("\n\n").trim() };
    }
    /**
     * Concatenates all text runs (a:r/a:t) of one shape's text body,
     * one line per paragraph (a:p). Returns "" for shapes without text.
     */
    extractText(shape) {
        const txBody = shape["p:txBody"];
        if (!txBody)
            return "";
        const paragraphs = txBody["a:p"];
        const pList = Array.isArray(paragraphs) ? paragraphs : paragraphs ? [paragraphs] : [];
        const lines = [];
        for (const p of pList) {
            const runs = p["a:r"];
            const rList = Array.isArray(runs) ? runs : runs ? [runs] : [];
            const parts = [];
            for (const r of rList) {
                const t = r["a:t"];
                // a:t may parse as a plain value or as an object with "#text"
                // when the element carried attributes.
                if (t != null)
                    parts.push(typeof t === "object" ? t["#text"] || "" : String(t));
            }
            if (parts.length > 0)
                lines.push(parts.join(""));
        }
        return lines.join("\n").trim();
    }
    /**
     * Renders a graphic frame's a:tbl as a Markdown table: first row is
     * the header, short body rows are padded with empty cells.
     * Returns null when the frame holds no table.
     */
    extractTable(gf) {
        const tbl = gf?.["a:graphic"]?.["a:graphicData"]?.["a:tbl"];
        if (!tbl)
            return null;
        const rows = tbl["a:tr"];
        const rowList = Array.isArray(rows) ? rows : rows ? [rows] : [];
        if (rowList.length === 0)
            return null;
        const mdRows = [];
        for (const row of rowList) {
            const cells = row["a:tc"];
            const cellList = Array.isArray(cells) ? cells : cells ? [cells] : [];
            const cellTexts = [];
            for (const cell of cellList) {
                const txBody = cell["a:txBody"];
                if (!txBody) {
                    cellTexts.push("");
                    continue;
                }
                const paragraphs = txBody["a:p"];
                const pList = Array.isArray(paragraphs) ? paragraphs : paragraphs ? [paragraphs] : [];
                const parts = [];
                for (const p of pList) {
                    const runs = p["a:r"];
                    const rList = Array.isArray(runs) ? runs : runs ? [runs] : [];
                    for (const r of rList) {
                        const t = r["a:t"];
                        if (t != null)
                            parts.push(typeof t === "object" ? t["#text"] || "" : String(t));
                    }
                }
                cellTexts.push(parts.join(" "));
            }
            mdRows.push(cellTexts);
        }
        if (mdRows.length === 0)
            return null;
        const [header, ...body] = mdRows;
        const lines = [];
        lines.push(`| ${header.join(" | ")} |`);
        lines.push(`| ${header.map(() => "---").join(" | ")} |`);
        for (const row of body) {
            // Pad ragged rows so every line has the header's column count.
            while (row.length < header.length)
                row.push("");
            lines.push(`| ${row.join(" | ")} |`);
        }
        return lines.join("\n");
    }
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import type { Converter, ConversionResult, StreamInfo } from "../types.js";
|
|
2
|
+
/**
 * RSS 2.0 / Atom feed to Markdown converter.
 * The channel/feed title becomes a `#` heading; each item/entry becomes a
 * `##` section with date, link, and HTML content converted via Turndown.
 */
export declare class RssConverter implements Converter {
    /** Converter registry name ("rss"). */
    name: string;
    /** True for .rss/.atom/.xml extensions or feed/XML mimetypes. */
    accepts(streamInfo: StreamInfo): boolean;
    /**
     * Detects RSS vs Atom from the markup and renders accordingly.
     * Throws when the input is neither (e.g. a generic .xml file).
     */
    convert(input: Buffer, _streamInfo: StreamInfo): Promise<ConversionResult>;
    /** Renders an RSS 2.0 channel and its items. */
    private parseRss;
    /** Renders an Atom feed and its entries. */
    private parseAtom;
    /** Unescapes embedded HTML and converts it to Markdown. */
    private htmlToMd;
    /** Regex-extracts the first `<tag>…</tag>` body, CDATA stripped. */
    private extract;
    /** Regex-extracts every `<tag>…</tag>` element, markup included. */
    private extractAll;
}
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
import TurndownService from "turndown";
|
|
2
|
+
const EXTENSIONS = [".rss", ".atom", ".xml"];
const MIMETYPES = [
    "application/rss+xml", "application/rss",
    "application/atom+xml", "application/atom",
    "text/xml", "application/xml",
];
/**
 * Converts RSS 2.0 and Atom feeds to Markdown using lightweight regex
 * extraction (no full XML parse). The channel/feed title becomes a `#`
 * heading and each item/entry a `##` section; embedded HTML content is
 * converted to Markdown with Turndown.
 */
export class RssConverter {
    name = "rss";
    /**
     * Accepts .rss/.atom directly, .xml speculatively (convert() throws if
     * it turns out not to be a feed), and feed/XML mimetypes.
     */
    accepts(streamInfo) {
        // Only accept known RSS/Atom extensions directly
        if (streamInfo.extension && [".rss", ".atom"].includes(streamInfo.extension))
            return true;
        // For .xml, we'll try and fail gracefully
        if (streamInfo.extension === ".xml")
            return true;
        if (streamInfo.mimetype && MIMETYPES.some((m) => streamInfo.mimetype.startsWith(m)))
            return true;
        return false;
    }
    /**
     * @param input Raw feed bytes (decoded as UTF-8).
     * @throws When the document is neither an RSS nor an Atom feed.
     */
    async convert(input, _streamInfo) {
        const text = new TextDecoder("utf-8").decode(input);
        // Detect feed type
        if (text.includes("<rss")) {
            return this.parseRss(text);
        }
        else if (text.includes("<feed")) {
            return this.parseAtom(text);
        }
        // Not a feed — fall through for XML as generic text
        throw new Error("Not an RSS or Atom feed");
    }
    /** Renders an RSS 2.0 document: channel header, then one section per item. */
    parseRss(xml) {
        const turndown = new TurndownService({ headingStyle: "atx" });
        const sections = [];
        // Extract from the <channel> block specifically, so the channel title
        // isn't confused with an item title.
        const channelMatch = xml.match(/<channel>([\s\S]*?)<\/channel>/i);
        const channelXml = channelMatch ? channelMatch[1] : xml;
        const channelTitle = this.extract(channelXml, "title");
        const channelDesc = this.extract(channelXml, "description");
        if (channelTitle)
            sections.push(`# ${channelTitle}`);
        if (channelDesc)
            sections.push(this.htmlToMd(channelDesc, turndown));
        // Extract items; <content:encoded> (full content) wins over <description>.
        const items = this.extractAll(xml, "item");
        for (const item of items) {
            const title = this.extract(item, "title");
            const pubDate = this.extract(item, "pubDate");
            const description = this.extract(item, "description");
            const content = this.extract(item, "content:encoded");
            const link = this.extract(item, "link");
            const parts = [];
            if (title)
                parts.push(`## ${title}`);
            if (pubDate)
                parts.push(`Published: ${pubDate}`);
            if (link)
                parts.push(`[Link](${link})`);
            if (content) {
                parts.push(this.htmlToMd(content, turndown));
            }
            else if (description) {
                parts.push(this.htmlToMd(description, turndown));
            }
            if (parts.length > 0)
                sections.push(parts.join("\n"));
        }
        return { markdown: sections.join("\n\n").trim(), title: channelTitle };
    }
    /** Renders an Atom feed: feed header, then one section per entry. */
    parseAtom(xml) {
        const turndown = new TurndownService({ headingStyle: "atx" });
        const sections = [];
        const feedTitle = this.extract(xml, "title");
        const subtitle = this.extract(xml, "subtitle");
        if (feedTitle)
            sections.push(`# ${feedTitle}`);
        if (subtitle)
            sections.push(subtitle);
        const entries = this.extractAll(xml, "entry");
        for (const entry of entries) {
            const title = this.extract(entry, "title");
            const updated = this.extract(entry, "updated");
            const summary = this.extract(entry, "summary");
            const content = this.extract(entry, "content");
            const parts = [];
            if (title)
                parts.push(`## ${title}`);
            if (updated)
                parts.push(`Updated: ${updated}`);
            if (content) {
                parts.push(this.htmlToMd(content, turndown));
            }
            else if (summary) {
                parts.push(this.htmlToMd(summary, turndown));
            }
            if (parts.length > 0)
                sections.push(parts.join("\n"));
        }
        return { markdown: sections.join("\n\n").trim(), title: feedTitle };
    }
    /**
     * Converts an HTML fragment (as found in RSS descriptions) to Markdown.
     * Strips CDATA wrappers and decodes common XML entities first.
     */
    htmlToMd(html, turndown) {
        // BUGFIX: the entity decoding was previously a set of identity
        // replacements, so escaped markup was never unescaped. Decode the
        // common entities; &amp; must be decoded LAST so that a literal
        // "&amp;lt;" yields the text "&lt;" rather than "<".
        const unescaped = html
            .replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, "$1")
            .replace(/&lt;/g, "<")
            .replace(/&gt;/g, ">")
            .replace(/&quot;/g, "\"")
            .replace(/&#39;/g, "'")
            .replace(/&amp;/g, "&");
        // If it looks like HTML, convert it
        if (unescaped.includes("<")) {
            return turndown.turndown(unescaped).trim();
        }
        return unescaped.trim();
    }
    /**
     * Returns the trimmed body of the first `<tag …>…</tag>` element,
     * with any CDATA wrapper removed, or undefined when absent/empty.
     */
    extract(xml, tag) {
        // Handle both <tag>content</tag> and <tag><![CDATA[content]]></tag>
        const re = new RegExp(`<${tag}[^>]*>([\\s\\S]*?)</${tag}>`, "i");
        const match = xml.match(re);
        if (!match)
            return undefined;
        return match[1]
            .replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, "$1")
            .trim() || undefined;
    }
    /** Returns every `<tag …>…</tag>` element (markup included), in order. */
    extractAll(xml, tag) {
        const results = [];
        const re = new RegExp(`<${tag}[^>]*>[\\s\\S]*?</${tag}>`, "gi");
        let match;
        while ((match = re.exec(xml)) !== null) {
            results.push(match[0]);
        }
        return results;
    }
}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import type { Converter, ConversionResult, StreamInfo } from "../types.js";
|
|
2
|
+
/**
 * Wikipedia content converter.
 * NOTE(review): implementation (wikipedia.js) is not visible in this chunk;
 * presumably it converts fetched Wikipedia article HTML — confirm against
 * dist/converters/wikipedia.js before relying on specifics.
 */
export declare class WikipediaConverter implements Converter {
    /** Converter registry name. */
    name: string;
    /** Stream-matching predicate; criteria defined in wikipedia.js. */
    accepts(streamInfo: StreamInfo): boolean;
    /** Converts the input stream to Markdown. */
    convert(input: Buffer, streamInfo: StreamInfo): Promise<ConversionResult>;
}
|