markit-ai 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. package/LICENSE +21 -0
  2. package/README.md +333 -0
  3. package/dist/commands/config.d.ts +4 -0
  4. package/dist/commands/config.js +133 -0
  5. package/dist/commands/convert.d.ts +5 -0
  6. package/dist/commands/convert.js +110 -0
  7. package/dist/commands/formats.d.ts +2 -0
  8. package/dist/commands/formats.js +56 -0
  9. package/dist/commands/init.d.ts +2 -0
  10. package/dist/commands/init.js +29 -0
  11. package/dist/commands/onboard.d.ts +2 -0
  12. package/dist/commands/onboard.js +61 -0
  13. package/dist/commands/plugin.d.ts +4 -0
  14. package/dist/commands/plugin.js +58 -0
  15. package/dist/config.d.ts +26 -0
  16. package/dist/config.js +42 -0
  17. package/dist/converters/audio.d.ts +7 -0
  18. package/dist/converters/audio.js +87 -0
  19. package/dist/converters/csv.d.ts +7 -0
  20. package/dist/converters/csv.js +83 -0
  21. package/dist/converters/docx.d.ts +6 -0
  22. package/dist/converters/docx.js +28 -0
  23. package/dist/converters/epub.d.ts +8 -0
  24. package/dist/converters/epub.js +110 -0
  25. package/dist/converters/html.d.ts +6 -0
  26. package/dist/converters/html.js +33 -0
  27. package/dist/converters/image.d.ts +6 -0
  28. package/dist/converters/image.js +94 -0
  29. package/dist/converters/ipynb.d.ts +6 -0
  30. package/dist/converters/ipynb.js +72 -0
  31. package/dist/converters/json.d.ts +6 -0
  32. package/dist/converters/json.js +21 -0
  33. package/dist/converters/pdf.d.ts +6 -0
  34. package/dist/converters/pdf.js +29 -0
  35. package/dist/converters/plain-text.d.ts +6 -0
  36. package/dist/converters/plain-text.js +41 -0
  37. package/dist/converters/pptx.d.ts +8 -0
  38. package/dist/converters/pptx.js +189 -0
  39. package/dist/converters/rss.d.ts +11 -0
  40. package/dist/converters/rss.js +134 -0
  41. package/dist/converters/wikipedia.d.ts +6 -0
  42. package/dist/converters/wikipedia.js +35 -0
  43. package/dist/converters/xlsx.d.ts +8 -0
  44. package/dist/converters/xlsx.js +139 -0
  45. package/dist/converters/xml.d.ts +6 -0
  46. package/dist/converters/xml.js +17 -0
  47. package/dist/converters/yaml.d.ts +6 -0
  48. package/dist/converters/yaml.js +16 -0
  49. package/dist/converters/zip.d.ts +8 -0
  50. package/dist/converters/zip.js +56 -0
  51. package/dist/index.d.ts +28 -0
  52. package/dist/index.js +24 -0
  53. package/dist/llm.d.ts +10 -0
  54. package/dist/llm.js +139 -0
  55. package/dist/main.d.ts +2 -0
  56. package/dist/main.js +182 -0
  57. package/dist/markit.d.ts +19 -0
  58. package/dist/markit.js +124 -0
  59. package/dist/mill.d.ts +18 -0
  60. package/dist/mill.js +123 -0
  61. package/dist/plugins/api.d.ts +7 -0
  62. package/dist/plugins/api.js +44 -0
  63. package/dist/plugins/index.d.ts +4 -0
  64. package/dist/plugins/index.js +3 -0
  65. package/dist/plugins/installer.d.ts +25 -0
  66. package/dist/plugins/installer.js +176 -0
  67. package/dist/plugins/loader.d.ts +6 -0
  68. package/dist/plugins/loader.js +61 -0
  69. package/dist/plugins/types.d.ts +25 -0
  70. package/dist/plugins/types.js +1 -0
  71. package/dist/providers/anthropic.d.ts +2 -0
  72. package/dist/providers/anthropic.js +47 -0
  73. package/dist/providers/index.d.ts +21 -0
  74. package/dist/providers/index.js +58 -0
  75. package/dist/providers/openai.d.ts +2 -0
  76. package/dist/providers/openai.js +65 -0
  77. package/dist/providers/types.d.ts +26 -0
  78. package/dist/providers/types.js +1 -0
  79. package/dist/types.d.ts +28 -0
  80. package/dist/types.js +1 -0
  81. package/dist/utils/exit-codes.d.ts +4 -0
  82. package/dist/utils/exit-codes.js +4 -0
  83. package/dist/utils/output.d.ts +22 -0
  84. package/dist/utils/output.js +31 -0
  85. package/package.json +70 -0
@@ -0,0 +1,94 @@
1
const EXTENSIONS = [".jpg", ".jpeg", ".png", ".gif", ".webp", ".tiff", ".tif", ".bmp", ".svg"];
const MIMETYPES = ["image/"];
/**
 * Converts images to Markdown. Output is assembled from EXIF metadata (when
 * present) and an optional AI-generated description supplied by
 * `options.describe`. Both steps are best-effort: any failure degrades to a
 * `*[image: name]*` placeholder rather than throwing.
 */
export class ImageConverter {
    name = "image";
    /** Accept by known image extension or any `image/*` mimetype prefix. */
    accepts(streamInfo) {
        if (streamInfo.extension && EXTENSIONS.includes(streamInfo.extension))
            return true;
        if (streamInfo.mimetype && MIMETYPES.some((m) => streamInfo.mimetype.startsWith(m)))
            return true;
        return false;
    }
    /**
     * @param input Raw image bytes.
     * @param streamInfo Filename/extension/mimetype hints.
     * @param options Optional `{ describe(buffer, mimetype) }` caption hook.
     * @returns `{ markdown }` with Metadata/Description sections, or a placeholder.
     */
    async convert(input, streamInfo, options) {
        const sections = [];
        // Extract EXIF metadata — exifr is an optional dependency and not all
        // images carry EXIF, so failures here are swallowed deliberately.
        try {
            const exifr = await import("exifr");
            const metadata = await exifr.parse(input, {
                pick: [
                    "ImageWidth", "ImageHeight", "Make", "Model",
                    "DateTimeOriginal", "CreateDate", "GPSLatitude", "GPSLongitude",
                    // FIX: "ImageDescription" was read as a fallback below but was
                    // missing from this pick list, so that fallback could never fire.
                    "Artist", "Copyright", "Description", "ImageDescription", "Title",
                    "Keywords", "Software", "ExposureTime", "FNumber",
                    "ISO", "FocalLength",
                ],
            });
            if (metadata && Object.keys(metadata).length > 0) {
                sections.push("## Metadata\n");
                if (metadata.ImageWidth && metadata.ImageHeight) {
                    sections.push(`ImageSize: ${metadata.ImageWidth}x${metadata.ImageHeight}`);
                }
                const fields = {
                    Title: metadata.Title,
                    Description: metadata.Description || metadata.ImageDescription,
                    Keywords: Array.isArray(metadata.Keywords)
                        ? metadata.Keywords.join(", ")
                        : metadata.Keywords,
                    Artist: metadata.Artist,
                    Copyright: metadata.Copyright,
                    Camera: [metadata.Make, metadata.Model].filter(Boolean).join(" "),
                    DateTimeOriginal: metadata.DateTimeOriginal
                        ? String(metadata.DateTimeOriginal)
                        : undefined,
                    CreateDate: metadata.CreateDate
                        ? String(metadata.CreateDate)
                        : undefined,
                    GPS: metadata.GPSLatitude && metadata.GPSLongitude
                        ? `${metadata.GPSLatitude}, ${metadata.GPSLongitude}`
                        : undefined,
                    // FIX: exposures of 1s or longer were rendered as a bogus
                    // reciprocal (e.g. 2s -> "1/1s"); show them directly instead.
                    ExposureTime: metadata.ExposureTime
                        ? (metadata.ExposureTime >= 1
                            ? `${metadata.ExposureTime}s`
                            : `1/${Math.round(1 / metadata.ExposureTime)}s`)
                        : undefined,
                    FNumber: metadata.FNumber ? `f/${metadata.FNumber}` : undefined,
                    ISO: metadata.ISO ? String(metadata.ISO) : undefined,
                    FocalLength: metadata.FocalLength
                        ? `${metadata.FocalLength}mm`
                        : undefined,
                    Software: metadata.Software,
                };
                for (const [key, value] of Object.entries(fields)) {
                    if (value)
                        sections.push(`${key}: ${value}`);
                }
            }
        }
        catch {
            // EXIF parsing failed — not all images have EXIF
        }
        // AI description — optional, caller-provided hook.
        if (options?.describe) {
            try {
                const mimetype = streamInfo.mimetype || guessMimetype(streamInfo.extension);
                const description = await options.describe(input, mimetype);
                if (description) {
                    sections.push(`\n## Description\n\n${description}`);
                }
            }
            catch {
                // Description failed — continue without it
            }
        }
        if (sections.length === 0) {
            return { markdown: `*[image: ${streamInfo.filename || "unknown"}]*` };
        }
        return { markdown: sections.join("\n").trim() };
    }
}
/** Map a file extension to a MIME type; defaults to image/png when unknown. */
function guessMimetype(ext) {
    const map = {
        ".jpg": "image/jpeg", ".jpeg": "image/jpeg", ".png": "image/png",
        ".gif": "image/gif", ".webp": "image/webp", ".tiff": "image/tiff",
        ".tif": "image/tiff", ".bmp": "image/bmp", ".svg": "image/svg+xml",
    };
    return map[ext || ""] || "image/png";
}
@@ -0,0 +1,6 @@
1
import type { Converter, ConversionResult, StreamInfo } from "../types.js";
/**
 * Converter for Jupyter notebooks (`.ipynb`): markdown cells pass through,
 * code cells become fenced blocks tagged with the kernel language, and text
 * outputs are appended as plain fenced blocks.
 */
export declare class IpynbConverter implements Converter {
    /** Converter identifier ("ipynb"). */
    name: string;
    /** True when the stream's extension is ".ipynb". */
    accepts(streamInfo: StreamInfo): boolean;
    /** Parse the notebook JSON and render it as Markdown. */
    convert(input: Buffer, _streamInfo: StreamInfo): Promise<ConversionResult>;
}
@@ -0,0 +1,72 @@
1
const EXTENSIONS = [".ipynb"];
/**
 * Converts Jupyter notebooks to Markdown. Markdown cells are passed through
 * verbatim, code cells become fenced blocks tagged with the kernel language,
 * textual outputs (stream / text-plain / error summaries) follow each code
 * cell, and raw cells are emitted as untagged fenced blocks.
 */
export class IpynbConverter {
    name = "ipynb";
    /** Accept streams by the .ipynb extension only. */
    accepts(streamInfo) {
        if (streamInfo.extension && EXTENSIONS.includes(streamInfo.extension))
            return true;
        return false;
    }
    /**
     * @param input Raw notebook bytes (JSON); throws if the JSON is invalid.
     * @returns `{ markdown, title }` — title comes from notebook metadata or,
     *          failing that, the first `# ` heading found in a markdown cell.
     */
    async convert(input, _streamInfo) {
        const text = new TextDecoder("utf-8").decode(input);
        const notebook = JSON.parse(text);
        // The kernel language is notebook-wide — resolve it once, not per cell.
        const lang = notebook.metadata?.kernelspec?.language ||
            notebook.metadata?.language_info?.name ||
            "python";
        const sections = [];
        let title;
        for (const cell of notebook.cells ?? []) {
            const source = Array.isArray(cell.source)
                ? cell.source.join("")
                : cell.source ?? "";
            if (cell.cell_type === "markdown") {
                sections.push(source);
                // Extract first heading as title
                if (!title) {
                    const match = source.match(/^# (.+)$/m);
                    if (match)
                        title = match[1].trim();
                }
            }
            else if (cell.cell_type === "code") {
                sections.push(`\`\`\`${lang}\n${source}\n\`\`\``);
                // Include text outputs
                const outputs = [];
                for (const out of cell.outputs ?? []) {
                    if (out.output_type === "stream") {
                        const streamText = Array.isArray(out.text) ? out.text.join("") : out.text ?? "";
                        if (streamText.trim())
                            outputs.push(streamText.trim());
                    }
                    else if (out.output_type === "execute_result" || out.output_type === "display_data") {
                        const data = out.data;
                        if (data?.["text/plain"]) {
                            const plain = Array.isArray(data["text/plain"])
                                ? data["text/plain"].join("")
                                : data["text/plain"];
                            if (plain.trim())
                                outputs.push(plain.trim());
                        }
                    }
                    else if (out.output_type === "error") {
                        // FIX: the error line was previously emitted only when the
                        // traceback was non-empty, silently dropping errors that
                        // carry just ename/evalue. Gate on those fields instead.
                        if (out.ename || out.evalue)
                            outputs.push(`Error: ${out.ename}: ${out.evalue}`);
                    }
                }
                if (outputs.length > 0) {
                    sections.push(`\`\`\`\n${outputs.join("\n")}\n\`\`\``);
                }
            }
            else if (cell.cell_type === "raw") {
                sections.push(`\`\`\`\n${source}\n\`\`\``);
            }
        }
        // A notebook-level metadata title wins over the first-heading fallback.
        title = notebook.metadata?.title ?? title;
        return {
            markdown: sections.join("\n\n").trim(),
            title,
        };
    }
}
@@ -0,0 +1,6 @@
1
import type { Converter, ConversionResult, StreamInfo } from "../types.js";
/**
 * Converter for JSON documents: pretty-prints the parsed document inside a
 * fenced ```json Markdown block.
 */
export declare class JsonConverter implements Converter {
    /** Converter identifier ("json"). */
    name: string;
    /** True for the .json extension or an application/json mimetype prefix. */
    accepts(streamInfo: StreamInfo): boolean;
    /** Parse and re-serialize the JSON with 2-space indentation. */
    convert(input: Buffer, _streamInfo: StreamInfo): Promise<ConversionResult>;
}
@@ -0,0 +1,21 @@
1
const EXTENSIONS = [".json"];
const MIMETYPES = ["application/json"];
/**
 * Converts JSON documents to Markdown by pretty-printing them inside a
 * fenced ```json code block. Invalid JSON propagates as a parse error.
 */
export class JsonConverter {
    name = "json";
    /** Accept by .json extension or an application/json mimetype prefix. */
    accepts(streamInfo) {
        const byExtension = Boolean(streamInfo.extension && EXTENSIONS.includes(streamInfo.extension));
        const byMimetype = Boolean(streamInfo.mimetype &&
            MIMETYPES.some((m) => streamInfo.mimetype.startsWith(m)));
        return byExtension || byMimetype;
    }
    /** Decode, re-parse, and re-serialize with 2-space indentation. */
    async convert(input, _streamInfo) {
        const decoded = new TextDecoder("utf-8").decode(input);
        const pretty = JSON.stringify(JSON.parse(decoded), null, 2);
        return { markdown: `\`\`\`json\n${pretty}\n\`\`\`` };
    }
}
@@ -0,0 +1,6 @@
1
import type { Converter, ConversionResult, StreamInfo } from "../types.js";
/**
 * Converter for PDF documents. Text extraction is delegated to the optional
 * `unpdf` package; convert() throws with an install hint when it is missing.
 */
export declare class PdfConverter implements Converter {
    /** Converter identifier ("pdf"). */
    name: string;
    /** True for the .pdf extension or a PDF mimetype prefix. */
    accepts(streamInfo: StreamInfo): boolean;
    /** Extract the PDF's text and return it as trimmed Markdown. */
    convert(input: Buffer, _streamInfo: StreamInfo): Promise<ConversionResult>;
}
@@ -0,0 +1,29 @@
1
const EXTENSIONS = [".pdf"];
const MIMETYPES = ["application/pdf", "application/x-pdf"];
/**
 * Extracts plain text from PDF documents via the optional `unpdf` dependency.
 * When `unpdf` is not installed, convert() throws with an install hint.
 */
export class PdfConverter {
    name = "pdf";
    /** Accept by .pdf extension or a PDF mimetype prefix. */
    accepts(streamInfo) {
        const { extension, mimetype } = streamInfo;
        if (extension && EXTENSIONS.includes(extension))
            return true;
        return Boolean(mimetype && MIMETYPES.some((m) => mimetype.startsWith(m)));
    }
    /**
     * @param input Raw PDF bytes.
     * @throws Error with an install hint when `unpdf` cannot be imported.
     */
    async convert(input, _streamInfo) {
        let extractText;
        try {
            ({ extractText } = await import("unpdf"));
        }
        catch {
            throw new Error("PDF support requires 'unpdf'. Install it: npm install unpdf");
        }
        const result = await extractText(new Uint8Array(input));
        // unpdf may return either a single string or one string per page.
        const pages = result.text;
        const joined = Array.isArray(pages) ? pages.join("\n\n") : String(pages);
        return { markdown: joined.trim() };
    }
}
@@ -0,0 +1,6 @@
1
import type { Converter, ConversionResult, StreamInfo } from "../types.js";
/**
 * Fallback converter for plain-text and source-code files. Markdown input is
 * passed through unchanged; recognized code extensions are wrapped in a
 * fenced block tagged with the extension.
 */
export declare class PlainTextConverter implements Converter {
    /** Converter identifier ("plain-text"). */
    name: string;
    /** True for known text extensions, text/* mimetypes, or untyped streams. */
    accepts(streamInfo: StreamInfo): boolean;
    /** Decode the bytes (honoring streamInfo.charset) and render Markdown. */
    convert(input: Buffer, streamInfo: StreamInfo): Promise<ConversionResult>;
}
@@ -0,0 +1,41 @@
1
const TEXT_EXTENSIONS = [
    ".txt", ".md", ".markdown", ".rst", ".log", ".cfg", ".ini", ".yaml", ".yml",
    ".toml", ".xml", ".svg", ".env", ".sh", ".bash", ".zsh", ".fish",
    ".py", ".js", ".ts", ".jsx", ".tsx", ".go", ".rs", ".rb", ".java",
    ".c", ".cpp", ".h", ".hpp", ".cs", ".swift", ".kt", ".scala",
    ".sql", ".r", ".m", ".lua", ".pl", ".php", ".ex", ".exs",
    ".zig", ".nim", ".v", ".d", ".hs", ".ml", ".clj",
    ".makefile", ".dockerfile",
];
const TEXT_MIMETYPES = ["text/"];
/**
 * Converts plain-text and source-code files to Markdown. Markdown files pass
 * through unchanged, recognized code files are wrapped in a fenced block
 * tagged with their extension, and everything else is returned verbatim.
 */
export class PlainTextConverter {
    name = "plain-text";
    /**
     * Accept known text extensions, any `text/*` mimetype, or — as a last
     * resort — streams that carry neither extension nor mimetype.
     */
    accepts(streamInfo) {
        if (streamInfo.extension && TEXT_EXTENSIONS.includes(streamInfo.extension)) {
            return true;
        }
        if (streamInfo.mimetype &&
            TEXT_MIMETYPES.some((m) => streamInfo.mimetype.startsWith(m))) {
            return true;
        }
        // If nothing else matched and there's no extension, try to decode as text
        if (!streamInfo.extension && !streamInfo.mimetype) {
            return true;
        }
        return false;
    }
    /**
     * @param input Raw bytes.
     * @param streamInfo Provides the optional charset and extension hints.
     * @returns `{ markdown }` — pass-through, fenced code block, or raw text.
     */
    async convert(input, streamInfo) {
        const charset = streamInfo.charset || "utf-8";
        let decoder;
        try {
            decoder = new TextDecoder(charset);
        }
        catch {
            // FIX: an unrecognized charset label made the TextDecoder constructor
            // throw a RangeError and fail the whole conversion; fall back to
            // utf-8 so the stream is still converted on a best-effort basis.
            decoder = new TextDecoder("utf-8");
        }
        const text = decoder.decode(input);
        // If it's already markdown, return as-is
        if (streamInfo.extension === ".md" || streamInfo.extension === ".markdown") {
            return { markdown: text };
        }
        // For code files, wrap in a fenced code block
        const ext = streamInfo.extension?.slice(1);
        if (ext && !["txt", "log", "rst"].includes(ext)) {
            return { markdown: `\`\`\`${ext}\n${text}\n\`\`\`` };
        }
        return { markdown: text };
    }
}
@@ -0,0 +1,8 @@
1
import type { Converter, ConversionResult, StreamInfo } from "../types.js";
/**
 * Converter for PowerPoint `.pptx` decks: emits one Markdown section per
 * slide containing shape text (first non-empty shape rendered as a title),
 * tables, and speaker notes.
 */
export declare class PptxConverter implements Converter {
    /** Converter identifier ("pptx"). */
    name: string;
    /** True for the .pptx extension or the OOXML presentation mimetype. */
    accepts(streamInfo: StreamInfo): boolean;
    /** Unzip the deck, walk slides in presentation order, render Markdown. */
    convert(input: Buffer, _streamInfo: StreamInfo): Promise<ConversionResult>;
    /** Collect the text runs of a single shape. */
    private extractText;
    /** Render a graphicFrame table as a Markdown table, or null. */
    private extractTable;
}
@@ -0,0 +1,189 @@
1
+ import JSZip from "jszip";
2
+ import { XMLParser } from "fast-xml-parser";
3
const EXTENSIONS = [".pptx"];
const MIMETYPES = [
    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
];
// Converts PowerPoint .pptx decks to Markdown. Each slide becomes one section:
// an HTML comment marker, the first non-empty shape rendered as a "#" title,
// remaining shape text, any tables, and the slide's speaker notes.
// Relies on JSZip and fast-xml-parser's XMLParser, imported at the top of file.
export class PptxConverter {
    name = "pptx";
    // Accept by .pptx extension or the OOXML presentation mimetype prefix.
    accepts(streamInfo) {
        if (streamInfo.extension && EXTENSIONS.includes(streamInfo.extension))
            return true;
        if (streamInfo.mimetype && MIMETYPES.some((m) => streamInfo.mimetype.startsWith(m)))
            return true;
        return false;
    }
    // Unzip the deck, resolve slide order, and render each slide to Markdown.
    async convert(input, _streamInfo) {
        const zip = await JSZip.loadAsync(input);
        // fast-xml-parser config: keep attributes (prefixed "@_") and expose
        // mixed-content text nodes under "#text".
        const parser = new XMLParser({
            ignoreAttributes: false,
            attributeNamePrefix: "@_",
            textNodeName: "#text",
        });
        // Get slide order from presentation.xml
        const presXml = await zip.file("ppt/presentation.xml")?.async("string");
        if (!presXml)
            throw new Error("Invalid PPTX: missing presentation.xml");
        const pres = parser.parse(presXml);
        const sldIdList = pres["p:presentation"]?.["p:sldIdLst"]?.["p:sldId"];
        // fast-xml-parser returns a bare object for single children — normalize
        // to an array (same pattern is used throughout this class).
        const sldIds = Array.isArray(sldIdList) ? sldIdList : sldIdList ? [sldIdList] : [];
        // Get relationship mappings (rId -> target path) from the .rels part.
        const relsXml = await zip.file("ppt/_rels/presentation.xml.rels")?.async("string");
        const rels = relsXml ? parser.parse(relsXml) : null;
        const relList = rels?.["Relationships"]?.["Relationship"];
        const relArray = Array.isArray(relList) ? relList : relList ? [relList] : [];
        const relMap = new Map();
        for (const r of relArray) {
            relMap.set(r["@_Id"], r["@_Target"]);
        }
        // Map slide IDs to file paths in order
        // NOTE(review): assumes each relationship Target is relative to ppt/
        // (e.g. "slides/slide1.xml") — confirm behavior for "../" or absolute targets.
        const slidePaths = [];
        for (const sld of sldIds) {
            const rId = sld["@_r:id"];
            const target = relMap.get(rId);
            if (target)
                slidePaths.push(`ppt/${target}`);
        }
        // If we couldn't resolve from rels, fall back to finding slide files
        // directly, sorted numerically so slide2 precedes slide10.
        if (slidePaths.length === 0) {
            const slideFiles = Object.keys(zip.files)
                .filter((f) => /^ppt\/slides\/slide\d+\.xml$/.test(f))
                .sort((a, b) => {
                const na = parseInt(a.match(/slide(\d+)/)?.[1] || "0");
                const nb = parseInt(b.match(/slide(\d+)/)?.[1] || "0");
                return na - nb;
            });
            slidePaths.push(...slideFiles);
        }
        const sections = [];
        for (let i = 0; i < slidePaths.length; i++) {
            const slideXml = await zip.file(slidePaths[i])?.async("string");
            if (!slideXml)
                continue;
            const slide = parser.parse(slideXml);
            const spTree = slide["p:sld"]?.["p:cSld"]?.["p:spTree"];
            if (!spTree)
                continue;
            const slideLines = [`<!-- Slide ${i + 1} -->`];
            const shapes = spTree["p:sp"];
            const shapeList = Array.isArray(shapes) ? shapes : shapes ? [shapes] : [];
            // Heuristic: the first shape that yields any text is treated as the title.
            let isTitle = true;
            for (const shape of shapeList) {
                const text = this.extractText(shape);
                if (!text)
                    continue;
                if (isTitle) {
                    slideLines.push(`# ${text}`);
                    isTitle = false;
                }
                else {
                    slideLines.push(text);
                }
            }
            // Tables live inside p:graphicFrame elements.
            const graphicFrames = spTree["p:graphicFrame"];
            const gfList = Array.isArray(graphicFrames) ? graphicFrames : graphicFrames ? [graphicFrames] : [];
            for (const gf of gfList) {
                const table = this.extractTable(gf);
                if (table)
                    slideLines.push(table);
            }
            // Slide notes — notesSlideN.xml mirrors slideN.xml by naming convention.
            const noteFile = slidePaths[i].replace("slides/slide", "notesSlides/notesSlide");
            const noteXml = await zip.file(noteFile)?.async("string");
            if (noteXml) {
                const note = parser.parse(noteXml);
                const noteSpTree = note["p:notes"]?.["p:cSld"]?.["p:spTree"];
                if (noteSpTree) {
                    const noteShapes = noteSpTree["p:sp"];
                    const noteList = Array.isArray(noteShapes) ? noteShapes : noteShapes ? [noteShapes] : [];
                    const noteTexts = [];
                    for (const ns of noteList) {
                        // Skip slide image placeholder
                        const phType = ns["p:nvSpPr"]?.["p:nvPr"]?.["p:ph"]?.["@_type"];
                        if (phType === "sldImg")
                            continue;
                        const t = this.extractText(ns);
                        if (t)
                            noteTexts.push(t);
                    }
                    if (noteTexts.length > 0) {
                        slideLines.push("\n### Notes:");
                        slideLines.push(noteTexts.join("\n"));
                    }
                }
            }
            sections.push(slideLines.join("\n"));
        }
        return { markdown: sections.join("\n\n").trim() };
    }
    // Collect all text runs (a:r/a:t) of a shape's text body, one output line
    // per paragraph (a:p). Returns "" when the shape has no text body.
    extractText(shape) {
        const txBody = shape["p:txBody"];
        if (!txBody)
            return "";
        const paragraphs = txBody["a:p"];
        const pList = Array.isArray(paragraphs) ? paragraphs : paragraphs ? [paragraphs] : [];
        const lines = [];
        for (const p of pList) {
            const runs = p["a:r"];
            const rList = Array.isArray(runs) ? runs : runs ? [runs] : [];
            const parts = [];
            for (const r of rList) {
                const t = r["a:t"];
                // a:t may parse as a plain string or as an object holding "#text"
                // (when the element carries attributes such as xml:space).
                if (t != null)
                    parts.push(typeof t === "object" ? t["#text"] || "" : String(t));
            }
            if (parts.length > 0)
                lines.push(parts.join(""));
        }
        return lines.join("\n").trim();
    }
    // Render a graphicFrame's a:tbl as a Markdown table: first row becomes the
    // header, shorter body rows are padded to header width. Null when absent.
    extractTable(gf) {
        const tbl = gf?.["a:graphic"]?.["a:graphicData"]?.["a:tbl"];
        if (!tbl)
            return null;
        const rows = tbl["a:tr"];
        const rowList = Array.isArray(rows) ? rows : rows ? [rows] : [];
        if (rowList.length === 0)
            return null;
        const mdRows = [];
        for (const row of rowList) {
            const cells = row["a:tc"];
            const cellList = Array.isArray(cells) ? cells : cells ? [cells] : [];
            const cellTexts = [];
            for (const cell of cellList) {
                const txBody = cell["a:txBody"];
                if (!txBody) {
                    cellTexts.push("");
                    continue;
                }
                const paragraphs = txBody["a:p"];
                const pList = Array.isArray(paragraphs) ? paragraphs : paragraphs ? [paragraphs] : [];
                const parts = [];
                for (const p of pList) {
                    const runs = p["a:r"];
                    const rList = Array.isArray(runs) ? runs : runs ? [runs] : [];
                    for (const r of rList) {
                        const t = r["a:t"];
                        if (t != null)
                            parts.push(typeof t === "object" ? t["#text"] || "" : String(t));
                    }
                }
                cellTexts.push(parts.join(" "));
            }
            mdRows.push(cellTexts);
        }
        if (mdRows.length === 0)
            return null;
        const [header, ...body] = mdRows;
        const lines = [];
        lines.push(`| ${header.join(" | ")} |`);
        lines.push(`| ${header.map(() => "---").join(" | ")} |`);
        for (const row of body) {
            while (row.length < header.length)
                row.push("");
            lines.push(`| ${row.join(" | ")} |`);
        }
        return lines.join("\n");
    }
}
@@ -0,0 +1,11 @@
1
import type { Converter, ConversionResult, StreamInfo } from "../types.js";
/**
 * Converter for RSS 2.0 and Atom feeds: renders the feed title plus one
 * section per item/entry, converting embedded HTML to Markdown.
 */
export declare class RssConverter implements Converter {
    /** Converter identifier ("rss"). */
    name: string;
    /** True for .rss/.atom/.xml extensions or feed/XML mimetypes. */
    accepts(streamInfo: StreamInfo): boolean;
    /** Sniff RSS vs Atom and dispatch; throws if the XML is neither. */
    convert(input: Buffer, _streamInfo: StreamInfo): Promise<ConversionResult>;
    /** Render an RSS <channel> and its <item>s as Markdown. */
    private parseRss;
    /** Render an Atom <feed> and its <entry>s as Markdown. */
    private parseAtom;
    /** Strip CDATA/entities and run HTML fragments through Turndown. */
    private htmlToMd;
    /** Regex-extract the first <tag>…</tag> body, CDATA unwrapped. */
    private extract;
    /** Regex-extract every complete <tag>…</tag> element. */
    private extractAll;
}
@@ -0,0 +1,134 @@
1
+ import TurndownService from "turndown";
2
const EXTENSIONS = [".rss", ".atom", ".xml"];
const MIMETYPES = [
    "application/rss+xml", "application/rss",
    "application/atom+xml", "application/atom",
    "text/xml", "application/xml",
];
/**
 * Converts RSS 2.0 and Atom feeds to Markdown: the feed title becomes a "#"
 * heading and each item/entry a "##" section with date, link, and body HTML
 * converted through Turndown (imported at the top of this file).
 */
export class RssConverter {
    name = "rss";
    accepts(streamInfo) {
        // Only accept known RSS/Atom extensions directly
        if (streamInfo.extension && [".rss", ".atom"].includes(streamInfo.extension))
            return true;
        // For .xml, we'll try and fail gracefully
        if (streamInfo.extension === ".xml")
            return true;
        if (streamInfo.mimetype && MIMETYPES.some((m) => streamInfo.mimetype.startsWith(m)))
            return true;
        return false;
    }
    /** Sniff the feed flavor and dispatch; throws for non-feed XML. */
    async convert(input, _streamInfo) {
        const text = new TextDecoder("utf-8").decode(input);
        // Detect feed type
        if (text.includes("<rss")) {
            return this.parseRss(text);
        }
        else if (text.includes("<feed")) {
            return this.parseAtom(text);
        }
        // Not a feed — fall through for XML as generic text
        throw new Error("Not an RSS or Atom feed");
    }
    /** Render an RSS <channel> header plus its <item> elements. */
    parseRss(xml) {
        const turndown = new TurndownService({ headingStyle: "atx" });
        const sections = [];
        // Extract from the <channel> block specifically so item titles don't
        // shadow the channel title.
        const channelMatch = xml.match(/<channel>([\s\S]*?)<\/channel>/i);
        const channelXml = channelMatch ? channelMatch[1] : xml;
        const channelTitle = this.extract(channelXml, "title");
        const channelDesc = this.extract(channelXml, "description");
        if (channelTitle)
            sections.push(`# ${channelTitle}`);
        if (channelDesc)
            sections.push(this.htmlToMd(channelDesc, turndown));
        // Extract items
        const items = this.extractAll(xml, "item");
        for (const item of items) {
            const title = this.extract(item, "title");
            const pubDate = this.extract(item, "pubDate");
            const description = this.extract(item, "description");
            const content = this.extract(item, "content:encoded");
            const link = this.extract(item, "link");
            const parts = [];
            if (title)
                parts.push(`## ${title}`);
            if (pubDate)
                parts.push(`Published: ${pubDate}`);
            if (link)
                parts.push(`[Link](${link})`);
            // Prefer full content:encoded over the often-truncated description.
            if (content) {
                parts.push(this.htmlToMd(content, turndown));
            }
            else if (description) {
                parts.push(this.htmlToMd(description, turndown));
            }
            if (parts.length > 0)
                sections.push(parts.join("\n"));
        }
        return { markdown: sections.join("\n\n").trim(), title: channelTitle };
    }
    /** Render an Atom <feed> header plus its <entry> elements. */
    parseAtom(xml) {
        const turndown = new TurndownService({ headingStyle: "atx" });
        const sections = [];
        const feedTitle = this.extract(xml, "title");
        const subtitle = this.extract(xml, "subtitle");
        if (feedTitle)
            sections.push(`# ${feedTitle}`);
        if (subtitle)
            sections.push(subtitle);
        const entries = this.extractAll(xml, "entry");
        for (const entry of entries) {
            const title = this.extract(entry, "title");
            const updated = this.extract(entry, "updated");
            const summary = this.extract(entry, "summary");
            const content = this.extract(entry, "content");
            const parts = [];
            if (title)
                parts.push(`## ${title}`);
            if (updated)
                parts.push(`Updated: ${updated}`);
            if (content) {
                parts.push(this.htmlToMd(content, turndown));
            }
            else if (summary) {
                parts.push(this.htmlToMd(summary, turndown));
            }
            if (parts.length > 0)
                sections.push(parts.join("\n"));
        }
        return { markdown: sections.join("\n\n").trim(), title: feedTitle };
    }
    htmlToMd(html, turndown) {
        // Unescape CDATA and HTML entities that might be in RSS
        // (&amp; is deliberately last so it can't create new entities).
        const unescaped = html
            .replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, "$1")
            .replace(/&lt;/g, "<")
            .replace(/&gt;/g, ">")
            .replace(/&amp;/g, "&");
        // If it looks like HTML, convert it
        if (unescaped.includes("<")) {
            return turndown.turndown(unescaped).trim();
        }
        return unescaped.trim();
    }
    /** First <tag>…</tag> body, CDATA unwrapped; undefined when absent/empty. */
    extract(xml, tag) {
        // Handle both <tag>content</tag> and <tag><![CDATA[content]]></tag>.
        // FIX: the previous pattern `<${tag}[^>]*>` also matched tags that
        // merely START with the requested name (e.g. <titleAlt> for "title");
        // require the name to be followed by whitespace/attributes or ">".
        const re = new RegExp(`<${tag}(?:\\s[^>]*)?>([\\s\\S]*?)</${tag}>`, "i");
        const match = xml.match(re);
        if (!match)
            return undefined;
        return match[1]
            .replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, "$1")
            .trim() || undefined;
    }
    /** Every complete <tag>…</tag> element (same tag-boundary fix as extract). */
    extractAll(xml, tag) {
        const results = [];
        const re = new RegExp(`<${tag}(?:\\s[^>]*)?>[\\s\\S]*?</${tag}>`, "gi");
        let match;
        while ((match = re.exec(xml)) !== null) {
            results.push(match[0]);
        }
        return results;
    }
}
@@ -0,0 +1,6 @@
1
import type { Converter, ConversionResult, StreamInfo } from "../types.js";
/**
 * Converter for Wikipedia content. The implementation lives in
 * dist/converters/wikipedia.js (not shown in this view) — presumably it
 * extracts article text; confirm details against that file.
 */
export declare class WikipediaConverter implements Converter {
    /** Converter identifier. */
    name: string;
    /** Whether this converter claims the given stream. */
    accepts(streamInfo: StreamInfo): boolean;
    /** Convert the input to Markdown. */
    convert(input: Buffer, streamInfo: StreamInfo): Promise<ConversionResult>;
}