markit-ai 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/dist/commands/config.js +12 -6
  2. package/dist/commands/convert.js +3 -4
  3. package/dist/commands/formats.js +21 -5
  4. package/dist/commands/init.js +1 -1
  5. package/dist/commands/plugin.js +2 -2
  6. package/dist/converters/audio.d.ts +1 -1
  7. package/dist/converters/audio.js +23 -6
  8. package/dist/converters/csv.d.ts +1 -1
  9. package/dist/converters/csv.js +1 -1
  10. package/dist/converters/docx.d.ts +1 -1
  11. package/dist/converters/docx.js +4 -7
  12. package/dist/converters/epub.d.ts +1 -1
  13. package/dist/converters/epub.js +27 -10
  14. package/dist/converters/html.d.ts +1 -1
  15. package/dist/converters/html.js +4 -7
  16. package/dist/converters/image.d.ts +1 -1
  17. package/dist/converters/image.js +40 -10
  18. package/dist/converters/ipynb.d.ts +1 -1
  19. package/dist/converters/ipynb.js +6 -3
  20. package/dist/converters/json.d.ts +1 -1
  21. package/dist/converters/json.js +1 -1
  22. package/dist/converters/pdf.d.ts +1 -1
  23. package/dist/converters/pdf.js +1 -1
  24. package/dist/converters/plain-text.d.ts +1 -1
  25. package/dist/converters/plain-text.js +56 -10
  26. package/dist/converters/pptx.d.ts +1 -1
  27. package/dist/converters/pptx.js +39 -12
  28. package/dist/converters/rss.d.ts +1 -1
  29. package/dist/converters/rss.js +18 -14
  30. package/dist/converters/wikipedia.d.ts +1 -1
  31. package/dist/converters/wikipedia.js +6 -8
  32. package/dist/converters/xlsx.d.ts +1 -1
  33. package/dist/converters/xlsx.js +12 -5
  34. package/dist/converters/xml.d.ts +1 -1
  35. package/dist/converters/xml.js +2 -1
  36. package/dist/converters/yaml.d.ts +1 -1
  37. package/dist/converters/yaml.js +2 -1
  38. package/dist/converters/zip.d.ts +1 -1
  39. package/dist/converters/zip.js +3 -2
  40. package/dist/index.d.ts +20 -20
  41. package/dist/index.js +17 -17
  42. package/dist/main.js +13 -7
  43. package/dist/markit.d.ts +1 -1
  44. package/dist/markit.js +13 -16
  45. package/dist/plugins/api.js +1 -3
  46. package/dist/plugins/index.d.ts +3 -3
  47. package/dist/plugins/index.js +2 -2
  48. package/dist/plugins/installer.js +2 -2
  49. package/dist/plugins/types.d.ts +1 -1
  50. package/dist/providers/index.d.ts +2 -2
  51. package/dist/providers/index.js +2 -4
  52. package/dist/providers/openai.js +15 -4
  53. package/dist/utils/turndown.d.ts +8 -0
  54. package/dist/utils/turndown.js +64 -0
  55. package/package.json +4 -2
  56. package/dist/llm.d.ts +0 -10
  57. package/dist/llm.js +0 -139
  58. package/dist/mill.d.ts +0 -18
  59. package/dist/mill.js +0 -123
@@ -20,7 +20,12 @@ export const openai = {
20
20
  role: "user",
21
21
  content: [
22
22
  { type: "text", text: prompt },
23
- { type: "image_url", image_url: { url: `data:${mimetype};base64,${image.toString("base64")}` } },
23
+ {
24
+ type: "image_url",
25
+ image_url: {
26
+ url: `data:${mimetype};base64,${image.toString("base64")}`,
27
+ },
28
+ },
24
29
  ],
25
30
  },
26
31
  ],
@@ -36,7 +41,9 @@ export const openai = {
36
41
  },
37
42
  transcribe: async (audio, mimetype) => {
38
43
  const ext = mimeToExt(mimetype);
39
- const file = new File([audio], `audio${ext}`, { type: mimetype });
44
+ const file = new File([new Uint8Array(audio)], `audio${ext}`, {
45
+ type: mimetype,
46
+ });
40
47
  const formData = new FormData();
41
48
  formData.append("model", config.transcriptionModel || "gpt-4o-mini-transcribe");
42
49
  formData.append("file", file);
@@ -57,8 +64,12 @@ export const openai = {
57
64
  };
58
65
  function mimeToExt(mime) {
59
66
  const map = {
60
- "audio/mpeg": ".mp3", "audio/wav": ".wav", "audio/mp4": ".m4a",
61
- "video/mp4": ".mp4", "audio/ogg": ".ogg", "audio/flac": ".flac",
67
+ "audio/mpeg": ".mp3",
68
+ "audio/wav": ".wav",
69
+ "audio/mp4": ".m4a",
70
+ "video/mp4": ".mp4",
71
+ "audio/ogg": ".ogg",
72
+ "audio/flac": ".flac",
62
73
  "audio/aac": ".aac",
63
74
  };
64
75
  return map[mime] || ".mp3";
@@ -0,0 +1,8 @@
1
+ import TurndownService from "turndown";
2
+ export declare function createTurndown(): TurndownService;
3
+ /**
4
+ * Normalize HTML tables so turndown-plugin-gfm can handle them:
5
+ * - Wrap first row in <thead> if missing
6
+ * - Strip <p> tags inside <td>/<th> cells
7
+ */
8
+ export declare function normalizeTablesHtml(html: string): string;
@@ -0,0 +1,64 @@
1
+ import TurndownService from "turndown";
2
+ import { gfm } from "turndown-plugin-gfm";
3
/**
 * Create a TurndownService configured for this package's Markdown output.
 *
 * Configuration applied here:
 * - ATX headings (`#`), fenced code blocks, `-` as the bullet marker.
 * - The GFM plugin (`turndown-plugin-gfm`).
 * - A `strikethrough` rule that emits `~~text~~` for <del>, <s> and <strike>.
 * - A `heading` rule that removes the backslash turndown puts before periods
 *   (e.g. "1\." becomes "1.") inside heading text.
 * - A `listItem` rule that puts a single space after the list marker and
 *   indents continuation lines to the item's content column.
 *
 * @returns {TurndownService} a new, independently configured instance.
 */
export function createTurndown() {
    const turndown = new TurndownService({
        headingStyle: "atx",
        codeBlockStyle: "fenced",
        bulletListMarker: "-",
    });
    turndown.use(gfm);
    // Fix strikethrough: GFM spec uses ~~ (double tilde), not ~ (single)
    turndown.addRule("strikethrough", {
        filter: ["del", "s", "strike"],
        replacement(content) {
            return `~~${content}~~`;
        },
    });
    // Fix heading escaping: turndown escapes "1." to "1\." to avoid ordered lists
    turndown.addRule("heading", {
        filter: ["h1", "h2", "h3", "h4", "h5", "h6"],
        replacement(content, node) {
            // nodeName is "H1".."H6"; the second character is the level.
            const level = Number(node.nodeName.charAt(1));
            const prefix = "#".repeat(level);
            // Unescape unnecessary backslash before periods in headings
            const cleaned = content.replace(/\\([.])/g, "$1").trim();
            return `\n\n${prefix} ${cleaned}\n\n`;
        },
    });
    // Override listItem rule to use a single space after the marker
    turndown.addRule("listItem", {
        filter: "li",
        replacement(content, node, options) {
            const parent = node.parentNode;
            let prefix = `${options.bulletListMarker} `;
            if (parent?.nodeName === "OL") {
                // A missing or non-numeric `start` falls back to 1 instead of
                // producing a "NaN. " marker.
                const parsedStart = Number.parseInt(parent.getAttribute("start") ?? "", 10);
                const first = Number.isNaN(parsedStart) ? 1 : parsedStart;
                const index = Array.prototype.indexOf.call(parent.children, node);
                prefix = `${first + index}. `;
            }
            // Continuation lines must start at the item's content column
            // (marker width plus its trailing space); otherwise nested lists
            // and follow-up paragraphs are not parsed as part of this item.
            const indent = " ".repeat(prefix.length);
            const body = content
                .replace(/^\n+/, "")
                .replace(/\n+$/, "\n")
                .replace(/\n/g, `\n${indent}`);
            return prefix + body + (node.nextSibling ? "\n" : "");
        },
    });
    return turndown;
}
48
/**
 * Remove paragraph wrappers from the inner HTML of a single table cell.
 * A lone wrapping <p> is removed without touching its content; boundaries
 * between several paragraphs collapse to one space so the cell stays on a
 * single line.
 */
function unwrapCellParagraphs(inner) {
    return inner
        .replace(/^\s*<p(?:\s[^>]*)?>/i, "")
        .replace(/<\/p\s*>\s*$/i, "")
        .replace(/\s*<\/p\s*>\s*<p(?:\s[^>]*)?>\s*/gi, " ")
        .replace(/\s*<\/?p(?:\s[^>]*)?>\s*/gi, " ");
}
/**
 * Normalize HTML tables so turndown-plugin-gfm can handle them:
 * - Strip <p> tags inside <td>/<th> cells
 * - Wrap first row in <thead> if missing
 *
 * Cells are processed one at a time, so a paragraph in one cell can never be
 * paired with a closing tag from a later cell. Tables that already start with
 * <thead> (or anything other than an optional <tbody> followed by <tr>) are
 * not restructured.
 *
 * This is regex based: tables nested inside cells are not supported.
 *
 * @param html HTML markup that may contain tables.
 * @returns the markup with table cells and header rows normalized.
 */
export function normalizeTablesHtml(html) {
    // Strip <p> tags inside table cells, one cell per match.
    let result = html.replace(/<(td|th)\b([^>]*)>([\s\S]*?)<\/(td|th)\s*>/gi, (cell, open, attrs, inner, close) => {
        // Leave cells without paragraphs byte-for-byte untouched.
        if (!/<p[\s>]/i.test(inner)) {
            return cell;
        }
        return `<${open}${attrs}>${unwrapCellParagraphs(inner)}</${close}>`;
    });
    // Add thead to tables that lack it: promote the first row and turn its
    // <td> cells into <th>.
    result = result.replace(/<table([^>]*)>\s*(?:<tbody>\s*)?(<tr\b[\s\S]*?<\/tr>)([\s\S]*?)<\/(?:tbody>\s*<\/)?table>/gi, (_match, attrs, firstRow, rest) => {
        const theadRow = firstRow
            .replace(/<td\b/gi, "<th")
            .replace(/<\/td>/gi, "</th>");
        return `<table${attrs}><thead>${theadRow}</thead><tbody>${rest}</tbody></table>`;
    });
    return result;
}
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "markit-ai",
3
- "version": "0.1.1",
4
- "description": "Convert anything to markdown. PDFs, DOCX, HTML, URLs everything gets milled.",
3
+ "version": "0.2.0",
4
+ "description": "Convert anything to markdown. PDF, DOCX, PPTX, XLSX, HTML, EPUB, Jupyter, RSS, images, audio, URLs, and more. Pluggable converters, built-in LLM providers for image description and audio transcription. Works as a CLI and as a library.",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
7
7
  "types": "dist/index.d.ts",
@@ -54,6 +54,7 @@
54
54
  "devDependencies": {
55
55
  "@biomejs/biome": "^2.3.14",
56
56
  "@types/jszip": "^3.4.1",
57
+ "@types/node": "^25.5.0",
57
58
  "@types/turndown": "^5.0.5",
58
59
  "typescript": "^5.8.0"
59
60
  },
@@ -67,6 +68,7 @@
67
68
  "music-metadata": "^11.12.3",
68
69
  "rss-parser": "^3.13.0",
69
70
  "turndown": "^7.2.0",
71
+ "turndown-plugin-gfm": "^1.0.2",
70
72
  "unpdf": "^1.4.0"
71
73
  }
72
74
  }
package/dist/llm.d.ts DELETED
@@ -1,10 +0,0 @@
1
- import type { MarkitOptions } from "./types.js";
2
- import type { MarkitConfig } from "./config.js";
3
- /**
4
- * Build describe/transcribe functions from .markit/config.json + env vars.
5
- * Uses the OpenAI-compatible provider by default.
6
- */
7
- export declare function createLlmFunctions(config: MarkitConfig): MarkitOptions;
8
- export declare function createOpenAIDescribe(config: MarkitConfig): (image: Buffer, mimetype: string) => Promise<string>;
9
- export declare function createOpenAITranscribe(config: MarkitConfig): (audio: Buffer, mimetype: string) => Promise<string>;
10
- export declare function createAnthropicDescribe(config: MarkitConfig): (image: Buffer, mimetype: string) => Promise<string>;
package/dist/llm.js DELETED
@@ -1,139 +0,0 @@
1
- import { resolveApiKey, resolveApiBase, resolveModel, resolveTranscriptionModel, } from "./config.js";
2
- /**
3
- * Build describe/transcribe functions from .markit/config.json + env vars.
4
- * Uses the OpenAI-compatible provider by default.
5
- */
6
- export function createLlmFunctions(config) {
7
- const apiKey = resolveApiKey(config);
8
- if (!apiKey)
9
- return {};
10
- const provider = config.llm?.provider || "openai";
11
- if (provider === "anthropic") {
12
- return {
13
- describe: createAnthropicDescribe(config),
14
- // Anthropic doesn't have a transcription API — leave undefined
15
- };
16
- }
17
- return {
18
- describe: createOpenAIDescribe(config),
19
- transcribe: createOpenAITranscribe(config),
20
- };
21
- }
22
- // ── OpenAI-compatible (also works with Groq, Together, Fireworks, Ollama) ──
23
- export function createOpenAIDescribe(config) {
24
- const apiKey = resolveApiKey(config);
25
- const baseUrl = resolveApiBase(config).replace(/\/+$/, "");
26
- const model = resolveModel(config);
27
- return async (image, mimetype) => {
28
- const base64 = image.toString("base64");
29
- const res = await fetch(`${baseUrl}/chat/completions`, {
30
- method: "POST",
31
- headers: {
32
- "Content-Type": "application/json",
33
- Authorization: `Bearer ${apiKey}`,
34
- },
35
- body: JSON.stringify({
36
- model,
37
- messages: [
38
- {
39
- role: "user",
40
- content: [
41
- { type: "text", text: "Write a detailed description of this image." },
42
- { type: "image_url", image_url: { url: `data:${mimetype};base64,${base64}` } },
43
- ],
44
- },
45
- ],
46
- max_tokens: 1024,
47
- }),
48
- });
49
- if (!res.ok) {
50
- const body = await res.text();
51
- throw new Error(`OpenAI API error ${res.status}: ${body}`);
52
- }
53
- const data = await res.json();
54
- return data.choices?.[0]?.message?.content ?? "";
55
- };
56
- }
57
- export function createOpenAITranscribe(config) {
58
- const apiKey = resolveApiKey(config);
59
- const baseUrl = resolveApiBase(config).replace(/\/+$/, "");
60
- const transcriptionModel = resolveTranscriptionModel(config);
61
- return async (audio, mimetype) => {
62
- const ext = mimeToExt(mimetype);
63
- const file = new File([audio], `audio${ext}`, { type: mimetype });
64
- const formData = new FormData();
65
- formData.append("model", transcriptionModel);
66
- formData.append("file", file);
67
- const res = await fetch(`${baseUrl}/audio/transcriptions`, {
68
- method: "POST",
69
- headers: { Authorization: `Bearer ${apiKey}` },
70
- body: formData,
71
- });
72
- if (!res.ok) {
73
- const body = await res.text();
74
- throw new Error(`Transcription API error ${res.status}: ${body}`);
75
- }
76
- const data = await res.json();
77
- return data.text ?? "";
78
- };
79
- }
80
- // ── Anthropic ───────────────────────────────────────────────────────────────
81
- export function createAnthropicDescribe(config) {
82
- const apiKey = process.env.ANTHROPIC_API_KEY ||
83
- config.llm?.apiKey ||
84
- resolveApiKey(config);
85
- const baseUrl = (process.env.ANTHROPIC_BASE_URL ||
86
- config.llm?.apiBase ||
87
- "https://api.anthropic.com").replace(/\/+$/, "");
88
- const model = resolveModel(config, undefined) || "claude-sonnet-4-20250514";
89
- return async (image, mimetype) => {
90
- const base64 = image.toString("base64");
91
- const res = await fetch(`${baseUrl}/v1/messages`, {
92
- method: "POST",
93
- headers: {
94
- "Content-Type": "application/json",
95
- "x-api-key": apiKey,
96
- "anthropic-version": "2023-06-01",
97
- },
98
- body: JSON.stringify({
99
- model,
100
- max_tokens: 1024,
101
- messages: [
102
- {
103
- role: "user",
104
- content: [
105
- {
106
- type: "image",
107
- source: {
108
- type: "base64",
109
- media_type: mimetype,
110
- data: base64,
111
- },
112
- },
113
- { type: "text", text: "Write a detailed description of this image." },
114
- ],
115
- },
116
- ],
117
- }),
118
- });
119
- if (!res.ok) {
120
- const body = await res.text();
121
- throw new Error(`Anthropic API error ${res.status}: ${body}`);
122
- }
123
- const data = await res.json();
124
- return data.content?.[0]?.text ?? "";
125
- };
126
- }
127
- // ── Helpers ─────────────────────────────────────────────────────────────────
128
- function mimeToExt(mime) {
129
- const map = {
130
- "audio/mpeg": ".mp3",
131
- "audio/wav": ".wav",
132
- "audio/mp4": ".m4a",
133
- "video/mp4": ".mp4",
134
- "audio/ogg": ".ogg",
135
- "audio/flac": ".flac",
136
- "audio/aac": ".aac",
137
- };
138
- return map[mime] || ".mp3";
139
- }
package/dist/mill.d.ts DELETED
@@ -1,18 +0,0 @@
1
- import type { ConversionResult, StreamInfo, ConvertOptions } from "./types.js";
2
- export declare class Mill {
3
- private converters;
4
- private options;
5
- constructor(options?: ConvertOptions);
6
- /**
7
- * Convert a local file to markdown.
8
- */
9
- convertFile(path: string): Promise<ConversionResult>;
10
- /**
11
- * Convert a URL to markdown.
12
- */
13
- convertUrl(url: string): Promise<ConversionResult>;
14
- /**
15
- * Convert a buffer with stream info to markdown.
16
- */
17
- convert(input: Buffer, streamInfo: StreamInfo): Promise<ConversionResult>;
18
- }
package/dist/mill.js DELETED
@@ -1,123 +0,0 @@
1
- import { readFileSync } from "node:fs";
2
- import { extname, basename } from "node:path";
3
- import { PdfConverter } from "./converters/pdf.js";
4
- import { DocxConverter } from "./converters/docx.js";
5
- import { PptxConverter } from "./converters/pptx.js";
6
- import { XlsxConverter } from "./converters/xlsx.js";
7
- import { EpubConverter } from "./converters/epub.js";
8
- import { IpynbConverter } from "./converters/ipynb.js";
9
- import { HtmlConverter } from "./converters/html.js";
10
- import { WikipediaConverter } from "./converters/wikipedia.js";
11
- import { RssConverter } from "./converters/rss.js";
12
- import { CsvConverter } from "./converters/csv.js";
13
- import { JsonConverter } from "./converters/json.js";
14
- import { YamlConverter } from "./converters/yaml.js";
15
- import { XmlConverter } from "./converters/xml.js";
16
- import { ZipConverter } from "./converters/zip.js";
17
- import { ImageConverter } from "./converters/image.js";
18
- import { AudioConverter } from "./converters/audio.js";
19
- import { PlainTextConverter } from "./converters/plain-text.js";
20
- export class Mill {
21
- converters = [];
22
- options;
23
- constructor(options = {}) {
24
- this.options = options;
25
- // Order matters: specific formats first, generic last.
26
- // URL-specific converters (Wikipedia) before generic HTML.
27
- // ZIP converter gets a reference to other converters for recursive conversion.
28
- const specific = [
29
- new PdfConverter(),
30
- new DocxConverter(),
31
- new PptxConverter(),
32
- new XlsxConverter(),
33
- new EpubConverter(),
34
- new IpynbConverter(),
35
- new WikipediaConverter(),
36
- new RssConverter(),
37
- new CsvConverter(),
38
- new JsonConverter(),
39
- new YamlConverter(),
40
- new ImageConverter(),
41
- new AudioConverter(),
42
- ];
43
- const generic = [
44
- new XmlConverter(),
45
- new HtmlConverter(),
46
- ];
47
- // ZIP gets all other converters for recursive extraction
48
- const allNonZip = [...specific, ...generic];
49
- const zipConverter = new ZipConverter(allNonZip);
50
- // Plain text is the ultimate catch-all
51
- this.converters = [
52
- ...specific,
53
- zipConverter,
54
- ...generic,
55
- new PlainTextConverter(),
56
- ];
57
- }
58
- /**
59
- * Convert a local file to markdown.
60
- */
61
- async convertFile(path) {
62
- const buffer = readFileSync(path);
63
- const streamInfo = {
64
- localPath: path,
65
- extension: extname(path).toLowerCase(),
66
- filename: basename(path),
67
- };
68
- return this.convert(buffer, streamInfo);
69
- }
70
- /**
71
- * Convert a URL to markdown.
72
- */
73
- async convertUrl(url) {
74
- const response = await fetch(url, {
75
- headers: {
76
- Accept: "text/markdown, text/html;q=0.9, text/plain;q=0.8, */*;q=0.1",
77
- "User-Agent": "mill/0.1.0",
78
- },
79
- });
80
- if (!response.ok) {
81
- throw new Error(`Failed to fetch ${url}: ${response.status} ${response.statusText}`);
82
- }
83
- const contentType = response.headers.get("content-type") || "";
84
- const [mimetype] = contentType.split(";");
85
- // Derive extension from URL path
86
- const urlPath = new URL(url).pathname;
87
- const ext = extname(urlPath).toLowerCase();
88
- const buffer = Buffer.from(await response.arrayBuffer());
89
- const streamInfo = {
90
- url,
91
- mimetype: mimetype.trim(),
92
- extension: ext || undefined,
93
- filename: basename(urlPath) || undefined,
94
- };
95
- return this.convert(buffer, streamInfo);
96
- }
97
- /**
98
- * Convert a buffer with stream info to markdown.
99
- */
100
- async convert(input, streamInfo) {
101
- const errors = [];
102
- for (const converter of this.converters) {
103
- if (!converter.accepts(streamInfo))
104
- continue;
105
- try {
106
- return await converter.convert(input, streamInfo, this.options);
107
- }
108
- catch (err) {
109
- errors.push({
110
- converter: converter.name,
111
- error: err instanceof Error ? err : new Error(String(err)),
112
- });
113
- }
114
- }
115
- if (errors.length > 0) {
116
- const details = errors
117
- .map((e) => ` ${e.converter}: ${e.error.message}`)
118
- .join("\n");
119
- throw new Error(`Conversion failed:\n${details}`);
120
- }
121
- throw new Error(`Unsupported format: ${streamInfo.extension || streamInfo.mimetype || "unknown"}`);
122
- }
123
- }