markit-ai 0.1.3 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,64 @@
1
+ import TurndownService from "turndown";
2
+ import { gfm } from "turndown-plugin-gfm";
3
/**
 * Build a TurndownService configured for GitHub-flavoured Markdown output.
 *
 * The service uses ATX headings, fenced code blocks and "-" bullets, loads the
 * `gfm` plugin, and then registers three custom rules:
 *  - "strikethrough": emit `~~text~~` for <del>, <s> and <strike>.
 *  - "heading": emit ATX headings and drop the backslash turndown puts before
 *    periods (e.g. "1\." becomes "1.").
 *  - "listItem": emit a single space after the list marker and indent
 *    continuation lines by the width of that marker.
 *
 * @returns the configured TurndownService instance
 */
export function createTurndown() {
    const turndown = new TurndownService({
        headingStyle: "atx",
        codeBlockStyle: "fenced",
        bulletListMarker: "-",
    });
    turndown.use(gfm);

    // Fix strikethrough: GFM spec uses ~~ (double tilde), not ~ (single)
    turndown.addRule("strikethrough", {
        filter: ["del", "s", "strike"],
        replacement(content) {
            return `~~${content}~~`;
        },
    });

    // Fix heading escaping: turndown escapes "1." to "1\." to avoid ordered lists
    turndown.addRule("heading", {
        filter: ["h1", "h2", "h3", "h4", "h5", "h6"],
        replacement(content, node) {
            // nodeName is "H1".."H6"; the second character is the level.
            const level = Number(node.nodeName.charAt(1));
            const prefix = "#".repeat(level);
            // Unescape unnecessary backslash before periods in headings
            const cleaned = content.replace(/\\([.])/g, "$1").trim();
            return `\n\n${prefix} ${cleaned}\n\n`;
        },
    });

    // Override listItem rule to use single space after marker (turndown hardcodes 3)
    turndown.addRule("listItem", {
        filter: "li",
        replacement(content, node, options) {
            const parent = node.parentNode;
            let prefix = `${options.bulletListMarker} `;
            if (parent?.nodeName === "OL") {
                const startAttr = parent.getAttribute("start");
                const parsedStart = startAttr ? Number(startAttr) : NaN;
                // A missing or non-integer `start` falls back to 1 instead of
                // producing a "NaN. " marker.
                const first = Number.isInteger(parsedStart) ? parsedStart : 1;
                const index = Array.prototype.indexOf.call(parent.children, node);
                prefix = `${first + index}. `;
            }
            // Continuation lines must be indented by the full marker width
            // ("- " is 2, "1. " is 3, "10. " is 4) so nested blocks stay inside
            // this list item when the Markdown is parsed again.
            const indent = " ".repeat(prefix.length);
            const body = content
                .replace(/^\n+/, "") // drop leading blank lines
                .replace(/\n+$/, "\n") // collapse trailing newlines to one
                .replace(/\n/gm, `\n${indent}`);
            return prefix + body + (node.nextSibling ? "\n" : "");
        },
    });

    return turndown;
}
48
/**
 * Normalize HTML tables so turndown-plugin-gfm can handle them:
 * - Unwrap <p> tags inside <td>/<th> cells. A cell holding several paragraphs
 *   is flattened onto one line, with the paragraphs separated by a space.
 * - Wrap the first row in <thead> (converting its <td> cells to <th>) when the
 *   table starts directly with a row or with <tbody>.
 *
 * Tables that do not start with <tr> or <tbody> (for example ones that already
 * have a <thead>) are left untouched by the second step.
 *
 * @param html HTML markup to normalize
 * @returns the normalized HTML string
 */
export function normalizeTablesHtml(html) {
    // Match a cell whose content starts with <p ...> and ends with </p>.
    // The tempered group stops at any <td>/<th> tag, so one match can never
    // run across a cell boundary and swallow part of the next cell.
    const paragraphCell = /<(td|th)\b([^>]*)>\s*<p\b[^>]*>((?:(?!<\/?(?:td|th)\b)[\s\S])*?)<\/p>\s*<\/(td|th)>/gi;
    let result = html.replace(paragraphCell, (_match, open, attrs, inner, close) => {
        // Any paragraph tags still inside are boundaries between paragraphs;
        // replace each run of them with a single space.
        const flat = inner.replace(/\s*(?:<\/?p\b[^>]*>\s*)+/gi, " ");
        return `<${open}${attrs}>${flat}</${close}>`;
    });

    // Add thead to tables that lack it
    result = result.replace(/<table([^>]*)>\s*(?:<tbody>\s*)?(<tr\b[\s\S]*?<\/tr>)([\s\S]*?)<\/(?:tbody>\s*<\/)?table>/gi, (_match, attrs, firstRow, rest) => {
        // Promote the first row's data cells to header cells.
        const theadRow = firstRow
            .replace(/<td\b/gi, "<th")
            .replace(/<\/td>/gi, "</th>");
        return `<table${attrs}><thead>${theadRow}</thead><tbody>${rest}</tbody></table>`;
    });
    return result;
}
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "markit-ai",
3
- "version": "0.1.3",
4
- "description": "Convert anything to markdown. PDFs, DOCX, HTML, URLs everything gets milled.",
3
+ "version": "0.3.0",
4
+ "description": "Convert anything to markdown. PDF, DOCX, PPTX, XLSX, HTML, EPUB, Jupyter, RSS, images, audio, URLs, and more. Pluggable converters, built-in LLM providers for image description and audio transcription. Works as a CLI and as a library.",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
7
7
  "types": "dist/index.d.ts",
@@ -65,9 +65,10 @@
65
65
  "fast-xml-parser": "^5.5.9",
66
66
  "jszip": "^3.10.1",
67
67
  "mammoth": "^1.9.0",
68
+ "mupdf": "^1.27.0",
68
69
  "music-metadata": "^11.12.3",
69
70
  "rss-parser": "^3.13.0",
70
71
  "turndown": "^7.2.0",
71
- "unpdf": "^1.4.0"
72
+ "turndown-plugin-gfm": "^1.0.2"
72
73
  }
73
74
  }
@@ -1,6 +0,0 @@
1
- import type { ConversionResult, Converter, StreamInfo } from "../types.js";
2
/**
 * Type declaration for the PDF converter class, declared as implementing
 * the `Converter` interface imported from "../types.js".
 */
- export declare class PdfConverter implements Converter {
3
/** Converter name, a string. */
- name: string;
4
/** Synchronous check: takes a StreamInfo and returns a boolean. */
- accepts(streamInfo: StreamInfo): boolean;
5
/** Asynchronous conversion: takes a Buffer and a StreamInfo, resolves to a ConversionResult. */
- convert(input: Buffer, _streamInfo: StreamInfo): Promise<ConversionResult>;
6
- }
@@ -1,29 +0,0 @@
1
/** File extensions (lowercase, with leading dot) handled by PdfConverter. */
const EXTENSIONS = [".pdf"];
/** Media type prefixes (lowercase) handled by PdfConverter. */
const MIMETYPES = ["application/pdf", "application/x-pdf"];

/**
 * Converter that extracts the text of a PDF and returns it as markdown.
 * Text extraction is delegated to the `unpdf` package, loaded on demand.
 */
export class PdfConverter {
    name = "pdf";

    /**
     * Decide whether this converter handles the given stream.
     *
     * Matching ignores case, so ".PDF" and "Application/PDF" are accepted.
     * A media type matches when it starts with a known type, so trailing
     * parameters such as "; charset=binary" do not prevent a match.
     *
     * @param streamInfo object with optional `extension` and `mimetype` strings
     * @returns true when the extension or the media type identifies a PDF
     */
    accepts(streamInfo) {
        const extension = streamInfo.extension?.toLowerCase();
        if (extension && EXTENSIONS.includes(extension)) {
            return true;
        }
        const mimetype = streamInfo.mimetype?.toLowerCase();
        if (mimetype && MIMETYPES.some((m) => mimetype.startsWith(m))) {
            return true;
        }
        return false;
    }

    /**
     * Extract the text of a PDF.
     *
     * @param input PDF bytes
     * @param _streamInfo unused
     * @returns an object whose `markdown` property is the trimmed extracted text
     * @throws Error with an install hint when `unpdf` cannot be imported
     */
    async convert(input, _streamInfo) {
        let extractText;
        try {
            // Loaded lazily so the dependency is only needed for PDF input.
            ({ extractText } = await import("unpdf"));
        }
        catch {
            throw new Error("PDF support requires 'unpdf'. Install it: npm install unpdf");
        }
        const result = await extractText(new Uint8Array(input));
        // `text` is either an array of strings, joined here with blank lines,
        // or a single value that is stringified.
        const text = Array.isArray(result.text)
            ? result.text.join("\n\n")
            : String(result.text);
        return { markdown: text.trim() };
    }
}