markit-ai 0.1.3 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/converters/docx.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import mammoth from "mammoth";
|
|
2
|
-
import TurndownService from "turndown";
|
|
2
|
+
import { createTurndown, normalizeTablesHtml } from "../utils/turndown.js";
|
|
3
3
|
const EXTENSIONS = [".docx"];
|
|
4
4
|
const MIMETYPES = [
|
|
5
5
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
@@ -18,11 +18,8 @@ export class DocxConverter {
|
|
|
18
18
|
}
|
|
19
19
|
async convert(input, _streamInfo) {
|
|
20
20
|
const { value: html } = await mammoth.convertToHtml({ buffer: input });
|
|
21
|
-
const turndown = new TurndownService({
|
|
22
|
-
|
|
23
|
-
codeBlockStyle: "fenced",
|
|
24
|
-
});
|
|
25
|
-
const markdown = turndown.turndown(html);
|
|
21
|
+
const turndown = createTurndown();
|
|
22
|
+
const markdown = turndown.turndown(normalizeTablesHtml(html));
|
|
26
23
|
return { markdown: markdown.trim() };
|
|
27
24
|
}
|
|
28
25
|
}
|
package/dist/converters/epub.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { XMLParser } from "fast-xml-parser";
|
|
2
2
|
import JSZip from "jszip";
|
|
3
|
-
import TurndownService from "turndown";
|
|
3
|
+
import { createTurndown, normalizeTablesHtml } from "../utils/turndown.js";
|
|
4
4
|
const EXTENSIONS = [".epub"];
|
|
5
5
|
const MIMETYPES = [
|
|
6
6
|
"application/epub",
|
|
@@ -75,10 +75,7 @@ export class EpubConverter {
|
|
|
75
75
|
const basePath = opfPath.includes("/")
|
|
76
76
|
? opfPath.substring(0, opfPath.lastIndexOf("/"))
|
|
77
77
|
: "";
|
|
78
|
-
const turndown = new TurndownService({
|
|
79
|
-
headingStyle: "atx",
|
|
80
|
-
codeBlockStyle: "fenced",
|
|
81
|
-
});
|
|
78
|
+
const turndown = createTurndown();
|
|
82
79
|
const sections = [];
|
|
83
80
|
// Add metadata header
|
|
84
81
|
const metaLines = [];
|
|
@@ -101,7 +98,7 @@ export class EpubConverter {
|
|
|
101
98
|
const cleaned = html
|
|
102
99
|
.replace(/<script[\s\S]*?<\/script>/gi, "")
|
|
103
100
|
.replace(/<style[\s\S]*?<\/style>/gi, "");
|
|
104
|
-
const md = turndown.turndown(cleaned).trim();
|
|
101
|
+
const md = turndown.turndown(normalizeTablesHtml(cleaned)).trim();
|
|
105
102
|
if (md)
|
|
106
103
|
sections.push(md);
|
|
107
104
|
}
|
package/dist/converters/html.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import TurndownService from "turndown";
|
|
1
|
+
import { createTurndown, normalizeTablesHtml } from "../utils/turndown.js";
|
|
2
2
|
const EXTENSIONS = [".html", ".htm"];
|
|
3
3
|
const MIMETYPES = ["text/html", "application/xhtml"];
|
|
4
4
|
export class HtmlConverter {
|
|
@@ -16,15 +16,12 @@ export class HtmlConverter {
|
|
|
16
16
|
async convert(input, streamInfo) {
|
|
17
17
|
const charset = streamInfo.charset || "utf-8";
|
|
18
18
|
const html = new TextDecoder(charset).decode(input);
|
|
19
|
-
const turndown = new TurndownService({
|
|
20
|
-
headingStyle: "atx",
|
|
21
|
-
codeBlockStyle: "fenced",
|
|
22
|
-
});
|
|
19
|
+
const turndown = createTurndown();
|
|
23
20
|
// Remove script and style tags before converting
|
|
24
21
|
const cleaned = html
|
|
25
22
|
.replace(/<script[\s\S]*?<\/script>/gi, "")
|
|
26
23
|
.replace(/<style[\s\S]*?<\/style>/gi, "");
|
|
27
|
-
const markdown = turndown.turndown(cleaned);
|
|
24
|
+
const markdown = turndown.turndown(normalizeTablesHtml(cleaned));
|
|
28
25
|
// Try to extract title
|
|
29
26
|
const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
|
|
30
27
|
const title = titleMatch ? titleMatch[1].trim() : undefined;
|
package/dist/converters/rss.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import TurndownService from "turndown";
|
|
1
|
+
import { createTurndown } from "../utils/turndown.js";
|
|
2
2
|
const _EXTENSIONS = [".rss", ".atom", ".xml"];
|
|
3
3
|
const MIMETYPES = [
|
|
4
4
|
"application/rss+xml",
|
|
@@ -36,7 +36,7 @@ export class RssConverter {
|
|
|
36
36
|
throw new Error("Not an RSS or Atom feed");
|
|
37
37
|
}
|
|
38
38
|
parseRss(xml) {
|
|
39
|
-
const turndown =
|
|
39
|
+
const turndown = createTurndown();
|
|
40
40
|
const sections = [];
|
|
41
41
|
// Extract from the <channel> block specifically
|
|
42
42
|
const channelMatch = xml.match(/<channel>([\s\S]*?)<\/channel>/i);
|
|
@@ -74,7 +74,7 @@ export class RssConverter {
|
|
|
74
74
|
return { markdown: sections.join("\n\n").trim(), title: channelTitle };
|
|
75
75
|
}
|
|
76
76
|
parseAtom(xml) {
|
|
77
|
-
const turndown =
|
|
77
|
+
const turndown = createTurndown();
|
|
78
78
|
const sections = [];
|
|
79
79
|
const feedTitle = this.extract(xml, "title");
|
|
80
80
|
const subtitle = this.extract(xml, "subtitle");
|
package/dist/converters/wikipedia.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import TurndownService from "turndown";
|
|
1
|
+
import { createTurndown } from "../utils/turndown.js";
|
|
2
2
|
const WIKIPEDIA_RE = /^https?:\/\/[a-zA-Z]{2,3}\.wikipedia\.org\//;
|
|
3
3
|
export class WikipediaConverter {
|
|
4
4
|
name = "wikipedia";
|
|
@@ -16,10 +16,7 @@ export class WikipediaConverter {
|
|
|
16
16
|
const title = titleMatch
|
|
17
17
|
? titleMatch[1].replace(/ - Wikipedia$/, "").trim()
|
|
18
18
|
: undefined;
|
|
19
|
-
const turndown = new TurndownService({
|
|
20
|
-
headingStyle: "atx",
|
|
21
|
-
codeBlockStyle: "fenced",
|
|
22
|
-
});
|
|
19
|
+
const turndown = createTurndown();
|
|
23
20
|
// Clean up Wikipedia-specific elements
|
|
24
21
|
let content = contentMatch ? contentMatch[1] : html;
|
|
25
22
|
content = content
|
package/dist/utils/turndown.d.ts
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import TurndownService from "turndown";
|
|
2
|
+
export declare function createTurndown(): TurndownService;
|
|
3
|
+
/**
|
|
4
|
+
* Normalize HTML tables so turndown-plugin-gfm can handle them:
|
|
5
|
+
* - Wrap first row in <thead> if missing
|
|
6
|
+
* - Strip <p> tags inside <td>/<th> cells
|
|
7
|
+
*/
|
|
8
|
+
export declare function normalizeTablesHtml(html: string): string;
|
package/dist/utils/turndown.js
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import TurndownService from "turndown";
|
|
2
|
+
import { gfm } from "turndown-plugin-gfm";
|
|
3
|
+
export function createTurndown() {
|
|
4
|
+
const turndown = new TurndownService({
|
|
5
|
+
headingStyle: "atx",
|
|
6
|
+
codeBlockStyle: "fenced",
|
|
7
|
+
bulletListMarker: "-",
|
|
8
|
+
});
|
|
9
|
+
turndown.use(gfm);
|
|
10
|
+
// Fix strikethrough: GFM spec uses ~~ (double tilde), not ~ (single)
|
|
11
|
+
turndown.addRule("strikethrough", {
|
|
12
|
+
filter: ["del", "s", "strike"],
|
|
13
|
+
replacement(content) {
|
|
14
|
+
return `~~${content}~~`;
|
|
15
|
+
},
|
|
16
|
+
});
|
|
17
|
+
// Fix heading escaping: turndown escapes "1." to "1\." to avoid ordered lists
|
|
18
|
+
turndown.addRule("heading", {
|
|
19
|
+
filter: ["h1", "h2", "h3", "h4", "h5", "h6"],
|
|
20
|
+
replacement(content, node) {
|
|
21
|
+
const level = Number(node.nodeName.charAt(1));
|
|
22
|
+
const prefix = "#".repeat(level);
|
|
23
|
+
// Unescape unnecessary backslash before periods in headings
|
|
24
|
+
const cleaned = content.replace(/\\([.])/g, "$1").trim();
|
|
25
|
+
return `\n\n${prefix} ${cleaned}\n\n`;
|
|
26
|
+
},
|
|
27
|
+
});
|
|
28
|
+
// Override listItem rule to use single space after marker (turndown hardcodes 3)
|
|
29
|
+
turndown.addRule("listItem", {
|
|
30
|
+
filter: "li",
|
|
31
|
+
replacement(content, node, options) {
|
|
32
|
+
content = content
|
|
33
|
+
.replace(/^\n+/, "")
|
|
34
|
+
.replace(/\n+$/, "\n")
|
|
35
|
+
.replace(/\n/gm, "\n ");
|
|
36
|
+
const parent = node.parentNode;
|
|
37
|
+
let prefix = `${options.bulletListMarker} `;
|
|
38
|
+
if (parent?.nodeName === "OL") {
|
|
39
|
+
const start = parent.getAttribute("start");
|
|
40
|
+
const index = Array.prototype.indexOf.call(parent.children, node);
|
|
41
|
+
prefix = `${(start ? Number(start) : 1) + index}. `;
|
|
42
|
+
}
|
|
43
|
+
return prefix + content + (node.nextSibling ? "\n" : "");
|
|
44
|
+
},
|
|
45
|
+
});
|
|
46
|
+
return turndown;
|
|
47
|
+
}
|
|
48
|
+
/**
|
|
49
|
+
* Normalize HTML tables so turndown-plugin-gfm can handle them:
|
|
50
|
+
* - Wrap first row in <thead> if missing
|
|
51
|
+
* - Strip <p> tags inside <td>/<th> cells
|
|
52
|
+
*/
|
|
53
|
+
export function normalizeTablesHtml(html) {
|
|
54
|
+
// Strip <p> tags inside table cells
|
|
55
|
+
let result = html.replace(/<(td|th)([^>]*)>\s*<p>([\s\S]*?)<\/p>\s*<\/(td|th)>/gi, "<$1$2>$3</$4>");
|
|
56
|
+
// Add thead to tables that lack it
|
|
57
|
+
result = result.replace(/<table([^>]*)>\s*(?:<tbody>\s*)?(<tr[\s\S]*?<\/tr>)([\s\S]*?)<\/(?:tbody>\s*<\/)?table>/gi, (_match, attrs, firstRow, rest) => {
|
|
58
|
+
const theadRow = firstRow
|
|
59
|
+
.replace(/<td/gi, "<th")
|
|
60
|
+
.replace(/<\/td>/gi, "</th>");
|
|
61
|
+
return `<table${attrs}><thead>${theadRow}</thead><tbody>${rest}</tbody></table>`;
|
|
62
|
+
});
|
|
63
|
+
return result;
|
|
64
|
+
}
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "markit-ai",
|
|
3
|
-
"version": "0.1.3",
|
|
4
|
-
"description": "Convert anything to markdown.
|
|
3
|
+
"version": "0.2.0",
|
|
4
|
+
"description": "Convert anything to markdown. PDF, DOCX, PPTX, XLSX, HTML, EPUB, Jupyter, RSS, images, audio, URLs, and more. Pluggable converters, built-in LLM providers for image description and audio transcription. Works as a CLI and as a library.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|
|
7
7
|
"types": "dist/index.d.ts",
|
|
@@ -68,6 +68,7 @@
|
|
|
68
68
|
"music-metadata": "^11.12.3",
|
|
69
69
|
"rss-parser": "^3.13.0",
|
|
70
70
|
"turndown": "^7.2.0",
|
|
71
|
+
"turndown-plugin-gfm": "^1.0.2",
|
|
71
72
|
"unpdf": "^1.4.0"
|
|
72
73
|
}
|
|
73
74
|
}
|