markit-ai 0.1.3 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/converters/docx.js +3 -6
- package/dist/converters/epub.js +3 -6
- package/dist/converters/html.js +3 -6
- package/dist/converters/pdf/columns.d.ts +35 -0
- package/dist/converters/pdf/columns.js +93 -0
- package/dist/converters/pdf/extract.d.ts +19 -0
- package/dist/converters/pdf/extract.js +513 -0
- package/dist/converters/pdf/grid.d.ts +25 -0
- package/dist/converters/pdf/grid.js +654 -0
- package/dist/converters/pdf/headers.d.ts +24 -0
- package/dist/converters/pdf/headers.js +108 -0
- package/dist/converters/pdf/index.d.ts +19 -0
- package/dist/converters/pdf/index.js +116 -0
- package/dist/converters/pdf/render.d.ts +24 -0
- package/dist/converters/pdf/render.js +513 -0
- package/dist/converters/pdf/types.d.ts +75 -0
- package/dist/converters/pdf/types.js +1 -0
- package/dist/converters/rss.js +3 -3
- package/dist/converters/wikipedia.js +2 -5
- package/dist/index.d.ts +1 -1
- package/dist/index.js +1 -1
- package/dist/markit.js +1 -1
- package/dist/types.d.ts +2 -0
- package/dist/utils/turndown.d.ts +8 -0
- package/dist/utils/turndown.js +64 -0
- package/package.json +4 -3
- package/dist/converters/pdf.d.ts +0 -6
- package/dist/converters/pdf.js +0 -29
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import TurndownService from "turndown";
|
|
2
|
+
import { gfm } from "turndown-plugin-gfm";
|
|
3
|
+
/**
 * Build a TurndownService configured for this package's Markdown output.
 *
 * The service uses ATX headings, fenced code blocks and "-" bullets, loads the
 * GFM plugin, and then overrides three rules:
 * - "strikethrough": wraps <del>/<s>/<strike> content in double tildes.
 * - "heading": emits "#"-prefixed headings and removes the backslash that
 *   turndown places before periods (e.g. "1\." becomes "1.").
 * - "listItem": puts a single space after the list marker and indents
 *   continuation lines to the column where the item's content starts.
 *
 * @returns the configured TurndownService instance.
 */
export function createTurndown() {
    const turndown = new TurndownService({
        headingStyle: "atx",
        codeBlockStyle: "fenced",
        bulletListMarker: "-",
    });
    turndown.use(gfm);
    // Fix strikethrough: GFM spec uses ~~ (double tilde), not ~ (single)
    turndown.addRule("strikethrough", {
        filter: ["del", "s", "strike"],
        replacement(content) {
            return `~~${content}~~`;
        },
    });
    // Fix heading escaping: turndown escapes "1." to "1\." to avoid ordered lists
    turndown.addRule("heading", {
        filter: ["h1", "h2", "h3", "h4", "h5", "h6"],
        replacement(content, node) {
            // nodeName is "H1".."H6"; its second character is the heading level.
            const level = Number(node.nodeName.charAt(1));
            const prefix = "#".repeat(level);
            // Unescape unnecessary backslash before periods in headings
            const cleaned = content.replace(/\\([.])/g, "$1").trim();
            return `\n\n${prefix} ${cleaned}\n\n`;
        },
    });
    // Override listItem rule to use single space after marker (turndown hardcodes 3)
    turndown.addRule("listItem", {
        filter: "li",
        replacement(content, node, options) {
            const parent = node.parentNode;
            let prefix = `${options.bulletListMarker} `;
            if (parent?.nodeName === "OL") {
                // A missing or non-integer "start" attribute falls back to 1
                // instead of producing markers such as "NaN. ".
                const parsedStart = Number.parseInt(parent.getAttribute("start") ?? "", 10);
                const first = Number.isNaN(parsedStart) ? 1 : parsedStart;
                const index = Array.prototype.indexOf.call(parent.children, node);
                prefix = `${first + index}. `;
            }
            // Continuation lines must be indented by the marker width; with less
            // indentation nested lists and extra paragraphs are parsed as
            // siblings of the item rather than as part of it.
            const indent = " ".repeat(prefix.length);
            const body = content
                .replace(/^\n+/, "") // drop leading blank lines
                .replace(/\n+$/, "\n") // collapse trailing newlines to one
                .replace(/\n/gm, `\n${indent}`);
            return prefix + body + (node.nextSibling ? "\n" : "");
        },
    });
    return turndown;
}
|
|
48
|
+
/**
 * Normalize HTML tables so turndown-plugin-gfm can handle them:
 * - Strip <p> wrappers inside <td>/<th> cells. A cell made only of
 *   paragraphs is flattened to its inline content; several paragraphs are
 *   joined with a single space. Cells mixing paragraphs with other content
 *   are left untouched.
 * - Wrap the first row in <thead> (turning its <td> cells into <th>) when
 *   the table starts directly with a row, optionally inside <tbody>.
 *
 * @param html HTML markup that may contain tables.
 * @returns the markup with table cells and header rows normalized.
 */
export function normalizeTablesHtml(html) {
    // Strip <p> tags inside table cells. The cell body may not run past another
    // <td>/<th> tag, so a single match can never swallow neighbouring cells.
    const paragraphCell = /<(td|th)\b([^>]*)>\s*<p\b[^>]*>((?:(?!<\/?(?:td|th)\b)[\s\S])*?)<\/p>\s*<\/(td|th)>/gi;
    let result = html.replace(paragraphCell, (match, openTag, attrs, inner, closeTag) => {
        // Paragraph boundaries inside the cell collapse to one space.
        const flattened = inner.replace(/<\/p>\s*<p\b[^>]*>/gi, " ");
        // Any paragraph tag still present means the cell is not a plain run of
        // paragraphs; stripping only the outer pair would unbalance the markup.
        if (/<\/?p\b/i.test(flattened)) {
            return match;
        }
        return `<${openTag}${attrs}>${flattened}</${closeTag}>`;
    });
    // Add thead to tables that lack it
    result = result.replace(/<table([^>]*)>\s*(?:<tbody>\s*)?(<tr[\s\S]*?<\/tr>)([\s\S]*?)<\/(?:tbody>\s*<\/)?table>/gi, (_match, attrs, firstRow, rest) => {
        const theadRow = firstRow
            .replace(/<td/gi, "<th")
            .replace(/<\/td>/gi, "</th>");
        return `<table${attrs}><thead>${theadRow}</thead><tbody>${rest}</tbody></table>`;
    });
    return result;
}
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "markit-ai",
|
|
3
|
-
"version": "0.
|
|
4
|
-
"description": "Convert anything to markdown.
|
|
3
|
+
"version": "0.3.0",
|
|
4
|
+
"description": "Convert anything to markdown. PDF, DOCX, PPTX, XLSX, HTML, EPUB, Jupyter, RSS, images, audio, URLs, and more. Pluggable converters, built-in LLM providers for image description and audio transcription. Works as a CLI and as a library.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|
|
7
7
|
"types": "dist/index.d.ts",
|
|
@@ -65,9 +65,10 @@
|
|
|
65
65
|
"fast-xml-parser": "^5.5.9",
|
|
66
66
|
"jszip": "^3.10.1",
|
|
67
67
|
"mammoth": "^1.9.0",
|
|
68
|
+
"mupdf": "^1.27.0",
|
|
68
69
|
"music-metadata": "^11.12.3",
|
|
69
70
|
"rss-parser": "^3.13.0",
|
|
70
71
|
"turndown": "^7.2.0",
|
|
71
|
-
"
|
|
72
|
+
"turndown-plugin-gfm": "^1.0.2"
|
|
72
73
|
}
|
|
73
74
|
}
|
package/dist/converters/pdf.d.ts
DELETED
|
@@ -1,6 +0,0 @@
|
|
|
1
|
-
import type { ConversionResult, Converter, StreamInfo } from "../types.js";
|
|
2
|
-
/**
 * Type declaration for the PDF converter class that this release removes.
 * The class implements the Converter interface imported from "../types.js".
 */
export declare class PdfConverter implements Converter {
|
|
3
|
-
/** Converter identifier. */
name: string;
|
|
4
|
-
/** Reports whether this converter handles the stream described by streamInfo. */
accepts(streamInfo: StreamInfo): boolean;
|
|
5
|
-
/** Converts the input buffer and resolves to a ConversionResult. */
convert(input: Buffer, _streamInfo: StreamInfo): Promise<ConversionResult>;
|
|
6
|
-
}
|
package/dist/converters/pdf.js
DELETED
|
@@ -1,29 +0,0 @@
|
|
|
1
|
-
/** File extensions (lowercase, with leading dot) handled by PdfConverter. */
const EXTENSIONS = [".pdf"];
/** Lowercase MIME type prefixes handled by PdfConverter. */
const MIMETYPES = ["application/pdf", "application/x-pdf"];
/**
 * Converter that extracts the text of a PDF using the "unpdf" package, which
 * is loaded on demand.
 */
export class PdfConverter {
    name = "pdf";
    /**
     * Decide whether the stream looks like a PDF.
     *
     * The extension and MIME type are compared case-insensitively, so ".PDF"
     * and "Application/PDF" are accepted. A MIME type matches when it starts
     * with one of the known types, which tolerates trailing parameters such
     * as "; charset=binary".
     *
     * @param streamInfo object with optional `extension` and `mimetype` strings.
     * @returns true when the extension or the MIME type identifies a PDF.
     */
    accepts(streamInfo) {
        const extension = streamInfo.extension?.toLowerCase();
        if (extension && EXTENSIONS.includes(extension)) {
            return true;
        }
        const mimetype = streamInfo.mimetype?.toLowerCase();
        if (mimetype && MIMETYPES.some((m) => mimetype.startsWith(m))) {
            return true;
        }
        return false;
    }
    /**
     * Extract the text of a PDF and return it as the markdown result.
     *
     * @param input PDF bytes.
     * @param _streamInfo unused.
     * @returns an object whose `markdown` property is the trimmed text.
     * @throws Error when the "unpdf" module cannot be imported.
     */
    async convert(input, _streamInfo) {
        let extractText;
        try {
            ({ extractText } = await import("unpdf"));
        }
        catch {
            // Any failure while importing is reported as a missing dependency.
            throw new Error("PDF support requires 'unpdf'. Install it: npm install unpdf");
        }
        const result = await extractText(new Uint8Array(input));
        // An array result is joined with blank lines; anything else is stringified.
        const text = Array.isArray(result.text)
            ? result.text.join("\n\n")
            : String(result.text);
        return { markdown: text.trim() };
    }
}
|