markit-ai 0.1.3 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
  import mammoth from "mammoth";
2
- import TurndownService from "turndown";
2
+ import { createTurndown, normalizeTablesHtml } from "../utils/turndown.js";
3
3
  const EXTENSIONS = [".docx"];
4
4
  const MIMETYPES = [
5
5
  "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -18,11 +18,8 @@ export class DocxConverter {
18
18
  }
19
19
  async convert(input, _streamInfo) {
20
20
  const { value: html } = await mammoth.convertToHtml({ buffer: input });
21
- const turndown = new TurndownService({
22
- headingStyle: "atx",
23
- codeBlockStyle: "fenced",
24
- });
25
- const markdown = turndown.turndown(html);
21
+ const turndown = createTurndown();
22
+ const markdown = turndown.turndown(normalizeTablesHtml(html));
26
23
  return { markdown: markdown.trim() };
27
24
  }
28
25
  }
@@ -1,6 +1,6 @@
1
1
  import { XMLParser } from "fast-xml-parser";
2
2
  import JSZip from "jszip";
3
- import TurndownService from "turndown";
3
+ import { createTurndown, normalizeTablesHtml } from "../utils/turndown.js";
4
4
  const EXTENSIONS = [".epub"];
5
5
  const MIMETYPES = [
6
6
  "application/epub",
@@ -75,10 +75,7 @@ export class EpubConverter {
75
75
  const basePath = opfPath.includes("/")
76
76
  ? opfPath.substring(0, opfPath.lastIndexOf("/"))
77
77
  : "";
78
- const turndown = new TurndownService({
79
- headingStyle: "atx",
80
- codeBlockStyle: "fenced",
81
- });
78
+ const turndown = createTurndown();
82
79
  const sections = [];
83
80
  // Add metadata header
84
81
  const metaLines = [];
@@ -101,7 +98,7 @@ export class EpubConverter {
101
98
  const cleaned = html
102
99
  .replace(/<script[\s\S]*?<\/script>/gi, "")
103
100
  .replace(/<style[\s\S]*?<\/style>/gi, "");
104
- const md = turndown.turndown(cleaned).trim();
101
+ const md = turndown.turndown(normalizeTablesHtml(cleaned)).trim();
105
102
  if (md)
106
103
  sections.push(md);
107
104
  }
@@ -1,4 +1,4 @@
1
- import TurndownService from "turndown";
1
+ import { createTurndown, normalizeTablesHtml } from "../utils/turndown.js";
2
2
  const EXTENSIONS = [".html", ".htm"];
3
3
  const MIMETYPES = ["text/html", "application/xhtml"];
4
4
  export class HtmlConverter {
@@ -16,15 +16,12 @@ export class HtmlConverter {
16
16
  async convert(input, streamInfo) {
17
17
  const charset = streamInfo.charset || "utf-8";
18
18
  const html = new TextDecoder(charset).decode(input);
19
- const turndown = new TurndownService({
20
- headingStyle: "atx",
21
- codeBlockStyle: "fenced",
22
- });
19
+ const turndown = createTurndown();
23
20
  // Remove script and style tags before converting
24
21
  const cleaned = html
25
22
  .replace(/<script[\s\S]*?<\/script>/gi, "")
26
23
  .replace(/<style[\s\S]*?<\/style>/gi, "");
27
- const markdown = turndown.turndown(cleaned);
24
+ const markdown = turndown.turndown(normalizeTablesHtml(cleaned));
28
25
  // Try to extract title
29
26
  const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
30
27
  const title = titleMatch ? titleMatch[1].trim() : undefined;
@@ -1,4 +1,4 @@
1
- import TurndownService from "turndown";
1
+ import { createTurndown } from "../utils/turndown.js";
2
2
  const _EXTENSIONS = [".rss", ".atom", ".xml"];
3
3
  const MIMETYPES = [
4
4
  "application/rss+xml",
@@ -36,7 +36,7 @@ export class RssConverter {
36
36
  throw new Error("Not an RSS or Atom feed");
37
37
  }
38
38
  parseRss(xml) {
39
- const turndown = new TurndownService({ headingStyle: "atx" });
39
+ const turndown = createTurndown();
40
40
  const sections = [];
41
41
  // Extract from the <channel> block specifically
42
42
  const channelMatch = xml.match(/<channel>([\s\S]*?)<\/channel>/i);
@@ -74,7 +74,7 @@ export class RssConverter {
74
74
  return { markdown: sections.join("\n\n").trim(), title: channelTitle };
75
75
  }
76
76
  parseAtom(xml) {
77
- const turndown = new TurndownService({ headingStyle: "atx" });
77
+ const turndown = createTurndown();
78
78
  const sections = [];
79
79
  const feedTitle = this.extract(xml, "title");
80
80
  const subtitle = this.extract(xml, "subtitle");
@@ -1,4 +1,4 @@
1
- import TurndownService from "turndown";
1
+ import { createTurndown } from "../utils/turndown.js";
2
2
  const WIKIPEDIA_RE = /^https?:\/\/[a-zA-Z]{2,3}\.wikipedia\.org\//;
3
3
  export class WikipediaConverter {
4
4
  name = "wikipedia";
@@ -16,10 +16,7 @@ export class WikipediaConverter {
16
16
  const title = titleMatch
17
17
  ? titleMatch[1].replace(/ - Wikipedia$/, "").trim()
18
18
  : undefined;
19
- const turndown = new TurndownService({
20
- headingStyle: "atx",
21
- codeBlockStyle: "fenced",
22
- });
19
+ const turndown = createTurndown();
23
20
  // Clean up Wikipedia-specific elements
24
21
  let content = contentMatch ? contentMatch[1] : html;
25
22
  content = content
@@ -0,0 +1,8 @@
1
+ import TurndownService from "turndown";
2
+ export declare function createTurndown(): TurndownService;
3
+ /**
4
+ * Normalize HTML tables so turndown-plugin-gfm can handle them:
5
+ * - Wrap first row in <thead> if missing
6
+ * - Strip <p> tags inside <td>/<th> cells
7
+ */
8
+ export declare function normalizeTablesHtml(html: string): string;
@@ -0,0 +1,64 @@
1
+ import TurndownService from "turndown";
2
+ import { gfm } from "turndown-plugin-gfm";
3
/**
 * Create the TurndownService used to turn HTML into Markdown.
 *
 * The instance is configured for ATX headings, fenced code blocks and "-"
 * bullets, loads the GFM plugin, and then overrides three rules:
 * - strikethrough: emits `~~text~~`
 * - heading: removes the backslash placed before periods
 * - listItem: one space after the marker, with continuation lines indented
 *   to the marker's width
 *
 * @returns a newly constructed, fully configured TurndownService.
 */
export function createTurndown() {
  const turndown = new TurndownService({
    headingStyle: "atx",
    codeBlockStyle: "fenced",
    bulletListMarker: "-",
  });
  turndown.use(gfm);

  // Fix strikethrough: GFM spec uses ~~ (double tilde), not ~ (single)
  turndown.addRule("strikethrough", {
    filter: ["del", "s", "strike"],
    replacement(content) {
      return `~~${content}~~`;
    },
  });

  // Fix heading escaping: turndown escapes "1." to "1\." to avoid ordered lists
  turndown.addRule("heading", {
    filter: ["h1", "h2", "h3", "h4", "h5", "h6"],
    replacement(content, node) {
      // nodeName is "H1".."H6"; the digit is the heading level.
      const level = Number(node.nodeName.charAt(1));
      const prefix = "#".repeat(level);
      // Unescape unnecessary backslash before periods in headings
      const cleaned = content.replace(/\\([.])/g, "$1").trim();
      return `\n\n${prefix} ${cleaned}\n\n`;
    },
  });

  // Override listItem rule to use single space after marker (turndown hardcodes 3)
  turndown.addRule("listItem", {
    filter: "li",
    replacement(content, node, options) {
      const parent = node.parentNode;
      let prefix = `${options.bulletListMarker} `;
      if (parent?.nodeName === "OL") {
        // A missing or non-numeric `start` falls back to 1 rather than
        // producing a "NaN. " marker.
        const parsedStart = Number.parseInt(parent.getAttribute("start") ?? "", 10);
        const first = Number.isNaN(parsedStart) ? 1 : parsedStart;
        const index = Array.prototype.indexOf.call(parent.children, node);
        prefix = `${first + index}. `;
      }
      // Continuation lines (wrapped text, nested lists) must start at the
      // item's content column, i.e. be indented by the marker's width,
      // otherwise Markdown parsers no longer attach them to this item.
      const indent = " ".repeat(prefix.length);
      const body = content
        .replace(/^\n+/, "") // drop leading newlines
        .replace(/\n+$/, "\n") // collapse trailing newlines to one
        .replace(/\n/g, `\n${indent}`);
      return prefix + body + (node.nextSibling ? "\n" : "");
    },
  });

  return turndown;
}
48
// One <td>/<th> cell: tag name, raw attribute text, inner markup. The tag name
// must be followed by whitespace or ">" so <thead> is never taken for <th>.
const TABLE_CELL_RE = /<(td|th)((?:\s[^>]*)?)>([\s\S]*?)<\/\1\s*>/gi;
// Any opening or closing <p> tag, with or without attributes (never <pre> etc.).
const ANY_PARAGRAPH_TAG_RE = /<\/?p(?:\s[^>]*)?>/i;
const ALL_PARAGRAPH_TAGS_RE = /<\/?p(?:\s[^>]*)?>/gi;
// The boundary between two consecutive paragraphs.
const PARAGRAPH_BREAK_RE = /<\/p\s*>\s*<p(?:\s[^>]*)?>/gi;
const LEADING_PARAGRAPH_RE = /^\s*<p(?:\s[^>]*)?>/i;
const TRAILING_PARAGRAPH_RE = /<\/p\s*>\s*$/i;

/**
 * Remove every <p> wrapper from the inner markup of a single table cell.
 * Consecutive paragraphs are joined with one space so the cell stays on a
 * single line; whitespace between the cell edge and an outer <p> is dropped.
 */
function unwrapCellParagraphs(inner) {
  return inner
    .replace(PARAGRAPH_BREAK_RE, " ")
    .replace(LEADING_PARAGRAPH_RE, "")
    .replace(TRAILING_PARAGRAPH_RE, "")
    .replace(ALL_PARAGRAPH_TAGS_RE, "");
}

/**
 * Normalize HTML tables so turndown-plugin-gfm can handle them:
 * - Strip <p> tags inside <td>/<th> cells (every paragraph in the cell, with
 *   or without attributes)
 * - Wrap first row in <thead> if missing, converting its <td> cells to <th>
 *
 * The matching is regex-based: a cell or table is taken to end at the first
 * matching closing tag, so nested tables are not handled.
 *
 * @param html HTML markup that may contain tables.
 * @returns the markup with table cells and header rows normalized.
 */
export function normalizeTablesHtml(html) {
  // Strip <p> tags inside table cells, one cell at a time so a match can never
  // run across a cell boundary. Cells without paragraphs are left untouched.
  let result = html.replace(TABLE_CELL_RE, (match, tag, attrs, inner) =>
    ANY_PARAGRAPH_TAG_RE.test(inner)
      ? `<${tag}${attrs}>${unwrapCellParagraphs(inner)}</${tag}>`
      : match,
  );
  // Add thead to tables that lack it: only tables whose first child is a row
  // (optionally inside <tbody>) match, so an existing <thead> is preserved.
  result = result.replace(
    /<table([^>]*)>\s*(?:<tbody>\s*)?(<tr[\s\S]*?<\/tr>)([\s\S]*?)<\/(?:tbody>\s*<\/)?table>/gi,
    (_match, attrs, firstRow, rest) => {
      const theadRow = firstRow
        .replace(/<td/gi, "<th")
        .replace(/<\/td>/gi, "</th>");
      return `<table${attrs}><thead>${theadRow}</thead><tbody>${rest}</tbody></table>`;
    },
  );
  return result;
}
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "markit-ai",
3
- "version": "0.1.3",
4
- "description": "Convert anything to markdown. PDFs, DOCX, HTML, URLs everything gets milled.",
3
+ "version": "0.2.0",
4
+ "description": "Convert anything to markdown. PDF, DOCX, PPTX, XLSX, HTML, EPUB, Jupyter, RSS, images, audio, URLs, and more. Pluggable converters, built-in LLM providers for image description and audio transcription. Works as a CLI and as a library.",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
7
7
  "types": "dist/index.d.ts",
@@ -68,6 +68,7 @@
68
68
  "music-metadata": "^11.12.3",
69
69
  "rss-parser": "^3.13.0",
70
70
  "turndown": "^7.2.0",
71
+ "turndown-plugin-gfm": "^1.0.2",
71
72
  "unpdf": "^1.4.0"
72
73
  }
73
74
  }