markit-ai 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,4 +2,5 @@ import type { OutputOptions } from "../utils/output.js";
2
2
  export declare function convert(source: string, options: OutputOptions & {
3
3
  output?: string;
4
4
  prompt?: string;
5
+ imageDir?: string;
5
6
  }): Promise<void>;
@@ -1,4 +1,6 @@
1
- import { writeFileSync } from "node:fs";
1
+ import { mkdtempSync, writeFileSync } from "node:fs";
2
+ import { tmpdir } from "node:os";
3
+ import { join } from "node:path";
2
4
  import { loadConfig } from "../config.js";
3
5
  import { Markit } from "../markit.js";
4
6
  import { loadAllPlugins } from "../plugins/loader.js";
@@ -23,6 +25,8 @@ export async function convert(source, options) {
23
25
  }
24
26
  const llmFunctions = createLlmFunctions(config, options.prompt);
25
27
  const markit = new Markit(llmFunctions, plugins);
28
+ // Auto-create a temp dir for images if not explicitly provided
29
+ const imageDir = options.imageDir || mkdtempSync(join(tmpdir(), "markit-images-"));
26
30
  try {
27
31
  let result;
28
32
  const isStdin = source === "-";
@@ -36,7 +40,7 @@ export async function convert(source, options) {
36
40
  process.exit(EXIT_ERROR);
37
41
  }
38
42
  const buffer = await readStdin();
39
- result = await markit.convert(buffer, {});
43
+ result = await markit.convert(buffer, { imageDir });
40
44
  }
41
45
  else if (isUrl) {
42
46
  // Progress hint for URL fetches (stderr so it doesn't pollute piped output)
@@ -46,7 +50,7 @@ export async function convert(source, options) {
46
50
  result = await markit.convertUrl(source);
47
51
  }
48
52
  else {
49
- result = await markit.convertFile(source);
53
+ result = await markit.convertFile(source, { imageDir });
50
54
  }
51
55
  const label = isStdin ? "stdin" : source;
52
56
  // Write to file or stdout
@@ -23,6 +23,11 @@ const BUILTIN_FORMATS = [
23
23
  extensions: [".mp3", ".wav", ".m4a", ".flac"],
24
24
  builtin: true,
25
25
  },
26
+ {
27
+ name: "GitHub",
28
+ extensions: ["github.com/*", "gist.github.com/*"],
29
+ builtin: true,
30
+ },
26
31
  { name: "ZIP", extensions: [".zip"], builtin: true },
27
32
  {
28
33
  name: "Plain text",
@@ -2,5 +2,5 @@ import type { ConversionResult, Converter, StreamInfo } from "../types.js";
2
2
  export declare class DocxConverter implements Converter {
3
3
  name: string;
4
4
  accepts(streamInfo: StreamInfo): boolean;
5
- convert(input: Buffer, _streamInfo: StreamInfo): Promise<ConversionResult>;
5
+ convert(input: Buffer, streamInfo: StreamInfo): Promise<ConversionResult>;
6
6
  }
@@ -1,3 +1,5 @@
1
+ import { mkdirSync, writeFileSync } from "node:fs";
2
+ import { join } from "node:path";
1
3
  import mammoth from "mammoth";
2
4
  import { createTurndown, normalizeTablesHtml } from "../utils/turndown.js";
3
5
  const EXTENSIONS = [".docx"];
@@ -16,10 +18,40 @@ export class DocxConverter {
16
18
  }
17
19
  return false;
18
20
  }
19
- async convert(input, _streamInfo) {
20
- const { value: html } = await mammoth.convertToHtml({ buffer: input });
21
+ async convert(input, streamInfo) {
22
+ const imageDir = streamInfo.imageDir;
23
+ if (imageDir) {
24
+ mkdirSync(imageDir, { recursive: true });
25
+ }
26
+ let imageCount = 0;
27
+ const convertImage = imageDir
28
+ ? mammoth.images.imgElement((image) => {
29
+ imageCount++;
30
+ const ext = (image.contentType?.split("/")[1] || "png").replace("jpeg", "jpg");
31
+ const filename = `image_${imageCount}.${ext}`;
32
+ const filepath = join(imageDir, filename);
33
+ return image.read("base64").then((base64) => {
34
+ writeFileSync(filepath, Buffer.from(base64, "base64"));
35
+ return { src: filepath, alt: `image_${imageCount}` };
36
+ });
37
+ })
38
+ : mammoth.images.imgElement((image) => {
39
+ imageCount++;
40
+ const contentType = image.contentType || "image/png";
41
+ return image.read("base64").then((base64) => {
42
+ return {
43
+ src: `data:${contentType};base64,${base64.slice(0, 0)}`,
44
+ alt: `image_${imageCount}`,
45
+ };
46
+ });
47
+ });
48
+ const { value: html } = await mammoth.convertToHtml({ buffer: input }, { convertImage });
21
49
  const turndown = createTurndown();
22
- const markdown = turndown.turndown(normalizeTablesHtml(html));
50
+ let markdown = turndown.turndown(normalizeTablesHtml(html));
51
+ // Replace data URI images with comment placeholders when no imageDir
52
+ if (!imageDir) {
53
+ markdown = markdown.replace(/!\[([^\]]*)\]\(data:[^)]*\)/g, "<!-- image: $1 -->");
54
+ }
23
55
  return { markdown: markdown.trim() };
24
56
  }
25
57
  }
@@ -23,6 +23,7 @@ export class EpubConverter {
23
23
  ignoreAttributes: false,
24
24
  attributeNamePrefix: "@_",
25
25
  textNodeName: "#text",
26
+ processEntities: { maxTotalExpansions: 1_000_000 },
26
27
  });
27
28
  // Find content.opf path from container.xml
28
29
  const containerXml = await zip
@@ -0,0 +1,18 @@
1
+ import type { ConversionResult, Converter, MarkitOptions, StreamInfo } from "../types.js";
2
+ /**
3
+ * Matches GitHub URLs and fetches clean markdown content directly
4
+ * from raw endpoints or the GitHub API — no HTML scraping needed.
5
+ *
6
+ * Supported patterns:
7
+ * - Repos: github.com/owner/repo → raw README.md
8
+ * - Files: github.com/owner/repo/blob/… → raw file content
9
+ * - Gists: gist.github.com/owner/id → raw gist content
10
+ * - Issues: github.com/owner/repo/issues/N → API (title + body)
11
+ * - PRs: github.com/owner/repo/pull/N → API (title + body)
12
+ */
13
+ export declare class GitHubConverter implements Converter {
14
+ name: string;
15
+ accepts(streamInfo: StreamInfo): boolean;
16
+ convertUrl(url: string, _options?: MarkitOptions): Promise<ConversionResult>;
17
+ convert(_input: Buffer, streamInfo: StreamInfo): Promise<ConversionResult>;
18
+ }
@@ -0,0 +1,148 @@
1
+ const GITHUB_HOSTS = new Set([
2
+ "github.com",
3
+ "www.github.com",
4
+ "gist.github.com",
5
+ ]);
6
+ /**
7
+ * Matches GitHub URLs and fetches clean markdown content directly
8
+ * from raw endpoints or the GitHub API — no HTML scraping needed.
9
+ *
10
+ * Supported patterns:
11
+ * - Repos: github.com/owner/repo → raw README.md
12
+ * - Files: github.com/owner/repo/blob/… → raw file content
13
+ * - Gists: gist.github.com/owner/id → raw gist content
14
+ * - Issues: github.com/owner/repo/issues/N → API (title + body)
15
+ * - PRs: github.com/owner/repo/pull/N → API (title + body)
16
+ */
17
+ export class GitHubConverter {
18
+ name = "github";
19
+ accepts(streamInfo) {
20
+ if (!streamInfo.url)
21
+ return false;
22
+ try {
23
+ const { hostname } = new URL(streamInfo.url);
24
+ return GITHUB_HOSTS.has(hostname);
25
+ }
26
+ catch {
27
+ return false;
28
+ }
29
+ }
30
+ async convertUrl(url, _options) {
31
+ const parsed = new URL(url);
32
+ if (parsed.hostname === "gist.github.com") {
33
+ return fetchGist(parsed);
34
+ }
35
+ const segments = parsed.pathname.split("/").filter(Boolean);
36
+ // Need at least owner/repo
37
+ if (segments.length < 2) {
38
+ throw new Error(`Unsupported GitHub URL: ${url}`);
39
+ }
40
+ const [owner, repo, type, ...rest] = segments;
41
+ // github.com/owner/repo/blob/ref/path → raw file
42
+ if (type === "blob" && rest.length >= 2) {
43
+ const ref = rest[0];
44
+ const filePath = rest.slice(1).join("/");
45
+ return fetchRawFile(owner, repo, ref, filePath);
46
+ }
47
+ // github.com/owner/repo/issues/N or /pull/N
48
+ if ((type === "issues" || type === "pull") && rest[0]) {
49
+ const number = Number.parseInt(rest[0], 10);
50
+ if (!Number.isNaN(number)) {
51
+ return fetchIssueOrPr(owner, repo, number);
52
+ }
53
+ }
54
+ // github.com/owner/repo (no subpath or tree/wiki/etc) → README
55
+ if (!type) {
56
+ return fetchReadme(owner, repo);
57
+ }
58
+ throw new Error(`Unsupported GitHub URL pattern: ${url}`);
59
+ }
60
+ async convert(_input, streamInfo) {
61
+ // GitHub URLs are handled entirely via convertUrl.
62
+ // If we end up here, the URL was already fetched by the default path —
63
+ // just delegate to convertUrl.
64
+ if (streamInfo.url) {
65
+ return this.convertUrl(streamInfo.url);
66
+ }
67
+ throw new Error("GitHub converter requires a URL");
68
+ }
69
+ }
70
+ // ---------------------------------------------------------------------------
71
+ // Fetchers
72
+ // ---------------------------------------------------------------------------
73
+ async function fetchReadme(owner, repo) {
74
+ const url = `https://raw.githubusercontent.com/${owner}/${repo}/HEAD/README.md`;
75
+ const res = await fetch(url);
76
+ if (!res.ok) {
77
+ throw new Error(`Failed to fetch README: ${res.status} ${res.statusText}`);
78
+ }
79
+ const markdown = (await res.text()).trim();
80
+ const title = extractFirstHeading(markdown) ?? `${owner}/${repo}`;
81
+ return { markdown, title };
82
+ }
83
+ async function fetchRawFile(owner, repo, ref, filePath) {
84
+ const url = `https://raw.githubusercontent.com/${owner}/${repo}/${ref}/${filePath}`;
85
+ const res = await fetch(url);
86
+ if (!res.ok) {
87
+ throw new Error(`Failed to fetch file: ${res.status} ${res.statusText}`);
88
+ }
89
+ const content = (await res.text()).trim();
90
+ const filename = filePath.split("/").pop() ?? filePath;
91
+ // If it's markdown, return as-is. Otherwise wrap in a code block.
92
+ if (filePath.endsWith(".md") || filePath.endsWith(".mdx")) {
93
+ const title = extractFirstHeading(content) ?? filename;
94
+ return { markdown: content, title };
95
+ }
96
+ const ext = filename.includes(".") ? filename.split(".").pop() : "";
97
+ const markdown = `# ${filename}\n\n\`\`\`${ext}\n${content}\n\`\`\``;
98
+ return { markdown, title: filename };
99
+ }
100
+ async function fetchGist(parsed) {
101
+ const segments = parsed.pathname.split("/").filter(Boolean);
102
+ // gist.github.com/owner/id
103
+ const [owner, id] = segments;
104
+ if (!owner || !id) {
105
+ throw new Error(`Unsupported gist URL: ${parsed.href}`);
106
+ }
107
+ const url = `https://gist.githubusercontent.com/${owner}/${id}/raw`;
108
+ const res = await fetch(url);
109
+ if (!res.ok) {
110
+ throw new Error(`Failed to fetch gist: ${res.status} ${res.statusText}`);
111
+ }
112
+ const content = (await res.text()).trim();
113
+ const title = `gist:${id}`;
114
+ return { markdown: content, title };
115
+ }
116
+ async function fetchIssueOrPr(owner, repo, number) {
117
+ const url = `https://api.github.com/repos/${owner}/${repo}/issues/${number}`;
118
+ const res = await fetch(url, {
119
+ headers: { Accept: "application/vnd.github.v3+json" },
120
+ });
121
+ if (!res.ok) {
122
+ throw new Error(`Failed to fetch issue/PR: ${res.status} ${res.statusText}`);
123
+ }
124
+ const data = (await res.json());
125
+ const title = data.title ?? `#${number}`;
126
+ const parts = [`# ${title}`];
127
+ // Metadata line
128
+ const meta = [];
129
+ if (data.user?.login)
130
+ meta.push(`@${data.user.login}`);
131
+ if (data.state)
132
+ meta.push(data.state);
133
+ if (data.labels?.length) {
134
+ meta.push(data.labels.map((l) => l.name).join(", "));
135
+ }
136
+ if (meta.length > 0)
137
+ parts.push(meta.join(" · "));
138
+ if (data.body?.trim())
139
+ parts.push(data.body.trim());
140
+ return { markdown: parts.join("\n\n"), title };
141
+ }
142
+ // ---------------------------------------------------------------------------
143
+ // Helpers
144
+ // ---------------------------------------------------------------------------
145
+ function extractFirstHeading(markdown) {
146
+ const match = markdown.match(/^#\s+(.+)$/m);
147
+ return match?.[1]?.trim();
148
+ }
@@ -0,0 +1,35 @@
1
+ /**
2
+ * Multi-column layout detection and text box reordering.
3
+ *
4
+ * Many PDFs (legal documents, datasheets, academic papers) use two-column
5
+ * layouts. Without column detection, text boxes are ordered by Y position
6
+ * only, interleaving left and right column content.
7
+ *
8
+ * Algorithm:
9
+ * 1. Collect left edges of all text boxes on the page
10
+ * 2. Find the largest horizontal gap between consecutive left edges
11
+ * 3. If gap > MIN_GAP_RATIO of the text width and both sides have
12
+ * enough boxes → multi-column detected
13
+ * 4. Assign each text box to a column based on its center X
14
+ * 5. Return columns in reading order (left-to-right, top-to-bottom)
15
+ *
16
+ * This only detects the column structure. The caller is responsible for
17
+ * processing each column's text boxes independently (table detection,
18
+ * rendering, etc.).
19
+ */
20
+ import type { TextBox } from "./types.js";
21
+ export interface ColumnLayout {
22
+ /** Number of columns detected (1 = single column, 2+ = multi-column). */
23
+ columnCount: number;
24
+ /** Text boxes grouped by column, in reading order (left to right). */
25
+ columns: TextBox[][];
26
+ /** X positions of column boundaries (between columns). */
27
+ boundaries: number[];
28
+ }
29
+ /**
30
+ * Detect column layout and return text boxes grouped by column.
31
+ *
32
+ * For single-column pages, returns all boxes in one group.
33
+ * For multi-column pages, returns boxes split by column in reading order.
34
+ */
35
+ export declare function detectColumns(textBoxes: TextBox[]): ColumnLayout;
@@ -0,0 +1,93 @@
1
+ /**
2
+ * Multi-column layout detection and text box reordering.
3
+ *
4
+ * Many PDFs (legal documents, datasheets, academic papers) use two-column
5
+ * layouts. Without column detection, text boxes are ordered by Y position
6
+ * only, interleaving left and right column content.
7
+ *
8
+ * Algorithm:
9
+ * 1. Collect left edges of all text boxes on the page
10
+ * 2. Find the largest horizontal gap between consecutive left edges
11
+ * 3. If gap > MIN_GAP_RATIO of the text width and both sides have
12
+ * enough boxes → multi-column detected
13
+ * 4. Assign each text box to a column based on its center X
14
+ * 5. Return columns in reading order (left-to-right, top-to-bottom)
15
+ *
16
+ * This only detects the column structure. The caller is responsible for
17
+ * processing each column's text boxes independently (table detection,
18
+ * rendering, etc.).
19
+ */
20
+ /**
21
+ * Minimum gap as a fraction of the total text width to consider a column
22
+ * boundary. A two-column layout typically has ~50% gap; we use a lower
23
+ * threshold to catch asymmetric columns.
24
+ */
25
+ const MIN_GAP_RATIO = 0.15;
26
+ /** Minimum number of text boxes on each side of the gap. */
27
+ const MIN_BOXES_PER_COLUMN = 4;
28
+ /** Minimum gap in absolute points to avoid splitting on small whitespace. */
29
+ const MIN_GAP_PTS = 40;
30
+ /**
31
+ * Detect column layout and return text boxes grouped by column.
32
+ *
33
+ * For single-column pages, returns all boxes in one group.
34
+ * For multi-column pages, returns boxes split by column in reading order.
35
+ */
36
+ export function detectColumns(textBoxes) {
37
+ if (textBoxes.length < MIN_BOXES_PER_COLUMN * 2) {
38
+ return { columnCount: 1, columns: [textBoxes], boundaries: [] };
39
+ }
40
+ // Collect unique left edges (rounded to avoid float noise)
41
+ const lefts = [
42
+ ...new Set(textBoxes.map((tb) => Math.round(tb.bounds.left))),
43
+ ].sort((a, b) => a - b);
44
+ if (lefts.length < 2) {
45
+ return { columnCount: 1, columns: [textBoxes], boundaries: [] };
46
+ }
47
+ const textXMin = lefts[0];
48
+ const textXMax = Math.max(...textBoxes.map((tb) => Math.round(tb.bounds.right)));
49
+ const textWidth = textXMax - textXMin;
50
+ if (textWidth <= 0) {
51
+ return { columnCount: 1, columns: [textBoxes], boundaries: [] };
52
+ }
53
+ // Find the largest gap between consecutive left-edge positions
54
+ let maxGap = 0;
55
+ let gapLeft = 0;
56
+ let gapRight = 0;
57
+ for (let i = 1; i < lefts.length; i++) {
58
+ const gap = lefts[i] - lefts[i - 1];
59
+ if (gap > maxGap) {
60
+ maxGap = gap;
61
+ gapLeft = lefts[i - 1];
62
+ gapRight = lefts[i];
63
+ }
64
+ }
65
+ const gapRatio = maxGap / textWidth;
66
+ if (gapRatio < MIN_GAP_RATIO || maxGap < MIN_GAP_PTS) {
67
+ return { columnCount: 1, columns: [textBoxes], boundaries: [] };
68
+ }
69
+ // Split point is the midpoint of the gap
70
+ const splitX = (gapLeft + gapRight) / 2;
71
+ // Assign boxes to columns based on center X
72
+ const leftCol = [];
73
+ const rightCol = [];
74
+ for (const tb of textBoxes) {
75
+ const cx = (tb.bounds.left + tb.bounds.right) / 2;
76
+ if (cx < splitX) {
77
+ leftCol.push(tb);
78
+ }
79
+ else {
80
+ rightCol.push(tb);
81
+ }
82
+ }
83
+ // Validate both columns have enough content
84
+ if (leftCol.length < MIN_BOXES_PER_COLUMN ||
85
+ rightCol.length < MIN_BOXES_PER_COLUMN) {
86
+ return { columnCount: 1, columns: [textBoxes], boundaries: [] };
87
+ }
88
+ return {
89
+ columnCount: 2,
90
+ columns: [leftCol, rightCol],
91
+ boundaries: [splitX],
92
+ };
93
+ }
@@ -0,0 +1,19 @@
1
+ /**
2
+ * PDF content extraction using mupdf.
3
+ *
4
+ * Extracts text boxes (with position, font size, bold) and vector line
5
+ * segments (table borders) from each page. Uses mupdf's native WASM
6
+ * engine for fast parsing, and reads raw content streams for vector graphics.
7
+ *
8
+ * Coordinate system: PDF native (origin = bottom-left, Y increases upward).
9
+ */
10
+ import type { ImageRegion, PageContent } from "./types.js";
11
+ /**
12
+ * Render an image region from a PDF page as a PNG buffer.
13
+ * Uses mupdf's DrawDevice to render just the cropped area at 2x resolution.
14
+ */
15
+ export declare function renderImageRegion(input: Uint8Array, region: ImageRegion): Uint8Array;
16
+ /**
17
+ * Extract text boxes and vector segments from all pages of a PDF buffer.
18
+ */
19
+ export declare function extractPages(input: Uint8Array): Promise<PageContent[]>;