markit-ai 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/convert.d.ts +1 -0
- package/dist/commands/convert.js +7 -3
- package/dist/commands/formats.js +5 -0
- package/dist/converters/docx.d.ts +1 -1
- package/dist/converters/docx.js +35 -3
- package/dist/converters/epub.js +1 -0
- package/dist/converters/github.d.ts +18 -0
- package/dist/converters/github.js +148 -0
- package/dist/converters/pdf/columns.d.ts +35 -0
- package/dist/converters/pdf/columns.js +93 -0
- package/dist/converters/pdf/extract.d.ts +19 -0
- package/dist/converters/pdf/extract.js +513 -0
- package/dist/converters/pdf/grid.d.ts +25 -0
- package/dist/converters/pdf/grid.js +654 -0
- package/dist/converters/pdf/headers.d.ts +24 -0
- package/dist/converters/pdf/headers.js +108 -0
- package/dist/converters/pdf/index.d.ts +19 -0
- package/dist/converters/pdf/index.js +116 -0
- package/dist/converters/pdf/render.d.ts +24 -0
- package/dist/converters/pdf/render.js +513 -0
- package/dist/converters/pdf/types.d.ts +75 -0
- package/dist/converters/pdf/types.js +1 -0
- package/dist/converters/pptx.d.ts +1 -1
- package/dist/converters/pptx.js +74 -1
- package/dist/converters/xlsx.js +1 -0
- package/dist/index.d.ts +2 -1
- package/dist/index.js +2 -1
- package/dist/main.js +4 -1
- package/dist/markit.d.ts +1 -1
- package/dist/markit.js +19 -4
- package/dist/types.d.ts +8 -0
- package/package.json +3 -3
- package/dist/converters/pdf.d.ts +0 -6
- package/dist/converters/pdf.js +0 -29
package/dist/commands/convert.js
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
|
-
import { writeFileSync } from "node:fs";
|
|
1
|
+
import { mkdtempSync, writeFileSync } from "node:fs";
|
|
2
|
+
import { tmpdir } from "node:os";
|
|
3
|
+
import { join } from "node:path";
|
|
2
4
|
import { loadConfig } from "../config.js";
|
|
3
5
|
import { Markit } from "../markit.js";
|
|
4
6
|
import { loadAllPlugins } from "../plugins/loader.js";
|
|
@@ -23,6 +25,8 @@ export async function convert(source, options) {
|
|
|
23
25
|
}
|
|
24
26
|
const llmFunctions = createLlmFunctions(config, options.prompt);
|
|
25
27
|
const markit = new Markit(llmFunctions, plugins);
|
|
28
|
+
// Auto-create a temp dir for images if not explicitly provided
|
|
29
|
+
const imageDir = options.imageDir || mkdtempSync(join(tmpdir(), "markit-images-"));
|
|
26
30
|
try {
|
|
27
31
|
let result;
|
|
28
32
|
const isStdin = source === "-";
|
|
@@ -36,7 +40,7 @@ export async function convert(source, options) {
|
|
|
36
40
|
process.exit(EXIT_ERROR);
|
|
37
41
|
}
|
|
38
42
|
const buffer = await readStdin();
|
|
39
|
-
result = await markit.convert(buffer, {});
|
|
43
|
+
result = await markit.convert(buffer, { imageDir });
|
|
40
44
|
}
|
|
41
45
|
else if (isUrl) {
|
|
42
46
|
// Progress hint for URL fetches (stderr so it doesn't pollute piped output)
|
|
@@ -46,7 +50,7 @@ export async function convert(source, options) {
|
|
|
46
50
|
result = await markit.convertUrl(source);
|
|
47
51
|
}
|
|
48
52
|
else {
|
|
49
|
-
result = await markit.convertFile(source);
|
|
53
|
+
result = await markit.convertFile(source, { imageDir });
|
|
50
54
|
}
|
|
51
55
|
const label = isStdin ? "stdin" : source;
|
|
52
56
|
// Write to file or stdout
|
package/dist/commands/formats.js
CHANGED
|
@@ -23,6 +23,11 @@ const BUILTIN_FORMATS = [
|
|
|
23
23
|
extensions: [".mp3", ".wav", ".m4a", ".flac"],
|
|
24
24
|
builtin: true,
|
|
25
25
|
},
|
|
26
|
+
{
|
|
27
|
+
name: "GitHub",
|
|
28
|
+
extensions: ["github.com/*", "gist.github.com/*"],
|
|
29
|
+
builtin: true,
|
|
30
|
+
},
|
|
26
31
|
{ name: "ZIP", extensions: [".zip"], builtin: true },
|
|
27
32
|
{
|
|
28
33
|
name: "Plain text",
|
|
@@ -2,5 +2,5 @@ import type { ConversionResult, Converter, StreamInfo } from "../types.js";
|
|
|
2
2
|
export declare class DocxConverter implements Converter {
|
|
3
3
|
name: string;
|
|
4
4
|
accepts(streamInfo: StreamInfo): boolean;
|
|
5
|
-
convert(input: Buffer,
|
|
5
|
+
convert(input: Buffer, streamInfo: StreamInfo): Promise<ConversionResult>;
|
|
6
6
|
}
|
package/dist/converters/docx.js
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import { mkdirSync, writeFileSync } from "node:fs";
|
|
2
|
+
import { join } from "node:path";
|
|
1
3
|
import mammoth from "mammoth";
|
|
2
4
|
import { createTurndown, normalizeTablesHtml } from "../utils/turndown.js";
|
|
3
5
|
const EXTENSIONS = [".docx"];
|
|
@@ -16,10 +18,40 @@ export class DocxConverter {
|
|
|
16
18
|
}
|
|
17
19
|
return false;
|
|
18
20
|
}
|
|
19
|
-
async convert(input,
|
|
20
|
-
const
|
|
21
|
+
async convert(input, streamInfo) {
|
|
22
|
+
const imageDir = streamInfo.imageDir;
|
|
23
|
+
if (imageDir) {
|
|
24
|
+
mkdirSync(imageDir, { recursive: true });
|
|
25
|
+
}
|
|
26
|
+
let imageCount = 0;
|
|
27
|
+
const convertImage = imageDir
|
|
28
|
+
? mammoth.images.imgElement((image) => {
|
|
29
|
+
imageCount++;
|
|
30
|
+
const ext = (image.contentType?.split("/")[1] || "png").replace("jpeg", "jpg");
|
|
31
|
+
const filename = `image_${imageCount}.${ext}`;
|
|
32
|
+
const filepath = join(imageDir, filename);
|
|
33
|
+
return image.read("base64").then((base64) => {
|
|
34
|
+
writeFileSync(filepath, Buffer.from(base64, "base64"));
|
|
35
|
+
return { src: filepath, alt: `image_${imageCount}` };
|
|
36
|
+
});
|
|
37
|
+
})
|
|
38
|
+
: mammoth.images.imgElement((image) => {
|
|
39
|
+
imageCount++;
|
|
40
|
+
const contentType = image.contentType || "image/png";
|
|
41
|
+
return image.read("base64").then((base64) => {
|
|
42
|
+
return {
|
|
43
|
+
src: `data:${contentType};base64,${base64.slice(0, 0)}`,
|
|
44
|
+
alt: `image_${imageCount}`,
|
|
45
|
+
};
|
|
46
|
+
});
|
|
47
|
+
});
|
|
48
|
+
const { value: html } = await mammoth.convertToHtml({ buffer: input }, { convertImage });
|
|
21
49
|
const turndown = createTurndown();
|
|
22
|
-
|
|
50
|
+
let markdown = turndown.turndown(normalizeTablesHtml(html));
|
|
51
|
+
// Replace data URI images with comment placeholders when no imageDir
|
|
52
|
+
if (!imageDir) {
|
|
53
|
+
markdown = markdown.replace(/!\[([^\]]*)\]\(data:[^)]*\)/g, "<!-- image: $1 -->");
|
|
54
|
+
}
|
|
23
55
|
return { markdown: markdown.trim() };
|
|
24
56
|
}
|
|
25
57
|
}
|
package/dist/converters/epub.js
CHANGED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import type { ConversionResult, Converter, MarkitOptions, StreamInfo } from "../types.js";
|
|
2
|
+
/**
|
|
3
|
+
* Matches GitHub URLs and fetches clean markdown content directly
|
|
4
|
+
* from raw endpoints or the GitHub API — no HTML scraping needed.
|
|
5
|
+
*
|
|
6
|
+
* Supported patterns:
|
|
7
|
+
* - Repos: github.com/owner/repo → raw README.md
|
|
8
|
+
* - Files: github.com/owner/repo/blob/… → raw file content
|
|
9
|
+
* - Gists: gist.github.com/owner/id → raw gist content
|
|
10
|
+
* - Issues: github.com/owner/repo/issues/N → API (title + body)
|
|
11
|
+
* - PRs: github.com/owner/repo/pull/N → API (title + body)
|
|
12
|
+
*/
|
|
13
|
+
export declare class GitHubConverter implements Converter {
|
|
14
|
+
name: string;
|
|
15
|
+
accepts(streamInfo: StreamInfo): boolean;
|
|
16
|
+
convertUrl(url: string, _options?: MarkitOptions): Promise<ConversionResult>;
|
|
17
|
+
convert(_input: Buffer, streamInfo: StreamInfo): Promise<ConversionResult>;
|
|
18
|
+
}
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
const GITHUB_HOSTS = new Set([
|
|
2
|
+
"github.com",
|
|
3
|
+
"www.github.com",
|
|
4
|
+
"gist.github.com",
|
|
5
|
+
]);
|
|
6
|
+
/**
|
|
7
|
+
* Matches GitHub URLs and fetches clean markdown content directly
|
|
8
|
+
* from raw endpoints or the GitHub API — no HTML scraping needed.
|
|
9
|
+
*
|
|
10
|
+
* Supported patterns:
|
|
11
|
+
* - Repos: github.com/owner/repo → raw README.md
|
|
12
|
+
* - Files: github.com/owner/repo/blob/… → raw file content
|
|
13
|
+
* - Gists: gist.github.com/owner/id → raw gist content
|
|
14
|
+
* - Issues: github.com/owner/repo/issues/N → API (title + body)
|
|
15
|
+
* - PRs: github.com/owner/repo/pull/N → API (title + body)
|
|
16
|
+
*/
|
|
17
|
+
export class GitHubConverter {
|
|
18
|
+
name = "github";
|
|
19
|
+
accepts(streamInfo) {
|
|
20
|
+
if (!streamInfo.url)
|
|
21
|
+
return false;
|
|
22
|
+
try {
|
|
23
|
+
const { hostname } = new URL(streamInfo.url);
|
|
24
|
+
return GITHUB_HOSTS.has(hostname);
|
|
25
|
+
}
|
|
26
|
+
catch {
|
|
27
|
+
return false;
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
async convertUrl(url, _options) {
|
|
31
|
+
const parsed = new URL(url);
|
|
32
|
+
if (parsed.hostname === "gist.github.com") {
|
|
33
|
+
return fetchGist(parsed);
|
|
34
|
+
}
|
|
35
|
+
const segments = parsed.pathname.split("/").filter(Boolean);
|
|
36
|
+
// Need at least owner/repo
|
|
37
|
+
if (segments.length < 2) {
|
|
38
|
+
throw new Error(`Unsupported GitHub URL: ${url}`);
|
|
39
|
+
}
|
|
40
|
+
const [owner, repo, type, ...rest] = segments;
|
|
41
|
+
// github.com/owner/repo/blob/ref/path → raw file
|
|
42
|
+
if (type === "blob" && rest.length >= 2) {
|
|
43
|
+
const ref = rest[0];
|
|
44
|
+
const filePath = rest.slice(1).join("/");
|
|
45
|
+
return fetchRawFile(owner, repo, ref, filePath);
|
|
46
|
+
}
|
|
47
|
+
// github.com/owner/repo/issues/N or /pull/N
|
|
48
|
+
if ((type === "issues" || type === "pull") && rest[0]) {
|
|
49
|
+
const number = Number.parseInt(rest[0], 10);
|
|
50
|
+
if (!Number.isNaN(number)) {
|
|
51
|
+
return fetchIssueOrPr(owner, repo, number);
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
// github.com/owner/repo (no subpath or tree/wiki/etc) → README
|
|
55
|
+
if (!type) {
|
|
56
|
+
return fetchReadme(owner, repo);
|
|
57
|
+
}
|
|
58
|
+
throw new Error(`Unsupported GitHub URL pattern: ${url}`);
|
|
59
|
+
}
|
|
60
|
+
async convert(_input, streamInfo) {
|
|
61
|
+
// GitHub URLs are handled entirely via convertUrl.
|
|
62
|
+
// If we end up here, the URL was already fetched by the default path —
|
|
63
|
+
// just delegate to convertUrl.
|
|
64
|
+
if (streamInfo.url) {
|
|
65
|
+
return this.convertUrl(streamInfo.url);
|
|
66
|
+
}
|
|
67
|
+
throw new Error("GitHub converter requires a URL");
|
|
68
|
+
}
|
|
69
|
+
}
|
|
70
|
+
// ---------------------------------------------------------------------------
|
|
71
|
+
// Fetchers
|
|
72
|
+
// ---------------------------------------------------------------------------
|
|
73
|
+
async function fetchReadme(owner, repo) {
|
|
74
|
+
const url = `https://raw.githubusercontent.com/${owner}/${repo}/HEAD/README.md`;
|
|
75
|
+
const res = await fetch(url);
|
|
76
|
+
if (!res.ok) {
|
|
77
|
+
throw new Error(`Failed to fetch README: ${res.status} ${res.statusText}`);
|
|
78
|
+
}
|
|
79
|
+
const markdown = (await res.text()).trim();
|
|
80
|
+
const title = extractFirstHeading(markdown) ?? `${owner}/${repo}`;
|
|
81
|
+
return { markdown, title };
|
|
82
|
+
}
|
|
83
|
+
async function fetchRawFile(owner, repo, ref, filePath) {
|
|
84
|
+
const url = `https://raw.githubusercontent.com/${owner}/${repo}/${ref}/${filePath}`;
|
|
85
|
+
const res = await fetch(url);
|
|
86
|
+
if (!res.ok) {
|
|
87
|
+
throw new Error(`Failed to fetch file: ${res.status} ${res.statusText}`);
|
|
88
|
+
}
|
|
89
|
+
const content = (await res.text()).trim();
|
|
90
|
+
const filename = filePath.split("/").pop() ?? filePath;
|
|
91
|
+
// If it's markdown, return as-is. Otherwise wrap in a code block.
|
|
92
|
+
if (filePath.endsWith(".md") || filePath.endsWith(".mdx")) {
|
|
93
|
+
const title = extractFirstHeading(content) ?? filename;
|
|
94
|
+
return { markdown: content, title };
|
|
95
|
+
}
|
|
96
|
+
const ext = filename.includes(".") ? filename.split(".").pop() : "";
|
|
97
|
+
const markdown = `# ${filename}\n\n\`\`\`${ext}\n${content}\n\`\`\``;
|
|
98
|
+
return { markdown, title: filename };
|
|
99
|
+
}
|
|
100
|
+
async function fetchGist(parsed) {
|
|
101
|
+
const segments = parsed.pathname.split("/").filter(Boolean);
|
|
102
|
+
// gist.github.com/owner/id
|
|
103
|
+
const [owner, id] = segments;
|
|
104
|
+
if (!owner || !id) {
|
|
105
|
+
throw new Error(`Unsupported gist URL: ${parsed.href}`);
|
|
106
|
+
}
|
|
107
|
+
const url = `https://gist.githubusercontent.com/${owner}/${id}/raw`;
|
|
108
|
+
const res = await fetch(url);
|
|
109
|
+
if (!res.ok) {
|
|
110
|
+
throw new Error(`Failed to fetch gist: ${res.status} ${res.statusText}`);
|
|
111
|
+
}
|
|
112
|
+
const content = (await res.text()).trim();
|
|
113
|
+
const title = `gist:${id}`;
|
|
114
|
+
return { markdown: content, title };
|
|
115
|
+
}
|
|
116
|
+
async function fetchIssueOrPr(owner, repo, number) {
|
|
117
|
+
const url = `https://api.github.com/repos/${owner}/${repo}/issues/${number}`;
|
|
118
|
+
const res = await fetch(url, {
|
|
119
|
+
headers: { Accept: "application/vnd.github.v3+json" },
|
|
120
|
+
});
|
|
121
|
+
if (!res.ok) {
|
|
122
|
+
throw new Error(`Failed to fetch issue/PR: ${res.status} ${res.statusText}`);
|
|
123
|
+
}
|
|
124
|
+
const data = (await res.json());
|
|
125
|
+
const title = data.title ?? `#${number}`;
|
|
126
|
+
const parts = [`# ${title}`];
|
|
127
|
+
// Metadata line
|
|
128
|
+
const meta = [];
|
|
129
|
+
if (data.user?.login)
|
|
130
|
+
meta.push(`@${data.user.login}`);
|
|
131
|
+
if (data.state)
|
|
132
|
+
meta.push(data.state);
|
|
133
|
+
if (data.labels?.length) {
|
|
134
|
+
meta.push(data.labels.map((l) => l.name).join(", "));
|
|
135
|
+
}
|
|
136
|
+
if (meta.length > 0)
|
|
137
|
+
parts.push(meta.join(" · "));
|
|
138
|
+
if (data.body?.trim())
|
|
139
|
+
parts.push(data.body.trim());
|
|
140
|
+
return { markdown: parts.join("\n\n"), title };
|
|
141
|
+
}
|
|
142
|
+
// ---------------------------------------------------------------------------
|
|
143
|
+
// Helpers
|
|
144
|
+
// ---------------------------------------------------------------------------
|
|
145
|
+
function extractFirstHeading(markdown) {
|
|
146
|
+
const match = markdown.match(/^#\s+(.+)$/m);
|
|
147
|
+
return match?.[1]?.trim();
|
|
148
|
+
}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Multi-column layout detection and text box reordering.
|
|
3
|
+
*
|
|
4
|
+
* Many PDFs (legal documents, datasheets, academic papers) use two-column
|
|
5
|
+
* layouts. Without column detection, text boxes are ordered by Y position
|
|
6
|
+
* only, interleaving left and right column content.
|
|
7
|
+
*
|
|
8
|
+
* Algorithm:
|
|
9
|
+
* 1. Collect left edges of all text boxes on the page
|
|
10
|
+
* 2. Find the largest horizontal gap between consecutive left edges
|
|
11
|
+
* 3. If gap > MIN_GAP_RATIO of the text width and both sides have
|
|
12
|
+
* enough boxes → multi-column detected
|
|
13
|
+
* 4. Assign each text box to a column based on its center X
|
|
14
|
+
* 5. Return columns in reading order (left-to-right, top-to-bottom)
|
|
15
|
+
*
|
|
16
|
+
* This only detects the column structure. The caller is responsible for
|
|
17
|
+
* processing each column's text boxes independently (table detection,
|
|
18
|
+
* rendering, etc.).
|
|
19
|
+
*/
|
|
20
|
+
import type { TextBox } from "./types.js";
|
|
21
|
+
export interface ColumnLayout {
|
|
22
|
+
/** Number of columns detected (1 = single column, 2+ = multi-column). */
|
|
23
|
+
columnCount: number;
|
|
24
|
+
/** Text boxes grouped by column, in reading order (left to right). */
|
|
25
|
+
columns: TextBox[][];
|
|
26
|
+
/** X positions of column boundaries (between columns). */
|
|
27
|
+
boundaries: number[];
|
|
28
|
+
}
|
|
29
|
+
/**
|
|
30
|
+
* Detect column layout and return text boxes grouped by column.
|
|
31
|
+
*
|
|
32
|
+
* For single-column pages, returns all boxes in one group.
|
|
33
|
+
* For multi-column pages, returns boxes split by column in reading order.
|
|
34
|
+
*/
|
|
35
|
+
export declare function detectColumns(textBoxes: TextBox[]): ColumnLayout;
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Multi-column layout detection and text box reordering.
|
|
3
|
+
*
|
|
4
|
+
* Many PDFs (legal documents, datasheets, academic papers) use two-column
|
|
5
|
+
* layouts. Without column detection, text boxes are ordered by Y position
|
|
6
|
+
* only, interleaving left and right column content.
|
|
7
|
+
*
|
|
8
|
+
* Algorithm:
|
|
9
|
+
* 1. Collect left edges of all text boxes on the page
|
|
10
|
+
* 2. Find the largest horizontal gap between consecutive left edges
|
|
11
|
+
* 3. If gap > MIN_GAP_RATIO of the text width and both sides have
|
|
12
|
+
* enough boxes → multi-column detected
|
|
13
|
+
* 4. Assign each text box to a column based on its center X
|
|
14
|
+
* 5. Return columns in reading order (left-to-right, top-to-bottom)
|
|
15
|
+
*
|
|
16
|
+
* This only detects the column structure. The caller is responsible for
|
|
17
|
+
* processing each column's text boxes independently (table detection,
|
|
18
|
+
* rendering, etc.).
|
|
19
|
+
*/
|
|
20
|
+
/**
|
|
21
|
+
* Minimum gap as a fraction of the total text width to consider a column
|
|
22
|
+
* boundary. A two-column layout typically has ~50% gap; we use a lower
|
|
23
|
+
* threshold to catch asymmetric columns.
|
|
24
|
+
*/
|
|
25
|
+
const MIN_GAP_RATIO = 0.15;
|
|
26
|
+
/** Minimum number of text boxes on each side of the gap. */
|
|
27
|
+
const MIN_BOXES_PER_COLUMN = 4;
|
|
28
|
+
/** Minimum gap in absolute points to avoid splitting on small whitespace. */
|
|
29
|
+
const MIN_GAP_PTS = 40;
|
|
30
|
+
/**
|
|
31
|
+
* Detect column layout and return text boxes grouped by column.
|
|
32
|
+
*
|
|
33
|
+
* For single-column pages, returns all boxes in one group.
|
|
34
|
+
* For multi-column pages, returns boxes split by column in reading order.
|
|
35
|
+
*/
|
|
36
|
+
export function detectColumns(textBoxes) {
|
|
37
|
+
if (textBoxes.length < MIN_BOXES_PER_COLUMN * 2) {
|
|
38
|
+
return { columnCount: 1, columns: [textBoxes], boundaries: [] };
|
|
39
|
+
}
|
|
40
|
+
// Collect unique left edges (rounded to avoid float noise)
|
|
41
|
+
const lefts = [
|
|
42
|
+
...new Set(textBoxes.map((tb) => Math.round(tb.bounds.left))),
|
|
43
|
+
].sort((a, b) => a - b);
|
|
44
|
+
if (lefts.length < 2) {
|
|
45
|
+
return { columnCount: 1, columns: [textBoxes], boundaries: [] };
|
|
46
|
+
}
|
|
47
|
+
const textXMin = lefts[0];
|
|
48
|
+
const textXMax = Math.max(...textBoxes.map((tb) => Math.round(tb.bounds.right)));
|
|
49
|
+
const textWidth = textXMax - textXMin;
|
|
50
|
+
if (textWidth <= 0) {
|
|
51
|
+
return { columnCount: 1, columns: [textBoxes], boundaries: [] };
|
|
52
|
+
}
|
|
53
|
+
// Find the largest gap between consecutive left-edge positions
|
|
54
|
+
let maxGap = 0;
|
|
55
|
+
let gapLeft = 0;
|
|
56
|
+
let gapRight = 0;
|
|
57
|
+
for (let i = 1; i < lefts.length; i++) {
|
|
58
|
+
const gap = lefts[i] - lefts[i - 1];
|
|
59
|
+
if (gap > maxGap) {
|
|
60
|
+
maxGap = gap;
|
|
61
|
+
gapLeft = lefts[i - 1];
|
|
62
|
+
gapRight = lefts[i];
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
const gapRatio = maxGap / textWidth;
|
|
66
|
+
if (gapRatio < MIN_GAP_RATIO || maxGap < MIN_GAP_PTS) {
|
|
67
|
+
return { columnCount: 1, columns: [textBoxes], boundaries: [] };
|
|
68
|
+
}
|
|
69
|
+
// Split point is the midpoint of the gap
|
|
70
|
+
const splitX = (gapLeft + gapRight) / 2;
|
|
71
|
+
// Assign boxes to columns based on center X
|
|
72
|
+
const leftCol = [];
|
|
73
|
+
const rightCol = [];
|
|
74
|
+
for (const tb of textBoxes) {
|
|
75
|
+
const cx = (tb.bounds.left + tb.bounds.right) / 2;
|
|
76
|
+
if (cx < splitX) {
|
|
77
|
+
leftCol.push(tb);
|
|
78
|
+
}
|
|
79
|
+
else {
|
|
80
|
+
rightCol.push(tb);
|
|
81
|
+
}
|
|
82
|
+
}
|
|
83
|
+
// Validate both columns have enough content
|
|
84
|
+
if (leftCol.length < MIN_BOXES_PER_COLUMN ||
|
|
85
|
+
rightCol.length < MIN_BOXES_PER_COLUMN) {
|
|
86
|
+
return { columnCount: 1, columns: [textBoxes], boundaries: [] };
|
|
87
|
+
}
|
|
88
|
+
return {
|
|
89
|
+
columnCount: 2,
|
|
90
|
+
columns: [leftCol, rightCol],
|
|
91
|
+
boundaries: [splitX],
|
|
92
|
+
};
|
|
93
|
+
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PDF content extraction using mupdf.
|
|
3
|
+
*
|
|
4
|
+
* Extracts text boxes (with position, font size, bold) and vector line
|
|
5
|
+
* segments (table borders) from each page. Uses mupdf's native WASM
|
|
6
|
+
* engine for fast parsing, and reads raw content streams for vector graphics.
|
|
7
|
+
*
|
|
8
|
+
* Coordinate system: PDF native (origin = bottom-left, Y increases upward).
|
|
9
|
+
*/
|
|
10
|
+
import type { ImageRegion, PageContent } from "./types.js";
|
|
11
|
+
/**
|
|
12
|
+
* Render an image region from a PDF page as a PNG buffer.
|
|
13
|
+
* Uses mupdf's DrawDevice to render just the cropped area at 2x resolution.
|
|
14
|
+
*/
|
|
15
|
+
export declare function renderImageRegion(input: Uint8Array, region: ImageRegion): Uint8Array;
|
|
16
|
+
/**
|
|
17
|
+
* Extract text boxes and vector segments from all pages of a PDF buffer.
|
|
18
|
+
*/
|
|
19
|
+
export declare function extractPages(input: Uint8Array): Promise<PageContent[]>;
|