markit-ai 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +333 -0
- package/dist/commands/config.d.ts +4 -0
- package/dist/commands/config.js +133 -0
- package/dist/commands/convert.d.ts +5 -0
- package/dist/commands/convert.js +110 -0
- package/dist/commands/formats.d.ts +2 -0
- package/dist/commands/formats.js +56 -0
- package/dist/commands/init.d.ts +2 -0
- package/dist/commands/init.js +29 -0
- package/dist/commands/onboard.d.ts +2 -0
- package/dist/commands/onboard.js +61 -0
- package/dist/commands/plugin.d.ts +4 -0
- package/dist/commands/plugin.js +58 -0
- package/dist/config.d.ts +26 -0
- package/dist/config.js +42 -0
- package/dist/converters/audio.d.ts +7 -0
- package/dist/converters/audio.js +87 -0
- package/dist/converters/csv.d.ts +7 -0
- package/dist/converters/csv.js +83 -0
- package/dist/converters/docx.d.ts +6 -0
- package/dist/converters/docx.js +28 -0
- package/dist/converters/epub.d.ts +8 -0
- package/dist/converters/epub.js +110 -0
- package/dist/converters/html.d.ts +6 -0
- package/dist/converters/html.js +33 -0
- package/dist/converters/image.d.ts +6 -0
- package/dist/converters/image.js +94 -0
- package/dist/converters/ipynb.d.ts +6 -0
- package/dist/converters/ipynb.js +72 -0
- package/dist/converters/json.d.ts +6 -0
- package/dist/converters/json.js +21 -0
- package/dist/converters/pdf.d.ts +6 -0
- package/dist/converters/pdf.js +29 -0
- package/dist/converters/plain-text.d.ts +6 -0
- package/dist/converters/plain-text.js +41 -0
- package/dist/converters/pptx.d.ts +8 -0
- package/dist/converters/pptx.js +189 -0
- package/dist/converters/rss.d.ts +11 -0
- package/dist/converters/rss.js +134 -0
- package/dist/converters/wikipedia.d.ts +6 -0
- package/dist/converters/wikipedia.js +35 -0
- package/dist/converters/xlsx.d.ts +8 -0
- package/dist/converters/xlsx.js +139 -0
- package/dist/converters/xml.d.ts +6 -0
- package/dist/converters/xml.js +17 -0
- package/dist/converters/yaml.d.ts +6 -0
- package/dist/converters/yaml.js +16 -0
- package/dist/converters/zip.d.ts +8 -0
- package/dist/converters/zip.js +56 -0
- package/dist/index.d.ts +28 -0
- package/dist/index.js +24 -0
- package/dist/llm.d.ts +10 -0
- package/dist/llm.js +139 -0
- package/dist/main.d.ts +2 -0
- package/dist/main.js +182 -0
- package/dist/markit.d.ts +19 -0
- package/dist/markit.js +124 -0
- package/dist/mill.d.ts +18 -0
- package/dist/mill.js +123 -0
- package/dist/plugins/api.d.ts +7 -0
- package/dist/plugins/api.js +44 -0
- package/dist/plugins/index.d.ts +4 -0
- package/dist/plugins/index.js +3 -0
- package/dist/plugins/installer.d.ts +25 -0
- package/dist/plugins/installer.js +176 -0
- package/dist/plugins/loader.d.ts +6 -0
- package/dist/plugins/loader.js +61 -0
- package/dist/plugins/types.d.ts +25 -0
- package/dist/plugins/types.js +1 -0
- package/dist/providers/anthropic.d.ts +2 -0
- package/dist/providers/anthropic.js +47 -0
- package/dist/providers/index.d.ts +21 -0
- package/dist/providers/index.js +58 -0
- package/dist/providers/openai.d.ts +2 -0
- package/dist/providers/openai.js +65 -0
- package/dist/providers/types.d.ts +26 -0
- package/dist/providers/types.js +1 -0
- package/dist/types.d.ts +28 -0
- package/dist/types.js +1 -0
- package/dist/utils/exit-codes.d.ts +4 -0
- package/dist/utils/exit-codes.js +4 -0
- package/dist/utils/output.d.ts +22 -0
- package/dist/utils/output.js +31 -0
- package/package.json +70 -0
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import { existsSync, mkdirSync, writeFileSync } from "node:fs";
|
|
2
|
+
import { join } from "node:path";
|
|
3
|
+
import { output, success, hint, cmd } from "../utils/output.js";
|
|
4
|
+
const DATA_DIR = ".markit";
|
|
5
|
+
/**
 * Scaffold a `.markit/` directory inside the current working directory.
 *
 * When the directory is already present nothing is written and an
 * "already_exists" result is reported. Otherwise the directory is created
 * with a `config.json` containing an empty `llm` section, and (in human
 * mode) hints for configuring an API key are printed.
 *
 * @param _args   Unused.
 * @param options Forwarded to `output`, which picks the JSON or human renderer.
 */
export async function init(_args, options) {
    const root = join(process.cwd(), DATA_DIR);

    if (!existsSync(root)) {
        mkdirSync(root, { recursive: true });

        const initialConfig = { llm: {} };
        const serialized = `${JSON.stringify(initialConfig, null, 2)}\n`;
        writeFileSync(join(root, "config.json"), serialized);

        const printNextSteps = () => {
            success(`Created .markit/ in ${process.cwd()}`);
            hint("Set your API key for image/audio AI features:");
            console.log(` ${cmd("export OPENAI_API_KEY=sk-...")}`);
            hint("Or configure directly:");
            console.log(` ${cmd("markit config set llm.apiKey sk-...")}`);
        };
        output(options, {
            json: () => ({ success: true, path: root }),
            human: printNextSteps,
        });
        return;
    }

    // Directory already present: report it and leave its contents untouched.
    output(options, {
        json: () => ({ success: true, path: root, message: "already_exists" }),
        human: () => success(`.markit/ already exists`),
    });
}
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import { existsSync, readFileSync, writeFileSync } from "node:fs";
|
|
2
|
+
import { join } from "node:path";
|
|
3
|
+
import { output, success } from "../utils/output.js";
|
|
4
|
+
/**
 * Instruction snippet that `onboard` writes into the agent instructions file.
 *
 * The snippet is wrapped in a `<mill>` ... `</mill>` element. The opening tag
 * is also the idempotency marker (MARKER below), so the wrapper tags must stay
 * balanced and in sync with it.
 */
const INSTRUCTIONS = `
<mill>
Use \`markit\` to convert files and URLs to markdown. Supports PDF, DOCX, HTML, XLSX, CSV, JSON, and plain text.

<commands>
- \`markit <file-or-url>\` - convert to markdown (stdout)
- \`markit <file-or-url> -o output.md\` - convert to file
- \`markit formats\` - list supported formats
</commands>

<rules>
- Use \`--json\` flag to get structured output for parsing
- Use \`-q\` to get raw markdown without formatting
- Pipe output directly: \`markit report.pdf | other-tool\`
</rules>
</mill>
`.trim();
/** Substring whose presence in the target file means the snippet was already added. */
const MARKER = "<mill>";
|
|
22
|
+
/**
 * Add the markit usage instructions to the project's agent instructions file.
 *
 * Target selection: an existing `CLAUDE.md` wins, then an existing
 * `AGENTS.md`; when neither exists `CLAUDE.md` is created. If the target
 * already contains MARKER the file is left untouched and
 * "already_onboarded" is reported. Otherwise the instructions are appended
 * after the existing content (separated by a blank line) or written as the
 * whole file when there is no content yet.
 *
 * @param _args   Unused.
 * @param options Forwarded to `output`, which picks the JSON or human renderer.
 */
export async function onboard(_args, options) {
    const cwd = process.cwd();
    const candidates = [join(cwd, "CLAUDE.md"), join(cwd, "AGENTS.md")];
    // First candidate that exists; default to CLAUDE.md when none does.
    const targetFile = candidates.find((candidate) => existsSync(candidate)) ?? candidates[0];

    const existingContent = existsSync(targetFile)
        ? readFileSync(targetFile, "utf-8")
        : "";

    if (existingContent.includes(MARKER)) {
        output(options, {
            json: () => ({
                success: true,
                file: targetFile,
                message: "already_onboarded",
            }),
            human: () => success(`Already onboarded (${targetFile})`),
        });
        return;
    }

    // Keep prior content, normalising its tail to exactly one blank line.
    const prefix = existingContent ? `${existingContent.trimEnd()}\n\n` : "";
    writeFileSync(targetFile, `${prefix}${INSTRUCTIONS}\n`);

    output(options, {
        json: () => ({ success: true, file: targetFile }),
        human: () => success(`Added markit instructions to ${targetFile}`),
    });
}
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
import type { OutputOptions } from "../utils/output.js";
|
|
2
|
+
/** Installs a plugin from `source` and reports the result; on failure the error is reported and the process exits with the error code. */
export declare function pluginInstall(source: string, options: OutputOptions): Promise<void>;
|
|
3
|
+
/** Removes the plugin called `name`; when nothing was removed a "not found" error is reported and the process exits with the error code. */
export declare function pluginRemove(name: string, options: OutputOptions): Promise<void>;
|
|
4
|
+
/** Prints the installed plugins (name, type, source), or a notice when none are installed. */
export declare function pluginList(options: OutputOptions): Promise<void>;
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import { installPlugin, removePlugin, listInstalled } from "../plugins/installer.js";
|
|
2
|
+
import { output, success, error, dim, bold } from "../utils/output.js";
|
|
3
|
+
import { EXIT_ERROR } from "../utils/exit-codes.js";
|
|
4
|
+
/**
 * Install a plugin and report the outcome.
 *
 * On success the installer's result is emitted (JSON mode) or its name and
 * path are printed (human mode). Any error raised while installing or
 * reporting is turned into an error report, after which the process exits
 * with EXIT_ERROR.
 *
 * @param source  Plugin source, passed to `installPlugin` unchanged.
 * @param options Forwarded to `output`, which picks the JSON or human renderer.
 */
export async function pluginInstall(source, options) {
    try {
        const installed = await installPlugin(source);
        const printSummary = () => {
            success(`Installed ${installed.name}`);
            console.log(dim(` ${installed.path}`));
        };
        output(options, {
            json: () => ({ success: true, ...installed }),
            human: printSummary,
        });
    }
    catch (err) {
        // Non-Error throwables are stringified so a message is always available.
        let msg;
        if (err instanceof Error) {
            msg = err.message;
        }
        else {
            msg = String(err);
        }
        output(options, {
            json: () => ({ success: false, error: msg }),
            human: () => error(msg),
        });
        process.exit(EXIT_ERROR);
    }
}
|
|
24
|
+
/**
 * Remove an installed plugin and report the outcome.
 *
 * The JSON result carries `success` (whether anything was removed) and the
 * requested `name`. When nothing was removed the process exits with
 * EXIT_ERROR after reporting.
 *
 * @param name    Plugin name, passed to `removePlugin` unchanged.
 * @param options Forwarded to `output`, which picks the JSON or human renderer.
 */
export async function pluginRemove(name, options) {
    const removed = removePlugin(name);

    const printOutcome = () => {
        if (!removed) {
            error(`Plugin '${name}' not found`);
            return;
        }
        success(`Removed ${name}`);
    };
    output(options, {
        json: () => ({ success: removed, name }),
        human: printOutcome,
    });

    if (removed) {
        return;
    }
    process.exit(EXIT_ERROR);
}
|
|
40
|
+
/**
 * List the installed plugins.
 *
 * JSON mode emits `{ plugins }`. Human mode prints a heading followed by one
 * line per plugin (name padded to 20 characters, then type and source), or a
 * short notice when the list is empty.
 *
 * @param options Forwarded to `output`, which picks the JSON or human renderer.
 */
export async function pluginList(options) {
    const plugins = listInstalled();

    const printTable = () => {
        if (plugins.length === 0) {
            console.log(dim(" No plugins installed"));
            return;
        }
        console.log();
        console.log(bold("Installed plugins"));
        console.log();
        plugins.forEach((plugin) => {
            const label = plugin.name.padEnd(20);
            console.log(` ${label} ${dim(plugin.type)} ${dim(plugin.source)}`);
        });
        console.log();
    };

    output(options, {
        json: () => ({ plugins }),
        human: printTable,
    });
}
|
package/dist/config.d.ts
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
 * Shape of the configuration read from and written to `.markit/config.json`
 * by `loadConfig` and `saveConfig`.
 */
export interface MarkitConfig {
|
|
2
|
+
/** Optional LLM settings; every field is optional. */
llm?: {
|
|
3
|
+
/** Provider name: "openai" (default), "anthropic", or any registered provider */
|
|
4
|
+
provider?: string;
|
|
5
|
+
/** API base URL (overrides provider default) */
|
|
6
|
+
apiBase?: string;
|
|
7
|
+
/** API key — prefer env vars over storing here */
|
|
8
|
+
apiKey?: string;
|
|
9
|
+
/** Model override (overrides provider default) */
|
|
10
|
+
model?: string;
|
|
11
|
+
/** Transcription model override */
|
|
12
|
+
transcriptionModel?: string;
|
|
13
|
+
};
|
|
14
|
+
}
|
|
15
|
+
/**
|
|
16
|
+
* Walk up from cwd to find the nearest .markit/ directory. Returns null when none exists up to the filesystem root.
|
|
17
|
+
*/
|
|
18
|
+
export declare function findConfigDir(): string | null;
|
|
19
|
+
/**
|
|
20
|
+
* Load config from .markit/config.json. Returns an empty object when no .markit/ directory or config file is found.
|
|
21
|
+
*/
|
|
22
|
+
export declare function loadConfig(): MarkitConfig;
|
|
23
|
+
/**
|
|
24
|
+
* Save config to .markit/config.json in the nearest .markit/ directory. Creates .markit/ in cwd when none is found.
|
|
25
|
+
*/
|
|
26
|
+
export declare function saveConfig(config: MarkitConfig): void;
|
package/dist/config.js
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
|
|
2
|
+
import { join } from "node:path";
|
|
3
|
+
const DATA_DIR = ".markit";
|
|
4
|
+
const CONFIG_FILE = "config.json";
|
|
5
|
+
/**
 * Locate the nearest `.markit/` directory, starting at the current working
 * directory and climbing one parent at a time.
 *
 * @returns Path of the first `.markit/` found, or `null` once the parent of
 *          a directory is the directory itself (the filesystem root) without
 *          a match.
 */
export function findConfigDir() {
    let current = process.cwd();
    let previous;
    do {
        const candidate = join(current, DATA_DIR);
        if (existsSync(candidate)) {
            return candidate;
        }
        previous = current;
        current = join(current, "..");
    } while (current !== previous);
    return null;
}
|
|
20
|
+
/**
 * Load config from `.markit/config.json`.
 *
 * @returns `{}` when no `.markit/` directory or config file exists, or when
 *          the file holds valid JSON that is not an object (for example
 *          `null`). Otherwise a shallow copy of the stored object whose `llm`
 *          property is always an object (copied, so callers may mutate it).
 * @throws {SyntaxError} When the file is not valid JSON; the message names
 *         the offending file.
 */
export function loadConfig() {
    const configDir = findConfigDir();
    if (!configDir)
        return {};
    const configFile = join(configDir, CONFIG_FILE);
    if (!existsSync(configFile))
        return {};
    let raw;
    try {
        raw = JSON.parse(readFileSync(configFile, "utf-8"));
    }
    catch (err) {
        if (err instanceof SyntaxError) {
            // Same error type as before, but tell the user which file is broken.
            throw new SyntaxError(`Invalid JSON in ${configFile}: ${err.message}`);
        }
        throw err;
    }
    // `null`, primitives and arrays are valid JSON but not a config object;
    // treat them like a missing config instead of crashing on `raw.llm`.
    if (raw === null || typeof raw !== "object" || Array.isArray(raw))
        return {};
    const llm = raw.llm !== null && typeof raw.llm === "object" && !Array.isArray(raw.llm)
        ? raw.llm
        : {};
    return { ...raw, llm: { ...llm } };
}
|
|
33
|
+
/**
 * Write `config` to `config.json` as two-space-indented JSON with a trailing
 * newline.
 *
 * The file goes into the `.markit/` directory returned by `findConfigDir`;
 * when there is none, `.markit/` under the current working directory is used
 * and created if missing.
 *
 * @param config Configuration object to serialise.
 */
export function saveConfig(config) {
    const dir = findConfigDir() || join(process.cwd(), DATA_DIR);
    if (!existsSync(dir)) {
        mkdirSync(dir, { recursive: true });
    }
    const target = join(dir, CONFIG_FILE);
    const payload = `${JSON.stringify(config, null, 2)}\n`;
    writeFileSync(target, payload);
}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
import type { Converter, ConversionResult, StreamInfo, MarkitOptions } from "../types.js";
|
|
2
|
+
/**
 * Converter for audio input. Produces markdown with tag metadata, embedded
 * lyrics and, when `options.transcribe` is supplied, a transcript.
 */
export declare class AudioConverter implements Converter {
|
|
3
|
+
/** Converter identifier ("audio"). */
name: string;
|
|
4
|
+
/** True when the extension or MIME type of `streamInfo` is a supported audio type. */
accepts(streamInfo: StreamInfo): boolean;
|
|
5
|
+
/** Builds the markdown; falls back to a one-line placeholder when nothing could be extracted. */
convert(input: Buffer, streamInfo: StreamInfo, options?: MarkitOptions): Promise<ConversionResult>;
|
|
6
|
+
/** Formats a duration in seconds as `m:ss` or `h:mm:ss`. */
private formatDuration;
|
|
7
|
+
}
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
/** File extensions (leading dot included) that mark an input as audio. */
const EXTENSIONS = [".mp3", ".wav", ".m4a", ".mp4", ".ogg", ".flac", ".aac", ".wma"];
/** MIME type prefixes that mark an input as audio. */
const MIMETYPES = ["audio/", "video/mp4"];
/**
 * Converts audio input to markdown: a "Metadata" section read from the
 * file's tags, embedded lyrics when present, and an optional transcript.
 */
export class AudioConverter {
    name = "audio";
    /**
     * @param streamInfo Descriptor with optional `extension` and `mimetype`.
     * @returns `true` when the extension is listed in EXTENSIONS or the MIME
     *          type starts with one of the MIMETYPES prefixes.
     */
    accepts(streamInfo) {
        if (streamInfo.extension && EXTENSIONS.includes(streamInfo.extension))
            return true;
        if (streamInfo.mimetype && MIMETYPES.some((m) => streamInfo.mimetype.startsWith(m)))
            return true;
        return false;
    }
    /**
     * Build markdown for an audio buffer.
     *
     * Both stages are best-effort: a failure while reading metadata or while
     * transcribing is swallowed so the other stage can still contribute.
     *
     * @param input      Raw audio bytes.
     * @param streamInfo Descriptor; `mimetype`, `extension` and `filename` are read.
     * @param options    Optional; when `options.transcribe` is set it is called
     *                   with the bytes and a MIME type and its result becomes
     *                   the "Transcript" section.
     * @returns `{ markdown }`; a `*[audio: <filename>]*` placeholder when no
     *          section could be produced.
     */
    async convert(input, streamInfo, options) {
        const sections = [];
        // Extract audio metadata
        try {
            // Loaded lazily so the dependency is only needed for audio input.
            const mm = await import("music-metadata");
            const metadata = await mm.parseBuffer(new Uint8Array(input), {
                mimeType: streamInfo.mimetype,
                size: input.length,
            });
            const { common, format } = metadata;
            sections.push("## Metadata\n");
            const fields = {
                Title: common.title,
                Artist: common.artist,
                Album: common.album,
                Genre: common.genre?.join(", "),
                Track: common.track?.no ? `${common.track.no}${common.track.of ? ` of ${common.track.of}` : ""}` : undefined,
                Year: common.year ? String(common.year) : undefined,
                Duration: format.duration
                    ? this.formatDuration(format.duration)
                    : undefined,
                Format: format.codec || format.container,
                SampleRate: format.sampleRate ? `${format.sampleRate} Hz` : undefined,
                Channels: format.numberOfChannels
                    ? String(format.numberOfChannels)
                    : undefined,
                Bitrate: format.bitrate
                    ? `${Math.round(format.bitrate / 1000)} kbps`
                    : undefined,
            };
            // Only fields with a value are listed.
            for (const [key, value] of Object.entries(fields)) {
                if (value)
                    sections.push(`${key}: ${value}`);
            }
            if (common.lyrics?.length) {
                sections.push(`\n## Lyrics\n\n${common.lyrics.join("\n")}`);
            }
        }
        catch {
            // Metadata parsing failed — deliberately ignored so transcription can still run.
        }
        // AI transcription
        if (options?.transcribe) {
            try {
                const mimetype = streamInfo.mimetype || guessMimetype(streamInfo.extension);
                const transcript = await options.transcribe(input, mimetype);
                if (transcript) {
                    sections.push(`\n## Transcript\n\n${transcript}`);
                }
            }
            catch {
                // Transcription failed — deliberately ignored; metadata (if any) is still returned.
            }
        }
        if (sections.length === 0) {
            return { markdown: `*[audio: ${streamInfo.filename || "unknown"}]*` };
        }
        return { markdown: sections.join("\n").trim() };
    }
    /**
     * Format a duration as `m:ss`, or `h:mm:ss` from one hour upwards.
     *
     * The total is rounded to whole seconds before it is split into
     * components; rounding the seconds component on its own could yield
     * "0:60" (59.6 s) or "59:60" (3599.5 s).
     *
     * @param seconds Duration in seconds, possibly fractional.
     * @returns The formatted duration.
     */
    formatDuration(seconds) {
        const total = Math.round(seconds);
        const h = Math.floor(total / 3600);
        const m = Math.floor((total % 3600) / 60);
        const s = total % 60;
        if (h > 0)
            return `${h}:${String(m).padStart(2, "0")}:${String(s).padStart(2, "0")}`;
        return `${m}:${String(s).padStart(2, "0")}`;
    }
}
/**
 * Map a file extension (leading dot included) to a MIME type.
 *
 * @param ext Extension such as ".mp3"; may be undefined.
 * @returns The mapped MIME type, or "audio/mpeg" for unknown or missing extensions.
 */
function guessMimetype(ext) {
    const map = {
        ".mp3": "audio/mpeg", ".wav": "audio/wav", ".m4a": "audio/mp4",
        ".mp4": "video/mp4", ".ogg": "audio/ogg", ".flac": "audio/flac",
        ".aac": "audio/aac", ".wma": "audio/x-ms-wma",
    };
    return map[ext || ""] || "audio/mpeg";
}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
import type { Converter, ConversionResult, StreamInfo } from "../types.js";
|
|
2
|
+
/** Converter that renders CSV/TSV input as a markdown table, using the first row as the header. */
export declare class CsvConverter implements Converter {
|
|
3
|
+
/** Converter identifier ("csv"). */
name: string;
|
|
4
|
+
/** True when the extension or MIME type of `streamInfo` is a supported CSV/TSV type. */
accepts(streamInfo: StreamInfo): boolean;
|
|
5
|
+
/** Decodes `input` and returns the table; the markdown is empty when there are no non-empty rows. */
convert(input: Buffer, streamInfo: StreamInfo): Promise<ConversionResult>;
|
|
6
|
+
/** Splits delimited text into rows of trimmed cells, honouring double-quoted fields. */
private parseRows;
|
|
7
|
+
}
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
/** File extensions (leading dot included) handled by the CSV converter. */
const EXTENSIONS = [".csv", ".tsv"];
/** MIME type prefixes handled by the CSV converter. */
const MIMETYPES = ["text/csv", "text/tab-separated-values"];
/**
 * Decode bytes with the given charset label, falling back to UTF-8 when the
 * label is missing or not supported (TextDecoder throws RangeError for
 * unknown labels, which would otherwise abort the conversion).
 */
function decodeText(input, charset) {
    try {
        return new TextDecoder(charset || "utf-8").decode(input);
    }
    catch (err) {
        if (err instanceof RangeError)
            return new TextDecoder("utf-8").decode(input);
        throw err;
    }
}
/**
 * Make a cell safe inside a markdown table row: pipes are escaped and line
 * breaks (possible in quoted CSV fields) become `<br>`.
 */
function escapeCell(cell) {
    return cell.replace(/\|/g, "\\|").replace(/\r\n|\r|\n/g, "<br>");
}
/**
 * Converts CSV/TSV input into a markdown table whose header is the first
 * non-empty row.
 */
export class CsvConverter {
    name = "csv";
    /**
     * @param streamInfo Descriptor with optional `extension` and `mimetype`.
     * @returns `true` when the extension is listed in EXTENSIONS or the MIME
     *          type starts with one of the MIMETYPES entries.
     */
    accepts(streamInfo) {
        if (streamInfo.extension && EXTENSIONS.includes(streamInfo.extension)) {
            return true;
        }
        if (streamInfo.mimetype &&
            MIMETYPES.some((m) => streamInfo.mimetype.startsWith(m))) {
            return true;
        }
        return false;
    }
    /**
     * Render delimited text as a markdown table.
     *
     * The delimiter is a tab for `.tsv` files, and also for inputs whose MIME
     * type is tab-separated-values unless the extension says `.csv`;
     * otherwise it is a comma. Rows shorter than the header are padded with
     * empty cells.
     *
     * @param input      Raw bytes of the file.
     * @param streamInfo Descriptor; `charset`, `extension` and `mimetype` are read.
     * @returns `{ markdown }`, empty when the input has no non-empty rows.
     */
    async convert(input, streamInfo) {
        const text = decodeText(input, streamInfo.charset);
        const isTsv = streamInfo.extension === ".tsv" ||
            (streamInfo.extension !== ".csv" &&
                Boolean(streamInfo.mimetype?.startsWith("text/tab-separated-values")));
        const rows = this.parseRows(text, isTsv ? "\t" : ",");
        if (rows.length === 0) {
            return { markdown: "" };
        }
        const [header, ...body] = rows;
        const toLine = (cells) => `| ${cells.map((c) => escapeCell(c)).join(" | ")} |`;
        const lines = [
            toLine(header),
            `| ${header.map(() => "---").join(" | ")} |`,
        ];
        for (const row of body) {
            // Pad a copy so the parsed rows are not mutated.
            const missing = header.length - row.length;
            const padded = missing > 0 ? [...row, ...new Array(missing).fill("")] : row;
            lines.push(toLine(padded));
        }
        return { markdown: lines.join("\n") };
    }
    /**
     * Split delimited text into rows of trimmed cells.
     *
     * Double-quoted fields may contain the delimiter and line breaks; a
     * doubled quote inside a quoted field yields one literal quote. Rows end
     * at "\n" or "\r\n". Rows whose cells are all empty are dropped.
     *
     * @param text      Decoded file content.
     * @param delimiter Single-character field separator.
     * @returns Array of rows, each an array of cell strings.
     */
    parseRows(text, delimiter) {
        const rows = [];
        let current = [];
        let cell = "";
        let inQuotes = false;
        for (let i = 0; i < text.length; i++) {
            const ch = text[i];
            if (inQuotes) {
                if (ch === '"' && text[i + 1] === '"') {
                    // Escaped quote ("") inside a quoted field.
                    cell += '"';
                    i++;
                }
                else if (ch === '"') {
                    inQuotes = false;
                }
                else {
                    cell += ch;
                }
            }
            else if (ch === '"') {
                inQuotes = true;
            }
            else if (ch === delimiter) {
                current.push(cell.trim());
                cell = "";
            }
            else if (ch === "\n" || (ch === "\r" && text[i + 1] === "\n")) {
                current.push(cell.trim());
                if (current.some((c) => c.length > 0))
                    rows.push(current);
                current = [];
                cell = "";
                // Skip the "\n" of a "\r\n" pair.
                if (ch === "\r")
                    i++;
            }
            else {
                cell += ch;
            }
        }
        // Last row (input without a trailing newline)
        if (cell || current.length > 0) {
            current.push(cell.trim());
            if (current.some((c) => c.length > 0))
                rows.push(current);
        }
        return rows;
    }
}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import type { Converter, ConversionResult, StreamInfo } from "../types.js";
|
|
2
|
+
/** Converter for `.docx` documents; the document is converted to HTML and then to markdown. */
export declare class DocxConverter implements Converter {
|
|
3
|
+
/** Converter identifier ("docx"). */
name: string;
|
|
4
|
+
/** True when the extension or MIME type of `streamInfo` identifies a DOCX document. */
accepts(streamInfo: StreamInfo): boolean;
|
|
5
|
+
/** Returns the trimmed markdown for the document; `_streamInfo` is unused. */
convert(input: Buffer, _streamInfo: StreamInfo): Promise<ConversionResult>;
|
|
6
|
+
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import mammoth from "mammoth";
|
|
2
|
+
import TurndownService from "turndown";
|
|
3
|
+
/** File extensions (leading dot included) handled by the DOCX converter. */
const EXTENSIONS = [".docx"];
/** MIME type prefixes handled by the DOCX converter. */
const MIMETYPES = [
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
];
/**
 * Converts DOCX documents to markdown in two steps: mammoth renders the
 * document to HTML, then Turndown turns that HTML into markdown.
 */
export class DocxConverter {
    name = "docx";
    /**
     * @param streamInfo Descriptor with optional `extension` and `mimetype`.
     * @returns `true` when the extension is listed in EXTENSIONS or the MIME
     *          type starts with one of the MIMETYPES entries.
     */
    accepts(streamInfo) {
        const { extension, mimetype } = streamInfo;
        if (extension && EXTENSIONS.includes(extension)) {
            return true;
        }
        return Boolean(mimetype && MIMETYPES.some((prefix) => mimetype.startsWith(prefix)));
    }
    /**
     * @param input        Raw bytes of the `.docx` file.
     * @param _streamInfo  Unused.
     * @returns `{ markdown }` with surrounding whitespace trimmed.
     */
    async convert(input, _streamInfo) {
        const rendered = await mammoth.convertToHtml({ buffer: input });
        const service = new TurndownService({
            headingStyle: "atx",
            codeBlockStyle: "fenced",
        });
        return { markdown: service.turndown(rendered.value).trim() };
    }
}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import type { Converter, ConversionResult, StreamInfo } from "../types.js";
|
|
2
|
+
/**
 * Converter for EPUB books. Emits a metadata header followed by the spine
 * documents converted to markdown, in spine order.
 */
export declare class EpubConverter implements Converter {
|
|
3
|
+
/** Converter identifier ("epub"). */
name: string;
|
|
4
|
+
/** True when the extension or MIME type of `streamInfo` identifies an EPUB. */
accepts(streamInfo: StreamInfo): boolean;
|
|
5
|
+
/** Returns the markdown and the book title; throws when the container or package file is missing. `_streamInfo` is unused. */
convert(input: Buffer, _streamInfo: StreamInfo): Promise<ConversionResult>;
|
|
6
|
+
/** Extracts the text of a parsed XML node (first element when given an array). */
private getText;
|
|
7
|
+
/** Extracts the text of every node in a single-or-array value, dropping empty results. */
private getTextArray;
|
|
8
|
+
}
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
import JSZip from "jszip";
|
|
2
|
+
import { XMLParser } from "fast-xml-parser";
|
|
3
|
+
import TurndownService from "turndown";
|
|
4
|
+
/** File extensions (leading dot included) handled by the EPUB converter. */
const EXTENSIONS = [".epub"];
/** MIME type prefixes handled by the EPUB converter. */
const MIMETYPES = ["application/epub", "application/epub+zip", "application/x-epub+zip"];
/**
 * Resolve a manifest `href` against the directory of the package (OPF) file
 * and return the zip entry name it refers to.
 *
 * Manifest hrefs are URLs: the fragment is dropped, percent-escapes are
 * decoded (left as-is when malformed) and "." / ".." segments are collapsed.
 *
 * @param basePath Directory of the OPF file inside the archive ("" for the root).
 * @param href     Manifest href.
 * @returns Normalised entry path using "/" separators.
 */
function resolveHref(basePath, href) {
    const withoutFragment = String(href).split("#")[0];
    let decoded;
    try {
        decoded = decodeURIComponent(withoutFragment);
    }
    catch {
        // Malformed escape sequence: fall back to the literal text.
        decoded = withoutFragment;
    }
    const segments = [];
    for (const part of `${basePath}/${decoded}`.split("/")) {
        if (part === "" || part === ".")
            continue;
        if (part === "..") {
            segments.pop();
            continue;
        }
        segments.push(part);
    }
    return segments.join("/");
}
/**
 * Converts EPUB books to markdown: a metadata header followed by every spine
 * document, in spine order, converted from (X)HTML.
 */
export class EpubConverter {
    name = "epub";
    /**
     * @param streamInfo Descriptor with optional `extension` and `mimetype`.
     * @returns `true` when the extension is listed in EXTENSIONS or the MIME
     *          type starts with one of the MIMETYPES entries.
     */
    accepts(streamInfo) {
        if (streamInfo.extension && EXTENSIONS.includes(streamInfo.extension))
            return true;
        if (streamInfo.mimetype && MIMETYPES.some((m) => streamInfo.mimetype.startsWith(m)))
            return true;
        return false;
    }
    /**
     * @param input        Raw bytes of the `.epub` archive.
     * @param _streamInfo  Unused.
     * @returns `{ markdown, title }`; sections are separated by blank lines.
     * @throws {Error} When `META-INF/container.xml`, the rootfile path or the
     *         package file is missing.
     */
    async convert(input, _streamInfo) {
        const zip = await JSZip.loadAsync(input);
        const parser = new XMLParser({
            ignoreAttributes: false,
            attributeNamePrefix: "@_",
            textNodeName: "#text",
            // Keep element text as strings; the default numeric parsing would
            // turn a title such as "1984" into a number (and "007" into 7).
            parseTagValue: false,
        });
        // Find content.opf path from container.xml
        const containerXml = await zip.file("META-INF/container.xml")?.async("string");
        if (!containerXml)
            throw new Error("Invalid EPUB: missing container.xml");
        const container = parser.parse(containerXml);
        const rootfile = container.container?.rootfiles?.rootfile;
        // Several rootfiles may be listed; the first one is used.
        const opfPath = Array.isArray(rootfile)
            ? rootfile[0]["@_full-path"]
            : rootfile?.["@_full-path"];
        if (!opfPath)
            throw new Error("Invalid EPUB: missing rootfile path");
        // Parse content.opf
        const opfXml = await zip.file(opfPath)?.async("string");
        if (!opfXml)
            throw new Error("Invalid EPUB: missing content.opf");
        const opf = parser.parse(opfXml);
        // Extract metadata
        const meta = opf.package?.metadata ?? {};
        const metadata = {
            title: this.getText(meta["dc:title"]),
            authors: this.getTextArray(meta["dc:creator"]).join(", ") || undefined,
            language: this.getText(meta["dc:language"]),
            publisher: this.getText(meta["dc:publisher"]),
            date: this.getText(meta["dc:date"]),
            description: this.getText(meta["dc:description"]),
        };
        // Build manifest map (id → href)
        const manifestItems = opf.package?.manifest?.item;
        const itemList = Array.isArray(manifestItems) ? manifestItems : manifestItems ? [manifestItems] : [];
        const manifest = new Map();
        for (const item of itemList) {
            manifest.set(item["@_id"], item["@_href"]);
        }
        // Get spine order
        const spineItems = opf.package?.spine?.itemref;
        const spineList = Array.isArray(spineItems) ? spineItems : spineItems ? [spineItems] : [];
        const spineOrder = spineList.map((s) => s["@_idref"]);
        // Manifest hrefs are relative to the directory that holds the OPF file.
        const basePath = opfPath.includes("/") ? opfPath.substring(0, opfPath.lastIndexOf("/")) : "";
        const turndown = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced" });
        const sections = [];
        // Add metadata header
        const metaLines = [];
        for (const [key, value] of Object.entries(metadata)) {
            if (value)
                metaLines.push(`**${key.charAt(0).toUpperCase() + key.slice(1)}:** ${value}`);
        }
        if (metaLines.length > 0)
            sections.push(metaLines.join("\n"));
        // Convert spine files
        for (const idref of spineOrder) {
            const href = manifest.get(idref);
            if (!href)
                continue;
            // Try the literal path first (previous behaviour), then the
            // URL-decoded, normalised one so encoded or "../" hrefs resolve too.
            const literalPath = basePath ? `${basePath}/${href}` : href;
            const entry = zip.file(literalPath) ?? zip.file(resolveHref(basePath, href));
            const html = await entry?.async("string");
            if (!html)
                continue;
            // Strip script/style, convert to markdown
            const cleaned = html
                .replace(/<script[\s\S]*?<\/script>/gi, "")
                .replace(/<style[\s\S]*?<\/style>/gi, "");
            const md = turndown.turndown(cleaned).trim();
            if (md)
                sections.push(md);
        }
        return {
            markdown: sections.join("\n\n").trim(),
            title: metadata.title,
        };
    }
    /**
     * Extract the text of a parsed XML node.
     *
     * Accepts a plain string, a number or boolean (stringified), an object
     * carrying its text under "#text", or an array (first element is used).
     *
     * @returns The text, or `undefined` when the node is missing or has no text.
     */
    getText(node) {
        if (node === undefined || node === null || node === "")
            return undefined;
        if (typeof node === "string")
            return node;
        if (typeof node === "number" || typeof node === "boolean")
            return String(node);
        if (Array.isArray(node))
            return this.getText(node[0]);
        const text = node["#text"];
        if (text === undefined || text === null || text === "")
            return undefined;
        return String(text);
    }
    /**
     * Extract the text of every node in a value that may be a single node or
     * an array of nodes.
     *
     * @returns Array of texts with empty results removed; `[]` for a missing node.
     */
    getTextArray(node) {
        if (!node)
            return [];
        const list = Array.isArray(node) ? node : [node];
        return list.map((n) => this.getText(n)).filter(Boolean);
    }
}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import type { Converter, ConversionResult, StreamInfo } from "../types.js";
|
|
2
|
+
/** Converter for HTML/XHTML documents; also reports the document's `<title>` when present. */
export declare class HtmlConverter implements Converter {
|
|
3
|
+
/** Converter identifier ("html"). */
name: string;
|
|
4
|
+
/** True when the extension or MIME type of `streamInfo` identifies an HTML document. */
accepts(streamInfo: StreamInfo): boolean;
|
|
5
|
+
/** Decodes `input` using `streamInfo.charset` (UTF-8 by default) and returns trimmed markdown plus the title. */
convert(input: Buffer, streamInfo: StreamInfo): Promise<ConversionResult>;
|
|
6
|
+
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import TurndownService from "turndown";
|
|
2
|
+
const EXTENSIONS = [".html", ".htm"];
|
|
3
|
+
const MIMETYPES = ["text/html", "application/xhtml"];
|
|
4
|
+
export class HtmlConverter {
|
|
5
|
+
name = "html";
|
|
6
|
+
accepts(streamInfo) {
|
|
7
|
+
if (streamInfo.extension && EXTENSIONS.includes(streamInfo.extension)) {
|
|
8
|
+
return true;
|
|
9
|
+
}
|
|
10
|
+
if (streamInfo.mimetype &&
|
|
11
|
+
MIMETYPES.some((m) => streamInfo.mimetype.startsWith(m))) {
|
|
12
|
+
return true;
|
|
13
|
+
}
|
|
14
|
+
return false;
|
|
15
|
+
}
|
|
16
|
+
async convert(input, streamInfo) {
|
|
17
|
+
const charset = streamInfo.charset || "utf-8";
|
|
18
|
+
const html = new TextDecoder(charset).decode(input);
|
|
19
|
+
const turndown = new TurndownService({
|
|
20
|
+
headingStyle: "atx",
|
|
21
|
+
codeBlockStyle: "fenced",
|
|
22
|
+
});
|
|
23
|
+
// Remove script and style tags before converting
|
|
24
|
+
const cleaned = html
|
|
25
|
+
.replace(/<script[\s\S]*?<\/script>/gi, "")
|
|
26
|
+
.replace(/<style[\s\S]*?<\/style>/gi, "");
|
|
27
|
+
const markdown = turndown.turndown(cleaned);
|
|
28
|
+
// Try to extract title
|
|
29
|
+
const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
|
|
30
|
+
const title = titleMatch ? titleMatch[1].trim() : undefined;
|
|
31
|
+
return { markdown: markdown.trim(), title };
|
|
32
|
+
}
|
|
33
|
+
}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import type { Converter, ConversionResult, StreamInfo, MarkitOptions } from "../types.js";
|
|
2
|
+
/**
 * Type declaration of the image converter.
 * NOTE(review): the implementation (image.js) is not visible here; behaviour
 * beyond these signatures should be confirmed against it.
 */
export declare class ImageConverter implements Converter {
|
|
3
|
+
/** Converter identifier. */
name: string;
|
|
4
|
+
/** Reports whether this converter handles the input described by `streamInfo`. */
accepts(streamInfo: StreamInfo): boolean;
|
|
5
|
+
/** Converts the image bytes; `options` is optional. */
convert(input: Buffer, streamInfo: StreamInfo, options?: MarkitOptions): Promise<ConversionResult>;
|
|
6
|
+
}
|