cajupdf 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Julio Cesar Ody
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,135 @@
1
+ # cajupdf 🥜
2
+
3
+ **Crack open a PDF, get clean markdown and its pictures.**
4
+
5
+ `caju` is the cashew — that funny little nut that grows _outside_ its fruit, in
6
+ plain sight. `cajupdf` does the same trick to your PDFs: it cracks the shell and
7
+ hands you the good stuff — readable markdown, plus every embedded image, laid out
8
+ where you can actually use them.
9
+
10
+ ---
11
+
12
+ ## What it does
13
+
14
+ Three stages, one command:
15
+
16
+ 1. **Text** — pulled with [`unpdf`](https://github.com/unjs/unpdf) (PDF.js under
17
+ the hood). Scanned/image-only PDFs fall back to the macOS **Vision framework
18
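+ OCR** automatically — no API keys and no extra setup beyond the Xcode Command Line Tools (the fallback runs a Swift script via `/usr/bin/swift`).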
19
+ 2. **Images / figures / sigils** — extracted at full fidelity with Poppler's
20
+ `pdfimages` (JPEGs stay JPEGs, PNGs stay PNGs).
21
+ 3. **Markdown** — assembled with YAML front matter (title, author, page & image
22
+ counts) and one `## Page N` section per page, images linked inline.
23
+
24
+ ---
25
+
26
+ ## Requirements
27
+
28
+ - **macOS** — cajupdf leans on the system Vision framework for OCR, so it's
29
+ macOS-only for now.
30
+ - **Node ≥ 22**
31
+ - **`pdfimages`** (Poppler) — only needed when extracting images:
32
+ `brew install poppler`. Skip it entirely with `--no-images`.
33
+
34
+ ---
35
+
36
+ ## Install
37
+
38
+ ```bash
39
+ npm i -g cajupdf
40
+ # or
41
+ pnpm add -g cajupdf
42
+ ```
43
+
44
+ Then:
45
+
46
+ ```bash
47
+ cajupdf some.pdf
48
+ ```
49
+
50
+ Or run it from a clone:
51
+
52
+ ```bash
53
+ pnpm install
54
+ pnpm build
55
+ node dist/cli.js some.pdf
56
+ ```
57
+
58
+ ---
59
+
60
+ ## Usage
61
+
62
+ ```
63
+ cajupdf <pdf> [<pdf> ...] [options]
64
+
65
+ Options:
66
+ --no-images Skip image extraction (images are extracted by default).
67
+ --url-friendly Slugify output names to kebab-case (default: verbatim).
68
+ --out-dir <dir> Where to write outputs (default: current directory).
69
+ -h, --help Show help.
70
+ -v, --version Show version.
71
+ ```
72
+
73
+ Pass as many PDFs as you like — each is processed independently, and one bad file
74
+ won't sink the rest (the run just exits non-zero if any failed).
75
+
76
+ ---
77
+
78
+ ## Output layout
79
+
80
+ The markdown is `<name>.md`, and its images sit in a sibling `<name>-images/`
81
+ directory.
82
+
83
+ **Default (verbatim names):**
84
+
85
+ ```
86
+ My Book.pdf → My Book.md
87
+ My Book-images/
88
+ img-002-000.jpg
89
+ ...
90
+ ```
91
+
92
+ Verbatim names can contain spaces, so image links are percent-encoded and
93
+ angle-bracket-wrapped to stay valid markdown:
94
+ `![](<My%20Book-images/img-002-000.jpg>)`.
95
+
96
+ **`--url-friendly` (kebab-case):**
97
+
98
+ ```
99
+ My Book.pdf → my-book.md
100
+ my-book-images/
101
+ img-002-000.jpg
102
+ ...
103
+ ```
104
+
105
+ ---
106
+
107
+ ## Programmatic API
108
+
109
+ `cajupdf` is also a library. The one-call workhorse:
110
+
111
+ ```ts
112
+ import { extractPdf } from 'cajupdf'
113
+
114
+ const result = await extractPdf('My Book.pdf', {
115
+ images: true, // extract images (default)
116
+ urlFriendly: false, // keep names verbatim (default)
117
+ outDir: './out', // default: cwd
118
+ onProgress: p => console.log(p.stage, p.current, p.total),
119
+ })
120
+
121
+ // → { mdPath, imageDir, pageCount, imageCount }
122
+ ```
123
+
124
+ Or reach for the individual stages — `parsePDF`, `extractImages`, `toMarkdown`,
125
+ `slugify`, `resolveOutputNames` — all exported with full types.
126
+
127
+ ---
128
+
129
+ ## License
130
+
131
+ MIT
132
+
133
+ ---
134
+
135
+ _Built to be cracked open. Mind the shell._ 🥜🐘
package/dist/cli.d.ts ADDED
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env node
2
+ export {};
package/dist/cli.js ADDED
@@ -0,0 +1,136 @@
1
+ #!/usr/bin/env node
2
+ import { t as e } from "./extract-pdf-Bnge6uHj.js";
3
+ import t from "node:fs";
4
+ import n from "node:path";
5
+ import { fileURLToPath as r } from "node:url";
6
+ import i from "chalk";
7
+ //#region src/lib/spinner.ts
8
/** Braille glyphs shown one after another to animate the spinner. */
const a = Array.from("⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏");

/** Delay between two spinner repaints, in milliseconds (passed to setInterval). */
const o = 80;
20
/**
 * Shorten a status text so it fits in the terminal width that is left over.
 *
 * The available width is the terminal column count (80 when unknown) minus
 * `t`, minus `n`, minus one spare column, and never less than 10.
 *
 * @param {string} e - Text to display.
 * @param {number} t - Columns to subtract for content printed before the text.
 * @param {number} n - Columns to subtract for content printed after the text.
 * @returns {string} `e` unchanged when it fits, otherwise a prefix ending in "…".
 */
function s(e, t, n) {
  const terminalWidth = process.stdout.columns ?? 80;
  const budget = Math.max(10, terminalWidth - t - n - 1);
  if (e.length <= budget) {
    return e;
  }
  // Keep at least one character in front of the ellipsis.
  const keep = Math.max(1, budget - 1);
  return `${e.slice(0, keep)}…`;
}
24
/**
 * Render a textual progress bar followed by a right-aligned percentage.
 *
 * @param {number} e - Amount completed so far.
 * @param {number} t - Total amount; anything not greater than 0 renders as 0%.
 * @param {number} [n=15] - Bar width in characters.
 * @returns {string} `[<bar>] <percent>` with chalk colouring applied.
 */
function c(e, t, n = 15) {
  // Clamp to 100% so an overshooting counter cannot overflow the bar.
  let fraction = 0;
  if (t > 0) {
    fraction = Math.min(e / t, 1);
  }

  const filledCells = Math.round(fraction * n);
  const filled = i.cyan("█".repeat(filledCells));
  const remaining = i.dim("░".repeat(n - filledCells));
  const percent = `${Math.round(fraction * 100)}%`.padStart(4);

  return `[${filled}${remaining}] ${i.dim(percent)}`;
}
28
/**
 * Start an animated single-line spinner on stdout and return its controls.
 *
 * The line is redrawn immediately and then on every timer tick. When a
 * positive total has been set through `setProgress`, a progress bar is
 * appended and the text is truncated to leave room for it.
 *
 * @param {string} e - Initial status text.
 * @returns {{update: Function, setProgress: Function, stop: Function,
 *   succeed: Function, fail: Function}} Handle used to drive the spinner.
 */
function l(e) {
  let frameIndex = 0;
  let message = e;
  let completed = 0;
  let total = 0;
  let timer = null;

  const render = () => {
    const glyph = i.cyan(a[frameIndex]);
    const showBar = total > 0;
    const bar = showBar ? ` ${c(completed, total)}` : "";
    // 2 columns for glyph + gap; 23 more are reserved when the bar is shown.
    const label = s(message, 2, showBar ? 23 : 0);
    // "\r" returns to column 0 and "\x1b[K" clears the rest of the line.
    process.stdout.write(`\r${glyph} ${i.dim(label)}${bar}\x1b[K`);
    frameIndex = (frameIndex + 1) % a.length;
  };

  // Stop the animation timer once; later calls are no-ops.
  const haltTimer = () => {
    if (timer) {
      clearInterval(timer);
      timer = null;
    }
  };

  timer = setInterval(render, o);
  render();

  return {
    /** Replace the status text shown on the next repaint. */
    update(e) {
      message = e;
    },
    /** Set the values the progress bar is computed from. */
    setProgress(e, t) {
      completed = e;
      total = t;
    },
    /** Stop animating; print `e` as the final line, or clear the line. */
    stop(e) {
      haltTimer();
      if (e) {
        process.stdout.write(`\r${i.dim(e)}\x1b[K\n`);
      } else {
        process.stdout.write("\r\x1B[K");
      }
    },
    /** Stop animating and print `e` behind a green check mark. */
    succeed(e) {
      haltTimer();
      process.stdout.write(`\r${i.green("✓")} ${i.dim(e)}\x1b[K\n`);
    },
    /** Stop animating and print `e` behind a red cross. */
    fail(e) {
      haltTimer();
      process.stdout.write(`\r${i.red("✗")} ${i.dim(e)}\x1b[K\n`);
    },
  };
}
51
+ //#endregion
52
+ //#region src/cli.ts
53
/**
 * Print the usage text to stderr, one `console.error` call per line.
 */
function u() {
  const usageLines = [
    "cajupdf — crack open a PDF, get clean markdown and its pictures. 🥜",
    "",
    "Usage: cajupdf <pdf> [<pdf> ...] [options]",
    "",
    "Options:",
    " --no-images Skip image extraction (images are extracted by default).",
    " --url-friendly Slugify output names to kebab-case (default: verbatim).",
    " --out-dir <dir> Where to write outputs (default: current directory).",
    " -h, --help Show this help.",
    " -v, --version Show version.",
  ];
  for (const line of usageLines) {
    console.error(line);
  }
}
56
/**
 * Read the package version from the `package.json` located one directory
 * above this module.
 *
 * @returns {string} The `version` field of that manifest.
 */
function d() {
  const moduleDir = n.dirname(r(import.meta.url));
  const manifestPath = n.join(moduleDir, "..", "package.json");
  const manifest = JSON.parse(t.readFileSync(manifestPath, "utf-8"));
  return manifest.version;
}
60
/**
 * Parse the command-line arguments that follow the executable and script.
 *
 * Recognised options are `--no-images`, `--url-friendly` and
 * `--out-dir <dir>`. Any other token starting with "-" is rejected, every
 * remaining non-empty token is taken as a PDF path.
 *
 * @param {string[]} e - Raw argument tokens.
 * @returns {{pdfs: string[], images: boolean, urlFriendly: boolean,
 *   outDir: (string|undefined)}|null} Parsed options, or `null` when an
 *   option is unknown, `--out-dir` lacks a value, or no PDF was given.
 */
function f(e) {
  const pending = [...e];
  const parsed = {
    pdfs: [],
    images: true,
    urlFriendly: false,
    outDir: undefined,
  };

  while (pending.length > 0) {
    const token = pending.shift();

    if (token === "--no-images") {
      parsed.images = false;
    } else if (token === "--url-friendly") {
      parsed.urlFriendly = true;
    } else if (token === "--out-dir") {
      // The directory is the next token; a missing or empty one is an error.
      const directory = pending.shift();
      if (!directory) {
        return null;
      }
      parsed.outDir = directory;
    } else if (token) {
      if (token.startsWith("-")) {
        return null;
      }
      parsed.pdfs.push(token);
    }
  }

  return parsed.pdfs.length === 0 ? null : parsed;
}
89
/**
 * Convert a single PDF while showing a spinner, reporting failures on stderr
 * instead of throwing.
 *
 * @param {string} r - PDF path as typed by the user.
 * @param {object} i - Options forwarded to the extraction pipeline
 *   (`images`, `urlFriendly`, `outDir`).
 * @returns {Promise<boolean>} `true` on success, `false` when the file is
 *   missing, is not a `.pdf`, or extraction failed.
 */
async function p(r, i) {
  const absolutePath = n.resolve(r);
  const fileName = n.basename(absolutePath);

  if (!t.existsSync(absolutePath)) {
    console.error(`✗ File not found: ${r}`);
    return false;
  }
  if (n.extname(absolutePath).toLowerCase() !== ".pdf") {
    console.error(`✗ Not a PDF: ${r}`);
    return false;
  }

  const spinner = l(`Extracting ${fileName}`);

  // Map pipeline progress events onto spinner text and progress bar.
  const onProgress = (progress) => {
    const { stage } = progress;
    if (stage === "loading") {
      spinner.update(`Loading ${fileName}`);
    } else if (stage === "parsing") {
      spinner.update(`Parsing text ${fileName}`);
      spinner.setProgress(progress.current, progress.total);
    } else if (stage === "ocr") {
      spinner.update(`OCR (Vision) ${fileName}`);
      spinner.setProgress(progress.current, progress.total);
    } else if (stage === "images") {
      spinner.update(`Extracting images ${fileName}`);
      // The image stage first reports a total of 0; keep the previous bar then.
      if (progress.total > 0) {
        spinner.setProgress(progress.current, progress.total);
      }
    }
    // "done" (and any other stage) leaves the spinner untouched.
  };

  try {
    const result = await e(absolutePath, { ...i, onProgress });
    const markdownName = n.basename(result.mdPath);
    spinner.succeed(`${fileName} → ${markdownName} (${result.pageCount} pages, ${result.imageCount} images)`);
    return true;
  } catch (error) {
    spinner.fail(`Failed: ${fileName}`);
    console.error(error instanceof Error ? error.message : error);
    return false;
  }
}
120
/**
 * CLI entry point: handle help/version, parse options, then convert every
 * PDF in turn.
 *
 * Help and version flags are honoured wherever they appear on the command
 * line (the first such flag wins), so `cajupdf book.pdf --help` shows help
 * instead of being rejected as a usage error.
 *
 * Exit codes: 0 on success or after help/version, 1 when at least one PDF
 * failed, 2 when the arguments are invalid.
 */
async function m() {
  const args = process.argv.slice(2);

  const infoFlag = args.find((arg) => ["-h", "--help", "-v", "--version"].includes(arg));
  if (infoFlag === "-h" || infoFlag === "--help") {
    u();
    process.exit(0);
  }
  if (infoFlag === "-v" || infoFlag === "--version") {
    console.log(d());
    process.exit(0);
  }

  const parsed = f(args);
  if (!parsed) {
    u();
    process.exit(2);
  }

  const options = {
    images: parsed.images,
    urlFriendly: parsed.urlFriendly,
    outDir: parsed.outDir,
  };

  // Files are processed one at a time; a failure does not stop the others.
  let anyFailed = false;
  for (const pdf of parsed.pdfs) {
    const succeeded = await p(pdf, options);
    if (!succeeded) {
      anyFailed = true;
    }
  }
  process.exit(anyFailed ? 1 : 0);
}

m().catch((error) => {
  console.error(error);
  process.exit(1);
});
136
+ //#endregion
@@ -0,0 +1,192 @@
1
+ import e from "node:fs";
2
+ import t from "node:path";
3
+ import { execFile as n } from "node:child_process";
4
+ import { extractText as r, getDocumentProxy as i } from "unpdf";
5
+ import { promisify as a } from "node:util";
6
+ //#region src/lib/text.ts
7
/**
 * Resolve after a 10 ms timer. Awaited right after progress callbacks,
 * presumably so pending timers (such as a spinner repaint) get a chance to run.
 */
const o = function () {
  return new Promise(function (resolve) {
    setTimeout(resolve, 10);
  });
};
8
/**
 * Read a PDF and return its text page by page, plus basic metadata.
 *
 * Text comes from unpdf. When no page yields any text and the platform is
 * macOS, the pages are re-read through the OCR fallback.
 *
 * @param {string} n - Path of the PDF file.
 * @param {(progress: {stage: string, current: number, total: number}) => void} [a]
 *   Optional progress callback; stages emitted here are "loading", "parsing",
 *   "ocr" and "done".
 * @returns {Promise<{text: string, pageCount: number, pages: string[],
 *   metadata: {title: string, author: (string|undefined)}}>} Trimmed page
 *   texts, the same texts joined by blank lines, and the document metadata.
 *   The title falls back to the file name without its extension.
 */
async function s(n, a) {
  const fileBytes = e.readFileSync(n);
  a?.({ stage: "loading", current: 0, total: 0 });
  await o();

  const pdf = await i(new Uint8Array(fileBytes));
  const pageCount = pdf.numPages;

  let title;
  let author;
  try {
    const info = (await pdf.getMetadata())?.info;
    title = info?.Title;
    author = info?.Author;
  } catch {
    // Metadata is optional; the title falls back to the file name below.
  }

  a?.({ stage: "parsing", current: 0, total: pageCount });
  await o();

  const { text: rawPages } = await r(pdf, { mergePages: false });
  // Report progress once per ten pages, pausing briefly after each report.
  for (let index = 0; index < rawPages.length; index += 10) {
    a?.({ stage: "parsing", current: index + 1, total: pageCount });
    await o();
  }

  let pages = rawPages.map((pageText) => pageText.trim());
  const hasText = pages.some((pageText) => pageText.length > 0);
  if (!hasText && process.platform === "darwin") {
    a?.({ stage: "ocr", current: 0, total: pageCount });
    await o();
    pages = await l(n, pageCount, a);
  }

  a?.({ stage: "done", current: pageCount, total: pageCount });

  return {
    text: pages.join("\n\n"),
    pageCount,
    pages,
    metadata: {
      // Strip whatever extension the file has (".pdf", ".PDF", ...) so the
      // fallback title matches the stem used for the output file names.
      title: title || t.basename(n, t.extname(n)),
      author,
    },
  };
}
50
+ var c = "\nimport Foundation\nimport PDFKit\nimport Vision\n\nguard CommandLine.arguments.count > 1 else {\n fputs(\"Usage: ocr-pdf <path>\\n\", stderr)\n exit(1)\n}\n\nlet pdfPath = CommandLine.arguments[1]\nguard let doc = PDFDocument(url: URL(fileURLWithPath: pdfPath)) else {\n fputs(\"Failed to open PDF\\n\", stderr)\n exit(1)\n}\n\nvar result: [[String: Any]] = []\n\nfor i in 0..<doc.pageCount {\n guard let page = doc.page(at: i) else {\n result.append([\"page\": i + 1, \"text\": \"\"])\n continue\n }\n\n // Render page to CGImage for Vision OCR\n let bounds = page.bounds(for: .mediaBox)\n let scale: CGFloat = 2.0 // 2x for better OCR accuracy\n let width = Int(bounds.width * scale)\n let height = Int(bounds.height * scale)\n\n guard let ctx = CGContext(\n data: nil,\n width: width,\n height: height,\n bitsPerComponent: 8,\n bytesPerRow: 0,\n space: CGColorSpaceCreateDeviceRGB(),\n bitmapInfo: CGImageAlphaInfo.premultipliedFirst.rawValue\n ) else {\n result.append([\"page\": i + 1, \"text\": \"\"])\n continue\n }\n\n ctx.setFillColor(CGColor.white)\n ctx.fill(CGRect(x: 0, y: 0, width: width, height: height))\n ctx.scaleBy(x: scale, y: scale)\n\n // PDFKit renders in a flipped coordinate space\n NSGraphicsContext.saveGraphicsState()\n let nsCtx = NSGraphicsContext(cgContext: ctx, flipped: false)\n NSGraphicsContext.current = nsCtx\n page.draw(with: .mediaBox, to: ctx)\n NSGraphicsContext.restoreGraphicsState()\n\n guard let cgImage = ctx.makeImage() else {\n result.append([\"page\": i + 1, \"text\": \"\"])\n continue\n }\n\n let semaphore = DispatchSemaphore(value: 0)\n var pageText = \"\"\n\n let request = VNRecognizeTextRequest { request, error in\n if let observations = request.results as? 
[VNRecognizedTextObservation] {\n pageText = observations\n .compactMap { $0.topCandidates(1).first?.string }\n .joined(separator: \"\\n\")\n }\n semaphore.signal()\n }\n request.recognitionLevel = .accurate\n request.usesLanguageCorrection = true\n\n let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])\n try? handler.perform([request])\n semaphore.wait()\n\n result.append([\"page\": i + 1, \"text\": pageText])\n\n // Progress: write page number to stderr so the caller can track it\n fputs(\"PAGE:\\(i + 1)\\n\", stderr)\n}\n\nlet jsonData = try! JSONSerialization.data(withJSONObject: result)\nprint(String(data: jsonData, encoding: .utf8)!)\n";
51
/**
 * OCR every page of a PDF by running the embedded Swift script with
 * `/usr/bin/swift`.
 *
 * The script is written into a freshly created private temporary directory,
 * so concurrent calls (even inside one process) never share or delete each
 * other's script. The directory is removed once the run has finished.
 *
 * @param {string} r - Path of the PDF file.
 * @param {number} i - Page count, reported as `total` in progress events.
 * @param {(progress: {stage: string, current: number, total: number}) => void} [a]
 *   Optional progress callback, invoked with stage "ocr" for each `PAGE:<n>`
 *   line the script prints on stderr.
 * @returns {Promise<string[]>} Trimmed text of each page, in script order.
 * @throws {Error} When the script fails or its output is not valid JSON; the
 *   underlying error is attached as `cause`.
 */
async function l(r, i, a) {
  const tempRoot = process.env.TMPDIR || "/tmp";
  const workDir = e.mkdtempSync(t.join(tempRoot, "cajupdf-ocr-"));
  const scriptPath = t.join(workDir, "ocr.swift");

  try {
    e.writeFileSync(scriptPath, c);

    return await new Promise((resolve, reject) => {
      const child = n(
        "/usr/bin/swift",
        [scriptPath, r],
        {
          maxBuffer: 100 * 1024 * 1024,
          timeout: 600 * 1e3,
        },
        (error, stdout) => {
          if (error) {
            reject(new Error(`OCR failed: ${error.message}`, { cause: error }));
            return;
          }
          try {
            resolve(JSON.parse(stdout).map((entry) => entry.text.trim()));
          } catch (parseError) {
            reject(new Error(`Failed to parse OCR output: ${parseError}`, { cause: parseError }));
          }
        },
      );

      // stderr arrives in arbitrary chunks; keep the unfinished tail so a
      // "PAGE:<n>" marker split across two chunks is still recognised.
      let unfinished = "";
      child.stderr?.on("data", (chunk) => {
        const lines = (unfinished + chunk.toString()).split("\n");
        unfinished = lines.pop() ?? "";
        for (const line of lines) {
          const marker = line.match(/^PAGE:(\d+)$/);
          if (marker?.[1]) {
            a?.({
              stage: "ocr",
              current: Number.parseInt(marker[1], 10),
              total: i,
            });
          }
        }
      });
    });
  } finally {
    try {
      e.rmSync(workDir, { recursive: true, force: true });
    } catch {
      // Best-effort cleanup: a leftover temp directory must not mask the result.
    }
  }
}
89
+ //#endregion
90
+ //#region src/lib/images.ts
91
/** Promise-returning wrapper around `execFile`. */
const u = a(n);

/**
 * Extract the images embedded in a PDF with Poppler's `pdfimages` and list
 * the resulting files.
 *
 * `pdfimages` is run with `-p -all` and writes into directory `r` using file
 * prefix `i`. Afterwards every directory entry named
 * `<prefix>-<page>-<index>.<ext>` is collected; entries already present
 * before this call are included as well.
 *
 * @param {string} n - Path of the PDF file.
 * @param {string} r - Directory to write images into (created if missing).
 * @param {string} [i="img"] - File name prefix handed to `pdfimages`.
 * @param {(progress: {stage: string, current: number, total: number}) => void} [a]
 *   Optional progress callback, called with stage "images" before and after
 *   extraction.
 * @returns {Promise<Array<{page: number, index: number, filename: string,
 *   path: string}>>} Images ordered by page, then by index.
 * @throws {Error} When `pdfimages` is not on PATH, or when extraction fails.
 */
async function d(n, r, i = "img", a) {
  // Probe for the binary first so a missing install gets a helpful message.
  // Any other probe failure is ignored and left to the real run below.
  try {
    await u("pdfimages", ["-v"]);
  } catch (probeError) {
    if (probeError.code === "ENOENT") {
      throw new Error("pdfimages not found on PATH. Install Poppler: `brew install poppler` (macOS) or your distro equivalent.");
    }
  }

  e.mkdirSync(r, { recursive: true });
  a?.({ stage: "images", current: 0, total: 0 });

  const outputPrefix = t.join(r, i);
  await u("pdfimages", ["-p", "-all", n, outputPrefix], { maxBuffer: 100 * 1024 * 1024 });

  const namePattern = RegExp(`^${f(i)}-(\\d+)-(\\d+)\\.([A-Za-z0-9]+)$`);
  const images = e.readdirSync(r).flatMap((filename) => {
    const parts = filename.match(namePattern);
    if (!parts?.[1] || !parts[2]) {
      return [];
    }
    return [
      {
        page: parseInt(parts[1], 10),
        index: parseInt(parts[2], 10),
        filename,
        path: t.join(r, filename),
      },
    ];
  });

  images.sort((left, right) => left.page - right.page || left.index - right.index);

  a?.({ stage: "images", current: images.length, total: images.length });
  return images;
}
124
/**
 * Escape every regular-expression metacharacter in a string so it can be
 * embedded in a pattern and match literally.
 *
 * @param {string} e - Text to escape.
 * @returns {string} `e` with a backslash in front of each metacharacter.
 */
function f(e) {
  return e.replace(/[.*+?^${}()|[\]\\]/g, (metachar) => `\\${metachar}`);
}
127
+ //#endregion
128
+ //#region src/lib/markdown.ts
129
/**
 * Assemble the markdown document: YAML front matter followed by one
 * `## Page N` section for every page that has text or images.
 *
 * @param {{pageCount: number, pages: string[],
 *   metadata: {title?: string, author?: string}}} e - Parsed PDF.
 * @param {Array<{page: number, filename: string}>} t - Extracted images.
 * @param {{imageDir: string, title?: string}} n - `imageDir` is the directory
 *   used in image links; `title` overrides the metadata title.
 * @returns {string} The markdown text, lines joined with "\n".
 */
function p(e, t, n) {
  const title = n.title ?? e.metadata.title ?? "Untitled";
  const author = e.metadata.author;

  // Group images by page number, keeping their incoming order.
  const imagesByPage = new Map();
  for (const image of t) {
    const bucket = imagesByPage.get(image.page);
    if (bucket) {
      bucket.push(image);
    } else {
      imagesByPage.set(image.page, [image]);
    }
  }

  const lines = ["---", `title: ${h(title)}`];
  if (author) {
    lines.push(`author: ${h(author)}`);
  }
  lines.push(`pages: ${e.pageCount}`, `images: ${t.length}`, "---", "");

  for (const [position, rawText] of e.pages.entries()) {
    const pageNumber = position + 1;
    const text = rawText?.trim() ?? "";
    const pageImages = imagesByPage.get(pageNumber) ?? [];

    // Pages with neither text nor images get no section at all.
    if (!text && pageImages.length === 0) {
      continue;
    }

    lines.push(`## Page ${pageNumber}`, "");
    if (text) {
      lines.push(text, "");
    }
    for (const image of pageImages) {
      lines.push(`![](${m(n.imageDir, image.filename)})`);
    }
    if (pageImages.length > 0) {
      lines.push("");
    }
  }

  return lines.join("\n");
}
147
/**
 * Build the link destination for an image inside a markdown `![](...)`.
 *
 * The path is percent-encoded. Besides what `encodeURI` escapes, `#` and `?`
 * are escaped too: they are legal in file names but would otherwise be read
 * as the start of a fragment or query and cut the path short. The result is
 * wrapped in angle brackets whenever encoding changed it or it contains
 * parentheses, which would otherwise end the link destination early.
 *
 * @param {string} e - Image directory, relative to the markdown file.
 * @param {string} t - Image file name.
 * @returns {string} `dir/file` when it is safe as-is, otherwise `<encoded>`.
 */
function m(e, t) {
  const rawTarget = `${e}/${t}`;
  const encoded = encodeURI(rawTarget).replace(/[?#]/g, (reserved) => encodeURIComponent(reserved));
  const isSafeAsIs = encoded === rawTarget && !/[()]/.test(rawTarget);
  return isSafeAsIs ? rawTarget : `<${encoded}>`;
}
151
/**
 * Format a string as the value of a YAML front-matter entry.
 *
 * The text is emitted as a double-quoted scalar (via `JSON.stringify`)
 * whenever a plain scalar could be misread: it is empty or padded with
 * whitespace, contains `:`, `#`, quotes, a backslash or control whitespace,
 * starts with a YAML indicator character, or would be parsed as a boolean,
 * null or number (for example the title "1984"). Anything else is returned
 * unchanged.
 *
 * @param {string} e - Value to emit.
 * @returns {string} `e` itself, or its double-quoted form.
 */
function h(e) {
  const text = String(e);
  const needsQuotes =
    text === "" ||
    text !== text.trim() ||
    /[:#\n\r\t"'\\]/.test(text) ||
    // Leading indicators: sequence/mapping markers, flow brackets, anchors, tags...
    /^[-?[\]{},&*!|>@`%]/.test(text) ||
    // Words a YAML parser resolves to a boolean, null or special float.
    /^(?:true|false|yes|no|on|off|null|~|[-+]?\.inf|\.nan)$/i.test(text) ||
    // Anything numeric would be loaded as a number instead of a string.
    !Number.isNaN(Number(text));
  return needsQuotes ? JSON.stringify(text) : e;
}
154
+ //#endregion
155
+ //#region src/lib/slugify.ts
156
/**
 * Turn arbitrary text into a lowercase kebab-case slug.
 *
 * Apostrophe-like characters are dropped, diacritics are stripped, every run
 * of other non-alphanumeric characters becomes a single hyphen, and the
 * result is cut at a word boundary when it exceeds `t` characters.
 *
 * @param {string} e - Text to slugify.
 * @param {number} [t=60] - Maximum slug length.
 * @returns {string} The slug, or "untitled" when nothing usable remains.
 */
function g(e, t = 60) {
  // ' ‘ ’ ` ´ — written as escapes so the source survives any file encoding.
  const apostrophes = /['\u2018\u2019`\u00b4]/g;

  const slug = e
    // Must run before NFKD: normalisation rewrites U+00B4 (´) into a space
    // plus a combining accent, which would otherwise turn into a hyphen.
    .replace(apostrophes, "")
    .normalize("NFKD")
    // Combining diacritical marks left behind by the decomposition.
    .replace(/[\u0300-\u036f]/g, "")
    // Catches apostrophes that only appear after normalisation (full-width forms).
    .replace(apostrophes, "")
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, "-")
    .replace(/^-+|-+$/g, "");

  if (slug.length <= t) {
    return slug || "untitled";
  }

  const clipped = slug.slice(0, t);
  const lastHyphen = clipped.lastIndexOf("-");
  // Back up to the previous word only if that keeps more than half the budget.
  const atWordBoundary = lastHyphen > t / 2 ? clipped.slice(0, lastHyphen) : clipped;
  return atWordBoundary.replace(/-+$/, "") || "untitled";
}
162
+ //#endregion
163
+ //#region src/lib/resolve-output-names.ts
164
/**
 * Work out where the outputs for a PDF go: `<baseName>.md` and a sibling
 * `<baseName>-images` directory inside the output directory.
 *
 * @param {string} e - Path of the source PDF; only its file name is used.
 * @param {{outDir?: string, urlFriendly?: boolean}} [n={}] - `outDir`
 *   defaults to the current working directory; `urlFriendly` slugifies the
 *   base name instead of keeping it verbatim.
 * @returns {{baseName: string, mdPath: string, imageDir: string,
 *   imageDirName: string}} Absolute markdown and image-directory paths plus
 *   the bare names they were built from.
 */
function _(e, n = {}) {
  const { outDir, urlFriendly } = n;
  const targetDir = t.resolve(outDir ?? process.cwd());

  const stem = t.basename(e, t.extname(e));
  const baseName = urlFriendly ? g(stem) : stem;
  const imageDirName = `${baseName}-images`;

  return {
    baseName,
    mdPath: t.join(targetDir, `${baseName}.md`),
    imageDir: t.join(targetDir, imageDirName),
    imageDirName,
  };
}
173
+ //#endregion
174
+ //#region src/lib/extract-pdf.ts
175
/** Create the directory that will contain `filePath`, including missing ancestors. */
function ensureParentDir(filePath) {
  e.mkdirSync(t.dirname(filePath), { recursive: true });
}

/**
 * Run the whole pipeline for one PDF: parse the text, optionally extract the
 * images, assemble the markdown and write it to disk.
 *
 * The output directory is created when it does not exist yet, regardless of
 * whether images are extracted.
 *
 * @param {string} t - Path of the PDF file.
 * @param {{images?: boolean, urlFriendly?: boolean, outDir?: string,
 *   onProgress?: Function}} [n={}] - `images` defaults to `true`,
 *   `urlFriendly` to `false`; `outDir` and `onProgress` are optional.
 * @returns {Promise<{mdPath: string, imageDir: (string|null),
 *   pageCount: number, imageCount: number}>} Where the markdown was written,
 *   the image directory (or `null` when no image was extracted) and counts.
 */
async function v(t, n = {}) {
  const { images = true, urlFriendly = false, outDir, onProgress } = n;
  const names = _(t, { urlFriendly, outDir });

  const parsed = await s(t, onProgress);

  let extracted = [];
  if (images) {
    extracted = await d(t, names.imageDir, "img", onProgress);
    if (extracted.length === 0) {
      try {
        // Only succeeds on an empty directory, so unrelated files are never removed.
        e.rmdirSync(names.imageDir);
      } catch {
        // Best effort: a leftover directory is harmless.
      }
    }
  }

  const markdown = p(parsed, extracted, { imageDir: names.imageDirName });

  // Without this, a new output directory only exists as a side effect of
  // image extraction and writing fails when images are disabled.
  ensureParentDir(names.mdPath);
  e.writeFileSync(names.mdPath, markdown, "utf-8");

  return {
    mdPath: names.mdPath,
    imageDir: extracted.length > 0 ? names.imageDir : null,
    pageCount: parsed.pageCount,
    imageCount: extracted.length,
  };
}
191
+ //#endregion
192
+ export { d as a, p as i, _ as n, s as o, g as r, v as t };
@@ -0,0 +1,11 @@
1
+ export { extractPdf } from './lib/extract-pdf';
2
+ export { parsePDF } from './lib/text';
3
+ export { extractImages } from './lib/images';
4
+ export { toMarkdown } from './lib/markdown';
5
+ export { slugify } from './lib/slugify';
6
+ export { resolveOutputNames } from './lib/resolve-output-names';
7
+ export type { ExtractPdfOptions, ExtractPdfResult } from './lib/extract-pdf';
8
+ export type { ParsedPDF, ParseProgress, ProgressCallback } from './lib/text';
9
+ export type { ExtractedImage } from './lib/images';
10
+ export type { ToMarkdownOptions } from './lib/markdown';
11
+ export type { OutputNames, ResolveOutputNamesOptions } from './lib/resolve-output-names';
package/dist/index.js ADDED
@@ -0,0 +1,2 @@
1
+ import { a as e, i as t, n, o as r, r as i, t as a } from "./extract-pdf-Bnge6uHj.js";
2
+ export { e as extractImages, a as extractPdf, r as parsePDF, n as resolveOutputNames, i as slugify, t as toMarkdown };
@@ -0,0 +1,25 @@
1
+ import { ProgressCallback } from './text';
2
+ export interface ExtractPdfOptions {
3
+ /** Extract embedded images via Poppler. Default: true. */
4
+ images?: boolean;
5
+ /** Slugify output names to kebab-case. Default: false (verbatim). */
6
+ urlFriendly?: boolean;
7
+ /** Directory to write outputs in. Default: current working directory. */
8
+ outDir?: string;
9
+ /** Progress callback for spinner/UI wiring. */
10
+ onProgress?: ProgressCallback;
11
+ }
12
+ export interface ExtractPdfResult {
13
+ /** Absolute path of the markdown file written. */
14
+ mdPath: string;
15
+ /** Absolute path of the image directory, or null when images were skipped/empty. */
16
+ imageDir: string | null;
17
+ pageCount: number;
18
+ imageCount: number;
19
+ }
20
+ /**
21
+ * Run the full PDF → markdown pipeline for a single file: parse text (with macOS
22
+ * OCR fallback), optionally extract images, assemble markdown, and write it to
23
+ * disk. Returns where things landed.
24
+ */
25
+ export declare function extractPdf(pdfPath: string, options?: ExtractPdfOptions): Promise<ExtractPdfResult>;
@@ -0,0 +1,17 @@
1
+ import { ProgressCallback } from './text';
2
+ export interface ExtractedImage {
3
+ page: number;
4
+ index: number;
5
+ filename: string;
6
+ path: string;
7
+ }
8
+ /**
9
+ * Extract embedded images from a PDF using Poppler's `pdfimages`.
10
+ *
11
+ * Requires `pdfimages` on PATH (install via `brew install poppler` on macOS).
12
+ *
13
+ * Files are written as `<outDir>/<prefix>-<page>-<n>.<ext>` where the page
14
+ * number is zero-padded by pdfimages. `-all` preserves original encodings
15
+ * (JPEGs stay JPEGs, etc.) for highest fidelity.
16
+ */
17
+ export declare function extractImages(pdfPath: string, outDir: string, prefix?: string, onProgress?: ProgressCallback): Promise<ExtractedImage[]>;
@@ -0,0 +1,9 @@
1
+ import { ParsedPDF } from './text';
2
+ import { ExtractedImage } from './images';
3
+ export interface ToMarkdownOptions {
4
+ /** Directory (relative to the .md file) where images live, e.g. "My Book-images". */
5
+ imageDir: string;
6
+ /** Optional title override; defaults to parsed metadata title. */
7
+ title?: string;
8
+ }
9
+ export declare function toMarkdown(parsed: ParsedPDF, images: ExtractedImage[], options: ToMarkdownOptions): string;
@@ -0,0 +1,22 @@
1
+ export interface OutputNames {
2
+ /** Markdown stem (source filename without extension; slugified if url-friendly). */
3
+ baseName: string;
4
+ /** Absolute path of the markdown file to write. */
5
+ mdPath: string;
6
+ /** Absolute path of the image directory (`<baseName>-images`). */
7
+ imageDir: string;
8
+ /** Image directory name relative to the markdown file, for use in image links. */
9
+ imageDirName: string;
10
+ }
11
+ export interface ResolveOutputNamesOptions {
12
+ /** Slugify names to kebab-case instead of keeping them verbatim. */
13
+ urlFriendly?: boolean;
14
+ /** Directory to place outputs in. Defaults to the current working directory. */
15
+ outDir?: string;
16
+ }
17
+ /**
18
+ * Decide where a PDF's outputs land. The markdown is `<baseName>.md` and its
19
+ * images go in a sibling `<baseName>-images/` directory. Default keeps the
20
+ * source filename verbatim; `--url-friendly` slugifies it.
21
+ */
22
+ export declare function resolveOutputNames(pdfPath: string, options?: ResolveOutputNamesOptions): OutputNames;
@@ -0,0 +1,6 @@
1
+ /**
2
+ * Turn an arbitrary string into a url-friendly kebab-case slug: lowercased,
3
+ * diacritics stripped, apostrophes dropped, runs of other characters collapsed
4
+ * to single hyphens, and truncated at a word boundary within `maxLength`.
5
+ */
6
+ export declare function slugify(input: string, maxLength?: number): string;
@@ -0,0 +1,11 @@
1
+ export interface Spinner {
2
+ update(text: string): void;
3
+ stop(finalText?: string): void;
4
+ succeed(text: string): void;
5
+ fail(text: string): void;
6
+ }
7
+ export interface ProgressSpinner extends Spinner {
8
+ setProgress(current: number, total: number): void;
9
+ }
10
+ export declare function createSpinner(initialText: string): Spinner;
11
+ export declare function createProgressSpinner(initialText: string): ProgressSpinner;
@@ -0,0 +1,16 @@
1
+ export interface ParsedPDF {
2
+ text: string;
3
+ pageCount: number;
4
+ pages: string[];
5
+ metadata: {
6
+ title?: string;
7
+ author?: string;
8
+ };
9
+ }
10
+ export interface ParseProgress {
11
+ stage: 'loading' | 'parsing' | 'ocr' | 'images' | 'done';
12
+ current: number;
13
+ total: number;
14
+ }
15
+ export type ProgressCallback = (progress: ParseProgress) => void;
16
+ export declare function parsePDF(filePath: string, onProgress?: ProgressCallback): Promise<ParsedPDF>;
package/package.json ADDED
@@ -0,0 +1,82 @@
1
+ {
2
+ "name": "cajupdf",
3
+ "version": "0.1.0",
4
+ "description": "Turn PDFs into clean markdown + extracted images (macOS)",
5
+ "type": "module",
6
+ "bin": {
7
+ "cajupdf": "dist/cli.js"
8
+ },
9
+ "main": "./dist/index.js",
10
+ "exports": {
11
+ ".": {
12
+ "types": "./dist/index.d.ts",
13
+ "import": "./dist/index.js"
14
+ }
15
+ },
16
+ "publishConfig": {
17
+ "access": "public"
18
+ },
19
+ "files": [
20
+ "dist",
21
+ "README.md",
22
+ "LICENSE"
23
+ ],
24
+ "engines": {
25
+ "node": ">=22"
26
+ },
27
+ "os": [
28
+ "darwin"
29
+ ],
30
+ "keywords": [
31
+ "pdf",
32
+ "markdown",
33
+ "ocr",
34
+ "vision",
35
+ "poppler",
36
+ "pdfimages",
37
+ "extract",
38
+ "cli",
39
+ "macos"
40
+ ],
41
+ "license": "MIT",
42
+ "author": "Julio Cesar Ody",
43
+ "homepage": "https://github.com/juliocesar/cajupdf#readme",
44
+ "repository": {
45
+ "type": "git",
46
+ "url": "git+https://github.com/juliocesar/cajupdf.git"
47
+ },
48
+ "bugs": {
49
+ "url": "https://github.com/juliocesar/cajupdf/issues"
50
+ },
51
+ "scripts": {
52
+ "build": "vite build",
53
+ "dev": "vite build --watch",
54
+ "start": "node dist/cli.js",
55
+ "typecheck": "tsc --noEmit",
56
+ "lint": "eslint .",
57
+ "format": "prettier --write \"**/*.{ts,js,mjs,json,md}\"",
58
+ "format:check": "prettier --check \"**/*.{ts,js,mjs,json,md}\"",
59
+ "test": "vitest run",
60
+ "test:watch": "vitest",
61
+ "prepublishOnly": "pnpm run build"
62
+ },
63
+ "dependencies": {
64
+ "chalk": "^5.6.2",
65
+ "unpdf": "^1.4.0"
66
+ },
67
+ "devDependencies": {
68
+ "@eslint/js": "^9.39.4",
69
+ "@types/node": "^22.10.0",
70
+ "eslint": "^9.39.4",
71
+ "eslint-config-prettier": "^10.1.8",
72
+ "eslint-plugin-only-warn": "^1.2.1",
73
+ "eslint-plugin-prettier": "^5.5.6",
74
+ "globals": "^17.6.0",
75
+ "prettier": "^3.8.3",
76
+ "typescript": "~6.0.3",
77
+ "typescript-eslint": "^8.60.1",
78
+ "vite": "^8.0.14",
79
+ "vite-plugin-dts": "^4",
80
+ "vitest": "^4.1.7"
81
+ }
82
+ }