cajupdf 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +135 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +136 -0
- package/dist/extract-pdf-Bnge6uHj.js +192 -0
- package/dist/index.d.ts +11 -0
- package/dist/index.js +2 -0
- package/dist/lib/extract-pdf.d.ts +25 -0
- package/dist/lib/images.d.ts +17 -0
- package/dist/lib/markdown.d.ts +9 -0
- package/dist/lib/resolve-output-names.d.ts +22 -0
- package/dist/lib/slugify.d.ts +6 -0
- package/dist/lib/spinner.d.ts +11 -0
- package/dist/lib/text.d.ts +16 -0
- package/package.json +82 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Julio Cesar Ody
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
# cajupdf 🥜
|
|
2
|
+
|
|
3
|
+
**Crack open a PDF, get clean markdown and its pictures.**
|
|
4
|
+
|
|
5
|
+
`caju` is the cashew — that funny little nut that grows _outside_ its fruit, in
|
|
6
|
+
plain sight. `cajupdf` does the same trick to your PDFs: it cracks the shell and
|
|
7
|
+
hands you the good stuff — readable markdown, plus every embedded image, laid out
|
|
8
|
+
where you can actually use them.
|
|
9
|
+
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
## What it does
|
|
13
|
+
|
|
14
|
+
Three stages, one command:
|
|
15
|
+
|
|
16
|
+
1. **Text** — pulled with [`unpdf`](https://github.com/unjs/unpdf) (PDF.js under
|
|
17
|
+
the hood). Scanned/image-only PDFs fall back to the macOS **Vision framework
|
|
18
|
+
OCR** automatically — no setup, no API keys.
|
|
19
|
+
2. **Images / figures / sigils** — extracted at full fidelity with Poppler's
|
|
20
|
+
`pdfimages` (JPEGs stay JPEGs, PNGs stay PNGs).
|
|
21
|
+
3. **Markdown** — assembled with YAML front matter (title, author, page & image
|
|
22
|
+
counts) and one `## Page N` section per page, images linked inline.
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Requirements
|
|
27
|
+
|
|
28
|
+
- **macOS** — cajupdf leans on the system Vision framework for OCR, so it's
|
|
29
|
+
macOS-only for now.
|
|
30
|
+
- **Node ≥ 22**
|
|
31
|
+
- **`pdfimages`** (Poppler) — only needed when extracting images:
|
|
32
|
+
`brew install poppler`. Skip it entirely with `--no-images`.
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## Install
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
npm i -g cajupdf
|
|
40
|
+
# or
|
|
41
|
+
pnpm add -g cajupdf
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
Then:
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
cajupdf some.pdf
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
Or run it from a clone:
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
pnpm install
|
|
54
|
+
pnpm build
|
|
55
|
+
node dist/cli.js some.pdf
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
---
|
|
59
|
+
|
|
60
|
+
## Usage
|
|
61
|
+
|
|
62
|
+
```
|
|
63
|
+
cajupdf <pdf> [<pdf> ...] [options]
|
|
64
|
+
|
|
65
|
+
Options:
|
|
66
|
+
--no-images Skip image extraction (images are extracted by default).
|
|
67
|
+
--url-friendly Slugify output names to kebab-case (default: verbatim).
|
|
68
|
+
--out-dir <dir> Where to write outputs (default: current directory).
|
|
69
|
+
-h, --help Show help.
|
|
70
|
+
-v, --version Show version.
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Pass as many PDFs as you like — each is processed independently, and one bad file
|
|
74
|
+
won't sink the rest (the run just exits non-zero if any failed).
|
|
75
|
+
|
|
76
|
+
---
|
|
77
|
+
|
|
78
|
+
## Output layout
|
|
79
|
+
|
|
80
|
+
The markdown is `<name>.md`, and its images sit in a sibling `<name>-images/`
|
|
81
|
+
directory.
|
|
82
|
+
|
|
83
|
+
**Default (verbatim names):**
|
|
84
|
+
|
|
85
|
+
```
|
|
86
|
+
My Book.pdf → My Book.md
|
|
87
|
+
My Book-images/
|
|
88
|
+
img-002-000.jpg
|
|
89
|
+
...
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
Verbatim names can contain spaces, so image links are percent-encoded and
|
|
93
|
+
angle-bracket-wrapped to stay valid markdown:
|
|
94
|
+
`![img-002-000.jpg](<My%20Book-images/img-002-000.jpg>)`.
|
|
95
|
+
|
|
96
|
+
**`--url-friendly` (kebab-case):**
|
|
97
|
+
|
|
98
|
+
```
|
|
99
|
+
My Book.pdf → my-book.md
|
|
100
|
+
my-book-images/
|
|
101
|
+
img-002-000.jpg
|
|
102
|
+
...
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
---
|
|
106
|
+
|
|
107
|
+
## Programmatic API
|
|
108
|
+
|
|
109
|
+
`cajupdf` is also a library. The one-call workhorse:
|
|
110
|
+
|
|
111
|
+
```ts
|
|
112
|
+
import { extractPdf } from 'cajupdf'
|
|
113
|
+
|
|
114
|
+
const result = await extractPdf('My Book.pdf', {
|
|
115
|
+
images: true, // extract images (default)
|
|
116
|
+
urlFriendly: false, // keep names verbatim (default)
|
|
117
|
+
outDir: './out', // default: cwd
|
|
118
|
+
onProgress: p => console.log(p.stage, p.current, p.total),
|
|
119
|
+
})
|
|
120
|
+
|
|
121
|
+
// → { mdPath, imageDir, pageCount, imageCount }
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
Or reach for the individual stages — `parsePDF`, `extractImages`, `toMarkdown`,
|
|
125
|
+
`slugify`, `resolveOutputNames` — all exported with full types.
|
|
126
|
+
|
|
127
|
+
---
|
|
128
|
+
|
|
129
|
+
## License
|
|
130
|
+
|
|
131
|
+
MIT
|
|
132
|
+
|
|
133
|
+
---
|
|
134
|
+
|
|
135
|
+
_Built to be cracked open. Mind the shell._ 🥜🐘
|
package/dist/cli.d.ts
ADDED
package/dist/cli.js
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { t as e } from "./extract-pdf-Bnge6uHj.js";
|
|
3
|
+
import t from "node:fs";
|
|
4
|
+
import n from "node:path";
|
|
5
|
+
import { fileURLToPath as r } from "node:url";
|
|
6
|
+
import i from "chalk";
|
|
7
|
+
//#region src/lib/spinner.ts
|
|
8
|
+
var a = [
|
|
9
|
+
"⠋",
|
|
10
|
+
"⠙",
|
|
11
|
+
"⠹",
|
|
12
|
+
"⠸",
|
|
13
|
+
"⠼",
|
|
14
|
+
"⠴",
|
|
15
|
+
"⠦",
|
|
16
|
+
"⠧",
|
|
17
|
+
"⠇",
|
|
18
|
+
"⠏"
|
|
19
|
+
], o = 80;
|
|
20
|
+
function s(e, t, n) {
|
|
21
|
+
let r = process.stdout.columns ?? 80, i = Math.max(10, r - t - n - 1);
|
|
22
|
+
return e.length <= i ? e : e.slice(0, Math.max(1, i - 1)) + "…";
|
|
23
|
+
}
|
|
24
|
+
function c(e, t, n = 15) {
|
|
25
|
+
let r = t > 0 ? Math.min(e / t, 1) : 0, a = Math.round(r * n), o = n - a, s = i.cyan("█".repeat(a)) + i.dim("░".repeat(o)), c = `${Math.round(r * 100)}%`.padStart(4);
|
|
26
|
+
return `[${s}] ${i.dim(c)}`;
|
|
27
|
+
}
|
|
28
|
+
function l(e) {
|
|
29
|
+
let t = 0, n = e, r = 0, l = 0, u = null, d = () => {
|
|
30
|
+
let e = i.cyan(a[t]), o = l > 0 ? ` ${c(r, l)}` : "", u = s(n, 2, l > 0 ? 23 : 0);
|
|
31
|
+
process.stdout.write(`\r${e} ${i.dim(u)}${o}\x1b[K`), t = (t + 1) % a.length;
|
|
32
|
+
};
|
|
33
|
+
return u = setInterval(d, o), d(), {
|
|
34
|
+
update(e) {
|
|
35
|
+
n = e;
|
|
36
|
+
},
|
|
37
|
+
setProgress(e, t) {
|
|
38
|
+
r = e, l = t;
|
|
39
|
+
},
|
|
40
|
+
stop(e) {
|
|
41
|
+
u &&= (clearInterval(u), null), e ? process.stdout.write(`\r${i.dim(e)}\x1b[K\n`) : process.stdout.write("\r\x1B[K");
|
|
42
|
+
},
|
|
43
|
+
succeed(e) {
|
|
44
|
+
u &&= (clearInterval(u), null), process.stdout.write(`\r${i.green("✓")} ${i.dim(e)}\x1b[K\n`);
|
|
45
|
+
},
|
|
46
|
+
fail(e) {
|
|
47
|
+
u &&= (clearInterval(u), null), process.stdout.write(`\r${i.red("✗")} ${i.dim(e)}\x1b[K\n`);
|
|
48
|
+
}
|
|
49
|
+
};
|
|
50
|
+
}
|
|
51
|
+
//#endregion
|
|
52
|
+
//#region src/cli.ts
|
|
53
|
+
function u() {
|
|
54
|
+
console.error("cajupdf — crack open a PDF, get clean markdown and its pictures. 🥜"), console.error(""), console.error("Usage: cajupdf <pdf> [<pdf> ...] [options]"), console.error(""), console.error("Options:"), console.error(" --no-images Skip image extraction (images are extracted by default)."), console.error(" --url-friendly Slugify output names to kebab-case (default: verbatim)."), console.error(" --out-dir <dir> Where to write outputs (default: current directory)."), console.error(" -h, --help Show this help."), console.error(" -v, --version Show version.");
|
|
55
|
+
}
|
|
56
|
+
function d() {
|
|
57
|
+
let e = n.dirname(r(import.meta.url));
|
|
58
|
+
return JSON.parse(t.readFileSync(n.join(e, "..", "package.json"), "utf-8")).version;
|
|
59
|
+
}
|
|
60
|
+
/**
 * Parse the CLI arguments that follow the node binary and script path.
 *
 * Recognised options: `--no-images`, `--url-friendly`, `--out-dir <dir>` and
 * `--out-dir=<dir>`. Every other non-empty argument that does not start with
 * `-` is collected as a PDF path.
 *
 * @param {string[]} e - Raw arguments (typically `process.argv.slice(2)`).
 * @returns {{ pdfs: string[], images: boolean, urlFriendly: boolean, outDir: (string|undefined) } | null}
 *   The parsed options, or `null` when the arguments are unusable (unknown
 *   option, missing `--out-dir` value, or no PDF given).
 */
function f(e) {
	const OUT_DIR_PREFIX = "--out-dir=";
	const pdfs = [];
	let images = true;
	let urlFriendly = false;
	let outDir;
	for (let idx = 0; idx < e.length; idx++) {
		const arg = e[idx];
		// Empty-string arguments are ignored rather than treated as paths.
		if (!arg) continue;
		if (arg === "--no-images") {
			images = false;
			continue;
		}
		if (arg === "--url-friendly") {
			urlFriendly = true;
			continue;
		}
		if (arg === "--out-dir") {
			const value = e[++idx];
			// A following option (e.g. `--out-dir --no-images`) means the value
			// was forgotten; do not swallow that option as a directory name.
			if (!value || value.startsWith("-")) return null;
			outDir = value;
			continue;
		}
		if (arg.startsWith(OUT_DIR_PREFIX)) {
			const value = arg.slice(OUT_DIR_PREFIX.length);
			if (!value) return null;
			outDir = value;
			continue;
		}
		// Anything else that looks like an option is unknown.
		if (arg.startsWith("-")) return null;
		pdfs.push(arg);
	}
	if (pdfs.length === 0) return null;
	return {
		pdfs,
		images,
		urlFriendly,
		outDir
	};
}
|
|
89
|
+
async function p(r, i) {
|
|
90
|
+
let a = n.resolve(r), o = n.basename(a);
|
|
91
|
+
if (!t.existsSync(a)) return console.error(`✗ File not found: ${r}`), !1;
|
|
92
|
+
if (n.extname(a).toLowerCase() !== ".pdf") return console.error(`✗ Not a PDF: ${r}`), !1;
|
|
93
|
+
let s = l(`Extracting ${o}`), c = (e) => {
|
|
94
|
+
switch (e.stage) {
|
|
95
|
+
case "loading":
|
|
96
|
+
s.update(`Loading ${o}`);
|
|
97
|
+
break;
|
|
98
|
+
case "parsing":
|
|
99
|
+
s.update(`Parsing text ${o}`), s.setProgress(e.current, e.total);
|
|
100
|
+
break;
|
|
101
|
+
case "ocr":
|
|
102
|
+
s.update(`OCR (Vision) ${o}`), s.setProgress(e.current, e.total);
|
|
103
|
+
break;
|
|
104
|
+
case "images":
|
|
105
|
+
s.update(`Extracting images ${o}`), e.total > 0 && s.setProgress(e.current, e.total);
|
|
106
|
+
break;
|
|
107
|
+
case "done": break;
|
|
108
|
+
}
|
|
109
|
+
};
|
|
110
|
+
try {
|
|
111
|
+
let t = await e(a, {
|
|
112
|
+
...i,
|
|
113
|
+
onProgress: c
|
|
114
|
+
}), r = n.basename(t.mdPath);
|
|
115
|
+
return s.succeed(`${o} → ${r} (${t.pageCount} pages, ${t.imageCount} images)`), !0;
|
|
116
|
+
} catch (e) {
|
|
117
|
+
return s.fail(`Failed: ${o}`), console.error(e instanceof Error ? e.message : e), !1;
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
async function m() {
|
|
121
|
+
let e = process.argv.slice(2);
|
|
122
|
+
(e[0] === "-h" || e[0] === "--help") && (u(), process.exit(0)), (e[0] === "-v" || e[0] === "--version") && (console.log(d()), process.exit(0));
|
|
123
|
+
let t = f(e);
|
|
124
|
+
t || (u(), process.exit(2));
|
|
125
|
+
let n = {
|
|
126
|
+
images: t.images,
|
|
127
|
+
urlFriendly: t.urlFriendly,
|
|
128
|
+
outDir: t.outDir
|
|
129
|
+
}, r = !1;
|
|
130
|
+
for (let e of t.pdfs) await p(e, n) || (r = !0);
|
|
131
|
+
process.exit(+!!r);
|
|
132
|
+
}
|
|
133
|
+
m().catch((e) => {
|
|
134
|
+
console.error(e), process.exit(1);
|
|
135
|
+
});
|
|
136
|
+
//#endregion
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
import e from "node:fs";
|
|
2
|
+
import t from "node:path";
|
|
3
|
+
import { execFile as n } from "node:child_process";
|
|
4
|
+
import { extractText as r, getDocumentProxy as i } from "unpdf";
|
|
5
|
+
import { promisify as a } from "node:util";
|
|
6
|
+
//#region src/lib/text.ts
|
|
7
|
+
var o = () => new Promise((e) => setTimeout(e, 10));
|
|
8
|
+
/**
 * Read a PDF from disk and extract its text page by page.
 *
 * Reports progress through the optional callback ("loading", "parsing",
 * "ocr", "done"). When no page yields any text and the process is running on
 * macOS (`process.platform === "darwin"`), the pages are re-read through the
 * OCR helper instead.
 *
 * @param {string} filePath - Path of the PDF to read.
 * @param {(progress: { stage: string, current: number, total: number }) => void} [onProgress]
 *   Optional progress callback.
 * @returns {Promise<{ text: string, pageCount: number, pages: string[], metadata: { title: string, author: (string|undefined) } }>}
 *   Trimmed per-page text, the pages joined by blank lines, and metadata. The
 *   title falls back to the file name without its `.pdf` extension.
 */
async function s(filePath, onProgress) {
	const buffer = e.readFileSync(filePath);
	onProgress?.({
		stage: "loading",
		current: 0,
		total: 0
	});
	// Brief pause after each progress report (see `o`).
	await o();
	const doc = await i(new Uint8Array(buffer));
	const pageCount = doc.numPages;
	let title;
	let author;
	try {
		const info = (await doc.getMetadata())?.info;
		title = info?.Title;
		author = info?.Author;
	} catch {
		// Metadata is best-effort: a failure here must not abort extraction.
	}
	onProgress?.({
		stage: "parsing",
		current: 0,
		total: pageCount
	});
	await o();
	const { text: rawPages } = await r(doc, { mergePages: false });
	// Report every tenth page, exactly as before (indices 0, 10, 20, ...).
	for (let idx = 0; idx < rawPages.length; idx += 10) {
		onProgress?.({
			stage: "parsing",
			current: idx + 1,
			total: pageCount
		});
		await o();
	}
	let pages = rawPages.map((pageText) => pageText.trim());
	const hasText = pages.some((pageText) => pageText.length > 0);
	if (!hasText && process.platform === "darwin") {
		onProgress?.({
			stage: "ocr",
			current: 0,
			total: pageCount
		});
		await o();
		pages = await l(filePath, pageCount, onProgress);
	}
	onProgress?.({
		stage: "done",
		current: pageCount,
		total: pageCount
	});
	// Strip the extension case-insensitively so "Book.PDF" yields "Book",
	// matching the CLI, which accepts ".pdf" in any letter case.
	const ext = t.extname(filePath);
	const fallbackTitle = ext.toLowerCase() === ".pdf" ? t.basename(filePath, ext) : t.basename(filePath);
	return {
		text: pages.join("\n\n"),
		pageCount,
		pages,
		metadata: {
			title: title || fallbackTitle,
			author
		}
	};
}
|
|
50
|
+
var c = "\nimport Foundation\nimport PDFKit\nimport Vision\n\nguard CommandLine.arguments.count > 1 else {\n fputs(\"Usage: ocr-pdf <path>\\n\", stderr)\n exit(1)\n}\n\nlet pdfPath = CommandLine.arguments[1]\nguard let doc = PDFDocument(url: URL(fileURLWithPath: pdfPath)) else {\n fputs(\"Failed to open PDF\\n\", stderr)\n exit(1)\n}\n\nvar result: [[String: Any]] = []\n\nfor i in 0..<doc.pageCount {\n guard let page = doc.page(at: i) else {\n result.append([\"page\": i + 1, \"text\": \"\"])\n continue\n }\n\n // Render page to CGImage for Vision OCR\n let bounds = page.bounds(for: .mediaBox)\n let scale: CGFloat = 2.0 // 2x for better OCR accuracy\n let width = Int(bounds.width * scale)\n let height = Int(bounds.height * scale)\n\n guard let ctx = CGContext(\n data: nil,\n width: width,\n height: height,\n bitsPerComponent: 8,\n bytesPerRow: 0,\n space: CGColorSpaceCreateDeviceRGB(),\n bitmapInfo: CGImageAlphaInfo.premultipliedFirst.rawValue\n ) else {\n result.append([\"page\": i + 1, \"text\": \"\"])\n continue\n }\n\n ctx.setFillColor(CGColor.white)\n ctx.fill(CGRect(x: 0, y: 0, width: width, height: height))\n ctx.scaleBy(x: scale, y: scale)\n\n // PDFKit renders in a flipped coordinate space\n NSGraphicsContext.saveGraphicsState()\n let nsCtx = NSGraphicsContext(cgContext: ctx, flipped: false)\n NSGraphicsContext.current = nsCtx\n page.draw(with: .mediaBox, to: ctx)\n NSGraphicsContext.restoreGraphicsState()\n\n guard let cgImage = ctx.makeImage() else {\n result.append([\"page\": i + 1, \"text\": \"\"])\n continue\n }\n\n let semaphore = DispatchSemaphore(value: 0)\n var pageText = \"\"\n\n let request = VNRecognizeTextRequest { request, error in\n if let observations = request.results as? 
[VNRecognizedTextObservation] {\n pageText = observations\n .compactMap { $0.topCandidates(1).first?.string }\n .joined(separator: \"\\n\")\n }\n semaphore.signal()\n }\n request.recognitionLevel = .accurate\n request.usesLanguageCorrection = true\n\n let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])\n try? handler.perform([request])\n semaphore.wait()\n\n result.append([\"page\": i + 1, \"text\": pageText])\n\n // Progress: write page number to stderr so the caller can track it\n fputs(\"PAGE:\\(i + 1)\\n\", stderr)\n}\n\nlet jsonData = try! JSONSerialization.data(withJSONObject: result)\nprint(String(data: jsonData, encoding: .utf8)!)\n";
|
|
51
|
+
/**
 * OCR every page of a PDF by running the embedded Swift script with
 * `/usr/bin/swift`.
 *
 * The script is written into a freshly created private temporary directory,
 * so concurrent calls in one process cannot overwrite or delete each other's
 * script, and the path is not predictable. The directory is removed afterwards
 * whether or not OCR succeeded.
 *
 * @param {string} pdfPath - Path of the PDF handed to the Swift script.
 * @param {number} pageCount - Total page count, reported as `total` in progress events.
 * @param {(progress: { stage: string, current: number, total: number }) => void} [onProgress]
 *   Called with stage "ocr" for each `PAGE:<n>` line the script writes to stderr.
 * @returns {Promise<string[]>} Trimmed text of each entry in the script's JSON output.
 * @throws {Error} When the Swift process fails or its stdout is not the expected JSON.
 */
async function l(pdfPath, pageCount, onProgress) {
	const tmpRoot = process.env.TMPDIR || "/tmp";
	const workDir = e.mkdtempSync(t.join(tmpRoot, "cajupdf-ocr-"));
	const scriptPath = t.join(workDir, "ocr.swift");
	try {
		e.writeFileSync(scriptPath, c);
		return await new Promise((resolve, reject) => {
			const child = n("/usr/bin/swift", [scriptPath, pdfPath], {
				maxBuffer: 100 * 1024 * 1024,
				timeout: 600 * 1e3
			}, (err, stdout) => {
				if (err) {
					reject(new Error(`OCR failed: ${err.message}`, { cause: err }));
					return;
				}
				try {
					resolve(JSON.parse(stdout).map((entry) => entry.text.trim()));
				} catch (parseErr) {
					reject(new Error(`Failed to parse OCR output: ${parseErr}`, { cause: parseErr }));
				}
			});
			// stderr arrives in arbitrary chunks; keep the trailing partial line
			// until its newline shows up so a split "PAGE:12" is not lost.
			let pending = "";
			child.stderr?.on("data", (chunk) => {
				pending += chunk.toString();
				const lines = pending.split("\n");
				pending = lines.pop() ?? "";
				for (const line of lines) {
					const match = line.match(/^PAGE:(\d+)$/);
					if (match?.[1]) {
						onProgress?.({
							stage: "ocr",
							current: parseInt(match[1], 10),
							total: pageCount
						});
					}
				}
			});
		});
	} finally {
		try {
			e.rmSync(workDir, { recursive: true, force: true });
		} catch {
			// Cleanup is best-effort; a leftover temp directory is harmless.
		}
	}
}
|
|
89
|
+
//#endregion
|
|
90
|
+
//#region src/lib/images.ts
|
|
91
|
+
var u = a(n);
|
|
92
|
+
async function d(n, r, i = "img", a) {
|
|
93
|
+
try {
|
|
94
|
+
await u("pdfimages", ["-v"]);
|
|
95
|
+
} catch (e) {
|
|
96
|
+
if (e.code === "ENOENT") throw Error("pdfimages not found on PATH. Install Poppler: `brew install poppler` (macOS) or your distro equivalent.");
|
|
97
|
+
}
|
|
98
|
+
e.mkdirSync(r, { recursive: !0 }), a?.({
|
|
99
|
+
stage: "images",
|
|
100
|
+
current: 0,
|
|
101
|
+
total: 0
|
|
102
|
+
}), await u("pdfimages", [
|
|
103
|
+
"-p",
|
|
104
|
+
"-all",
|
|
105
|
+
n,
|
|
106
|
+
t.join(r, i)
|
|
107
|
+
], { maxBuffer: 100 * 1024 * 1024 });
|
|
108
|
+
let o = RegExp(`^${f(i)}-(\\d+)-(\\d+)\\.([A-Za-z0-9]+)$`), s = e.readdirSync(r), c = [];
|
|
109
|
+
for (let e of s) {
|
|
110
|
+
let n = e.match(o);
|
|
111
|
+
!n?.[1] || !n[2] || c.push({
|
|
112
|
+
page: parseInt(n[1], 10),
|
|
113
|
+
index: parseInt(n[2], 10),
|
|
114
|
+
filename: e,
|
|
115
|
+
path: t.join(r, e)
|
|
116
|
+
});
|
|
117
|
+
}
|
|
118
|
+
return c.sort((e, t) => e.page - t.page || e.index - t.index), a?.({
|
|
119
|
+
stage: "images",
|
|
120
|
+
current: c.length,
|
|
121
|
+
total: c.length
|
|
122
|
+
}), c;
|
|
123
|
+
}
|
|
124
|
+
function f(e) {
|
|
125
|
+
return e.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
|
126
|
+
}
|
|
127
|
+
//#endregion
|
|
128
|
+
//#region src/lib/markdown.ts
|
|
129
|
+
/**
 * Assemble the markdown document for a parsed PDF.
 *
 * Output is YAML front matter (title, optional author, page and image counts)
 * followed by one `## Page N` section per page that has text or images. Each
 * image is linked after the page text.
 *
 * @param {{ pageCount: number, pages: string[], metadata: { title?: string, author?: string } }} parsed
 *   Result of the text-extraction stage.
 * @param {{ page: number, index: number, filename: string, path: string }[]} images
 *   Extracted images; `page` is the 1-based page each image belongs to.
 * @param {{ imageDir: string, title?: string }} options
 *   `imageDir` is the directory used in image links; `title` overrides the
 *   parsed title.
 * @returns {string} The markdown text.
 */
function p(parsed, images, options) {
	const title = options.title ?? parsed.metadata.title ?? "Untitled";
	const author = parsed.metadata.author;
	// Group images by page number, preserving the order they were given in.
	const imagesByPage = new Map();
	for (const image of images) {
		const bucket = imagesByPage.get(image.page) ?? [];
		bucket.push(image);
		imagesByPage.set(image.page, bucket);
	}
	const lines = ["---", `title: ${h(title)}`];
	if (author) lines.push(`author: ${h(author)}`);
	lines.push(`pages: ${parsed.pageCount}`, `images: ${images.length}`, "---", "");
	for (let idx = 0; idx < parsed.pages.length; idx++) {
		const pageNumber = idx + 1;
		const text = parsed.pages[idx]?.trim() ?? "";
		const pageImages = imagesByPage.get(pageNumber) ?? [];
		// Pages with neither text nor images get no section at all.
		if (!text && pageImages.length === 0) continue;
		lines.push(`## Page ${pageNumber}`, "");
		if (text) lines.push(text, "");
		// NOTE(review): the original template was garbled in this listing; the
		// file name is used as alt text here — confirm against the source repo.
		for (const image of pageImages) {
			lines.push(`![${image.filename}](${m(options.imageDir, image.filename)})`);
		}
		if (pageImages.length > 0) lines.push("");
	}
	return lines.join("\n");
}
|
|
147
|
+
/**
 * Build the link destination for an image inside a markdown document.
 *
 * The path `<dir>/<file>` is percent-encoded. `#` and `?` are encoded as well
 * (`encodeURI` leaves them alone) so they are not read as a URL fragment or
 * query. If encoding changed anything, or the path contains parentheses, the
 * destination is wrapped in angle brackets; otherwise it is returned verbatim.
 *
 * @param {string} dir - Image directory as it should appear in the link.
 * @param {string} file - Image file name.
 * @returns {string} The link destination, angle-bracket-wrapped when needed.
 */
function m(dir, file) {
	const raw = `${dir}/${file}`;
	const encoded = encodeURI(raw).replace(
		/[?#]/g,
		(ch) => `%${ch.charCodeAt(0).toString(16).toUpperCase()}`
	);
	const isPlain = encoded === raw && !/[()]/.test(raw);
	return isPlain ? raw : `<${encoded}>`;
}
|
|
151
|
+
/**
 * Format a value as a YAML scalar for the front matter.
 *
 * Ordinary text is returned unchanged. Anything a YAML parser could misread is
 * emitted as a JSON string literal instead: text containing `:`, `#`, quotes,
 * backslashes or control whitespace; text starting with a YAML indicator;
 * empty or padded text; and text that would parse as a boolean, null or
 * number.
 *
 * @param {string} e - The raw value (for example a PDF title or author).
 * @returns {string} The value, JSON-quoted when required.
 */
function h(e) {
	const value = String(e);
	const needsQuoting =
		value === "" ||
		value !== value.trim() ||
		/[:#\n\r\t"'\\]/.test(value) ||
		// Characters that start flow collections, anchors, tags, block scalars...
		/^[[\]{},&*!|>%@`]/.test(value) ||
		// "-" and "?" are indicators only when followed by whitespace or the end.
		/^[-?](?:\s|$)/.test(value) ||
		/^(?:true|false|yes|no|on|off|null|~)$/i.test(value) ||
		/^[-+]?\.(?:inf|nan)$/i.test(value) ||
		!Number.isNaN(Number(value));
	return needsQuoting ? JSON.stringify(value) : value;
}
|
|
154
|
+
//#endregion
|
|
155
|
+
//#region src/lib/slugify.ts
|
|
156
|
+
/**
 * Turn an arbitrary string into a kebab-case slug.
 *
 * The input is Unicode-decomposed, combining marks (U+0300–U+036F) and
 * apostrophe-like characters are removed, the rest is lower-cased, and every
 * run of characters outside `a-z0-9` becomes a single hyphen. Slugs longer
 * than `t` are cut at `t` characters, then shortened to the last hyphen when
 * that hyphen lies past the halfway point.
 *
 * @param {string} e - Text to slugify.
 * @param {number} [t=60] - Maximum slug length.
 * @returns {string} The slug, or "untitled" when nothing usable remains.
 */
function g(e, t = 60) {
	const slug = e
		.normalize("NFKD")
		// Written as escapes so the range survives editors and transcoding.
		.replace(/[\u0300-\u036f]/g, "")
		.replace(/['\u2018\u2019`\u00b4]/g, "")
		.toLowerCase()
		.replace(/[^a-z0-9]+/g, "-")
		.replace(/^-+|-+$/g, "");
	if (slug.length <= t) return slug || "untitled";
	const clipped = slug.slice(0, t);
	const lastHyphen = clipped.lastIndexOf("-");
	const shortened = (lastHyphen > t / 2 ? clipped.slice(0, lastHyphen) : clipped).replace(/-+$/, "");
	// Truncation can leave nothing (e.g. t = 0); never return an empty name.
	return shortened || "untitled";
}
|
|
162
|
+
//#endregion
|
|
163
|
+
//#region src/lib/resolve-output-names.ts
|
|
164
|
+
function _(e, n = {}) {
|
|
165
|
+
let r = t.resolve(n.outDir ?? process.cwd()), i = t.basename(e, t.extname(e)), a = n.urlFriendly ? g(i) : i, o = `${a}-images`;
|
|
166
|
+
return {
|
|
167
|
+
baseName: a,
|
|
168
|
+
mdPath: t.join(r, `${a}.md`),
|
|
169
|
+
imageDir: t.join(r, o),
|
|
170
|
+
imageDirName: o
|
|
171
|
+
};
|
|
172
|
+
}
|
|
173
|
+
//#endregion
|
|
174
|
+
//#region src/lib/extract-pdf.ts
|
|
175
|
+
/**
 * Run the whole pipeline for one PDF: resolve output names, parse the text,
 * optionally extract images, assemble the markdown and write it to disk.
 *
 * @param {string} pdfPath - Path of the PDF to process.
 * @param {{ images?: boolean, urlFriendly?: boolean, outDir?: string, onProgress?: Function }} [options]
 *   `images` defaults to true and `urlFriendly` to false; `outDir` and
 *   `onProgress` are passed on to the individual stages.
 * @returns {Promise<{ mdPath: string, imageDir: (string|null), pageCount: number, imageCount: number }>}
 *   Where the markdown was written, the image directory (null when no images
 *   were extracted), and the page and image counts.
 */
async function v(pdfPath, options = {}) {
	const { images = true, urlFriendly = false, outDir, onProgress } = options;
	const names = _(pdfPath, {
		urlFriendly,
		outDir
	});
	const parsed = await s(pdfPath, onProgress);
	let extracted = [];
	if (images) {
		extracted = await d(pdfPath, names.imageDir, "img", onProgress);
		if (extracted.length === 0) {
			try {
				// Do not leave an empty image directory behind.
				e.rmdirSync(names.imageDir);
			} catch {
				// Best-effort: the directory may be non-empty or already gone.
			}
		}
	}
	const markdown = p(parsed, extracted, { imageDir: names.imageDirName });
	// With images disabled nothing else creates the output directory, so make
	// sure it exists before writing the markdown file.
	e.mkdirSync(t.dirname(names.mdPath), { recursive: true });
	e.writeFileSync(names.mdPath, markdown, "utf-8");
	return {
		mdPath: names.mdPath,
		imageDir: extracted.length > 0 ? names.imageDir : null,
		pageCount: parsed.pageCount,
		imageCount: extracted.length
	};
}
|
|
191
|
+
//#endregion
|
|
192
|
+
export { d as a, p as i, _ as n, s as o, g as r, v as t };
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
export { extractPdf } from './lib/extract-pdf';
|
|
2
|
+
export { parsePDF } from './lib/text';
|
|
3
|
+
export { extractImages } from './lib/images';
|
|
4
|
+
export { toMarkdown } from './lib/markdown';
|
|
5
|
+
export { slugify } from './lib/slugify';
|
|
6
|
+
export { resolveOutputNames } from './lib/resolve-output-names';
|
|
7
|
+
export type { ExtractPdfOptions, ExtractPdfResult } from './lib/extract-pdf';
|
|
8
|
+
export type { ParsedPDF, ParseProgress, ProgressCallback } from './lib/text';
|
|
9
|
+
export type { ExtractedImage } from './lib/images';
|
|
10
|
+
export type { ToMarkdownOptions } from './lib/markdown';
|
|
11
|
+
export type { OutputNames, ResolveOutputNamesOptions } from './lib/resolve-output-names';
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import { ProgressCallback } from './text';
|
|
2
|
+
export interface ExtractPdfOptions {
|
|
3
|
+
/** Extract embedded images via Poppler. Default: true. */
|
|
4
|
+
images?: boolean;
|
|
5
|
+
/** Slugify output names to kebab-case. Default: false (verbatim). */
|
|
6
|
+
urlFriendly?: boolean;
|
|
7
|
+
/** Directory to write outputs in. Default: current working directory. */
|
|
8
|
+
outDir?: string;
|
|
9
|
+
/** Progress callback for spinner/UI wiring. */
|
|
10
|
+
onProgress?: ProgressCallback;
|
|
11
|
+
}
|
|
12
|
+
export interface ExtractPdfResult {
|
|
13
|
+
/** Absolute path of the markdown file written. */
|
|
14
|
+
mdPath: string;
|
|
15
|
+
/** Absolute path of the image directory, or null when images were skipped/empty. */
|
|
16
|
+
imageDir: string | null;
|
|
17
|
+
pageCount: number;
|
|
18
|
+
imageCount: number;
|
|
19
|
+
}
|
|
20
|
+
/**
|
|
21
|
+
* Run the full PDF → markdown pipeline for a single file: parse text (with macOS
|
|
22
|
+
* OCR fallback), optionally extract images, assemble markdown, and write it to
|
|
23
|
+
* disk. Returns where things landed.
|
|
24
|
+
*/
|
|
25
|
+
export declare function extractPdf(pdfPath: string, options?: ExtractPdfOptions): Promise<ExtractPdfResult>;
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import { ProgressCallback } from './text';
|
|
2
|
+
export interface ExtractedImage {
|
|
3
|
+
page: number;
|
|
4
|
+
index: number;
|
|
5
|
+
filename: string;
|
|
6
|
+
path: string;
|
|
7
|
+
}
|
|
8
|
+
/**
|
|
9
|
+
* Extract embedded images from a PDF using Poppler's `pdfimages`.
|
|
10
|
+
*
|
|
11
|
+
* Requires `pdfimages` on PATH (install via `brew install poppler` on macOS).
|
|
12
|
+
*
|
|
13
|
+
* Files are written as `<outDir>/<prefix>-<page>-<n>.<ext>` where the page
|
|
14
|
+
* number is zero-padded by pdfimages. `-all` preserves original encodings
|
|
15
|
+
* (JPEGs stay JPEGs, etc.) for highest fidelity.
|
|
16
|
+
*/
|
|
17
|
+
export declare function extractImages(pdfPath: string, outDir: string, prefix?: string, onProgress?: ProgressCallback): Promise<ExtractedImage[]>;
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import { ParsedPDF } from './text';
|
|
2
|
+
import { ExtractedImage } from './images';
|
|
3
|
+
export interface ToMarkdownOptions {
|
|
4
|
+
/** Directory (relative to the .md file) where images live, e.g. "My Book-images". */
|
|
5
|
+
imageDir: string;
|
|
6
|
+
/** Optional title override; defaults to parsed metadata title. */
|
|
7
|
+
title?: string;
|
|
8
|
+
}
|
|
9
|
+
export declare function toMarkdown(parsed: ParsedPDF, images: ExtractedImage[], options: ToMarkdownOptions): string;
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
export interface OutputNames {
|
|
2
|
+
/** Markdown stem (source filename without extension; slugified if url-friendly). */
|
|
3
|
+
baseName: string;
|
|
4
|
+
/** Absolute path of the markdown file to write. */
|
|
5
|
+
mdPath: string;
|
|
6
|
+
/** Absolute path of the image directory (`<baseName>-images`). */
|
|
7
|
+
imageDir: string;
|
|
8
|
+
/** Image directory name relative to the markdown file, for use in image links. */
|
|
9
|
+
imageDirName: string;
|
|
10
|
+
}
|
|
11
|
+
export interface ResolveOutputNamesOptions {
|
|
12
|
+
/** Slugify names to kebab-case instead of keeping them verbatim. */
|
|
13
|
+
urlFriendly?: boolean;
|
|
14
|
+
/** Directory to place outputs in. Defaults to the current working directory. */
|
|
15
|
+
outDir?: string;
|
|
16
|
+
}
|
|
17
|
+
/**
|
|
18
|
+
* Decide where a PDF's outputs land. The markdown is `<baseName>.md` and its
|
|
19
|
+
* images go in a sibling `<baseName>-images/` directory. Default keeps the
|
|
20
|
+
* source filename verbatim; `--url-friendly` slugifies it.
|
|
21
|
+
*/
|
|
22
|
+
export declare function resolveOutputNames(pdfPath: string, options?: ResolveOutputNamesOptions): OutputNames;
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Turn an arbitrary string into a url-friendly kebab-case slug: lowercased,
|
|
3
|
+
* diacritics stripped, apostrophes dropped, runs of other characters collapsed
|
|
4
|
+
* to single hyphens, and truncated at a word boundary within `maxLength`.
|
|
5
|
+
*/
|
|
6
|
+
export declare function slugify(input: string, maxLength?: number): string;
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
export interface Spinner {
|
|
2
|
+
update(text: string): void;
|
|
3
|
+
stop(finalText?: string): void;
|
|
4
|
+
succeed(text: string): void;
|
|
5
|
+
fail(text: string): void;
|
|
6
|
+
}
|
|
7
|
+
export interface ProgressSpinner extends Spinner {
|
|
8
|
+
setProgress(current: number, total: number): void;
|
|
9
|
+
}
|
|
10
|
+
export declare function createSpinner(initialText: string): Spinner;
|
|
11
|
+
export declare function createProgressSpinner(initialText: string): ProgressSpinner;
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
export interface ParsedPDF {
|
|
2
|
+
text: string;
|
|
3
|
+
pageCount: number;
|
|
4
|
+
pages: string[];
|
|
5
|
+
metadata: {
|
|
6
|
+
title?: string;
|
|
7
|
+
author?: string;
|
|
8
|
+
};
|
|
9
|
+
}
|
|
10
|
+
export interface ParseProgress {
|
|
11
|
+
stage: 'loading' | 'parsing' | 'ocr' | 'images' | 'done';
|
|
12
|
+
current: number;
|
|
13
|
+
total: number;
|
|
14
|
+
}
|
|
15
|
+
export type ProgressCallback = (progress: ParseProgress) => void;
|
|
16
|
+
export declare function parsePDF(filePath: string, onProgress?: ProgressCallback): Promise<ParsedPDF>;
|
package/package.json
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "cajupdf",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Turn PDFs into clean markdown + extracted images (macOS)",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"bin": {
|
|
7
|
+
"cajupdf": "dist/cli.js"
|
|
8
|
+
},
|
|
9
|
+
"main": "./dist/index.js",
|
|
10
|
+
"exports": {
|
|
11
|
+
".": {
|
|
12
|
+
"types": "./dist/index.d.ts",
|
|
13
|
+
"import": "./dist/index.js"
|
|
14
|
+
}
|
|
15
|
+
},
|
|
16
|
+
"publishConfig": {
|
|
17
|
+
"access": "public"
|
|
18
|
+
},
|
|
19
|
+
"files": [
|
|
20
|
+
"dist",
|
|
21
|
+
"README.md",
|
|
22
|
+
"LICENSE"
|
|
23
|
+
],
|
|
24
|
+
"engines": {
|
|
25
|
+
"node": ">=22"
|
|
26
|
+
},
|
|
27
|
+
"os": [
|
|
28
|
+
"darwin"
|
|
29
|
+
],
|
|
30
|
+
"keywords": [
|
|
31
|
+
"pdf",
|
|
32
|
+
"markdown",
|
|
33
|
+
"ocr",
|
|
34
|
+
"vision",
|
|
35
|
+
"poppler",
|
|
36
|
+
"pdfimages",
|
|
37
|
+
"extract",
|
|
38
|
+
"cli",
|
|
39
|
+
"macos"
|
|
40
|
+
],
|
|
41
|
+
"license": "MIT",
|
|
42
|
+
"author": "Julio Cesar Ody",
|
|
43
|
+
"homepage": "https://github.com/juliocesar/cajupdf#readme",
|
|
44
|
+
"repository": {
|
|
45
|
+
"type": "git",
|
|
46
|
+
"url": "git+https://github.com/juliocesar/cajupdf.git"
|
|
47
|
+
},
|
|
48
|
+
"bugs": {
|
|
49
|
+
"url": "https://github.com/juliocesar/cajupdf/issues"
|
|
50
|
+
},
|
|
51
|
+
"scripts": {
|
|
52
|
+
"build": "vite build",
|
|
53
|
+
"dev": "vite build --watch",
|
|
54
|
+
"start": "node dist/cli.js",
|
|
55
|
+
"typecheck": "tsc --noEmit",
|
|
56
|
+
"lint": "eslint .",
|
|
57
|
+
"format": "prettier --write \"**/*.{ts,js,mjs,json,md}\"",
|
|
58
|
+
"format:check": "prettier --check \"**/*.{ts,js,mjs,json,md}\"",
|
|
59
|
+
"test": "vitest run",
|
|
60
|
+
"test:watch": "vitest",
|
|
61
|
+
"prepublishOnly": "pnpm run build"
|
|
62
|
+
},
|
|
63
|
+
"dependencies": {
|
|
64
|
+
"chalk": "^5.6.2",
|
|
65
|
+
"unpdf": "^1.4.0"
|
|
66
|
+
},
|
|
67
|
+
"devDependencies": {
|
|
68
|
+
"@eslint/js": "^9.39.4",
|
|
69
|
+
"@types/node": "^22.10.0",
|
|
70
|
+
"eslint": "^9.39.4",
|
|
71
|
+
"eslint-config-prettier": "^10.1.8",
|
|
72
|
+
"eslint-plugin-only-warn": "^1.2.1",
|
|
73
|
+
"eslint-plugin-prettier": "^5.5.6",
|
|
74
|
+
"globals": "^17.6.0",
|
|
75
|
+
"prettier": "^3.8.3",
|
|
76
|
+
"typescript": "~6.0.3",
|
|
77
|
+
"typescript-eslint": "^8.60.1",
|
|
78
|
+
"vite": "^8.0.14",
|
|
79
|
+
"vite-plugin-dts": "^4",
|
|
80
|
+
"vitest": "^4.1.7"
|
|
81
|
+
}
|
|
82
|
+
}
|