afpp 2.1.6 → 2.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +82 -108
- package/package.json +13 -11
package/README.md
CHANGED
|
@@ -1,119 +1,126 @@
|
|
|
1
1
|
# afpp
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-

|
|
5
|
-
[codecov](https://codecov.io/github/l2ysho/afpp)
|
|
6
|
-

|
|
7
|
-

|
|
8
|
-

|
|
9
|
-

|
|
10
|
-

|
|
3
|
+
   
|
|
11
4
|
|
|
12
|
-
|
|
5
|
+
> **afpp** — A modern, dependency-light PDF parser for Node.js.
|
|
6
|
+
>
|
|
7
|
+
> Built for performance, reliability, and developer sanity.
|
|
13
8
|
|
|
14
|
-
|
|
9
|
+
---
|
|
15
10
|
|
|
16
|
-
|
|
11
|
+
## Overview
|
|
17
12
|
|
|
18
|
-
|
|
13
|
+
`afpp` (Another PDF Parser, Properly) is a Node.js library for extracting text and images from PDF files without heavyweight native dependencies, event-loop blocking, or fragile runtime assumptions.
|
|
19
14
|
|
|
20
|
-
|
|
21
|
-
- 🧵 Why is the event loop blocked?
|
|
22
|
-
- 🐏 Is that a memory leak I smell?
|
|
23
|
-
- 🐌 Should reading a PDF really be this performance-heavy?
|
|
24
|
-
- 🐞 Why is everything so buggy?
|
|
25
|
-
- 🎨 Why does it complain about the lack of a canvas in Node.js?
|
|
26
|
-
- 🧱 Why does canvas require native C++/Python dependencies to build?
|
|
27
|
-
- 🪟 Why does it complain about the missing window object?
|
|
28
|
-
- 🪄 Why do I need ImageMagick for this?!
|
|
29
|
-
- 👻 What the hell is Ghostscript, and why does it keep failing?
|
|
30
|
-
- ❌ Where’s the TypeScript support?
|
|
31
|
-
- 🧓 Why are the dependencies older than my dev career?
|
|
32
|
-
- 🔐 Why does everything work… until I try an encrypted PDF?
|
|
33
|
-
- 🕯️ Why does every OS need its own special setup ritual?
|
|
15
|
+
The project was created to address recurring problems encountered with existing PDF tooling in the Node.js ecosystem:
|
|
34
16
|
|
|
35
|
-
|
|
17
|
+
- Excessive bundle sizes and transitive dependencies
|
|
18
|
+
- Native build steps (canvas, ImageMagick, Ghostscript)
|
|
19
|
+
- Browser-specific assumptions (`window`, DOM, canvas)
|
|
20
|
+
- Poor TypeScript support
|
|
21
|
+
- Unreliable handling of encrypted PDFs
|
|
22
|
+
- Performance and memory inefficiencies
|
|
36
23
|
|
|
37
|
-
|
|
24
|
+
`afpp` focuses on **predictable behavior**, **explicit APIs**, and **production-ready defaults**.
|
|
38
25
|
|
|
39
|
-
|
|
26
|
+
---
|
|
40
27
|
|
|
41
|
-
|
|
28
|
+
## Key Features
|
|
42
29
|
|
|
43
|
-
|
|
30
|
+
- Zero native build dependencies
|
|
31
|
+
- Fully asynchronous, non-blocking architecture
|
|
32
|
+
- First-class TypeScript support
|
|
33
|
+
- Supports local files, buffers, and remote URLs
|
|
34
|
+
- Handles encrypted PDFs
|
|
35
|
+
- Configurable concurrency and rendering scale
|
|
36
|
+
- Minimal and auditable dependency graph
|
|
44
37
|
|
|
45
|
-
|
|
46
|
-
npm install afpp
|
|
47
|
-
```
|
|
38
|
+
---
|
|
48
39
|
|
|
49
|
-
|
|
40
|
+
## Requirements
|
|
50
41
|
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
42
|
+
- **Node.js** >= 22.14.0
|
|
43
|
+
|
|
44
|
+
---
|
|
45
|
+
|
|
46
|
+
## Installation
|
|
54
47
|
|
|
55
|
-
|
|
48
|
+
Install using your preferred package manager:
|
|
56
49
|
|
|
57
50
|
```bash
|
|
51
|
+
npm install afpp
|
|
52
|
+
# or
|
|
53
|
+
yarn add afpp
|
|
54
|
+
# or
|
|
58
55
|
pnpm add afpp
|
|
59
56
|
```
|
|
60
57
|
|
|
61
|
-
|
|
58
|
+
---
|
|
59
|
+
|
|
60
|
+
## Quick Start
|
|
62
61
|
|
|
63
|
-
|
|
62
|
+
All parsing functions accept the same input types:
|
|
64
63
|
|
|
65
|
-
|
|
64
|
+
- `string` (file path)
|
|
65
|
+
- `Buffer`
|
|
66
|
+
- `URL`
|
|
67
|
+
|
|
68
|
+
### Extract Text from a PDF
|
|
66
69
|
|
|
67
70
|
```ts
|
|
68
71
|
import { readFile } from 'fs/promises';
|
|
69
72
|
import path from 'path';
|
|
70
|
-
|
|
71
73
|
import { pdf2string } from 'afpp';
|
|
72
74
|
|
|
73
|
-
(async
|
|
74
|
-
const
|
|
75
|
-
const
|
|
76
|
-
const data = await pdf2string(input);
|
|
75
|
+
(async () => {
|
|
76
|
+
const filePath = path.join('..', 'test', 'example.pdf');
|
|
77
|
+
const buffer = await readFile(filePath);
|
|
77
78
|
|
|
78
|
-
|
|
79
|
+
const pages = await pdf2string(buffer);
|
|
80
|
+
console.log(pages); // ['Page 1 text', 'Page 2 text', ...]
|
|
79
81
|
})();
|
|
80
82
|
```
|
|
81
83
|
|
|
82
|
-
|
|
84
|
+
---
|
|
85
|
+
|
|
86
|
+
### Render PDF Pages as Images
|
|
83
87
|
|
|
84
88
|
```ts
|
|
85
89
|
import { pdf2image } from 'afpp';
|
|
86
90
|
|
|
87
|
-
(async
|
|
91
|
+
(async () => {
|
|
88
92
|
const url = new URL('https://pdfobject.com/pdf/sample.pdf');
|
|
89
|
-
const
|
|
93
|
+
const images = await pdf2image(url);
|
|
90
94
|
|
|
91
|
-
console.log(
|
|
95
|
+
console.log(images); // [Buffer, Buffer, ...]
|
|
92
96
|
})();
|
|
93
97
|
```
|
|
94
98
|
|
|
95
|
-
|
|
99
|
+
---
|
|
100
|
+
|
|
101
|
+
### Low-Level Parsing API
|
|
102
|
+
|
|
103
|
+
For advanced use cases, `parsePdf` exposes page-level control and transformation.
|
|
96
104
|
|
|
97
105
|
```ts
|
|
98
106
|
import { parsePdf } from 'afpp';
|
|
99
107
|
|
|
100
|
-
(async
|
|
101
|
-
// Download PDF from URL
|
|
108
|
+
(async () => {
|
|
102
109
|
const response = await fetch('https://pdfobject.com/pdf/sample.pdf');
|
|
103
110
|
const buffer = Buffer.from(await response.arrayBuffer());
|
|
104
111
|
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
console.log('Parsed PDF:', result);
|
|
112
|
+
const result = await parsePdf(buffer, {}, (pageContent) => pageContent);
|
|
113
|
+
console.log(result);
|
|
108
114
|
})();
|
|
109
115
|
```
|
|
110
116
|
|
|
111
|
-
|
|
117
|
+
---
|
|
118
|
+
|
|
119
|
+
## Configuration
|
|
112
120
|
|
|
113
|
-
|
|
114
|
-
Example usage
|
|
121
|
+
All public APIs accept a shared options object.
|
|
115
122
|
|
|
116
|
-
```
|
|
123
|
+
```ts
|
|
117
124
|
const result = await parsePdf(buffer, {
|
|
118
125
|
concurrency: 5,
|
|
119
126
|
imageEncoding: 'jpeg',
|
|
@@ -122,59 +129,26 @@ const result = await parsePdf(buffer, {
|
|
|
122
129
|
});
|
|
123
130
|
```
|
|
124
131
|
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
### concurrency?
|
|
128
|
-
|
|
129
|
-
> `optional` **concurrency**: `number`
|
|
130
|
-
|
|
131
|
-
Concurrency level for page processing. Defaults to 1.
|
|
132
|
-
Higher values may improve performance but increase memory usage.
|
|
133
|
-
|
|
134
|
-
#### Default
|
|
135
|
-
|
|
136
|
-
```ts
|
|
137
|
-
1;
|
|
138
|
-
```
|
|
139
|
-
|
|
140
|
-
---
|
|
141
|
-
|
|
142
|
-
### imageEncoding?
|
|
143
|
-
|
|
144
|
-
> `optional` **imageEncoding**: [`ImageEncoding`](../type-aliases/ImageEncoding.md)
|
|
132
|
+
### AfppParseOptions
|
|
145
133
|
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
'png';
|
|
153
|
-
```
|
|
134
|
+
| Option | Type | Default | Description |
|
|
135
|
+
| --------------- | ------------------------------------- | ------- | ------------------------------------- |
|
|
136
|
+
| `concurrency` | `number` | `1` | Number of pages processed in parallel |
|
|
137
|
+
| `imageEncoding` | `'png' \| 'jpeg' \| 'webp' \| 'avif'` | `'png'` | Output format for rendered images |
|
|
138
|
+
| `password` | `string` | — | Password for encrypted PDFs |
|
|
139
|
+
| `scale` | `number` | `2.0` | Rendering scale for non-text pages and for all `pdf2image` output |
|
|
154
140
|
|
|
155
141
|
---
|
|
156
142
|
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
> `optional` **password**: `string`
|
|
143
|
+
## Design Principles
|
|
160
144
|
|
|
161
|
-
|
|
145
|
+
- **Node-first**: No browser globals or DOM assumptions
|
|
146
|
+
- **Explicit over implicit**: No magic configuration
|
|
147
|
+
- **Fail fast**: Clear errors instead of silent corruption
|
|
148
|
+
- **Production-oriented**: Optimized for long-running processes
|
|
162
149
|
|
|
163
150
|
---
|
|
164
151
|
|
|
165
|
-
### scale?
|
|
166
|
-
|
|
167
|
-
> `optional` **scale**: `number`
|
|
168
|
-
|
|
169
|
-
Scale of a page if content is not text (or pdf2image is used). Defaults to 2.0.
|
|
170
|
-
Higher values increase image resolution but also memory usage.
|
|
171
|
-
|
|
172
|
-
#### Default
|
|
173
|
-
|
|
174
|
-
```ts
|
|
175
|
-
2.0;
|
|
176
|
-
```
|
|
177
|
-
|
|
178
152
|
## License
|
|
179
153
|
|
|
180
|
-
|
|
154
|
+
MIT © Richard Solár
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "afpp",
|
|
3
|
-
"version": "2.1.6",
|
|
3
|
+
"version": "2.1.7",
|
|
4
4
|
"description": "another f*cking pdf parser",
|
|
5
5
|
"types": "./dist/index.d.ts",
|
|
6
6
|
"main": "./dist/index.js",
|
|
@@ -46,30 +46,32 @@
|
|
|
46
46
|
},
|
|
47
47
|
"homepage": "https://github.com/l2ysho/afpp#readme",
|
|
48
48
|
"devDependencies": {
|
|
49
|
-
"@commitlint/cli": "20.
|
|
50
|
-
"@commitlint/config-conventional": "20.
|
|
51
|
-
"@
|
|
49
|
+
"@commitlint/cli": "20.2.0",
|
|
50
|
+
"@commitlint/config-conventional": "20.2.0",
|
|
51
|
+
"@semantic-release/changelog": "6.0.3",
|
|
52
|
+
"@semantic-release/git": "10.0.1",
|
|
53
|
+
"@types/node": "25.0.9",
|
|
52
54
|
"@voxpelli/node-test-pretty-reporter": "1.1.2",
|
|
53
55
|
"c8": "10.1.3",
|
|
54
56
|
"commitizen": "4.3.1",
|
|
55
57
|
"cz-conventional-changelog": "3.3.0",
|
|
56
|
-
"eslint": "9.39.
|
|
58
|
+
"eslint": "9.39.2",
|
|
57
59
|
"eslint-config-prettier": "10.1.8",
|
|
58
60
|
"eslint-plugin-import": "2.32.0",
|
|
59
|
-
"eslint-plugin-perfectionist": "
|
|
61
|
+
"eslint-plugin-perfectionist": "5.0.0",
|
|
60
62
|
"eslint-plugin-prettier": "5.5.4",
|
|
61
63
|
"eslint-plugin-promise": "7.2.1",
|
|
62
64
|
"husky": "9.1.7",
|
|
63
|
-
"lint-staged": "16.2.
|
|
64
|
-
"prettier": "3.
|
|
65
|
+
"lint-staged": "16.2.7",
|
|
66
|
+
"prettier": "3.7.4",
|
|
65
67
|
"semantic-release": "25.0.2",
|
|
66
68
|
"tsc-alias": "1.8.16",
|
|
67
|
-
"tsx": "4.
|
|
69
|
+
"tsx": "4.21.0",
|
|
68
70
|
"typescript": "5.9.3",
|
|
69
|
-
"typescript-eslint": "8.
|
|
71
|
+
"typescript-eslint": "8.50.0"
|
|
70
72
|
},
|
|
71
73
|
"dependencies": {
|
|
72
74
|
"p-limit": "7.2.0",
|
|
73
|
-
"pdfjs-dist": "5.4.
|
|
75
|
+
"pdfjs-dist": "5.4.449"
|
|
74
76
|
}
|
|
75
77
|
}
|