afpp 2.1.6 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +82 -103
- package/package.json +21 -15
package/README.md
CHANGED
|
@@ -1,119 +1,131 @@
|
|
|
1
1
|
# afpp
|
|
2
2
|
|
|
3
3
|

|
|
4
|
-

|
|
5
4
|
[](https://codecov.io/github/l2ysho/afpp)
|
|
6
|
-

|
|
5
|
+

|
|
7
6
|

|
|
8
7
|

|
|
9
8
|

|
|
10
|
-

|
|
11
9
|
|
|
12
|
-
|
|
10
|
+
> **afpp** — A modern, dependency-light PDF parser for Node.js.
|
|
11
|
+
>
|
|
12
|
+
> Built for performance, reliability, and developer sanity.
|
|
13
13
|
|
|
14
|
-
|
|
14
|
+
---
|
|
15
15
|
|
|
16
|
-
|
|
16
|
+
## Overview
|
|
17
17
|
|
|
18
|
-
|
|
18
|
+
`afpp` (Another PDF Parser, Properly) is a Node.js library for extracting text and images from PDF files without heavyweight native dependencies, event-loop blocking, or fragile runtime assumptions.
|
|
19
19
|
|
|
20
|
-
|
|
21
|
-
- 🧵 Why is the event loop blocked?
|
|
22
|
-
- π Is that a memory leak I smell?
|
|
23
|
-
- π Should reading a PDF really be this performance-heavy?
|
|
24
|
-
- π Why is everything so buggy?
|
|
25
|
-
- 🎨 Why does it complain about the lack of a canvas in Node.js?
|
|
26
|
-
- 🧱 Why does canvas require native C++/Python dependencies to build?
|
|
27
|
-
- 🪟 Why does it complain about the missing window object?
|
|
28
|
-
- 🪄 Why do I need ImageMagick for this?!
|
|
29
|
-
- 👻 What the hell is Ghostscript, and why does it keep failing?
|
|
30
|
-
- β Where’s the TypeScript support?
|
|
31
|
-
- 🧓 Why are the dependencies older than my dev career?
|
|
32
|
-
- π Why does everything work… until I try an encrypted PDF?
|
|
33
|
-
- 🕯️ Why does every OS need its own special setup ritual?
|
|
20
|
+
The project was created to address recurring problems encountered with existing PDF tooling in the Node.js ecosystem:
|
|
34
21
|
|
|
35
|
-
|
|
22
|
+
- Excessive bundle sizes and transitive dependencies
|
|
23
|
+
- Native build steps (canvas, ImageMagick, Ghostscript)
|
|
24
|
+
- Browser-specific assumptions (`window`, DOM, canvas)
|
|
25
|
+
- Poor TypeScript support
|
|
26
|
+
- Unreliable handling of encrypted PDFs
|
|
27
|
+
- Performance and memory inefficiencies
|
|
36
28
|
|
|
37
|
-
|
|
29
|
+
`afpp` focuses on **predictable behavior**, **explicit APIs**, and **production-ready defaults**.
|
|
38
30
|
|
|
39
|
-
|
|
31
|
+
---
|
|
40
32
|
|
|
41
|
-
|
|
33
|
+
## Key Features
|
|
42
34
|
|
|
43
|
-
|
|
35
|
+
- Zero native build dependencies
|
|
36
|
+
- Fully asynchronous, non-blocking architecture
|
|
37
|
+
- First-class TypeScript support
|
|
38
|
+
- Supports local files, buffers, and remote URLs
|
|
39
|
+
- Handles encrypted PDFs
|
|
40
|
+
- Configurable concurrency and rendering scale
|
|
41
|
+
- Minimal and auditable dependency graph
|
|
44
42
|
|
|
45
|
-
|
|
46
|
-
npm install afpp
|
|
47
|
-
```
|
|
43
|
+
---
|
|
48
44
|
|
|
49
|
-
|
|
45
|
+
## Requirements
|
|
50
46
|
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
47
|
+
- **Node.js** >= 22.14.0
|
|
48
|
+
|
|
49
|
+
---
|
|
50
|
+
|
|
51
|
+
## Installation
|
|
54
52
|
|
|
55
|
-
|
|
53
|
+
Install using your preferred package manager:
|
|
56
54
|
|
|
57
55
|
```bash
|
|
56
|
+
npm install afpp
|
|
57
|
+
# or
|
|
58
|
+
yarn add afpp
|
|
59
|
+
# or
|
|
58
60
|
pnpm add afpp
|
|
59
61
|
```
|
|
60
62
|
|
|
61
|
-
|
|
63
|
+
---
|
|
64
|
+
|
|
65
|
+
## Quick Start
|
|
62
66
|
|
|
63
|
-
|
|
67
|
+
All parsing functions accept the same input types:
|
|
64
68
|
|
|
65
|
-
|
|
69
|
+
- `string` (file path)
|
|
70
|
+
- `Buffer`
|
|
71
|
+
- `URL`
|
|
72
|
+
|
|
73
|
+
### Extract Text from a PDF
|
|
66
74
|
|
|
67
75
|
```ts
|
|
68
76
|
import { readFile } from 'fs/promises';
|
|
69
77
|
import path from 'path';
|
|
70
|
-
|
|
71
78
|
import { pdf2string } from 'afpp';
|
|
72
79
|
|
|
73
|
-
(async
|
|
74
|
-
const
|
|
75
|
-
const
|
|
76
|
-
const data = await pdf2string(input);
|
|
80
|
+
(async () => {
|
|
81
|
+
const filePath = path.join('..', 'test', 'example.pdf');
|
|
82
|
+
const buffer = await readFile(filePath);
|
|
77
83
|
|
|
78
|
-
|
|
84
|
+
const pages = await pdf2string(buffer);
|
|
85
|
+
console.log(pages); // ['Page 1 text', 'Page 2 text', ...]
|
|
79
86
|
})();
|
|
80
87
|
```
|
|
81
88
|
|
|
82
|
-
|
|
89
|
+
---
|
|
90
|
+
|
|
91
|
+
### Render PDF Pages as Images
|
|
83
92
|
|
|
84
93
|
```ts
|
|
85
94
|
import { pdf2image } from 'afpp';
|
|
86
95
|
|
|
87
|
-
(async
|
|
96
|
+
(async () => {
|
|
88
97
|
const url = new URL('https://pdfobject.com/pdf/sample.pdf');
|
|
89
|
-
const
|
|
98
|
+
const images = await pdf2image(url);
|
|
90
99
|
|
|
91
|
-
console.log(
|
|
100
|
+
console.log(images); // [Buffer, Buffer, ...]
|
|
92
101
|
})();
|
|
93
102
|
```
|
|
94
103
|
|
|
95
|
-
|
|
104
|
+
---
|
|
105
|
+
|
|
106
|
+
### Low-Level Parsing API
|
|
107
|
+
|
|
108
|
+
For advanced use cases, `parsePdf` exposes page-level control and transformation.
|
|
96
109
|
|
|
97
110
|
```ts
|
|
98
111
|
import { parsePdf } from 'afpp';
|
|
99
112
|
|
|
100
|
-
(async
|
|
101
|
-
// Download PDF from URL
|
|
113
|
+
(async () => {
|
|
102
114
|
const response = await fetch('https://pdfobject.com/pdf/sample.pdf');
|
|
103
115
|
const buffer = Buffer.from(await response.arrayBuffer());
|
|
104
116
|
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
console.log('Parsed PDF:', result);
|
|
117
|
+
const result = await parsePdf(buffer, {}, (pageContent) => pageContent);
|
|
118
|
+
console.log(result);
|
|
108
119
|
})();
|
|
109
120
|
```
|
|
110
121
|
|
|
111
|
-
|
|
122
|
+
---
|
|
123
|
+
|
|
124
|
+
## Configuration
|
|
112
125
|
|
|
113
|
-
|
|
114
|
-
Example usage
|
|
126
|
+
All public APIs accept a shared options object.
|
|
115
127
|
|
|
116
|
-
```
|
|
128
|
+
```ts
|
|
117
129
|
const result = await parsePdf(buffer, {
|
|
118
130
|
concurrency: 5,
|
|
119
131
|
imageEncoding: 'jpeg',
|
|
@@ -122,59 +134,26 @@ const result = await parsePdf(buffer, {
|
|
|
122
134
|
});
|
|
123
135
|
```
|
|
124
136
|
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
### concurrency?
|
|
128
|
-
|
|
129
|
-
> `optional` **concurrency**: `number`
|
|
130
|
-
|
|
131
|
-
Concurrency level for page processing. Defaults to 1.
|
|
132
|
-
Higher values may improve performance but increase memory usage.
|
|
133
|
-
|
|
134
|
-
#### Default
|
|
135
|
-
|
|
136
|
-
```ts
|
|
137
|
-
1;
|
|
138
|
-
```
|
|
139
|
-
|
|
140
|
-
---
|
|
141
|
-
|
|
142
|
-
### imageEncoding?
|
|
143
|
-
|
|
144
|
-
> `optional` **imageEncoding**: [`ImageEncoding`](../type-aliases/ImageEncoding.md)
|
|
137
|
+
### AfppParseOptions
|
|
145
138
|
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
'png';
|
|
153
|
-
```
|
|
139
|
+
| Option | Type | Default | Description |
|
|
140
|
+
| --------------- | ------------------------------------- | ------- | ------------------------------------- |
|
|
141
|
+
| `concurrency` | `number` | `1` | Number of pages processed in parallel |
|
|
142
|
+
| `imageEncoding` | `'png' \| 'jpeg' \| 'webp' \| 'avif'` | `'png'` | Output format for rendered images |
|
|
143
|
+
| `password` | `string` | — | Password for encrypted PDFs |
|
|
144
|
+
| `scale` | `number` | `2.0` | Rendering scale for non-text pages |
|
|
154
145
|
|
|
155
146
|
---
|
|
156
147
|
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
> `optional` **password**: `string`
|
|
148
|
+
## Design Principles
|
|
160
149
|
|
|
161
|
-
|
|
150
|
+
- **Node-first**: No browser globals or DOM assumptions
|
|
151
|
+
- **Explicit over implicit**: No magic configuration
|
|
152
|
+
- **Fail fast**: Clear errors instead of silent corruption
|
|
153
|
+
- **Production-oriented**: Optimized for long-running processes
|
|
162
154
|
|
|
163
155
|
---
|
|
164
156
|
|
|
165
|
-
### scale?
|
|
166
|
-
|
|
167
|
-
> `optional` **scale**: `number`
|
|
168
|
-
|
|
169
|
-
Scale of a page if content is not text (or pdf2image is used). Defaults to 2.0.
|
|
170
|
-
Higher values increase image resolution but also memory usage.
|
|
171
|
-
|
|
172
|
-
#### Default
|
|
173
|
-
|
|
174
|
-
```ts
|
|
175
|
-
2.0;
|
|
176
|
-
```
|
|
177
|
-
|
|
178
157
|
## License
|
|
179
158
|
|
|
180
|
-
|
|
159
|
+
MIT © Richard Solár
|
package/package.json
CHANGED
|
@@ -1,23 +1,27 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "afpp",
|
|
3
|
-
"version": "2.
|
|
4
|
-
"description": "
|
|
3
|
+
"version": "2.2.0",
|
|
4
|
+
"description": "Async Fast PDF Parser for Node.js — dependency-light, TypeScript-first, production-ready.",
|
|
5
5
|
"types": "./dist/index.d.ts",
|
|
6
6
|
"main": "./dist/index.js",
|
|
7
7
|
"files": [
|
|
8
|
-
"dist/**"
|
|
8
|
+
"dist/**",
|
|
9
|
+
"README.md",
|
|
10
|
+
"LICENSE"
|
|
9
11
|
],
|
|
10
12
|
"scripts": {
|
|
11
13
|
"build": "tsc -p tsconfig.build.json",
|
|
12
14
|
"commit": "git-cz",
|
|
15
|
+
"format": "prettier --write .",
|
|
13
16
|
"lint": "eslint .",
|
|
17
|
+
"lint:fix": "eslint . --fix",
|
|
14
18
|
"postbuild": "tsc-alias -p tsconfig.build.json",
|
|
15
19
|
"prebuild": "rm -rf dist",
|
|
16
20
|
"prepare": "husky",
|
|
17
21
|
"pretest:coverage": "rm -rf coverage",
|
|
18
22
|
"test": "NODE_ENV=test npx tsx --test --test-reporter @voxpelli/node-test-pretty-reporter test/*.test.ts",
|
|
19
|
-
"test:single": "NODE_ENV=test npx tsx --test --test-reporter @voxpelli/node-test-pretty-reporter",
|
|
20
23
|
"test:coverage": "c8 --reporter=lcov npm test",
|
|
24
|
+
"test:single": "NODE_ENV=test npx tsx --test --test-reporter @voxpelli/node-test-pretty-reporter",
|
|
21
25
|
"typecheck": "tsc -p tsconfig.json --noEmit"
|
|
22
26
|
},
|
|
23
27
|
"repository": {
|
|
@@ -46,30 +50,32 @@
|
|
|
46
50
|
},
|
|
47
51
|
"homepage": "https://github.com/l2ysho/afpp#readme",
|
|
48
52
|
"devDependencies": {
|
|
49
|
-
"@commitlint/cli": "20.1
|
|
50
|
-
"@commitlint/config-conventional": "20.
|
|
51
|
-
"@
|
|
53
|
+
"@commitlint/cli": "20.3.1",
|
|
54
|
+
"@commitlint/config-conventional": "20.3.1",
|
|
55
|
+
"@semantic-release/changelog": "6.0.3",
|
|
56
|
+
"@semantic-release/git": "10.0.1",
|
|
57
|
+
"@types/node": "25.0.9",
|
|
52
58
|
"@voxpelli/node-test-pretty-reporter": "1.1.2",
|
|
53
59
|
"c8": "10.1.3",
|
|
54
60
|
"commitizen": "4.3.1",
|
|
55
61
|
"cz-conventional-changelog": "3.3.0",
|
|
56
|
-
"eslint": "9.39.
|
|
62
|
+
"eslint": "9.39.2",
|
|
57
63
|
"eslint-config-prettier": "10.1.8",
|
|
58
64
|
"eslint-plugin-import": "2.32.0",
|
|
59
|
-
"eslint-plugin-perfectionist": "
|
|
60
|
-
"eslint-plugin-prettier": "5.5.
|
|
65
|
+
"eslint-plugin-perfectionist": "5.3.1",
|
|
66
|
+
"eslint-plugin-prettier": "5.5.5",
|
|
61
67
|
"eslint-plugin-promise": "7.2.1",
|
|
62
68
|
"husky": "9.1.7",
|
|
63
|
-
"lint-staged": "16.2.
|
|
64
|
-
"prettier": "3.
|
|
69
|
+
"lint-staged": "16.2.7",
|
|
70
|
+
"prettier": "3.8.0",
|
|
65
71
|
"semantic-release": "25.0.2",
|
|
66
72
|
"tsc-alias": "1.8.16",
|
|
67
|
-
"tsx": "4.
|
|
73
|
+
"tsx": "4.21.0",
|
|
68
74
|
"typescript": "5.9.3",
|
|
69
|
-
"typescript-eslint": "8.
|
|
75
|
+
"typescript-eslint": "8.50.0"
|
|
70
76
|
},
|
|
71
77
|
"dependencies": {
|
|
72
78
|
"p-limit": "7.2.0",
|
|
73
|
-
"pdfjs-dist": "5.4.
|
|
79
|
+
"pdfjs-dist": "5.4.530"
|
|
74
80
|
}
|
|
75
81
|
}
|