pdf-defang 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdf_defang-0.1.0/LICENSE +21 -0
- pdf_defang-0.1.0/PKG-INFO +311 -0
- pdf_defang-0.1.0/README.md +273 -0
- pdf_defang-0.1.0/pdf_defang/__init__.py +38 -0
- pdf_defang-0.1.0/pdf_defang/_async.py +91 -0
- pdf_defang-0.1.0/pdf_defang/_bytes.py +157 -0
- pdf_defang-0.1.0/pdf_defang/_core.py +428 -0
- pdf_defang-0.1.0/pdf_defang/_scan.py +210 -0
- pdf_defang-0.1.0/pdf_defang/cli.py +234 -0
- pdf_defang-0.1.0/pdf_defang/py.typed +0 -0
- pdf_defang-0.1.0/pdf_defang.egg-info/PKG-INFO +311 -0
- pdf_defang-0.1.0/pdf_defang.egg-info/SOURCES.txt +26 -0
- pdf_defang-0.1.0/pdf_defang.egg-info/dependency_links.txt +1 -0
- pdf_defang-0.1.0/pdf_defang.egg-info/entry_points.txt +2 -0
- pdf_defang-0.1.0/pdf_defang.egg-info/requires.txt +9 -0
- pdf_defang-0.1.0/pdf_defang.egg-info/top_level.txt +1 -0
- pdf_defang-0.1.0/pyproject.toml +71 -0
- pdf_defang-0.1.0/setup.cfg +4 -0
- pdf_defang-0.1.0/tests/test_async.py +68 -0
- pdf_defang-0.1.0/tests/test_balanced.py +193 -0
- pdf_defang-0.1.0/tests/test_bytes.py +114 -0
- pdf_defang-0.1.0/tests/test_cli.py +117 -0
- pdf_defang-0.1.0/tests/test_edge_cases.py +115 -0
- pdf_defang-0.1.0/tests/test_encryption.py +130 -0
- pdf_defang-0.1.0/tests/test_new_protections.py +186 -0
- pdf_defang-0.1.0/tests/test_performance.py +102 -0
- pdf_defang-0.1.0/tests/test_sanitize.py +130 -0
- pdf_defang-0.1.0/tests/test_scan.py +99 -0
pdf_defang-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 kovetz.co.il
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,311 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pdf-defang
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Strip JavaScript, OpenAction, Launch actions and other active content from PDFs. Lightweight Python library.
|
|
5
|
+
Author-email: "kovetz.co.il" <contact@kovetz.co.il>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://kovetz.co.il
|
|
8
|
+
Project-URL: Repository, https://github.com/kovetz-PDF/pdf-defang
|
|
9
|
+
Project-URL: Issues, https://github.com/kovetz-PDF/pdf-defang/issues
|
|
10
|
+
Project-URL: Changelog, https://github.com/kovetz-PDF/pdf-defang/blob/main/CHANGELOG.md
|
|
11
|
+
Keywords: pdf,security,sanitize,sanitizer,defang,javascript,malware,pikepdf,openaction,active-content
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Information Technology
|
|
15
|
+
Classifier: Intended Audience :: System Administrators
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
23
|
+
Classifier: Topic :: Security
|
|
24
|
+
Classifier: Topic :: Office/Business
|
|
25
|
+
Classifier: Topic :: Utilities
|
|
26
|
+
Classifier: Typing :: Typed
|
|
27
|
+
Requires-Python: >=3.9
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
License-File: LICENSE
|
|
30
|
+
Requires-Dist: pikepdf<10.0,>=8.0
|
|
31
|
+
Provides-Extra: test
|
|
32
|
+
Requires-Dist: pytest>=7.0; extra == "test"
|
|
33
|
+
Requires-Dist: pytest-cov>=4.0; extra == "test"
|
|
34
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == "test"
|
|
35
|
+
Provides-Extra: docs
|
|
36
|
+
Requires-Dist: mkdocs-material>=9.5; extra == "docs"
|
|
37
|
+
Dynamic: license-file
|
|
38
|
+
|
|
39
|
+
# pdf-defang
|
|
40
|
+
|
|
41
|
+
> Strip JavaScript, OpenAction, Launch actions and other active content from PDFs.
|
|
42
|
+
> Lightweight Python library on top of [pikepdf](https://github.com/pikepdf/pikepdf).
|
|
43
|
+
> MIT licensed.
|
|
44
|
+
|
|
45
|
+
[PyPI version](https://pypi.org/project/pdf-defang/)
|
|
46
|
+
[Python versions](https://pypi.org/project/pdf-defang/)
|
|
47
|
+
[License: MIT](LICENSE)
|
|
48
|
+
|
|
49
|
+
---
|
|
50
|
+
|
|
51
|
+
## Why?
|
|
52
|
+
|
|
53
|
+
PDFs can carry executable content: JavaScript that runs when the file opens,
|
|
54
|
+
auto-actions that fire on every page navigation, "Launch" actions that try to
|
|
55
|
+
open other programs, embedded files that drop malware. If you process
|
|
56
|
+
user-uploaded PDFs in your app, you should strip this content before serving
|
|
57
|
+
them back.
|
|
58
|
+
|
|
59
|
+
The Python ecosystem has parsers (`pikepdf`, `pypdf`, `PyMuPDF`) and a heavy
|
|
60
|
+
container-based tool ([Dangerzone](https://dangerzone.rocks/)), but no clean
|
|
61
|
+
drop-in library that says "give me this PDF without active content." This is
|
|
62
|
+
that library.
|
|
63
|
+
|
|
64
|
+
## Install
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
pip install pdf-defang
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
Requires Python 3.9+ and pikepdf 8+.
|
|
71
|
+
|
|
72
|
+
## Quick start
|
|
73
|
+
|
|
74
|
+
### Python API
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
from pdf_defang import sanitize, scan
|
|
78
|
+
|
|
79
|
+
# Clean a file in place
|
|
80
|
+
sanitize("uploaded.pdf")
|
|
81
|
+
|
|
82
|
+
# Get a detailed report of what was removed
|
|
83
|
+
report = sanitize("uploaded.pdf", return_report=True)
|
|
84
|
+
print(report.javascript_in_names) # 2
|
|
85
|
+
print(report.open_action_removed) # True
|
|
86
|
+
print(report.annotation_action_types) # ['Launch']
|
|
87
|
+
print(report.dangerous_uris_removed) # 1
|
|
88
|
+
print(report.as_dict()) # JSON-serialisable
|
|
89
|
+
|
|
90
|
+
# Inspect a file WITHOUT modifying it
|
|
91
|
+
report = scan("suspicious.pdf")
|
|
92
|
+
print(report.risk_level) # 'high' / 'medium' / 'low' / 'none'
|
|
93
|
+
print(report.has_javascript) # True
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### Async API (FastAPI / aiohttp / asyncio)
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
from pdf_defang import sanitize_async, scan_async
|
|
100
|
+
|
|
101
|
+
async def handle_upload(path):
|
|
102
|
+
report = await sanitize_async(path, return_report=True)
|
|
103
|
+
return report.as_dict()
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### In-memory API (S3, Lambda, no disk)
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
from pdf_defang import sanitize_bytes
|
|
110
|
+
|
|
111
|
+
raw_pdf: bytes = ... # from S3, HTTP, anywhere
|
|
112
|
+
cleaned: bytes = sanitize_bytes(raw_pdf)
|
|
113
|
+
# No disk involved
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
### Encrypted PDFs (encryption preserved on output)
|
|
117
|
+
|
|
118
|
+
```python
|
|
119
|
+
sanitize("encrypted.pdf", password="hunter2")
|
|
120
|
+
# Still encrypted with the same password, JavaScript removed.
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
### Two levels: strict (default) vs balanced
|
|
124
|
+
|
|
125
|
+
```python
|
|
126
|
+
# Public uploads: kill everything active (safest)
|
|
127
|
+
sanitize("untrusted.pdf") # level="strict"
|
|
128
|
+
|
|
129
|
+
# Trusted internal forms that need Submit / Calculate buttons:
|
|
130
|
+
sanitize("expense_form.pdf", level="balanced")
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
Both levels strip pure attack vectors (`/Launch`, `/GoToR`, document
|
|
134
|
+
JavaScript, dangerous URI schemes, etc.). `balanced` additionally
|
|
135
|
+
preserves `/SubmitForm` / `/ResetForm` / form JS actions, annotation
|
|
136
|
+
`/AA` and `/JS` triggers, the AcroForm `/CO` calculation order, and
|
|
137
|
+
embedded files (used by PDF portfolios). Default is `strict`.
|
|
138
|
+
|
|
139
|
+
### Command line
|
|
140
|
+
|
|
141
|
+
```bash
|
|
142
|
+
# Clean a single file (strict by default)
|
|
143
|
+
pdf-defang clean uploaded.pdf
|
|
144
|
+
|
|
145
|
+
# Clean many at once
|
|
146
|
+
pdf-defang clean *.pdf
|
|
147
|
+
|
|
148
|
+
# Keep form interactivity working
|
|
149
|
+
pdf-defang clean --level balanced internal_form.pdf
|
|
150
|
+
|
|
151
|
+
# Inspect without changes
|
|
152
|
+
pdf-defang scan suspicious.pdf
|
|
153
|
+
|
|
154
|
+
# Get JSON output for piping into your logging stack
|
|
155
|
+
pdf-defang scan suspicious.pdf --json | jq .risk_level
|
|
156
|
+
pdf-defang clean *.pdf --json > sanitization-log.json
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
Exit codes follow shell conventions:
|
|
160
|
+
|
|
161
|
+
| Code | `clean` | `scan` |
|
|
162
|
+
|------|---------|--------|
|
|
163
|
+
| 0 | All files were already clean | No active content found |
|
|
164
|
+
| 1 | At least one file had something stripped | Active content detected |
|
|
165
|
+
| 2 | At least one file could not be opened | File could not be scanned |
|
|
166
|
+
|
|
167
|
+
## Use cases
|
|
168
|
+
|
|
169
|
+
### Web app that accepts PDF uploads
|
|
170
|
+
|
|
171
|
+
```python
|
|
172
|
+
from pdf_defang import sanitize
|
|
173
|
+
|
|
174
|
+
def handle_upload(uploaded_file_path: str) -> str:
|
|
175
|
+
report = sanitize(uploaded_file_path, return_report=True)
|
|
176
|
+
if report.error:
|
|
177
|
+
raise ValueError(f"Could not process PDF: {report.error}")
|
|
178
|
+
# Log what was removed for your audit trail
|
|
179
|
+
logger.info("Sanitized %s: %s", uploaded_file_path, report.as_dict())
|
|
180
|
+
return uploaded_file_path # safe to serve back to other users now
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
### Suspicious file investigation
|
|
184
|
+
|
|
185
|
+
```python
|
|
186
|
+
from pdf_defang import scan
|
|
187
|
+
|
|
188
|
+
report = scan("phishing_attachment.pdf")
|
|
189
|
+
if report.risk_level == "high":
|
|
190
|
+
quarantine(report)
|
|
191
|
+
elif report.risk_level == "medium":
|
|
192
|
+
notify_security_team(report)
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
### Compliance pipeline (PDF/A clean output)
|
|
196
|
+
|
|
197
|
+
```bash
|
|
198
|
+
find /var/incoming -name '*.pdf' -print0 | xargs -0 pdf-defang clean --json >> audit.jsonl
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
## What gets removed
|
|
202
|
+
|
|
203
|
+
| Item | Where | What it does |
|
|
204
|
+
|---|---|---|
|
|
205
|
+
| `/JavaScript` in `/Names` | Document root | Document-level JavaScript that runs on open |
|
|
206
|
+
| `/EmbeddedFiles` | Document root | Files hidden inside the PDF (potential malware) |
|
|
207
|
+
| `/OpenAction` | Document root | Action automatically executed when PDF opens |
|
|
208
|
+
| `/AA` | Document root | "Additional Actions" - auto-execute on navigation |
|
|
209
|
+
| `/XFA` | `/AcroForm` | Legacy XML forms - well-known attack surface |
|
|
210
|
+
| `/CO` | `/AcroForm` | Form field Calculation Order |
|
|
211
|
+
| `/AA` | Each page | Page-level auto-execute actions |
|
|
212
|
+
| Dangerous `/A` | Each annotation | JavaScript, Launch, ImportData, SubmitForm, ResetForm, Rendition, GoToR, GoToE, Movie, Sound actions |
|
|
213
|
+
| `/AA` | Each annotation | Per-annotation auto-actions |
|
|
214
|
+
| `/JS` | Each annotation | JavaScript attached directly to an annotation |
|
|
215
|
+
| Unsafe `/URI` | Each annotation | URI actions with dangerous schemes (`javascript:`, `file:`, `data:`, `vbscript:`, UNC paths). Standard hyperlinks (`http`, `https`, `mailto`, `tel`, `ftp`, etc.) are preserved. |
|
|
216
|
+
|
|
217
|
+
## What is preserved
|
|
218
|
+
|
|
219
|
+
Sanitization is **non-destructive to visible content**:
|
|
220
|
+
|
|
221
|
+
- All text, images and layout
|
|
222
|
+
- Standard form fields (filled values stay intact)
|
|
223
|
+
- Bookmarks, table of contents, page labels
|
|
224
|
+
- Document metadata (Author, Title, Subject, Keywords)
|
|
225
|
+
- Standard link annotations to `mailto:` / `http(s):` URLs
|
|
226
|
+
- Document structure, page count, page order
|
|
227
|
+
|
|
228
|
+
## Why not Dangerzone / iText / commercial SDKs?
|
|
229
|
+
|
|
230
|
+
| Tool | Why this might not fit you |
|
|
231
|
+
|---|---|
|
|
232
|
+
| [Dangerzone](https://dangerzone.rocks/) | Excellent for sensitive analyst workflows, but runs a full Docker container per file. Minutes per PDF, not milliseconds. |
|
|
233
|
+
| [iText](https://itextpdf.com/) / [Apryse](https://apryse.com/) | Powerful, but commercial licenses start at thousands of USD/year. |
|
|
234
|
+
| [pikepdf](https://github.com/pikepdf/pikepdf) directly | Brilliant library, but it's a parser, not a sanitizer. You'd write the same `_strip_document_level()` code we wrote here. That's exactly what we extracted. |
|
|
235
|
+
|
|
236
|
+
`pdf-defang` is for the case where you want a small, free, drop-in function
|
|
237
|
+
to ship in your existing Python app. No subprocesses, no Docker, no per-seat
|
|
238
|
+
license.
|
|
239
|
+
|
|
240
|
+
## Performance
|
|
241
|
+
|
|
242
|
+
Measured on a Windows 11 laptop, Python 3.13, on the fixture PDFs:
|
|
243
|
+
|
|
244
|
+
| Operation | Median time |
|
|
245
|
+
|---|---|
|
|
246
|
+
| `scan_bytes()` on a clean PDF (in memory) | ~0.3 ms |
|
|
247
|
+
| `sanitize_bytes()` on a malicious PDF (in memory) | ~0.6 ms |
|
|
248
|
+
| `sanitize()` on a clean PDF (with disk I/O) | ~8 ms |
|
|
249
|
+
| `sanitize()` on a kitchen-sink PDF (with disk I/O) | ~8 ms |
|
|
250
|
+
|
|
251
|
+
These are 50-100 times faster than container-based tools like Dangerzone
|
|
252
|
+
(which take seconds-to-minutes per file).
|
|
253
|
+
|
|
254
|
+
To benchmark on your hardware:
|
|
255
|
+
|
|
256
|
+
```bash
|
|
257
|
+
python -m pytest tests/test_performance.py -v -s
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
## Caveats
|
|
261
|
+
|
|
262
|
+
- Sanitization modifies the input file **in place**. If you need the original
|
|
263
|
+
preserved for audit, copy it first.
|
|
264
|
+
- Encrypted PDFs require the `password=` argument. Wrong-password attempts
|
|
265
|
+
return an error report (not an exception).
|
|
266
|
+
- Malformed PDFs may not open at all - we surface the underlying pikepdf error
|
|
267
|
+
in the report. The original file is not touched on failure.
|
|
268
|
+
- This is one layer of **defense in depth**, not a replacement for the other layers. Don't
|
|
269
|
+
rely on a sanitizer alone for high-risk attachment workflows: also validate
|
|
270
|
+
uploaders, sandbox processing, and scan with AV.
|
|
271
|
+
|
|
272
|
+
## Origin story
|
|
273
|
+
|
|
274
|
+
This library was originally written for [kovetz.co.il](https://kovetz.co.il)
|
|
275
|
+
(Hebrew PDF tools, [www.kovetz.co.il](https://www.kovetz.co.il)) in May 2026,
|
|
276
|
+
during an APT scanning campaign by an Iranian-attributed threat actor sweeping
|
|
277
|
+
endpoints for upload vectors. We needed to make sure that any PDF leaving our
|
|
278
|
+
service was free of executable payloads, even if an attacker successfully
|
|
279
|
+
uploaded a poisoned file.
|
|
280
|
+
|
|
281
|
+
We initially wrote 67 lines of pikepdf code, tested it on the kovetz.co.il
|
|
282
|
+
fleet (thousands of files/day), then realised there's no clean equivalent in
|
|
283
|
+
the OSS Python ecosystem. So we extracted it here for everyone else who needs
|
|
284
|
+
the same thing.
|
|
285
|
+
|
|
286
|
+
## Contributing
|
|
287
|
+
|
|
288
|
+
Issues and PRs welcome at [github.com/kovetz-PDF/pdf-defang](https://github.com/kovetz-PDF/pdf-defang).
|
|
289
|
+
|
|
290
|
+
If you've found a PDF in the wild that contains active content we don't
|
|
291
|
+
strip, please open an issue with the file (or a minimal reproducer) attached.
|
|
292
|
+
|
|
293
|
+
### Development setup
|
|
294
|
+
|
|
295
|
+
```bash
|
|
296
|
+
git clone https://github.com/kovetz-PDF/pdf-defang.git
|
|
297
|
+
cd pdf-defang
|
|
298
|
+
python -m pip install -e ".[test]"
|
|
299
|
+
python -m pytest
|
|
300
|
+
```
|
|
301
|
+
|
|
302
|
+
`tests/conftest.py` auto-generates the test fixture PDFs on first run.
|
|
303
|
+
|
|
304
|
+
## License
|
|
305
|
+
|
|
306
|
+
[MIT](LICENSE) - free for any use, including commercial.
|
|
307
|
+
|
|
308
|
+
---
|
|
309
|
+
|
|
310
|
+
Built and maintained by [kovetz.co.il](https://kovetz.co.il).
|
|
311
|
+
Contact: [contact@kovetz.co.il](mailto:contact@kovetz.co.il)
|
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
# pdf-defang
|
|
2
|
+
|
|
3
|
+
> Strip JavaScript, OpenAction, Launch actions and other active content from PDFs.
|
|
4
|
+
> Lightweight Python library on top of [pikepdf](https://github.com/pikepdf/pikepdf).
|
|
5
|
+
> MIT licensed.
|
|
6
|
+
|
|
7
|
+
[PyPI version](https://pypi.org/project/pdf-defang/)
|
|
8
|
+
[Python versions](https://pypi.org/project/pdf-defang/)
|
|
9
|
+
[License: MIT](LICENSE)
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## Why?
|
|
14
|
+
|
|
15
|
+
PDFs can carry executable content: JavaScript that runs when the file opens,
|
|
16
|
+
auto-actions that fire on every page navigation, "Launch" actions that try to
|
|
17
|
+
open other programs, embedded files that drop malware. If you process
|
|
18
|
+
user-uploaded PDFs in your app, you should strip this content before serving
|
|
19
|
+
them back.
|
|
20
|
+
|
|
21
|
+
The Python ecosystem has parsers (`pikepdf`, `pypdf`, `PyMuPDF`) and a heavy
|
|
22
|
+
container-based tool ([Dangerzone](https://dangerzone.rocks/)), but no clean
|
|
23
|
+
drop-in library that says "give me this PDF without active content." This is
|
|
24
|
+
that library.
|
|
25
|
+
|
|
26
|
+
## Install
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install pdf-defang
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
Requires Python 3.9+ and pikepdf 8+.
|
|
33
|
+
|
|
34
|
+
## Quick start
|
|
35
|
+
|
|
36
|
+
### Python API
|
|
37
|
+
|
|
38
|
+
```python
|
|
39
|
+
from pdf_defang import sanitize, scan
|
|
40
|
+
|
|
41
|
+
# Clean a file in place
|
|
42
|
+
sanitize("uploaded.pdf")
|
|
43
|
+
|
|
44
|
+
# Get a detailed report of what was removed
|
|
45
|
+
report = sanitize("uploaded.pdf", return_report=True)
|
|
46
|
+
print(report.javascript_in_names) # 2
|
|
47
|
+
print(report.open_action_removed) # True
|
|
48
|
+
print(report.annotation_action_types) # ['Launch']
|
|
49
|
+
print(report.dangerous_uris_removed) # 1
|
|
50
|
+
print(report.as_dict()) # JSON-serialisable
|
|
51
|
+
|
|
52
|
+
# Inspect a file WITHOUT modifying it
|
|
53
|
+
report = scan("suspicious.pdf")
|
|
54
|
+
print(report.risk_level) # 'high' / 'medium' / 'low' / 'none'
|
|
55
|
+
print(report.has_javascript) # True
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
### Async API (FastAPI / aiohttp / asyncio)
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
from pdf_defang import sanitize_async, scan_async
|
|
62
|
+
|
|
63
|
+
async def handle_upload(path):
|
|
64
|
+
report = await sanitize_async(path, return_report=True)
|
|
65
|
+
return report.as_dict()
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### In-memory API (S3, Lambda, no disk)
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
from pdf_defang import sanitize_bytes
|
|
72
|
+
|
|
73
|
+
raw_pdf: bytes = ... # from S3, HTTP, anywhere
|
|
74
|
+
cleaned: bytes = sanitize_bytes(raw_pdf)
|
|
75
|
+
# No disk involved
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
### Encrypted PDFs (encryption preserved on output)
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
sanitize("encrypted.pdf", password="hunter2")
|
|
82
|
+
# Still encrypted with the same password, JavaScript removed.
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### Two levels: strict (default) vs balanced
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
# Public uploads: kill everything active (safest)
|
|
89
|
+
sanitize("untrusted.pdf") # level="strict"
|
|
90
|
+
|
|
91
|
+
# Trusted internal forms that need Submit / Calculate buttons:
|
|
92
|
+
sanitize("expense_form.pdf", level="balanced")
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
Both levels strip pure attack vectors (`/Launch`, `/GoToR`, document
|
|
96
|
+
JavaScript, dangerous URI schemes, etc.). `balanced` additionally
|
|
97
|
+
preserves `/SubmitForm` / `/ResetForm` / form JS actions, annotation
|
|
98
|
+
`/AA` and `/JS` triggers, the AcroForm `/CO` calculation order, and
|
|
99
|
+
embedded files (used by PDF portfolios). Default is `strict`.
|
|
100
|
+
|
|
101
|
+
### Command line
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
# Clean a single file (strict by default)
|
|
105
|
+
pdf-defang clean uploaded.pdf
|
|
106
|
+
|
|
107
|
+
# Clean many at once
|
|
108
|
+
pdf-defang clean *.pdf
|
|
109
|
+
|
|
110
|
+
# Keep form interactivity working
|
|
111
|
+
pdf-defang clean --level balanced internal_form.pdf
|
|
112
|
+
|
|
113
|
+
# Inspect without changes
|
|
114
|
+
pdf-defang scan suspicious.pdf
|
|
115
|
+
|
|
116
|
+
# Get JSON output for piping into your logging stack
|
|
117
|
+
pdf-defang scan suspicious.pdf --json | jq .risk_level
|
|
118
|
+
pdf-defang clean *.pdf --json > sanitization-log.json
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
Exit codes follow shell conventions:
|
|
122
|
+
|
|
123
|
+
| Code | `clean` | `scan` |
|
|
124
|
+
|------|---------|--------|
|
|
125
|
+
| 0 | All files were already clean | No active content found |
|
|
126
|
+
| 1 | At least one file had something stripped | Active content detected |
|
|
127
|
+
| 2 | At least one file could not be opened | File could not be scanned |
|
|
128
|
+
|
|
129
|
+
## Use cases
|
|
130
|
+
|
|
131
|
+
### Web app that accepts PDF uploads
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
from pdf_defang import sanitize
|
|
135
|
+
|
|
136
|
+
def handle_upload(uploaded_file_path: str) -> str:
|
|
137
|
+
report = sanitize(uploaded_file_path, return_report=True)
|
|
138
|
+
if report.error:
|
|
139
|
+
raise ValueError(f"Could not process PDF: {report.error}")
|
|
140
|
+
# Log what was removed for your audit trail
|
|
141
|
+
logger.info("Sanitized %s: %s", uploaded_file_path, report.as_dict())
|
|
142
|
+
return uploaded_file_path # safe to serve back to other users now
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
### Suspicious file investigation
|
|
146
|
+
|
|
147
|
+
```python
|
|
148
|
+
from pdf_defang import scan
|
|
149
|
+
|
|
150
|
+
report = scan("phishing_attachment.pdf")
|
|
151
|
+
if report.risk_level == "high":
|
|
152
|
+
quarantine(report)
|
|
153
|
+
elif report.risk_level == "medium":
|
|
154
|
+
notify_security_team(report)
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
### Compliance pipeline (PDF/A clean output)
|
|
158
|
+
|
|
159
|
+
```bash
|
|
160
|
+
find /var/incoming -name '*.pdf' -print0 | xargs -0 pdf-defang clean --json >> audit.jsonl
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
## What gets removed
|
|
164
|
+
|
|
165
|
+
| Item | Where | What it does |
|
|
166
|
+
|---|---|---|
|
|
167
|
+
| `/JavaScript` in `/Names` | Document root | Document-level JavaScript that runs on open |
|
|
168
|
+
| `/EmbeddedFiles` | Document root | Files hidden inside the PDF (potential malware) |
|
|
169
|
+
| `/OpenAction` | Document root | Action automatically executed when PDF opens |
|
|
170
|
+
| `/AA` | Document root | "Additional Actions" - auto-execute on navigation |
|
|
171
|
+
| `/XFA` | `/AcroForm` | Legacy XML forms - well-known attack surface |
|
|
172
|
+
| `/CO` | `/AcroForm` | Form field Calculation Order |
|
|
173
|
+
| `/AA` | Each page | Page-level auto-execute actions |
|
|
174
|
+
| Dangerous `/A` | Each annotation | JavaScript, Launch, ImportData, SubmitForm, ResetForm, Rendition, GoToR, GoToE, Movie, Sound actions |
|
|
175
|
+
| `/AA` | Each annotation | Per-annotation auto-actions |
|
|
176
|
+
| `/JS` | Each annotation | JavaScript attached directly to an annotation |
|
|
177
|
+
| Unsafe `/URI` | Each annotation | URI actions with dangerous schemes (`javascript:`, `file:`, `data:`, `vbscript:`, UNC paths). Standard hyperlinks (`http`, `https`, `mailto`, `tel`, `ftp`, etc.) are preserved. |
|
|
178
|
+
|
|
179
|
+
## What is preserved
|
|
180
|
+
|
|
181
|
+
Sanitization is **non-destructive to visible content**:
|
|
182
|
+
|
|
183
|
+
- All text, images and layout
|
|
184
|
+
- Standard form fields (filled values stay intact)
|
|
185
|
+
- Bookmarks, table of contents, page labels
|
|
186
|
+
- Document metadata (Author, Title, Subject, Keywords)
|
|
187
|
+
- Standard link annotations to `mailto:` / `http(s):` URLs
|
|
188
|
+
- Document structure, page count, page order
|
|
189
|
+
|
|
190
|
+
## Why not Dangerzone / iText / commercial SDKs?
|
|
191
|
+
|
|
192
|
+
| Tool | Why this might not fit you |
|
|
193
|
+
|---|---|
|
|
194
|
+
| [Dangerzone](https://dangerzone.rocks/) | Excellent for sensitive analyst workflows, but runs a full Docker container per file. Minutes per PDF, not milliseconds. |
|
|
195
|
+
| [iText](https://itextpdf.com/) / [Apryse](https://apryse.com/) | Powerful, but commercial licenses start at thousands of USD/year. |
|
|
196
|
+
| [pikepdf](https://github.com/pikepdf/pikepdf) directly | Brilliant library, but it's a parser, not a sanitizer. You'd write the same `_strip_document_level()` code we wrote here. That's exactly what we extracted. |
|
|
197
|
+
|
|
198
|
+
`pdf-defang` is for the case where you want a small, free, drop-in function
|
|
199
|
+
to ship in your existing Python app. No subprocesses, no Docker, no per-seat
|
|
200
|
+
license.
|
|
201
|
+
|
|
202
|
+
## Performance
|
|
203
|
+
|
|
204
|
+
Measured on a Windows 11 laptop, Python 3.13, on the fixture PDFs:
|
|
205
|
+
|
|
206
|
+
| Operation | Median time |
|
|
207
|
+
|---|---|
|
|
208
|
+
| `scan_bytes()` on a clean PDF (in memory) | ~0.3 ms |
|
|
209
|
+
| `sanitize_bytes()` on a malicious PDF (in memory) | ~0.6 ms |
|
|
210
|
+
| `sanitize()` on a clean PDF (with disk I/O) | ~8 ms |
|
|
211
|
+
| `sanitize()` on a kitchen-sink PDF (with disk I/O) | ~8 ms |
|
|
212
|
+
|
|
213
|
+
These are 50-100 times faster than container-based tools like Dangerzone
|
|
214
|
+
(which take seconds-to-minutes per file).
|
|
215
|
+
|
|
216
|
+
To benchmark on your hardware:
|
|
217
|
+
|
|
218
|
+
```bash
|
|
219
|
+
python -m pytest tests/test_performance.py -v -s
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
## Caveats
|
|
223
|
+
|
|
224
|
+
- Sanitization modifies the input file **in place**. If you need the original
|
|
225
|
+
preserved for audit, copy it first.
|
|
226
|
+
- Encrypted PDFs require the `password=` argument. Wrong-password attempts
|
|
227
|
+
return an error report (not an exception).
|
|
228
|
+
- Malformed PDFs may not open at all - we surface the underlying pikepdf error
|
|
229
|
+
in the report. The original file is not touched on failure.
|
|
230
|
+
- This is one layer of **defense in depth**, not a replacement for the other layers. Don't
|
|
231
|
+
rely on a sanitizer alone for high-risk attachment workflows: also validate
|
|
232
|
+
uploaders, sandbox processing, and scan with AV.
|
|
233
|
+
|
|
234
|
+
## Origin story
|
|
235
|
+
|
|
236
|
+
This library was originally written for [kovetz.co.il](https://kovetz.co.il)
|
|
237
|
+
(Hebrew PDF tools, [www.kovetz.co.il](https://www.kovetz.co.il)) in May 2026,
|
|
238
|
+
during an APT scanning campaign by an Iranian-attributed threat actor sweeping
|
|
239
|
+
endpoints for upload vectors. We needed to make sure that any PDF leaving our
|
|
240
|
+
service was free of executable payloads, even if an attacker successfully
|
|
241
|
+
uploaded a poisoned file.
|
|
242
|
+
|
|
243
|
+
We initially wrote 67 lines of pikepdf code, tested it on the kovetz.co.il
|
|
244
|
+
fleet (thousands of files/day), then realised there's no clean equivalent in
|
|
245
|
+
the OSS Python ecosystem. So we extracted it here for everyone else who needs
|
|
246
|
+
the same thing.
|
|
247
|
+
|
|
248
|
+
## Contributing
|
|
249
|
+
|
|
250
|
+
Issues and PRs welcome at [github.com/kovetz-PDF/pdf-defang](https://github.com/kovetz-PDF/pdf-defang).
|
|
251
|
+
|
|
252
|
+
If you've found a PDF in the wild that contains active content we don't
|
|
253
|
+
strip, please open an issue with the file (or a minimal reproducer) attached.
|
|
254
|
+
|
|
255
|
+
### Development setup
|
|
256
|
+
|
|
257
|
+
```bash
|
|
258
|
+
git clone https://github.com/kovetz-PDF/pdf-defang.git
|
|
259
|
+
cd pdf-defang
|
|
260
|
+
python -m pip install -e ".[test]"
|
|
261
|
+
python -m pytest
|
|
262
|
+
```
|
|
263
|
+
|
|
264
|
+
`tests/conftest.py` auto-generates the test fixture PDFs on first run.
|
|
265
|
+
|
|
266
|
+
## License
|
|
267
|
+
|
|
268
|
+
[MIT](LICENSE) - free for any use, including commercial.
|
|
269
|
+
|
|
270
|
+
---
|
|
271
|
+
|
|
272
|
+
Built and maintained by [kovetz.co.il](https://kovetz.co.il).
|
|
273
|
+
Contact: [contact@kovetz.co.il](mailto:contact@kovetz.co.il)
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""
|
|
2
|
+
pdf-defang
|
|
3
|
+
==========
|
|
4
|
+
|
|
5
|
+
Strip JavaScript, OpenAction, Launch actions and other active content
|
|
6
|
+
from PDFs. Lightweight Python library on top of pikepdf.
|
|
7
|
+
|
|
8
|
+
Quick start::
|
|
9
|
+
|
|
10
|
+
from pdf_defang import sanitize, scan
|
|
11
|
+
|
|
12
|
+
# Clean a file in place
|
|
13
|
+
sanitize("uploaded.pdf")
|
|
14
|
+
|
|
15
|
+
# Inspect without modifying
|
|
16
|
+
report = scan("suspicious.pdf")
|
|
17
|
+
print(report.risk_level) # 'high' / 'medium' / 'low' / 'none'
|
|
18
|
+
|
|
19
|
+
See https://github.com/kovetz-PDF/pdf-defang for full docs.
|
|
20
|
+
"""
|
|
21
|
+
from ._async import sanitize_async, scan_async
|
|
22
|
+
from ._bytes import sanitize_bytes, scan_bytes
|
|
23
|
+
from ._core import Level, SanitizeReport, sanitize
|
|
24
|
+
from ._scan import ScanReport, scan
|
|
25
|
+
|
|
26
|
+
__version__ = "0.1.0"
|
|
27
|
+
__all__ = [
|
|
28
|
+
"sanitize",
|
|
29
|
+
"scan",
|
|
30
|
+
"sanitize_async",
|
|
31
|
+
"scan_async",
|
|
32
|
+
"sanitize_bytes",
|
|
33
|
+
"scan_bytes",
|
|
34
|
+
"SanitizeReport",
|
|
35
|
+
"ScanReport",
|
|
36
|
+
"Level",
|
|
37
|
+
"__version__",
|
|
38
|
+
]
|