unstaple 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- unstaple-0.1.0/LICENSE +21 -0
- unstaple-0.1.0/PKG-INFO +161 -0
- unstaple-0.1.0/README.md +132 -0
- unstaple-0.1.0/pyproject.toml +44 -0
- unstaple-0.1.0/setup.cfg +4 -0
- unstaple-0.1.0/src/unstaple/__init__.py +4 -0
- unstaple-0.1.0/src/unstaple/__main__.py +7 -0
- unstaple-0.1.0/src/unstaple/cli.py +108 -0
- unstaple-0.1.0/src/unstaple/features.py +223 -0
- unstaple-0.1.0/src/unstaple/naming.py +83 -0
- unstaple-0.1.0/src/unstaple/scoring.py +164 -0
- unstaple-0.1.0/src/unstaple/splitter.py +51 -0
- unstaple-0.1.0/src/unstaple.egg-info/PKG-INFO +161 -0
- unstaple-0.1.0/src/unstaple.egg-info/SOURCES.txt +20 -0
- unstaple-0.1.0/src/unstaple.egg-info/dependency_links.txt +1 -0
- unstaple-0.1.0/src/unstaple.egg-info/entry_points.txt +2 -0
- unstaple-0.1.0/src/unstaple.egg-info/requires.txt +5 -0
- unstaple-0.1.0/src/unstaple.egg-info/top_level.txt +1 -0
- unstaple-0.1.0/tests/test_end_to_end.py +149 -0
- unstaple-0.1.0/tests/test_features.py +115 -0
- unstaple-0.1.0/tests/test_naming.py +55 -0
- unstaple-0.1.0/tests/test_scoring.py +135 -0
unstaple-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Ben Malaga
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
unstaple-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: unstaple
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Split one fat scanned PDF of stapled-together documents into correctly-bounded, sensibly-named separate PDFs.
|
|
5
|
+
Author-email: Ben Malaga <benmalaga03@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/BenMalaga/unstaple
|
|
8
|
+
Project-URL: Issues, https://github.com/BenMalaga/unstaple/issues
|
|
9
|
+
Keywords: pdf,split,scan,documents,paperless,cli,archiving
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Environment :: Console
|
|
12
|
+
Classifier: Intended Audience :: End Users/Desktop
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Office/Business
|
|
20
|
+
Classifier: Topic :: Utilities
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Requires-Dist: pypdf>=4.0
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
27
|
+
Requires-Dist: reportlab>=4.0; extra == "dev"
|
|
28
|
+
Dynamic: license-file
|
|
29
|
+
|
|
30
|
+
<div align="center">
|
|
31
|
+
|
|
32
|
+
# unstaple
|
|
33
|
+
|
|
34
|
+
**Split one fat scanned PDF of stapled-together documents into correctly-bounded, sensibly-named separate PDFs.**
|
|
35
|
+
|
|
36
|
+
[tests](https://github.com/BenMalaga/unstaple/actions/workflows/test.yml)
|
|
37
|
+
[release](https://github.com/BenMalaga/unstaple/releases)
|
|
38
|
+
[PyPI](https://pypi.org/project/unstaple/)
|
|
39
|
+
[license: MIT](LICENSE)
|
|
40
|
+
|
|
41
|
+
</div>
|
|
42
|
+
|
|
43
|
+
You fed a stack of mail into the sheet scanner. Out came `scan.pdf`: an invoice, a lab report, and a city notice, all stapled into one 6-page file. Your archive wants three files with real names. `unstaple` reads the text layer, finds where one document ends and the next begins, and writes each one out separately.
|
|
44
|
+
|
|
45
|
+
```console
|
|
46
|
+
$ unstaple scan.pdf
|
|
47
|
+
scan.pdf: 6 pages, 3 documents detected
|
|
48
|
+
|
|
49
|
+
proposed cuts:
|
|
50
|
+
cut before page 3 confidence 0.95 [page numbering reset: "Page 1 of 3" after "Page 2 of 2"; header shift (similarity 0.00)]
|
|
51
|
+
cut before page 6 confidence 0.96 [page numbering reset: "Page 1 of 1" after "Page 3 of 3"; header shift (similarity 0.00); date change (2026-04-17 -> 2026-05-02)]
|
|
52
|
+
|
|
53
|
+
documents:
|
|
54
|
+
pages 1-2 -> 2026-03-03-acme-supply-co-invoice-4821.pdf
|
|
55
|
+
pages 3-5 -> 2026-04-17-northwind-medical-group-lab-results-summary.pdf
|
|
56
|
+
page 6 -> 2026-05-02-city-of-springfield.pdf
|
|
57
|
+
|
|
58
|
+
wrote unstapled/2026-03-03-acme-supply-co-invoice-4821.pdf
|
|
59
|
+
wrote unstapled/2026-04-17-northwind-medical-group-lab-results-summary.pdf
|
|
60
|
+
wrote unstapled/2026-05-02-city-of-springfield.pdf
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
That transcript is real: it is the output of `unstaple` on the synthetic fixture in `tests/fixtures/`, and the test suite asserts those exact boundaries and names.
|
|
64
|
+
|
|
65
|
+
## Install
|
|
66
|
+
|
|
67
|
+
```console
|
|
68
|
+
pip install unstaple
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Or, for a clean isolated install of the CLI:
|
|
72
|
+
|
|
73
|
+
```console
|
|
74
|
+
pipx install unstaple
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Requires Python 3.10 or newer. The only runtime dependency is [pypdf](https://pypi.org/project/pypdf/).
|
|
78
|
+
|
|
79
|
+
## Usage
|
|
80
|
+
|
|
81
|
+
```console
|
|
82
|
+
unstaple scan.pdf # split into ./unstapled/
|
|
83
|
+
unstaple scan.pdf --dry-run # show proposed cuts and names, write nothing
|
|
84
|
+
unstaple scan.pdf -o outbox # choose the output directory
|
|
85
|
+
unstaple scan.pdf --threshold 0.7 # demand more confidence before cutting
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
Always start with `--dry-run`. It prints every proposed cut with a confidence score and the evidence behind it, so you can sanity-check the plan before any files are written. If a boundary is missed, lower `--threshold`; if it cuts too eagerly, raise it.
|
|
89
|
+
|
|
90
|
+
`unstaple` is lossless: every input page lands in exactly one output file, in order. Blank pages (separator sheets, blank duplex backs) stay attached to the document before them.
|
|
91
|
+
|
|
92
|
+
## How boundary scoring works
|
|
93
|
+
|
|
94
|
+
For every page, `unstaple` extracts deterministic signals from the text layer. No ML, no network, no cloud: the same input always produces the same cuts, and every cut comes with its reasons.
|
|
95
|
+
|
|
96
|
+
For each pair of adjacent pages it asks: does the second page start a new document?
|
|
97
|
+
|
|
98
|
+
**Evidence for a cut** (combined with a noisy-OR, so any strong signal can carry the decision):
|
|
99
|
+
|
|
100
|
+
| Signal | Weight | Example |
|
|
101
|
+
|---|---|---|
|
|
102
|
+
| Page numbering resets after completing | 0.90 | `Page 1 of 3` right after `Page 2 of 2` |
|
|
103
|
+
| Page numbering drops back to 1 | 0.85 | `Page 1 of 5` after `Page 4 of 9` |
|
|
104
|
+
| Bare numbering restarts | 0.70 | a lone `1` after a lone `6` |
|
|
105
|
+
| Blank page separator | 0.80 | a scanner divider sheet between documents |
|
|
106
|
+
| Numbering starts after unnumbered pages | 0.55 | `Page 1 of 2` after a letter with no page numbers |
|
|
107
|
+
| Header shift | 0.50 | first-lines token fingerprint goes from one letterhead to another |
|
|
108
|
+
| Previous page completed its numbering | 0.45 | `Page 2 of 2` followed by an unnumbered page |
|
|
109
|
+
| Date change | 0.30 | both pages dated, no date in common |
|
|
110
|
+
|
|
111
|
+
**Evidence against a cut** (multiplies the score down):
|
|
112
|
+
|
|
113
|
+
| Dampener | Factor | Example |
|
|
114
|
+
|---|---|---|
|
|
115
|
+
| Page numbering continues | 0.10 | `Page 1 of 3` then `Page 2 of 3` |
|
|
116
|
+
| Header continuity | 0.50 | the same letterhead tokens on both pages |
|
|
117
|
+
| Shared date | 0.70 | the same date printed on both pages |
|
|
118
|
+
|
|
119
|
+
A cut is proposed when the final score clears `--threshold` (default 0.5). One deliberate asymmetry: a hard numbering reset overrides header continuity and shared dates, because three invoices from the same vendor share a letterhead but are still three documents.
|
|
120
|
+
|
|
121
|
+
Names are inferred from the first page of each split: the first date found (ISO format) plus the first prominent line (usually the letterhead), plus a following all-caps title line if there is one. Collisions get `-2`, `-3` suffixes; if nothing usable is found, you get `document-01.pdf`.
|
|
122
|
+
|
|
123
|
+
## Honest scope
|
|
124
|
+
|
|
125
|
+
v0.1 is deliberately narrow. Know what you are getting:
|
|
126
|
+
|
|
127
|
+
- **Text-layer PDFs only.** If `pypdf` cannot extract text, `unstaple` refuses and tells you so. Image-only scans need OCR first (run them through [OCRmyPDF](https://github.com/ocrmypdf/OCRmyPDF)); native OCR support is on the roadmap.
|
|
128
|
+
- **Heuristics, not magic.** Documents with no page numbers, no dates, and an identical or absent header will not be separated. Two-page memos that look alike may be merged. `--dry-run` exists precisely so you can check before committing.
|
|
129
|
+
- **English-leaning patterns.** "Page X of Y" and English month names are recognized; other languages are roadmap material (and easy contributions, see below).
|
|
130
|
+
- **One PDF in, N PDFs out.** No watch folders, no GUI, no daemon. It is a small sharp tool meant to sit in a pipeline before your archive tool ingests the results.
|
|
131
|
+
|
|
132
|
+
## Why this exists
|
|
133
|
+
|
|
134
|
+
Every paperless workflow hits this problem: the scanner ADF happily eats a month of mail and hands you one giant PDF. The [paperless-ngx discussion forum has years of requests](https://github.com/paperless-ngx/paperless-ngx/discussions) for automatic document separation, and the usual answers are barcode separator sheets (which require planning ahead), cloud SaaS splitters (your mail, someone else's server), or GUI page-range pickers (you do the boundary detection yourself, with a mouse). `unstaple` is the missing fourth option: a local, deterministic, scriptable splitter that explains its decisions and never sends a byte anywhere.
|
|
135
|
+
|
|
136
|
+
## Development
|
|
137
|
+
|
|
138
|
+
```console
|
|
139
|
+
git clone https://github.com/BenMalaga/unstaple
|
|
140
|
+
cd unstaple
|
|
141
|
+
python -m venv .venv && source .venv/bin/activate
|
|
142
|
+
pip install -e ".[dev]"
|
|
143
|
+
pytest
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
The test fixtures are synthetic stapled PDFs generated by `tests/fixtures/make_fixtures.py` (reportlab, dev dependency only). The end-to-end tests split them and assert exact boundaries and filenames.
|
|
147
|
+
|
|
148
|
+
## Roadmap
|
|
149
|
+
|
|
150
|
+
- OCR fallback for image-only scans (probably via optional OCRmyPDF integration)
|
|
151
|
+
- Non-English page-number and date patterns
|
|
152
|
+
- Font-size and layout signals from the PDF content stream (a big bold first line is a strong title hint)
|
|
153
|
+
- `--interactive` mode to accept, reject, or move proposed cuts
|
|
154
|
+
|
|
155
|
+
## Contributing
|
|
156
|
+
|
|
157
|
+
The single most valuable contribution is a real-world PDF that `unstaple` gets wrong, anonymized, with a note about where the cuts should have been. See [CONTRIBUTING.md](CONTRIBUTING.md).
|
|
158
|
+
|
|
159
|
+
## License
|
|
160
|
+
|
|
161
|
+
[MIT](LICENSE), Copyright (c) 2026 Ben Malaga.
|
unstaple-0.1.0/README.md
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
|
|
3
|
+
# unstaple
|
|
4
|
+
|
|
5
|
+
**Split one fat scanned PDF of stapled-together documents into correctly-bounded, sensibly-named separate PDFs.**
|
|
6
|
+
|
|
7
|
+
[tests](https://github.com/BenMalaga/unstaple/actions/workflows/test.yml)
|
|
8
|
+
[release](https://github.com/BenMalaga/unstaple/releases)
|
|
9
|
+
[PyPI](https://pypi.org/project/unstaple/)
|
|
10
|
+
[license: MIT](LICENSE)
|
|
11
|
+
|
|
12
|
+
</div>
|
|
13
|
+
|
|
14
|
+
You fed a stack of mail into the sheet scanner. Out came `scan.pdf`: an invoice, a lab report, and a city notice, all stapled into one 6-page file. Your archive wants three files with real names. `unstaple` reads the text layer, finds where one document ends and the next begins, and writes each one out separately.
|
|
15
|
+
|
|
16
|
+
```console
|
|
17
|
+
$ unstaple scan.pdf
|
|
18
|
+
scan.pdf: 6 pages, 3 documents detected
|
|
19
|
+
|
|
20
|
+
proposed cuts:
|
|
21
|
+
cut before page 3 confidence 0.95 [page numbering reset: "Page 1 of 3" after "Page 2 of 2"; header shift (similarity 0.00)]
|
|
22
|
+
cut before page 6 confidence 0.96 [page numbering reset: "Page 1 of 1" after "Page 3 of 3"; header shift (similarity 0.00); date change (2026-04-17 -> 2026-05-02)]
|
|
23
|
+
|
|
24
|
+
documents:
|
|
25
|
+
pages 1-2 -> 2026-03-03-acme-supply-co-invoice-4821.pdf
|
|
26
|
+
pages 3-5 -> 2026-04-17-northwind-medical-group-lab-results-summary.pdf
|
|
27
|
+
page 6 -> 2026-05-02-city-of-springfield.pdf
|
|
28
|
+
|
|
29
|
+
wrote unstapled/2026-03-03-acme-supply-co-invoice-4821.pdf
|
|
30
|
+
wrote unstapled/2026-04-17-northwind-medical-group-lab-results-summary.pdf
|
|
31
|
+
wrote unstapled/2026-05-02-city-of-springfield.pdf
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
That transcript is real: it is the output of `unstaple` on the synthetic fixture in `tests/fixtures/`, and the test suite asserts those exact boundaries and names.
|
|
35
|
+
|
|
36
|
+
## Install
|
|
37
|
+
|
|
38
|
+
```console
|
|
39
|
+
pip install unstaple
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
Or, for a clean isolated install of the CLI:
|
|
43
|
+
|
|
44
|
+
```console
|
|
45
|
+
pipx install unstaple
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Requires Python 3.10 or newer. The only runtime dependency is [pypdf](https://pypi.org/project/pypdf/).
|
|
49
|
+
|
|
50
|
+
## Usage
|
|
51
|
+
|
|
52
|
+
```console
|
|
53
|
+
unstaple scan.pdf # split into ./unstapled/
|
|
54
|
+
unstaple scan.pdf --dry-run # show proposed cuts and names, write nothing
|
|
55
|
+
unstaple scan.pdf -o outbox # choose the output directory
|
|
56
|
+
unstaple scan.pdf --threshold 0.7 # demand more confidence before cutting
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
Always start with `--dry-run`. It prints every proposed cut with a confidence score and the evidence behind it, so you can sanity-check the plan before any files are written. If a boundary is missed, lower `--threshold`; if it cuts too eagerly, raise it.
|
|
60
|
+
|
|
61
|
+
`unstaple` is lossless: every input page lands in exactly one output file, in order. Blank pages (separator sheets, blank duplex backs) stay attached to the document before them.
|
|
62
|
+
|
|
63
|
+
## How boundary scoring works
|
|
64
|
+
|
|
65
|
+
For every page, `unstaple` extracts deterministic signals from the text layer. No ML, no network, no cloud: the same input always produces the same cuts, and every cut comes with its reasons.
|
|
66
|
+
|
|
67
|
+
For each pair of adjacent pages it asks: does the second page start a new document?
|
|
68
|
+
|
|
69
|
+
**Evidence for a cut** (combined with a noisy-OR, so any strong signal can carry the decision):
|
|
70
|
+
|
|
71
|
+
| Signal | Weight | Example |
|
|
72
|
+
|---|---|---|
|
|
73
|
+
| Page numbering resets after completing | 0.90 | `Page 1 of 3` right after `Page 2 of 2` |
|
|
74
|
+
| Page numbering drops back to 1 | 0.85 | `Page 1 of 5` after `Page 4 of 9` |
|
|
75
|
+
| Bare numbering restarts | 0.70 | a lone `1` after a lone `6` |
|
|
76
|
+
| Blank page separator | 0.80 | a scanner divider sheet between documents |
|
|
77
|
+
| Numbering starts after unnumbered pages | 0.55 | `Page 1 of 2` after a letter with no page numbers |
|
|
78
|
+
| Header shift | 0.50 | first-lines token fingerprint goes from one letterhead to another |
|
|
79
|
+
| Previous page completed its numbering | 0.45 | `Page 2 of 2` followed by an unnumbered page |
|
|
80
|
+
| Date change | 0.30 | both pages dated, no date in common |
|
|
81
|
+
|
|
82
|
+
**Evidence against a cut** (multiplies the score down):
|
|
83
|
+
|
|
84
|
+
| Dampener | Factor | Example |
|
|
85
|
+
|---|---|---|
|
|
86
|
+
| Page numbering continues | 0.10 | `Page 1 of 3` then `Page 2 of 3` |
|
|
87
|
+
| Header continuity | 0.50 | the same letterhead tokens on both pages |
|
|
88
|
+
| Shared date | 0.70 | the same date printed on both pages |
|
|
89
|
+
|
|
90
|
+
A cut is proposed when the final score clears `--threshold` (default 0.5). One deliberate asymmetry: a hard numbering reset overrides header continuity and shared dates, because three invoices from the same vendor share a letterhead but are still three documents.
|
|
91
|
+
|
|
92
|
+
Names are inferred from the first page of each split: the first date found (ISO format) plus the first prominent line (usually the letterhead), plus a following all-caps title line if there is one. Collisions get `-2`, `-3` suffixes; if nothing usable is found, you get `document-01.pdf`.
|
|
93
|
+
|
|
94
|
+
## Honest scope
|
|
95
|
+
|
|
96
|
+
v0.1 is deliberately narrow. Know what you are getting:
|
|
97
|
+
|
|
98
|
+
- **Text-layer PDFs only.** If `pypdf` cannot extract text, `unstaple` refuses and tells you so. Image-only scans need OCR first (run them through [OCRmyPDF](https://github.com/ocrmypdf/OCRmyPDF)); native OCR support is on the roadmap.
|
|
99
|
+
- **Heuristics, not magic.** Documents with no page numbers, no dates, and an identical or absent header will not be separated. Two-page memos that look alike may be merged. `--dry-run` exists precisely so you can check before committing.
|
|
100
|
+
- **English-leaning patterns.** "Page X of Y" and English month names are recognized; other languages are roadmap material (and easy contributions, see below).
|
|
101
|
+
- **One PDF in, N PDFs out.** No watch folders, no GUI, no daemon. It is a small sharp tool meant to sit in a pipeline before your archive tool ingests the results.
|
|
102
|
+
|
|
103
|
+
## Why this exists
|
|
104
|
+
|
|
105
|
+
Every paperless workflow hits this problem: the scanner ADF happily eats a month of mail and hands you one giant PDF. The [paperless-ngx discussion forum has years of requests](https://github.com/paperless-ngx/paperless-ngx/discussions) for automatic document separation, and the usual answers are barcode separator sheets (which require planning ahead), cloud SaaS splitters (your mail, someone else's server), or GUI page-range pickers (you do the boundary detection yourself, with a mouse). `unstaple` is the missing fourth option: a local, deterministic, scriptable splitter that explains its decisions and never sends a byte anywhere.
|
|
106
|
+
|
|
107
|
+
## Development
|
|
108
|
+
|
|
109
|
+
```console
|
|
110
|
+
git clone https://github.com/BenMalaga/unstaple
|
|
111
|
+
cd unstaple
|
|
112
|
+
python -m venv .venv && source .venv/bin/activate
|
|
113
|
+
pip install -e ".[dev]"
|
|
114
|
+
pytest
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
The test fixtures are synthetic stapled PDFs generated by `tests/fixtures/make_fixtures.py` (reportlab, dev dependency only). The end-to-end tests split them and assert exact boundaries and filenames.
|
|
118
|
+
|
|
119
|
+
## Roadmap
|
|
120
|
+
|
|
121
|
+
- OCR fallback for image-only scans (probably via optional OCRmyPDF integration)
|
|
122
|
+
- Non-English page-number and date patterns
|
|
123
|
+
- Font-size and layout signals from the PDF content stream (a big bold first line is a strong title hint)
|
|
124
|
+
- `--interactive` mode to accept, reject, or move proposed cuts
|
|
125
|
+
|
|
126
|
+
## Contributing
|
|
127
|
+
|
|
128
|
+
The single most valuable contribution is a real-world PDF that `unstaple` gets wrong, anonymized, with a note about where the cuts should have been. See [CONTRIBUTING.md](CONTRIBUTING.md).
|
|
129
|
+
|
|
130
|
+
## License
|
|
131
|
+
|
|
132
|
+
[MIT](LICENSE), Copyright (c) 2026 Ben Malaga.
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=77"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "unstaple"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Split one fat scanned PDF of stapled-together documents into correctly-bounded, sensibly-named separate PDFs."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = "MIT"
|
|
12
|
+
license-files = ["LICENSE"]
|
|
13
|
+
authors = [{ name = "Ben Malaga", email = "benmalaga03@gmail.com" }]
|
|
14
|
+
keywords = ["pdf", "split", "scan", "documents", "paperless", "cli", "archiving"]
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Development Status :: 4 - Beta",
|
|
17
|
+
"Environment :: Console",
|
|
18
|
+
"Intended Audience :: End Users/Desktop",
|
|
19
|
+
"Operating System :: OS Independent",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Programming Language :: Python :: 3.10",
|
|
22
|
+
"Programming Language :: Python :: 3.11",
|
|
23
|
+
"Programming Language :: Python :: 3.12",
|
|
24
|
+
"Programming Language :: Python :: 3.13",
|
|
25
|
+
"Topic :: Office/Business",
|
|
26
|
+
"Topic :: Utilities",
|
|
27
|
+
]
|
|
28
|
+
dependencies = ["pypdf>=4.0"]
|
|
29
|
+
|
|
30
|
+
[project.optional-dependencies]
|
|
31
|
+
dev = ["pytest>=7.0", "reportlab>=4.0"]
|
|
32
|
+
|
|
33
|
+
[project.scripts]
|
|
34
|
+
unstaple = "unstaple.cli:main"
|
|
35
|
+
|
|
36
|
+
[project.urls]
|
|
37
|
+
Homepage = "https://github.com/BenMalaga/unstaple"
|
|
38
|
+
Issues = "https://github.com/BenMalaga/unstaple/issues"
|
|
39
|
+
|
|
40
|
+
[tool.setuptools.packages.find]
|
|
41
|
+
where = ["src"]
|
|
42
|
+
|
|
43
|
+
[tool.pytest.ini_options]
|
|
44
|
+
testpaths = ["tests"]
|
unstaple-0.1.0/setup.cfg
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
"""Command line interface for unstaple."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import sys
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from . import __version__
|
|
10
|
+
from .naming import infer_stem, uniquify
|
|
11
|
+
from .scoring import DEFAULT_THRESHOLD, find_boundaries, segments
|
|
12
|
+
from .splitter import NoTextLayerError, load_features, write_segment
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def build_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the ``unstaple`` command.

    Returns:
        A parser that accepts the positional ``pdf`` path plus the
        ``--dry-run``, ``-o/--out-dir``, ``--threshold`` and ``--version``
        options.
    """

    def threshold_value(text: str) -> float:
        """Parse a ``--threshold`` argument, rejecting values outside [0, 1]."""
        try:
            value = float(text)
        except ValueError:
            raise argparse.ArgumentTypeError(f"invalid float value: {text!r}") from None
        # The negated range test also rejects NaN, which fails every comparison.
        if not 0.0 <= value <= 1.0:
            raise argparse.ArgumentTypeError(f"must be between 0 and 1, got {text!r}")
        return value

    parser = argparse.ArgumentParser(
        prog="unstaple",
        description=(
            "Split one fat scanned PDF of stapled-together documents into "
            "correctly-bounded, sensibly-named separate PDFs."
        ),
    )
    parser.add_argument("pdf", type=Path, help="the stapled-together PDF to split")
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="print proposed cut points, confidence, and names; write nothing",
    )
    parser.add_argument(
        "-o",
        "--out-dir",
        type=Path,
        default=Path("unstapled"),
        help="output directory for split PDFs (default: ./unstapled)",
    )
    parser.add_argument(
        "--threshold",
        # Enforce the range the help text advertises instead of accepting any float.
        type=threshold_value,
        default=DEFAULT_THRESHOLD,
        metavar="T",
        help=f"cut confidence threshold in [0,1] (default: {DEFAULT_THRESHOLD})",
    )
    parser.add_argument("--version", action="version", version=f"unstaple {__version__}")
    return parser
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def main(argv: list[str] | None = None) -> int:
    """Run the ``unstaple`` command line tool.

    Args:
        argv: Arguments to parse; ``None`` lets argparse read them from
            ``sys.argv``.

    Returns:
        ``0`` on success (including a dry run, or when no boundary is
        found and nothing is written); ``2`` when the input file is
        missing, cannot be loaded, or an output file cannot be written.
    """
    args = build_parser().parse_args(argv)

    if not args.pdf.is_file():
        print(f"unstaple: error: {args.pdf}: no such file", file=sys.stderr)
        return 2

    try:
        reader, features = load_features(args.pdf)
    except NoTextLayerError as exc:
        print(f"unstaple: error: {exc}", file=sys.stderr)
        return 2
    except Exception as exc:
        # Top-level boundary: report any other load failure as a one-line
        # error rather than a traceback.
        print(f"unstaple: error: {args.pdf}: {exc}", file=sys.stderr)
        return 2

    num_pages = len(features)
    boundaries = find_boundaries(features, threshold=args.threshold)
    ranges = segments(num_pages, boundaries)
    n_docs = len(ranges)

    page_plural = "s" if num_pages != 1 else ""
    doc_plural = "s" if n_docs != 1 else ""
    print(f"{args.pdf}: {num_pages} page{page_plural}, {n_docs} document{doc_plural} detected")

    if not boundaries:
        print("no document boundaries found; nothing to unstaple "
              "(try a lower --threshold, or this may be a single document)")
        return 0

    print()
    print("proposed cuts:")
    for b in boundaries:
        reasons = "; ".join(b.reasons)
        # page_index is shown 1-based to the user.
        print(f" cut before page {b.page_index + 1:<3} confidence {b.confidence:.2f} [{reasons}]")

    # Each output is named from the features of the first page of its range.
    feature_by_index = {f.index: f for f in features}
    stems = uniquify(
        [infer_stem(feature_by_index[start], i + 1) for i, (start, _) in enumerate(ranges)]
    )
    names = [f"{stem}.pdf" for stem in stems]

    print()
    print("documents:")
    for (start, end), name in zip(ranges, names):
        pages = f"pages {start + 1}-{end}" if end - start > 1 else f"page {start + 1}"
        print(f" {pages:<12} -> {name}")

    if args.dry_run:
        print()
        print("dry run: nothing written")
        return 0

    print()
    for (start, end), name in zip(ranges, names):
        out_path = args.out_dir / name
        try:
            write_segment(reader, start, end, out_path)
        except OSError as exc:
            # Report write failures (permissions, full disk, bad path) the
            # same way load failures are reported, instead of a traceback.
            print(f"unstaple: error: cannot write {out_path}: {exc}", file=sys.stderr)
            return 2
        print(f"wrote {out_path}")
    return 0
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
if __name__ == "__main__":  # pragma: no cover
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())
|