pdf-concatenator 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdf_concatenator-1.0.0/LICENSE +21 -0
- pdf_concatenator-1.0.0/PKG-INFO +155 -0
- pdf_concatenator-1.0.0/README.md +123 -0
- pdf_concatenator-1.0.0/pyproject.toml +51 -0
- pdf_concatenator-1.0.0/setup.cfg +4 -0
- pdf_concatenator-1.0.0/src/pdf_concatenator/__init__.py +3 -0
- pdf_concatenator-1.0.0/src/pdf_concatenator/cli.py +192 -0
- pdf_concatenator-1.0.0/src/pdf_concatenator/config.py +80 -0
- pdf_concatenator-1.0.0/src/pdf_concatenator/discovery.py +79 -0
- pdf_concatenator-1.0.0/src/pdf_concatenator/llm.py +107 -0
- pdf_concatenator-1.0.0/src/pdf_concatenator/pdf_build.py +292 -0
- pdf_concatenator-1.0.0/src/pdf_concatenator/sidecar.py +69 -0
- pdf_concatenator-1.0.0/src/pdf_concatenator/summaries.py +41 -0
- pdf_concatenator-1.0.0/src/pdf_concatenator.egg-info/PKG-INFO +155 -0
- pdf_concatenator-1.0.0/src/pdf_concatenator.egg-info/SOURCES.txt +23 -0
- pdf_concatenator-1.0.0/src/pdf_concatenator.egg-info/dependency_links.txt +1 -0
- pdf_concatenator-1.0.0/src/pdf_concatenator.egg-info/entry_points.txt +2 -0
- pdf_concatenator-1.0.0/src/pdf_concatenator.egg-info/requires.txt +10 -0
- pdf_concatenator-1.0.0/src/pdf_concatenator.egg-info/top_level.txt +1 -0
- pdf_concatenator-1.0.0/tests/test_cli.py +282 -0
- pdf_concatenator-1.0.0/tests/test_config.py +98 -0
- pdf_concatenator-1.0.0/tests/test_discovery.py +82 -0
- pdf_concatenator-1.0.0/tests/test_llm.py +149 -0
- pdf_concatenator-1.0.0/tests/test_pdf_build.py +235 -0
- pdf_concatenator-1.0.0/tests/test_sidecar.py +127 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Lorenzo Wood
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pdf-concatenator
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Concatenate PDFs with table of contents, cover pages, and optional LLM summaries
|
|
5
|
+
Author: Lorenzo Wood
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/lorenzowood/pdf-concatenator
|
|
8
|
+
Project-URL: Repository, https://github.com/lorenzowood/pdf-concatenator
|
|
9
|
+
Project-URL: Issues, https://github.com/lorenzowood/pdf-concatenator/issues
|
|
10
|
+
Keywords: pdf,concatenate,merge,toc,llm
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Environment :: Console
|
|
13
|
+
Classifier: Intended Audience :: End Users/Desktop
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Topic :: Office/Business
|
|
19
|
+
Requires-Python: >=3.11
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: httpx>=0.27
|
|
23
|
+
Requires-Dist: pypdf>=4.0
|
|
24
|
+
Requires-Dist: reportlab>=4.0
|
|
25
|
+
Requires-Dist: tqdm>=4.66
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: build>=1.0; extra == "dev"
|
|
28
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
29
|
+
Requires-Dist: pytest-mock>=3.12; extra == "dev"
|
|
30
|
+
Requires-Dist: twine>=5.0; extra == "dev"
|
|
31
|
+
Dynamic: license-file
|
|
32
|
+
|
|
33
|
+
# pdf-concatenator
|
|
34
|
+
|
|
35
|
+
Bundle many PDFs into a single submission-ready document.
|
|
36
|
+
|
|
37
|
+
This tool was built to pull together a large set of PDFs for a **contract submission**: one combined file with a table of contents, cover pages, and optional short summaries so reviewers can navigate the bundle easily.
|
|
38
|
+
|
|
39
|
+
## Features
|
|
40
|
+
|
|
41
|
+
- Recursively discover PDFs from a directory or glob pattern
|
|
42
|
+
- Sort files by path and concatenate them into one output PDF
|
|
43
|
+
- Generate a **table of contents** with folder structure, page numbers, and alternating row shading
|
|
44
|
+
- Insert a **cover page** before each source PDF (path, optional summary, page number)
|
|
45
|
+
- Optionally generate **LLM summaries** via a sidecar file per PDF (`*.pdf.sidecar.json`)
|
|
46
|
+
- Regenerate sidecars without concatenating (`--regenerate-summaries`)
|
|
47
|
+
- Exclude specific files or patterns (`--exclude`)
|
|
48
|
+
- Progress bar while summaries are processed
|
|
49
|
+
|
|
50
|
+
## Installation
|
|
51
|
+
|
|
52
|
+
With [pipx](https://pipx.pypa.io/) (recommended):
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
pipx install pdf-concatenator
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
With pip:
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
pip install pdf-concatenator
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
For development:
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
git clone https://github.com/lorenzowood/pdf-concatenator.git
|
|
68
|
+
cd pdf-concatenator
|
|
69
|
+
python -m venv .venv
|
|
70
|
+
source .venv/bin/activate
|
|
71
|
+
pip install -e ".[dev]"
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## Quick start
|
|
75
|
+
|
|
76
|
+
Concatenate all PDFs under a folder:
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
pdf-concatenator -o submission.pdf contracts/
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
With summaries (requires LLM config — see below):
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
pdf-concatenator -o submission.pdf --include-summaries contracts/
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
Regenerate sidecar summaries only:
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
pdf-concatenator --regenerate-summaries contracts/
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
Exclude files:
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
pdf-concatenator -o submission.pdf \
|
|
98
|
+
--exclude "drafts/*" \
|
|
99
|
+
--exclude "broken.pdf" \
|
|
100
|
+
contracts/
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
Patterns can be a directory (all PDFs beneath it) or a glob, e.g. `contracts/**/*.pdf`.
|
|
104
|
+
|
|
105
|
+
## LLM configuration
|
|
106
|
+
|
|
107
|
+
When using `--include-summaries` or `--regenerate-summaries`, create a plain-text config file (not a directory) at `~/.config/pdf-concatenator`:
|
|
108
|
+
|
|
109
|
+
```ini
|
|
110
|
+
LLM_API=open_ai
|
|
111
|
+
LLM_SERVER=127.0.0.1:28911
|
|
112
|
+
LLM_API_KEY=your-api-key
|
|
113
|
+
LLM_MODEL=your-model-id
|
|
114
|
+
LLM_PROMPT_TITLE_AND_SUMMARY=Your prompt here
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
The server should expose an OpenAI-compatible `/v1/chat/completions` endpoint. The whole PDF is sent to the model. If the prompt key is missing but everything else is valid, a default prompt is written to the config file.
|
|
118
|
+
|
|
119
|
+
Summaries are stored beside each PDF as `document.pdf.sidecar.json` and reused when the file hash matches.
|
|
120
|
+
|
|
121
|
+
## Output structure
|
|
122
|
+
|
|
123
|
+
1. **Contents** — tree of folders and files; page numbers point to each document's cover page. When summaries are included, a disclaimer appears in the footer.
|
|
124
|
+
2. **Cover page** per PDF — relative path, optional summary, page number.
|
|
125
|
+
3. **Original PDF pages** — unchanged (no added page numbers).
|
|
126
|
+
|
|
127
|
+
If any PDF cannot be read, or summary generation fails when required, the run aborts and no output file is produced.
|
|
128
|
+
|
|
129
|
+
## Options
|
|
130
|
+
|
|
131
|
+
```
|
|
132
|
+
usage: pdf-concatenator [-h] [-o filename] [--include-summaries]
|
|
133
|
+
[--regenerate-summaries] [--exclude pattern]
|
|
134
|
+
[--config CONFIG] [--verbose]
|
|
135
|
+
pattern
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
| Option | Description |
|
|
139
|
+
|--------|-------------|
|
|
140
|
+
| `-o`, `--output` | Output PDF path (required unless `--regenerate-summaries`) |
|
|
141
|
+
| `--include-summaries` | Include summaries in contents and cover pages |
|
|
142
|
+
| `--regenerate-summaries` | Regenerate sidecar files only; do not concatenate |
|
|
143
|
+
| `--exclude` | Glob pattern to exclude (repeatable) |
|
|
144
|
+
| `--config` | Path to LLM config (default: `~/.config/pdf-concatenator`) |
|
|
145
|
+
| `--verbose` | Show library warnings while reading/merging PDFs |
|
|
146
|
+
|
|
147
|
+
## Development
|
|
148
|
+
|
|
149
|
+
```bash
|
|
150
|
+
pytest
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
## License
|
|
154
|
+
|
|
155
|
+
MIT
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
# pdf-concatenator
|
|
2
|
+
|
|
3
|
+
Bundle many PDFs into a single submission-ready document.
|
|
4
|
+
|
|
5
|
+
This tool was built to pull together a large set of PDFs for a **contract submission**: one combined file with a table of contents, cover pages, and optional short summaries so reviewers can navigate the bundle easily.
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
- Recursively discover PDFs from a directory or glob pattern
|
|
10
|
+
- Sort files by path and concatenate them into one output PDF
|
|
11
|
+
- Generate a **table of contents** with folder structure, page numbers, and alternating row shading
|
|
12
|
+
- Insert a **cover page** before each source PDF (path, optional summary, page number)
|
|
13
|
+
- Optionally generate **LLM summaries** via a sidecar file per PDF (`*.pdf.sidecar.json`)
|
|
14
|
+
- Regenerate sidecars without concatenating (`--regenerate-summaries`)
|
|
15
|
+
- Exclude specific files or patterns (`--exclude`)
|
|
16
|
+
- Progress bar while summaries are processed
|
|
17
|
+
|
|
18
|
+
## Installation
|
|
19
|
+
|
|
20
|
+
With [pipx](https://pipx.pypa.io/) (recommended):
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
pipx install pdf-concatenator
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
With pip:
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install pdf-concatenator
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
For development:
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
git clone https://github.com/lorenzowood/pdf-concatenator.git
|
|
36
|
+
cd pdf-concatenator
|
|
37
|
+
python -m venv .venv
|
|
38
|
+
source .venv/bin/activate
|
|
39
|
+
pip install -e ".[dev]"
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Quick start
|
|
43
|
+
|
|
44
|
+
Concatenate all PDFs under a folder:
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
pdf-concatenator -o submission.pdf contracts/
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
With summaries (requires LLM config — see below):
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
pdf-concatenator -o submission.pdf --include-summaries contracts/
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
Regenerate sidecar summaries only:
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
pdf-concatenator --regenerate-summaries contracts/
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
Exclude files:
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
pdf-concatenator -o submission.pdf \
|
|
66
|
+
--exclude "drafts/*" \
|
|
67
|
+
--exclude "broken.pdf" \
|
|
68
|
+
contracts/
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Patterns can be a directory (all PDFs beneath it) or a glob, e.g. `contracts/**/*.pdf`.
|
|
72
|
+
|
|
73
|
+
## LLM configuration
|
|
74
|
+
|
|
75
|
+
When using `--include-summaries` or `--regenerate-summaries`, create a plain-text config file (not a directory) at `~/.config/pdf-concatenator`:
|
|
76
|
+
|
|
77
|
+
```ini
|
|
78
|
+
LLM_API=open_ai
|
|
79
|
+
LLM_SERVER=127.0.0.1:28911
|
|
80
|
+
LLM_API_KEY=your-api-key
|
|
81
|
+
LLM_MODEL=your-model-id
|
|
82
|
+
LLM_PROMPT_TITLE_AND_SUMMARY=Your prompt here
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
The server should expose an OpenAI-compatible `/v1/chat/completions` endpoint. The whole PDF is sent to the model. If the prompt key is missing but everything else is valid, a default prompt is written to the config file.
|
|
86
|
+
|
|
87
|
+
Summaries are stored beside each PDF as `document.pdf.sidecar.json` and reused when the file hash matches.
|
|
88
|
+
|
|
89
|
+
## Output structure
|
|
90
|
+
|
|
91
|
+
1. **Contents** — tree of folders and files; page numbers point to each document's cover page. When summaries are included, a disclaimer appears in the footer.
|
|
92
|
+
2. **Cover page** per PDF — relative path, optional summary, page number.
|
|
93
|
+
3. **Original PDF pages** — unchanged (no added page numbers).
|
|
94
|
+
|
|
95
|
+
If any PDF cannot be read, or summary generation fails when required, the run aborts and no output file is produced.
|
|
96
|
+
|
|
97
|
+
## Options
|
|
98
|
+
|
|
99
|
+
```
|
|
100
|
+
usage: pdf-concatenator [-h] [-o filename] [--include-summaries]
|
|
101
|
+
[--regenerate-summaries] [--exclude pattern]
|
|
102
|
+
[--config CONFIG] [--verbose]
|
|
103
|
+
pattern
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
| Option | Description |
|
|
107
|
+
|--------|-------------|
|
|
108
|
+
| `-o`, `--output` | Output PDF path (required unless `--regenerate-summaries`) |
|
|
109
|
+
| `--include-summaries` | Include summaries in contents and cover pages |
|
|
110
|
+
| `--regenerate-summaries` | Regenerate sidecar files only; do not concatenate |
|
|
111
|
+
| `--exclude` | Glob pattern to exclude (repeatable) |
|
|
112
|
+
| `--config` | Path to LLM config (default: `~/.config/pdf-concatenator`) |
|
|
113
|
+
| `--verbose` | Show library warnings while reading/merging PDFs |
|
|
114
|
+
|
|
115
|
+
## Development
|
|
116
|
+
|
|
117
|
+
```bash
|
|
118
|
+
pytest
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
## License
|
|
122
|
+
|
|
123
|
+
MIT
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "pdf-concatenator"
|
|
3
|
+
version = "1.0.0"
|
|
4
|
+
description = "Concatenate PDFs with table of contents, cover pages, and optional LLM summaries"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
license = "MIT"
|
|
7
|
+
requires-python = ">=3.11"
|
|
8
|
+
authors = [{ name = "Lorenzo Wood" }]
|
|
9
|
+
keywords = ["pdf", "concatenate", "merge", "toc", "llm"]
|
|
10
|
+
classifiers = [
|
|
11
|
+
"Development Status :: 4 - Beta",
|
|
12
|
+
"Environment :: Console",
|
|
13
|
+
"Intended Audience :: End Users/Desktop",
|
|
14
|
+
"Programming Language :: Python :: 3",
|
|
15
|
+
"Programming Language :: Python :: 3.11",
|
|
16
|
+
"Programming Language :: Python :: 3.12",
|
|
17
|
+
"Programming Language :: Python :: 3.13",
|
|
18
|
+
"Topic :: Office/Business",
|
|
19
|
+
]
|
|
20
|
+
dependencies = [
|
|
21
|
+
"httpx>=0.27",
|
|
22
|
+
"pypdf>=4.0",
|
|
23
|
+
"reportlab>=4.0",
|
|
24
|
+
"tqdm>=4.66",
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
[project.urls]
|
|
28
|
+
Homepage = "https://github.com/lorenzowood/pdf-concatenator"
|
|
29
|
+
Repository = "https://github.com/lorenzowood/pdf-concatenator"
|
|
30
|
+
Issues = "https://github.com/lorenzowood/pdf-concatenator/issues"
|
|
31
|
+
|
|
32
|
+
[project.optional-dependencies]
|
|
33
|
+
dev = [
|
|
34
|
+
"build>=1.0",
|
|
35
|
+
"pytest>=8.0",
|
|
36
|
+
"pytest-mock>=3.12",
|
|
37
|
+
"twine>=5.0",
|
|
38
|
+
]
|
|
39
|
+
|
|
40
|
+
[project.scripts]
|
|
41
|
+
pdf-concatenator = "pdf_concatenator.cli:main"
|
|
42
|
+
|
|
43
|
+
[build-system]
|
|
44
|
+
requires = ["setuptools>=61"]
|
|
45
|
+
build-backend = "setuptools.build_meta"
|
|
46
|
+
|
|
47
|
+
[tool.setuptools.packages.find]
|
|
48
|
+
where = ["src"]
|
|
49
|
+
|
|
50
|
+
[tool.pytest.ini_options]
|
|
51
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import logging
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from pdf_concatenator.config import ConfigError, DEFAULT_CONFIG_PATH
|
|
9
|
+
from pdf_concatenator.discovery import DiscoveredPdf, discover_pdfs
|
|
10
|
+
from pdf_concatenator.llm import LlmError
|
|
11
|
+
from pdf_concatenator.pdf_build import DocumentInfo, PdfBuildError, build_concatenated_pdf
|
|
12
|
+
from pdf_concatenator.summaries import load_llm_config, resolve_sidecar
|
|
13
|
+
from tqdm import tqdm
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def build_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the ``pdf-concatenator`` command.

    Returns:
        A parser that accepts ``-o/--output``, ``--include-summaries``,
        ``--regenerate-summaries``, a repeatable ``--exclude``, ``--config``
        and ``--verbose``, followed by one positional ``pattern``.
    """
    cli = argparse.ArgumentParser(
        prog="pdf-concatenator",
        description="Concatenate PDFs with a table of contents and optional summaries.",
    )
    add_option = cli.add_argument

    # -o is deliberately not declared as required at the argparse level; the
    # help text explains that it is optional with --regenerate-summaries.
    add_option(
        "-o",
        "--output",
        metavar="filename",
        help="Output PDF filename (required unless --regenerate-summaries)",
    )

    # Simple on/off switches controlling summary handling.
    add_option(
        "--include-summaries",
        action="store_true",
        help="Include summaries in the table of contents and cover pages",
    )
    add_option(
        "--regenerate-summaries",
        action="store_true",
        help="Regenerate sidecar summary files only; do not concatenate",
    )

    # Each occurrence of --exclude adds one more pattern to the list.
    add_option(
        "--exclude",
        action="append",
        default=[],
        metavar="pattern",
        help="Exclude files matching pattern (may be repeated)",
    )

    add_option(
        "--config",
        default=str(DEFAULT_CONFIG_PATH),
        help="Path to LLM config file",
    )
    add_option(
        "--verbose",
        action="store_true",
        help="Show library warnings while reading and merging PDFs",
    )

    add_option("pattern", help="Directory or glob pattern for PDF files")
    return cli
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _configure_logging(verbose: bool) -> None:
    """Set the threshold of the ``pypdf`` logger.

    Args:
        verbose: When true the logger is set to ``WARNING``; otherwise it is
            set to ``ERROR`` so that warnings are suppressed.
    """
    if verbose:
        threshold = logging.WARNING
    else:
        threshold = logging.ERROR
    pypdf_logger = logging.getLogger("pypdf")
    pypdf_logger.setLevel(threshold)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _summary_progress(
    pdfs: list[DiscoveredPdf],
    *,
    disable: bool | None = None,
):
    """Wrap *pdfs* in a ``tqdm`` progress bar that reports on stderr.

    Args:
        pdfs: The items to iterate over; its length is used as the bar total.
        disable: Passed to ``tqdm`` as its ``disable`` option. When ``None``,
            the bar is disabled unless ``sys.stderr`` is a terminal.

    Returns:
        The ``tqdm`` object wrapping *pdfs*, labelled "Summaries".
    """
    # Only consult the terminal when the caller expressed no preference.
    hide_bar = (not sys.stderr.isatty()) if disable is None else disable
    bar_options = {
        "desc": "Summaries",
        "unit": "pdf",
        "total": len(pdfs),
        "disable": hide_bar,
        "file": sys.stderr,
    }
    return tqdm(pdfs, **bar_options)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def main(argv: list[str] | None = None) -> int:
    """Run the command-line interface and return a process exit status.

    Args:
        argv: Argument list handed to the parser; ``None`` is passed through
            to ``parse_args`` unchanged.

    Returns:
        The integer exit code argparse tried to exit with (or 1 if that code
        was not an integer), 2 when ``--output`` is missing or is combined
        with ``--regenerate-summaries``, otherwise the status returned by the
        selected mode.
    """
    parser = build_parser()
    try:
        args = parser.parse_args(argv)
    except SystemExit as stop:
        # argparse raises SystemExit itself; turn that into a return value
        # so callers of main() always get a status back.
        status = stop.code
        if isinstance(status, int):
            return int(status)
        return 1

    _configure_logging(args.verbose)

    if not args.regenerate_summaries:
        # Normal mode: an output file is mandatory.
        if args.output:
            return _concatenate(args)
        print(
            "error: the following arguments are required: -o/--output",
            file=sys.stderr,
        )
        return 2

    # Regeneration mode writes no combined PDF, so -o is rejected.
    if not args.output:
        return _regenerate_summaries(args)
    print(
        "error: --output cannot be used with --regenerate-summaries",
        file=sys.stderr,
    )
    return 2
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _discover(args: argparse.Namespace):
    """Find the PDFs selected by the command-line arguments.

    Args:
        args: Parsed arguments; ``pattern`` and ``exclude`` are read.

    Returns:
        Whatever ``discover_pdfs`` returned when it is non-empty. When it is
        empty, a message is printed to stderr and ``None`` is returned.
    """
    found = discover_pdfs(args.pattern, excludes=args.exclude)
    if found:
        return found
    print("No PDF files matched pattern.", file=sys.stderr)
    return None
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def _regenerate_summaries(args: argparse.Namespace) -> int:
    """Call ``resolve_sidecar`` with ``force=True`` for every discovered PDF.

    Args:
        args: Parsed arguments; ``pattern``, ``exclude`` and ``config`` are
            used.

    Returns:
        0 when every PDF was processed; 1 when no PDFs were found, the LLM
        config could not be loaded, or an ``LlmError`` was raised (processing
        stops at the first such error).
    """
    targets = _discover(args)
    if targets is None:
        return 1

    config_file = Path(args.config)
    try:
        llm_config = load_llm_config(config_file)
    except ConfigError as problem:
        print(str(problem), file=sys.stderr)
        return 1

    for item in _summary_progress(targets):
        try:
            resolve_sidecar(item.path, llm_config, force=True)
        except LlmError as failure:
            # Abort on the first failure; remaining PDFs are not attempted.
            print(str(failure), file=sys.stderr)
            return 1

    return 0
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def _concatenate(args: argparse.Namespace) -> int:
    """Collect document details and build the combined output PDF.

    Args:
        args: Parsed arguments; ``pattern``, ``exclude``, ``output``,
            ``config`` and ``include_summaries`` are used.

    Returns:
        0 on success; 1 when no PDFs were found, the LLM config could not be
        loaded, an ``LlmError`` was raised while resolving a sidecar, or the
        build raised ``PdfBuildError``.
    """
    pdfs = _discover(args)
    if pdfs is None:
        return 1

    with_summaries = args.include_summaries
    destination = Path(args.output)

    # The LLM config is only needed (and only loaded) when summaries are on.
    llm_config = None
    if with_summaries:
        try:
            llm_config = load_llm_config(Path(args.config))
        except ConfigError as problem:
            print(str(problem), file=sys.stderr)
            return 1

    # A progress bar is shown only while summaries are being resolved;
    # _discover never hands back an empty result, so pdfs is non-empty here.
    sources = _summary_progress(pdfs) if with_summaries else pdfs

    documents: list[DocumentInfo] = []
    for entry in sources:
        if with_summaries:
            assert llm_config is not None
            try:
                sidecar = resolve_sidecar(entry.path, llm_config, force=False)
            except LlmError as failure:
                print(str(failure), file=sys.stderr)
                return 1
            summary: str | None = sidecar.summary
            title = sidecar.title
        else:
            # Without summaries the file stem serves as the title.
            summary = None
            title = entry.path.stem

        info = DocumentInfo(
            path=entry.path,
            relative_path=entry.relative_path,
            title=title,
            summary=summary,
        )
        documents.append(info)

    try:
        build_concatenated_pdf(
            documents,
            destination,
            include_summaries=with_summaries,
        )
    except PdfBuildError as failure:
        print(str(failure), file=sys.stderr)
        # Make sure no file is left at the output path after a failed build.
        if destination.exists():
            destination.unlink()
        return 1

    return 0
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
# Allow running this module directly; main()'s return value becomes the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
# Config file location used when the caller does not supply a path.
DEFAULT_CONFIG_PATH = Path.home().joinpath(".config", "pdf-concatenator")

# Prompt written to the config file, and used, when none is configured.
DEFAULT_PROMPT = (
    "You are summarising a PDF document. "
    "Given the filename, metadata, and PDF below, "
    "produce a concise title and a summary under 100 words "
    "(ideally one sentence, but use more only if needed). "
    "Respond with JSON only: "
    '{"title": "...", "summary": "..."}'
)

# Keys that must be present with a non-empty value in the config file.
REQUIRED_KEYS = (
    "LLM_API",
    "LLM_SERVER",
    "LLM_API_KEY",
    "LLM_MODEL",
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class ConfigError(Exception):
    """Signals a problem with the LLM configuration file.

    Raised in this module when the file is missing, a required key is absent
    or empty, or the default prompt cannot be written back.
    """
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass(frozen=True)
class LlmConfig:
    """Immutable set of LLM settings read from the config file."""

    # Value of the LLM_API key.
    api: str
    # Value of the LLM_SERVER key.
    server: str
    # Value of the LLM_API_KEY key.
    api_key: str
    # Value of the LLM_MODEL key.
    model: str
    # Value of the LLM_PROMPT_TITLE_AND_SUMMARY key; may be an empty string.
    prompt: str
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _parse_config_text(text: str) -> dict[str, str]:
    """Parse ``KEY=VALUE`` lines into a dictionary.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    skipped. Each remaining line is split at its first ``=``; key and value
    are stripped of surrounding whitespace. A repeated key keeps its last
    value.

    Args:
        text: Full contents of a config file.

    Returns:
        Mapping from key to value, both as strings.
    """
    stripped_lines = (raw.strip() for raw in text.splitlines())
    assignments = (
        entry.partition("=")
        for entry in stripped_lines
        if entry and not entry.startswith("#") and "=" in entry
    )
    return {name.strip(): setting.strip() for name, _, setting in assignments}
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def load_config(path: Path = DEFAULT_CONFIG_PATH) -> LlmConfig:
    """Load and validate the LLM settings stored at *path*.

    Args:
        path: Location of the config file.

    Returns:
        An ``LlmConfig`` built from the file. ``prompt`` is an empty string
        when ``LLM_PROMPT_TITLE_AND_SUMMARY`` is not set.

    Raises:
        ConfigError: If the file does not exist, cannot be read or decoded
            (for example because the path is a directory or permissions are
            insufficient), or if any key in ``REQUIRED_KEYS`` is missing or
            empty.
    """
    if not path.exists():
        raise ConfigError(f"Config file not found: {path}")

    # Read failures are reported as ConfigError so that callers which only
    # handle ConfigError show a clean message instead of a traceback.
    try:
        text = path.read_text()
    except (OSError, UnicodeDecodeError) as exc:
        raise ConfigError(f"Failed to read config file {path}: {exc}") from exc

    values = _parse_config_text(text)
    for key in REQUIRED_KEYS:
        # A key that is present but empty counts as missing.
        if not values.get(key):
            raise ConfigError(f"Missing required config key: {key}")

    return LlmConfig(
        api=values["LLM_API"],
        server=values["LLM_SERVER"],
        api_key=values["LLM_API_KEY"],
        model=values["LLM_MODEL"],
        prompt=values.get("LLM_PROMPT_TITLE_AND_SUMMARY", ""),
    )
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def ensure_prompt(config: LlmConfig, path: Path = DEFAULT_CONFIG_PATH) -> LlmConfig:
    """Return a config that has a prompt, persisting the default if needed.

    When *config* already has a non-empty prompt it is returned unchanged.
    Otherwise a ``LLM_PROMPT_TITLE_AND_SUMMARY`` line holding
    ``DEFAULT_PROMPT`` is appended to the file at *path* and a new
    ``LlmConfig`` carrying that prompt is returned.

    Args:
        config: Settings previously loaded from the config file.
        path: Config file to append the default prompt to.

    Returns:
        *config* itself, or a copy whose ``prompt`` is ``DEFAULT_PROMPT``.

    Raises:
        ConfigError: If reading or writing the file raises ``OSError``.
    """
    if config.prompt:
        return config

    prompt_line = f"LLM_PROMPT_TITLE_AND_SUMMARY={DEFAULT_PROMPT}\n"
    try:
        current = path.read_text()
        # Start the new entry on its own line when the file lacks a final
        # newline.
        needs_break = bool(current) and not current.endswith("\n")
        separator = "\n" if needs_break else ""
        path.write_text(current + separator + prompt_line)
    except OSError as exc:
        raise ConfigError(f"Failed to write default prompt to config: {exc}") from exc

    return LlmConfig(
        api=config.api,
        server=config.server,
        api_key=config.api_key,
        model=config.model,
        prompt=DEFAULT_PROMPT,
    )
|