pdf-concatenator 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Lorenzo Wood
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,155 @@
1
+ Metadata-Version: 2.4
2
+ Name: pdf-concatenator
3
+ Version: 1.0.0
4
+ Summary: Concatenate PDFs with table of contents, cover pages, and optional LLM summaries
5
+ Author: Lorenzo Wood
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/lorenzowood/pdf-concatenator
8
+ Project-URL: Repository, https://github.com/lorenzowood/pdf-concatenator
9
+ Project-URL: Issues, https://github.com/lorenzowood/pdf-concatenator/issues
10
+ Keywords: pdf,concatenate,merge,toc,llm
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Environment :: Console
13
+ Classifier: Intended Audience :: End Users/Desktop
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Topic :: Office/Business
19
+ Requires-Python: >=3.11
20
+ Description-Content-Type: text/markdown
21
+ License-File: LICENSE
22
+ Requires-Dist: httpx>=0.27
23
+ Requires-Dist: pypdf>=4.0
24
+ Requires-Dist: reportlab>=4.0
25
+ Requires-Dist: tqdm>=4.66
26
+ Provides-Extra: dev
27
+ Requires-Dist: build>=1.0; extra == "dev"
28
+ Requires-Dist: pytest>=8.0; extra == "dev"
29
+ Requires-Dist: pytest-mock>=3.12; extra == "dev"
30
+ Requires-Dist: twine>=5.0; extra == "dev"
31
+ Dynamic: license-file
32
+
33
+ # pdf-concatenator
34
+
35
+ Bundle many PDFs into a single submission-ready document.
36
+
37
+ This tool was built to pull together a large set of PDFs for a **contract submission**: one combined file with a table of contents, cover pages, and optional short summaries so reviewers can navigate the bundle easily.
38
+
39
+ ## Features
40
+
41
+ - Recursively discover PDFs from a directory or glob pattern
42
+ - Sort files by path and concatenate them into one output PDF
43
+ - Generate a **table of contents** with folder structure, page numbers, and alternating row shading
44
+ - Insert a **cover page** before each source PDF (path, optional summary, page number)
45
+ - Optionally generate **LLM summaries** via a sidecar file per PDF (`*.pdf.sidecar.json`)
46
+ - Regenerate sidecars without concatenating (`--regenerate-summaries`)
47
+ - Exclude specific files or patterns (`--exclude`)
48
+ - Progress bar while summaries are processed
49
+
50
+ ## Installation
51
+
52
+ With [pipx](https://pipx.pypa.io/) (recommended):
53
+
54
+ ```bash
55
+ pipx install pdf-concatenator
56
+ ```
57
+
58
+ With pip:
59
+
60
+ ```bash
61
+ pip install pdf-concatenator
62
+ ```
63
+
64
+ For development:
65
+
66
+ ```bash
67
+ git clone https://github.com/lorenzowood/pdf-concatenator.git
68
+ cd pdf-concatenator
69
+ python -m venv .venv
70
+ source .venv/bin/activate
71
+ pip install -e ".[dev]"
72
+ ```
73
+
74
+ ## Quick start
75
+
76
+ Concatenate all PDFs under a folder:
77
+
78
+ ```bash
79
+ pdf-concatenator -o submission.pdf contracts/
80
+ ```
81
+
82
+ With summaries (requires LLM config — see below):
83
+
84
+ ```bash
85
+ pdf-concatenator -o submission.pdf --include-summaries contracts/
86
+ ```
87
+
88
+ Regenerate sidecar summaries only:
89
+
90
+ ```bash
91
+ pdf-concatenator --regenerate-summaries contracts/
92
+ ```
93
+
94
+ Exclude files:
95
+
96
+ ```bash
97
+ pdf-concatenator -o submission.pdf \
98
+ --exclude "drafts/*" \
99
+ --exclude "broken.pdf" \
100
+ contracts/
101
+ ```
102
+
103
+ The `pattern` argument can be a directory (all PDFs beneath it) or a glob, e.g. `contracts/**/*.pdf`.
104
+
105
+ ## LLM configuration
106
+
107
+ When using `--include-summaries` or `--regenerate-summaries`, create the config file `~/.config/pdf-concatenator` (a plain text file, not a directory):
108
+
109
+ ```ini
110
+ LLM_API=open_ai
111
+ LLM_SERVER=127.0.0.1:28911
112
+ LLM_API_KEY=your-api-key
113
+ LLM_MODEL=your-model-id
114
+ LLM_PROMPT_TITLE_AND_SUMMARY=Your prompt here
115
+ ```
116
+
117
+ The server should expose an OpenAI-compatible `/v1/chat/completions` endpoint. The whole PDF is sent to the model. If the prompt key is missing but everything else is valid, a default prompt is written to the config file.
118
+
119
+ Summaries are stored beside each PDF as `document.pdf.sidecar.json` and reused when the file hash matches.
120
+
121
+ ## Output structure
122
+
123
+ 1. **Contents** — tree of folders and files; page numbers point to each document's cover page. When summaries are included, a disclaimer appears in the footer.
124
+ 2. **Cover page** per PDF — relative path, optional summary, page number.
125
+ 3. **Original PDF pages** — unchanged (no added page numbers).
126
+
127
+ If any PDF cannot be read, or summary generation fails when required, the run aborts and no output file is produced.
128
+
129
+ ## Options
130
+
131
+ ```
132
+ usage: pdf-concatenator [-h] [-o filename] [--include-summaries]
133
+ [--regenerate-summaries] [--exclude pattern]
134
+ [--config CONFIG] [--verbose]
135
+ pattern
136
+ ```
137
+
138
+ | Option | Description |
139
+ |--------|-------------|
140
+ | `-o`, `--output` | Output PDF path (required unless `--regenerate-summaries`) |
141
+ | `--include-summaries` | Include summaries in contents and cover pages |
142
+ | `--regenerate-summaries` | Regenerate sidecar files only; do not concatenate |
143
+ | `--exclude` | Glob pattern to exclude (repeatable) |
144
+ | `--config` | Path to LLM config (default: `~/.config/pdf-concatenator`) |
145
+ | `--verbose` | Show library warnings while reading/merging PDFs |
146
+
147
+ ## Development
148
+
149
+ ```bash
150
+ pytest
151
+ ```
152
+
153
+ ## License
154
+
155
+ MIT
@@ -0,0 +1,123 @@
1
+ # pdf-concatenator
2
+
3
+ Bundle many PDFs into a single submission-ready document.
4
+
5
+ This tool was built to pull together a large set of PDFs for a **contract submission**: one combined file with a table of contents, cover pages, and optional short summaries so reviewers can navigate the bundle easily.
6
+
7
+ ## Features
8
+
9
+ - Recursively discover PDFs from a directory or glob pattern
10
+ - Sort files by path and concatenate them into one output PDF
11
+ - Generate a **table of contents** with folder structure, page numbers, and alternating row shading
12
+ - Insert a **cover page** before each source PDF (path, optional summary, page number)
13
+ - Optionally generate **LLM summaries** via a sidecar file per PDF (`*.pdf.sidecar.json`)
14
+ - Regenerate sidecars without concatenating (`--regenerate-summaries`)
15
+ - Exclude specific files or patterns (`--exclude`)
16
+ - Progress bar while summaries are processed
17
+
18
+ ## Installation
19
+
20
+ With [pipx](https://pipx.pypa.io/) (recommended):
21
+
22
+ ```bash
23
+ pipx install pdf-concatenator
24
+ ```
25
+
26
+ With pip:
27
+
28
+ ```bash
29
+ pip install pdf-concatenator
30
+ ```
31
+
32
+ For development:
33
+
34
+ ```bash
35
+ git clone https://github.com/lorenzowood/pdf-concatenator.git
36
+ cd pdf-concatenator
37
+ python -m venv .venv
38
+ source .venv/bin/activate
39
+ pip install -e ".[dev]"
40
+ ```
41
+
42
+ ## Quick start
43
+
44
+ Concatenate all PDFs under a folder:
45
+
46
+ ```bash
47
+ pdf-concatenator -o submission.pdf contracts/
48
+ ```
49
+
50
+ With summaries (requires LLM config — see below):
51
+
52
+ ```bash
53
+ pdf-concatenator -o submission.pdf --include-summaries contracts/
54
+ ```
55
+
56
+ Regenerate sidecar summaries only:
57
+
58
+ ```bash
59
+ pdf-concatenator --regenerate-summaries contracts/
60
+ ```
61
+
62
+ Exclude files:
63
+
64
+ ```bash
65
+ pdf-concatenator -o submission.pdf \
66
+ --exclude "drafts/*" \
67
+ --exclude "broken.pdf" \
68
+ contracts/
69
+ ```
70
+
71
+ The `pattern` argument can be a directory (all PDFs beneath it) or a glob, e.g. `contracts/**/*.pdf`.
72
+
73
+ ## LLM configuration
74
+
75
+ When using `--include-summaries` or `--regenerate-summaries`, create the config file `~/.config/pdf-concatenator` (a plain text file, not a directory):
76
+
77
+ ```ini
78
+ LLM_API=open_ai
79
+ LLM_SERVER=127.0.0.1:28911
80
+ LLM_API_KEY=your-api-key
81
+ LLM_MODEL=your-model-id
82
+ LLM_PROMPT_TITLE_AND_SUMMARY=Your prompt here
83
+ ```
84
+
85
+ The server should expose an OpenAI-compatible `/v1/chat/completions` endpoint. The whole PDF is sent to the model. If the prompt key is missing but everything else is valid, a default prompt is written to the config file.
86
+
87
+ Summaries are stored beside each PDF as `document.pdf.sidecar.json` and reused when the file hash matches.
88
+
89
+ ## Output structure
90
+
91
+ 1. **Contents** — tree of folders and files; page numbers point to each document's cover page. When summaries are included, a disclaimer appears in the footer.
92
+ 2. **Cover page** per PDF — relative path, optional summary, page number.
93
+ 3. **Original PDF pages** — unchanged (no added page numbers).
94
+
95
+ If any PDF cannot be read, or summary generation fails when required, the run aborts and no output file is produced.
96
+
97
+ ## Options
98
+
99
+ ```
100
+ usage: pdf-concatenator [-h] [-o filename] [--include-summaries]
101
+ [--regenerate-summaries] [--exclude pattern]
102
+ [--config CONFIG] [--verbose]
103
+ pattern
104
+ ```
105
+
106
+ | Option | Description |
107
+ |--------|-------------|
108
+ | `-o`, `--output` | Output PDF path (required unless `--regenerate-summaries`) |
109
+ | `--include-summaries` | Include summaries in contents and cover pages |
110
+ | `--regenerate-summaries` | Regenerate sidecar files only; do not concatenate |
111
+ | `--exclude` | Glob pattern to exclude (repeatable) |
112
+ | `--config` | Path to LLM config (default: `~/.config/pdf-concatenator`) |
113
+ | `--verbose` | Show library warnings while reading/merging PDFs |
114
+
115
+ ## Development
116
+
117
+ ```bash
118
+ pytest
119
+ ```
120
+
121
+ ## License
122
+
123
+ MIT
@@ -0,0 +1,51 @@
1
+ [project]
2
+ name = "pdf-concatenator"
3
+ version = "1.0.0"
4
+ description = "Concatenate PDFs with table of contents, cover pages, and optional LLM summaries"
5
+ readme = "README.md"
6
+ license = "MIT"
7
+ requires-python = ">=3.11"
8
+ authors = [{ name = "Lorenzo Wood" }]
9
+ keywords = ["pdf", "concatenate", "merge", "toc", "llm"]
10
+ classifiers = [
11
+ "Development Status :: 4 - Beta",
12
+ "Environment :: Console",
13
+ "Intended Audience :: End Users/Desktop",
14
+ "Programming Language :: Python :: 3",
15
+ "Programming Language :: Python :: 3.11",
16
+ "Programming Language :: Python :: 3.12",
17
+ "Programming Language :: Python :: 3.13",
18
+ "Topic :: Office/Business",
19
+ ]
20
+ dependencies = [
21
+ "httpx>=0.27",
22
+ "pypdf>=4.0",
23
+ "reportlab>=4.0",
24
+ "tqdm>=4.66",
25
+ ]
26
+
27
+ [project.urls]
28
+ Homepage = "https://github.com/lorenzowood/pdf-concatenator"
29
+ Repository = "https://github.com/lorenzowood/pdf-concatenator"
30
+ Issues = "https://github.com/lorenzowood/pdf-concatenator/issues"
31
+
32
+ [project.optional-dependencies]
33
+ dev = [
34
+ "build>=1.0",
35
+ "pytest>=8.0",
36
+ "pytest-mock>=3.12",
37
+ "twine>=5.0",
38
+ ]
39
+
40
+ [project.scripts]
41
+ pdf-concatenator = "pdf_concatenator.cli:main"
42
+
43
[build-system]
# The [project] table declares `license = "MIT"` as an SPDX expression string
# (PEP 639). Older setuptools releases only accept the table form
# ({text = ...} / {file = ...}) and fail validation, so the build requirement
# must name a release that understands license expressions.
requires = ["setuptools>=77.0.3"]
build-backend = "setuptools.build_meta"
46
+
47
+ [tool.setuptools.packages.find]
48
+ where = ["src"]
49
+
50
+ [tool.pytest.ini_options]
51
+ testpaths = ["tests"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,3 @@
1
+ """PDF concatenator with TOC, cover pages, and optional LLM summaries."""
2
+
3
+ __version__ = "1.0.0"
@@ -0,0 +1,192 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import logging
5
+ import sys
6
+ from pathlib import Path
7
+
8
+ from pdf_concatenator.config import ConfigError, DEFAULT_CONFIG_PATH
9
+ from pdf_concatenator.discovery import DiscoveredPdf, discover_pdfs
10
+ from pdf_concatenator.llm import LlmError
11
+ from pdf_concatenator.pdf_build import DocumentInfo, PdfBuildError, build_concatenated_pdf
12
+ from pdf_concatenator.summaries import load_llm_config, resolve_sidecar
13
+ from tqdm import tqdm
14
+
15
+
16
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for ``pdf-concatenator``.

    Returns:
        A parser exposing ``-o/--output``, ``--include-summaries``,
        ``--regenerate-summaries``, a repeatable ``--exclude``, ``--config``,
        ``--verbose`` and the positional ``pattern`` argument.
    """
    cli = argparse.ArgumentParser(
        prog="pdf-concatenator",
        description="Concatenate PDFs with a table of contents and optional summaries.",
    )
    add = cli.add_argument

    # --output is deliberately not marked required here: whether it must be
    # present depends on --regenerate-summaries, which main() checks itself.
    add(
        "-o",
        "--output",
        metavar="filename",
        help="Output PDF filename (required unless --regenerate-summaries)",
    )
    add(
        "--include-summaries",
        action="store_true",
        help="Include summaries in the table of contents and cover pages",
    )
    add(
        "--regenerate-summaries",
        action="store_true",
        help="Regenerate sidecar summary files only; do not concatenate",
    )
    # action="append" collects every occurrence of --exclude into one list.
    add(
        "--exclude",
        action="append",
        default=[],
        metavar="pattern",
        help="Exclude files matching pattern (may be repeated)",
    )
    add("--config", default=str(DEFAULT_CONFIG_PATH), help="Path to LLM config file")
    add(
        "--verbose",
        action="store_true",
        help="Show library warnings while reading and merging PDFs",
    )
    add("pattern", help="Directory or glob pattern for PDF files")
    return cli
56
+
57
+
58
def _configure_logging(verbose: bool) -> None:
    """Set the ``pypdf`` logger threshold.

    Args:
        verbose: When true the logger level is WARNING; otherwise it is ERROR.
    """
    pypdf_logger = logging.getLogger("pypdf")
    if verbose:
        pypdf_logger.setLevel(logging.WARNING)
    else:
        pypdf_logger.setLevel(logging.ERROR)
61
+
62
+
63
def _summary_progress(
    pdfs: list[DiscoveredPdf],
    *,
    disable: bool | None = None,
):
    """Wrap *pdfs* in a tqdm progress bar that reports to stderr.

    Args:
        pdfs: The items to iterate over; its length is used as the bar total.
        disable: Explicit on/off switch for the bar. When ``None`` the bar is
            disabled whenever stderr is not a TTY.

    Returns:
        The tqdm iterable wrapping *pdfs*.
    """
    # Only decide from the terminal when the caller expressed no preference.
    hide_bar = (not sys.stderr.isatty()) if disable is None else disable
    return tqdm(
        pdfs,
        desc="Summaries",
        unit="pdf",
        total=len(pdfs),
        disable=hide_bar,
        file=sys.stderr,
    )
78
+
79
+
80
def main(argv: list[str] | None = None) -> int:
    """Run the command-line interface and return a process exit status.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        The status from argparse when parsing exits (1 if it is not an int),
        2 for an invalid ``--output`` / ``--regenerate-summaries``
        combination, otherwise whatever the selected sub-command returns.
    """
    parser = build_parser()
    try:
        args = parser.parse_args(argv)
    except SystemExit as exit_request:
        # argparse exits on --help and on usage errors; convert that into a
        # return value so callers of main() are not terminated.
        status = exit_request.code
        if isinstance(status, int):
            return int(status)
        return 1

    _configure_logging(args.verbose)

    if not args.regenerate_summaries:
        if args.output:
            return _concatenate(args)
        print(
            "error: the following arguments are required: -o/--output",
            file=sys.stderr,
        )
        return 2

    # Regeneration writes sidecars only, so an output file makes no sense.
    if args.output:
        print(
            "error: --output cannot be used with --regenerate-summaries",
            file=sys.stderr,
        )
        return 2
    return _regenerate_summaries(args)
107
+
108
+
109
def _discover(args: argparse.Namespace):
    """Find the PDFs selected by ``args.pattern`` minus ``args.exclude``.

    Returns:
        The non-empty result of ``discover_pdfs``, or ``None`` (after printing
        a message to stderr) when nothing matched.
    """
    found = discover_pdfs(args.pattern, excludes=args.exclude)
    if found:
        return found
    print("No PDF files matched pattern.", file=sys.stderr)
    return None
115
+
116
+
117
def _regenerate_summaries(args: argparse.Namespace) -> int:
    """Handle ``--regenerate-summaries``: refresh sidecars, build no PDF.

    Returns:
        0 on success; 1 when no PDF matched, the LLM config could not be
        loaded, or resolving a sidecar raised ``LlmError``. Error text goes to
        stderr.
    """
    targets = _discover(args)
    if targets is None:
        return 1

    try:
        llm_config = load_llm_config(Path(args.config))
    except ConfigError as problem:
        print(str(problem), file=sys.stderr)
        return 1

    # The first LLM failure aborts the whole run.
    try:
        for item in _summary_progress(targets):
            # force=True: presumably bypasses any existing sidecar - confirm
            # against resolve_sidecar.
            resolve_sidecar(item.path, llm_config, force=True)
    except LlmError as failure:
        print(str(failure), file=sys.stderr)
        return 1

    return 0
136
+
137
+
138
def _concatenate(args: argparse.Namespace) -> int:
    """Build the combined PDF at ``args.output`` from the discovered files.

    Each PDF becomes a ``DocumentInfo``. Without ``--include-summaries`` the
    title is the file stem and there is no summary; with it, both come from
    the sidecar returned by ``resolve_sidecar``.

    Returns:
        0 on success; 1 when no PDF matched, the LLM config could not be
        loaded, a sidecar could not be resolved, or the build raised
        ``PdfBuildError``. Error text goes to stderr.
    """
    pdfs = _discover(args)
    if pdfs is None:
        return 1

    destination = Path(args.output)
    with_summaries = args.include_summaries

    llm_config = None
    if with_summaries:
        try:
            llm_config = load_llm_config(Path(args.config))
        except ConfigError as problem:
            print(str(problem), file=sys.stderr)
            return 1

    # A progress bar is only shown when summaries are being resolved.
    entries = _summary_progress(pdfs) if (with_summaries and pdfs) else pdfs

    documents: list[DocumentInfo] = []
    for pdf in entries:
        summary, title = None, pdf.path.stem
        if with_summaries:
            assert llm_config is not None
            try:
                sidecar = resolve_sidecar(pdf.path, llm_config, force=False)
            except LlmError as failure:
                print(str(failure), file=sys.stderr)
                return 1
            summary, title = sidecar.summary, sidecar.title

        info = DocumentInfo(
            path=pdf.path,
            relative_path=pdf.relative_path,
            title=title,
            summary=summary,
        )
        documents.append(info)

    try:
        build_concatenated_pdf(
            documents,
            destination,
            include_summaries=with_summaries,
        )
    except PdfBuildError as failure:
        print(str(failure), file=sys.stderr)
        # Remove whatever is at the output path so a failed run leaves no
        # output file behind.
        if destination.exists():
            destination.unlink()
        return 1

    return 0
189
+
190
+
191
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())
@@ -0,0 +1,80 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+
6
# Location of the LLM settings file when no explicit path is supplied.
DEFAULT_CONFIG_PATH = Path.home().joinpath(".config", "pdf-concatenator")

# Prompt used when the config file does not provide
# LLM_PROMPT_TITLE_AND_SUMMARY; it asks for a JSON object with "title" and
# "summary" keys.
DEFAULT_PROMPT = " ".join(
    (
        "You are summarising a PDF document.",
        "Given the filename, metadata, and PDF below, produce a concise title "
        "and a summary under 100 words (ideally one sentence, but use more "
        "only if needed).",
        'Respond with JSON only: {"title": "...", "summary": "..."}',
    )
)

# Keys that must be present and non-empty in the config file.
REQUIRED_KEYS = (
    "LLM_API",
    "LLM_SERVER",
    "LLM_API_KEY",
    "LLM_MODEL",
)
16
+
17
+
18
class ConfigError(Exception):
    """Raised when the LLM config file is missing, incomplete, or cannot be updated."""
20
+
21
+
22
@dataclass(frozen=True)
class LlmConfig:
    """Immutable LLM settings, one field per config-file key."""

    api: str  # value of LLM_API
    server: str  # value of LLM_SERVER
    api_key: str  # value of LLM_API_KEY
    model: str  # value of LLM_MODEL
    prompt: str  # value of LLM_PROMPT_TITLE_AND_SUMMARY; may be empty
29
+
30
+
31
def _parse_config_text(text: str) -> dict[str, str]:
    """Parse ``KEY=VALUE`` lines from *text* into a dict.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    ignored. Only the first ``=`` separates key from value; both sides are
    stripped of surrounding whitespace. A repeated key keeps its last value.
    """
    parsed: dict[str, str] = {}
    for raw in text.splitlines():
        entry = raw.strip()
        if not entry or entry.startswith("#") or "=" not in entry:
            continue
        name, _, setting = entry.partition("=")
        parsed[name.strip()] = setting.strip()
    return parsed
42
+
43
+
44
def load_config(path: Path = DEFAULT_CONFIG_PATH) -> LlmConfig:
    """Read and validate the LLM config file at *path*.

    Args:
        path: Location of the ``KEY=VALUE`` config file.

    Returns:
        An ``LlmConfig`` built from the file. ``prompt`` is ``""`` when
        ``LLM_PROMPT_TITLE_AND_SUMMARY`` is absent.

    Raises:
        ConfigError: If the file does not exist, cannot be read or decoded,
            or any key in ``REQUIRED_KEYS`` is missing or empty.
    """
    try:
        # Decode explicitly instead of relying on the locale encoding;
        # "utf-8-sig" also drops a leading BOM, which would otherwise become
        # part of the first key name.
        text = path.read_text(encoding="utf-8-sig")
    except FileNotFoundError:
        raise ConfigError(f"Config file not found: {path}") from None
    except (OSError, UnicodeDecodeError) as exc:
        # e.g. the path is a directory, is unreadable, or is not valid UTF-8.
        # Report it as a ConfigError so callers handling config problems see
        # a message rather than a traceback.
        raise ConfigError(f"Failed to read config file {path}: {exc}") from exc

    values = _parse_config_text(text)
    for key in REQUIRED_KEYS:
        if not values.get(key):
            raise ConfigError(f"Missing required config key: {key}")

    return LlmConfig(
        api=values["LLM_API"],
        server=values["LLM_SERVER"],
        api_key=values["LLM_API_KEY"],
        model=values["LLM_MODEL"],
        prompt=values.get("LLM_PROMPT_TITLE_AND_SUMMARY", ""),
    )
60
+
61
+
62
def ensure_prompt(config: LlmConfig, path: Path = DEFAULT_CONFIG_PATH) -> LlmConfig:
    """Return *config* with a prompt, persisting the default one if needed.

    When ``config.prompt`` is already set, *config* is returned unchanged and
    the file is not touched. Otherwise a
    ``LLM_PROMPT_TITLE_AND_SUMMARY=<DEFAULT_PROMPT>`` line is appended to the
    file at *path* and a new ``LlmConfig`` carrying ``DEFAULT_PROMPT`` is
    returned.

    Args:
        config: The loaded configuration.
        path: Config file to append the default prompt to.

    Raises:
        ConfigError: If the config file cannot be read or written.
    """
    if config.prompt:
        return config

    prompt_line = f"LLM_PROMPT_TITLE_AND_SUMMARY={DEFAULT_PROMPT}\n"
    try:
        # Inspect the raw bytes only to learn whether the file already ends
        # with a line break; the existing content is never decoded or
        # rewritten.
        existing = path.read_bytes()
        ends_with_break = existing.endswith((b"\n", b"\r"))
        separator = "\n" if existing and not ends_with_break else ""
        # Append rather than truncate-and-rewrite, so a failed write cannot
        # destroy the settings already stored in the file.
        with path.open("a", encoding="utf-8") as handle:
            handle.write(separator + prompt_line)
    except OSError as exc:
        raise ConfigError(f"Failed to write default prompt to config: {exc}") from exc

    return LlmConfig(
        api=config.api,
        server=config.server,
        api_key=config.api_key,
        model=config.model,
        prompt=DEFAULT_PROMPT,
    )