book-condenser 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- book_condenser-0.1.0/CHANGELOG.md +10 -0
- book_condenser-0.1.0/CONTRIBUTING.md +29 -0
- book_condenser-0.1.0/LICENSE +74 -0
- book_condenser-0.1.0/MANIFEST.in +9 -0
- book_condenser-0.1.0/NOTICE +6 -0
- book_condenser-0.1.0/PKG-INFO +208 -0
- book_condenser-0.1.0/README.md +172 -0
- book_condenser-0.1.0/SECURITY.md +16 -0
- book_condenser-0.1.0/chapter_map_example.json +6 -0
- book_condenser-0.1.0/examples/chapter_map.json +7 -0
- book_condenser-0.1.0/pyproject.toml +68 -0
- book_condenser-0.1.0/requirements.txt +5 -0
- book_condenser-0.1.0/setup.cfg +4 -0
- book_condenser-0.1.0/src/book_condenser/__init__.py +9 -0
- book_condenser-0.1.0/src/book_condenser/__main__.py +7 -0
- book_condenser-0.1.0/src/book_condenser/cli.py +6 -0
- book_condenser-0.1.0/src/book_condenser/core.py +2547 -0
- book_condenser-0.1.0/src/book_condenser/exporters.py +11 -0
- book_condenser-0.1.0/src/book_condenser/llm.py +6 -0
- book_condenser-0.1.0/src/book_condenser/loaders.py +13 -0
- book_condenser-0.1.0/src/book_condenser/models.py +40 -0
- book_condenser-0.1.0/src/book_condenser/pipeline.py +12 -0
- book_condenser-0.1.0/src/book_condenser/selection.py +20 -0
- book_condenser-0.1.0/src/book_condenser/text.py +13 -0
- book_condenser-0.1.0/src/book_condenser.egg-info/PKG-INFO +208 -0
- book_condenser-0.1.0/src/book_condenser.egg-info/SOURCES.txt +29 -0
- book_condenser-0.1.0/src/book_condenser.egg-info/dependency_links.txt +1 -0
- book_condenser-0.1.0/src/book_condenser.egg-info/entry_points.txt +2 -0
- book_condenser-0.1.0/src/book_condenser.egg-info/requires.txt +11 -0
- book_condenser-0.1.0/src/book_condenser.egg-info/top_level.txt +1 -0
- book_condenser-0.1.0/tests/test_core.py +148 -0
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## 0.1.0
|
|
4
|
+
|
|
5
|
+
- Initial public-release preparation.
|
|
6
|
+
- Added installable Python package metadata and `book-condenser` CLI entry point.
|
|
7
|
+
- Added PolyForm Noncommercial 1.0.0 license, security policy, contribution guide, and release documentation.
|
|
8
|
+
- Added focused unit tests, ruff linting, and GitHub Actions CI.
|
|
9
|
+
- Removed generated/private book artifacts from the release tree.
|
|
10
|
+
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# Contributing
|
|
2
|
+
|
|
3
|
+
Thanks for helping improve Book Condenser.
|
|
4
|
+
|
|
5
|
+
## Local Setup
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
python -m venv .venv
|
|
9
|
+
source .venv/bin/activate
|
|
10
|
+
pip install -e ".[dev]"
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
Run the local checks before opening a pull request:
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
ruff check .
|
|
17
|
+
pytest
|
|
18
|
+
python -m build
|
|
19
|
+
twine check dist/*
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## Test Data
|
|
23
|
+
|
|
24
|
+
Do not commit copyrighted books, generated abridgements, or full parsed source text. Use synthetic fixtures or public-domain material that is clearly safe to redistribute.
|
|
25
|
+
|
|
26
|
+
## Style
|
|
27
|
+
|
|
28
|
+
Keep behavior-preserving refactors separate from functional changes when possible. The CLI is the public interface, so changes to flags, defaults, output names, or file formats should include tests and README updates.
|
|
29
|
+
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
PolyForm Noncommercial License 1.0.0
|
|
2
|
+
|
|
3
|
+
<https://polyformproject.org/licenses/noncommercial/1.0.0>
|
|
4
|
+
|
|
5
|
+
## Acceptance
|
|
6
|
+
|
|
7
|
+
In order to get any license under these terms, you must agree to them as both strict obligations and conditions to all your licenses.
|
|
8
|
+
|
|
9
|
+
## Copyright License
|
|
10
|
+
|
|
11
|
+
The licensor grants you a copyright license for the software to do everything you might do with the software that would otherwise infringe the licensor's copyright in it for any permitted purpose. However, you may only distribute the software according to Distribution License and make changes or new works based on the software according to Changes and New Works License.
|
|
12
|
+
|
|
13
|
+
## Distribution License
|
|
14
|
+
|
|
15
|
+
The licensor grants you an additional copyright license to distribute copies of the software. Your license to distribute covers distributing the software with changes and new works permitted by Changes and New Works License.
|
|
16
|
+
|
|
17
|
+
## Notices
|
|
18
|
+
|
|
19
|
+
You must ensure that anyone who gets a copy of any part of the software from you also gets a copy of these terms or the URL for them above, as well as copies of any plain-text lines beginning with `Required Notice:` that the licensor provided with the software. For example:
|
|
20
|
+
|
|
21
|
+
Required Notice: Copyright Yoyodyne, Inc. (http://example.com)
|
|
22
|
+
|
|
23
|
+
## Changes and New Works License
|
|
24
|
+
|
|
25
|
+
The licensor grants you an additional copyright license to make changes and new works based on the software for any permitted purpose.
|
|
26
|
+
|
|
27
|
+
## Patent License
|
|
28
|
+
|
|
29
|
+
The licensor grants you a patent license for the software that covers patent claims the licensor can license, or becomes able to license, that you would infringe by using the software.
|
|
30
|
+
|
|
31
|
+
## Noncommercial Purposes
|
|
32
|
+
|
|
33
|
+
Any noncommercial purpose is a permitted purpose.
|
|
34
|
+
|
|
35
|
+
## Personal Uses
|
|
36
|
+
|
|
37
|
+
Personal use for research, experiment, and testing for the benefit of public knowledge, personal study, private entertainment, hobby projects, amateur pursuits, or religious observance, without any anticipated commercial application, is use for a permitted purpose.
|
|
38
|
+
|
|
39
|
+
## Noncommercial Organizations
|
|
40
|
+
|
|
41
|
+
Use by any charitable organization, educational institution, public research organization, public safety or health organization, environmental protection organization, or government institution is use for a permitted purpose regardless of the source of funding or obligations resulting from the funding.
|
|
42
|
+
|
|
43
|
+
## Fair Use
|
|
44
|
+
|
|
45
|
+
You may have "fair use" rights for the software under the law. These terms do not limit them.
|
|
46
|
+
|
|
47
|
+
## No Other Rights
|
|
48
|
+
|
|
49
|
+
These terms do not allow you to sublicense or transfer any of your licenses to anyone else, or prevent the licensor from granting licenses to anyone else. These terms do not imply any other licenses.
|
|
50
|
+
|
|
51
|
+
## Patent Defense
|
|
52
|
+
|
|
53
|
+
If you make any written claim that the software infringes or contributes to infringement of any patent, your patent license for the software granted under these terms ends immediately. If your company makes such a claim, your patent license ends immediately for work on behalf of your company.
|
|
54
|
+
|
|
55
|
+
## Violations
|
|
56
|
+
|
|
57
|
+
The first time you are notified in writing that you have violated any of these terms, or done anything with the software not covered by your licenses, your licenses can nonetheless continue if you come into full compliance with these terms, and take practical steps to correct past violations, within 32 days of receiving notice. Otherwise, all your licenses end immediately.
|
|
58
|
+
|
|
59
|
+
## No Liability
|
|
60
|
+
|
|
61
|
+
As far as the law allows, the software comes as is, without any warranty or condition, and the licensor will not be liable to you for any damages arising out of these terms or the use or nature of the software, under any kind of legal claim.
|
|
62
|
+
|
|
63
|
+
## Definitions
|
|
64
|
+
|
|
65
|
+
The licensor is the individual or entity offering these terms, and the software is the software the licensor makes available under these terms.
|
|
66
|
+
|
|
67
|
+
You refers to the individual or entity agreeing to these terms.
|
|
68
|
+
|
|
69
|
+
Your company is any legal entity, sole proprietorship, or other kind of organization that you work for, plus all organizations that have control over, are under the control of, or are under common control with that organization. Control means ownership of substantially all the assets of an entity, or the power to direct its management and policies by vote, contract, or otherwise. Control can be direct or indirect.
|
|
70
|
+
|
|
71
|
+
Your licenses are all the licenses granted to you for the software under these terms.
|
|
72
|
+
|
|
73
|
+
Use means anything you do with the software requiring one of your licenses.
|
|
74
|
+
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
Book Condenser is licensed under the PolyForm Noncommercial License 1.0.0.
|
|
2
|
+
|
|
3
|
+
Book Condenser processes user-provided source documents and may produce outputs containing substantial verbatim text from those documents. You are responsible for ensuring that you have the legal right to process each source document and to store, distribute, or otherwise use generated outputs.
|
|
4
|
+
|
|
5
|
+
This project does not provide legal advice. If you are unsure whether a use is allowed, consult a qualified professional before processing or sharing copyrighted material.
|
|
6
|
+
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: book-condenser
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Read the essential book: create extractive abridgements that preserve the author's original passages.
|
|
5
|
+
Author: Khalid
|
|
6
|
+
License-Expression: LicenseRef-PolyForm-Noncommercial-1.0.0
|
|
7
|
+
Project-URL: Homepage, https://github.com/khalid/book-condenser
|
|
8
|
+
Project-URL: Repository, https://github.com/khalid/book-condenser
|
|
9
|
+
Project-URL: Issues, https://github.com/khalid/book-condenser/issues
|
|
10
|
+
Project-URL: License, https://polyformproject.org/licenses/noncommercial/1.0.0
|
|
11
|
+
Keywords: books,epub,pdf,abridgement,extractive,openai,cli
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Environment :: Console
|
|
14
|
+
Classifier: Intended Audience :: End Users/Desktop
|
|
15
|
+
Classifier: Natural Language :: English
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Topic :: Text Processing
|
|
22
|
+
Requires-Python: >=3.10
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Requires-Dist: openai>=2.21.0
|
|
26
|
+
Requires-Dist: pydantic>=2.7.0
|
|
27
|
+
Requires-Dist: pymupdf>=1.24.0
|
|
28
|
+
Requires-Dist: python-docx>=1.1.0
|
|
29
|
+
Requires-Dist: reportlab>=4.2.0
|
|
30
|
+
Provides-Extra: dev
|
|
31
|
+
Requires-Dist: build>=1.2.0; extra == "dev"
|
|
32
|
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
|
33
|
+
Requires-Dist: ruff>=0.8.0; extra == "dev"
|
|
34
|
+
Requires-Dist: twine>=5.0.0; extra == "dev"
|
|
35
|
+
Dynamic: license-file
|
|
36
|
+
|
|
37
|
+
# Book Condenser
|
|
38
|
+
|
|
39
|
+
## Read the Essential Book
|
|
40
|
+
|
|
41
|
+
Book Condenser creates an extractive abridgement of a nonfiction book. An AI model identifies the original passages that carry the book's central argument, evidence, concepts, turning points, and conclusions. The software then assembles those passages verbatim into a shorter, beautifully formatted reading edition.
|
|
42
|
+
|
|
43
|
+
This approach preserves what makes a serious book valuable: the author's reasoning, voice, and choice of evidence. Many nonfiction books develop their core ideas through repetition, extended examples, and supporting detail. By retaining the passages that do the essential intellectual work, Book Condenser makes the book more efficient to read while keeping the reader in direct contact with the original text.
|
|
44
|
+
|
|
45
|
+
The result is a condensed, tablet-friendly PDF designed for focused reading: shorter than the source, richer than a summary, and faithful to the author.
|
|
46
|
+
|
|
47
|
+
This tool is intended for books you have the right to process, public-domain works, or other material you are legally allowed to transform and store. Generated outputs may contain substantial verbatim source text.
|
|
48
|
+
|
|
49
|
+
## Features
|
|
50
|
+
|
|
51
|
+
- Supports EPUB, PDF, DOCX, TXT, and Markdown input.
|
|
52
|
+
- Validates parsing with `--parse-only` before making API calls.
|
|
53
|
+
- Preserves chronology and argument structure through subtype-aware selection rules.
|
|
54
|
+
- Protects broad coverage with `--coverage-mode all` and per-section concentration limits.
|
|
55
|
+
- Produces `reading_abridgement.pdf` as the primary reader-facing output.
|
|
56
|
+
- Writes audit artifacts so users can inspect selected passages, scores, coverage, and quality-control decisions.
|
|
57
|
+
|
|
58
|
+
## Installation
|
|
59
|
+
|
|
60
|
+
From PyPI after release:
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
pip install book-condenser
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
For local development from a checkout:
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
python -m venv .venv
|
|
70
|
+
source .venv/bin/activate
|
|
71
|
+
pip install -e ".[dev]"
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Set your OpenAI API key in the environment before running the full pipeline:
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
export OPENAI_API_KEY="your-api-key-here"
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
You can also set `OPENAI_MODEL`; otherwise the CLI defaults to `gpt-5-mini`.
|
|
81
|
+
|
|
82
|
+
## Quick Start
|
|
83
|
+
|
|
84
|
+
Validate parsing before any API calls:
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
book-condenser path/to/public-domain-book.epub \
|
|
88
|
+
--output-dir out/example \
|
|
89
|
+
--parse-only
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
Review `out/example/parsed_structure_report.md`. Continue only if chapter and back-matter detection look plausible.
|
|
93
|
+
|
|
94
|
+
Generate a reading edition:
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
book-condenser path/to/public-domain-book.epub \
|
|
98
|
+
--output-dir out/example \
|
|
99
|
+
--target-ratio 0.25 \
|
|
100
|
+
--coverage-mode all \
|
|
101
|
+
--chapter-max-share 0.08 \
|
|
102
|
+
--apply-qc
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
For PDFs with unreliable bookmarks, provide a manual chapter map:
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
book-condenser path/to/public-domain-book.pdf \
|
|
109
|
+
--chapter-map examples/chapter_map.json \
|
|
110
|
+
--output-dir out/example \
|
|
111
|
+
--parse-only
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
In a repository checkout, the root `book_condenser.py` file is a compatibility launcher; it is not shipped in the published package. Prefer the installed `book-condenser` command for normal use.
|
|
115
|
+
|
|
116
|
+
## Key Controls
|
|
117
|
+
|
|
118
|
+
| Argument | Purpose | Default |
|
|
119
|
+
|---|---|---:|
|
|
120
|
+
| `--target-ratio` | Target proportion of source words retained | `0.25` |
|
|
121
|
+
| `--candidate-ratio` | Candidate pool before global pruning | `0.42` |
|
|
122
|
+
| `--coverage-mode` | Section coverage rule: `all`, `major`, or `none` | `all` |
|
|
123
|
+
| `--chapter-max-share` | Maximum nominal share of final text from one chapter | `0.08` |
|
|
124
|
+
| `--chapter-map` | Manual PDF section/page map when bookmarks are unreliable | none |
|
|
125
|
+
| `--parse-only` | Validate structure and cleanup without API calls | off |
|
|
126
|
+
| `--apply-qc` | Apply final model review within constraints | off |
|
|
127
|
+
| `--pdf-page-size` | `small-tablet`, `a5`, or `large-tablet` | `small-tablet` |
|
|
128
|
+
| `--pdf-font-size` | Body type size between 11 and 20 pt | `14.0` |
|
|
129
|
+
| `--pdf-font` | `auto`, `georgia`, `dejavu serif`, or `times` | `auto` |
|
|
130
|
+
| `--no-docx` | Skip optional DOCX output | off |
|
|
131
|
+
|
|
132
|
+
## Outputs
|
|
133
|
+
|
|
134
|
+
```text
|
|
135
|
+
out/example/
|
|
136
|
+
parsed_structure_report.md
|
|
137
|
+
book_metadata.json
|
|
138
|
+
book_paragraphs.jsonl
|
|
139
|
+
structural_overview.json
|
|
140
|
+
chapter_candidates/
|
|
141
|
+
scored_candidates.json
|
|
142
|
+
global_selection.json
|
|
143
|
+
quality_control.json
|
|
144
|
+
selection_audit.md
|
|
145
|
+
reading_abridgement.md
|
|
146
|
+
reading_abridgement.pdf
|
|
147
|
+
reading_abridgement.docx
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
`reading_abridgement.pdf` is the primary reading edition. `selection_audit.md` records subtype classification, chapter balance, selected passage functions, scores, protected anchors, and locations.
|
|
151
|
+
|
|
152
|
+
Treat the entire output directory as private by default. It can contain verbatim source text, local paths, and model-generated analysis.
|
|
153
|
+
|
|
154
|
+
## Manual Chapter Map Format
|
|
155
|
+
|
|
156
|
+
Pages are 1-indexed. `end_page` is optional; when omitted, the section ends on the page before the next section's `start_page`.
|
|
157
|
+
|
|
158
|
+
```json
|
|
159
|
+
[
|
|
160
|
+
{"title": "Prologue", "start_page": 1, "end_page": 8},
|
|
161
|
+
{"title": "Chapter One", "start_page": 9},
|
|
162
|
+
{"title": "Chapter Two", "start_page": 28},
|
|
163
|
+
{"title": "Bibliography", "start_page": 410}
|
|
164
|
+
]
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
Back-matter headings are retained in the parse audit but excluded from selection and source-word budgeting.
|
|
168
|
+
|
|
169
|
+
## Source Format Guidance
|
|
170
|
+
|
|
171
|
+
Prefer EPUB when available. PDFs may require a manual chapter map and inspection of the parse-only report. If a PDF is scanned or image-only, run OCR first.
|
|
172
|
+
|
|
173
|
+
The parser supports EPUB 2 `toc.ncx`, EPUB 3 navigation documents, semantic back-matter signals, anchored subsections, PDF bookmarks, visible-heading fallback, and common PDF text cleanup.
|
|
174
|
+
|
|
175
|
+
## Cost and Privacy
|
|
176
|
+
|
|
177
|
+
Full runs send selected source excerpts and structural context to the configured OpenAI model. Use `--parse-only` to inspect local parsing before any API calls. Larger books, higher `--candidate-ratio`, and `--apply-qc` increase token usage and cost.
|
|
178
|
+
|
|
179
|
+
Do not process confidential, copyrighted, or sensitive books unless your API/provider settings and legal rights allow that use.
|
|
180
|
+
|
|
181
|
+
## Development
|
|
182
|
+
|
|
183
|
+
Run checks locally:
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
ruff check .
|
|
187
|
+
pytest
|
|
188
|
+
python -m build
|
|
189
|
+
twine check dist/*
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
The package exposes `book-condenser` as a console script and `python -m book_condenser` as a module entry point.
|
|
193
|
+
|
|
194
|
+
## Release Checklist
|
|
195
|
+
|
|
196
|
+
1. Confirm the repository root is this project directory, not a parent home directory.
|
|
197
|
+
2. Verify no `.env`, `books/`, `out/`, generated abridgements, or copyrighted fixtures are tracked.
|
|
198
|
+
3. Run `ruff check .`, `pytest`, `python -m build`, and `twine check dist/*`.
|
|
199
|
+
4. Configure PyPI trusted publishing for `khalid/book-condenser` (the owner/repository must match the GitHub repository in the project URLs) using the `Publish to PyPI` workflow.
|
|
200
|
+
5. Publish a GitHub release or run the publish workflow manually after package install and CLI smoke tests pass.
|
|
201
|
+
|
|
202
|
+
## License
|
|
203
|
+
|
|
204
|
+
Book Condenser is licensed under the [PolyForm Noncommercial License 1.0.0](LICENSE). Commercial use is not permitted by this license without a separate commercial license from the licensor.
|
|
205
|
+
|
|
206
|
+
## Disclaimer
|
|
207
|
+
|
|
208
|
+
Book Condenser is provided as-is and does not provide legal advice. You are responsible for ensuring that your source material and generated outputs comply with copyright law, contract terms, platform policies, and any other obligations that apply to your use.
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
# Book Condenser
|
|
2
|
+
|
|
3
|
+
## Read the Essential Book
|
|
4
|
+
|
|
5
|
+
Book Condenser creates an extractive abridgement of a nonfiction book. An AI model identifies the original passages that carry the book's central argument, evidence, concepts, turning points, and conclusions. The software then assembles those passages verbatim into a shorter, beautifully formatted reading edition.
|
|
6
|
+
|
|
7
|
+
This approach preserves what makes a serious book valuable: the author's reasoning, voice, and choice of evidence. Many nonfiction books develop their core ideas through repetition, extended examples, and supporting detail. By retaining the passages that do the essential intellectual work, Book Condenser makes the book more efficient to read while keeping the reader in direct contact with the original text.
|
|
8
|
+
|
|
9
|
+
The result is a condensed, tablet-friendly PDF designed for focused reading: shorter than the source, richer than a summary, and faithful to the author.
|
|
10
|
+
|
|
11
|
+
This tool is intended for books you have the right to process, public-domain works, or other material you are legally allowed to transform and store. Generated outputs may contain substantial verbatim source text.
|
|
12
|
+
|
|
13
|
+
## Features
|
|
14
|
+
|
|
15
|
+
- Supports EPUB, PDF, DOCX, TXT, and Markdown input.
|
|
16
|
+
- Validates parsing with `--parse-only` before making API calls.
|
|
17
|
+
- Preserves chronology and argument structure through subtype-aware selection rules.
|
|
18
|
+
- Protects broad coverage with `--coverage-mode all` and per-section concentration limits.
|
|
19
|
+
- Produces `reading_abridgement.pdf` as the primary reader-facing output.
|
|
20
|
+
- Writes audit artifacts so users can inspect selected passages, scores, coverage, and quality-control decisions.
|
|
21
|
+
|
|
22
|
+
## Installation
|
|
23
|
+
|
|
24
|
+
From PyPI after release:
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
pip install book-condenser
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
For local development from a checkout:
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
python -m venv .venv
|
|
34
|
+
source .venv/bin/activate
|
|
35
|
+
pip install -e ".[dev]"
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Set your OpenAI API key in the environment before running the full pipeline:
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
export OPENAI_API_KEY="your-api-key-here"
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
You can also set `OPENAI_MODEL`; otherwise the CLI defaults to `gpt-5-mini`.
|
|
45
|
+
|
|
46
|
+
## Quick Start
|
|
47
|
+
|
|
48
|
+
Validate parsing before any API calls:
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
book-condenser path/to/public-domain-book.epub \
|
|
52
|
+
--output-dir out/example \
|
|
53
|
+
--parse-only
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
Review `out/example/parsed_structure_report.md`. Continue only if chapter and back-matter detection look plausible.
|
|
57
|
+
|
|
58
|
+
Generate a reading edition:
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
book-condenser path/to/public-domain-book.epub \
|
|
62
|
+
--output-dir out/example \
|
|
63
|
+
--target-ratio 0.25 \
|
|
64
|
+
--coverage-mode all \
|
|
65
|
+
--chapter-max-share 0.08 \
|
|
66
|
+
--apply-qc
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
For PDFs with unreliable bookmarks, provide a manual chapter map:
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
book-condenser path/to/public-domain-book.pdf \
|
|
73
|
+
--chapter-map examples/chapter_map.json \
|
|
74
|
+
--output-dir out/example \
|
|
75
|
+
--parse-only
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
In a repository checkout, the root `book_condenser.py` file is a compatibility launcher; it is not shipped in the published package. Prefer the installed `book-condenser` command for normal use.
|
|
79
|
+
|
|
80
|
+
## Key Controls
|
|
81
|
+
|
|
82
|
+
| Argument | Purpose | Default |
|
|
83
|
+
|---|---|---:|
|
|
84
|
+
| `--target-ratio` | Target proportion of source words retained | `0.25` |
|
|
85
|
+
| `--candidate-ratio` | Candidate pool before global pruning | `0.42` |
|
|
86
|
+
| `--coverage-mode` | Section coverage rule: `all`, `major`, or `none` | `all` |
|
|
87
|
+
| `--chapter-max-share` | Maximum nominal share of final text from one chapter | `0.08` |
|
|
88
|
+
| `--chapter-map` | Manual PDF section/page map when bookmarks are unreliable | none |
|
|
89
|
+
| `--parse-only` | Validate structure and cleanup without API calls | off |
|
|
90
|
+
| `--apply-qc` | Apply final model review within constraints | off |
|
|
91
|
+
| `--pdf-page-size` | `small-tablet`, `a5`, or `large-tablet` | `small-tablet` |
|
|
92
|
+
| `--pdf-font-size` | Body type size between 11 and 20 pt | `14.0` |
|
|
93
|
+
| `--pdf-font` | `auto`, `georgia`, `dejavu serif`, or `times` | `auto` |
|
|
94
|
+
| `--no-docx` | Skip optional DOCX output | off |
|
|
95
|
+
|
|
96
|
+
## Outputs
|
|
97
|
+
|
|
98
|
+
```text
|
|
99
|
+
out/example/
|
|
100
|
+
parsed_structure_report.md
|
|
101
|
+
book_metadata.json
|
|
102
|
+
book_paragraphs.jsonl
|
|
103
|
+
structural_overview.json
|
|
104
|
+
chapter_candidates/
|
|
105
|
+
scored_candidates.json
|
|
106
|
+
global_selection.json
|
|
107
|
+
quality_control.json
|
|
108
|
+
selection_audit.md
|
|
109
|
+
reading_abridgement.md
|
|
110
|
+
reading_abridgement.pdf
|
|
111
|
+
reading_abridgement.docx
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
`reading_abridgement.pdf` is the primary reading edition. `selection_audit.md` records subtype classification, chapter balance, selected passage functions, scores, protected anchors, and locations.
|
|
115
|
+
|
|
116
|
+
Treat the entire output directory as private by default. It can contain verbatim source text, local paths, and model-generated analysis.
|
|
117
|
+
|
|
118
|
+
## Manual Chapter Map Format
|
|
119
|
+
|
|
120
|
+
Pages are 1-indexed. `end_page` is optional; when omitted, the section ends on the page before the next section's `start_page`.
|
|
121
|
+
|
|
122
|
+
```json
|
|
123
|
+
[
|
|
124
|
+
{"title": "Prologue", "start_page": 1, "end_page": 8},
|
|
125
|
+
{"title": "Chapter One", "start_page": 9},
|
|
126
|
+
{"title": "Chapter Two", "start_page": 28},
|
|
127
|
+
{"title": "Bibliography", "start_page": 410}
|
|
128
|
+
]
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
Back-matter headings are retained in the parse audit but excluded from selection and source-word budgeting.
|
|
132
|
+
|
|
133
|
+
## Source Format Guidance
|
|
134
|
+
|
|
135
|
+
Prefer EPUB when available. PDFs may require a manual chapter map and inspection of the parse-only report. If a PDF is scanned or image-only, run OCR first.
|
|
136
|
+
|
|
137
|
+
The parser supports EPUB 2 `toc.ncx`, EPUB 3 navigation documents, semantic back-matter signals, anchored subsections, PDF bookmarks, visible-heading fallback, and common PDF text cleanup.
|
|
138
|
+
|
|
139
|
+
## Cost and Privacy
|
|
140
|
+
|
|
141
|
+
Full runs send selected source excerpts and structural context to the configured OpenAI model. Use `--parse-only` to inspect local parsing before any API calls. Larger books, higher `--candidate-ratio`, and `--apply-qc` increase token usage and cost.
|
|
142
|
+
|
|
143
|
+
Do not process confidential, copyrighted, or sensitive books unless your API/provider settings and legal rights allow that use.
|
|
144
|
+
|
|
145
|
+
## Development
|
|
146
|
+
|
|
147
|
+
Run checks locally:
|
|
148
|
+
|
|
149
|
+
```bash
|
|
150
|
+
ruff check .
|
|
151
|
+
pytest
|
|
152
|
+
python -m build
|
|
153
|
+
twine check dist/*
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
The package exposes `book-condenser` as a console script and `python -m book_condenser` as a module entry point.
|
|
157
|
+
|
|
158
|
+
## Release Checklist
|
|
159
|
+
|
|
160
|
+
1. Confirm the repository root is this project directory, not a parent home directory.
|
|
161
|
+
2. Verify no `.env`, `books/`, `out/`, generated abridgements, or copyrighted fixtures are tracked.
|
|
162
|
+
3. Run `ruff check .`, `pytest`, `python -m build`, and `twine check dist/*`.
|
|
163
|
+
4. Configure PyPI trusted publishing for `khalid/book-condenser` (the owner/repository must match the GitHub repository in the project URLs) using the `Publish to PyPI` workflow.
|
|
164
|
+
5. Publish a GitHub release or run the publish workflow manually after package install and CLI smoke tests pass.
|
|
165
|
+
|
|
166
|
+
## License
|
|
167
|
+
|
|
168
|
+
Book Condenser is licensed under the [PolyForm Noncommercial License 1.0.0](LICENSE). Commercial use is not permitted by this license without a separate commercial license from the licensor.
|
|
169
|
+
|
|
170
|
+
## Disclaimer
|
|
171
|
+
|
|
172
|
+
Book Condenser is provided as-is and does not provide legal advice. You are responsible for ensuring that your source material and generated outputs comply with copyright law, contract terms, platform policies, and any other obligations that apply to your use.
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# Security Policy
|
|
2
|
+
|
|
3
|
+
## Reporting Issues
|
|
4
|
+
|
|
5
|
+
Please report security issues privately to the project maintainer before opening a public issue. Include the affected version, a clear reproduction path, and any relevant logs with secrets removed.
|
|
6
|
+
|
|
7
|
+
## Secrets
|
|
8
|
+
|
|
9
|
+
Book Condenser reads API credentials from environment variables such as `OPENAI_API_KEY`. Do not commit `.env` files, shell history, generated logs, or output artifacts containing credentials.
|
|
10
|
+
|
|
11
|
+
If an API key is exposed, revoke it with the provider immediately, create a replacement key, and remove the exposed value from the repository and any published history.
|
|
12
|
+
|
|
13
|
+
## Generated Artifacts
|
|
14
|
+
|
|
15
|
+
Generated files may contain substantial verbatim text from source books and local filesystem paths. Treat `out/`, `books/`, and similar working directories as private unless you have verified that every file is safe to publish.
|
|
16
|
+
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=77", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "book-condenser"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Read the essential book: create extractive abridgements that preserve the author's original passages."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = "LicenseRef-PolyForm-Noncommercial-1.0.0"
|
|
12
|
+
license-files = ["LICENSE"]
|
|
13
|
+
authors = [
|
|
14
|
+
{ name = "Khalid" }
|
|
15
|
+
]
|
|
16
|
+
keywords = ["books", "epub", "pdf", "abridgement", "extractive", "openai", "cli"]
|
|
17
|
+
classifiers = [
|
|
18
|
+
"Development Status :: 3 - Alpha",
|
|
19
|
+
"Environment :: Console",
|
|
20
|
+
"Intended Audience :: End Users/Desktop",
|
|
21
|
+
"Natural Language :: English",
|
|
22
|
+
"Programming Language :: Python :: 3",
|
|
23
|
+
"Programming Language :: Python :: 3.10",
|
|
24
|
+
"Programming Language :: Python :: 3.11",
|
|
25
|
+
"Programming Language :: Python :: 3.12",
|
|
26
|
+
"Programming Language :: Python :: 3.13",
|
|
27
|
+
"Topic :: Text Processing",
|
|
28
|
+
]
|
|
29
|
+
dependencies = [
|
|
30
|
+
"openai>=2.21.0",
|
|
31
|
+
"pydantic>=2.7.0",
|
|
32
|
+
"pymupdf>=1.24.0",
|
|
33
|
+
"python-docx>=1.1.0",
|
|
34
|
+
"reportlab>=4.2.0",
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
[project.optional-dependencies]
|
|
38
|
+
dev = [
|
|
39
|
+
"build>=1.2.0",
|
|
40
|
+
"pytest>=8.0.0",
|
|
41
|
+
"ruff>=0.8.0",
|
|
42
|
+
"twine>=5.0.0",
|
|
43
|
+
]
|
|
44
|
+
|
|
45
|
+
[project.scripts]
|
|
46
|
+
book-condenser = "book_condenser.cli:main"
|
|
47
|
+
|
|
48
|
+
[project.urls]
|
|
49
|
+
Homepage = "https://github.com/khalid/book-condenser"
|
|
50
|
+
Repository = "https://github.com/khalid/book-condenser"
|
|
51
|
+
Issues = "https://github.com/khalid/book-condenser/issues"
|
|
52
|
+
License = "https://polyformproject.org/licenses/noncommercial/1.0.0"
|
|
53
|
+
|
|
54
|
+
[tool.setuptools.packages.find]
|
|
55
|
+
where = ["src"]
|
|
56
|
+
|
|
57
|
+
[tool.pytest.ini_options]
|
|
58
|
+
testpaths = ["tests"]
|
|
59
|
+
pythonpath = ["src"]
|
|
60
|
+
|
|
61
|
+
[tool.ruff]
|
|
62
|
+
line-length = 120
|
|
63
|
+
target-version = "py310"
|
|
64
|
+
|
|
65
|
+
[tool.ruff.lint]
|
|
66
|
+
select = ["E", "F", "I", "UP", "B"]
|
|
67
|
+
ignore = ["E501"]
|
|
68
|
+
|