figgydeck 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- figgydeck-1.0.0/.github/workflows/publish.yml +61 -0
- figgydeck-1.0.0/.github/workflows/tests.yml +36 -0
- figgydeck-1.0.0/.gitignore +47 -0
- figgydeck-1.0.0/CHANGELOG.md +35 -0
- figgydeck-1.0.0/CONTRIBUTING.md +76 -0
- figgydeck-1.0.0/LICENSE +21 -0
- figgydeck-1.0.0/PKG-INFO +195 -0
- figgydeck-1.0.0/README.md +163 -0
- figgydeck-1.0.0/docs/HOW_IT_WORKS.md +128 -0
- figgydeck-1.0.0/examples/batch_process.py +51 -0
- figgydeck-1.0.0/pyproject.toml +61 -0
- figgydeck-1.0.0/scripts/build_combined_slides.py +119 -0
- figgydeck-1.0.0/src/figgydeck/__init__.py +22 -0
- figgydeck-1.0.0/src/figgydeck/anki.py +331 -0
- figgydeck-1.0.0/src/figgydeck/clean.py +83 -0
- figgydeck-1.0.0/src/figgydeck/cli.py +212 -0
- figgydeck-1.0.0/src/figgydeck/extract.py +135 -0
- figgydeck-1.0.0/src/figgydeck/images.py +91 -0
- figgydeck-1.0.0/src/figgydeck/layout.py +326 -0
- figgydeck-1.0.0/src/figgydeck/models.py +45 -0
- figgydeck-1.0.0/src/figgydeck/pptx.py +333 -0
- figgydeck-1.0.0/tests/test_apkg_combined.py +82 -0
- figgydeck-1.0.0/tests/test_chapter1.py +131 -0
- figgydeck-1.0.0/tests/test_clean.py +50 -0
- figgydeck-1.0.0/tests/test_cli.py +244 -0
- figgydeck-1.0.0/tests/test_layout.py +61 -0
- figgydeck-1.0.0/tests/test_manifest.py +36 -0
- figgydeck-1.0.0/tests/test_pptx.py +116 -0
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
name: publish
|
|
2
|
+
|
|
3
|
+
# Build once, then publish to PyPI via Trusted Publishing (OIDC) — no API tokens.
|
|
4
|
+
# - Publishing a GitHub Release → builds and publishes to PyPI.
|
|
5
|
+
# - Manual "Run workflow" → builds and twine-checks only (no publish),
|
|
6
|
+
# a safe dry run of the distribution.
|
|
7
|
+
#
|
|
8
|
+
# One-time setup (required before the first release): on PyPI, add a "pending"
|
|
9
|
+
# Trusted Publisher for project `figgydeck`, owner `gregjoeval`, repo
|
|
10
|
+
# `figgydeck`, workflow `publish.yml`, environment `pypi`; and create a GitHub
|
|
11
|
+
# Environment named `pypi`.
|
|
12
|
+
|
|
13
|
+
on:
|
|
14
|
+
release:
|
|
15
|
+
types: [published]
|
|
16
|
+
workflow_dispatch: {}
|
|
17
|
+
|
|
18
|
+
permissions:
|
|
19
|
+
contents: read
|
|
20
|
+
|
|
21
|
+
jobs:
|
|
22
|
+
build:
|
|
23
|
+
runs-on: ubuntu-latest
|
|
24
|
+
steps:
|
|
25
|
+
- uses: actions/checkout@v4
|
|
26
|
+
|
|
27
|
+
- name: Set up Python
|
|
28
|
+
uses: actions/setup-python@v5
|
|
29
|
+
with:
|
|
30
|
+
python-version: "3.12"
|
|
31
|
+
|
|
32
|
+
- name: Build sdist + wheel
|
|
33
|
+
run: |
|
|
34
|
+
python -m pip install --upgrade pip build twine
|
|
35
|
+
python -m build
|
|
36
|
+
python -m twine check dist/*
|
|
37
|
+
|
|
38
|
+
- name: Upload dist artifact
|
|
39
|
+
uses: actions/upload-artifact@v4
|
|
40
|
+
with:
|
|
41
|
+
name: dist
|
|
42
|
+
path: dist/
|
|
43
|
+
|
|
44
|
+
publish-pypi:
|
|
45
|
+
# Only publish when a GitHub Release is published. Manual dispatch stops at
|
|
46
|
+
# the build job above (a distribution dry run).
|
|
47
|
+
if: github.event_name == 'release'
|
|
48
|
+
needs: build
|
|
49
|
+
runs-on: ubuntu-latest
|
|
50
|
+
environment: pypi
|
|
51
|
+
permissions:
|
|
52
|
+
id-token: write # OIDC token for Trusted Publishing
|
|
53
|
+
steps:
|
|
54
|
+
- name: Download dist artifact
|
|
55
|
+
uses: actions/download-artifact@v4
|
|
56
|
+
with:
|
|
57
|
+
name: dist
|
|
58
|
+
path: dist/
|
|
59
|
+
|
|
60
|
+
- name: Publish to PyPI
|
|
61
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
name: tests
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
strategy:
|
|
13
|
+
matrix:
|
|
14
|
+
python-version: ["3.10", "3.11", "3.12"]
|
|
15
|
+
|
|
16
|
+
steps:
|
|
17
|
+
- uses: actions/checkout@v4
|
|
18
|
+
|
|
19
|
+
- name: Install poppler
|
|
20
|
+
run: sudo apt-get update && sudo apt-get install -y poppler-utils
|
|
21
|
+
|
|
22
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
23
|
+
uses: actions/setup-python@v5
|
|
24
|
+
with:
|
|
25
|
+
python-version: ${{ matrix.python-version }}
|
|
26
|
+
|
|
27
|
+
- name: Install package
|
|
28
|
+
run: |
|
|
29
|
+
python -m pip install --upgrade pip
|
|
30
|
+
pip install -e ".[dev,pptx]"
|
|
31
|
+
|
|
32
|
+
- name: Lint
|
|
33
|
+
run: ruff check src tests
|
|
34
|
+
|
|
35
|
+
- name: Test
|
|
36
|
+
run: pytest -v --tb=short
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# Build artifacts
|
|
2
|
+
build/
|
|
3
|
+
dist/
|
|
4
|
+
*.egg-info/
|
|
5
|
+
*.egg
|
|
6
|
+
|
|
7
|
+
# Python cache
|
|
8
|
+
__pycache__/
|
|
9
|
+
*.pyc
|
|
10
|
+
*.pyo
|
|
11
|
+
*.pyd
|
|
12
|
+
.Python
|
|
13
|
+
.pytest_cache/
|
|
14
|
+
.ruff_cache/
|
|
15
|
+
.mypy_cache/
|
|
16
|
+
.coverage
|
|
17
|
+
htmlcov/
|
|
18
|
+
|
|
19
|
+
# Virtual environments
|
|
20
|
+
.venv/
|
|
21
|
+
venv/
|
|
22
|
+
env/
|
|
23
|
+
|
|
24
|
+
# Generated outputs (never commit decks/images by default — they're per-user)
|
|
25
|
+
out/
|
|
26
|
+
output/
|
|
27
|
+
extracted/
|
|
28
|
+
*.apkg
|
|
29
|
+
*.pptx
|
|
30
|
+
|
|
31
|
+
# Source PDFs — don't commit copyrighted material
|
|
32
|
+
*.pdf
|
|
33
|
+
tests/fixtures/*.pdf
|
|
34
|
+
examples/*.pdf
|
|
35
|
+
|
|
36
|
+
# Editor / OS / agent tooling
|
|
37
|
+
.vscode/
|
|
38
|
+
.idea/
|
|
39
|
+
.serena/
|
|
40
|
+
.DS_Store
|
|
41
|
+
*.swp
|
|
42
|
+
*.swo
|
|
43
|
+
|
|
44
|
+
# Notebook checkpoints
|
|
45
|
+
.ipynb_checkpoints/
|
|
46
|
+
|
|
47
|
+
/assets/
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project are documented here. The format is based on
|
|
4
|
+
[Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project
|
|
5
|
+
adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
6
|
+
|
|
7
|
+
## [1.0.0]
|
|
8
|
+
|
|
9
|
+
First stable public release on PyPI: `pip install figgydeck`.
|
|
10
|
+
|
|
11
|
+
Because this is the 1.0 line, the CLI and public Python API are now covered by
|
|
12
|
+
semver. Compared to the pre-release code, the following are **breaking**:
|
|
13
|
+
|
|
14
|
+
### Changed
|
|
15
|
+
- CLI: the format selector `--out` is renamed to `--format` / `-f`, and is now
|
|
16
|
+
**required** — there is no default output format. Running without it exits
|
|
17
|
+
with a non-zero status and a message listing the choices.
|
|
18
|
+
- CLI: the output directory now contains only the built deck(s) by default. The
|
|
19
|
+
extracted `manifest.json` and `images/` are no longer written there unless you
|
|
20
|
+
pass the new `--save-manifest` / `--save-images` flags.
|
|
21
|
+
- `manifest.json` gained a schema wrapper: `{"version": 1, "entries": [...]}`
|
|
22
|
+
instead of a bare array. `figgydeck` still reads legacy bare-array manifests.
|
|
23
|
+
- Public API: `build_combined_apkg` and `build_combined_pptx` now take a list of
|
|
24
|
+
`Chapter` objects (`figgydeck.Chapter`) instead of `(manifest, images_dir,
|
|
25
|
+
title)` tuples.
|
|
26
|
+
|
|
27
|
+
### Fixed
|
|
28
|
+
- Anki deck IDs are now derived with a stable SHA-256 digest instead of the
|
|
29
|
+
builtin `hash()` (which is salted per process), so re-running `figgydeck`
|
|
30
|
+
produces the same deck ID and re-imports no longer risk duplicate decks.
|
|
31
|
+
|
|
32
|
+
### Added
|
|
33
|
+
- `--save-manifest` / `--save-images` flags to opt into keeping the extraction
|
|
34
|
+
intermediates.
|
|
35
|
+
- `figgydeck.Chapter` is exported from the top-level package.
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# Contributing to figgydeck
|
|
2
|
+
|
|
3
|
+
## Quick development setup
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
git clone https://github.com/YOUR_USERNAME/figgydeck.git
|
|
7
|
+
cd figgydeck
|
|
8
|
+
python -m venv .venv
|
|
9
|
+
source .venv/bin/activate # or `.venv\Scripts\activate` on Windows
|
|
10
|
+
pip install -e ".[dev,pptx]"
|
|
11
|
+
|
|
12
|
+
# Run tests
|
|
13
|
+
pytest
|
|
14
|
+
|
|
15
|
+
# Lint
|
|
16
|
+
ruff check src tests
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Repo layout
|
|
20
|
+
|
|
21
|
+
```
|
|
22
|
+
figgydeck/
|
|
23
|
+
├── src/figgydeck/ ← the package
|
|
24
|
+
│ ├── extract.py high-level pipeline
|
|
25
|
+
│ ├── layout.py font/geometry-based label & caption detection
|
|
26
|
+
│ ├── images.py pdfimages wrapping + table cropping
|
|
27
|
+
│ ├── clean.py caption text post-processing
|
|
28
|
+
│ ├── pptx.py .pptx builder (PowerPoint slides)
|
|
29
|
+
│ ├── anki.py .apkg builder (Anki cards)
|
|
30
|
+
│ └── cli.py argparse entry point
|
|
31
|
+
├── tests/ unit + smoke tests
|
|
32
|
+
├── examples/ illustrative scripts
|
|
33
|
+
└── docs/HOW_IT_WORKS.md pipeline architecture
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Adding support for a new publisher's layout
|
|
37
|
+
|
|
38
|
+
The font-size constants in `src/figgydeck/layout.py` are the main tunables.
|
|
39
|
+
Test on a representative chapter:
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
import pdfplumber
|
|
43
|
+
with pdfplumber.open("chapter.pdf") as pdf:
|
|
44
|
+
page = pdf.pages[3]
|
|
45
|
+
fonts = {}
|
|
46
|
+
for c in page.chars:
|
|
47
|
+
key = (c["fontname"], round(c["size"], 1))
|
|
48
|
+
fonts.setdefault(key, []).append(c["text"])
|
|
49
|
+
for (name, size), sample in sorted(fonts.items(), key=lambda x: -len(x[1])):
|
|
50
|
+
preview = "".join(sample[:60])
|
|
51
|
+
print(f" {name} {size}pt — {len(sample)} chars — {preview!r}")
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
Look for the font sizes used by:
|
|
55
|
+
- The "FIGURE X.Y" label (typically bold, smaller than body)
|
|
56
|
+
- The caption body text (typically slightly smaller than body)
|
|
57
|
+
- The body text itself (the most common size)
|
|
58
|
+
|
|
59
|
+
If the publisher uses a layout substantially different from Elsevier's
|
|
60
|
+
(e.g. captions to the right of figures rather than below), the geometric
|
|
61
|
+
matcher in `layout.py` will need adjustment too. Open an issue with a
|
|
62
|
+
sample PDF and we'll figure it out together.
|
|
63
|
+
|
|
64
|
+
## Testing with copyrighted PDFs
|
|
65
|
+
|
|
66
|
+
The smoke test in `tests/test_chapter1.py` requires a chapter PDF that you own, placed
|
|
67
|
+
at `tests/fixtures/sample_chapter.pdf`. This file is not committed — never commit
|
|
68
|
+
copyrighted material. The test gracefully skips if the file isn't present. Place
|
|
69
|
+
your own copy there to run the smoke test locally.
|
|
70
|
+
|
|
71
|
+
## Pull requests
|
|
72
|
+
|
|
73
|
+
- Keep changes focused. One concern per PR.
|
|
74
|
+
- Add a unit test if you fix a bug.
|
|
75
|
+
- Update `docs/HOW_IT_WORKS.md` if you change the extraction pipeline.
|
|
76
|
+
- Run `ruff check src tests` and `pytest` before pushing.
|
figgydeck-1.0.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 figgydeck contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
figgydeck-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: figgydeck
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Turn textbooks into study decks: extract figures, tables, and captions from chapter PDFs and package them as PowerPoint slides or Anki decks.
|
|
5
|
+
Project-URL: Homepage, https://github.com/gregjoeval/figgydeck
|
|
6
|
+
Project-URL: Repository, https://github.com/gregjoeval/figgydeck
|
|
7
|
+
Project-URL: Issues, https://github.com/gregjoeval/figgydeck/issues
|
|
8
|
+
Author: figgydeck contributors
|
|
9
|
+
License: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: anki,captions,figures,flashcards,pdf,powerpoint,pptx,slides,study,textbook
|
|
12
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
13
|
+
Classifier: Intended Audience :: Education
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Education
|
|
20
|
+
Classifier: Topic :: Text Processing
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
Requires-Dist: genanki>=0.13
|
|
23
|
+
Requires-Dist: pdfplumber>=0.10
|
|
24
|
+
Requires-Dist: pillow>=9.0
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: pytest-cov>=4; extra == 'dev'
|
|
27
|
+
Requires-Dist: pytest>=7; extra == 'dev'
|
|
28
|
+
Requires-Dist: ruff>=0.1; extra == 'dev'
|
|
29
|
+
Provides-Extra: pptx
|
|
30
|
+
Requires-Dist: python-pptx>=0.6.21; extra == 'pptx'
|
|
31
|
+
Description-Content-Type: text/markdown
|
|
32
|
+
|
|
33
|
+
# figgydeck
|
|
34
|
+
|
|
35
|
+
[![tests](https://github.com/gregjoeval/figgydeck/actions/workflows/tests.yml/badge.svg)](https://github.com/gregjoeval/figgydeck/actions/workflows/tests.yml)
|
|
36
|
+
|
|
37
|
+
> Turn textbooks into study decks. Extract figures, tables, and their captions from chapter PDFs, then package them as PowerPoint slides or Anki flashcard decks.
|
|
38
|
+
|
|
39
|
+
`figgydeck` reads a chapter PDF and finds every figure and table along with its
|
|
40
|
+
caption. It gives you one slide or card per figure — the image on the face, and
|
|
41
|
+
the caption plus book/chapter/page metadata attached — the format you actually
|
|
42
|
+
want for visual recall or lecture review.
|
|
43
|
+
|
|
44
|
+
Two output formats, same extraction:
|
|
45
|
+
|
|
46
|
+
- **PowerPoint** (`.pptx`) — one slide per figure/table, image centered with its
|
|
47
|
+
aspect ratio preserved and the metadata in the slide's speaker notes. Opens in
|
|
48
|
+
PowerPoint, Keynote, or Google Slides.
|
|
49
|
+
- **Anki** (`.apkg`) — one card per figure/table, image on the front and the
|
|
50
|
+
caption + metadata on the back. Imports straight into Anki.
|
|
51
|
+
|
|
52
|
+
## Why
|
|
53
|
+
|
|
54
|
+
Manually building slides or flashcards from a 40-figure chapter takes hours. The two
|
|
55
|
+
hard parts are (1) matching extracted images to their figure numbers — PDF
|
|
56
|
+
stream order is often shuffled — and (2) cleaning captions of running headers
|
|
57
|
+
and footnotes that bleed in. `figgydeck` does both with pure-Python signal
|
|
58
|
+
processing (font sizes, spatial geometry, ruled-line detection); no LLM call
|
|
59
|
+
required for the standard Elsevier textbook layout.
|
|
60
|
+
|
|
61
|
+
## Install
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
pip install figgydeck
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
System requirements: `poppler` (for `pdftoppm` and `pdfimages`).
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
# macOS
|
|
71
|
+
brew install poppler
|
|
72
|
+
|
|
73
|
+
# Debian/Ubuntu
|
|
74
|
+
sudo apt install poppler-utils
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
PowerPoint output (`--format pptx`) needs an optional extra:
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
pip install "figgydeck[pptx]"
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## Usage
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
# One command, chapter PDF → figure deck (figures only by default)
|
|
87
|
+
figgydeck chapter1.pdf \
|
|
88
|
+
--book "Example Textbook (1st ed., 2020)" \
|
|
89
|
+
--chapter "Ch. 3: Cell Structure" \
|
|
90
|
+
--format apkg \
|
|
91
|
+
--output ./out/
|
|
92
|
+
|
|
93
|
+
# Output — just the deck (the extracted images are embedded inside it):
|
|
94
|
+
# out/ExampleTextbook1StEd2020_Ch3CellStructure.apkg ← import into Anki
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
Also include table cards (extra `pdftoppm` rasterization per table):
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
figgydeck chapter1.pdf --book "..." --chapter "..." --format apkg --tables
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
Choose your output format(s) with `--format`/`-f` (required; repeatable or
|
|
104
|
+
comma-separated):
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
figgydeck chapter1.pdf --book "..." --chapter "..." --format apkg # only .apkg
|
|
108
|
+
figgydeck chapter1.pdf --book "..." --chapter "..." --format pptx # only .pptx
|
|
109
|
+
figgydeck chapter1.pdf --book "..." --chapter "..." --format apkg,pptx # both
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
`--format` is required — there is no default. PowerPoint output needs the
|
|
113
|
+
optional `[pptx]` extra (see [Install](#install)).
|
|
114
|
+
|
|
115
|
+
### Keeping the extraction artifacts
|
|
116
|
+
|
|
117
|
+
By default the output directory holds only the decks; the extracted images are
|
|
118
|
+
embedded inside them. To also keep the raw intermediates, pass `--save-images`
|
|
119
|
+
(writes an `images/` folder) and/or `--save-manifest` (writes `manifest.json`,
|
|
120
|
+
the structured list of everything that was extracted):
|
|
121
|
+
|
|
122
|
+
```bash
|
|
123
|
+
figgydeck chapter1.pdf --book "..." --chapter "..." --format apkg \
|
|
124
|
+
--save-manifest --save-images
|
|
125
|
+
# out/...apkg ← the deck
|
|
126
|
+
# out/manifest.json ← inspect what was extracted
|
|
127
|
+
# out/images/ ← extracted figures
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
### Multiple chapters
|
|
131
|
+
|
|
132
|
+
Pass more than one chapter PDF in a single command. Give one `--chapter` per PDF
|
|
133
|
+
(in order), or omit `--chapter` entirely to derive each title from its filename
|
|
134
|
+
(`ch01.pdf` → "Chapter 1"):
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
# one artifact per chapter (like running figgydeck once per PDF)
|
|
138
|
+
figgydeck ch01.pdf ch02.pdf ch03.pdf --book "..." --format pptx
|
|
139
|
+
|
|
140
|
+
# --combine merges every chapter into ONE artifact per format
|
|
141
|
+
figgydeck ch01.pdf ch02.pdf ch03.pdf --book "..." --combine --format apkg,pptx
|
|
142
|
+
# out/<Book>_Combined.pptx ← one deck: title slide + all figures
|
|
143
|
+
# out/<Book>_Combined.apkg ← one Anki package, one subdeck per chapter
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
With multiple PDFs, when you keep artifacts (`--save-manifest`/`--save-images`),
|
|
147
|
+
each chapter's files go in their own `out/NN_<slug>/` subdir (so identical image
|
|
148
|
+
names don't collide). Slide images keep their native aspect ratio and are
|
|
149
|
+
centered — nothing is stretched or cropped.
|
|
150
|
+
|
|
151
|
+
## How it works
|
|
152
|
+
|
|
153
|
+
For an Elsevier-format chapter, `figgydeck`:
|
|
154
|
+
|
|
155
|
+
1. Extracts all embedded images via `pdfimages`
|
|
156
|
+
2. Walks each PDF page with `pdfplumber`, identifying:
|
|
157
|
+
- **`FIGURE X.Y` / `TABLE X.Y` labels** by font size (~9 pt label font)
|
|
158
|
+
- **caption text** by font size (8 pt) within column-aware bounding boxes
|
|
159
|
+
- **table regions** by horizontal-rule detection
|
|
160
|
+
3. Matches each label to the nearest image whose bottom edge sits just above it
|
|
161
|
+
in the same column — robust to images appearing in shuffled stream order
|
|
162
|
+
4. Crops table regions from rasterized pages
|
|
163
|
+
5. Cleans captions (strips running headers, decodes ligatures, de-hyphenates
|
|
164
|
+
line wraps)
|
|
165
|
+
6. Builds your chosen deck(s) — `.apkg` and/or `.pptx` (optionally emitting the
|
|
166
|
+
intermediate `manifest.json` / `images/` with `--save-manifest` /
|
|
167
|
+
`--save-images`)
|
|
168
|
+
|
|
169
|
+
On a well-behaved Elsevier-format chapter this reliably matches every figure to
|
|
170
|
+
its caption and crops tables with their full titles.
|
|
171
|
+
|
|
172
|
+
## What you get
|
|
173
|
+
|
|
174
|
+
Both formats carry the same content — the figure or table image plus its
|
|
175
|
+
caption and book/chapter/page metadata — laid out for the way you'll use it.
|
|
176
|
+
|
|
177
|
+
**PowerPoint slides** (`.pptx`)
|
|
178
|
+
|
|
179
|
+
- One slide per figure/table, image centered and scaled to fit with its aspect
|
|
180
|
+
ratio preserved (no cropping)
|
|
181
|
+
- Number, title, caption, book, chapter, and page in the slide's speaker notes
|
|
182
|
+
- A combined deck (`--combine`) opens with a title slide (book + chapter count)
|
|
183
|
+
|
|
184
|
+
**Anki cards** (`.apkg`)
|
|
185
|
+
|
|
186
|
+
- A clean two-side layout that's readable in both light and dark mode
|
|
187
|
+
- **Front:** image only, with a small `Fig X.Y` tag below
|
|
188
|
+
- **Back:** type label, caption, and book/chapter/page metadata
|
|
189
|
+
- Stable note GUIDs based on `(book, chapter, type, number)` — re-importing
|
|
190
|
+
updates existing cards rather than duplicating them
|
|
191
|
+
|
|
192
|
+
## AI assistance
|
|
193
|
+
|
|
194
|
+
Built with help from [Claude Code](https://claude.com/claude-code). All code is
|
|
195
|
+
human-reviewed.
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
# figgydeck
|
|
2
|
+
|
|
3
|
+
[![tests](https://github.com/gregjoeval/figgydeck/actions/workflows/tests.yml/badge.svg)](https://github.com/gregjoeval/figgydeck/actions/workflows/tests.yml)
|
|
4
|
+
|
|
5
|
+
> Turn textbooks into study decks. Extract figures, tables, and their captions from chapter PDFs, then package them as PowerPoint slides or Anki flashcard decks.
|
|
6
|
+
|
|
7
|
+
`figgydeck` reads a chapter PDF and finds every figure and table along with its
|
|
8
|
+
caption. It gives you one slide or card per figure — the image on the face, and
|
|
9
|
+
the caption plus book/chapter/page metadata attached — the format you actually
|
|
10
|
+
want for visual recall or lecture review.
|
|
11
|
+
|
|
12
|
+
Two output formats, same extraction:
|
|
13
|
+
|
|
14
|
+
- **PowerPoint** (`.pptx`) — one slide per figure/table, image centered with its
|
|
15
|
+
aspect ratio preserved and the metadata in the slide's speaker notes. Opens in
|
|
16
|
+
PowerPoint, Keynote, or Google Slides.
|
|
17
|
+
- **Anki** (`.apkg`) — one card per figure/table, image on the front and the
|
|
18
|
+
caption + metadata on the back. Imports straight into Anki.
|
|
19
|
+
|
|
20
|
+
## Why
|
|
21
|
+
|
|
22
|
+
Manually building slides or flashcards from a 40-figure chapter takes hours. The two
|
|
23
|
+
hard parts are (1) matching extracted images to their figure numbers — PDF
|
|
24
|
+
stream order is often shuffled — and (2) cleaning captions of running headers
|
|
25
|
+
and footnotes that bleed in. `figgydeck` does both with pure-Python signal
|
|
26
|
+
processing (font sizes, spatial geometry, ruled-line detection); no LLM call
|
|
27
|
+
required for the standard Elsevier textbook layout.
|
|
28
|
+
|
|
29
|
+
## Install
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
pip install figgydeck
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
System requirements: `poppler` (for `pdftoppm` and `pdfimages`).
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
# macOS
|
|
39
|
+
brew install poppler
|
|
40
|
+
|
|
41
|
+
# Debian/Ubuntu
|
|
42
|
+
sudo apt install poppler-utils
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
PowerPoint output (`--format pptx`) needs an optional extra:
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
pip install "figgydeck[pptx]"
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Usage
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
# One command, chapter PDF → figure deck (figures only by default)
|
|
55
|
+
figgydeck chapter1.pdf \
|
|
56
|
+
--book "Example Textbook (1st ed., 2020)" \
|
|
57
|
+
--chapter "Ch. 3: Cell Structure" \
|
|
58
|
+
--format apkg \
|
|
59
|
+
--output ./out/
|
|
60
|
+
|
|
61
|
+
# Output — just the deck (the extracted images are embedded inside it):
|
|
62
|
+
# out/ExampleTextbook1StEd2020_Ch3CellStructure.apkg ← import into Anki
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
Also include table cards (extra `pdftoppm` rasterization per table):
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
figgydeck chapter1.pdf --book "..." --chapter "..." --format apkg --tables
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Choose your output format(s) with `--format`/`-f` (required; repeatable or
|
|
72
|
+
comma-separated):
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
figgydeck chapter1.pdf --book "..." --chapter "..." --format apkg # only .apkg
|
|
76
|
+
figgydeck chapter1.pdf --book "..." --chapter "..." --format pptx # only .pptx
|
|
77
|
+
figgydeck chapter1.pdf --book "..." --chapter "..." --format apkg,pptx # both
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
`--format` is required — there is no default. PowerPoint output needs the
|
|
81
|
+
optional `[pptx]` extra (see [Install](#install)).
|
|
82
|
+
|
|
83
|
+
### Keeping the extraction artifacts
|
|
84
|
+
|
|
85
|
+
By default the output directory holds only the decks; the extracted images are
|
|
86
|
+
embedded inside them. To also keep the raw intermediates, pass `--save-images`
|
|
87
|
+
(writes an `images/` folder) and/or `--save-manifest` (writes `manifest.json`,
|
|
88
|
+
the structured list of everything that was extracted):
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
figgydeck chapter1.pdf --book "..." --chapter "..." --format apkg \
|
|
92
|
+
--save-manifest --save-images
|
|
93
|
+
# out/...apkg ← the deck
|
|
94
|
+
# out/manifest.json ← inspect what was extracted
|
|
95
|
+
# out/images/ ← extracted figures
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
### Multiple chapters
|
|
99
|
+
|
|
100
|
+
Pass more than one chapter PDF in a single command. Give one `--chapter` per PDF
|
|
101
|
+
(in order), or omit `--chapter` entirely to derive each title from its filename
|
|
102
|
+
(`ch01.pdf` → "Chapter 1"):
|
|
103
|
+
|
|
104
|
+
```bash
|
|
105
|
+
# one artifact per chapter (like running figgydeck once per PDF)
|
|
106
|
+
figgydeck ch01.pdf ch02.pdf ch03.pdf --book "..." --format pptx
|
|
107
|
+
|
|
108
|
+
# --combine merges every chapter into ONE artifact per format
|
|
109
|
+
figgydeck ch01.pdf ch02.pdf ch03.pdf --book "..." --combine --format apkg,pptx
|
|
110
|
+
# out/<Book>_Combined.pptx ← one deck: title slide + all figures
|
|
111
|
+
# out/<Book>_Combined.apkg ← one Anki package, one subdeck per chapter
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
With multiple PDFs, when you keep artifacts (`--save-manifest`/`--save-images`),
|
|
115
|
+
each chapter's files go in their own `out/NN_<slug>/` subdir (so identical image
|
|
116
|
+
names don't collide). Slide images keep their native aspect ratio and are
|
|
117
|
+
centered — nothing is stretched or cropped.
|
|
118
|
+
|
|
119
|
+
## How it works
|
|
120
|
+
|
|
121
|
+
For an Elsevier-format chapter, `figgydeck`:
|
|
122
|
+
|
|
123
|
+
1. Extracts all embedded images via `pdfimages`
|
|
124
|
+
2. Walks each PDF page with `pdfplumber`, identifying:
|
|
125
|
+
- **`FIGURE X.Y` / `TABLE X.Y` labels** by font size (~9 pt label font)
|
|
126
|
+
- **caption text** by font size (8 pt) within column-aware bounding boxes
|
|
127
|
+
- **table regions** by horizontal-rule detection
|
|
128
|
+
3. Matches each label to the nearest image whose bottom edge sits just above it
|
|
129
|
+
in the same column — robust to images appearing in shuffled stream order
|
|
130
|
+
4. Crops table regions from rasterized pages
|
|
131
|
+
5. Cleans captions (strips running headers, decodes ligatures, de-hyphenates
|
|
132
|
+
line wraps)
|
|
133
|
+
6. Builds your chosen deck(s) — `.apkg` and/or `.pptx` (optionally emitting the
|
|
134
|
+
intermediate `manifest.json` / `images/` with `--save-manifest` /
|
|
135
|
+
`--save-images`)
|
|
136
|
+
|
|
137
|
+
On a well-behaved Elsevier-format chapter this reliably matches every figure to
|
|
138
|
+
its caption and crops tables with their full titles.
|
|
139
|
+
|
|
140
|
+
## What you get
|
|
141
|
+
|
|
142
|
+
Both formats carry the same content — the figure or table image plus its
|
|
143
|
+
caption and book/chapter/page metadata — laid out for the way you'll use it.
|
|
144
|
+
|
|
145
|
+
**PowerPoint slides** (`.pptx`)
|
|
146
|
+
|
|
147
|
+
- One slide per figure/table, image centered and scaled to fit with its aspect
|
|
148
|
+
ratio preserved (no cropping)
|
|
149
|
+
- Number, title, caption, book, chapter, and page in the slide's speaker notes
|
|
150
|
+
- A combined deck (`--combine`) opens with a title slide (book + chapter count)
|
|
151
|
+
|
|
152
|
+
**Anki cards** (`.apkg`)
|
|
153
|
+
|
|
154
|
+
- A clean two-side layout that's readable in both light and dark mode
|
|
155
|
+
- **Front:** image only, with a small `Fig X.Y` tag below
|
|
156
|
+
- **Back:** type label, caption, and book/chapter/page metadata
|
|
157
|
+
- Stable note GUIDs based on `(book, chapter, type, number)` — re-importing
|
|
158
|
+
updates existing cards rather than duplicating them
|
|
159
|
+
|
|
160
|
+
## AI assistance
|
|
161
|
+
|
|
162
|
+
Built with help from [Claude Code](https://claude.com/claude-code). All code is
|
|
163
|
+
human-reviewed.
|