figgydeck 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- figgydeck/__init__.py +22 -0
- figgydeck/anki.py +331 -0
- figgydeck/clean.py +83 -0
- figgydeck/cli.py +212 -0
- figgydeck/extract.py +135 -0
- figgydeck/images.py +91 -0
- figgydeck/layout.py +326 -0
- figgydeck/models.py +45 -0
- figgydeck/pptx.py +333 -0
- figgydeck-1.0.0.dist-info/METADATA +195 -0
- figgydeck-1.0.0.dist-info/RECORD +14 -0
- figgydeck-1.0.0.dist-info/WHEEL +4 -0
- figgydeck-1.0.0.dist-info/entry_points.txt +2 -0
- figgydeck-1.0.0.dist-info/licenses/LICENSE +21 -0
figgydeck/__init__.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""figgydeck: turn textbooks into study decks (PowerPoint slides or Anki decks)."""
|
|
2
|
+
|
|
3
|
+
from importlib.metadata import PackageNotFoundError
|
|
4
|
+
from importlib.metadata import version as _pkg_version
|
|
5
|
+
|
|
6
|
+
from figgydeck.anki import build_apkg, build_combined_apkg
|
|
7
|
+
from figgydeck.extract import extract_chapter
|
|
8
|
+
from figgydeck.models import Chapter
|
|
9
|
+
|
|
10
|
+
# The PowerPoint builders (build_pptx, build_combined_pptx) are equal
|
|
11
|
+
# first-class outputs, but they live in figgydeck.pptx and are imported from
|
|
12
|
+
# there so that the top-level package never hard-imports the optional
|
|
13
|
+
# python-pptx dependency. Use `from figgydeck.pptx import build_pptx`.
|
|
14
|
+
|
|
15
|
+
# Version is defined once in pyproject.toml; read it back from the installed
# distribution metadata so the two can never drift.
try:
    __version__ = _pkg_version("figgydeck")
except PackageNotFoundError:  # running from a source tree that isn't installed
    # Placeholder used when no installed distribution metadata can be found.
    __version__ = "0.0.0+unknown"

# Names re-exported by the top-level package. Only the Anki builders, the
# extractor and the Chapter record are listed here.
__all__ = ["extract_chapter", "build_apkg", "build_combined_apkg", "Chapter"]
|
figgydeck/anki.py
ADDED
|
@@ -0,0 +1,331 @@
|
|
|
1
|
+
"""Anki deck builder: turn a manifest + images into a .apkg file."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
import html
|
|
7
|
+
import re
|
|
8
|
+
import shutil
|
|
9
|
+
import tempfile
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
import genanki
|
|
13
|
+
|
|
14
|
+
from figgydeck.models import Chapter, load_manifest
|
|
15
|
+
|
|
16
|
+
# Stable model ID — must not change between runs, or Anki will fail to
# update existing notes when users re-import.
_MODEL_ID = 1607392319
# Base value for deck ids: _deck_for() adds a digest-derived offset in
# [0, 100000) to this number to get a per-book/chapter deck id.
_DECK_ID_BASE = 2059400110
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
CARD_CSS = """
|
|
23
|
+
/* ---------- Light mode (default) ---------- */
|
|
24
|
+
.card {
|
|
25
|
+
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
|
|
26
|
+
font-size: 17px; color: #2A1B23; background: #F8F4EC;
|
|
27
|
+
text-align: left; padding: 18px; line-height: 1.5;
|
|
28
|
+
}
|
|
29
|
+
.card-img {
|
|
30
|
+
display: block; max-width: 100%; max-height: 60vh;
|
|
31
|
+
margin: 0 auto 12px; border-radius: 6px;
|
|
32
|
+
box-shadow: 0 2px 12px rgba(0,0,0,0.15);
|
|
33
|
+
background: #ffffff; padding: 6px;
|
|
34
|
+
}
|
|
35
|
+
.tag-row {
|
|
36
|
+
text-align: center; font-size: 12px; letter-spacing: 2px;
|
|
37
|
+
text-transform: uppercase; color: #A26769;
|
|
38
|
+
font-weight: 600; margin-top: 8px;
|
|
39
|
+
}
|
|
40
|
+
.divider {
|
|
41
|
+
border: 0; border-top: 2px solid #C9A86A;
|
|
42
|
+
width: 60px; margin: 16px auto;
|
|
43
|
+
}
|
|
44
|
+
.label {
|
|
45
|
+
font-size: 11px; letter-spacing: 1.5px; text-transform: uppercase;
|
|
46
|
+
color: #A26769; font-weight: 600; margin-bottom: 4px;
|
|
47
|
+
}
|
|
48
|
+
.title {
|
|
49
|
+
font-family: Georgia, serif; font-size: 22px; font-weight: 700;
|
|
50
|
+
color: #6D2E46; margin: 0 0 14px 0; line-height: 1.3;
|
|
51
|
+
}
|
|
52
|
+
.caption { font-size: 15px; color: #2A1B23; margin: 0 0 16px 0; }
|
|
53
|
+
.meta {
|
|
54
|
+
font-size: 12px; color: #8A7079; font-style: italic;
|
|
55
|
+
border-top: 1px solid #D8C9B5; padding-top: 10px; margin-top: 12px;
|
|
56
|
+
}
|
|
57
|
+
.meta-line { margin: 2px 0; }
|
|
58
|
+
.meta b { color: #6D2E46; font-style: normal; }
|
|
59
|
+
|
|
60
|
+
/* ---------- Dark mode (.nightMode / .night_mode) ---------- */
|
|
61
|
+
.nightMode .card, .night_mode .card,
|
|
62
|
+
.card.nightMode, .card.night_mode {
|
|
63
|
+
color: #ECE2D0; background: #1F1419;
|
|
64
|
+
}
|
|
65
|
+
.nightMode .card-img, .night_mode .card-img,
|
|
66
|
+
.card.nightMode .card-img, .card.night_mode .card-img {
|
|
67
|
+
background: #ffffff; box-shadow: 0 2px 16px rgba(0,0,0,0.6);
|
|
68
|
+
}
|
|
69
|
+
.nightMode .tag-row, .night_mode .tag-row,
|
|
70
|
+
.card.nightMode .tag-row, .card.night_mode .tag-row { color: #E8B86F; }
|
|
71
|
+
.nightMode .divider, .night_mode .divider,
|
|
72
|
+
.card.nightMode .divider, .card.night_mode .divider { border-top-color: #E8B86F; }
|
|
73
|
+
.nightMode .label, .night_mode .label,
|
|
74
|
+
.card.nightMode .label, .card.night_mode .label { color: #E8B86F; }
|
|
75
|
+
.nightMode .title, .night_mode .title,
|
|
76
|
+
.card.nightMode .title, .card.night_mode .title { color: #F2C9A0; }
|
|
77
|
+
.nightMode .caption, .night_mode .caption,
|
|
78
|
+
.card.nightMode .caption, .card.night_mode .caption { color: #ECE2D0; }
|
|
79
|
+
.nightMode .meta, .night_mode .meta,
|
|
80
|
+
.card.nightMode .meta, .card.night_mode .meta {
|
|
81
|
+
color: #B5A39A; border-top-color: #4A2D38;
|
|
82
|
+
}
|
|
83
|
+
.nightMode .meta b, .night_mode .meta b,
|
|
84
|
+
.card.nightMode .meta b, .card.night_mode .meta b { color: #F2C9A0; }
|
|
85
|
+
"""
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _build_model() -> genanki.Model:
    """Return the "figgydeck Card" note model used for every note.

    The model has eight fields and a single card template. The front shows
    the image with its number underneath; the back repeats the front and adds
    the entry type, the title and caption (each only when non-empty) and the
    book / chapter / page metadata. Styling comes from CARD_CSS.
    """
    # Order matters: _add_notes() supplies field values positionally.
    field_names = (
        "Number",   # "Fig X.Y" or "Tab X.Y"
        "Image",    # <img src="...">
        "Title",    # tables have one; figures may be empty
        "Caption",  # cleaned caption body
        "Book",
        "Chapter",
        "Page",
        "Type",     # "Figure" | "Table"
    )

    front = '{{Image}}<div class="tag-row">{{Number}}</div>'
    back = "".join([
        front,
        '<hr class="divider">',
        '<div class="label">{{Type}}</div>',
        '{{#Title}}<div class="title">{{Title}}</div>{{/Title}}',
        '{{#Caption}}<div class="caption">{{Caption}}</div>{{/Caption}}',
        '<div class="meta">',
        '<div class="meta-line"><b>Book:</b> {{Book}}</div>',
        '<div class="meta-line"><b>Chapter:</b> {{Chapter}}</div>',
        '<div class="meta-line"><b>Page:</b> {{Page}}</div>',
        '</div>',
    ])
    card_template = {"name": "Image -> Details", "qfmt": front, "afmt": back}

    return genanki.Model(
        _MODEL_ID,
        "figgydeck Card",
        fields=[{"name": field_name} for field_name in field_names],
        templates=[card_template],
        css=CARD_CSS,
    )
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _slugify(s: str) -> str:
    """Lower-case `s` and reduce it to a hyphen-separated slug.

    Characters other than word characters, whitespace and hyphens are
    dropped; every run of whitespace and/or hyphens becomes a single "-";
    leading and trailing hyphens are removed.
    """
    kept = re.sub(r"[^\w\s-]", "", s.lower())
    return re.sub(r"[-\s]+", "-", kept).strip("-")
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _add_notes(
    deck: genanki.Deck,
    model: genanki.Model,
    manifest: list[dict],
    images_dir: Path,
    book_title: str,
    chapter_title: str,
    media_files: list[str],
    *,
    media_prefix: str = "",
    stage_dir: Path | None = None,
    log=lambda *a, **k: None,
) -> int:
    """Turn manifest entries into notes on `deck` and report how many were skipped.

    An entry is skipped (and logged) when it has no ``image_filename`` or when
    that file does not exist under `images_dir`. For every other entry one
    note is added and the path of its image is appended to `media_files`, which
    the caller later hands to the genanki Package.

    When `media_prefix` is non-empty the image is first copied into `stage_dir`
    under the name ``f"{media_prefix}__{name}"`` and the note's ``<img src>``
    uses that name, so identically named images from different chapters stay
    distinct inside one package. With an empty prefix the image is referenced
    in place under its own basename.

    Raises:
        ValueError: `media_prefix` is set but `stage_dir` is None (raised when
            the first usable entry is reached).
    """
    book_slug = _slugify(book_title)
    chapter_slug = _slugify(chapter_title)
    skipped_total = 0

    for item in manifest:
        filename = item.get("image_filename")
        if not filename:
            log(f" skip {item['type']} {item['number']}: no image")
            skipped_total += 1
            continue

        source = images_dir / filename
        if not source.exists():
            log(f" skip {item['type']} {item['number']}: missing {source}")
            skipped_total += 1
            continue

        # Decide which file goes into the package and what name the card uses.
        if media_prefix and stage_dir is None:
            raise ValueError("media_prefix requires stage_dir")
        if media_prefix:
            media_name = f"{media_prefix}__{source.name}"
            packaged = stage_dir / media_name
            shutil.copy(source, packaged)
        else:
            media_name = source.name
            packaged = source
        media_files.append(str(packaged))

        kind = item["type"].capitalize()
        label_prefix = "Tab" if kind == "Table" else "Fig"
        number_label = f"{label_prefix} {item['number']}"

        # Values are positional and must follow the model's field order.
        field_values = [
            number_label,
            f'<img class="card-img" src="{html.escape(media_name)}">',
            html.escape(item.get("title") or ""),
            html.escape(item.get("caption") or ""),
            html.escape(book_title),
            html.escape(chapter_title),
            f"p. {item['page']}",
            kind,
        ]
        note_tags = [
            f"book::{book_slug}",
            f"chapter::{chapter_slug}",
            f"type::{kind.lower()}",
        ]
        # Stable GUID: re-running figgydeck and re-importing into Anki updates
        # existing notes rather than creating duplicates.
        note_guid = genanki.guid_for(book_slug, chapter_slug, kind, item["number"])

        deck.add_note(
            genanki.Note(model=model, fields=field_values, tags=note_tags, guid=note_guid)
        )

    return skipped_total
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def _deck_for(book_title: str, chapter_title: str, deck_name: str) -> genanki.Deck:
    """Build a genanki.Deck named `deck_name` with a reproducible id.

    The id is ``_DECK_ID_BASE`` plus an offset in [0, 100000) taken from the
    first four bytes of a SHA-256 digest over the slugified book and chapter
    titles. A digest is used rather than builtin ``hash()`` because the latter
    is salted per process (``PYTHONHASHSEED``), so it would give a different
    id on every run.
    """
    seed = "::".join((_slugify(book_title), _slugify(chapter_title)))
    leading_bytes = hashlib.sha256(seed.encode()).digest()[:4]
    offset = int.from_bytes(leading_bytes, "big") % 100000
    return genanki.Deck(_DECK_ID_BASE + offset, deck_name)
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def build_apkg(
    manifest: list[dict] | str | Path,
    images_dir: str | Path,
    book_title: str,
    chapter_title: str,
    output_path: str | Path,
    *,
    verbose: bool = True,
) -> Path:
    """Build an Anki .apkg from a chapter manifest.

    Args:
        manifest: Either the manifest list (from `extract_chapter()`) or a
            path to a `manifest.json` file.
        images_dir: Folder containing the extracted images referenced in
            the manifest.
        book_title: Display title for the book — appears on every card back.
        chapter_title: Display title for the chapter.
        output_path: Where to write the `.apkg` file. Missing parent
            directories are created.
        verbose: When true, progress and a summary are printed; when false
            nothing is printed.

    Returns:
        Path to the written .apkg file.
    """
    manifest = load_manifest(manifest)

    images_dir = Path(images_dir)
    output_path = Path(output_path)
    log = print if verbose else (lambda *a, **k: None)

    # Deck name uses " :: " (with spaces) for the single-chapter package.
    deck = _deck_for(book_title, chapter_title, f"{book_title} :: {chapter_title}")
    model = _build_model()
    media_files: list[str] = []

    skipped = _add_notes(
        deck, model, manifest, images_dir, book_title, chapter_title,
        media_files, log=log,
    )

    # Library callers may point at a folder that does not exist yet; create it
    # so the write below does not fail on a missing directory.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    pkg = genanki.Package(deck)
    pkg.media_files = media_files
    pkg.write_to_file(str(output_path))

    log(f"\nWrote: {output_path}")
    log(f" notes added: {len(deck.notes)}")
    log(f" skipped: {skipped}")

    return output_path
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def build_combined_apkg(
    chapters: list[Chapter],
    book_title: str,
    output_path: str | Path,
    *,
    verbose: bool = True,
) -> Path:
    """Build one `.apkg` merging every chapter into its own Anki subdeck.

    Args:
        chapters: list of `Chapter` records. Each chapter's `manifest` may be
            the list from `extract_chapter()` or a path to a `manifest.json`.
        book_title: Book title — the parent deck; each chapter becomes a
            subdeck named ``"{book_title}::{chapter_title}"`` ("::" with no
            spaces is Anki's subdeck separator, so they nest under the book).
        output_path: Where to write the combined `.apkg`. Missing parent
            directories are created.
        verbose: When true, progress and a summary are printed; when false
            nothing is printed.

    Returns:
        Path to the written .apkg file.

    Images from different chapters can share a basename (e.g. ``img-000.png``);
    since genanki keys media by basename, each chapter's images are staged with
    a chapter-prefixed basename so they don't collide inside the package.
    """
    output_path = Path(output_path)
    log = print if verbose else (lambda *a, **k: None)

    model = _build_model()
    decks: list[genanki.Deck] = []
    media_files: list[str] = []
    total_notes = 0
    total_skipped = 0

    # Keep the staging dir alive until after the package is written: the
    # entries in media_files point at the copies inside it.
    with tempfile.TemporaryDirectory(prefix="figgydeck-apkg-") as tmp:
        stage_dir = Path(tmp)

        for i, chapter in enumerate(chapters):
            manifest = load_manifest(chapter.manifest)
            chapter_title = chapter.title
            deck = _deck_for(
                book_title, chapter_title, f"{book_title}::{chapter_title}"
            )
            # Index-prefixed so media basenames stay unique even if two
            # chapters share a title (and thus a slug).
            skipped = _add_notes(
                deck, model, manifest, Path(chapter.images_dir), book_title,
                chapter_title, media_files,
                media_prefix=f"ch{i}-{_slugify(chapter_title)}",
                stage_dir=stage_dir, log=log,
            )
            decks.append(deck)
            total_notes += len(deck.notes)
            total_skipped += skipped

        # Library callers may point at a folder that does not exist yet;
        # create it so the write below does not fail on a missing directory.
        output_path.parent.mkdir(parents=True, exist_ok=True)

        pkg = genanki.Package(decks)
        pkg.media_files = media_files
        pkg.write_to_file(str(output_path))

    log(f"\nWrote: {output_path}")
    log(f" chapters: {len(chapters)}")
    log(f" notes added: {total_notes}")
    log(f" skipped: {total_skipped}")

    return output_path
|
figgydeck/clean.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""Caption cleanup: strip running headers, decode ligatures, normalize whitespace."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
# Common PDF font ligatures, mapped to the plain letters they stand for.
# Keys are the single code points U+FB00..U+FB06 (Unicode "Alphabetic
# Presentation Forms"), written as escapes so the table cannot silently
# degrade into ASCII no-ops ("fi" -> "fi") when a tool normalizes the glyphs.
LIGATURES = {
    "\ufb01": "fi",   # LATIN SMALL LIGATURE FI
    "\ufb02": "fl",   # LATIN SMALL LIGATURE FL
    "\ufb00": "ff",   # LATIN SMALL LIGATURE FF
    "\ufb03": "ffi",  # LATIN SMALL LIGATURE FFI
    "\ufb04": "ffl",  # LATIN SMALL LIGATURE FFL
    # LONG S T: the long s is a form of "s", not "f", so this reads "st".
    "\ufb05": "st",
    "\ufb06": "st",   # LATIN SMALL LIGATURE ST
    # A bare long s (e.g. from an already-decomposed long-s-t) is also an "s".
    "\u017f": "s",
}
|
|
12
|
+
|
|
13
|
+
# A running header/footer is a banner of consecutive ALL-CAPS words (the book or
# chapter title) that PDF extraction interleaves into the caption text. We treat
# any run of >= 3 consecutive capitalized tokens -- with short lowercase
# connectors like "of"/"the" allowed between them -- as such a banner and strip
# it. This is a heuristic: ordinary captions rarely contain three consecutive
# all-caps words with no intervening punctuation (acronym lists like "DNA, RNA"
# are punctuated, so they don't match and are preserved).
#
# The trailing (?![a-z]) keeps the banner from ending in the middle of a word.
# Without it the last "caps token" could be just the capital that starts the
# next Titlecase word, so "... ANIMAL SCIENCE Figure 2" lost its "F" and
# "A DNA Polymerase" was mistaken for a three-word banner.
_CAPS_WORD = r"[A-Z][A-Z0-9&.'\-]*"
_CONNECTOR = r"(?:of|the|and|in|for|to|a|an|&)"
RUNNING_HEADER_RE = re.compile(
    rf"{_CAPS_WORD}(?:(?:\s+{_CONNECTOR})*\s+{_CAPS_WORD}){{2,}}(?![a-z])"
)

# Per-line footnote patterns are publisher-specific; none are enabled by default.
# Add re.compile(...) entries here to drop footnote lines for a particular layout.
FOOTNOTE_LINE_PATTERNS: list[re.Pattern[str]] = []
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def clean_caption(text: str) -> str:
    """Turn a raw extracted caption into readable single-line text.

    Passes, in order:
        1. Replace font ligatures (LIGATURES)
        2. Blank out running headers wherever RUNNING_HEADER_RE matches
        3. Drop empty lines and lines matching FOOTNOTE_LINE_PATTERNS, then
           join the remaining lines with single spaces
        4. De-hyphenate: remove every hyphen that is followed by whitespace
        5. Rewrite "1968e1969"-style year ranges with an en-dash
        6. Collapse whitespace
        7. Run the running-header pass again, then collapse whitespace again

    Returns "" when `text` is empty.
    """
    if not text:
        return ""

    cleaned = text

    # 1. Ligatures
    for glyph, plain in LIGATURES.items():
        cleaned = cleaned.replace(glyph, plain)

    # 2. Running headers (mid-text occurrences)
    cleaned = RUNNING_HEADER_RE.sub(" ", cleaned)

    # 3. Per-line footnote filtering
    kept_lines = []
    for raw_line in cleaned.split("\n"):
        line = raw_line.strip()
        if not line:
            continue
        if any(pattern.match(line) for pattern in FOOTNOTE_LINE_PATTERNS):
            continue
        kept_lines.append(line)
    cleaned = " ".join(kept_lines)

    # 4. De-hyphenate line-wrapped words
    cleaned = re.sub(r"-\s+", "", cleaned)

    # 5. Elsevier renders en-dash as "e": "1968e1969" -> "1968–1969"
    cleaned = re.sub(r"(\d{4})e(\d{4})", r"\1–\2", cleaned)

    # 6. Collapse whitespace
    cleaned = re.sub(r"\s+", " ", cleaned).strip()

    # 7. Second running-header pass: catches banners that only became adjacent
    #    once line breaks were collapsed (header bled onto the caption tail).
    cleaned = RUNNING_HEADER_RE.sub(" ", cleaned)
    return re.sub(r"\s+", " ", cleaned).strip()
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def split_camel(s: str) -> str:
    """Re-insert spaces into run-together CamelCase text from a PDF.

    A space is placed at every point where an ASCII lowercase letter is
    directly followed by an ASCII uppercase letter, so
    "RatStrainsandStocksOriginated" becomes "Rat Strainsand Stocks Originated".
    Whitespace runs are then collapsed and the ends trimmed.
    """
    spaced = re.sub(r"(?<=[a-z])(?=[A-Z])", " ", s)
    return re.sub(r"\s+", " ", spaced).strip()
|
figgydeck/cli.py
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
"""figgydeck command-line interface."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import re
|
|
7
|
+
import shutil
|
|
8
|
+
import sys
|
|
9
|
+
import tempfile
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
from figgydeck.models import Chapter
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _slug(s: str) -> str:
    """Title-case `s` and squeeze it into one punctuation-free token.

    Everything that is neither a word character nor whitespace is removed,
    the remainder is title-cased, and all whitespace is then deleted:

        "Ch. 1: Historical Foundations" -> "Ch1HistoricalFoundations"
    """
    words_only = re.sub(r"[^\w\s]", "", s)
    return re.sub(r"\s+", "", words_only.title())
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _safe_filename(book: str, chapter: str) -> str:
    """Return the output file stem for one book/chapter pair.

    Both titles go through `_slug` and are joined with an underscore:

        "Example Textbook (1st ed., 2020)", "Ch. 3: Cell Structure"
            → "ExampleTextbook1StEd2020_Ch3CellStructure"
    """
    return "_".join((_slug(book), _slug(chapter)))
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _chapter_title(pdf_path: Path, explicit: str | None) -> str:
    """Pick the display title for one chapter PDF.

    A non-empty `explicit` title always wins. Otherwise the title comes from
    the file stem:

        "ch01"                              -> "Chapter 1"
        "Chapter-3-Biology-and-Diseases..." -> "Chapter 3: Biology and Diseases..."
        "Chapter 11 - Microbiological..."   -> "Chapter 11: Microbiological..."

    A stem without a leading chapter marker has its ``-``/``_`` runs turned
    into spaces and is title-cased; if nothing is left after that, the raw
    stem is returned.
    """
    if explicit:
        return explicit

    stem = pdf_path.stem
    # "ch" or "chapter" (any case), optional separators, the number without
    # its leading zeros, optional separators, then the descriptive remainder.
    marker = re.match(r"\s*ch(?:apter)?[\s._-]*0*(\d+)[\s._-]*(.*)$", stem, re.IGNORECASE)

    if marker is None:
        spaced = re.sub(r"[-_]+", " ", stem).strip()
        return spaced.title() if spaced else stem

    number = int(marker.group(1))
    remainder = re.sub(r"[\s_-]+", " ", marker.group(2)).strip(" _-.")
    if remainder:
        return f"Chapter {number}: {remainder}"
    return f"Chapter {number}"
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# Output formats the CLI knows how to build.
_VALID_FORMATS = ("apkg", "pptx")


def _format_list(s: str) -> list[str]:
    """Parse one --format value into a list of lower-cased format names.

    The value is split on commas; surrounding whitespace is ignored and empty
    pieces are dropped.

    Raises:
        argparse.ArgumentTypeError: no format name is left after splitting, or
            a name is not one of `_VALID_FORMATS`.
    """
    requested = []
    for piece in s.split(","):
        name = piece.strip().lower()
        if name:
            requested.append(name)

    if not requested:
        raise argparse.ArgumentTypeError("--format value cannot be empty")

    unknown = [name for name in requested if name not in _VALID_FORMATS]
    if unknown:
        choices = ", ".join(_VALID_FORMATS)
        raise argparse.ArgumentTypeError(
            f"unknown format(s): {', '.join(unknown)} (choose from: {choices})"
        )
    return requested
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def main(argv: list[str] | None = None) -> int:
    """Run the figgydeck command line and return a process exit code.

    Parses `argv` (argparse reads ``sys.argv[1:]`` when it is None), extracts
    every input PDF into a temporary working directory, and builds the
    requested deck formats into the output directory.

    Returns:
        0 on success; 2 when no --format was given, when an input PDF does
        not exist, or when the number of --chapter values differs from the
        number of PDFs.
    """
    parser = argparse.ArgumentParser(
        prog="figgydeck",
        description="Turn textbooks into study decks: extract figures (and "
                    "optionally tables) and captions from chapter PDFs and "
                    "package them as Anki decks or PowerPoint slides.",
    )
    add = parser.add_argument
    add("pdfs", nargs="+", metavar="PDF", help="One or more source chapter PDFs")
    add("--book", required=True, help="Book title (e.g. 'Example Textbook (1st ed., 2020)')")
    add(
        "--chapter", action="append", default=None, metavar="TITLE",
        help="Chapter title (e.g. 'Ch. 1: Historical Foundations'). Repeatable: "
             "give one per PDF, in order. If omitted, titles are derived from "
             "each PDF's filename.",
    )
    add("--output", "-o", default="./out", help="Output directory (default: ./out)")
    add(
        "--format", "-f", action="append", type=_format_list, default=None,
        metavar="FMT[,FMT...]",
        help="Output format(s) to build (required). Repeatable, or "
             f"comma-separated. Choices: {', '.join(_VALID_FORMATS)}.",
    )
    add("--combine", action="store_true",
        help="Merge all input PDFs into a single artifact per format.")
    add("--full-res", action="store_true",
        help="Embed full-resolution PNGs in .pptx (default: downscaled "
             "JPEGs, which keep the file small enough for Google Slides).")
    add("--tables", action="store_true",
        help="Also extract tables (default: figures only).")
    add("--save-manifest", action="store_true",
        help="Also write manifest.json (the structured extraction "
             "result) into the output directory.")
    add("--save-images", action="store_true",
        help="Also write the extracted figure/table images (an "
             "images/ folder) into the output directory.")
    add("--quiet", "-q", action="store_true", help="Suppress progress logging")
    args = parser.parse_args(argv)

    # An explicit output format is required — the decks are the only
    # deliverables, so we won't guess which one you want.
    if not args.format:
        print(
            f"error: --format is required (choose from: {', '.join(_VALID_FORMATS)}); "
            "e.g. --format apkg or --format apkg,pptx",
            file=sys.stderr,
        )
        return 2

    pdf_paths = [Path(raw) for raw in args.pdfs]
    absent = [path for path in pdf_paths if not path.exists()]
    if absent:
        for path in absent:
            print(f"error: {path} does not exist", file=sys.stderr)
        return 2

    # Resolve one chapter title per PDF: all-or-nothing on --chapter.
    given_titles = args.chapter
    if given_titles is not None and len(given_titles) != len(pdf_paths):
        print(
            f"error: got {len(args.chapter)} --chapter value(s) for "
            f"{len(pdf_paths)} PDF(s); supply either none or exactly one per PDF",
            file=sys.stderr,
        )
        return 2
    if given_titles is None:
        given_titles = [None] * len(pdf_paths)
    titles = [
        _chapter_title(pdf, given)
        for pdf, given in zip(pdf_paths, given_titles, strict=True)
    ]

    out_dir = Path(args.output)
    out_dir.mkdir(parents=True, exist_ok=True)
    verbose = not args.quiet

    # Flatten + dedupe requested formats while preserving order.
    formats: list[str] = []
    for chunk in args.format:
        for fmt in chunk:
            if fmt not in formats:
                formats.append(fmt)

    # Bind builders up front so a missing optional dep (python-pptx) fails
    # before the slow extraction step. Imported at call time so tests can
    # patch the module-level functions.
    apkg_single = apkg_combined = None
    pptx_single = pptx_combined = None
    if "apkg" in formats:
        from figgydeck.anki import build_apkg as apkg_single
        from figgydeck.anki import build_combined_apkg as apkg_combined
    if "pptx" in formats:
        from figgydeck.pptx import build_combined_pptx as pptx_combined
        from figgydeck.pptx import build_pptx as pptx_single

    from figgydeck.extract import extract_chapter

    single_pdf = len(pdf_paths) == 1

    # Extract into a temporary working dir. The built decks embed their own
    # images, so the raw manifest.json / images/ are intermediates — surfaced
    # into out_dir only when --save-manifest / --save-images ask for them. A
    # single PDF extracts flat; multiple PDFs each get their own subdir so fixed
    # image names (img-000.png, ...) don't collide.
    with tempfile.TemporaryDirectory(prefix="figgydeck-extract-") as tmp:
        work = Path(tmp)
        chapters: list[Chapter] = []

        for index, (pdf, title) in enumerate(zip(pdf_paths, titles, strict=True)):
            if single_pdf:
                subdir = ""
                ex_dir = work
            else:
                subdir = f"{index:02d}_{_slug(title)}"
                ex_dir = work / subdir

            manifest = extract_chapter(
                pdf, ex_dir, include_tables=args.tables, verbose=verbose
            )
            chapters.append(Chapter(manifest, ex_dir / "images", title))

            if not (args.save_manifest or args.save_images):
                continue
            dest = out_dir / subdir if subdir else out_dir
            dest.mkdir(parents=True, exist_ok=True)
            if args.save_manifest:
                shutil.copy(ex_dir / "manifest.json", dest / "manifest.json")
            if args.save_images:
                shutil.copytree(ex_dir / "images", dest / "images", dirs_exist_ok=True)

        # Write requested formats (inside the temp dir so extracted images exist).
        downscale = not args.full_res
        if args.combine:
            base = f"{_slug(args.book)}_Combined"
            if apkg_combined is not None:
                apkg_combined(chapters, args.book, out_dir / f"{base}.apkg", verbose=verbose)
            if pptx_combined is not None:
                pptx_combined(chapters, args.book, out_dir / f"{base}.pptx",
                              optimize_images=downscale, verbose=verbose)
        else:
            for chapter in chapters:
                base = _safe_filename(args.book, chapter.title)
                if apkg_single is not None:
                    apkg_single(chapter.manifest, chapter.images_dir, args.book,
                                chapter.title, out_dir / f"{base}.apkg",
                                verbose=verbose)
                if pptx_single is not None:
                    pptx_single(chapter.manifest, chapter.images_dir, args.book,
                                chapter.title, out_dir / f"{base}.pptx",
                                optimize_images=downscale, verbose=verbose)

    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|