polyptych 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. common/__init__.py +0 -0
  2. common/compat.py +91 -0
  3. common/logging_setup.py +96 -0
  4. common/usage_log.py +32 -0
  5. polyptych/__init__.py +26 -0
  6. polyptych/batch_utils.py +85 -0
  7. polyptych/clean_source.py +316 -0
  8. polyptych/cli.py +1047 -0
  9. polyptych/client.py +333 -0
  10. polyptych/concurrent_engine.py +214 -0
  11. polyptych/cross_validate.py +319 -0
  12. polyptych/image_batch.py +452 -0
  13. polyptych/logging_setup.py +64 -0
  14. polyptych/model_config.py +186 -0
  15. polyptych/models/__init__.py +165 -0
  16. polyptych/models/infographic.py +258 -0
  17. polyptych/models/slide.py +678 -0
  18. polyptych/pipeline.py +60 -0
  19. polyptych/pipeline_base.py +353 -0
  20. polyptych/pipeline_config.py +161 -0
  21. polyptych/pipeline_infographic.py +448 -0
  22. polyptych/pipeline_task.py +1000 -0
  23. polyptych/presets.py +311 -0
  24. polyptych/prompt_loader.py +146 -0
  25. polyptych/providers/__init__.py +86 -0
  26. polyptych/providers/anthropic.py +215 -0
  27. polyptych/providers/base.py +349 -0
  28. polyptych/providers/gemini.py +179 -0
  29. polyptych/providers/openai.py +158 -0
  30. polyptych/providers/vertex.py +50 -0
  31. polyptych/providers/xai.py +193 -0
  32. polyptych/ref_utils.py +84 -0
  33. polyptych/run_config.py +204 -0
  34. polyptych/task_registry.py +217 -0
  35. polyptych/tasks/__init__.py +32 -0
  36. polyptych/tasks/task_01_genre.py +68 -0
  37. polyptych/tasks/task_02_analysis.py +69 -0
  38. polyptych/tasks/task_03_structure.py +84 -0
  39. polyptych/tasks/task_04_content.py +112 -0
  40. polyptych/tasks/task_05_design.py +101 -0
  41. polyptych/tasks/task_06_slides.py +103 -0
  42. polyptych/tasks/task_07_prompts.py +580 -0
  43. polyptych/tasks/task_i0_analysis.py +60 -0
  44. polyptych/tasks/task_i1_design.py +109 -0
  45. polyptych/tasks/task_i2_critique.py +207 -0
  46. polyptych/tasks/task_i2_prompts.py +131 -0
  47. polyptych/text_utils.py +200 -0
  48. polyptych/usage_log.py +5 -0
  49. polyptych-0.1.0.dist-info/METADATA +209 -0
  50. polyptych-0.1.0.dist-info/RECORD +54 -0
  51. polyptych-0.1.0.dist-info/WHEEL +4 -0
  52. polyptych-0.1.0.dist-info/entry_points.txt +2 -0
  53. polyptych-0.1.0.dist-info/licenses/LICENSE +202 -0
  54. polyptych-0.1.0.dist-info/licenses/NOTICE +10 -0
common/__init__.py ADDED
File without changes
common/compat.py ADDED
@@ -0,0 +1,91 @@
1
+ """Backward-compatible resolution of renamed env vars and runtime directories.
2
+
3
+ The Polyptych rebrand renamed user-facing env vars from the ``SLIDE_GEN_*``
4
+ namespace to ``POLYPTYCH_*`` and runtime directories from ``slide-analysis`` to
5
+ ``polyptych``. To avoid breaking existing setups, the old names are honored as
6
+ deprecated fallbacks: the new name/location takes precedence, the old one is
7
+ used only when the new is absent, and a one-time warning is logged on first use.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import logging
13
+ import os
14
+ from pathlib import Path
15
+ from typing import overload
16
+
17
+ logger = logging.getLogger("polyptych")
18
+
19
+ APP_NAME = "polyptych"
20
+ LEGACY_APP_NAME = "slide-analysis"
21
+
22
+ # Module-global so the deprecation warning fires at most once per process per name.
23
+ _warned_env: set[str] = set()
24
+
25
+
26
@overload
def getenv_compat(new_name: str, old_name: str, default: str) -> str: ...
@overload
def getenv_compat(new_name: str, old_name: str, default: None = None) -> str | None: ...


def getenv_compat(
    new_name: str, old_name: str, default: str | None = None
) -> str | None:
    """Look up an env var under its current name, then its deprecated alias.

    Lookup order is ``$new_name``, then ``$old_name``, then ``default``.
    A variable that is set to the empty string still counts as set, exactly
    as with ``os.environ.get``.

    Args:
        new_name: Preferred environment variable name.
        old_name: Deprecated name consulted only when ``new_name`` is unset.
        default: Value returned when neither variable is set.

    Returns:
        The first value found, or ``default``.

    Side effects:
        The first time a given ``old_name`` supplies the value, a deprecation
        warning is logged; the name is remembered in ``_warned_env`` so the
        warning is not repeated for that name within the process.
    """
    current = os.environ.get(new_name)
    if current is not None:
        return current

    fallback = os.environ.get(old_name)
    if fallback is None:
        return default

    # Only the deprecated alias is set: warn once per alias, then honor it.
    if old_name not in _warned_env:
        _warned_env.add(old_name)
        logger.warning(
            "Environment variable %s is deprecated and will be removed in a "
            "future release; use %s instead.",
            old_name,
            new_name,
        )
    return fallback
57
+
58
+
59
+ def _runtime_base(kind: str) -> Path:
60
+ """Base dir for a runtime ``kind`` — ``config`` → ~/.config, ``cache`` → ~/.cache."""
61
+ return Path.home() / f".{kind}"
62
+
63
+
64
def runtime_dirs(kind: str) -> tuple[Path, Path]:
    """Build the current and legacy runtime directories for ``kind``.

    Returns:
        A ``(new, legacy)`` pair: the base directory for ``kind`` joined with
        ``APP_NAME`` and with ``LEGACY_APP_NAME`` respectively. Neither path
        is created or checked for existence.
    """
    root = _runtime_base(kind)
    current = root.joinpath(APP_NAME)
    previous = root.joinpath(LEGACY_APP_NAME)
    return current, previous
68
+
69
+
70
def resolve_runtime_file(kind: str, filename: str) -> Path | None:
    """Locate ``filename`` in the runtime directories for ``kind``.

    The new directory is checked before the legacy one.

    Returns:
        The first candidate path that exists, or ``None`` when ``filename``
        is present in neither directory.
    """
    current, previous = runtime_dirs(kind)
    candidates = (folder / filename for folder in (current, previous))
    return next((path for path in candidates if path.exists()), None)
78
+
79
+
80
def runtime_write_dir(kind: str) -> Path:
    """Choose the directory new runtime files for ``kind`` should go into.

    The new-namespace directory is preferred. The legacy directory is chosen
    only when it exists and the new one does not, so that existing state stays
    in a single location instead of being split across two directories.

    Returns:
        The selected directory path. It is not created by this function.
    """
    preferred, previous = runtime_dirs(kind)
    if preferred.exists():
        return preferred
    # New dir is absent: keep using a legacy dir if one is already there.
    return previous if previous.exists() else preferred
@@ -0,0 +1,96 @@
1
+ """Shared logging configuration helper for the CLI entry point.
2
+
3
+ The ``polyptych`` CLI installs handlers on a named logger at startup so
4
+ library modules can use ``logger = logging.getLogger(__name__)`` and emit
5
+ diagnostics at the usual ``debug/info/warning/error`` levels without each
6
+ module wiring up its own output. ``polyptych.logging_setup`` is a thin
7
+ wrapper over this for the ``polyptych`` logger (kept separate because its
8
+ public API is pinned by tests).
9
+
10
+ This lives in ``common`` — the lowest shared layer — so the configurator can
11
+ be reused without coupling modules together.
12
+
13
+ User-facing CLI output (transcripts, timing tables, file paths, summaries)
14
+ stays on ``print`` to stdout; only diagnostics flow through these loggers.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import logging
20
+ import sys
21
+ from pathlib import Path
22
+
23
+ from common.compat import getenv_compat
24
+
25
+ LOG_FORMAT = "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
26
+ DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
27
+ ENV_VAR = "POLYPTYCH_LOG_LEVEL"
28
+ LEGACY_ENV_VAR = "SLIDE_GEN_LOG_LEVEL"
29
+
30
+ _configured: set[str] = set()
31
+
32
+
33
def resolve_level(level: str | int | None) -> int:
    """Turn a user-supplied level (name, int, or ``None``) into a ``logging`` int.

    Args:
        level: A level name (case-insensitive, e.g. ``"debug"``), a numeric
            level, or ``None``. ``None`` falls back to the ``ENV_VAR`` /
            ``LEGACY_ENV_VAR`` environment variables and then to ``"INFO"``
            (an empty env value also falls back to ``"INFO"``).

    Returns:
        The numeric logging level.

    Raises:
        ValueError: If ``level`` does not name a numeric level constant of the
            ``logging`` module.
    """
    if level is None:
        env = getenv_compat(ENV_VAR, LEGACY_ENV_VAR)
        level = env if env else "INFO"
    if isinstance(level, int):
        return level

    message = (
        f"Unknown log level: {level!r}. "
        "Use DEBUG, INFO, WARNING, ERROR, or CRITICAL."
    )
    try:
        resolved = getattr(logging, level.upper())
    except AttributeError as err:
        raise ValueError(message) from err
    # ``logging`` also exposes uppercase attributes that are not levels
    # (e.g. ``BASIC_FORMAT`` is a format string); reject anything non-numeric
    # so the ``-> int`` contract holds.
    if not isinstance(resolved, int):
        raise ValueError(message)
    return resolved
47
+
48
+
49
def configure_logging(
    logger_name: str,
    level: str | int | None = None,
    log_file: str | Path | None = None,
    *,
    force: bool = False,
) -> logging.Logger:
    """Install stderr (and optional file) handlers on the named logger.

    Level resolution order: explicit ``level`` argument, then the
    ``POLYPTYCH_LOG_LEVEL`` env var (deprecated alias ``SLIDE_GEN_LOG_LEVEL``),
    then ``INFO``. Safe to call more than once for a given ``logger_name`` —
    repeat calls are ignored unless ``force=True``, which swaps the handlers so
    a CLI flag can override the env var.

    Args:
        logger_name: Name of the logger to configure.
        level: Level name, numeric level, or ``None`` to use the env/default.
        log_file: Optional path; when given, records are also appended to it
            (UTF-8).
        force: Replace the handlers even if this logger was already configured.

    Returns:
        The configured logger. Its ``propagate`` flag is set to ``False``.

    Raises:
        ValueError: If ``level`` is not a recognised level.
        OSError: If ``log_file`` cannot be opened.
    """
    logger = logging.getLogger(logger_name)
    if logger_name in _configured and not force:
        return logger

    # Do everything that can fail (level parsing, opening the log file) before
    # mutating the logger, so an error never leaves it stripped of its old
    # handlers or with a half-installed set that a retry would duplicate.
    resolved_level = resolve_level(level)
    formatter = logging.Formatter(LOG_FORMAT, datefmt=DATE_FORMAT)

    new_handlers: list[logging.Handler] = [logging.StreamHandler(stream=sys.stderr)]
    if log_file is not None:
        new_handlers.append(logging.FileHandler(Path(log_file), encoding="utf-8"))
    for handler in new_handlers:
        handler.setFormatter(formatter)

    if force:
        for handler in list(logger.handlers):
            logger.removeHandler(handler)
            handler.close()

    logger.setLevel(resolved_level)
    for handler in new_handlers:
        logger.addHandler(handler)

    logger.propagate = False  # stop log records from double-printing via root
    _configured.add(logger_name)
    return logger
88
+
89
+
90
def reset_logging(logger_name: str) -> None:
    """Detach and close every handler on the named logger, then forget it.

    After this call the logger name is no longer recorded in ``_configured``,
    so a later ``configure_logging`` call installs handlers again. Intended
    for tests.
    """
    target = logging.getLogger(logger_name)
    # Iterate over a copy: removeHandler mutates ``target.handlers``.
    for attached in target.handlers[:]:
        target.removeHandler(attached)
        attached.close()
    _configured.discard(logger_name)
common/usage_log.py ADDED
@@ -0,0 +1,32 @@
1
+ """JSONL logging for API usage (LLM, image generation)."""
2
+
3
+ import json
4
+ import threading
5
+ from pathlib import Path
6
+
7
+ from .compat import getenv_compat, runtime_write_dir
8
+
9
+ _log_lock = threading.Lock()
10
+
11
+
12
def default_usage_log() -> Path:
    """Compute the usage-log path at call time.

    Resolution order:
    1. ``$POLYPTYCH_USAGE_LOG`` (current name).
    2. ``$SLIDE_GEN_USAGE_LOG`` (deprecated alias, resolved through
       ``getenv_compat``).
    3. ``usage.jsonl`` inside the directory returned by
       ``runtime_write_dir("cache")``.

    Returns:
        The resolved path. The file and its directory are not created here.
    """
    override = getenv_compat("POLYPTYCH_USAGE_LOG", "SLIDE_GEN_USAGE_LOG")
    if override is None:
        return runtime_write_dir("cache") / "usage.jsonl"
    return Path(override)
25
+
26
+
27
def log_usage(path: Path, entry: dict) -> None:
    """Append a single usage entry as one JSON line.

    Args:
        path: Destination JSONL file. Its parent directory is created if it
            does not exist yet, so a fresh default location works on first use.
        entry: Mapping to serialise. Values that ``json`` cannot encode are
            converted with ``str``.

    The write is serialised across threads of this process by the module-level
    ``_log_lock``.
    """
    line = json.dumps(entry, default=str) + "\n"
    target = Path(path)
    with _log_lock:
        # Without this, the first append into a not-yet-created runtime
        # directory fails with FileNotFoundError.
        target.parent.mkdir(parents=True, exist_ok=True)
        with open(target, "a", encoding="utf-8") as f:
            f.write(line)
polyptych/__init__.py ADDED
@@ -0,0 +1,26 @@
1
+ """Slide generation pipeline using Gemini API."""
2
+
3
+ from .models import (
4
+ Task1Output,
5
+ Task2Output,
6
+ Task3Output,
7
+ Task4Output,
8
+ Task5Output,
9
+ Task6Output,
10
+ Task7Output,
11
+ )
12
+ from .client import TextClient, GeminiTextClient
13
+ from .pipeline import SlidePipeline
14
+
15
+ __all__ = [
16
+ "Task1Output",
17
+ "Task2Output",
18
+ "Task3Output",
19
+ "Task4Output",
20
+ "Task5Output",
21
+ "Task6Output",
22
+ "Task7Output",
23
+ "TextClient",
24
+ "GeminiTextClient",
25
+ "SlidePipeline",
26
+ ]
@@ -0,0 +1,85 @@
1
+ """Generic batch orchestration: overlap splitting + parallel execution.
2
+
3
+ Shared by task_a1_sentence_beats (paragraphs) and task_a2_shot_planning (beats).
4
+ Each domain keeps its own spec dataclass (with domain-specific derived fields)
5
+ and wraps `split_with_overlap()` to enrich the generic `OverlapBatch` records.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from concurrent.futures import ThreadPoolExecutor, as_completed
11
+ from dataclasses import dataclass
12
+ from typing import Callable, Generic, TypeVar
13
+
14
+ T = TypeVar("T")
15
+ R = TypeVar("R")
16
+
17
+
18
@dataclass
class OverlapBatch(Generic[T]):
    """One batch produced by an overlap split: core items plus context.

    ``core_start`` is the 0-based position of the first core item in the
    original list, which lets callers derive 1-based IDs or other projections
    together with the lengths of the three item lists.
    """

    # Position of this batch in the sequence of batches (0-based).
    batch_index: int
    # Items this batch is responsible for.
    core: list[T]
    # Items immediately preceding ``core`` in the original list.
    context_before: list[T]
    # Items immediately following ``core`` in the original list.
    context_after: list[T]
    # Index of ``core[0]`` in the original list.
    core_start: int
32
+
33
+
34
def split_with_overlap(
    items: list[T],
    batch_size: int,
    overlap: int = 0,
) -> list[OverlapBatch[T]]:
    """Split `items` into batches of `batch_size` with `overlap` before/after context.

    The last batch may be shorter than batch_size. Context lists are clamped
    to the available items at either end. Returns [] for empty input.

    Args:
        items: Sequence to partition; it is sliced, never mutated.
        batch_size: Number of core items per batch; must be at least 1.
        overlap: Number of neighbouring items to attach as context on each
            side. Values below zero are treated as zero.

    Raises:
        ValueError: If `batch_size` is less than 1. A negative size would
            otherwise produce no batches at all and silently drop every item.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    overlap = max(overlap, 0)

    total = len(items)
    batches: list[OverlapBatch[T]] = []
    for idx, start in enumerate(range(0, total, batch_size)):
        end = min(start + batch_size, total)
        batches.append(
            OverlapBatch(
                batch_index=idx,
                core=items[start:end],
                context_before=items[max(0, start - overlap):start],
                context_after=items[end:min(total, end + overlap)],
                core_start=start,
            )
        )
    return batches
59
+
60
+
61
def run_batches(
    specs: list[T],
    process: Callable[[T], R],
    max_workers: int,
) -> list[R]:
    """Run `process(spec)` for each spec, return results in submission order.

    Empty input returns [] without starting a pool (a pool sized to zero
    workers would be rejected by ThreadPoolExecutor). Single-batch fast path:
    if there is exactly one spec, it runs directly in the calling thread.
    Otherwise results are collected via `as_completed` and re-ordered by the
    spec's original index so that ordering is deterministic regardless of
    completion order.

    Args:
        specs: Work items, one call to `process` each.
        process: Callable applied to every spec.
        max_workers: Upper bound on pool threads; the pool never has more
            threads than there are specs.

    Raises:
        Whatever `process` raises; the exception surfaces from the first
        failed future that is retrieved.
    """
    if not specs:
        return []
    if len(specs) == 1:
        return [process(specs[0])]

    effective_workers = min(len(specs), max_workers)
    results: dict[int, R] = {}
    with ThreadPoolExecutor(max_workers=effective_workers) as executor:
        futures = {
            executor.submit(process, spec): idx for idx, spec in enumerate(specs)
        }
        for future in as_completed(futures):
            results[futures[future]] = future.result()
    return [results[i] for i in range(len(specs))]
@@ -0,0 +1,316 @@
1
+ """Clean PDF-to-markdown conversion artifacts from source files.
2
+
3
+ Removes page numbers, footnote blocks, footnote superscripts, TOC with dot leaders,
4
+ broken image references, and unnecessary backslash escaping.
5
+ """
6
+
7
+ import re
8
+ from dataclasses import dataclass
9
+ from pathlib import Path
10
+
11
+
12
@dataclass
class CleaningConfig:
    """Switches selecting which cleaning passes run; all are on by default."""

    # Drop the table of contents.
    remove_toc: bool = True
    # Drop broken image references.
    remove_image_refs: bool = True
    # Drop footnote blocks and the in-text superscript markers together.
    remove_footnotes: bool = True
    # Drop standalone page-number lines.
    remove_page_numbers: bool = True
    # Remove unnecessary backslash escapes.
    unescape_chars: bool = True
21
+
22
+
23
@dataclass
class CleaningResult:
    """Counters describing what a cleaning run changed; all start at zero."""

    # Line count of the input text.
    original_lines: int = 0
    # Line count of the output text.
    cleaned_lines: int = 0
    # Lines dropped by the table-of-contents pass.
    toc_lines_removed: int = 0
    # Image references dropped.
    image_refs_removed: int = 0
    # In-text footnote superscripts stripped.
    footnote_refs_removed: int = 0
    # Lines dropped as part of footnote blocks.
    footnote_blocks_removed: int = 0
    # Standalone page-number lines dropped.
    page_numbers_removed: int = 0
    # Backslash escapes removed.
    escapes_fixed: int = 0
35
+
36
+
37
def remove_toc(text: str) -> tuple[str, int]:
    """Strip a dot-leader table of contents from ``text``.

    The TOC starts at the first line that is just ``CONTENTS`` (optionally
    wrapped in up to two asterisks on each side). It ends at the first later
    line that looks like a chapter heading (``**<digits>.`` or
    ``**<digits>\\.``) and does not contain five consecutive dots. When no
    such heading exists, the TOC is taken to be the run of following lines
    that are blank, contain five dots, start with ``*`` or start with an
    ASCII capital letter.

    Returns:
        ``(cleaned_text, lines_removed)``; the input is returned unchanged
        with a count of 0 when no CONTENTS heading is found.
    """
    rows = text.split("\n")

    start = next(
        (
            n
            for n, row in enumerate(rows)
            if re.match(r"^\*{0,2}CONTENTS\*{0,2}\s*$", row.strip())
        ),
        None,
    )
    if start is None:
        return text, 0

    # Preferred terminator: a real chapter heading (no dot leaders on it).
    stop = None
    for n in range(start + 1, len(rows)):
        entry = rows[n].strip()
        if "....." not in entry and re.match(r"\*{2}\d+\\?\.", entry):
            stop = n
            break

    if stop is None:
        # Fallback: swallow the contiguous run of TOC-looking lines.
        stop = start + 1
        for n, raw in enumerate(rows[start + 1:], start + 1):
            entry = raw.strip()
            toc_like = (
                entry == ""
                or "....." in entry
                or re.match(r"^\*", entry)
                or re.match(r"^[A-Z]", entry)
            )
            if not toc_like:
                break
            stop = n + 1

    remaining = rows[:start] + rows[stop:]
    return "\n".join(remaining), stop - start
83
+
84
+
85
def remove_image_refs(text: str) -> tuple[str, int]:
    """Drop broken image reference lines together with their captions.

    A reference is a line consisting only of ``![][imageN]``, optionally
    wrapped in single asterisks. The line right after a reference is treated
    as its caption and dropped too when it is non-blank, does not start with
    ``**`` and is shorter than 200 characters.

    Returns:
        ``(cleaned_text, references_removed)``. Caption lines are dropped
        but not counted.
    """
    rows = text.split("\n")
    total = len(rows)
    kept = []
    hits = 0
    pos = 0

    while pos < total:
        row = rows[pos]
        if not re.match(r"^\s*\*?!\[\]\[image\d+\]\*?\s*$", row.strip()):
            kept.append(row)
            pos += 1
            continue

        hits += 1
        pos += 1  # step past the reference itself
        if pos < total:
            caption = rows[pos].strip()
            if caption and not caption.startswith("**") and len(caption) < 200:
                pos += 1  # the caption goes with the reference

    return "\n".join(kept), hits
114
+
115
+
116
def remove_footnote_blocks(text: str) -> tuple[str, int]:
    """Drop footnote blocks: numbered note lines introduced by blank lines.

    A block begins at a whitespace-only line whose next non-blank line starts
    with one to three digits, whitespace and a non-space character (for
    example ``1 General C.C. Krulak...``). The block then extends over further
    note lines, over one wrapped continuation line directly after a note line,
    and over blank gaps that lead to another note line.

    Returns:
        ``(cleaned_text, lines_removed)`` where the count includes the leading
        separator lines of each block.
    """
    rows = text.split("\n")
    total = len(rows)
    kept = []
    dropped = 0
    pos = 0

    def is_note(candidate: str):
        return re.match(r"^\d{1,3}\s+\S", candidate)

    def skip_blank(at: int) -> int:
        # Advance past whitespace-only lines, returning the first other index.
        while at < total and re.match(r"^\s*$", rows[at]):
            at += 1
        return at

    while pos < total:
        if re.match(r"^[\s]*$", rows[pos]):
            cursor = skip_blank(pos + 1)
            if cursor < total and is_note(rows[cursor]):
                # Inside a footnote block: find where it ends.
                while cursor < total:
                    current = rows[cursor].strip()
                    if current == "":
                        # A gap only belongs to the block if a note follows it.
                        ahead = skip_blank(cursor + 1)
                        if ahead < total and is_note(rows[ahead]):
                            cursor = ahead + 1
                            continue
                        break
                    if is_note(current):
                        cursor += 1
                        continue
                    # Other text: accept it as wrapped note text only when the
                    # line just before it is a note line or blank.
                    previous = rows[cursor - 1].strip()
                    if cursor > 0 and (is_note(previous) or previous == ""):
                        cursor += 1
                        continue
                    break

                dropped += cursor - pos
                pos = cursor
                continue

        kept.append(rows[pos])
        pos += 1

    return "\n".join(kept), dropped
177
+
178
+
179
def remove_footnote_superscripts(text: str) -> tuple[str, int]:
    """Strip footnote marker digits that are glued to the end of a word.

    Examples: ``intellect1.`` becomes ``intellect.`` and ``war2.`` becomes
    ``war.``. Years such as 1997, model numbers such as F-16 and standalone
    numbers are left alone.

    Returns:
        ``(cleaned_text, markers_removed)``.
    """
    # One to three digits that directly follow an ASCII letter and are
    # followed by punctuation, whitespace, a closing bracket/quote or the end
    # of the text. Requiring a letter immediately before the digits is what
    # protects digit runs such as years and hyphenated numbers such as F-16.
    marker = r"(?<=[a-zA-Z])(\d{1,3})(?=[.,;:!?\s\)\]\"\'»]|$)"
    stripped, removed = re.subn(marker, "", text)
    return stripped, removed
199
+
200
+
201
def remove_page_numbers(text: str) -> tuple[str, int]:
    r"""Drop lines that hold nothing but a page number.

    A line qualifies when, after stripping, it is one to three digits, or a
    run of the lowercase letters ``ivxlc`` located before the first chapter
    heading (``**<digits>.`` or ``**<digits>\.``). It is removed only if the
    line before it and the line after it are blank or missing.

    Returns:
        ``(cleaned_text, lines_removed)``.
    """
    rows = text.split("\n")
    last = len(rows) - 1

    # Roman numerals count as page numbers only in the front matter, i.e.
    # before the first chapter heading (or everywhere if there is none).
    front_matter_end = next(
        (n for n, row in enumerate(rows) if re.match(r"\*{2}\d+\\?\.", row.strip())),
        len(rows),
    )

    def isolated(n: int) -> bool:
        above_clear = n == 0 or rows[n - 1].strip() == ""
        below_clear = n == last or rows[n + 1].strip() == ""
        return above_clear and below_clear

    kept = []
    dropped = 0
    for n, row in enumerate(rows):
        token = row.strip()
        arabic = re.match(r"^\d{1,3}$", token) is not None
        roman = re.match(r"^[ivxlc]+$", token) is not None and n < front_matter_end
        if (arabic or roman) and isolated(n):
            dropped += 1
        else:
            kept.append(row)

    return "\n".join(kept), dropped
234
+
235
+
236
+ def _is_surrounded_by_blanks(lines: list[str], idx: int) -> bool:
237
+ """Check if a line is surrounded by blank (or near-blank) lines."""
238
+ prev_blank = idx == 0 or lines[idx - 1].strip() == ""
239
+ next_blank = idx == len(lines) - 1 or lines[idx + 1].strip() == ""
240
+ return prev_blank and next_blank
241
+
242
+
243
def unescape_chars(text: str) -> tuple[str, int]:
    r"""Remove unnecessary backslash escaping from PDF conversion.

    Cleans: `\.` → `.`, `\-` → `-`, `\+` → `+`, `\[` → `[`, `\]` → `]`,
    `\(` → `(`, `\)` → `)`.

    Only those seven escapes are touched. A doubled backslash (`\\`) is left
    as it is: collapsing it would expose a fresh `\.`-style sequence that a
    later run would unescape again.

    Returns:
        ``(cleaned_text, escapes_removed)``.
    """
    # Single pass: a backslash followed by one of the listed characters is
    # replaced by that character alone.
    return re.subn(r"\\([.\-+\[\]()])", r"\1", text)
258
+
259
+
260
def normalize_whitespace(text: str) -> str:
    """Trim trailing whitespace on every line and cap runs of blank lines.

    Runs of four or more newlines (three or more blank lines) are reduced to
    three newlines, i.e. two blank lines.
    """
    trimmed = "\n".join(row.rstrip() for row in text.split("\n"))
    return re.sub(r"\n{4,}", "\n\n\n", trimmed)
268
+
269
+
270
def clean_source(
    text: str, config: CleaningConfig | None = None
) -> tuple[str, CleaningResult]:
    """Apply every enabled cleaning pass to ``text`` in a fixed order.

    Order: table of contents, image references, footnote blocks, footnote
    superscripts, page numbers, backslash escapes. Whitespace normalisation
    always runs last, regardless of the configuration.

    Args:
        text: Source text to clean.
        config: Which passes to run; ``None`` means a default
            ``CleaningConfig`` (all passes enabled).

    Returns:
        ``(cleaned_text, stats)`` where ``stats`` holds the input and output
        line counts plus the count reported by each pass that ran.
    """
    settings = CleaningConfig() if config is None else config

    stats = CleaningResult()
    stats.original_lines = len(text.split("\n"))

    # (enabled?, pass function, CleaningResult field receiving its count).
    # The footnote switch drives two passes: blocks first, then superscripts.
    stages = (
        (settings.remove_toc, remove_toc, "toc_lines_removed"),
        (settings.remove_image_refs, remove_image_refs, "image_refs_removed"),
        (settings.remove_footnotes, remove_footnote_blocks, "footnote_blocks_removed"),
        (settings.remove_footnotes, remove_footnote_superscripts, "footnote_refs_removed"),
        (settings.remove_page_numbers, remove_page_numbers, "page_numbers_removed"),
        (settings.unescape_chars, unescape_chars, "escapes_fixed"),
    )
    for enabled, stage, counter in stages:
        if enabled:
            text, tally = stage(text)
            setattr(stats, counter, tally)

    text = normalize_whitespace(text)
    stats.cleaned_lines = len(text.split("\n"))
    return text, stats
300
+
301
+
302
def clean_file(
    input_path: Path,
    output_path: Path | None = None,
    config: CleaningConfig | None = None,
) -> tuple[Path, CleaningResult]:
    """Clean the file at ``input_path`` and write the result to disk.

    Args:
        input_path: File to read (UTF-8).
        output_path: Where to write the cleaned text (UTF-8). When omitted,
            the output goes next to the input as ``<stem>-clean<suffix>``,
            keeping the input's extension.
        config: Cleaning options forwarded to ``clean_source``.

    Returns:
        ``(path_written, stats)``.
    """
    source = input_path.read_text(encoding="utf-8")

    destination = output_path
    if destination is None:
        derived_name = f"{input_path.stem}-clean{input_path.suffix}"
        destination = input_path.parent / derived_name

    cleaned, stats = clean_source(source, config)
    destination.write_text(cleaned, encoding="utf-8")
    return destination, stats