polyptych 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- common/__init__.py +0 -0
- common/compat.py +91 -0
- common/logging_setup.py +96 -0
- common/usage_log.py +32 -0
- polyptych/__init__.py +26 -0
- polyptych/batch_utils.py +85 -0
- polyptych/clean_source.py +316 -0
- polyptych/cli.py +1047 -0
- polyptych/client.py +333 -0
- polyptych/concurrent_engine.py +214 -0
- polyptych/cross_validate.py +319 -0
- polyptych/image_batch.py +452 -0
- polyptych/logging_setup.py +64 -0
- polyptych/model_config.py +186 -0
- polyptych/models/__init__.py +165 -0
- polyptych/models/infographic.py +258 -0
- polyptych/models/slide.py +678 -0
- polyptych/pipeline.py +60 -0
- polyptych/pipeline_base.py +353 -0
- polyptych/pipeline_config.py +161 -0
- polyptych/pipeline_infographic.py +448 -0
- polyptych/pipeline_task.py +1000 -0
- polyptych/presets.py +311 -0
- polyptych/prompt_loader.py +146 -0
- polyptych/providers/__init__.py +86 -0
- polyptych/providers/anthropic.py +215 -0
- polyptych/providers/base.py +349 -0
- polyptych/providers/gemini.py +179 -0
- polyptych/providers/openai.py +158 -0
- polyptych/providers/vertex.py +50 -0
- polyptych/providers/xai.py +193 -0
- polyptych/ref_utils.py +84 -0
- polyptych/run_config.py +204 -0
- polyptych/task_registry.py +217 -0
- polyptych/tasks/__init__.py +32 -0
- polyptych/tasks/task_01_genre.py +68 -0
- polyptych/tasks/task_02_analysis.py +69 -0
- polyptych/tasks/task_03_structure.py +84 -0
- polyptych/tasks/task_04_content.py +112 -0
- polyptych/tasks/task_05_design.py +101 -0
- polyptych/tasks/task_06_slides.py +103 -0
- polyptych/tasks/task_07_prompts.py +580 -0
- polyptych/tasks/task_i0_analysis.py +60 -0
- polyptych/tasks/task_i1_design.py +109 -0
- polyptych/tasks/task_i2_critique.py +207 -0
- polyptych/tasks/task_i2_prompts.py +131 -0
- polyptych/text_utils.py +200 -0
- polyptych/usage_log.py +5 -0
- polyptych-0.1.0.dist-info/METADATA +209 -0
- polyptych-0.1.0.dist-info/RECORD +54 -0
- polyptych-0.1.0.dist-info/WHEEL +4 -0
- polyptych-0.1.0.dist-info/entry_points.txt +2 -0
- polyptych-0.1.0.dist-info/licenses/LICENSE +202 -0
- polyptych-0.1.0.dist-info/licenses/NOTICE +10 -0
common/__init__.py
ADDED
|
File without changes
|
common/compat.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""Backward-compatible resolution of renamed env vars and runtime directories.
|
|
2
|
+
|
|
3
|
+
The Polyptych rebrand renamed user-facing env vars from the ``SLIDE_GEN_*``
|
|
4
|
+
namespace to ``POLYPTYCH_*`` and runtime directories from ``slide-analysis`` to
|
|
5
|
+
``polyptych``. To avoid breaking existing setups, the old names are honored as
|
|
6
|
+
deprecated fallbacks: the new name/location takes precedence, the old one is
|
|
7
|
+
used only when the new is absent, and a one-time warning is logged on first use.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import logging
|
|
13
|
+
import os
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import overload
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger("polyptych")
|
|
18
|
+
|
|
19
|
+
APP_NAME = "polyptych"
|
|
20
|
+
LEGACY_APP_NAME = "slide-analysis"
|
|
21
|
+
|
|
22
|
+
# Module-global so the deprecation warning fires at most once per process per name.
|
|
23
|
+
_warned_env: set[str] = set()
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@overload
def getenv_compat(new_name: str, old_name: str, default: str) -> str: ...
@overload
def getenv_compat(new_name: str, old_name: str, default: None = None) -> str | None: ...


def getenv_compat(
    new_name: str, old_name: str, default: str | None = None
) -> str | None:
    """Look up an env var under its current name, then its deprecated alias.

    Lookup order is ``$new_name``, then ``$old_name``, then ``default``.
    A variable that is set to the empty string still counts as set (same
    semantics as ``os.environ.get``).

    Args:
        new_name: Current environment variable name; always wins when set.
        old_name: Deprecated alias consulted only when ``new_name`` is unset.
        default: Value returned when neither variable is set.

    Returns:
        The resolved value, or ``default`` when neither variable is set.

    The first time a given ``old_name`` supplies the value, a deprecation
    warning is logged on the module logger; later uses of the same alias are
    silent.
    """
    current = os.environ.get(new_name)
    if current is not None:
        return current

    fallback = os.environ.get(old_name)
    if fallback is None:
        return default

    # Record the alias before logging so the warning fires once per name.
    if old_name not in _warned_env:
        _warned_env.add(old_name)
        logger.warning(
            "Environment variable %s is deprecated and will be removed in a "
            "future release; use %s instead.",
            old_name,
            new_name,
        )
    return fallback
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _runtime_base(kind: str) -> Path:
    """Return the dot-directory for ``kind`` under the user's home.

    ``"config"`` maps to ``~/.config`` and ``"cache"`` to ``~/.cache``; any
    other ``kind`` is simply prefixed with a dot the same way.
    """
    dotted = f".{kind}"
    return Path.home().joinpath(dotted)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def runtime_dirs(kind: str) -> tuple[Path, Path]:
    """Build the current and deprecated runtime directories for ``kind``.

    Returns:
        ``(new, legacy)``: the ``APP_NAME`` directory followed by the
        ``LEGACY_APP_NAME`` directory, both under the base for ``kind``.
        Neither path is checked for existence.
    """
    root = _runtime_base(kind)
    current = root / APP_NAME
    deprecated = root / LEGACY_APP_NAME
    return current, deprecated
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def resolve_runtime_file(kind: str, filename: str) -> Path | None:
    """Locate ``filename`` in the runtime dirs, preferring the new location.

    The new directory is probed first and the legacy one only if the file is
    missing there. Returns the first path that exists, or ``None`` when the
    file is in neither directory.
    """
    # Lazy generator: the legacy dir is only touched when the new one misses.
    candidates = (folder / filename for folder in runtime_dirs(kind))
    return next((path for path in candidates if path.exists()), None)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def runtime_write_dir(kind: str) -> Path:
    """Choose the directory new runtime files of ``kind`` should go into.

    The new-namespace directory is used whenever it already exists, and also
    when neither directory exists yet. The legacy directory is returned only
    when it exists and the new one does not, so existing state is not split
    across two locations. The returned directory is not created here.
    """
    preferred, old = runtime_dirs(kind)
    if preferred.exists():
        return preferred
    return old if old.exists() else preferred
|
common/logging_setup.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""Shared logging configuration helper for the CLI entry point.
|
|
2
|
+
|
|
3
|
+
The ``polyptych`` CLI installs handlers on a named logger at startup so
|
|
4
|
+
library modules can use ``logger = logging.getLogger(__name__)`` and emit
|
|
5
|
+
diagnostics at the usual ``debug/info/warning/error`` levels without each
|
|
6
|
+
module wiring up its own output. ``polyptych.logging_setup`` is a thin
|
|
7
|
+
wrapper over this for the ``polyptych`` logger (kept separate because its
|
|
8
|
+
public API is pinned by tests).
|
|
9
|
+
|
|
10
|
+
This lives in ``common`` — the lowest shared layer — so the configurator can
|
|
11
|
+
be reused without coupling modules together.
|
|
12
|
+
|
|
13
|
+
User-facing CLI output (transcripts, timing tables, file paths, summaries)
|
|
14
|
+
stays on ``print`` to stdout; only diagnostics flow through these loggers.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import logging
|
|
20
|
+
import sys
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
|
|
23
|
+
from common.compat import getenv_compat
|
|
24
|
+
|
|
25
|
+
LOG_FORMAT = "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
|
|
26
|
+
DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
|
|
27
|
+
ENV_VAR = "POLYPTYCH_LOG_LEVEL"
|
|
28
|
+
LEGACY_ENV_VAR = "SLIDE_GEN_LOG_LEVEL"
|
|
29
|
+
|
|
30
|
+
_configured: set[str] = set()
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def resolve_level(level: str | int | None) -> int:
    """Turn a user-supplied level (name, int, or ``None``) into a ``logging`` int.

    Args:
        level: A level name (case-insensitive, surrounding whitespace
            ignored), a numeric level, or ``None`` to consult the
            ``ENV_VAR`` / ``LEGACY_ENV_VAR`` environment variables and fall
            back to ``"INFO"`` when they are unset or empty.

    Returns:
        The numeric logging level.

    Raises:
        ValueError: If ``level`` is a string that does not name an integer
            level constant of the ``logging`` module.
    """
    if level is None:
        env = getenv_compat(ENV_VAR, LEGACY_ENV_VAR)
        level = env if env else "INFO"
    if isinstance(level, int):
        return level

    resolved = getattr(logging, level.strip().upper(), None)
    # ``logging`` also exposes upper-case non-level attributes (for example
    # the BASIC_FORMAT string), so an attribute merely existing is not enough:
    # only integer constants are valid levels.
    if not isinstance(resolved, int):
        raise ValueError(
            f"Unknown log level: {level!r}. "
            "Use DEBUG, INFO, WARNING, ERROR, or CRITICAL."
        )
    return resolved
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def configure_logging(
    logger_name: str,
    level: str | int | None = None,
    log_file: str | Path | None = None,
    *,
    force: bool = False,
) -> logging.Logger:
    """Install stderr (and optional file) handlers on the named logger.

    Level resolution order: explicit ``level`` argument, then the
    ``POLYPTYCH_LOG_LEVEL`` env var (deprecated alias ``SLIDE_GEN_LOG_LEVEL``),
    then ``INFO``. Safe to call more than once for a given ``logger_name``:
    repeat calls are ignored unless ``force=True``, which swaps the handlers so
    a CLI flag can override the env var.

    Args:
        logger_name: Name of the logger to configure.
        level: Level name, numeric level, or ``None`` (see resolution order).
        log_file: Optional path of a UTF-8 log file to write to as well.
        force: Replace the handlers of an already-configured logger.

    Returns:
        The configured logger.

    Raises:
        ValueError: If ``level`` cannot be resolved.
        OSError: If ``log_file`` cannot be opened.

    Everything that can fail (level parsing, opening the log file) happens
    before the logger is touched, so a failed call leaves the previous
    configuration intact instead of a logger with missing or duplicated
    handlers.
    """
    logger = logging.getLogger(logger_name)
    if logger_name in _configured and not force:
        return logger

    # Fallible steps first; nothing below this section raises in normal use.
    resolved_level = resolve_level(level)
    formatter = logging.Formatter(LOG_FORMAT, datefmt=DATE_FORMAT)
    new_handlers: list[logging.Handler] = [logging.StreamHandler(stream=sys.stderr)]
    if log_file is not None:
        new_handlers.append(logging.FileHandler(Path(log_file), encoding="utf-8"))
    for handler in new_handlers:
        handler.setFormatter(formatter)

    if force:
        # Iterate over a copy: removeHandler mutates logger.handlers.
        for handler in list(logger.handlers):
            logger.removeHandler(handler)
            handler.close()

    logger.setLevel(resolved_level)
    for handler in new_handlers:
        logger.addHandler(handler)

    logger.propagate = False  # stop log records from double-printing via root
    _configured.add(logger_name)
    return logger
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def reset_logging(logger_name: str) -> None:
    """Detach and close every handler on the named logger, then forget it.

    After this call the logger is no longer recorded as configured, so the
    next ``configure_logging`` call installs handlers again. Intended for
    test isolation.
    """
    target = logging.getLogger(logger_name)
    # Snapshot the handler list because removeHandler mutates it.
    for attached in tuple(target.handlers):
        target.removeHandler(attached)
        attached.close()
    _configured.discard(logger_name)
|
common/usage_log.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""JSONL logging for API usage (LLM, image generation)."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import threading
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from .compat import getenv_compat, runtime_write_dir
|
|
8
|
+
|
|
9
|
+
_log_lock = threading.Lock()
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def default_usage_log() -> Path:
    """Work out where usage entries are logged by default, at call time.

    Resolution order:
    1. ``$POLYPTYCH_USAGE_LOG`` env var (new name).
    2. ``$SLIDE_GEN_USAGE_LOG`` env var (deprecated alias; emits a one-time
       warning).
    3. ``usage.jsonl`` inside the cache directory chosen by
       :func:`common.compat.runtime_write_dir`, normally
       ``~/.cache/polyptych``.

    An env var set to the empty string is still honored and yields
    ``Path("")``.
    """
    override = getenv_compat("POLYPTYCH_USAGE_LOG", "SLIDE_GEN_USAGE_LOG")
    if override is None:
        return runtime_write_dir("cache") / "usage.jsonl"
    return Path(override)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def log_usage(path: Path, entry: dict) -> None:
    """Append a single usage entry as one JSON line (thread-safe).

    Args:
        path: Destination JSONL file. Missing parent directories are created,
            so the default per-user cache location works on a fresh machine.
        entry: Mapping to serialize; values that are not JSON-native are
            stringified via ``default=str``.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            opened for appending.
    """
    # Serialize outside the lock so only the file I/O is guarded.
    line = json.dumps(entry, default=str) + "\n"
    target = Path(path)
    with _log_lock:
        # Without this, the first append into a not-yet-existing cache dir
        # fails with FileNotFoundError.
        target.parent.mkdir(parents=True, exist_ok=True)
        with open(target, "a", encoding="utf-8") as f:
            f.write(line)
|
polyptych/__init__.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""Slide generation pipeline using Gemini API."""
|
|
2
|
+
|
|
3
|
+
from .models import (
|
|
4
|
+
Task1Output,
|
|
5
|
+
Task2Output,
|
|
6
|
+
Task3Output,
|
|
7
|
+
Task4Output,
|
|
8
|
+
Task5Output,
|
|
9
|
+
Task6Output,
|
|
10
|
+
Task7Output,
|
|
11
|
+
)
|
|
12
|
+
from .client import TextClient, GeminiTextClient
|
|
13
|
+
from .pipeline import SlidePipeline
|
|
14
|
+
|
|
15
|
+
__all__ = [
|
|
16
|
+
"Task1Output",
|
|
17
|
+
"Task2Output",
|
|
18
|
+
"Task3Output",
|
|
19
|
+
"Task4Output",
|
|
20
|
+
"Task5Output",
|
|
21
|
+
"Task6Output",
|
|
22
|
+
"Task7Output",
|
|
23
|
+
"TextClient",
|
|
24
|
+
"GeminiTextClient",
|
|
25
|
+
"SlidePipeline",
|
|
26
|
+
]
|
polyptych/batch_utils.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
"""Generic batch orchestration: overlap splitting + parallel execution.
|
|
2
|
+
|
|
3
|
+
Shared by task_a1_sentence_beats (paragraphs) and task_a2_shot_planning (beats).
|
|
4
|
+
Each domain keeps its own spec dataclass (with domain-specific derived fields)
|
|
5
|
+
and wraps `split_with_overlap()` to enrich the generic `OverlapBatch` records.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
from typing import Callable, Generic, TypeVar
|
|
13
|
+
|
|
14
|
+
T = TypeVar("T")
|
|
15
|
+
R = TypeVar("R")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class OverlapBatch(Generic[T]):
    """One batch from an overlap split: the core items plus neighbouring context.

    ``core_start`` is a 0-based index into the original list, so callers can
    derive 1-based IDs or absolute positions from it together with the lengths
    of ``core``, ``context_before`` and ``context_after``.
    """

    # 0-based position of this batch in the sequence of batches.
    batch_index: int
    # Items this batch is responsible for.
    core: list[T]
    # Items immediately preceding ``core`` in the original list (may be empty).
    context_before: list[T]
    # Items immediately following ``core`` in the original list (may be empty).
    context_after: list[T]
    # 0-based index of the first ``core`` item in the original list.
    core_start: int
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def split_with_overlap(
    items: list[T],
    batch_size: int,
    overlap: int = 0,
) -> list[OverlapBatch[T]]:
    """Split `items` into batches of `batch_size` with `overlap` before/after context.

    Args:
        items: The full ordered list to split.
        batch_size: Maximum number of core items per batch; must be >= 1.
        overlap: Number of neighbouring items to attach as context on each
            side of a batch. Negative values are treated as 0.

    Returns:
        The batches in order. The last batch may be shorter than
        `batch_size`. Context lists are clamped to the available items at
        either end. Returns [] for empty input.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    # A negative step would make range() yield nothing and silently drop every
    # item; zero would raise an opaque range() error. Reject both up front.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    # A negative overlap could turn a slice end negative (counting from the
    # end of the list) and attach unrelated items as context.
    overlap = max(overlap, 0)

    total = len(items)
    batches: list[OverlapBatch[T]] = []
    for idx, start in enumerate(range(0, total, batch_size)):
        end = min(start + batch_size, total)
        before_start = max(0, start - overlap)
        after_end = min(total, end + overlap)
        batches.append(
            OverlapBatch(
                batch_index=idx,
                core=items[start:end],
                context_before=items[before_start:start],
                context_after=items[end:after_end],
                core_start=start,
            )
        )
    return batches
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def run_batches(
    specs: list[T],
    process: Callable[[T], R],
    max_workers: int,
) -> list[R]:
    """Run `process(spec)` for each spec, return results in submission order.

    Args:
        specs: Work items; may be empty.
        process: Callable applied to each spec.
        max_workers: Upper bound on worker threads; the pool never uses more
            threads than there are specs.

    Returns:
        One result per spec, in the same order as `specs`. An empty `specs`
        yields [] without creating a thread pool.

    Raises:
        Exception: Whatever `process` raises is propagated to the caller.
        ValueError: If `max_workers` is below 1 and more than one spec is
            given (raised by ThreadPoolExecutor).

    Single-batch fast path: if there is exactly one spec, it runs directly in
    the calling thread. Otherwise results are collected via `as_completed`
    and re-ordered by the spec's original index so that ordering is
    deterministic regardless of completion order.
    """
    # ThreadPoolExecutor rejects max_workers=0, which is what min() would
    # produce for an empty list.
    if not specs:
        return []
    if len(specs) == 1:
        return [process(specs[0])]

    effective_workers = min(len(specs), max_workers)
    results: dict[int, R] = {}
    with ThreadPoolExecutor(max_workers=effective_workers) as executor:
        futures = {
            executor.submit(process, spec): idx for idx, spec in enumerate(specs)
        }
        for future in as_completed(futures):
            results[futures[future]] = future.result()
    return [results[i] for i in range(len(specs))]
|
|
@@ -0,0 +1,316 @@
|
|
|
1
|
+
"""Clean PDF-to-markdown conversion artifacts from source files.
|
|
2
|
+
|
|
3
|
+
Removes page numbers, footnote blocks, footnote superscripts, TOC with dot leaders,
|
|
4
|
+
broken image references, and unnecessary backslash escaping.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import re
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class CleaningConfig:
    """Switches selecting which cleaning passes run; every pass is on by default."""

    # Drop the table of contents found under a CONTENTS heading.
    remove_toc: bool = True
    # Drop broken image reference lines and their captions.
    remove_image_refs: bool = True
    # Drop footnote blocks and the superscript digits that point at them.
    remove_footnotes: bool = True
    # Drop standalone page-number lines.
    remove_page_numbers: bool = True
    # Undo unnecessary backslash escaping.
    unescape_chars: bool = True
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class CleaningResult:
    """Counters describing what a cleaning run changed; all start at zero."""

    # Line count of the input text.
    original_lines: int = 0
    # Line count of the cleaned text.
    cleaned_lines: int = 0
    # Lines removed by the table-of-contents pass.
    toc_lines_removed: int = 0
    # Image reference lines removed.
    image_refs_removed: int = 0
    # Footnote superscript markers stripped from the text.
    footnote_refs_removed: int = 0
    # Lines removed as part of footnote blocks.
    footnote_blocks_removed: int = 0
    # Standalone page-number lines removed.
    page_numbers_removed: int = 0
    # Backslash escapes undone.
    escapes_fixed: int = 0
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def remove_toc(text: str) -> tuple[str, int]:
    """Cut the table of contents out of ``text``.

    The TOC starts at the first line consisting only of ``CONTENTS``
    (optionally wrapped in up to two asterisks on each side). It ends just
    before the first line after it that looks like a chapter heading
    (``**<digits>.`` or ``**<digits>\\.``) and contains no dot leader (five
    consecutive dots). If no such heading exists, the TOC is taken to be the
    run of lines after the heading that are blank, contain a dot leader, or
    start with ``*`` or an upper-case ASCII letter.

    Returns:
        ``(text_without_toc, lines_removed)``. The input is returned
        unchanged with a count of 0 when there is no CONTENTS heading.
    """
    rows = text.split("\n")

    heading_at = next(
        (
            pos
            for pos, row in enumerate(rows)
            if re.match(r"^\*{0,2}CONTENTS\*{0,2}\s*$", row.strip())
        ),
        None,
    )
    if heading_at is None:
        return text, 0

    after_heading = heading_at + 1

    def starts_body(candidate: str) -> bool:
        # A real chapter heading, as opposed to a TOC entry with dot leaders.
        return bool(re.match(r"\*{2}\d+\\?\.", candidate)) and "....." not in candidate

    cut = next(
        (
            pos
            for pos in range(after_heading, len(rows))
            if starts_body(rows[pos].strip())
        ),
        None,
    )

    if cut is None:
        # Fallback: swallow the contiguous run of TOC-looking lines.
        cut = after_heading
        while cut < len(rows):
            probe = rows[cut].strip()
            toc_like = (
                probe == ""
                or "....." in probe
                or re.match(r"^\*", probe)
                or re.match(r"^[A-Z]", probe)
            )
            if not toc_like:
                break
            cut += 1

    kept = rows[:heading_at] + rows[cut:]
    return "\n".join(kept), cut - heading_at
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def remove_image_refs(text: str) -> tuple[str, int]:
    """Remove broken image references and their captions.

    A reference is a line consisting solely of ``![][imageN]``, optionally
    wrapped in single asterisks. The line directly after a reference is
    treated as its caption and dropped too when it is non-blank, shorter than
    200 characters, does not start with ``**`` (a heading), and is not itself
    an image reference.

    Returns:
        ``(text_without_images, references_removed)``. Caption lines are not
        included in the count.
    """
    image_ref = re.compile(r"^\s*\*?!\[\]\[image\d+\]\*?\s*$")
    lines = text.split("\n")
    kept: list[str] = []
    count = 0
    i = 0

    while i < len(lines):
        line = lines[i]
        if not image_ref.match(line.strip()):
            kept.append(line)
            i += 1
            continue

        count += 1
        i += 1
        if i < len(lines):
            caption = lines[i].strip()
            # A following image reference must be handled as a reference in
            # its own right. Swallowing it as a caption would leave it
            # uncounted and let its real caption through.
            if (
                caption
                and not caption.startswith("**")
                and len(caption) < 200
                and not image_ref.match(caption)
            ):
                i += 1

    return "\n".join(kept), count
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def remove_footnote_blocks(text: str) -> tuple[str, int]:
    """Remove footnote reference blocks.

    Footnote blocks start with a line of whitespace (typically ` ` or blank),
    followed by lines starting with a number + space (e.g., `1 General C.C. Krulak...`).

    A block is recognised purely by shape: one or more whitespace-only lines
    followed by a line starting with 1-3 digits, whitespace, and a
    non-whitespace character. The check is heuristic, so ordinary text of
    that shape after a blank line is removed as well.

    Returns:
        ``(text_without_footnotes, lines_removed)``. The count includes the
        whitespace-only separator lines that introduced each block.
    """
    lines = text.split("\n")
    result = []
    count = 0
    i = 0

    while i < len(lines):
        # Check for separator line (whitespace-only, often ` `) followed by footnote
        if re.match(r"^[\s]*$", lines[i]):
            # Look ahead for footnote lines
            j = i + 1
            # Skip additional blank lines
            while j < len(lines) and re.match(r"^\s*$", lines[j]):
                j += 1
            # Check if next non-blank line starts a footnote (number + space at start)
            if j < len(lines) and re.match(r"^\d{1,3}\s+\S", lines[j]):
                # This is a footnote block - consume all footnote lines
                block_start = i
                # Advance j past every line that belongs to the block; on exit
                # j is the index of the first line that is kept.
                while j < len(lines):
                    line = lines[j].strip()
                    if line == "":
                        # Blank line - check if followed by another footnote
                        k = j + 1
                        while k < len(lines) and re.match(r"^\s*$", lines[k]):
                            k += 1
                        if k < len(lines) and re.match(r"^\d{1,3}\s+\S", lines[k]):
                            # Jump past the blank run and the footnote line it leads to.
                            j = k + 1
                            continue
                        else:
                            # Blank line not followed by a footnote: the block
                            # ends here and this blank line is kept.
                            break
                    elif re.match(r"^\d{1,3}\s+\S", line):
                        j += 1
                        continue
                    else:
                        # Continuation line of a footnote (wrapped text)
                        # Heuristic: a non-footnote line is consumed only when
                        # the line directly before it is a footnote line (or
                        # blank), so one wrapped line per footnote is absorbed
                        # and a second consecutive one ends the block.
                        if j > 0 and (
                            re.match(r"^\d{1,3}\s+\S", lines[j - 1].strip())
                            or lines[j - 1].strip() == ""
                        ):
                            j += 1
                            continue
                        else:
                            break

                # Everything in [block_start, j) is dropped, separators included.
                removed_lines = j - block_start
                count += removed_lines
                i = j
                continue

        result.append(lines[i])
        i += 1

    return "\n".join(result), count
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def remove_footnote_superscripts(text: str) -> tuple[str, int]:
    """Drop footnote marker digits that are glued to the end of a word.

    Handles: `intellect1.` → `intellect.`, `war2,` → `war,`
    Leaves alone: years like 1997, hyphenated model numbers like F-16, and
    standalone numbers.

    Returns:
        ``(text_without_markers, markers_removed)``.
    """
    # 1-3 digits that directly follow an ASCII letter and are followed by
    # punctuation, whitespace, a closing bracket/quote, or end of text.
    # Requiring a letter immediately before the digits is what protects
    # numbers preceded by a digit, hyphen, or space.
    marker = r"(?<=[a-zA-Z])(\d{1,3})(?=[.,;:!?\s\)\]\"\'»]|$)"
    cleaned, hits = re.subn(marker, "", text)
    return cleaned, hits
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def remove_page_numbers(text: str) -> tuple[str, int]:
    r"""Drop lines that contain nothing but a page number.

    A line is removed when, after stripping, it matches `^\d{1,3}$` (Arabic)
    or `^[ivxlc]+$` (lower-case Roman) and its neighbours on both sides are
    blank or missing. Roman numerals are only considered before the first
    chapter heading (`**<digits>.`), i.e. in the front matter.

    Returns:
        ``(text_without_page_numbers, lines_removed)``.
    """
    rows = text.split("\n")

    # Front matter ends at the first chapter heading; with no heading, the
    # whole document counts as front matter.
    chapter_at = next(
        (
            pos
            for pos, row in enumerate(rows)
            if re.match(r"\*{2}\d+\\?\.", row.strip())
        ),
        len(rows),
    )

    kept = []
    dropped = 0
    for pos, row in enumerate(rows):
        token = row.strip()
        arabic = re.match(r"^\d{1,3}$", token) is not None
        roman = re.match(r"^[ivxlc]+$", token) is not None and pos < chapter_at

        if not (arabic or roman) or not _is_surrounded_by_blanks(rows, pos):
            kept.append(row)
        else:
            dropped += 1

    return "\n".join(kept), dropped
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def _is_surrounded_by_blanks(lines: list[str], idx: int) -> bool:
    """Tell whether ``lines[idx]`` has blank neighbours on both sides.

    A neighbour counts as blank when it is empty after stripping. The start
    and end of the list count as blank neighbours too.
    """
    last = len(lines) - 1
    blank_above = True if idx == 0 else lines[idx - 1].strip() == ""
    blank_below = True if idx == last else lines[idx + 1].strip() == ""
    return blank_above and blank_below
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def unescape_chars(text: str) -> tuple[str, int]:
    r"""Remove unnecessary backslash escaping from PDF conversion.

    Cleans: `\.` → `.`, `\-` → `-`, `\+` → `+`, `\[` → `[`, `\]` → `]`,
    `\(` → `(`, `\)` → `)`, and `\\` → `\`.

    The text is scanned once from left to right, so each escape sequence is
    unescaped exactly once. An escaped backslash followed by one of the other
    characters (`\\-`) becomes `\-` and is not collapsed a second time.

    Returns:
        ``(unescaped_text, escapes_removed)``.
    """
    # One backslash followed by one character from the escapable set; the
    # replacement keeps only that character.
    return re.subn(r"\\([.\\\-+\[\]()])", r"\1", text)
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def normalize_whitespace(text: str) -> str:
    """Trim trailing whitespace on every line and cap runs of blank lines at two.

    Trimming happens first, so lines that held only whitespace count as blank
    when runs of four or more newlines are reduced to three.
    """
    trimmed = "\n".join(map(str.rstrip, text.split("\n")))
    return re.sub(r"\n{4,}", "\n\n\n", trimmed)
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def clean_source(
    text: str, config: CleaningConfig | None = None
) -> tuple[str, CleaningResult]:
    """Apply the enabled cleaning passes to ``text`` in a fixed order.

    Order: table of contents, image references, footnote blocks then
    footnote superscripts, page numbers, backslash unescaping. Whitespace
    normalization always runs last, regardless of ``config``.

    Args:
        text: Markdown text to clean.
        config: Which passes to run; defaults to ``CleaningConfig()`` (all
            passes enabled).

    Returns:
        ``(cleaned_text, stats)``, where ``stats`` holds the per-pass counters
        and the line counts before and after cleaning.
    """
    settings = CleaningConfig() if config is None else config
    stats = CleaningResult(original_lines=len(text.split("\n")))

    if settings.remove_toc:
        text, stats.toc_lines_removed = remove_toc(text)

    if settings.remove_image_refs:
        text, stats.image_refs_removed = remove_image_refs(text)

    if settings.remove_footnotes:
        # Blocks first, then the inline markers that pointed at them.
        text, stats.footnote_blocks_removed = remove_footnote_blocks(text)
        text, stats.footnote_refs_removed = remove_footnote_superscripts(text)

    if settings.remove_page_numbers:
        text, stats.page_numbers_removed = remove_page_numbers(text)

    if settings.unescape_chars:
        text, stats.escapes_fixed = unescape_chars(text)

    text = normalize_whitespace(text)
    stats.cleaned_lines = len(text.split("\n"))
    return text, stats
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
def clean_file(
    input_path: Path,
    output_path: Path | None = None,
    config: CleaningConfig | None = None,
) -> tuple[Path, CleaningResult]:
    """Clean a UTF-8 text file and write the result to disk.

    Args:
        input_path: File to read.
        output_path: Where to write the cleaned text. Defaults to
            ``<stem>-clean<suffix>`` next to the input, reusing the input
            file's own suffix.
        config: Passed through to ``clean_source``.

    Returns:
        ``(path_written, stats)``.
    """
    raw = input_path.read_text(encoding="utf-8")

    destination = output_path
    if destination is None:
        destination = input_path.with_name(
            f"{input_path.stem}-clean{input_path.suffix}"
        )

    cleaned_text, stats = clean_source(raw, config)
    destination.write_text(cleaned_text, encoding="utf-8")
    return destination, stats
|