PyPI - accesspdf - Versions diffs - 1.0.0__tar.gz - Mend

accesspdf 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (90) hide show

accesspdf-1.0.0/.claude/settings.local.json +14 -0
accesspdf-1.0.0/.gitignore +18 -0
accesspdf-1.0.0/LICENSE +15 -0
accesspdf-1.0.0/OldDocs/accesspdf-prd.docx +0 -0
accesspdf-1.0.0/OldDocs/accesspdf-readme.docx +0 -0
accesspdf-1.0.0/OldDocs/accesspdf-roadmap.docx +0 -0
accesspdf-1.0.0/OldDocs/accesspdf-technical-architecture.docx +0 -0
accesspdf-1.0.0/PKG-INFO +228 -0
accesspdf-1.0.0/README.md +134 -0
accesspdf-1.0.0/accesspdf/__init__.py +3 -0
accesspdf-1.0.0/accesspdf/__main__.py +5 -0
accesspdf-1.0.0/accesspdf/alttext/__init__.py +1 -0
accesspdf-1.0.0/accesspdf/alttext/extract.py +235 -0
accesspdf-1.0.0/accesspdf/alttext/injector.py +237 -0
accesspdf-1.0.0/accesspdf/alttext/sidecar.py +169 -0
accesspdf-1.0.0/accesspdf/analyzer.py +514 -0
accesspdf-1.0.0/accesspdf/cli.py +429 -0
accesspdf-1.0.0/accesspdf/config.py +62 -0
accesspdf-1.0.0/accesspdf/models.py +147 -0
accesspdf-1.0.0/accesspdf/pipeline.py +129 -0
accesspdf-1.0.0/accesspdf/processors/__init__.py +16 -0
accesspdf-1.0.0/accesspdf/processors/_pdf_helpers.py +128 -0
accesspdf-1.0.0/accesspdf/processors/_text_extract.py +171 -0
accesspdf-1.0.0/accesspdf/processors/base.py +34 -0
accesspdf-1.0.0/accesspdf/processors/bookmarks.py +159 -0
accesspdf-1.0.0/accesspdf/processors/headings.py +251 -0
accesspdf-1.0.0/accesspdf/processors/links.py +111 -0
accesspdf-1.0.0/accesspdf/processors/metadata.py +151 -0
accesspdf-1.0.0/accesspdf/processors/reading_order.py +117 -0
accesspdf-1.0.0/accesspdf/processors/tables.py +202 -0
accesspdf-1.0.0/accesspdf/processors/tagger.py +196 -0
accesspdf-1.0.0/accesspdf/providers/__init__.py +87 -0
accesspdf-1.0.0/accesspdf/providers/anthropic.py +94 -0
accesspdf-1.0.0/accesspdf/providers/base.py +59 -0
accesspdf-1.0.0/accesspdf/providers/gemini.py +255 -0
accesspdf-1.0.0/accesspdf/providers/noop.py +19 -0
accesspdf-1.0.0/accesspdf/providers/ollama.py +129 -0
accesspdf-1.0.0/accesspdf/providers/openai.py +89 -0
accesspdf-1.0.0/accesspdf/reporter.py +54 -0
accesspdf-1.0.0/accesspdf/review/__init__.py +1 -0
accesspdf-1.0.0/accesspdf/review/app.py +227 -0
accesspdf-1.0.0/accesspdf/review/renderer.py +106 -0
accesspdf-1.0.0/accesspdf/review/widgets.py +180 -0
accesspdf-1.0.0/accesspdf/utils/__init__.py +1 -0
accesspdf-1.0.0/accesspdf/utils/contrast.py +80 -0
accesspdf-1.0.0/accesspdf/web/__init__.py +1 -0
accesspdf-1.0.0/accesspdf/web/app.py +581 -0
accesspdf-1.0.0/accesspdf/web/templates/index.html +1172 -0
accesspdf-1.0.0/accesspdf/writer.py +32 -0
accesspdf-1.0.0/docs/CLAUDE.md +255 -0
accesspdf-1.0.0/docs/pyproject.toml +140 -0
accesspdf-1.0.0/files/accesspdf-prd-v2.docx +0 -0
accesspdf-1.0.0/files/accesspdf-readme-v2.docx +0 -0
accesspdf-1.0.0/files/accesspdf-roadmap-v2.docx +0 -0
accesspdf-1.0.0/files/accesspdf-technical-architecture-v3.docx +0 -0
accesspdf-1.0.0/pyproject.toml +150 -0
accesspdf-1.0.0/tests/__init__.py +0 -0
accesspdf-1.0.0/tests/conftest.py +108 -0
accesspdf-1.0.0/tests/corpus/ambiguous_links.pdf +98 -0
accesspdf-1.0.0/tests/corpus/headings.pdf +74 -0
accesspdf-1.0.0/tests/corpus/images.alttext.yaml +24 -0
accesspdf-1.0.0/tests/corpus/images.pdf +103 -0
accesspdf-1.0.0/tests/corpus/links.pdf +90 -0
accesspdf-1.0.0/tests/corpus/low_contrast.pdf +68 -0
accesspdf-1.0.0/tests/corpus/multicolumn.pdf +68 -0
accesspdf-1.0.0/tests/corpus/scanned.pdf +139 -0
accesspdf-1.0.0/tests/corpus/simple.pdf +68 -0
accesspdf-1.0.0/tests/corpus/tables.pdf +74 -0
accesspdf-1.0.0/tests/fixtures/__init__.py +0 -0
accesspdf-1.0.0/tests/fixtures/generate.py +354 -0
accesspdf-1.0.0/tests/test_analyzer.py +38 -0
accesspdf-1.0.0/tests/test_batch.py +93 -0
accesspdf-1.0.0/tests/test_bookmarks.py +56 -0
accesspdf-1.0.0/tests/test_config.py +52 -0
accesspdf-1.0.0/tests/test_contrast.py +118 -0
accesspdf-1.0.0/tests/test_extract.py +55 -0
accesspdf-1.0.0/tests/test_headings.py +58 -0
accesspdf-1.0.0/tests/test_injector.py +125 -0
accesspdf-1.0.0/tests/test_links.py +45 -0
accesspdf-1.0.0/tests/test_metadata.py +70 -0
accesspdf-1.0.0/tests/test_models.py +102 -0
accesspdf-1.0.0/tests/test_pipeline_integration.py +103 -0
accesspdf-1.0.0/tests/test_providers.py +198 -0
accesspdf-1.0.0/tests/test_review_app.py +110 -0
accesspdf-1.0.0/tests/test_sidecar.py +177 -0
accesspdf-1.0.0/tests/test_tables.py +81 -0
accesspdf-1.0.0/tests/test_tagger.py +78 -0
accesspdf-1.0.0/tests/test_web.py +336 -0
accesspdf-1.0.0/tests/utils/__init__.py +0 -0
accesspdf-1.0.0/tests/utils/validate.py +125 -0

accesspdf-1.0.0/.claude/settings.local.json ADDED Viewed

@@ -0,0 +1,14 @@
+{
+  "permissions": {
+    "allow": [
+      "Bash(git init)",
+      "Bash(pip install -e \".[dev]\")",
+      "Bash(python -m pytest tests/ -v)",
+      "Bash(python -m accesspdf --version)",
+      "Bash(python -m accesspdf --help)",
+      "Bash(git add .gitignore LICENSE README.md pyproject.toml accesspdf/ tests/ docs/ OldDocs/ files/ .claude/)",
+      "Bash(xargs grep -l \"contrast\\\\|font.*size\\\\|color\")",
+      "Bash(wc -l /c/Users/laure/Documents/AccessPDF/accesspdf/**/*.py)"
+    ]
+  }
+}

accesspdf-1.0.0/.gitignore ADDED Viewed

@@ -0,0 +1,18 @@
+__pycache__/
+*.py[cod]
+*$py.class
+*.egg-info/
+dist/
+build/
+.eggs/
+*.egg
+.mypy_cache/
+.ruff_cache/
+.pytest_cache/
+.coverage
+htmlcov/
+*.so
+.env
+.venv/
+env/
+venv/

accesspdf-1.0.0/LICENSE ADDED Viewed

@@ -0,0 +1,15 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

accesspdf-1.0.0/OldDocs/accesspdf-prd.docx ADDED Viewed

Binary file

accesspdf-1.0.0/OldDocs/accesspdf-readme.docx ADDED Viewed

Binary file

accesspdf-1.0.0/OldDocs/accesspdf-roadmap.docx ADDED Viewed

Binary file

accesspdf-1.0.0/OldDocs/accesspdf-technical-architecture.docx ADDED Viewed

Binary file

accesspdf-1.0.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,228 @@
+Metadata-Version: 2.4
+Name: accesspdf
+Version: 1.0.0
+Summary: AI-powered PDF accessibility remediation, at scale
+Project-URL: Homepage, https://github.com/laurenaulet/accesspdf
+Project-URL: Repository, https://github.com/laurenaulet/accesspdf
+Project-URL: Issues, https://github.com/laurenaulet/accesspdf/issues
+Project-URL: Changelog, https://github.com/laurenaulet/accesspdf/CHANGELOG.md
+License:                                  Apache License
+                                   Version 2.0, January 2004
+                                http://www.apache.org/licenses/
+           Licensed under the Apache License, Version 2.0 (the "License");
+           you may not use this file except in compliance with the License.
+           You may obtain a copy of the License at
+               http://www.apache.org/licenses/LICENSE-2.0
+           Unless required by applicable law or agreed to in writing, software
+           distributed under the License is distributed on an "AS IS" BASIS,
+           WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+           See the License for the specific language governing permissions and
+           limitations under the License.
+License-File: LICENSE
+Keywords: a11y,accessibility,alt-text,pdf,wcag
+Classifier: Development Status :: 4 - Beta
+Classifier: Environment :: Console
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Education
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Text Processing :: Markup
+Classifier: Topic :: Utilities
+Requires-Python: >=3.10
+Requires-Dist: httpx>=0.27
+Requires-Dist: jinja2>=3.1
+Requires-Dist: langdetect>=1.0.9
+Requires-Dist: pdfminer-six>=20221105
+Requires-Dist: pikepdf<10.0,>=8.0
+Requires-Dist: pillow>=10.0
+Requires-Dist: pydantic>=2.0
+Requires-Dist: pyyaml>=6.0
+Requires-Dist: rich>=13.0
+Requires-Dist: textual>=0.50
+Requires-Dist: typer>=0.12
+Provides-Extra: all
+Requires-Dist: anthropic>=0.25; extra == 'all'
+Requires-Dist: fastapi>=0.110; extra == 'all'
+Requires-Dist: httpx>=0.27; extra == 'all'
+Requires-Dist: mypy>=1.9; extra == 'all'
+Requires-Dist: openai>=1.25; extra == 'all'
+Requires-Dist: pytest-asyncio>=0.23; extra == 'all'
+Requires-Dist: pytest-cov>=5.0; extra == 'all'
+Requires-Dist: pytest-vcr>=1.0; extra == 'all'
+Requires-Dist: pytest>=8.0; extra == 'all'
+Requires-Dist: python-multipart>=0.0.9; extra == 'all'
+Requires-Dist: reportlab>=4.0; extra == 'all'
+Requires-Dist: ruff>=0.4; extra == 'all'
+Requires-Dist: types-pillow; extra == 'all'
+Requires-Dist: types-pyyaml; extra == 'all'
+Requires-Dist: uvicorn[standard]>=0.29; extra == 'all'
+Requires-Dist: vcrpy>=6.0; extra == 'all'
+Provides-Extra: all-providers
+Requires-Dist: anthropic>=0.25; extra == 'all-providers'
+Requires-Dist: openai>=1.25; extra == 'all-providers'
+Provides-Extra: anthropic
+Requires-Dist: anthropic>=0.25; extra == 'anthropic'
+Provides-Extra: dev
+Requires-Dist: fastapi>=0.110; extra == 'dev'
+Requires-Dist: httpx>=0.27; extra == 'dev'
+Requires-Dist: mypy>=1.9; extra == 'dev'
+Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
+Requires-Dist: pytest-cov>=5.0; extra == 'dev'
+Requires-Dist: pytest-vcr>=1.0; extra == 'dev'
+Requires-Dist: pytest>=8.0; extra == 'dev'
+Requires-Dist: python-multipart>=0.0.9; extra == 'dev'
+Requires-Dist: reportlab>=4.0; extra == 'dev'
+Requires-Dist: ruff>=0.4; extra == 'dev'
+Requires-Dist: types-pillow; extra == 'dev'
+Requires-Dist: types-pyyaml; extra == 'dev'
+Requires-Dist: uvicorn[standard]>=0.29; extra == 'dev'
+Requires-Dist: vcrpy>=6.0; extra == 'dev'
+Provides-Extra: openai
+Requires-Dist: openai>=1.25; extra == 'openai'
+Provides-Extra: web
+Requires-Dist: fastapi>=0.110; extra == 'web'
+Requires-Dist: python-multipart>=0.0.9; extra == 'web'
+Requires-Dist: uvicorn[standard]>=0.29; extra == 'web'
+Description-Content-Type: text/markdown
+# AccessPDF
+Make PDFs accessible. Fixes structure, reading order, tables, and headings automatically -- then helps you add image descriptions with local AI or by hand.
+Targets **WCAG 2.1 AA** and **PDF/UA**.
+## Quick start
+```bash
+pip install "accesspdf[web]"
+accesspdf serve
+```
+This opens a browser UI at `http://localhost:8080`. Upload a PDF, get an accessibility report, download the fixed version. If your PDF has images, you can write alt text right in the browser -- or let AI do a first draft.
+For AI-generated alt text, we recommend **[Ollama](https://ollama.com)** -- it's free, runs locally, and needs no API key. Install it, then:
+```bash
+ollama pull llava
+```
+That's it. Select "Ollama" in the web UI and click generate.
+---
+## How it works
+AccessPDF does two things:
+1. **Fixes structure automatically** -- tags, language, reading order, headings, tables, links, bookmarks
+2. **Helps you add image descriptions** -- the one part that needs a human (or AI + human review)
+Your original PDF is never modified. Output always goes to a new file.
+## CLI workflow
+If you prefer the command line over the web UI:
+```bash
+# 1. See what's wrong (read-only, never touches your file)
+accesspdf check my-document.pdf
+# 2. Fix structural issues
+accesspdf fix my-document.pdf -o my-document_accessible.pdf
+# 3. Generate AI alt text drafts (optional)
+accesspdf generate-alt-text my-document_accessible.pdf
+# 4. Review and approve the drafts
+accesspdf review my-document_accessible.pdf
+# 5. Re-run fix to inject approved descriptions
+accesspdf fix my-document.pdf -o my-document_accessible.pdf --alt-text my-document.alttext.yaml
+```
+## AI alt text providers
+AccessPDF uses AI vision models to draft image descriptions. You always review before anything gets injected.
+| Provider | Setup | API key? | Cost |
+|----------|-------|----------|------|
+| **Ollama** (recommended) | [Install Ollama](https://ollama.com), `ollama pull llava` | No | Free (local) |
+| Google Gemini | None | `GOOGLE_API_KEY` | Free tier |
+| Anthropic (Claude) | `pip install accesspdf[anthropic]` | `ANTHROPIC_API_KEY` | Paid |
+| OpenAI (GPT-4) | `pip install accesspdf[openai]` | `OPENAI_API_KEY` | Paid |
+**Ollama is the easiest** -- no API key, no account, nothing leaves your machine. Just install it and pull a model.
+For cloud providers, set your key as an environment variable or pass it directly:
+```bash
+accesspdf generate-alt-text my-document.pdf --provider gemini --api-key AIza...
+```
+In the web UI, you can paste your API key in the settings panel -- it's sent per-request and never saved to disk.
+## Batch processing
+Fix every PDF in a folder:
+```bash
+accesspdf batch ./papers/ -o ./papers/accessible/
+accesspdf batch ./papers/ -o ./papers/accessible/ -r   # include subdirectories
+```
+## The sidecar file
+Image descriptions live in a `.alttext.yaml` file next to your PDF:
+```yaml
+images:
+- id: img_37044c
+  page: 1
+  ai_draft: 'Bar chart showing quarterly revenue from 2023-2025.'
+  alt_text: 'Bar chart showing quarterly revenue. Q1 2025 is highest at $4.2M.'
+  status: approved
+```
+Statuses: **needs_review** (not yet described), **approved** (gets injected), **decorative** (screen readers skip it). You can edit this file by hand.
+## CLI reference
+```
+accesspdf check <pdf>                    # Analyze accessibility (read-only)
+accesspdf fix <pdf> -o <output>          # Fix structure + inject alt text
+accesspdf fix <pdf> --alt-text <yaml>    # Fix with sidecar descriptions
+accesspdf batch <dir> -o <outdir>        # Fix all PDFs in a directory
+accesspdf review <pdf>                   # Terminal UI for alt text
+accesspdf serve                          # Web UI at localhost:8080
+accesspdf generate-alt-text <pdf>        # AI drafts (Ollama default)
+accesspdf providers                      # Show available AI providers
+```
+## Installation options
+```bash
+pip install accesspdf          # CLI only
+pip install "accesspdf[web]"   # CLI + browser UI (recommended)
+pip install "accesspdf[anthropic]"  # Add Claude provider
+pip install "accesspdf[openai]"     # Add GPT-4 provider
+```
+## Contributing
+```bash
+git clone https://github.com/laurenaulet/accesspdf.git
+cd accesspdf
+pip install -e ".[dev]"
+python -m pytest tests/ -v
+```
+## License
+Apache 2.0

accesspdf-1.0.0/README.md ADDED Viewed

@@ -0,0 +1,134 @@
+# AccessPDF
+Make PDFs accessible. Fixes structure, reading order, tables, and headings automatically -- then helps you add image descriptions with local AI or by hand.
+Targets **WCAG 2.1 AA** and **PDF/UA**.
+## Quick start
+```bash
+pip install "accesspdf[web]"
+accesspdf serve
+```
+This opens a browser UI at `http://localhost:8080`. Upload a PDF, get an accessibility report, download the fixed version. If your PDF has images, you can write alt text right in the browser -- or let AI do a first draft.
+For AI-generated alt text, we recommend **[Ollama](https://ollama.com)** -- it's free, runs locally, and needs no API key. Install it, then:
+```bash
+ollama pull llava
+```
+That's it. Select "Ollama" in the web UI and click generate.
+---
+## How it works
+AccessPDF does two things:
+1. **Fixes structure automatically** -- tags, language, reading order, headings, tables, links, bookmarks
+2. **Helps you add image descriptions** -- the one part that needs a human (or AI + human review)
+Your original PDF is never modified. Output always goes to a new file.
+## CLI workflow
+If you prefer the command line over the web UI:
+```bash
+# 1. See what's wrong (read-only, never touches your file)
+accesspdf check my-document.pdf
+# 2. Fix structural issues
+accesspdf fix my-document.pdf -o my-document_accessible.pdf
+# 3. Generate AI alt text drafts (optional)
+accesspdf generate-alt-text my-document_accessible.pdf
+# 4. Review and approve the drafts
+accesspdf review my-document_accessible.pdf
+# 5. Re-run fix to inject approved descriptions
+accesspdf fix my-document.pdf -o my-document_accessible.pdf --alt-text my-document.alttext.yaml
+```
+## AI alt text providers
+AccessPDF uses AI vision models to draft image descriptions. You always review before anything gets injected.
+| Provider | Setup | API key? | Cost |
+|----------|-------|----------|------|
+| **Ollama** (recommended) | [Install Ollama](https://ollama.com), `ollama pull llava` | No | Free (local) |
+| Google Gemini | None | `GOOGLE_API_KEY` | Free tier |
+| Anthropic (Claude) | `pip install accesspdf[anthropic]` | `ANTHROPIC_API_KEY` | Paid |
+| OpenAI (GPT-4) | `pip install accesspdf[openai]` | `OPENAI_API_KEY` | Paid |
+**Ollama is the easiest** -- no API key, no account, nothing leaves your machine. Just install it and pull a model.
+For cloud providers, set your key as an environment variable or pass it directly:
+```bash
+accesspdf generate-alt-text my-document.pdf --provider gemini --api-key AIza...
+```
+In the web UI, you can paste your API key in the settings panel -- it's sent per-request and never saved to disk.
+## Batch processing
+Fix every PDF in a folder:
+```bash
+accesspdf batch ./papers/ -o ./papers/accessible/
+accesspdf batch ./papers/ -o ./papers/accessible/ -r   # include subdirectories
+```
+## The sidecar file
+Image descriptions live in a `.alttext.yaml` file next to your PDF:
+```yaml
+images:
+- id: img_37044c
+  page: 1
+  ai_draft: 'Bar chart showing quarterly revenue from 2023-2025.'
+  alt_text: 'Bar chart showing quarterly revenue. Q1 2025 is highest at $4.2M.'
+  status: approved
+```
+Statuses: **needs_review** (not yet described), **approved** (gets injected), **decorative** (screen readers skip it). You can edit this file by hand.
+## CLI reference
+```
+accesspdf check <pdf>                    # Analyze accessibility (read-only)
+accesspdf fix <pdf> -o <output>          # Fix structure + inject alt text
+accesspdf fix <pdf> --alt-text <yaml>    # Fix with sidecar descriptions
+accesspdf batch <dir> -o <outdir>        # Fix all PDFs in a directory
+accesspdf review <pdf>                   # Terminal UI for alt text
+accesspdf serve                          # Web UI at localhost:8080
+accesspdf generate-alt-text <pdf>        # AI drafts (Ollama default)
+accesspdf providers                      # Show available AI providers
+```
+## Installation options
+```bash
+pip install accesspdf          # CLI only
+pip install "accesspdf[web]"   # CLI + browser UI (recommended)
+pip install "accesspdf[anthropic]"  # Add Claude provider
+pip install "accesspdf[openai]"     # Add GPT-4 provider
+```
+## Contributing
+```bash
+git clone https://github.com/laurenaulet/accesspdf.git
+cd accesspdf
+pip install -e ".[dev]"
+python -m pytest tests/ -v
+```
+## License
+Apache 2.0

accesspdf-1.0.0/accesspdf/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+"""AccessPDF — PDF accessibility remediation tool."""
+__version__ = "1.0.0"

accesspdf-1.0.0/accesspdf/__main__.py ADDED Viewed

@@ -0,0 +1,5 @@
+"""Allow running as ``python -m accesspdf``."""
+from accesspdf.cli import app
+app()

accesspdf-1.0.0/accesspdf/alttext/__init__.py ADDED Viewed

@@ -0,0 +1 @@
+"""Alt text management — sidecar files, injection, caching."""

accesspdf-1.0.0/accesspdf/alttext/extract.py ADDED Viewed

@@ -0,0 +1,235 @@
+"""Extract images from PDFs as Pillow Image objects."""
+from __future__ import annotations
+import hashlib
+import io
+import logging
+from pathlib import Path
+import pikepdf
+from PIL import Image
+logger = logging.getLogger(__name__)
+def extract_image(pdf_path: Path, image_hash: str) -> Image.Image | None:
+    """Extract a specific image from a PDF by its md5 hash.
+    Returns a Pillow Image or None if not found.
+    """
+    with pikepdf.open(pdf_path) as pdf:
+        for page in pdf.pages:
+            result = _search_page(page, image_hash)
+            if result is not None:
+                return result
+    return None
+def extract_all_images(pdf_path: Path) -> list[tuple[str, int, Image.Image]]:
+    """Extract all images from a PDF.
+    Returns a list of (hash, page_number, Image) tuples.
+    Deduplicates by hash.
+    """
+    results: list[tuple[str, int, Image.Image]] = []
+    seen: set[str] = set()
+    with pikepdf.open(pdf_path) as pdf:
+        for page_idx, page in enumerate(pdf.pages, start=1):
+            _collect_page_images(page, page_idx, seen, results)
+    return results
+def _search_page(page: pikepdf.Page, target_hash: str) -> Image.Image | None:
+    """Search a page for an image with a specific hash."""
+    if "/Resources" not in page or "/XObject" not in page["/Resources"]:
+        return None
+    for _name, xobj_ref in page["/Resources"]["/XObject"].items():
+        try:
+            xobj = xobj_ref.resolve() if hasattr(xobj_ref, "resolve") else xobj_ref
+            if not isinstance(xobj, pikepdf.Stream):
+                continue
+            subtype = str(xobj.get("/Subtype", ""))
+            if subtype == "/Image":
+                result = _try_extract(xobj, target_hash)
+                if result is not None:
+                    return result
+            elif subtype == "/Form":
+                result = _search_form(xobj, target_hash)
+                if result is not None:
+                    return result
+        except Exception:
+            logger.debug("Error searching XObject", exc_info=True)
+    return None
+def _search_form(form_xobj: pikepdf.Stream, target_hash: str) -> Image.Image | None:
+    """Search inside a Form XObject for an image with a specific hash."""
+    try:
+        resources = form_xobj.get("/Resources")
+        if resources is None or "/XObject" not in resources:
+            return None
+        for _name, inner_ref in resources["/XObject"].items():
+            inner = inner_ref.resolve() if hasattr(inner_ref, "resolve") else inner_ref
+            if not isinstance(inner, pikepdf.Stream):
+                continue
+            if str(inner.get("/Subtype", "")) == "/Image":
+                result = _try_extract(inner, target_hash)
+                if result is not None:
+                    return result
+    except Exception:
+        logger.debug("Error searching form XObject", exc_info=True)
+    return None
+def _try_extract(xobj: pikepdf.Stream, target_hash: str) -> Image.Image | None:
+    """Check if an XObject matches the target hash and extract as Image."""
+    raw = bytes(xobj.read_raw_bytes())
+    img_hash = hashlib.md5(raw).hexdigest()
+    if img_hash != target_hash:
+        return None
+    return _xobj_to_pil(xobj)
+def _collect_page_images(
+    page: pikepdf.Page,
+    page_num: int,
+    seen: set[str],
+    results: list[tuple[str, int, Image.Image]],
+) -> None:
+    """Collect all images from a page."""
+    if "/Resources" not in page or "/XObject" not in page["/Resources"]:
+        return
+    for _name, xobj_ref in page["/Resources"]["/XObject"].items():
+        try:
+            xobj = xobj_ref.resolve() if hasattr(xobj_ref, "resolve") else xobj_ref
+            if not isinstance(xobj, pikepdf.Stream):
+                continue
+            subtype = str(xobj.get("/Subtype", ""))
+            if subtype == "/Image":
+                _try_collect(xobj, page_num, seen, results)
+            elif subtype == "/Form":
+                _collect_form_images(xobj, page_num, seen, results)
+        except Exception:
+            logger.debug("Error collecting image on page %d", page_num, exc_info=True)
+def _collect_form_images(
+    form_xobj: pikepdf.Stream,
+    page_num: int,
+    seen: set[str],
+    results: list[tuple[str, int, Image.Image]],
+) -> None:
+    """Collect images from inside a Form XObject."""
+    try:
+        resources = form_xobj.get("/Resources")
+        if resources is None or "/XObject" not in resources:
+            return
+        for _name, inner_ref in resources["/XObject"].items():
+            inner = inner_ref.resolve() if hasattr(inner_ref, "resolve") else inner_ref
+            if not isinstance(inner, pikepdf.Stream):
+                continue
+            if str(inner.get("/Subtype", "")) == "/Image":
+                _try_collect(inner, page_num, seen, results)
+    except Exception:
+        logger.debug("Error collecting form images", exc_info=True)
+def _try_collect(
+    xobj: pikepdf.Stream,
+    page_num: int,
+    seen: set[str],
+    results: list[tuple[str, int, Image.Image]],
+) -> None:
+    """Try to extract an image XObject and add it to results."""
+    raw = bytes(xobj.read_raw_bytes())
+    img_hash = hashlib.md5(raw).hexdigest()
+    if img_hash in seen:
+        return
+    seen.add(img_hash)
+    img = _xobj_to_pil(xobj)
+    if img is not None:
+        results.append((img_hash, page_num, img))
+def _xobj_to_pil(xobj: pikepdf.Stream) -> Image.Image | None:
+    """Convert a pikepdf image XObject to a Pillow Image.
+    Always returns an RGB or L (grayscale) image so callers can safely
+    save as PNG without mode errors (e.g. CMYK → RGB).
+    """
+    try:
+        pdfimage = pikepdf.PdfImage(xobj)
+        img = pdfimage.as_pil_image()
+        return _ensure_rgb(img)
+    except Exception:
+        logger.debug("pikepdf.PdfImage extraction failed, trying raw decode", exc_info=True)
+    # Fallback: try to decode raw bytes
+    try:
+        raw = bytes(xobj.read_raw_bytes())
+        w = int(xobj.get("/Width", 0))
+        h = int(xobj.get("/Height", 0))
+        bpc = int(xobj.get("/BitsPerComponent", 8))
+        cs = str(xobj.get("/ColorSpace", ""))
+        if w == 0 or h == 0:
+            return None
+        if "/DeviceRGB" in cs or "/RGB" in cs:
+            mode = "RGB"
+        elif "/DeviceGray" in cs or "/Gray" in cs:
+            mode = "L"
+        else:
+            mode = "RGB"
+        expected_size = w * h * (3 if mode == "RGB" else 1) * (bpc // 8)
+        if len(raw) >= expected_size:
+            img = Image.frombytes(mode, (w, h), raw[:expected_size])
+            return _ensure_rgb(img)
+    except Exception:
+        logger.debug("Raw image decode failed", exc_info=True)
+    return None
+def prepare_for_ai(img: Image.Image, *, max_dim: int = 512) -> bytes:
+    """Resize a PIL image and return PNG bytes ready for an AI provider.
+    Vision models don't need full-resolution PDF images.  Downsizing to
+    *max_dim* pixels on the longest side dramatically reduces payload size
+    and inference time while preserving enough detail for alt-text generation.
+    """
+    # Resize if larger than max_dim on either axis
+    w, h = img.size
+    if max(w, h) > max_dim:
+        scale = max_dim / max(w, h)
+        new_w = max(1, int(w * scale))
+        new_h = max(1, int(h * scale))
+        img = img.resize((new_w, new_h), Image.LANCZOS)
+        logger.debug("Resized image from %dx%d to %dx%d for AI", w, h, new_w, new_h)
+    buf = io.BytesIO()
+    img.save(buf, format="PNG")
+    return buf.getvalue()
+def _ensure_rgb(img: Image.Image) -> Image.Image:
+    """Convert any image mode (CMYK, P, LA, etc.) to RGB for safe PNG export."""
+    if img.mode in ("RGB", "L"):
+        return img
+    if img.mode == "CMYK":
+        return img.convert("RGB")
+    if img.mode in ("RGBA", "LA", "PA"):
+        # Keep alpha by converting to RGBA, which PNG supports
+        return img.convert("RGBA")
+    # Catch-all for any other mode (P, I, F, etc.)
+    return img.convert("RGB")