accesspdf 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. accesspdf-1.0.0/.claude/settings.local.json +14 -0
  2. accesspdf-1.0.0/.gitignore +18 -0
  3. accesspdf-1.0.0/LICENSE +15 -0
  4. accesspdf-1.0.0/OldDocs/accesspdf-prd.docx +0 -0
  5. accesspdf-1.0.0/OldDocs/accesspdf-readme.docx +0 -0
  6. accesspdf-1.0.0/OldDocs/accesspdf-roadmap.docx +0 -0
  7. accesspdf-1.0.0/OldDocs/accesspdf-technical-architecture.docx +0 -0
  8. accesspdf-1.0.0/PKG-INFO +228 -0
  9. accesspdf-1.0.0/README.md +134 -0
  10. accesspdf-1.0.0/accesspdf/__init__.py +3 -0
  11. accesspdf-1.0.0/accesspdf/__main__.py +5 -0
  12. accesspdf-1.0.0/accesspdf/alttext/__init__.py +1 -0
  13. accesspdf-1.0.0/accesspdf/alttext/extract.py +235 -0
  14. accesspdf-1.0.0/accesspdf/alttext/injector.py +237 -0
  15. accesspdf-1.0.0/accesspdf/alttext/sidecar.py +169 -0
  16. accesspdf-1.0.0/accesspdf/analyzer.py +514 -0
  17. accesspdf-1.0.0/accesspdf/cli.py +429 -0
  18. accesspdf-1.0.0/accesspdf/config.py +62 -0
  19. accesspdf-1.0.0/accesspdf/models.py +147 -0
  20. accesspdf-1.0.0/accesspdf/pipeline.py +129 -0
  21. accesspdf-1.0.0/accesspdf/processors/__init__.py +16 -0
  22. accesspdf-1.0.0/accesspdf/processors/_pdf_helpers.py +128 -0
  23. accesspdf-1.0.0/accesspdf/processors/_text_extract.py +171 -0
  24. accesspdf-1.0.0/accesspdf/processors/base.py +34 -0
  25. accesspdf-1.0.0/accesspdf/processors/bookmarks.py +159 -0
  26. accesspdf-1.0.0/accesspdf/processors/headings.py +251 -0
  27. accesspdf-1.0.0/accesspdf/processors/links.py +111 -0
  28. accesspdf-1.0.0/accesspdf/processors/metadata.py +151 -0
  29. accesspdf-1.0.0/accesspdf/processors/reading_order.py +117 -0
  30. accesspdf-1.0.0/accesspdf/processors/tables.py +202 -0
  31. accesspdf-1.0.0/accesspdf/processors/tagger.py +196 -0
  32. accesspdf-1.0.0/accesspdf/providers/__init__.py +87 -0
  33. accesspdf-1.0.0/accesspdf/providers/anthropic.py +94 -0
  34. accesspdf-1.0.0/accesspdf/providers/base.py +59 -0
  35. accesspdf-1.0.0/accesspdf/providers/gemini.py +255 -0
  36. accesspdf-1.0.0/accesspdf/providers/noop.py +19 -0
  37. accesspdf-1.0.0/accesspdf/providers/ollama.py +129 -0
  38. accesspdf-1.0.0/accesspdf/providers/openai.py +89 -0
  39. accesspdf-1.0.0/accesspdf/reporter.py +54 -0
  40. accesspdf-1.0.0/accesspdf/review/__init__.py +1 -0
  41. accesspdf-1.0.0/accesspdf/review/app.py +227 -0
  42. accesspdf-1.0.0/accesspdf/review/renderer.py +106 -0
  43. accesspdf-1.0.0/accesspdf/review/widgets.py +180 -0
  44. accesspdf-1.0.0/accesspdf/utils/__init__.py +1 -0
  45. accesspdf-1.0.0/accesspdf/utils/contrast.py +80 -0
  46. accesspdf-1.0.0/accesspdf/web/__init__.py +1 -0
  47. accesspdf-1.0.0/accesspdf/web/app.py +581 -0
  48. accesspdf-1.0.0/accesspdf/web/templates/index.html +1172 -0
  49. accesspdf-1.0.0/accesspdf/writer.py +32 -0
  50. accesspdf-1.0.0/docs/CLAUDE.md +255 -0
  51. accesspdf-1.0.0/docs/pyproject.toml +140 -0
  52. accesspdf-1.0.0/files/accesspdf-prd-v2.docx +0 -0
  53. accesspdf-1.0.0/files/accesspdf-readme-v2.docx +0 -0
  54. accesspdf-1.0.0/files/accesspdf-roadmap-v2.docx +0 -0
  55. accesspdf-1.0.0/files/accesspdf-technical-architecture-v3.docx +0 -0
  56. accesspdf-1.0.0/pyproject.toml +150 -0
  57. accesspdf-1.0.0/tests/__init__.py +0 -0
  58. accesspdf-1.0.0/tests/conftest.py +108 -0
  59. accesspdf-1.0.0/tests/corpus/ambiguous_links.pdf +98 -0
  60. accesspdf-1.0.0/tests/corpus/headings.pdf +74 -0
  61. accesspdf-1.0.0/tests/corpus/images.alttext.yaml +24 -0
  62. accesspdf-1.0.0/tests/corpus/images.pdf +103 -0
  63. accesspdf-1.0.0/tests/corpus/links.pdf +90 -0
  64. accesspdf-1.0.0/tests/corpus/low_contrast.pdf +68 -0
  65. accesspdf-1.0.0/tests/corpus/multicolumn.pdf +68 -0
  66. accesspdf-1.0.0/tests/corpus/scanned.pdf +139 -0
  67. accesspdf-1.0.0/tests/corpus/simple.pdf +68 -0
  68. accesspdf-1.0.0/tests/corpus/tables.pdf +74 -0
  69. accesspdf-1.0.0/tests/fixtures/__init__.py +0 -0
  70. accesspdf-1.0.0/tests/fixtures/generate.py +354 -0
  71. accesspdf-1.0.0/tests/test_analyzer.py +38 -0
  72. accesspdf-1.0.0/tests/test_batch.py +93 -0
  73. accesspdf-1.0.0/tests/test_bookmarks.py +56 -0
  74. accesspdf-1.0.0/tests/test_config.py +52 -0
  75. accesspdf-1.0.0/tests/test_contrast.py +118 -0
  76. accesspdf-1.0.0/tests/test_extract.py +55 -0
  77. accesspdf-1.0.0/tests/test_headings.py +58 -0
  78. accesspdf-1.0.0/tests/test_injector.py +125 -0
  79. accesspdf-1.0.0/tests/test_links.py +45 -0
  80. accesspdf-1.0.0/tests/test_metadata.py +70 -0
  81. accesspdf-1.0.0/tests/test_models.py +102 -0
  82. accesspdf-1.0.0/tests/test_pipeline_integration.py +103 -0
  83. accesspdf-1.0.0/tests/test_providers.py +198 -0
  84. accesspdf-1.0.0/tests/test_review_app.py +110 -0
  85. accesspdf-1.0.0/tests/test_sidecar.py +177 -0
  86. accesspdf-1.0.0/tests/test_tables.py +81 -0
  87. accesspdf-1.0.0/tests/test_tagger.py +78 -0
  88. accesspdf-1.0.0/tests/test_web.py +336 -0
  89. accesspdf-1.0.0/tests/utils/__init__.py +0 -0
  90. accesspdf-1.0.0/tests/utils/validate.py +125 -0
@@ -0,0 +1,14 @@
1
+ {
2
+ "permissions": {
3
+ "allow": [
4
+ "Bash(git init)",
5
+ "Bash(pip install -e \".[dev]\")",
6
+ "Bash(python -m pytest tests/ -v)",
7
+ "Bash(python -m accesspdf --version)",
8
+ "Bash(python -m accesspdf --help)",
9
+ "Bash(git add .gitignore LICENSE README.md pyproject.toml accesspdf/ tests/ docs/ OldDocs/ files/ .claude/)",
10
+ "Bash(xargs grep -l \"contrast\\\\|font.*size\\\\|color\")",
11
+ "Bash(wc -l /c/Users/laure/Documents/AccessPDF/accesspdf/**/*.py)"
12
+ ]
13
+ }
14
+ }
@@ -0,0 +1,18 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *$py.class
4
+ *.egg-info/
5
+ dist/
6
+ build/
7
+ .eggs/
8
+ *.egg
9
+ .mypy_cache/
10
+ .ruff_cache/
11
+ .pytest_cache/
12
+ .coverage
13
+ htmlcov/
14
+ *.so
15
+ .env
16
+ .venv/
17
+ env/
18
+ venv/
@@ -0,0 +1,15 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ Licensed under the Apache License, Version 2.0 (the "License");
6
+ you may not use this file except in compliance with the License.
7
+ You may obtain a copy of the License at
8
+
9
+ http://www.apache.org/licenses/LICENSE-2.0
10
+
11
+ Unless required by applicable law or agreed to in writing, software
12
+ distributed under the License is distributed on an "AS IS" BASIS,
13
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ See the License for the specific language governing permissions and
15
+ limitations under the License.
@@ -0,0 +1,228 @@
1
+ Metadata-Version: 2.4
2
+ Name: accesspdf
3
+ Version: 1.0.0
4
+ Summary: AI-powered PDF accessibility remediation, at scale
5
+ Project-URL: Homepage, https://github.com/laurenaulet/accesspdf
6
+ Project-URL: Repository, https://github.com/laurenaulet/accesspdf
7
+ Project-URL: Issues, https://github.com/laurenaulet/accesspdf/issues
8
+ Project-URL: Changelog, https://github.com/laurenaulet/accesspdf/CHANGELOG.md
9
+ License: Apache License
10
+ Version 2.0, January 2004
11
+ http://www.apache.org/licenses/
12
+
13
+ Licensed under the Apache License, Version 2.0 (the "License");
14
+ you may not use this file except in compliance with the License.
15
+ You may obtain a copy of the License at
16
+
17
+ http://www.apache.org/licenses/LICENSE-2.0
18
+
19
+ Unless required by applicable law or agreed to in writing, software
20
+ distributed under the License is distributed on an "AS IS" BASIS,
21
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22
+ See the License for the specific language governing permissions and
23
+ limitations under the License.
24
+ License-File: LICENSE
25
+ Keywords: a11y,accessibility,alt-text,pdf,wcag
26
+ Classifier: Development Status :: 4 - Beta
27
+ Classifier: Environment :: Console
28
+ Classifier: Intended Audience :: Developers
29
+ Classifier: Intended Audience :: Education
30
+ Classifier: Intended Audience :: Science/Research
31
+ Classifier: License :: OSI Approved :: Apache Software License
32
+ Classifier: Programming Language :: Python :: 3
33
+ Classifier: Programming Language :: Python :: 3.10
34
+ Classifier: Programming Language :: Python :: 3.11
35
+ Classifier: Programming Language :: Python :: 3.12
36
+ Classifier: Topic :: Text Processing :: Markup
37
+ Classifier: Topic :: Utilities
38
+ Requires-Python: >=3.10
39
+ Requires-Dist: httpx>=0.27
40
+ Requires-Dist: jinja2>=3.1
41
+ Requires-Dist: langdetect>=1.0.9
42
+ Requires-Dist: pdfminer-six>=20221105
43
+ Requires-Dist: pikepdf<10.0,>=8.0
44
+ Requires-Dist: pillow>=10.0
45
+ Requires-Dist: pydantic>=2.0
46
+ Requires-Dist: pyyaml>=6.0
47
+ Requires-Dist: rich>=13.0
48
+ Requires-Dist: textual>=0.50
49
+ Requires-Dist: typer>=0.12
50
+ Provides-Extra: all
51
+ Requires-Dist: anthropic>=0.25; extra == 'all'
52
+ Requires-Dist: fastapi>=0.110; extra == 'all'
53
+ Requires-Dist: httpx>=0.27; extra == 'all'
54
+ Requires-Dist: mypy>=1.9; extra == 'all'
55
+ Requires-Dist: openai>=1.25; extra == 'all'
56
+ Requires-Dist: pytest-asyncio>=0.23; extra == 'all'
57
+ Requires-Dist: pytest-cov>=5.0; extra == 'all'
58
+ Requires-Dist: pytest-vcr>=1.0; extra == 'all'
59
+ Requires-Dist: pytest>=8.0; extra == 'all'
60
+ Requires-Dist: python-multipart>=0.0.9; extra == 'all'
61
+ Requires-Dist: reportlab>=4.0; extra == 'all'
62
+ Requires-Dist: ruff>=0.4; extra == 'all'
63
+ Requires-Dist: types-pillow; extra == 'all'
64
+ Requires-Dist: types-pyyaml; extra == 'all'
65
+ Requires-Dist: uvicorn[standard]>=0.29; extra == 'all'
66
+ Requires-Dist: vcrpy>=6.0; extra == 'all'
67
+ Provides-Extra: all-providers
68
+ Requires-Dist: anthropic>=0.25; extra == 'all-providers'
69
+ Requires-Dist: openai>=1.25; extra == 'all-providers'
70
+ Provides-Extra: anthropic
71
+ Requires-Dist: anthropic>=0.25; extra == 'anthropic'
72
+ Provides-Extra: dev
73
+ Requires-Dist: fastapi>=0.110; extra == 'dev'
74
+ Requires-Dist: httpx>=0.27; extra == 'dev'
75
+ Requires-Dist: mypy>=1.9; extra == 'dev'
76
+ Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
77
+ Requires-Dist: pytest-cov>=5.0; extra == 'dev'
78
+ Requires-Dist: pytest-vcr>=1.0; extra == 'dev'
79
+ Requires-Dist: pytest>=8.0; extra == 'dev'
80
+ Requires-Dist: python-multipart>=0.0.9; extra == 'dev'
81
+ Requires-Dist: reportlab>=4.0; extra == 'dev'
82
+ Requires-Dist: ruff>=0.4; extra == 'dev'
83
+ Requires-Dist: types-pillow; extra == 'dev'
84
+ Requires-Dist: types-pyyaml; extra == 'dev'
85
+ Requires-Dist: uvicorn[standard]>=0.29; extra == 'dev'
86
+ Requires-Dist: vcrpy>=6.0; extra == 'dev'
87
+ Provides-Extra: openai
88
+ Requires-Dist: openai>=1.25; extra == 'openai'
89
+ Provides-Extra: web
90
+ Requires-Dist: fastapi>=0.110; extra == 'web'
91
+ Requires-Dist: python-multipart>=0.0.9; extra == 'web'
92
+ Requires-Dist: uvicorn[standard]>=0.29; extra == 'web'
93
+ Description-Content-Type: text/markdown
94
+
95
+ # AccessPDF
96
+
97
+ Make PDFs accessible. Fixes structure, reading order, tables, and headings automatically -- then helps you add image descriptions with local AI or by hand.
98
+
99
+ Targets **WCAG 2.1 AA** and **PDF/UA**.
100
+
101
+ ## Quick start
102
+
103
+ ```bash
104
+ pip install "accesspdf[web]"
105
+ accesspdf serve
106
+ ```
107
+
108
+ This opens a browser UI at `http://localhost:8080`. Upload a PDF, get an accessibility report, download the fixed version. If your PDF has images, you can write alt text right in the browser -- or let AI do a first draft.
109
+
110
+ For AI-generated alt text, we recommend **[Ollama](https://ollama.com)** -- it's free, runs locally, and needs no API key. Install it, then:
111
+
112
+ ```bash
113
+ ollama pull llava
114
+ ```
115
+
116
+ That's it. Select "Ollama" in the web UI and click generate.
117
+
118
+ ---
119
+
120
+ ## How it works
121
+
122
+ AccessPDF does two things:
123
+
124
+ 1. **Fixes structure automatically** -- tags, language, reading order, headings, tables, links, bookmarks
125
+ 2. **Helps you add image descriptions** -- the one part that needs a human (or AI + human review)
126
+
127
+ Your original PDF is never modified. Output always goes to a new file.
128
+
129
+ ## CLI workflow
130
+
131
+ If you prefer the command line over the web UI:
132
+
133
+ ```bash
134
+ # 1. See what's wrong (read-only, never touches your file)
135
+ accesspdf check my-document.pdf
136
+
137
+ # 2. Fix structural issues
138
+ accesspdf fix my-document.pdf -o my-document_accessible.pdf
139
+
140
+ # 3. Generate AI alt text drafts (optional)
141
+ accesspdf generate-alt-text my-document_accessible.pdf
142
+
143
+ # 4. Review and approve the drafts
144
+ accesspdf review my-document_accessible.pdf
145
+
146
+ # 5. Re-run fix to inject approved descriptions
147
+ accesspdf fix my-document.pdf -o my-document_accessible.pdf --alt-text my-document.alttext.yaml
148
+ ```
149
+
150
+ ## AI alt text providers
151
+
152
+ AccessPDF uses AI vision models to draft image descriptions. You always review before anything gets injected.
153
+
154
+ | Provider | Setup | API key? | Cost |
155
+ |----------|-------|----------|------|
156
+ | **Ollama** (recommended) | [Install Ollama](https://ollama.com), `ollama pull llava` | No | Free (local) |
157
+ | Google Gemini | None | `GOOGLE_API_KEY` | Free tier |
158
+ | Anthropic (Claude) | `pip install "accesspdf[anthropic]"` | `ANTHROPIC_API_KEY` | Paid |
159
+ | OpenAI (GPT-4) | `pip install "accesspdf[openai]"` | `OPENAI_API_KEY` | Paid |
160
+
161
+ **Ollama is the easiest** -- no API key, no account, nothing leaves your machine. Just install it and pull a model.
162
+
163
+ For cloud providers, set your key as an environment variable or pass it directly:
164
+
165
+ ```bash
166
+ accesspdf generate-alt-text my-document.pdf --provider gemini --api-key AIza...
167
+ ```
168
+
169
+ In the web UI, you can paste your API key in the settings panel -- it's sent per-request and never saved to disk.
170
+
171
+ ## Batch processing
172
+
173
+ Fix every PDF in a folder:
174
+
175
+ ```bash
176
+ accesspdf batch ./papers/ -o ./papers/accessible/
177
+ accesspdf batch ./papers/ -o ./papers/accessible/ -r # include subdirectories
178
+ ```
179
+
180
+ ## The sidecar file
181
+
182
+ Image descriptions live in a `.alttext.yaml` file next to your PDF:
183
+
184
+ ```yaml
185
+ images:
186
+ - id: img_37044c
187
+ page: 1
188
+ ai_draft: 'Bar chart showing quarterly revenue from 2023-2025.'
189
+ alt_text: 'Bar chart showing quarterly revenue. Q1 2025 is highest at $4.2M.'
190
+ status: approved
191
+ ```
192
+
193
+ Statuses: **needs_review** (not yet described), **approved** (gets injected), **decorative** (screen readers skip it). You can edit this file by hand.
194
+
195
+ ## CLI reference
196
+
197
+ ```
198
+ accesspdf check <pdf> # Analyze accessibility (read-only)
199
+ accesspdf fix <pdf> -o <output> # Fix structure + inject alt text
200
+ accesspdf fix <pdf> --alt-text <yaml> # Fix with sidecar descriptions
201
+ accesspdf batch <dir> -o <outdir> # Fix all PDFs in a directory
202
+ accesspdf review <pdf> # Terminal UI for alt text
203
+ accesspdf serve # Web UI at localhost:8080
204
+ accesspdf generate-alt-text <pdf> # AI drafts (Ollama default)
205
+ accesspdf providers # Show available AI providers
206
+ ```
207
+
208
+ ## Installation options
209
+
210
+ ```bash
211
+ pip install accesspdf # CLI only
212
+ pip install "accesspdf[web]" # CLI + browser UI (recommended)
213
+ pip install "accesspdf[anthropic]" # Add Claude provider
214
+ pip install "accesspdf[openai]" # Add GPT-4 provider
215
+ ```
216
+
217
+ ## Contributing
218
+
219
+ ```bash
220
+ git clone https://github.com/laurenaulet/accesspdf.git
221
+ cd accesspdf
222
+ pip install -e ".[dev]"
223
+ python -m pytest tests/ -v
224
+ ```
225
+
226
+ ## License
227
+
228
+ Apache 2.0
@@ -0,0 +1,134 @@
1
+ # AccessPDF
2
+
3
+ Make PDFs accessible. Fixes structure, reading order, tables, and headings automatically -- then helps you add image descriptions with local AI or by hand.
4
+
5
+ Targets **WCAG 2.1 AA** and **PDF/UA**.
6
+
7
+ ## Quick start
8
+
9
+ ```bash
10
+ pip install "accesspdf[web]"
11
+ accesspdf serve
12
+ ```
13
+
14
+ This opens a browser UI at `http://localhost:8080`. Upload a PDF, get an accessibility report, download the fixed version. If your PDF has images, you can write alt text right in the browser -- or let AI do a first draft.
15
+
16
+ For AI-generated alt text, we recommend **[Ollama](https://ollama.com)** -- it's free, runs locally, and needs no API key. Install it, then:
17
+
18
+ ```bash
19
+ ollama pull llava
20
+ ```
21
+
22
+ That's it. Select "Ollama" in the web UI and click generate.
23
+
24
+ ---
25
+
26
+ ## How it works
27
+
28
+ AccessPDF does two things:
29
+
30
+ 1. **Fixes structure automatically** -- tags, language, reading order, headings, tables, links, bookmarks
31
+ 2. **Helps you add image descriptions** -- the one part that needs a human (or AI + human review)
32
+
33
+ Your original PDF is never modified. Output always goes to a new file.
34
+
35
+ ## CLI workflow
36
+
37
+ If you prefer the command line over the web UI:
38
+
39
+ ```bash
40
+ # 1. See what's wrong (read-only, never touches your file)
41
+ accesspdf check my-document.pdf
42
+
43
+ # 2. Fix structural issues
44
+ accesspdf fix my-document.pdf -o my-document_accessible.pdf
45
+
46
+ # 3. Generate AI alt text drafts (optional)
47
+ accesspdf generate-alt-text my-document_accessible.pdf
48
+
49
+ # 4. Review and approve the drafts
50
+ accesspdf review my-document_accessible.pdf
51
+
52
+ # 5. Re-run fix to inject approved descriptions
53
+ accesspdf fix my-document.pdf -o my-document_accessible.pdf --alt-text my-document.alttext.yaml
54
+ ```
55
+
56
+ ## AI alt text providers
57
+
58
+ AccessPDF uses AI vision models to draft image descriptions. You always review before anything gets injected.
59
+
60
+ | Provider | Setup | API key? | Cost |
61
+ |----------|-------|----------|------|
62
+ | **Ollama** (recommended) | [Install Ollama](https://ollama.com), `ollama pull llava` | No | Free (local) |
63
+ | Google Gemini | None | `GOOGLE_API_KEY` | Free tier |
64
+ | Anthropic (Claude) | `pip install "accesspdf[anthropic]"` | `ANTHROPIC_API_KEY` | Paid |
65
+ | OpenAI (GPT-4) | `pip install "accesspdf[openai]"` | `OPENAI_API_KEY` | Paid |
66
+
67
+ **Ollama is the easiest** -- no API key, no account, nothing leaves your machine. Just install it and pull a model.
68
+
69
+ For cloud providers, set your key as an environment variable or pass it directly:
70
+
71
+ ```bash
72
+ accesspdf generate-alt-text my-document.pdf --provider gemini --api-key AIza...
73
+ ```
74
+
75
+ In the web UI, you can paste your API key in the settings panel -- it's sent per-request and never saved to disk.
76
+
77
+ ## Batch processing
78
+
79
+ Fix every PDF in a folder:
80
+
81
+ ```bash
82
+ accesspdf batch ./papers/ -o ./papers/accessible/
83
+ accesspdf batch ./papers/ -o ./papers/accessible/ -r # include subdirectories
84
+ ```
85
+
86
+ ## The sidecar file
87
+
88
+ Image descriptions live in a `.alttext.yaml` file next to your PDF:
89
+
90
+ ```yaml
91
+ images:
92
+ - id: img_37044c
93
+ page: 1
94
+ ai_draft: 'Bar chart showing quarterly revenue from 2023-2025.'
95
+ alt_text: 'Bar chart showing quarterly revenue. Q1 2025 is highest at $4.2M.'
96
+ status: approved
97
+ ```
98
+
99
+ Statuses: **needs_review** (not yet described), **approved** (gets injected), **decorative** (screen readers skip it). You can edit this file by hand.
100
+
101
+ ## CLI reference
102
+
103
+ ```
104
+ accesspdf check <pdf> # Analyze accessibility (read-only)
105
+ accesspdf fix <pdf> -o <output> # Fix structure + inject alt text
106
+ accesspdf fix <pdf> --alt-text <yaml> # Fix with sidecar descriptions
107
+ accesspdf batch <dir> -o <outdir> # Fix all PDFs in a directory
108
+ accesspdf review <pdf> # Terminal UI for alt text
109
+ accesspdf serve # Web UI at localhost:8080
110
+ accesspdf generate-alt-text <pdf> # AI drafts (Ollama default)
111
+ accesspdf providers # Show available AI providers
112
+ ```
113
+
114
+ ## Installation options
115
+
116
+ ```bash
117
+ pip install accesspdf # CLI only
118
+ pip install "accesspdf[web]" # CLI + browser UI (recommended)
119
+ pip install "accesspdf[anthropic]" # Add Claude provider
120
+ pip install "accesspdf[openai]" # Add GPT-4 provider
121
+ ```
122
+
123
+ ## Contributing
124
+
125
+ ```bash
126
+ git clone https://github.com/laurenaulet/accesspdf.git
127
+ cd accesspdf
128
+ pip install -e ".[dev]"
129
+ python -m pytest tests/ -v
130
+ ```
131
+
132
+ ## License
133
+
134
+ Apache 2.0
@@ -0,0 +1,3 @@
1
+ """AccessPDF — PDF accessibility remediation tool."""
2
+
3
+ __version__ = "1.0.0"
@@ -0,0 +1,5 @@
1
+ """Allow running as ``python -m accesspdf``."""
2
+
3
+ from accesspdf.cli import app
4
+
5
+ app()
@@ -0,0 +1 @@
1
+ """Alt text management — sidecar files, injection, caching."""
@@ -0,0 +1,235 @@
1
+ """Extract images from PDFs as Pillow Image objects."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import hashlib
6
+ import io
7
+ import logging
8
+ from pathlib import Path
9
+
10
+ import pikepdf
11
+ from PIL import Image
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
def extract_image(pdf_path: Path, image_hash: str) -> Image.Image | None:
    """Find a single image in a PDF by its md5 hash.

    Pages are scanned in document order and the first page that yields a
    match ends the search.

    Returns:
        The matching image as a Pillow ``Image``, or ``None`` when no page
        contains an image with that hash.
    """
    with pikepdf.open(pdf_path) as pdf:
        # The generator is lazy, so pages after the first hit are never searched.
        per_page = (_search_page(page, image_hash) for page in pdf.pages)
        return next((found for found in per_page if found is not None), None)
27
+
28
+
29
def extract_all_images(pdf_path: Path) -> list[tuple[str, int, Image.Image]]:
    """Gather the distinct images found in a PDF.

    Returns:
        A list of ``(hash, page_number, image)`` tuples, with page numbers
        starting at 1.  An image whose hash was already encountered is not
        reported a second time.
    """
    collected: list[tuple[str, int, Image.Image]] = []
    known_hashes: set[str] = set()

    with pikepdf.open(pdf_path) as pdf:
        for zero_based, page in enumerate(pdf.pages):
            # Page numbers reported to callers are 1-based.
            _collect_page_images(page, zero_based + 1, known_hashes, collected)

    return collected
43
+
44
+
45
+ def _search_page(page: pikepdf.Page, target_hash: str) -> Image.Image | None:
46
+ """Search a page for an image with a specific hash."""
47
+ if "/Resources" not in page or "/XObject" not in page["/Resources"]:
48
+ return None
49
+
50
+ for _name, xobj_ref in page["/Resources"]["/XObject"].items():
51
+ try:
52
+ xobj = xobj_ref.resolve() if hasattr(xobj_ref, "resolve") else xobj_ref
53
+ if not isinstance(xobj, pikepdf.Stream):
54
+ continue
55
+
56
+ subtype = str(xobj.get("/Subtype", ""))
57
+ if subtype == "/Image":
58
+ result = _try_extract(xobj, target_hash)
59
+ if result is not None:
60
+ return result
61
+ elif subtype == "/Form":
62
+ result = _search_form(xobj, target_hash)
63
+ if result is not None:
64
+ return result
65
+ except Exception:
66
+ logger.debug("Error searching XObject", exc_info=True)
67
+
68
+ return None
69
+
70
+
71
+ def _search_form(form_xobj: pikepdf.Stream, target_hash: str) -> Image.Image | None:
72
+ """Search inside a Form XObject for an image with a specific hash."""
73
+ try:
74
+ resources = form_xobj.get("/Resources")
75
+ if resources is None or "/XObject" not in resources:
76
+ return None
77
+ for _name, inner_ref in resources["/XObject"].items():
78
+ inner = inner_ref.resolve() if hasattr(inner_ref, "resolve") else inner_ref
79
+ if not isinstance(inner, pikepdf.Stream):
80
+ continue
81
+ if str(inner.get("/Subtype", "")) == "/Image":
82
+ result = _try_extract(inner, target_hash)
83
+ if result is not None:
84
+ return result
85
+ except Exception:
86
+ logger.debug("Error searching form XObject", exc_info=True)
87
+ return None
88
+
89
+
90
+ def _try_extract(xobj: pikepdf.Stream, target_hash: str) -> Image.Image | None:
91
+ """Check if an XObject matches the target hash and extract as Image."""
92
+ raw = bytes(xobj.read_raw_bytes())
93
+ img_hash = hashlib.md5(raw).hexdigest()
94
+ if img_hash != target_hash:
95
+ return None
96
+ return _xobj_to_pil(xobj)
97
+
98
+
99
+ def _collect_page_images(
100
+ page: pikepdf.Page,
101
+ page_num: int,
102
+ seen: set[str],
103
+ results: list[tuple[str, int, Image.Image]],
104
+ ) -> None:
105
+ """Collect all images from a page."""
106
+ if "/Resources" not in page or "/XObject" not in page["/Resources"]:
107
+ return
108
+
109
+ for _name, xobj_ref in page["/Resources"]["/XObject"].items():
110
+ try:
111
+ xobj = xobj_ref.resolve() if hasattr(xobj_ref, "resolve") else xobj_ref
112
+ if not isinstance(xobj, pikepdf.Stream):
113
+ continue
114
+
115
+ subtype = str(xobj.get("/Subtype", ""))
116
+ if subtype == "/Image":
117
+ _try_collect(xobj, page_num, seen, results)
118
+ elif subtype == "/Form":
119
+ _collect_form_images(xobj, page_num, seen, results)
120
+ except Exception:
121
+ logger.debug("Error collecting image on page %d", page_num, exc_info=True)
122
+
123
+
124
+ def _collect_form_images(
125
+ form_xobj: pikepdf.Stream,
126
+ page_num: int,
127
+ seen: set[str],
128
+ results: list[tuple[str, int, Image.Image]],
129
+ ) -> None:
130
+ """Collect images from inside a Form XObject."""
131
+ try:
132
+ resources = form_xobj.get("/Resources")
133
+ if resources is None or "/XObject" not in resources:
134
+ return
135
+ for _name, inner_ref in resources["/XObject"].items():
136
+ inner = inner_ref.resolve() if hasattr(inner_ref, "resolve") else inner_ref
137
+ if not isinstance(inner, pikepdf.Stream):
138
+ continue
139
+ if str(inner.get("/Subtype", "")) == "/Image":
140
+ _try_collect(inner, page_num, seen, results)
141
+ except Exception:
142
+ logger.debug("Error collecting form images", exc_info=True)
143
+
144
+
145
+ def _try_collect(
146
+ xobj: pikepdf.Stream,
147
+ page_num: int,
148
+ seen: set[str],
149
+ results: list[tuple[str, int, Image.Image]],
150
+ ) -> None:
151
+ """Try to extract an image XObject and add it to results."""
152
+ raw = bytes(xobj.read_raw_bytes())
153
+ img_hash = hashlib.md5(raw).hexdigest()
154
+ if img_hash in seen:
155
+ return
156
+ seen.add(img_hash)
157
+
158
+ img = _xobj_to_pil(xobj)
159
+ if img is not None:
160
+ results.append((img_hash, page_num, img))
161
+
162
+
163
def _xobj_to_pil(xobj: pikepdf.Stream) -> Image.Image | None:
    """Convert a pikepdf image XObject to a Pillow Image.

    The primary path delegates to ``pikepdf.PdfImage``.  If that raises, a
    best-effort fallback interprets the decoded stream as packed 8-bit
    samples, RGB unless the colour space names gray.

    Returns:
        The image after ``_ensure_rgb`` normalisation (mode RGB, L, or RGBA
        when the source carried alpha), or ``None`` when neither path could
        produce an image.
    """
    try:
        pdfimage = pikepdf.PdfImage(xobj)
        return _ensure_rgb(pdfimage.as_pil_image())
    except Exception:
        logger.debug("pikepdf.PdfImage extraction failed, trying raw decode", exc_info=True)

    # Fallback: interpret the stream as uncompressed sample data.
    try:
        w = int(xobj.get("/Width", 0))
        h = int(xobj.get("/Height", 0))
        bpc = int(xobj.get("/BitsPerComponent", 8))
        cs = str(xobj.get("/ColorSpace", ""))

        if w <= 0 or h <= 0:
            return None
        if bpc != 8:
            # Image.frombytes below assumes exactly one byte per sample;
            # other bit depths would be misread rather than decoded.
            return None

        if "/DeviceRGB" in cs or "/RGB" in cs:
            mode = "RGB"
        elif "/DeviceGray" in cs or "/Gray" in cs:
            mode = "L"
        else:
            # Unknown colour space: keep the original best-effort RGB guess.
            mode = "RGB"

        # Sample data is only meaningful after the stream filters have been
        # applied; read_raw_bytes() would hand back the still-encoded bytes.
        data = bytes(xobj.read_bytes())
        expected_size = w * h * (3 if mode == "RGB" else 1)
        if len(data) >= expected_size:
            img = Image.frombytes(mode, (w, h), data[:expected_size])
            return _ensure_rgb(img)
    except Exception:
        logger.debug("Raw image decode failed", exc_info=True)

    return None
202
+
203
+
204
def prepare_for_ai(img: Image.Image, *, max_dim: int = 512) -> bytes:
    """Encode *img* as PNG bytes, shrinking it first when it is oversized.

    When the longer side exceeds *max_dim*, both sides are scaled by the
    same factor (Lanczos resampling, each side at least 1 pixel) so the
    image fits within *max_dim*.  Smaller images are encoded unchanged.
    Vision models do not need full-resolution PDF images, so this keeps the
    payload small.

    Returns:
        The PNG-encoded image data.
    """
    orig_w, orig_h = img.size
    longest = max(orig_w, orig_h)
    if longest > max_dim:
        factor = max_dim / longest
        target_w = max(1, int(orig_w * factor))
        target_h = max(1, int(orig_h * factor))
        img = img.resize((target_w, target_h), Image.LANCZOS)
        logger.debug(
            "Resized image from %dx%d to %dx%d for AI", orig_w, orig_h, target_w, target_h
        )

    with io.BytesIO() as sink:
        img.save(sink, format="PNG")
        return sink.getvalue()
223
+
224
+
225
+ def _ensure_rgb(img: Image.Image) -> Image.Image:
226
+ """Convert any image mode (CMYK, P, LA, etc.) to RGB for safe PNG export."""
227
+ if img.mode in ("RGB", "L"):
228
+ return img
229
+ if img.mode == "CMYK":
230
+ return img.convert("RGB")
231
+ if img.mode in ("RGBA", "LA", "PA"):
232
+ # Keep alpha by converting to RGBA, which PNG supports
233
+ return img.convert("RGBA")
234
+ # Catch-all for any other mode (P, I, F, etc.)
235
+ return img.convert("RGB")