pdf-transcriber 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26) hide show
  1. pdf_transcriber-1.0.0/.gitignore +91 -0
  2. pdf_transcriber-1.0.0/LICENSE +21 -0
  3. pdf_transcriber-1.0.0/PKG-INFO +401 -0
  4. pdf_transcriber-1.0.0/README.md +364 -0
  5. pdf_transcriber-1.0.0/pyproject.toml +82 -0
  6. pdf_transcriber-1.0.0/src/pdf_transcriber/__init__.py +6 -0
  7. pdf_transcriber-1.0.0/src/pdf_transcriber/cli.py +291 -0
  8. pdf_transcriber-1.0.0/src/pdf_transcriber/config.py +109 -0
  9. pdf_transcriber-1.0.0/src/pdf_transcriber/core/__init__.py +21 -0
  10. pdf_transcriber-1.0.0/src/pdf_transcriber/core/linter/__init__.py +5 -0
  11. pdf_transcriber-1.0.0/src/pdf_transcriber/core/linter/engine.py +184 -0
  12. pdf_transcriber-1.0.0/src/pdf_transcriber/core/linter/models.py +72 -0
  13. pdf_transcriber-1.0.0/src/pdf_transcriber/core/linter/rules/__init__.py +55 -0
  14. pdf_transcriber-1.0.0/src/pdf_transcriber/core/linter/rules/artifacts.py +1030 -0
  15. pdf_transcriber-1.0.0/src/pdf_transcriber/core/linter/rules/markdown.py +191 -0
  16. pdf_transcriber-1.0.0/src/pdf_transcriber/core/linter/rules/math.py +633 -0
  17. pdf_transcriber-1.0.0/src/pdf_transcriber/core/metadata_parser.py +245 -0
  18. pdf_transcriber-1.0.0/src/pdf_transcriber/core/pdf_processor.py +173 -0
  19. pdf_transcriber-1.0.0/src/pdf_transcriber/core/state_manager.py +325 -0
  20. pdf_transcriber-1.0.0/src/pdf_transcriber/core/transcription.py +476 -0
  21. pdf_transcriber-1.0.0/src/pdf_transcriber/server.py +50 -0
  22. pdf_transcriber-1.0.0/src/pdf_transcriber/skills/__init__.py +1 -0
  23. pdf_transcriber-1.0.0/src/pdf_transcriber/skills/transcribe.md +48 -0
  24. pdf_transcriber-1.0.0/src/pdf_transcriber/tools/__init__.py +4 -0
  25. pdf_transcriber-1.0.0/src/pdf_transcriber/tools/lint.py +72 -0
  26. pdf_transcriber-1.0.0/src/pdf_transcriber/tools/transcribe.py +333 -0
@@ -0,0 +1,91 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ *.manifest
31
+ *.spec
32
+
33
+ # Installer logs
34
+ pip-log.txt
35
+ pip-delete-this-directory.txt
36
+
37
+ # Unit test / coverage reports
38
+ htmlcov/
39
+ .tox/
40
+ .nox/
41
+ .coverage
42
+ .coverage.*
43
+ .cache
44
+ nosetests.xml
45
+ coverage.xml
46
+ *.cover
47
+ *.py,cover
48
+ .hypothesis/
49
+ .pytest_cache/
50
+ cover/
51
+
52
+ # Translations
53
+ *.mo
54
+ *.pot
55
+
56
+ # Environments
57
+ .env
58
+ .venv
59
+ env/
60
+ venv/
61
+ ENV/
62
+ env.bak/
63
+ venv.bak/
64
+
65
+ # IDEs
66
+ .idea/
67
+ .vscode/
68
+ *.swp
69
+ *.swo
70
+ *~
71
+
72
+ # mypy
73
+ .mypy_cache/
74
+ .dmypy.json
75
+ dmypy.json
76
+
77
+ # Ruff
78
+ .ruff_cache/
79
+
80
+ # pyright
81
+ pyrightconfig.json
82
+
83
+ # uv
84
+ uv.lock
85
+
86
+ # macOS
87
+ .DS_Store
88
+
89
+ # Project specific
90
+ *.pdf
91
+ .pdf-progress/
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Gus Schmidt
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,401 @@
1
+ Metadata-Version: 2.4
2
+ Name: pdf-transcriber
3
+ Version: 1.0.0
4
+ Summary: Convert math-heavy PDFs to Markdown using Marker OCR with optional LLM enhancement
5
+ Project-URL: Homepage, https://github.com/AugustSchmidt/pdf-transcriber
6
+ Project-URL: Documentation, https://github.com/AugustSchmidt/pdf-transcriber#readme
7
+ Project-URL: Repository, https://github.com/AugustSchmidt/pdf-transcriber
8
+ Project-URL: Issues, https://github.com/AugustSchmidt/pdf-transcriber/issues
9
+ Author: Gus Schmidt
10
+ License-Expression: MIT
11
+ License-File: LICENSE
12
+ Keywords: academic,claude,latex,markdown,marker,math,mcp,ocr,pdf,transcription
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Environment :: Console
15
+ Classifier: Intended Audience :: Developers
16
+ Classifier: Intended Audience :: Science/Research
17
+ Classifier: License :: OSI Approved :: MIT License
18
+ Classifier: Operating System :: OS Independent
19
+ Classifier: Programming Language :: Python :: 3
20
+ Classifier: Programming Language :: Python :: 3.10
21
+ Classifier: Programming Language :: Python :: 3.11
22
+ Classifier: Programming Language :: Python :: 3.12
23
+ Classifier: Topic :: Scientific/Engineering
24
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
25
+ Classifier: Topic :: Utilities
26
+ Requires-Python: >=3.10
27
+ Requires-Dist: marker-pdf>=1.10.1
28
+ Requires-Dist: mcp>=1.2.0
29
+ Requires-Dist: pymupdf>=1.24.0
30
+ Requires-Dist: pyyaml>=6.0
31
+ Provides-Extra: dev
32
+ Requires-Dist: mypy>=1.0; extra == 'dev'
33
+ Requires-Dist: pytest-asyncio>=0.21; extra == 'dev'
34
+ Requires-Dist: pytest>=7.0; extra == 'dev'
35
+ Requires-Dist: ruff>=0.1.0; extra == 'dev'
36
+ Description-Content-Type: text/markdown
37
+
38
+ # PDF Transcriber
39
+
40
+ Convert math-heavy PDFs to Markdown using Marker OCR with optional LLM enhancement.
41
+
42
+ ## Installation
43
+
44
+ ### Recommended (isolated environment)
45
+
46
+ ```bash
47
+ uv tool install pdf-transcriber
48
+ ```
49
+
50
+ ### Alternative (pip)
51
+
52
+ ```bash
53
+ pip install pdf-transcriber
54
+ ```
55
+
56
+ ### Verify installation
57
+
58
+ ```bash
59
+ pdf-transcriber-cli check
60
+ ```
61
+
62
+ ## Three Ways to Use
63
+
64
+ | Interface | Command | Use Case |
65
+ |-----------|---------|----------|
66
+ | **CLI** | `pdf-transcriber-cli transcribe paper.pdf` | Direct terminal usage |
67
+ | **Skill** | `/transcribe paper.pdf` | Claude Code slash command |
68
+ | **MCP Server** | `pdf-transcriber` | Claude Code background integration |
69
+
70
+ ### 1. CLI (Direct Terminal Usage)
71
+
72
+ ```bash
73
+ # Basic transcription
74
+ pdf-transcriber-cli transcribe ~/Downloads/paper.pdf
75
+
76
+ # High quality mode
77
+ pdf-transcriber-cli transcribe ~/Downloads/paper.pdf -q high-quality
78
+
79
+ # Disable LLM (faster, less accurate)
80
+ pdf-transcriber-cli transcribe ~/Downloads/paper.pdf --no-llm
81
+
82
+ # Skip automatic linting
83
+ pdf-transcriber-cli transcribe ~/Downloads/paper.pdf --no-lint
84
+
85
+ # Health check
86
+ pdf-transcriber-cli check
87
+ ```
88
+
89
+ ### 2. Claude Code Skill (Slash Command)
90
+
91
+ ```bash
92
+ # Install the skill
93
+ pdf-transcriber-cli install-skill
94
+
95
+ # Then in Claude Code:
96
+ /transcribe ~/Downloads/paper.pdf
97
+ ```
98
+
99
+ ### 3. MCP Server (Claude Code Integration)
100
+
101
+ > **Note**: This is a standard [MCP (Model Context Protocol)](https://modelcontextprotocol.io) server. While examples show Claude Code configuration, it works with any MCP-compatible agent orchestrator (Cursor, Cline, custom agents, etc.).
102
+
103
+ Add to `~/.claude/settings.json`:
104
+
105
+ ```json
106
+ {
107
+ "mcpServers": {
108
+ "pdf-transcriber": {
109
+ "command": "pdf-transcriber",
110
+ "env": {
111
+ "PDF_TRANSCRIBER_OUTPUT_DIR": "~/Documents/transcriptions"
112
+ }
113
+ }
114
+ }
115
+ }
116
+ ```
117
+
118
+ ## LLM-Enhanced OCR Setup
119
+
120
+ PDF Transcriber can use a local vision LLM (VLM) to significantly improve OCR accuracy, especially for:
121
+ - Complex mathematical notation
122
+ - Handwritten annotations
123
+ - Low-quality scans
124
+ - Tables and figures
125
+
126
+ ### Quick Start
127
+
128
+ 1. **Install Ollama**: https://ollama.ai
129
+ 2. **Pull a vision model**:
130
+ ```bash
131
+ ollama pull qwen2.5vl:3b
132
+ ```
133
+ 3. **Start Ollama**:
134
+ ```bash
135
+ ollama serve
136
+ ```
137
+
138
+ LLM enhancement is **enabled by default**. To disable:
139
+ ```bash
140
+ # CLI
141
+ pdf-transcriber-cli transcribe paper.pdf --no-llm
142
+
143
+ # Environment variable
144
+ PDF_TRANSCRIBER_USE_LLM=false
145
+ ```
146
+
147
+ ### Recommended Vision Models
148
+
149
+ | Model | Size | RAM Required | Quality | Speed | Best For |
150
+ |-------|------|--------------|---------|-------|----------|
151
+ | `qwen2.5vl:3b` | 3.2 GB | 8 GB | Good | Fast | **Default** - laptops, CI |
152
+ | `qwen2.5vl:7b` | 5.5 GB | 16 GB | Better | Medium | Workstations |
153
+ | `qwen3-vl:4b` | 3.5 GB | 10 GB | Best (newest) | Medium | Best quality/size |
154
+
155
+ **Important**: Only **vision models** (VLMs) work. Text-only models like `llama3` won't process images.
156
+
157
+ ### Choosing a Model
158
+
159
+ - **8GB RAM / M1 MacBook**: `qwen2.5vl:3b` (default)
160
+ - **16GB RAM / M2/M3 Pro**: `qwen2.5vl:7b` or `qwen3-vl:4b`
161
+ - **24GB+ / NVIDIA GPU**: `llava:13b` or larger
162
+ - **CI/Automated pipelines**: `qwen2.5vl:3b` or disable LLM (`--no-llm`)
163
+
164
+ To use a different model:
165
+ ```bash
166
+ # Environment variable
167
+ PDF_TRANSCRIBER_OLLAMA_MODEL=qwen3-vl:4b
168
+
169
+ # Make sure the model has been pulled so Ollama can serve it
170
+ ollama pull qwen3-vl:4b
171
+ ```
172
+
173
+ ### Without LLM Enhancement
174
+
175
+ If you don't want to run a local LLM:
176
+ ```bash
177
+ pdf-transcriber-cli transcribe paper.pdf --no-llm
178
+ ```
179
+
180
+ This uses Marker OCR alone, which is still excellent for clean, typed PDFs.
181
+
182
+ ## Configuration
183
+
184
+ All settings can be configured via environment variables:
185
+
186
+ | Variable | Description | Default |
187
+ |----------|-------------|---------|
188
+ | `PDF_TRANSCRIBER_OUTPUT_DIR` | Where transcriptions are saved | `./transcriptions` |
189
+ | `PDF_TRANSCRIBER_QUALITY` | fast, balanced, high-quality | `balanced` |
190
+ | `PDF_TRANSCRIBER_USE_GPU` | Enable GPU acceleration | Auto-detected |
191
+ | `PDF_TRANSCRIBER_USE_LLM` | Enable LLM-enhanced OCR | `true` |
192
+ | `PDF_TRANSCRIBER_OLLAMA_URL` | Ollama server URL | `http://localhost:11434` |
193
+ | `PDF_TRANSCRIBER_OLLAMA_MODEL` | Vision model for OCR | `qwen2.5vl:3b` |
194
+ | `PDF_TRANSCRIBER_CHUNK_SIZE` | Pages per chunk (large PDFs) | `25` |
195
+ | `PDF_TRANSCRIBER_AUTO_CHUNK_THRESHOLD` | Auto-chunk above this page count | `100` |
196
+
197
+ ## CLI Commands
198
+
199
+ | Command | Description |
200
+ |---------|-------------|
201
+ | `transcribe <pdf>` | Transcribe a PDF to Markdown |
202
+ | `check` | Health check (config, paths, Ollama) |
203
+ | `install-skill` | Install Claude Code `/transcribe` skill |
204
+
205
+ ## MCP Tools
206
+
207
+ When running as an MCP server, these tools are available:
208
+
209
+ | Tool | Description |
210
+ |------|-------------|
211
+ | `transcribe_pdf` | Convert PDF to Markdown |
212
+ | `clear_transcription_cache` | Free ~2 GB of memory held by cached OCR models |
213
+ | `lint_paper` | Fix common OCR artifacts |
214
+
215
+ ## MCP Server vs CLI + Skill: When to Use What
216
+
217
+ ### Context Usage Comparison
218
+
219
+ | Approach | Context Overhead | Best For |
220
+ |----------|------------------|----------|
221
+ | **MCP Server** | ~1,200 tokens (3 tools) | Frequent transcription, linting workflows |
222
+ | **CLI + Skill** | ~200 tokens (skill definition only) | Occasional use, context-constrained sessions |
223
+ | **CLI only** | 0 tokens | Automation, CI pipelines |
224
+
225
+ ### Recommendation
226
+
227
+ - **Frequent transcription**: Use **MCP Server** — tools always available
228
+ - **Occasional transcription**: Use **CLI + Skill** — minimal context overhead
229
+ - **CI/CD pipelines**: Use **CLI only** — zero agent orchestrator dependency
230
+
231
+ ## Quality Presets
232
+
233
+ | Preset | DPI | Resolution | Use Case |
234
+ |--------|-----|------------|----------|
235
+ | `fast` | 100 | ~1275×1650px | Quick previews, simple documents |
236
+ | `balanced` | 150 | ~1913×2475px | **Default** - best quality/speed |
237
+ | `high-quality` | 200 | ~2550×3300px | Complex math, small text |
238
+
239
+ ## Linting
240
+
241
+ Each transcription is automatically linted once it finishes, to fix common OCR artifacts. The original (pre-lint) version is saved as `{name}.original.md`.
242
+
243
+ ### Available Lint Rules
244
+
245
+ #### Markdown Structure Rules
246
+
247
+ | Rule | Auto-Fix | Description |
248
+ |------|----------|-------------|
249
+ | `excessive_blank_lines` | ✅ | Collapses runs of more than 2 consecutive blank lines |
250
+ | `trailing_whitespace` | ✅ | Removes spaces/tabs at end of lines |
251
+ | `leading_whitespace` | ✅ | Fixes inconsistent leading whitespace |
252
+ | `header_whitespace` | ✅ | Normalizes spacing around headers |
253
+ | `sparse_table_row` | ⚠️ | Warns about table rows with >50% empty cells |
254
+ | `orphaned_list_marker` | ⚠️ | Warns about list markers with no content |
255
+
256
+ #### PDF Artifact Rules
257
+
258
+ | Rule | Auto-Fix | Description |
259
+ |------|----------|-------------|
260
+ | `page_number` | ✅ | Removes standalone page numbers like "42" |
261
+ | `page_marker` | ✅ | Removes page break markers |
262
+ | `orphaned_label` | ✅ | Removes orphaned LaTeX labels like `def:Tilt` |
263
+ | `hyphenation_artifact` | ✅ | Rejoins words split across lines (`hy-\nphenated`) |
264
+ | `html_artifacts` | ✅ | Converts HTML tags to markdown equivalents |
265
+ | `html_math_notation` | ✅ | Converts `<sup>2</sup>` to `$^2$` in math context |
266
+ | `footnote_spacing` | ✅ | Fixes spacing around footnote markers |
267
+ | `malformed_footnote` | ⚠️ | Warns about malformed footnote references |
268
+ | `garbled_text` | ⚠️ | Warns about corrupted/nonsense text fragments |
269
+ | `repeated_line` | ⚠️ | Warns about likely running headers/footers |
270
+
271
+ #### Math Notation Rules
272
+
273
+ | Rule | Auto-Fix | Description |
274
+ |------|----------|-------------|
275
+ | `unicode_math_symbols` | ✅ | Converts Unicode math (α, →, ∈) to LaTeX (`\alpha`, `\to`, `\in`) |
276
+ | `unwrapped_math_expressions` | ✅ | Wraps bare math expressions in `$...$` |
277
+ | `broken_math_delimiters` | ✅ | Fixes unbalanced `$` delimiters |
278
+ | `space_in_math_variable` | ✅ | Removes spaces in variable names (`x _1` → `x_1`) |
279
+ | `display_math_whitespace` | ✅ | Normalizes whitespace around `$$...$$` blocks |
280
+ | `repetition_hallucination` | ⚠️ | Warns about repeated sequences (OCR hallucination) |
281
+
282
+ ### Running Specific Rules
283
+
284
+ To run only specific lint rules (via MCP or programmatically):
285
+
286
+ ```python
287
+ # Run only math-related rules
288
+ lint_paper(paper_path, rules=["unicode_math_symbols", "broken_math_delimiters"])
289
+
290
+ # Run only whitespace cleanup
291
+ lint_paper(paper_path, rules=["excessive_blank_lines", "trailing_whitespace"])
292
+
293
+ # Preview issues without fixing
294
+ lint_paper(paper_path, fix=False)
295
+ ```
296
+
297
+ ### Customizing for Your Workflow
298
+
299
+ If you're seeing specific patterns in your PDFs, you can run targeted lint passes:
300
+
301
+ **For math-heavy papers:**
302
+ ```python
303
+ lint_paper(path, rules=[
304
+ "unicode_math_symbols",
305
+ "unwrapped_math_expressions",
306
+ "broken_math_delimiters",
307
+ "space_in_math_variable"
308
+ ])
309
+ ```
310
+
311
+ **For scanned books with page numbers:**
312
+ ```python
313
+ lint_paper(path, rules=[
314
+ "page_number",
315
+ "page_marker",
316
+ "repeated_line", # catches running headers
317
+ "hyphenation_artifact"
318
+ ])
319
+ ```
320
+
321
+ **For cleaning up whitespace only:**
322
+ ```python
323
+ lint_paper(path, rules=[
324
+ "excessive_blank_lines",
325
+ "trailing_whitespace",
326
+ "display_math_whitespace"
327
+ ])
328
+ ```
329
+
330
+ ### Adding Custom Lint Rules
331
+
332
+ If you're seeing specific patterns in your PDFs that aren't caught by existing rules, you can add custom rules.
333
+
334
+ Rules are generator functions that take the file content and yield `LintIssue` objects:
335
+
336
+ ```python
337
+ # my_rules.py
338
+ import re
339
+ from pdf_transcriber.core.linter.models import LintIssue, Severity, Fix
340
+
341
+ def my_custom_rule(content: str):
342
+ """
343
+ Detect and fix a specific pattern in your PDFs.
344
+
345
+ Rules are generators that yield LintIssue objects.
346
+ """
347
+ pattern = re.compile(r'PATTERN_TO_MATCH')
348
+
349
+ for match in pattern.finditer(content):
350
+ line_num = content[:match.start()].count('\n') + 1
351
+
352
+ yield LintIssue(
353
+ rule="my_custom_rule",
354
+ severity=Severity.AUTO_FIX, # or WARNING for manual review
355
+ line=line_num,
356
+ message="Description of the issue",
357
+ fix=Fix(
358
+ old=match.group(),
359
+ new="replacement text"
360
+ )
361
+ )
362
+ ```
363
+
364
+ To register your rule, add it to `src/pdf_transcriber/core/linter/rules/__init__.py`:
365
+
366
+ ```python
367
+ # In RULES dict:
368
+ "my_custom_rule": my_module.my_custom_rule,
369
+
370
+ # If auto-fixable, add to DEFAULT_AUTO_FIX:
371
+ DEFAULT_AUTO_FIX.add("my_custom_rule")
372
+ ```
373
+
374
+ **Severity levels:**
375
+ | Level | Use Case |
376
+ |-------|----------|
377
+ | `Severity.AUTO_FIX` | Safe to fix automatically (provide a `Fix`) |
378
+ | `Severity.WARNING` | Needs human review |
379
+ | `Severity.ERROR` | Must be addressed before use |
380
+
381
+ ### Disabling Automatic Linting
382
+
383
+ To skip linting during transcription:
384
+
385
+ ```bash
386
+ # CLI
387
+ pdf-transcriber-cli transcribe paper.pdf --no-lint
388
+
389
+ # MCP tool
390
+ transcribe_pdf(pdf_path, lint=False)
391
+ ```
392
+
393
+ You can then run linting manually later with `lint_paper`, using whichever rules you choose.
394
+
395
+ ## License
396
+
397
+ MIT
398
+
399
+ ## Contributing
400
+
401
+ Issues and PRs welcome at https://github.com/AugustSchmidt/pdf-transcriber