socr 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. socr-1.0.0/.claude/settings.local.json +14 -0
  2. socr-1.0.0/.gitignore +54 -0
  3. socr-1.0.0/LICENSE +21 -0
  4. socr-1.0.0/PKG-INFO +194 -0
  5. socr-1.0.0/README.md +140 -0
  6. socr-1.0.0/docs/ARCHITECTURE.md +35 -0
  7. socr-1.0.0/examples/bernanke_kuttner_2005/bernanke_kuttner_2005.md +828 -0
  8. socr-1.0.0/examples/bernanke_kuttner_2005/figures/figure_1_page7.png +0 -0
  9. socr-1.0.0/examples/bernanke_kuttner_2005/figures/figure_2_page8.png +0 -0
  10. socr-1.0.0/examples/bernanke_kuttner_2005/figures/figure_3_page16.png +0 -0
  11. socr-1.0.0/examples/bernanke_kuttner_2005/figures/figure_4_page22.png +0 -0
  12. socr-1.0.0/examples/bernanke_kuttner_2005/figures/figure_5_page24.png +0 -0
  13. socr-1.0.0/examples/bernanke_kuttner_2005/figures/figure_6_page30.png +0 -0
  14. socr-1.0.0/examples/bernanke_kuttner_2005/metadata.json +26 -0
  15. socr-1.0.0/examples/kuttner_2001/figures/figure_1_page1.png +0 -0
  16. socr-1.0.0/examples/kuttner_2001/figures/figure_2_page8.png +0 -0
  17. socr-1.0.0/examples/kuttner_2001/figures/figure_3_page12.png +0 -0
  18. socr-1.0.0/examples/kuttner_2001/kuttner_2001.md +531 -0
  19. socr-1.0.0/examples/kuttner_2001/metadata.json +27 -0
  20. socr-1.0.0/examples/sutskever_2014/metadata.json +26 -0
  21. socr-1.0.0/examples/sutskever_2014/sutskever_2014.md +203 -0
  22. socr-1.0.0/poetry.lock +5564 -0
  23. socr-1.0.0/pyproject.toml +72 -0
  24. socr-1.0.0/src/socr/__init__.py +14 -0
  25. socr-1.0.0/src/socr/audit/__init__.py +6 -0
  26. socr-1.0.0/src/socr/audit/heuristics.py +300 -0
  27. socr-1.0.0/src/socr/audit/llm_audit.py +182 -0
  28. socr-1.0.0/src/socr/cli.py +209 -0
  29. socr-1.0.0/src/socr/core/__init__.py +25 -0
  30. socr-1.0.0/src/socr/core/config.py +185 -0
  31. socr-1.0.0/src/socr/core/document.py +90 -0
  32. socr-1.0.0/src/socr/core/metadata.py +93 -0
  33. socr-1.0.0/src/socr/core/result.py +97 -0
  34. socr-1.0.0/src/socr/engines/__init__.py +18 -0
  35. socr-1.0.0/src/socr/engines/base.py +260 -0
  36. socr-1.0.0/src/socr/engines/deepseek.py +95 -0
  37. socr-1.0.0/src/socr/engines/deepseek_vllm.py +250 -0
  38. socr-1.0.0/src/socr/engines/gemini.py +45 -0
  39. socr-1.0.0/src/socr/engines/marker.py +38 -0
  40. socr-1.0.0/src/socr/engines/mistral.py +42 -0
  41. socr-1.0.0/src/socr/engines/nougat.py +38 -0
  42. socr-1.0.0/src/socr/engines/registry.py +25 -0
  43. socr-1.0.0/src/socr/engines/vllm.py +196 -0
  44. socr-1.0.0/src/socr/engines/vllm_manager.py +292 -0
  45. socr-1.0.0/src/socr/figures/__init__.py +5 -0
  46. socr-1.0.0/src/socr/figures/extractor.py +342 -0
  47. socr-1.0.0/src/socr/pipeline/__init__.py +5 -0
  48. socr-1.0.0/src/socr/pipeline/hpc_pipeline.py +479 -0
  49. socr-1.0.0/src/socr/pipeline/processor.py +242 -0
  50. socr-1.0.0/src/socr/pipeline/reconciler.py +332 -0
  51. socr-1.0.0/src/socr/pipeline/router.py +104 -0
  52. socr-1.0.0/src/socr/ui/__init__.py +7 -0
  53. socr-1.0.0/src/socr/ui/console.py +148 -0
  54. socr-1.0.0/src/socr/ui/panels.py +203 -0
  55. socr-1.0.0/src/socr/ui/progress.py +227 -0
  56. socr-1.0.0/src/socr/ui/theme.py +89 -0
  57. socr-1.0.0/tests/test_audit_heuristics.py +31 -0
  58. socr-1.0.0/tests/test_figure_pass.py +42 -0
  59. socr-1.0.0/tests/test_pipeline_routing.py +66 -0
@@ -0,0 +1,14 @@
1
+ {
2
+ "permissions": {
3
+ "allow": [
4
+ "Bash(git remote set-url origin https://github.com/r§uben/socr.git)",
5
+ "Bash(cd:*)",
6
+ "Bash(gh repo edit r-uben/socr --add-topic ocr --add-topic pdf --add-topic document-processing --add-topic nougat --add-topic gemini --add-topic deepseek)",
7
+ "Bash(git remote set-url origin https://github.com/r-uben/socr.git)"
8
+ ]
9
+ },
10
+ "enabledMcpjsonServers": [
11
+ "gemini-cli"
12
+ ],
13
+ "enableAllProjectMcpServers": true
14
+ }
socr-1.0.0/.gitignore ADDED
@@ -0,0 +1,54 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Virtual environments
24
+ .venv/
25
+ venv/
26
+ ENV/
27
+
28
+ # IDE
29
+ .idea/
30
+ .vscode/
31
+ *.swp
32
+ *.swo
33
+ .DS_Store
34
+
35
+ # Testing
36
+ .pytest_cache/
37
+ .coverage
38
+ htmlcov/
39
+ .tox/
40
+ .nox/
41
+
42
+ # Output (user-generated, not examples)
43
+ output/
44
+ *.md
45
+ !README.md
46
+ !docs/*.md
47
+ !examples/**/*.md
48
+
49
+ # Test artifacts
50
+ test_*.md
51
+ /tmp/
52
+
53
+ # Lock files (keep poetry.lock, ignore uv.lock)
54
+ uv.lock
socr-1.0.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Ruben Fernandez-Fuertes
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
socr-1.0.0/PKG-INFO ADDED
@@ -0,0 +1,194 @@
1
+ Metadata-Version: 2.4
2
+ Name: socr
3
+ Version: 1.0.0
4
+ Summary: Multi-engine document OCR with cascading fallback
5
+ Project-URL: Homepage, https://github.com/r-uben/socr
6
+ Project-URL: Repository, https://github.com/r-uben/socr
7
+ Project-URL: Issues, https://github.com/r-uben/socr/issues
8
+ Author-email: Ruben Fernandez-Fuertes <fernandezfuertesruben@gmail.com>
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Keywords: deepseek,document-processing,gemini,marker,mistral,nougat,ocr,pdf
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Requires-Python: <3.13,>=3.11
20
+ Requires-Dist: click>=8.1.0
21
+ Requires-Dist: httpx>=0.27.0
22
+ Requires-Dist: pillow>=10.0.0
23
+ Requires-Dist: pymupdf>=1.24.0
24
+ Requires-Dist: pyyaml>=6.0.0
25
+ Requires-Dist: rich>=13.0.0
26
+ Provides-Extra: all
27
+ Requires-Dist: deepseek-ocr-cli>=0.1.0; extra == 'all'
28
+ Requires-Dist: gemini-ocr-cli>=0.2.0; extra == 'all'
29
+ Requires-Dist: marker-ocr-cli>=0.2.0; extra == 'all'
30
+ Requires-Dist: mistral-ocr-cli>=0.1.0; extra == 'all'
31
+ Requires-Dist: nougat-ocr-cli>=0.1.2; extra == 'all'
32
+ Provides-Extra: cloud
33
+ Requires-Dist: gemini-ocr-cli>=0.2.0; extra == 'cloud'
34
+ Requires-Dist: mistral-ocr-cli>=0.1.0; extra == 'cloud'
35
+ Provides-Extra: deepseek
36
+ Requires-Dist: deepseek-ocr-cli>=0.1.0; extra == 'deepseek'
37
+ Provides-Extra: dev
38
+ Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
39
+ Requires-Dist: pytest>=8.0.0; extra == 'dev'
40
+ Requires-Dist: ruff>=0.8.0; extra == 'dev'
41
+ Provides-Extra: gemini
42
+ Requires-Dist: gemini-ocr-cli>=0.2.0; extra == 'gemini'
43
+ Provides-Extra: local
44
+ Requires-Dist: deepseek-ocr-cli>=0.1.0; extra == 'local'
45
+ Requires-Dist: marker-ocr-cli>=0.2.0; extra == 'local'
46
+ Requires-Dist: nougat-ocr-cli>=0.1.2; extra == 'local'
47
+ Provides-Extra: marker
48
+ Requires-Dist: marker-ocr-cli>=0.2.0; extra == 'marker'
49
+ Provides-Extra: mistral
50
+ Requires-Dist: mistral-ocr-cli>=0.1.0; extra == 'mistral'
51
+ Provides-Extra: nougat
52
+ Requires-Dist: nougat-ocr-cli>=0.1.2; extra == 'nougat'
53
+ Description-Content-Type: text/markdown
54
+
55
+ # socr
56
+
57
+ [![PyPI](https://img.shields.io/pypi/v/socr)](https://pypi.org/project/socr/)
58
+ [![Python 3.11–3.12](https://img.shields.io/pypi/pyversions/socr)](https://pypi.org/project/socr/)
59
+ [![License](https://img.shields.io/github/license/r-uben/socr)](LICENSE)
60
+
61
+ Multi-engine document OCR with cascading fallback and quality audit.
62
+
63
+ `socr` orchestrates multiple OCR engines — calling each as a CLI subprocess, auditing output quality, and falling back to a different engine when results are poor. Each engine is a standalone CLI tool (`gemini-ocr`, `deepseek-ocr`, `marker-ocr`, etc.) that can also be used independently.
64
+
65
+ ## Install
66
+
67
+ ```bash
68
+ pip install socr
69
+
70
+ # With specific engine backends
71
+ pip install socr[gemini] # Google Gemini (cloud)
72
+ pip install socr[local] # DeepSeek + Marker + Nougat (local/free)
73
+ pip install socr[all] # All engines
74
+ ```
75
+
76
+ Engines are installed separately because they have different dependencies (torch, cloud SDKs, etc.). Install only what you need.
77
+
78
+ ## Usage
79
+
80
+ ```bash
81
+ # Process a PDF
82
+ socr paper.pdf
83
+
84
+ # Choose engine
85
+ socr paper.pdf --primary gemini
86
+ socr paper.pdf --primary marker
87
+
88
+ # Save extracted figures
89
+ socr paper.pdf --save-figures
90
+
91
+ # Batch process a directory
92
+ socr batch ~/Papers/ -o ./results/
93
+ socr batch ~/Papers/ --dry-run # preview what would be processed
94
+ socr batch ~/Papers/ --reprocess # force reprocess all
95
+
96
+ # Check which engines are available
97
+ socr engines
98
+ ```
99
+
100
+ ## How it works
101
+
102
+ ```
103
+ PDF → Primary OCR → Quality Audit → (Fallback OCR if needed) → Markdown
104
+ ```
105
+
106
+ 1. **Primary OCR** — Calls the primary engine CLI on the whole PDF
107
+ 2. **Quality audit** — Heuristic checks (word count, garbage ratio, repetition)
108
+ 3. **Fallback** — If audit fails, tries a different engine
109
+
110
+ Each engine is a separate CLI binary. `socr` calls it as a subprocess, reads the output markdown, and applies the quality pipeline.
111
+
112
+ ## Engines
113
+
114
+ | Engine | Package | Type | Notes |
115
+ |--------|---------|------|-------|
116
+ | Gemini | `gemini-ocr-cli` | Cloud | Google Gemini, ~$0.0002/page |
117
+ | Mistral | `mistral-ocr-cli` | Cloud | Mistral AI, ~$0.001/page |
118
+ | Marker | `marker-ocr-cli` | Local | Layout-aware (Surya + Texify) |
119
+ | DeepSeek | `deepseek-ocr-cli` | Local | Via Ollama |
120
+ | Nougat | `nougat-ocr-cli` | Local | Academic papers, Python <3.13 |
121
+
122
+ Check availability:
123
+ ```
124
+ $ socr engines
125
+
126
+ [+] gemini cloud, ~$0.0002/page
127
+ [+] marker local, layout-aware (Surya + Texify)
128
+ [+] mistral cloud, ~$0.001/page
129
+ [+] deepseek local via Ollama
130
+ [x] nougat local, academic papers
131
+ ```
132
+
133
+ ## CLI reference
134
+
135
+ ```
136
+ socr process <PDF> [OPTIONS]
137
+ -o, --output-dir PATH Output directory
138
+ --primary ENGINE Primary OCR engine (gemini, marker, deepseek, etc.)
139
+ --fallback ENGINE Fallback engine
140
+ --no-audit Skip quality audit
141
+ --save-figures Save extracted figure images
142
+ --timeout SECONDS Subprocess timeout (default: 300)
143
+ --profile NAME Load ~/.config/socr/{name}.yaml
144
+ --config PATH Custom YAML config file
145
+ -q, --quiet Suppress non-error output
146
+ -v, --verbose Verbose output
147
+ --dry-run List files without processing
148
+ --reprocess Force reprocess already-done files
149
+
150
+ socr batch <DIR> [OPTIONS]
151
+ Same options as process, plus:
152
+ --limit N Process first N files
153
+
154
+ socr engines Show available engines
155
+ ```
156
+
157
+ ## Output
158
+
159
+ ```
160
+ output/<doc_stem>/
161
+ ├── <doc_stem>.md # OCR text
162
+ ├── metadata.json # Processing stats
163
+ └── figures/ # With --save-figures
164
+ └── figure_1_page3.png
165
+ ```
166
+
167
+ ## Configuration
168
+
169
+ Create `~/.config/socr/config.yaml`:
170
+
171
+ ```yaml
172
+ primary_engine: gemini
173
+ fallback_engine: marker
174
+ timeout: 300
175
+ save_figures: false
176
+ audit_enabled: true
177
+ audit_min_words: 50
178
+ ```
179
+
180
+ Or use profiles: `~/.config/socr/fast.yaml` → `socr paper.pdf --profile fast`
181
+
182
+ ## Engine CLIs
183
+
184
+ Each backend is an independent CLI tool:
185
+
186
+ - [gemini-ocr-cli](https://github.com/r-uben/gemini-ocr-cli) — Google Gemini
187
+ - [deepseek-ocr-cli](https://github.com/r-uben/deepseek-ocr-cli) — DeepSeek via Ollama
188
+ - [mistral-ocr-cli](https://github.com/r-uben/mistral-ocr-cli) — Mistral AI
189
+ - [marker-ocr-cli](https://github.com/r-uben/marker-ocr-cli) — Marker (Surya + Texify)
190
+ - [nougat-ocr-cli](https://github.com/r-uben/nougat-ocr-cli) — Meta Nougat
191
+
192
+ ## License
193
+
194
+ MIT
socr-1.0.0/README.md ADDED
@@ -0,0 +1,140 @@
1
+ # socr
2
+
3
+ [![PyPI](https://img.shields.io/pypi/v/socr)](https://pypi.org/project/socr/)
4
+ [![Python 3.11–3.12](https://img.shields.io/pypi/pyversions/socr)](https://pypi.org/project/socr/)
5
+ [![License](https://img.shields.io/github/license/r-uben/socr)](LICENSE)
6
+
7
+ Multi-engine document OCR with cascading fallback and quality audit.
8
+
9
+ `socr` orchestrates multiple OCR engines — calling each as a CLI subprocess, auditing output quality, and falling back to a different engine when results are poor. Each engine is a standalone CLI tool (`gemini-ocr`, `deepseek-ocr`, `marker-ocr`, etc.) that can also be used independently.
10
+
11
+ ## Install
12
+
13
+ ```bash
14
+ pip install socr
15
+
16
+ # With specific engine backends
17
+ pip install socr[gemini] # Google Gemini (cloud)
18
+ pip install socr[local] # DeepSeek + Marker + Nougat (local/free)
19
+ pip install socr[all] # All engines
20
+ ```
21
+
22
+ Engines are installed separately because they have different dependencies (torch, cloud SDKs, etc.). Install only what you need.
23
+
24
+ ## Usage
25
+
26
+ ```bash
27
+ # Process a PDF
28
+ socr paper.pdf
29
+
30
+ # Choose engine
31
+ socr paper.pdf --primary gemini
32
+ socr paper.pdf --primary marker
33
+
34
+ # Save extracted figures
35
+ socr paper.pdf --save-figures
36
+
37
+ # Batch process a directory
38
+ socr batch ~/Papers/ -o ./results/
39
+ socr batch ~/Papers/ --dry-run # preview what would be processed
40
+ socr batch ~/Papers/ --reprocess # force reprocess all
41
+
42
+ # Check which engines are available
43
+ socr engines
44
+ ```
45
+
46
+ ## How it works
47
+
48
+ ```
49
+ PDF → Primary OCR → Quality Audit → (Fallback OCR if needed) → Markdown
50
+ ```
51
+
52
+ 1. **Primary OCR** — Calls the primary engine CLI on the whole PDF
53
+ 2. **Quality audit** — Heuristic checks (word count, garbage ratio, repetition)
54
+ 3. **Fallback** — If audit fails, tries a different engine
55
+
56
+ Each engine is a separate CLI binary. `socr` calls it as a subprocess, reads the output markdown, and applies the quality pipeline.
57
+
58
+ ## Engines
59
+
60
+ | Engine | Package | Type | Notes |
61
+ |--------|---------|------|-------|
62
+ | Gemini | `gemini-ocr-cli` | Cloud | Google Gemini, ~$0.0002/page |
63
+ | Mistral | `mistral-ocr-cli` | Cloud | Mistral AI, ~$0.001/page |
64
+ | Marker | `marker-ocr-cli` | Local | Layout-aware (Surya + Texify) |
65
+ | DeepSeek | `deepseek-ocr-cli` | Local | Via Ollama |
66
+ | Nougat | `nougat-ocr-cli` | Local | Academic papers, Python <3.13 |
67
+
68
+ Check availability:
69
+ ```
70
+ $ socr engines
71
+
72
+ [+] gemini cloud, ~$0.0002/page
73
+ [+] marker local, layout-aware (Surya + Texify)
74
+ [+] mistral cloud, ~$0.001/page
75
+ [+] deepseek local via Ollama
76
+ [x] nougat local, academic papers
77
+ ```
78
+
79
+ ## CLI reference
80
+
81
+ ```
82
+ socr process <PDF> [OPTIONS]
83
+ -o, --output-dir PATH Output directory
84
+ --primary ENGINE Primary OCR engine (gemini, marker, deepseek, etc.)
85
+ --fallback ENGINE Fallback engine
86
+ --no-audit Skip quality audit
87
+ --save-figures Save extracted figure images
88
+ --timeout SECONDS Subprocess timeout (default: 300)
89
+ --profile NAME Load ~/.config/socr/{name}.yaml
90
+ --config PATH Custom YAML config file
91
+ -q, --quiet Suppress non-error output
92
+ -v, --verbose Verbose output
93
+ --dry-run List files without processing
94
+ --reprocess Force reprocess already-done files
95
+
96
+ socr batch <DIR> [OPTIONS]
97
+ Same options as process, plus:
98
+ --limit N Process first N files
99
+
100
+ socr engines Show available engines
101
+ ```
102
+
103
+ ## Output
104
+
105
+ ```
106
+ output/<doc_stem>/
107
+ ├── <doc_stem>.md # OCR text
108
+ ├── metadata.json # Processing stats
109
+ └── figures/ # With --save-figures
110
+ └── figure_1_page3.png
111
+ ```
112
+
113
+ ## Configuration
114
+
115
+ Create `~/.config/socr/config.yaml`:
116
+
117
+ ```yaml
118
+ primary_engine: gemini
119
+ fallback_engine: marker
120
+ timeout: 300
121
+ save_figures: false
122
+ audit_enabled: true
123
+ audit_min_words: 50
124
+ ```
125
+
126
+ Or use profiles: `~/.config/socr/fast.yaml` → `socr paper.pdf --profile fast`
127
+
128
+ ## Engine CLIs
129
+
130
+ Each backend is an independent CLI tool:
131
+
132
+ - [gemini-ocr-cli](https://github.com/r-uben/gemini-ocr-cli) — Google Gemini
133
+ - [deepseek-ocr-cli](https://github.com/r-uben/deepseek-ocr-cli) — DeepSeek via Ollama
134
+ - [mistral-ocr-cli](https://github.com/r-uben/mistral-ocr-cli) — Mistral AI
135
+ - [marker-ocr-cli](https://github.com/r-uben/marker-ocr-cli) — Marker (Surya + Texify)
136
+ - [nougat-ocr-cli](https://github.com/r-uben/nougat-ocr-cli) — Meta Nougat
137
+
138
+ ## License
139
+
140
+ MIT
@@ -0,0 +1,35 @@
1
+ # socr Architecture
2
+
3
+ ## Modules
4
+ - `socr/cli.py`: Click commands (`process`, `engines`, `audit-status`, `describe_figures`, shorthand `p`).
5
+ - `core/`: shared types and configuration.
6
+ - `config.py`: `AgentConfig`, per-engine configs, audit settings, routing overrides, optional cross-check toggle.
7
+ - `document.py`: PDF loading/rendering, basic document classification.
8
+ - `result.py`: page/figure/results, stats, markdown export.
9
+ - `engines/`: one adapter per engine implementing `BaseEngine` (`nougat`, `deepseek`, `gemini`, `mistral`, `marker`).
10
+ - `audit/`: heuristic checks (`HeuristicsChecker`) and optional Ollama LLM audit (`LLMAuditor`).
11
+ - `pipeline/`:
12
+ - `router.py`: engine selection (primary/fallback/cross-check).
13
+ - `processor.py`: orchestrates stages, output writer, figure pass.
14
+ - `ui/`: Rich-based console/progress/panels/theme.
15
+ - `tests/`: routing/output, figure pass, and heuristic tests (require dev extras).
16
+
17
+ ## Pipeline Stages
18
+ 1) **Primary OCR** — pick engine via `EngineRouter` (honors overrides, prefers local).
19
+ 2) **Verifier** — heuristics; optional cross-check on flagged pages using the other local engine; optional Ollama LLM audit.
20
+ 3) **Fallback OCR** — reprocess flagged pages with a different engine (prefers cheaper cloud).
21
+ 4) **Figure Pass** — experimental: extract embedded page images, filter out tiny/extreme-aspect assets, and send the rest to a vision-capable engine's `describe_figure`.
22
+ 5) **Output** — write `output/<doc_stem>/<doc_stem>.<ext>` plus `metadata.json` (stats, engines used, pages needing rerun, doc metadata).
23
+
24
+ ## Verification (local-first)
25
+ - Heuristics: word count, garbage ratio, structure, repeated patterns.
26
+ - Cross-check (optional): re-run a few flagged pages on the other local engine (Nougat↔DeepSeek) before cloud fallback.
27
+ - LLM audit: optional Ollama-based review on flagged pages.
28
+
29
+ ## Figure Pass
30
+ - Uses PyMuPDF to extract embedded images per page, skips tiny/extreme-aspect assets, downscales large images, calls `describe_figure` on first available vision engine (Gemini/DeepSeek/Mistral). Results are attached to `PageResult.figures` and printed.
31
+
32
+ ## Testing
33
+ - Install (editable, with dev extras for pytest): `uv pip install -e ".[dev]"`
34
+ - Run: `pytest -q --disable-warnings --maxfail=1`.
35
+ - Coverage: routing/output (`tests/test_pipeline_routing.py`), figure extraction (`tests/test_figure_pass.py`), heuristics/reprocessing (`tests/test_audit_heuristics.py`). Tests skip if rich/fitz/Pillow aren’t installed.