socr 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- socr-1.0.0/.claude/settings.local.json +14 -0
- socr-1.0.0/.gitignore +54 -0
- socr-1.0.0/LICENSE +21 -0
- socr-1.0.0/PKG-INFO +194 -0
- socr-1.0.0/README.md +140 -0
- socr-1.0.0/docs/ARCHITECTURE.md +35 -0
- socr-1.0.0/examples/bernanke_kuttner_2005/bernanke_kuttner_2005.md +828 -0
- socr-1.0.0/examples/bernanke_kuttner_2005/figures/figure_1_page7.png +0 -0
- socr-1.0.0/examples/bernanke_kuttner_2005/figures/figure_2_page8.png +0 -0
- socr-1.0.0/examples/bernanke_kuttner_2005/figures/figure_3_page16.png +0 -0
- socr-1.0.0/examples/bernanke_kuttner_2005/figures/figure_4_page22.png +0 -0
- socr-1.0.0/examples/bernanke_kuttner_2005/figures/figure_5_page24.png +0 -0
- socr-1.0.0/examples/bernanke_kuttner_2005/figures/figure_6_page30.png +0 -0
- socr-1.0.0/examples/bernanke_kuttner_2005/metadata.json +26 -0
- socr-1.0.0/examples/kuttner_2001/figures/figure_1_page1.png +0 -0
- socr-1.0.0/examples/kuttner_2001/figures/figure_2_page8.png +0 -0
- socr-1.0.0/examples/kuttner_2001/figures/figure_3_page12.png +0 -0
- socr-1.0.0/examples/kuttner_2001/kuttner_2001.md +531 -0
- socr-1.0.0/examples/kuttner_2001/metadata.json +27 -0
- socr-1.0.0/examples/sutskever_2014/metadata.json +26 -0
- socr-1.0.0/examples/sutskever_2014/sutskever_2014.md +203 -0
- socr-1.0.0/poetry.lock +5564 -0
- socr-1.0.0/pyproject.toml +72 -0
- socr-1.0.0/src/socr/__init__.py +14 -0
- socr-1.0.0/src/socr/audit/__init__.py +6 -0
- socr-1.0.0/src/socr/audit/heuristics.py +300 -0
- socr-1.0.0/src/socr/audit/llm_audit.py +182 -0
- socr-1.0.0/src/socr/cli.py +209 -0
- socr-1.0.0/src/socr/core/__init__.py +25 -0
- socr-1.0.0/src/socr/core/config.py +185 -0
- socr-1.0.0/src/socr/core/document.py +90 -0
- socr-1.0.0/src/socr/core/metadata.py +93 -0
- socr-1.0.0/src/socr/core/result.py +97 -0
- socr-1.0.0/src/socr/engines/__init__.py +18 -0
- socr-1.0.0/src/socr/engines/base.py +260 -0
- socr-1.0.0/src/socr/engines/deepseek.py +95 -0
- socr-1.0.0/src/socr/engines/deepseek_vllm.py +250 -0
- socr-1.0.0/src/socr/engines/gemini.py +45 -0
- socr-1.0.0/src/socr/engines/marker.py +38 -0
- socr-1.0.0/src/socr/engines/mistral.py +42 -0
- socr-1.0.0/src/socr/engines/nougat.py +38 -0
- socr-1.0.0/src/socr/engines/registry.py +25 -0
- socr-1.0.0/src/socr/engines/vllm.py +196 -0
- socr-1.0.0/src/socr/engines/vllm_manager.py +292 -0
- socr-1.0.0/src/socr/figures/__init__.py +5 -0
- socr-1.0.0/src/socr/figures/extractor.py +342 -0
- socr-1.0.0/src/socr/pipeline/__init__.py +5 -0
- socr-1.0.0/src/socr/pipeline/hpc_pipeline.py +479 -0
- socr-1.0.0/src/socr/pipeline/processor.py +242 -0
- socr-1.0.0/src/socr/pipeline/reconciler.py +332 -0
- socr-1.0.0/src/socr/pipeline/router.py +104 -0
- socr-1.0.0/src/socr/ui/__init__.py +7 -0
- socr-1.0.0/src/socr/ui/console.py +148 -0
- socr-1.0.0/src/socr/ui/panels.py +203 -0
- socr-1.0.0/src/socr/ui/progress.py +227 -0
- socr-1.0.0/src/socr/ui/theme.py +89 -0
- socr-1.0.0/tests/test_audit_heuristics.py +31 -0
- socr-1.0.0/tests/test_figure_pass.py +42 -0
- socr-1.0.0/tests/test_pipeline_routing.py +66 -0
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
{
|
|
2
|
+
"permissions": {
|
|
3
|
+
"allow": [
|
|
4
|
+
"Bash(git remote set-url origin https://github.com/r-uben/socr.git)",
|
|
5
|
+
"Bash(cd:*)",
|
|
6
|
+
"Bash(gh repo edit r-uben/socr --add-topic ocr --add-topic pdf --add-topic document-processing --add-topic nougat --add-topic gemini --add-topic deepseek)",
|
|
7
|
+
"Bash(git remote set-url origin https://github.com/r-uben/socr.git)"
|
|
8
|
+
]
|
|
9
|
+
},
|
|
10
|
+
"enabledMcpjsonServers": [
|
|
11
|
+
"gemini-cli"
|
|
12
|
+
],
|
|
13
|
+
"enableAllProjectMcpServers": true
|
|
14
|
+
}
|
socr-1.0.0/.gitignore
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
.Python
|
|
7
|
+
build/
|
|
8
|
+
develop-eggs/
|
|
9
|
+
dist/
|
|
10
|
+
downloads/
|
|
11
|
+
eggs/
|
|
12
|
+
.eggs/
|
|
13
|
+
lib/
|
|
14
|
+
lib64/
|
|
15
|
+
parts/
|
|
16
|
+
sdist/
|
|
17
|
+
var/
|
|
18
|
+
wheels/
|
|
19
|
+
*.egg-info/
|
|
20
|
+
.installed.cfg
|
|
21
|
+
*.egg
|
|
22
|
+
|
|
23
|
+
# Virtual environments
|
|
24
|
+
.venv/
|
|
25
|
+
venv/
|
|
26
|
+
ENV/
|
|
27
|
+
|
|
28
|
+
# IDE
|
|
29
|
+
.idea/
|
|
30
|
+
.vscode/
|
|
31
|
+
*.swp
|
|
32
|
+
*.swo
|
|
33
|
+
.DS_Store
|
|
34
|
+
|
|
35
|
+
# Testing
|
|
36
|
+
.pytest_cache/
|
|
37
|
+
.coverage
|
|
38
|
+
htmlcov/
|
|
39
|
+
.tox/
|
|
40
|
+
.nox/
|
|
41
|
+
|
|
42
|
+
# Output (user-generated, not examples)
|
|
43
|
+
output/
|
|
44
|
+
*.md
|
|
45
|
+
!README.md
|
|
46
|
+
!docs/*.md
|
|
47
|
+
!examples/**/*.md
|
|
48
|
+
|
|
49
|
+
# Test artifacts
|
|
50
|
+
test_*.md
|
|
51
|
+
/tmp/
|
|
52
|
+
|
|
53
|
+
# Lock files (keep poetry.lock, ignore uv.lock)
|
|
54
|
+
uv.lock
|
socr-1.0.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Ruben Fernandez-Fuertes
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
socr-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: socr
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Multi-engine document OCR with cascading fallback
|
|
5
|
+
Project-URL: Homepage, https://github.com/r-uben/socr
|
|
6
|
+
Project-URL: Repository, https://github.com/r-uben/socr
|
|
7
|
+
Project-URL: Issues, https://github.com/r-uben/socr/issues
|
|
8
|
+
Author-email: Ruben Fernandez-Fuertes <fernandezfuertesruben@gmail.com>
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: deepseek,document-processing,gemini,marker,mistral,nougat,ocr,pdf
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Requires-Python: <3.13,>=3.11
|
|
20
|
+
Requires-Dist: click>=8.1.0
|
|
21
|
+
Requires-Dist: httpx>=0.27.0
|
|
22
|
+
Requires-Dist: pillow>=10.0.0
|
|
23
|
+
Requires-Dist: pymupdf>=1.24.0
|
|
24
|
+
Requires-Dist: pyyaml>=6.0.0
|
|
25
|
+
Requires-Dist: rich>=13.0.0
|
|
26
|
+
Provides-Extra: all
|
|
27
|
+
Requires-Dist: deepseek-ocr-cli>=0.1.0; extra == 'all'
|
|
28
|
+
Requires-Dist: gemini-ocr-cli>=0.2.0; extra == 'all'
|
|
29
|
+
Requires-Dist: marker-ocr-cli>=0.2.0; extra == 'all'
|
|
30
|
+
Requires-Dist: mistral-ocr-cli>=0.1.0; extra == 'all'
|
|
31
|
+
Requires-Dist: nougat-ocr-cli>=0.1.2; extra == 'all'
|
|
32
|
+
Provides-Extra: cloud
|
|
33
|
+
Requires-Dist: gemini-ocr-cli>=0.2.0; extra == 'cloud'
|
|
34
|
+
Requires-Dist: mistral-ocr-cli>=0.1.0; extra == 'cloud'
|
|
35
|
+
Provides-Extra: deepseek
|
|
36
|
+
Requires-Dist: deepseek-ocr-cli>=0.1.0; extra == 'deepseek'
|
|
37
|
+
Provides-Extra: dev
|
|
38
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
|
|
39
|
+
Requires-Dist: pytest>=8.0.0; extra == 'dev'
|
|
40
|
+
Requires-Dist: ruff>=0.8.0; extra == 'dev'
|
|
41
|
+
Provides-Extra: gemini
|
|
42
|
+
Requires-Dist: gemini-ocr-cli>=0.2.0; extra == 'gemini'
|
|
43
|
+
Provides-Extra: local
|
|
44
|
+
Requires-Dist: deepseek-ocr-cli>=0.1.0; extra == 'local'
|
|
45
|
+
Requires-Dist: marker-ocr-cli>=0.2.0; extra == 'local'
|
|
46
|
+
Requires-Dist: nougat-ocr-cli>=0.1.2; extra == 'local'
|
|
47
|
+
Provides-Extra: marker
|
|
48
|
+
Requires-Dist: marker-ocr-cli>=0.2.0; extra == 'marker'
|
|
49
|
+
Provides-Extra: mistral
|
|
50
|
+
Requires-Dist: mistral-ocr-cli>=0.1.0; extra == 'mistral'
|
|
51
|
+
Provides-Extra: nougat
|
|
52
|
+
Requires-Dist: nougat-ocr-cli>=0.1.2; extra == 'nougat'
|
|
53
|
+
Description-Content-Type: text/markdown
|
|
54
|
+
|
|
55
|
+
# socr
|
|
56
|
+
|
|
57
|
+
[![PyPI version](https://img.shields.io/pypi/v/socr)](https://pypi.org/project/socr/)
|
|
58
|
+
[![Python versions](https://img.shields.io/pypi/pyversions/socr)](https://pypi.org/project/socr/)
|
|
59
|
+
[![License: MIT](https://img.shields.io/pypi/l/socr)](LICENSE)
|
|
60
|
+
|
|
61
|
+
Multi-engine document OCR with cascading fallback and quality audit.
|
|
62
|
+
|
|
63
|
+
`socr` orchestrates multiple OCR engines — calling each as a CLI subprocess, auditing output quality, and falling back to a different engine when results are poor. Each engine is a standalone CLI tool (`gemini-ocr`, `deepseek-ocr`, `marker-ocr`, etc.) that can also be used independently.
|
|
64
|
+
|
|
65
|
+
## Install
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
pip install socr
|
|
69
|
+
|
|
70
|
+
# With specific engine backends
|
|
71
|
+
pip install socr[gemini] # Google Gemini (cloud)
|
|
72
|
+
pip install socr[local] # DeepSeek + Marker + Nougat (local/free)
|
|
73
|
+
pip install socr[all] # All engines
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
Engines are installed separately because they have different dependencies (torch, cloud SDKs, etc.). Install only what you need.
|
|
77
|
+
|
|
78
|
+
## Usage
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
# Process a PDF
|
|
82
|
+
socr paper.pdf
|
|
83
|
+
|
|
84
|
+
# Choose engine
|
|
85
|
+
socr paper.pdf --primary gemini
|
|
86
|
+
socr paper.pdf --primary marker
|
|
87
|
+
|
|
88
|
+
# Save extracted figures
|
|
89
|
+
socr paper.pdf --save-figures
|
|
90
|
+
|
|
91
|
+
# Batch process a directory
|
|
92
|
+
socr batch ~/Papers/ -o ./results/
|
|
93
|
+
socr batch ~/Papers/ --dry-run # preview what would be processed
|
|
94
|
+
socr batch ~/Papers/ --reprocess # force reprocess all
|
|
95
|
+
|
|
96
|
+
# Check which engines are available
|
|
97
|
+
socr engines
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## How it works
|
|
101
|
+
|
|
102
|
+
```
|
|
103
|
+
PDF → Primary OCR → Quality Audit → (Fallback OCR if needed) → Markdown
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
1. **Primary OCR** — Calls the primary engine CLI on the whole PDF
|
|
107
|
+
2. **Quality audit** — Heuristic checks (word count, garbage ratio, repetition)
|
|
108
|
+
3. **Fallback** — If audit fails, tries a different engine
|
|
109
|
+
|
|
110
|
+
Each engine is a separate CLI binary. `socr` calls it as a subprocess, reads the output markdown, and applies the quality pipeline.
|
|
111
|
+
|
|
112
|
+
## Engines
|
|
113
|
+
|
|
114
|
+
| Engine | Package | Type | Notes |
|
|
115
|
+
|--------|---------|------|-------|
|
|
116
|
+
| Gemini | `gemini-ocr-cli` | Cloud | Google Gemini, ~$0.0002/page |
|
|
117
|
+
| Mistral | `mistral-ocr-cli` | Cloud | Mistral AI |
|
|
118
|
+
| Marker | `marker-ocr-cli` | Local | Layout-aware (Surya + Texify) |
|
|
119
|
+
| DeepSeek | `deepseek-ocr-cli` | Local | Via Ollama |
|
|
120
|
+
| Nougat | `nougat-ocr-cli` | Local | Academic papers, Python <3.13 |
|
|
121
|
+
|
|
122
|
+
Check availability:
|
|
123
|
+
```
|
|
124
|
+
$ socr engines
|
|
125
|
+
|
|
126
|
+
[+] gemini cloud, ~$0.0002/page
|
|
127
|
+
[+] marker local, layout-aware (Surya + Texify)
|
|
128
|
+
[+] mistral cloud, ~$0.001/page
|
|
129
|
+
[+] deepseek local via Ollama
|
|
130
|
+
[x] nougat local, academic papers
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
## CLI reference
|
|
134
|
+
|
|
135
|
+
```
|
|
136
|
+
socr process <PDF> [OPTIONS]
|
|
137
|
+
-o, --output-dir PATH Output directory
|
|
138
|
+
--primary ENGINE Primary OCR engine (gemini, marker, deepseek, etc.)
|
|
139
|
+
--fallback ENGINE Fallback engine
|
|
140
|
+
--no-audit Skip quality audit
|
|
141
|
+
--save-figures Save extracted figure images
|
|
142
|
+
--timeout SECONDS Subprocess timeout (default: 300)
|
|
143
|
+
--profile NAME Load ~/.config/socr/{name}.yaml
|
|
144
|
+
--config PATH Custom YAML config file
|
|
145
|
+
-q, --quiet Suppress non-error output
|
|
146
|
+
-v, --verbose Verbose output
|
|
147
|
+
--dry-run List files without processing
|
|
148
|
+
--reprocess Force reprocess already-done files
|
|
149
|
+
|
|
150
|
+
socr batch <DIR> [OPTIONS]
|
|
151
|
+
Same options as process, plus:
|
|
152
|
+
--limit N Process first N files
|
|
153
|
+
|
|
154
|
+
socr engines Show available engines
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
## Output
|
|
158
|
+
|
|
159
|
+
```
|
|
160
|
+
output/<doc_stem>/
|
|
161
|
+
├── <doc_stem>.md # OCR text
|
|
162
|
+
├── metadata.json # Processing stats
|
|
163
|
+
└── figures/ # With --save-figures
|
|
164
|
+
└── figure_1_page3.png
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
## Configuration
|
|
168
|
+
|
|
169
|
+
Create `~/.config/socr/config.yaml`:
|
|
170
|
+
|
|
171
|
+
```yaml
|
|
172
|
+
primary_engine: gemini
|
|
173
|
+
fallback_engine: marker
|
|
174
|
+
timeout: 300
|
|
175
|
+
save_figures: false
|
|
176
|
+
audit_enabled: true
|
|
177
|
+
audit_min_words: 50
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
Or use profiles: `~/.config/socr/fast.yaml` → `socr paper.pdf --profile fast`
|
|
181
|
+
|
|
182
|
+
## Engine CLIs
|
|
183
|
+
|
|
184
|
+
Each backend is an independent CLI tool:
|
|
185
|
+
|
|
186
|
+
- [gemini-ocr-cli](https://github.com/r-uben/gemini-ocr-cli) — Google Gemini
|
|
187
|
+
- [deepseek-ocr-cli](https://github.com/r-uben/deepseek-ocr-cli) — DeepSeek via Ollama
|
|
188
|
+
- [mistral-ocr-cli](https://github.com/r-uben/mistral-ocr-cli) — Mistral AI
|
|
189
|
+
- [marker-ocr-cli](https://github.com/r-uben/marker-ocr-cli) — Marker (Surya + Texify)
|
|
190
|
+
- [nougat-ocr-cli](https://github.com/r-uben/nougat-ocr-cli) — Meta Nougat
|
|
191
|
+
|
|
192
|
+
## License
|
|
193
|
+
|
|
194
|
+
MIT
|
socr-1.0.0/README.md
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
# socr
|
|
2
|
+
|
|
3
|
+
[![PyPI version](https://img.shields.io/pypi/v/socr)](https://pypi.org/project/socr/)
|
|
4
|
+
[![Python versions](https://img.shields.io/pypi/pyversions/socr)](https://pypi.org/project/socr/)
|
|
5
|
+
[![License: MIT](https://img.shields.io/pypi/l/socr)](LICENSE)
|
|
6
|
+
|
|
7
|
+
Multi-engine document OCR with cascading fallback and quality audit.
|
|
8
|
+
|
|
9
|
+
`socr` orchestrates multiple OCR engines — calling each as a CLI subprocess, auditing output quality, and falling back to a different engine when results are poor. Each engine is a standalone CLI tool (`gemini-ocr`, `deepseek-ocr`, `marker-ocr`, etc.) that can also be used independently.
|
|
10
|
+
|
|
11
|
+
## Install
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
pip install socr
|
|
15
|
+
|
|
16
|
+
# With specific engine backends
|
|
17
|
+
pip install socr[gemini] # Google Gemini (cloud)
|
|
18
|
+
pip install socr[local] # DeepSeek + Marker + Nougat (local/free)
|
|
19
|
+
pip install socr[all] # All engines
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
Engines are installed separately because they have different dependencies (torch, cloud SDKs, etc.). Install only what you need.
|
|
23
|
+
|
|
24
|
+
## Usage
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
# Process a PDF
|
|
28
|
+
socr paper.pdf
|
|
29
|
+
|
|
30
|
+
# Choose engine
|
|
31
|
+
socr paper.pdf --primary gemini
|
|
32
|
+
socr paper.pdf --primary marker
|
|
33
|
+
|
|
34
|
+
# Save extracted figures
|
|
35
|
+
socr paper.pdf --save-figures
|
|
36
|
+
|
|
37
|
+
# Batch process a directory
|
|
38
|
+
socr batch ~/Papers/ -o ./results/
|
|
39
|
+
socr batch ~/Papers/ --dry-run # preview what would be processed
|
|
40
|
+
socr batch ~/Papers/ --reprocess # force reprocess all
|
|
41
|
+
|
|
42
|
+
# Check which engines are available
|
|
43
|
+
socr engines
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## How it works
|
|
47
|
+
|
|
48
|
+
```
|
|
49
|
+
PDF → Primary OCR → Quality Audit → (Fallback OCR if needed) → Markdown
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
1. **Primary OCR** — Calls the primary engine CLI on the whole PDF
|
|
53
|
+
2. **Quality audit** — Heuristic checks (word count, garbage ratio, repetition)
|
|
54
|
+
3. **Fallback** — If audit fails, tries a different engine
|
|
55
|
+
|
|
56
|
+
Each engine is a separate CLI binary. `socr` calls it as a subprocess, reads the output markdown, and applies the quality pipeline.
|
|
57
|
+
|
|
58
|
+
## Engines
|
|
59
|
+
|
|
60
|
+
| Engine | Package | Type | Notes |
|
|
61
|
+
|--------|---------|------|-------|
|
|
62
|
+
| Gemini | `gemini-ocr-cli` | Cloud | Google Gemini, ~$0.0002/page |
|
|
63
|
+
| Mistral | `mistral-ocr-cli` | Cloud | Mistral AI |
|
|
64
|
+
| Marker | `marker-ocr-cli` | Local | Layout-aware (Surya + Texify) |
|
|
65
|
+
| DeepSeek | `deepseek-ocr-cli` | Local | Via Ollama |
|
|
66
|
+
| Nougat | `nougat-ocr-cli` | Local | Academic papers, Python <3.13 |
|
|
67
|
+
|
|
68
|
+
Check availability:
|
|
69
|
+
```
|
|
70
|
+
$ socr engines
|
|
71
|
+
|
|
72
|
+
[+] gemini cloud, ~$0.0002/page
|
|
73
|
+
[+] marker local, layout-aware (Surya + Texify)
|
|
74
|
+
[+] mistral cloud, ~$0.001/page
|
|
75
|
+
[+] deepseek local via Ollama
|
|
76
|
+
[x] nougat local, academic papers
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## CLI reference
|
|
80
|
+
|
|
81
|
+
```
|
|
82
|
+
socr process <PDF> [OPTIONS]
|
|
83
|
+
-o, --output-dir PATH Output directory
|
|
84
|
+
--primary ENGINE Primary OCR engine (gemini, marker, deepseek, etc.)
|
|
85
|
+
--fallback ENGINE Fallback engine
|
|
86
|
+
--no-audit Skip quality audit
|
|
87
|
+
--save-figures Save extracted figure images
|
|
88
|
+
--timeout SECONDS Subprocess timeout (default: 300)
|
|
89
|
+
--profile NAME Load ~/.config/socr/{name}.yaml
|
|
90
|
+
--config PATH Custom YAML config file
|
|
91
|
+
-q, --quiet Suppress non-error output
|
|
92
|
+
-v, --verbose Verbose output
|
|
93
|
+
--dry-run List files without processing
|
|
94
|
+
--reprocess Force reprocess already-done files
|
|
95
|
+
|
|
96
|
+
socr batch <DIR> [OPTIONS]
|
|
97
|
+
Same options as process, plus:
|
|
98
|
+
--limit N Process first N files
|
|
99
|
+
|
|
100
|
+
socr engines Show available engines
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## Output
|
|
104
|
+
|
|
105
|
+
```
|
|
106
|
+
output/<doc_stem>/
|
|
107
|
+
├── <doc_stem>.md # OCR text
|
|
108
|
+
├── metadata.json # Processing stats
|
|
109
|
+
└── figures/ # With --save-figures
|
|
110
|
+
└── figure_1_page3.png
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
## Configuration
|
|
114
|
+
|
|
115
|
+
Create `~/.config/socr/config.yaml`:
|
|
116
|
+
|
|
117
|
+
```yaml
|
|
118
|
+
primary_engine: gemini
|
|
119
|
+
fallback_engine: marker
|
|
120
|
+
timeout: 300
|
|
121
|
+
save_figures: false
|
|
122
|
+
audit_enabled: true
|
|
123
|
+
audit_min_words: 50
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
Or use profiles: `~/.config/socr/fast.yaml` → `socr paper.pdf --profile fast`
|
|
127
|
+
|
|
128
|
+
## Engine CLIs
|
|
129
|
+
|
|
130
|
+
Each backend is an independent CLI tool:
|
|
131
|
+
|
|
132
|
+
- [gemini-ocr-cli](https://github.com/r-uben/gemini-ocr-cli) — Google Gemini
|
|
133
|
+
- [deepseek-ocr-cli](https://github.com/r-uben/deepseek-ocr-cli) — DeepSeek via Ollama
|
|
134
|
+
- [mistral-ocr-cli](https://github.com/r-uben/mistral-ocr-cli) — Mistral AI
|
|
135
|
+
- [marker-ocr-cli](https://github.com/r-uben/marker-ocr-cli) — Marker (Surya + Texify)
|
|
136
|
+
- [nougat-ocr-cli](https://github.com/r-uben/nougat-ocr-cli) — Meta Nougat
|
|
137
|
+
|
|
138
|
+
## License
|
|
139
|
+
|
|
140
|
+
MIT
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# socr Architecture
|
|
2
|
+
|
|
3
|
+
## Modules
|
|
4
|
+
- `socr/cli.py`: Click commands (`process`, `engines`, `audit-status`, `describe_figures`, shorthand `p`).
|
|
5
|
+
- `core/`: shared types and configuration.
|
|
6
|
+
- `config.py`: `AgentConfig`, per-engine configs, audit settings, routing overrides, optional cross-check toggle.
|
|
7
|
+
- `document.py`: PDF loading/rendering, basic document classification.
|
|
8
|
+
- `result.py`: page/figure/results, stats, markdown export.
|
|
9
|
+
- `engines/`: one adapter per engine implementing `BaseEngine` (`nougat`, `deepseek`, `gemini`, `mistral`, `marker`).
|
|
10
|
+
- `audit/`: heuristic checks (`HeuristicsChecker`) and optional Ollama LLM audit (`LLMAuditor`).
|
|
11
|
+
- `pipeline/`:
|
|
12
|
+
- `router.py`: engine selection (primary/fallback/cross-check).
|
|
13
|
+
- `processor.py`: orchestrates stages, output writer, figure pass.
|
|
14
|
+
- `ui/`: Rich-based console/progress/panels/theme.
|
|
15
|
+
- `tests/`: routing/output, figure pass, and heuristic tests (require dev extras).
|
|
16
|
+
|
|
17
|
+
## Pipeline Stages
|
|
18
|
+
1) **Primary OCR** — pick engine via `EngineRouter` (honors overrides, prefers local).
|
|
19
|
+
2) **Verifier** — heuristics; optional cross-check on flagged pages using the other local engine; optional Ollama LLM audit.
|
|
20
|
+
3) **Fallback OCR** — reprocess flagged pages with a different engine (prefers cheaper cloud).
|
|
21
|
+
4) **Figure Pass** — experimental: extract embedded page images, filter out tiny or extreme-aspect-ratio assets, and send the rest to a vision-capable engine's `describe_figure`.
|
|
22
|
+
5) **Output** — write `output/<doc_stem>/<doc_stem>.<ext>` plus `metadata.json` (stats, engines used, pages needing rerun, doc metadata).
|
|
23
|
+
|
|
24
|
+
## Verification (local-first)
|
|
25
|
+
- Heuristics: word count, garbage ratio, structure, repeated patterns.
|
|
26
|
+
- Cross-check (optional): re-run a few flagged pages on the other local engine (Nougat↔DeepSeek) before cloud fallback.
|
|
27
|
+
- LLM audit: optional Ollama-based review on flagged pages.
|
|
28
|
+
|
|
29
|
+
## Figure Pass
|
|
30
|
+
- Uses PyMuPDF to extract embedded images per page, skips tiny/extreme-aspect assets, downscales large images, calls `describe_figure` on first available vision engine (Gemini/DeepSeek/Mistral). Results are attached to `PageResult.figures` and printed.
|
|
31
|
+
|
|
32
|
+
## Testing
|
|
33
|
+
- Install (editable, with dev extras): `uv pip install -e ".[dev]"`
|
|
34
|
+
- Run: `pytest -q --disable-warnings --maxfail=1`.
|
|
35
|
+
- Coverage: routing/output (`tests/test_pipeline_routing.py`), figure extraction (`tests/test_figure_pass.py`), heuristics/reprocessing (`tests/test_audit_heuristics.py`). Tests skip if rich/fitz/Pillow aren’t installed.
|