nougat-ocr-cli 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nougat_ocr_cli-0.1.0/.gitignore +51 -0
- nougat_ocr_cli-0.1.0/LICENSE +21 -0
- nougat_ocr_cli-0.1.0/PKG-INFO +143 -0
- nougat_ocr_cli-0.1.0/README.md +109 -0
- nougat_ocr_cli-0.1.0/examples/basic_usage.py +39 -0
- nougat_ocr_cli-0.1.0/examples/batch_processing.py +69 -0
- nougat_ocr_cli-0.1.0/pyproject.toml +73 -0
- nougat_ocr_cli-0.1.0/src/nougat_wrapper/__init__.py +6 -0
- nougat_ocr_cli-0.1.0/src/nougat_wrapper/__main__.py +6 -0
- nougat_ocr_cli-0.1.0/src/nougat_wrapper/cli.py +109 -0
- nougat_ocr_cli-0.1.0/src/nougat_wrapper/core.py +154 -0
- nougat_ocr_cli-0.1.0/src/nougat_wrapper/py.typed +0 -0
- nougat_ocr_cli-0.1.0/tests/__init__.py +1 -0
- nougat_ocr_cli-0.1.0/tests/test_basic.py +38 -0
- nougat_ocr_cli-0.1.0/tests/test_gpu.py +39 -0
- nougat_ocr_cli-0.1.0/uv.lock +2974 -0
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
.Python
|
|
7
|
+
build/
|
|
8
|
+
develop-eggs/
|
|
9
|
+
dist/
|
|
10
|
+
downloads/
|
|
11
|
+
eggs/
|
|
12
|
+
.eggs/
|
|
13
|
+
lib/
|
|
14
|
+
lib64/
|
|
15
|
+
parts/
|
|
16
|
+
sdist/
|
|
17
|
+
var/
|
|
18
|
+
wheels/
|
|
19
|
+
*.egg-info/
|
|
20
|
+
.installed.cfg
|
|
21
|
+
*.egg
|
|
22
|
+
|
|
23
|
+
# Virtual environments
|
|
24
|
+
.venv/
|
|
25
|
+
venv/
|
|
26
|
+
ENV/
|
|
27
|
+
env/
|
|
28
|
+
|
|
29
|
+
# IDEs
|
|
30
|
+
.vscode/
|
|
31
|
+
.idea/
|
|
32
|
+
*.swp
|
|
33
|
+
*.swo
|
|
34
|
+
|
|
35
|
+
# Testing
|
|
36
|
+
.pytest_cache/
|
|
37
|
+
.coverage
|
|
38
|
+
htmlcov/
|
|
39
|
+
.tox/
|
|
40
|
+
|
|
41
|
+
# OS
|
|
42
|
+
.DS_Store
|
|
43
|
+
Thumbs.db
|
|
44
|
+
|
|
45
|
+
# UV
|
|
46
|
+
.uv-cache/
|
|
47
|
+
|
|
48
|
+
# Project specific
|
|
49
|
+
prompts/
|
|
50
|
+
ocr_output/
|
|
51
|
+
test.pdf
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Ruben Fernandez-Fuertes
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: nougat-ocr-cli
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Simple CLI wrapper for Nougat OCR with GPU acceleration support
|
|
5
|
+
Project-URL: Homepage, https://github.com/rubenffuertes/nougat-ocr-cli
|
|
6
|
+
Project-URL: Repository, https://github.com/rubenffuertes/nougat-ocr-cli
|
|
7
|
+
Project-URL: Issues, https://github.com/rubenffuertes/nougat-ocr-cli/issues
|
|
8
|
+
Author-email: Ruben Fernandez-Fuertes <fernandezfuertesruben@gmail.com>
|
|
9
|
+
License: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: cli,document,extraction,gpu,nougat,ocr,pdf
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Environment :: Console
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Intended Audience :: Science/Research
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Operating System :: OS Independent
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
|
+
Classifier: Topic :: Text Processing :: General
|
|
23
|
+
Classifier: Topic :: Utilities
|
|
24
|
+
Requires-Python: >=3.11
|
|
25
|
+
Requires-Dist: albumentations==1.3.1
|
|
26
|
+
Requires-Dist: nougat-ocr>=0.1.17
|
|
27
|
+
Requires-Dist: torch>=2.0.0
|
|
28
|
+
Requires-Dist: transformers>=4.30.0
|
|
29
|
+
Provides-Extra: dev
|
|
30
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
|
|
31
|
+
Requires-Dist: pytest>=7.0.0; extra == 'dev'
|
|
32
|
+
Requires-Dist: ruff>=0.1.0; extra == 'dev'
|
|
33
|
+
Description-Content-Type: text/markdown
|
|
34
|
+
|
|
35
|
+
# Nougat OCR CLI
|
|
36
|
+
|
|
37
|
+
Simple, batteries-included CLI wrapper for [Nougat OCR](https://github.com/facebookresearch/nougat) with GPU acceleration.
|
|
38
|
+
|
|
39
|
+
## Features
|
|
40
|
+
|
|
41
|
+
- GPU acceleration (CUDA & Apple Metal)
|
|
42
|
+
- Simple CLI interface
|
|
43
|
+
- Batch processing support
|
|
44
|
+
- Clean Markdown output
|
|
45
|
+
- Automatic model downloading
|
|
46
|
+
- Python API with type hints
|
|
47
|
+
|
|
48
|
+
## Installation
|
|
49
|
+
|
|
50
|
+
### From PyPI
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
pip install nougat-ocr-cli
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### From GitHub
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
pip install git+https://github.com/rubenffuertes/nougat-ocr-cli.git
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
### From source
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
git clone https://github.com/rubenffuertes/nougat-ocr-cli.git
|
|
66
|
+
cd nougat-ocr-cli
|
|
67
|
+
uv pip install -e .
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## CLI Usage
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
# Basic usage - outputs to current directory
|
|
74
|
+
nougat-ocr-cli document.pdf
|
|
75
|
+
|
|
76
|
+
# Specify output directory
|
|
77
|
+
nougat-ocr-cli document.pdf -o output/
|
|
78
|
+
|
|
79
|
+
# Process specific pages (zero-indexed)
|
|
80
|
+
nougat-ocr-cli document.pdf --pages 0-5
|
|
81
|
+
nougat-ocr-cli document.pdf --pages 1,3,5,7
|
|
82
|
+
|
|
83
|
+
# Use smaller model for faster processing
|
|
84
|
+
nougat-ocr-cli document.pdf --model 0.1.0-small
|
|
85
|
+
|
|
86
|
+
# Use full precision (FP32) for better accuracy
|
|
87
|
+
nougat-ocr-cli document.pdf --full-precision
|
|
88
|
+
|
|
89
|
+
# Set batch size manually
|
|
90
|
+
nougat-ocr-cli document.pdf --batch-size 4
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### CLI Options
|
|
94
|
+
|
|
95
|
+
| Option | Description |
|
|
96
|
+
|--------|-------------|
|
|
97
|
+
| `input` | Input PDF file to process |
|
|
98
|
+
| `-o, --output` | Output directory (default: current directory) |
|
|
99
|
+
| `--model` | Model version (default: 0.1.0-base) |
|
|
100
|
+
| `--batch-size` | Batch size for processing (auto-detected) |
|
|
101
|
+
| `--full-precision` | Use FP32 instead of BF16 |
|
|
102
|
+
| `--no-markdown` | Disable markdown post-processing |
|
|
103
|
+
| `--pages` | Zero-indexed pages to process; ranges are inclusive (e.g., '0-5' or '1,3,5') |
|
|
104
|
+
|
|
105
|
+
## Python API
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
from nougat_wrapper import NougatOCR
|
|
109
|
+
from pathlib import Path
|
|
110
|
+
|
|
111
|
+
# Initialize (loads model to GPU automatically)
|
|
112
|
+
ocr = NougatOCR()
|
|
113
|
+
|
|
114
|
+
# Extract text from PDF
|
|
115
|
+
result = ocr.extract_text(Path("paper.pdf"))
|
|
116
|
+
|
|
117
|
+
print(f"Extracted {result.pages} pages")
|
|
118
|
+
print(f"Failed pages: {result.placeholder_pages}")
|
|
119
|
+
print(result.text) # Markdown output
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### Advanced Usage
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
ocr = NougatOCR(
|
|
126
|
+
model_tag="0.1.0-small", # Use smaller model
|
|
127
|
+
batch_size=4, # Process 4 pages at once
|
|
128
|
+
full_precision=True, # Use FP32 instead of BF16
|
|
129
|
+
)
|
|
130
|
+
|
|
131
|
+
# Only OCR pages 0, 1, 2 (zero-indexed)
|
|
132
|
+
result = ocr.extract_text(pdf_path, pages=[0, 1, 2])
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
## Requirements
|
|
136
|
+
|
|
137
|
+
- Python 3.11+
|
|
138
|
+
- GPU recommended (CUDA or Apple Metal)
|
|
139
|
+
- ~1.3 GB for model weights (auto-downloaded)
|
|
140
|
+
|
|
141
|
+
## License
|
|
142
|
+
|
|
143
|
+
MIT
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
# Nougat OCR CLI
|
|
2
|
+
|
|
3
|
+
Simple, batteries-included CLI wrapper for [Nougat OCR](https://github.com/facebookresearch/nougat) with GPU acceleration.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- GPU acceleration (CUDA & Apple Metal)
|
|
8
|
+
- Simple CLI interface
|
|
9
|
+
- Batch processing support
|
|
10
|
+
- Clean Markdown output
|
|
11
|
+
- Automatic model downloading
|
|
12
|
+
- Python API with type hints
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
### From PyPI
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install nougat-ocr-cli
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
### From GitHub
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install git+https://github.com/rubenffuertes/nougat-ocr-cli.git
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
### From source
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
git clone https://github.com/rubenffuertes/nougat-ocr-cli.git
|
|
32
|
+
cd nougat-ocr-cli
|
|
33
|
+
uv pip install -e .
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## CLI Usage
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
# Basic usage - outputs to current directory
|
|
40
|
+
nougat-ocr-cli document.pdf
|
|
41
|
+
|
|
42
|
+
# Specify output directory
|
|
43
|
+
nougat-ocr-cli document.pdf -o output/
|
|
44
|
+
|
|
45
|
+
# Process specific pages (zero-indexed)
|
|
46
|
+
nougat-ocr-cli document.pdf --pages 0-5
|
|
47
|
+
nougat-ocr-cli document.pdf --pages 1,3,5,7
|
|
48
|
+
|
|
49
|
+
# Use smaller model for faster processing
|
|
50
|
+
nougat-ocr-cli document.pdf --model 0.1.0-small
|
|
51
|
+
|
|
52
|
+
# Use full precision (FP32) for better accuracy
|
|
53
|
+
nougat-ocr-cli document.pdf --full-precision
|
|
54
|
+
|
|
55
|
+
# Set batch size manually
|
|
56
|
+
nougat-ocr-cli document.pdf --batch-size 4
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
### CLI Options
|
|
60
|
+
|
|
61
|
+
| Option | Description |
|
|
62
|
+
|--------|-------------|
|
|
63
|
+
| `input` | Input PDF file to process |
|
|
64
|
+
| `-o, --output` | Output directory (default: current directory) |
|
|
65
|
+
| `--model` | Model version (default: 0.1.0-base) |
|
|
66
|
+
| `--batch-size` | Batch size for processing (auto-detected) |
|
|
67
|
+
| `--full-precision` | Use FP32 instead of BF16 |
|
|
68
|
+
| `--no-markdown` | Disable markdown post-processing |
|
|
69
|
+
| `--pages` | Zero-indexed pages to process; ranges are inclusive (e.g., '0-5' or '1,3,5') |
|
|
70
|
+
|
|
71
|
+
## Python API
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
from nougat_wrapper import NougatOCR
|
|
75
|
+
from pathlib import Path
|
|
76
|
+
|
|
77
|
+
# Initialize (loads model to GPU automatically)
|
|
78
|
+
ocr = NougatOCR()
|
|
79
|
+
|
|
80
|
+
# Extract text from PDF
|
|
81
|
+
result = ocr.extract_text(Path("paper.pdf"))
|
|
82
|
+
|
|
83
|
+
print(f"Extracted {result.pages} pages")
|
|
84
|
+
print(f"Failed pages: {result.placeholder_pages}")
|
|
85
|
+
print(result.text) # Markdown output
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### Advanced Usage
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
ocr = NougatOCR(
|
|
92
|
+
model_tag="0.1.0-small", # Use smaller model
|
|
93
|
+
batch_size=4, # Process 4 pages at once
|
|
94
|
+
full_precision=True, # Use FP32 instead of BF16
|
|
95
|
+
)
|
|
96
|
+
|
|
97
|
+
# Only OCR pages 0, 1, 2 (zero-indexed)
|
|
98
|
+
result = ocr.extract_text(pdf_path, pages=[0, 1, 2])
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
## Requirements
|
|
102
|
+
|
|
103
|
+
- Python 3.11+
|
|
104
|
+
- GPU recommended (CUDA or Apple Metal)
|
|
105
|
+
- ~1.3 GB for model weights (auto-downloaded)
|
|
106
|
+
|
|
107
|
+
## License
|
|
108
|
+
|
|
109
|
+
MIT
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Basic usage example for nougat-ocr-cli."""
|
|
3
|
+
|
|
4
|
+
from nougat_wrapper import NougatOCR
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
import sys
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def main():
    """Run Nougat OCR on one PDF and save the text next to it as Markdown.

    The PDF path is read from ``sys.argv[1]``. The extracted text is written
    to the same location with the suffix replaced by ``.md``.

    Exits with status 1 (via ``sys.exit``) when no argument is given or the
    path does not exist.
    """
    if len(sys.argv) < 2:
        # Diagnostics go to stderr so stdout stays clean for redirection.
        print("Usage: python basic_usage.py <pdf_file>", file=sys.stderr)
        sys.exit(1)

    pdf_path = Path(sys.argv[1])
    if not pdf_path.exists():
        print(f"Error: {pdf_path} not found", file=sys.stderr)
        sys.exit(1)

    print("Initializing Nougat OCR...")
    ocr = NougatOCR()

    print(f"Processing {pdf_path.name}...")
    result = ocr.extract_text(pdf_path)

    separator = "=" * 60
    print(f"\n{separator}")
    print("Results:")
    print(f" Pages processed: {result.pages}")
    print(f" Pages with issues: {result.placeholder_pages}")
    print(f" Text length: {len(result.text):,} characters")
    print(f"{separator}\n")

    # Save to markdown file. The encoding is explicit because the platform
    # default (e.g. cp1252 on Windows) can fail on non-ASCII OCR output;
    # this matches what the packaged CLI does.
    output_path = pdf_path.with_suffix(".md")
    output_path.write_text(result.text, encoding="utf-8")
    print(f"Saved to: {output_path}")


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Batch processing example for nougat-ocr-cli."""
|
|
3
|
+
|
|
4
|
+
from nougat_wrapper import NougatOCR
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
import sys
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def main():
    """OCR every ``*.pdf`` directly inside a directory and save Markdown files.

    The directory is read from ``sys.argv[1]``. Results are written to an
    ``ocr_output`` sub-directory, one ``<stem>.md`` per PDF. A failure on one
    file is reported and the remaining files are still processed.

    Exits with status 1 (via ``sys.exit``) when no argument is given, the
    argument is not a directory, or the directory contains no PDF files.
    """
    if len(sys.argv) < 2:
        # Diagnostics go to stderr so stdout stays clean for redirection.
        print("Usage: python batch_processing.py <directory>", file=sys.stderr)
        print("Processes all PDF files in the specified directory.", file=sys.stderr)
        sys.exit(1)

    input_dir = Path(sys.argv[1])
    if not input_dir.is_dir():
        print(f"Error: {input_dir} is not a directory", file=sys.stderr)
        sys.exit(1)

    # Find all PDFs; sorted so the processing order is deterministic
    # (glob order depends on the file system).
    pdf_files = sorted(input_dir.glob("*.pdf"))
    if not pdf_files:
        print(f"No PDF files found in {input_dir}", file=sys.stderr)
        sys.exit(1)

    print(f"Found {len(pdf_files)} PDF files")
    print("Initializing Nougat OCR...")
    ocr = NougatOCR()

    # Create output directory
    output_dir = input_dir / "ocr_output"
    output_dir.mkdir(exist_ok=True)

    # Process each PDF
    total_pages = 0
    total_failed = 0
    files_ok = 0

    for i, pdf_path in enumerate(pdf_files, 1):
        print(f"\n[{i}/{len(pdf_files)}] Processing {pdf_path.name}...")
        output_path = output_dir / f"{pdf_path.stem}.md"

        try:
            result = ocr.extract_text(pdf_path)
            # Explicit encoding: the platform default can fail on
            # non-ASCII OCR output.
            output_path.write_text(result.text, encoding="utf-8")
        except Exception as e:
            # Deliberate best-effort: one bad file must not abort the batch.
            print(f" ✗ Error: {e}")
            continue

        files_ok += 1
        total_pages += result.pages
        total_failed += result.placeholder_pages

        print(f" ✓ Pages: {result.pages}, Failed: {result.placeholder_pages}")
        print(f" Saved to: {output_path}")

    # Summary. Only files that were actually converted count as processed.
    separator = "=" * 60
    print(f"\n{separator}")
    print("Batch Processing Summary:")
    print(f" Files processed: {files_ok} of {len(pdf_files)}")
    print(f" Total pages: {total_pages}")
    print(f" Failed pages: {total_failed}")
    print(f" Output directory: {output_dir}")
    print(separator)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "nougat-ocr-cli"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Simple CLI wrapper for Nougat OCR with GPU acceleration support"
|
|
5
|
+
authors = [
|
|
6
|
+
{name = "Ruben Fernandez-Fuertes", email = "fernandezfuertesruben@gmail.com"}
|
|
7
|
+
]
|
|
8
|
+
readme = "README.md"
|
|
9
|
+
requires-python = ">=3.11"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
keywords = ["ocr", "pdf", "nougat", "document", "extraction", "cli", "gpu"]
|
|
12
|
+
classifiers = [
|
|
13
|
+
"Development Status :: 4 - Beta",
|
|
14
|
+
"Environment :: Console",
|
|
15
|
+
"Intended Audience :: Developers",
|
|
16
|
+
"Intended Audience :: Science/Research",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Operating System :: OS Independent",
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"Programming Language :: Python :: 3.11",
|
|
21
|
+
"Programming Language :: Python :: 3.12",
|
|
22
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
23
|
+
"Topic :: Text Processing :: General",
|
|
24
|
+
"Topic :: Utilities",
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
dependencies = [
|
|
28
|
+
"torch>=2.0.0",
|
|
29
|
+
"nougat-ocr>=0.1.17",
|
|
30
|
+
"transformers>=4.30.0",
|
|
31
|
+
"albumentations==1.3.1",
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
[project.optional-dependencies]
|
|
35
|
+
dev = [
|
|
36
|
+
"pytest>=7.0.0",
|
|
37
|
+
"pytest-cov>=4.0.0",
|
|
38
|
+
"ruff>=0.1.0",
|
|
39
|
+
]
|
|
40
|
+
|
|
41
|
+
[project.scripts]
|
|
42
|
+
nougat-ocr-cli = "nougat_wrapper.cli:main"
|
|
43
|
+
|
|
44
|
+
[project.urls]
|
|
45
|
+
Homepage = "https://github.com/rubenffuertes/nougat-ocr-cli"
|
|
46
|
+
Repository = "https://github.com/rubenffuertes/nougat-ocr-cli"
|
|
47
|
+
Issues = "https://github.com/rubenffuertes/nougat-ocr-cli/issues"
|
|
48
|
+
|
|
49
|
+
[build-system]
|
|
50
|
+
requires = ["hatchling"]
|
|
51
|
+
build-backend = "hatchling.build"
|
|
52
|
+
|
|
53
|
+
[tool.hatch.build.targets.wheel]
|
|
54
|
+
packages = ["src/nougat_wrapper"]
|
|
55
|
+
|
|
56
|
+
[dependency-groups]
|
|
57
|
+
dev = [
|
|
58
|
+
"pytest>=7.0.0",
|
|
59
|
+
"pytest-cov>=4.0.0",
|
|
60
|
+
]
|
|
61
|
+
|
|
62
|
+
[tool.ruff]
|
|
63
|
+
line-length = 100
|
|
64
|
+
target-version = "py311"
|
|
65
|
+
|
|
66
|
+
[tool.ruff.lint]
|
|
67
|
+
select = ["E", "F", "I", "N", "W"]
|
|
68
|
+
ignore = []
|
|
69
|
+
|
|
70
|
+
[tool.pytest.ini_options]
|
|
71
|
+
testpaths = ["tests"]
|
|
72
|
+
python_files = ["test_*.py"]
|
|
73
|
+
python_functions = ["test_*"]
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""Command-line interface for Nougat OCR."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import sys
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from nougat_wrapper.core import NougatOCR
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def main(argv: list[str] | None = None) -> int:
    """Main entry point for the CLI.

    Parses the command line, loads the Nougat model, runs OCR on the input
    and writes ``<input stem>.md`` into the output directory.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``. ``None``
            (the default, used by the console-script entry point) keeps the
            original behaviour of reading the process arguments.

    Returns:
        ``0`` on success, ``1`` when the input path does not exist or the
        ``--pages`` value cannot be parsed.
    """
    parser = argparse.ArgumentParser(
        prog="nougat-ocr-cli",
        description="Extract text from PDFs using Nougat OCR with GPU acceleration.",
    )
    parser.add_argument(
        "input",
        type=Path,
        help="Input PDF file or image to process",
    )
    parser.add_argument(
        "-o",
        "--output",
        type=Path,
        help="Output directory for markdown files (default: current directory)",
        default=Path.cwd(),
    )
    parser.add_argument(
        "--model",
        type=str,
        default="0.1.0-base",
        help="Model version to use (default: 0.1.0-base)",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=None,
        help="Batch size for processing (auto-detected if not specified)",
    )
    parser.add_argument(
        "--full-precision",
        action="store_true",
        help="Use FP32 instead of BF16 (slower but more accurate)",
    )
    parser.add_argument(
        "--no-markdown",
        action="store_true",
        help="Disable markdown post-processing",
    )
    parser.add_argument(
        "--pages",
        type=str,
        default=None,
        help="Page range to process (e.g., '0-5' or '1,3,5')",
    )

    args = parser.parse_args(argv)

    # Validate input before the (slow) model load.
    if not args.input.exists():
        print(f"Error: Input file not found: {args.input}", file=sys.stderr)
        return 1

    # Parse pages if specified. A malformed value is reported as a normal
    # CLI error instead of escaping as a traceback.
    pages = None
    if args.pages:
        try:
            pages = _parse_pages(args.pages)
        except ValueError as exc:
            print(f"Error: Invalid --pages value {args.pages!r}: {exc}", file=sys.stderr)
            return 1

    # Initialize OCR
    print(f"Loading Nougat model ({args.model})...")
    ocr = NougatOCR(
        model_tag=args.model,
        batch_size=args.batch_size,
        markdown=not args.no_markdown,
        full_precision=args.full_precision,
    )

    # Process the file
    print(f"Processing: {args.input}")
    result = ocr.extract_text(args.input, pages=pages)

    # Write output
    args.output.mkdir(parents=True, exist_ok=True)
    output_file = args.output / f"{args.input.stem}.md"
    output_file.write_text(result.text, encoding="utf-8")

    print(f"Extracted {result.pages} pages ({result.placeholder_pages} failed)")
    print(f"Output: {output_file}")

    return 0
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _parse_pages(pages_str: str) -> list[int]:
|
|
96
|
+
"""Parse page specification string into list of page numbers."""
|
|
97
|
+
pages = []
|
|
98
|
+
for part in pages_str.split(","):
|
|
99
|
+
part = part.strip()
|
|
100
|
+
if "-" in part:
|
|
101
|
+
start, end = part.split("-", 1)
|
|
102
|
+
pages.extend(range(int(start), int(end) + 1))
|
|
103
|
+
else:
|
|
104
|
+
pages.append(int(part))
|
|
105
|
+
return pages
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
if __name__ == "__main__":
    # When run as a script, use main()'s return value as the exit status.
    raise SystemExit(main())
|