gemini-ocr-cli 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gemini_ocr/utils.py ADDED
@@ -0,0 +1,193 @@
1
+ """Utility functions for Gemini OCR CLI."""
2
+
3
+ import json
4
+ import logging
5
+ import re
6
+ from datetime import datetime
7
+ from pathlib import Path
8
+ from typing import Any, Dict, List, Optional
9
+
10
+ import fitz # PyMuPDF
11
+ from PIL import Image
12
+
13
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Lowercase file suffixes (with the leading dot) accepted as input, grouped
# by kind; SUPPORTED_EXTENSIONS is the union of both groups.
SUPPORTED_IMAGES = {
    ".jpg",
    ".jpeg",
    ".png",
    ".webp",
    ".gif",
    ".bmp",
    ".tiff",
    ".tif",
}
SUPPORTED_DOCUMENTS = {".pdf"}
SUPPORTED_EXTENSIONS = SUPPORTED_IMAGES.union(SUPPORTED_DOCUMENTS)
19
+
20
+
21
def setup_logging(level: str = "INFO", verbose: bool = False) -> None:
    """Set up root logging via ``logging.basicConfig``.

    Args:
        level: Name of a logging level (case-insensitive). Names that are not
            attributes of the ``logging`` module fall back to ``INFO``.
        verbose: When true, ``DEBUG`` is used and ``level`` is ignored.
    """
    if verbose:
        effective_level = logging.DEBUG
    else:
        effective_level = getattr(logging, level.upper(), logging.INFO)

    message_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    logging.basicConfig(
        level=effective_level,
        format=message_format,
        datefmt=timestamp_format,
    )
29
+
30
+
31
def is_supported_file(file_path: Path) -> bool:
    """Return True when the path's suffix is in ``SUPPORTED_EXTENSIONS``.

    The suffix is lowercased before the lookup, so the check is
    case-insensitive.
    """
    suffix = file_path.suffix.lower()
    return suffix in SUPPORTED_EXTENSIONS
34
+
35
+
36
def is_image_file(file_path: Path) -> bool:
    """Return True when the path's suffix is in ``SUPPORTED_IMAGES``.

    The suffix is lowercased before the lookup, so the check is
    case-insensitive.
    """
    suffix = file_path.suffix.lower()
    return suffix in SUPPORTED_IMAGES
39
+
40
+
41
def is_pdf_file(file_path: Path) -> bool:
    """Return True when the path's suffix is in ``SUPPORTED_DOCUMENTS``.

    The suffix is lowercased before the lookup, so the check is
    case-insensitive.
    """
    suffix = file_path.suffix.lower()
    return suffix in SUPPORTED_DOCUMENTS
44
+
45
+
46
def get_supported_files(directory: Path, recursive: bool = True) -> List[Path]:
    """Collect the supported regular files found under ``directory``.

    Args:
        directory: Directory to search.
        recursive: When true, subdirectories are searched as well
            (glob pattern ``**/*``); otherwise only direct children (``*``).

    Returns:
        The matching paths as a sorted list.
    """
    if recursive:
        glob_pattern = "**/*"
    else:
        glob_pattern = "*"

    candidates = directory.glob(glob_pattern)
    return sorted(
        entry for entry in candidates if entry.is_file() and is_supported_file(entry)
    )
54
+
55
+
56
def sanitize_filename(filename: str, max_length: Optional[int] = 200) -> str:
    """Sanitize a filename for safe filesystem usage.

    Every run of characters that is invalid in filenames (``<>:"/\\|?*``),
    whitespace, or underscores is collapsed into a single ``_``. Leading and
    trailing underscores are removed.

    Args:
        filename: The raw name to clean up.
        max_length: Maximum length of the result. ``None`` or ``0`` disables
            truncation.

    Returns:
        The sanitized name, or ``"unnamed"`` if nothing usable remains.
    """
    # One pass is equivalent to replacing invalid characters, then whitespace
    # runs, then collapsing repeated underscores.
    sanitized = re.sub(r'[<>:"/\\|?*\s_]+', "_", filename).strip("_")

    if max_length and len(sanitized) > max_length:
        # The cut can land right after a separator; strip again so the result
        # never ends with a dangling underscore.
        sanitized = sanitized[:max_length].rstrip("_")

    return sanitized or "unnamed"
68
+
69
+
70
def format_file_size(size_bytes: int) -> str:
    """Render a byte count as a human-readable string with one decimal place.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit (TB) is reached, e.g. ``1536`` becomes ``"1.5 KB"``.
    """
    units = ("B", "KB", "MB", "GB", "TB")
    last_index = len(units) - 1

    amount = size_bytes
    unit_index = 0
    while unit_index < last_index and not (amount < 1024):
        amount = amount / 1024
        unit_index += 1

    return f"{amount:.1f} {units[unit_index]}"
77
+
78
+
79
def determine_output_path(
    input_path: Path,
    output_path: Optional[Path] = None,
    add_timestamp: bool = False,
) -> Path:
    """Resolve the output directory and make sure it exists.

    Args:
        input_path: The file or directory being processed.
        output_path: Explicit output directory. When not given, a
            ``gemini_ocr_output`` directory is used, placed next to
            ``input_path`` if it is a file, or inside it otherwise.
        add_timestamp: When true, ``_YYYYmmdd_HHMMSS`` (current local time)
            is appended to the directory name.

    Returns:
        The output directory, created together with any missing parents.
    """
    if output_path:
        target = output_path
    else:
        anchor = input_path.parent if input_path.is_file() else input_path
        target = anchor / "gemini_ocr_output"

    if add_timestamp:
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        stamped_name = f"{target.name}_{stamp}"
        target = target.parent / stamped_name

    target.mkdir(parents=True, exist_ok=True)
    return target
98
+
99
+
100
def pdf_to_images(
    pdf_path: Path,
    dpi: int = 200,
    pages: Optional[List[int]] = None,
) -> List[Image.Image]:
    """Convert PDF pages to PIL Images.

    Args:
        pdf_path: Path of the PDF to render.
        dpi: Target resolution; pages are scaled by ``dpi / 72``.
        pages: 0-based page indices to render. ``None`` or an empty list
            renders every page. Indices at or beyond the page count are
            skipped with a warning.

    Returns:
        One RGB image per rendered page, in the order requested.
    """
    doc = fitz.open(pdf_path)
    try:
        page_count = len(doc)
        page_indices = pages if pages else range(page_count)

        # PDF user space is 72 DPI, so the zoom factor is the same for every
        # page; build the matrix once instead of per iteration.
        zoom = dpi / 72
        matrix = fitz.Matrix(zoom, zoom)

        images = []
        for page_idx in page_indices:
            if page_idx >= page_count:
                logger.warning(f"Page {page_idx} out of range, skipping")
                continue

            pix = doc[page_idx].get_pixmap(matrix=matrix)

            # Convert to PIL Image
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            images.append(img)

        return images
    finally:
        # Release the document even when rendering a page raises.
        doc.close()
128
+
129
+
130
def extract_pdf_images(pdf_path: Path) -> List[Dict[str, Any]]:
    """Extract embedded images from PDF.

    Extraction is best-effort: an image that cannot be extracted is logged
    as a warning and skipped.

    Args:
        pdf_path: Path of the PDF to scan.

    Returns:
        One dict per extracted image with the keys ``page`` and ``index``
        (both 1-based), ``data``, ``ext``, ``width`` and ``height``.
    """
    doc = fitz.open(pdf_path)
    try:
        extracted = []

        for page_number, page in enumerate(doc, start=1):
            for image_number, img_info in enumerate(page.get_images(), start=1):
                xref = img_info[0]
                try:
                    base_image = doc.extract_image(xref)
                    extracted.append(
                        {
                            "page": page_number,
                            "index": image_number,
                            "data": base_image["image"],
                            "ext": base_image["ext"],
                            "width": base_image.get("width"),
                            "height": base_image.get("height"),
                        }
                    )
                except Exception as e:
                    # Report the same 1-based numbers that the returned
                    # records use, so the warning matches the output.
                    logger.warning(
                        f"Failed to extract image {image_number} from page {page_number}: {e}"
                    )

        return extracted
    finally:
        # Release the document even when listing a page's images raises.
        doc.close()
158
+
159
+
160
def load_metadata(output_dir: Path) -> Dict[str, Any]:
    """Load existing metadata from output directory.

    Args:
        output_dir: Directory expected to contain ``metadata.json``.

    Returns:
        The stored metadata. The keys ``files_processed``, ``errors`` and
        ``total_processing_time`` are always present: they are filled with
        empty defaults when the file is missing or lacks them.

    Raises:
        ValueError: If the file is not valid JSON (``json.JSONDecodeError``
            is a ``ValueError``) or its top-level value is not an object.
    """
    metadata: Dict[str, Any] = {
        "files_processed": [],
        "errors": [],
        "total_processing_time": 0,
    }
    metadata_path = output_dir / "metadata.json"

    # Open directly instead of checking exists() first, so a file removed in
    # between cannot cause an unexpected failure.
    try:
        with open(metadata_path, "r", encoding="utf-8") as f:
            loaded = json.load(f)
    except FileNotFoundError:
        return metadata

    if not isinstance(loaded, dict):
        raise ValueError(f"Expected a JSON object in {metadata_path}")

    # Stored values win; the defaults only fill in keys that are missing.
    metadata.update(loaded)
    return metadata
167
+
168
+
169
def save_metadata(
    output_dir: Path,
    processed_files: List[Dict],
    processing_time: float,
    errors: List[Dict],
) -> None:
    """Save processing metadata to JSON file.

    The new results are merged into whatever ``load_metadata`` returns for
    ``output_dir`` and written back to ``metadata.json``.

    Args:
        output_dir: Directory holding ``metadata.json``.
        processed_files: Records for newly processed files; each must have a
            ``"file"`` key. A record whose file is already recorded (or that
            repeats earlier in this list) is skipped.
        processing_time: Added to the accumulated total.
        errors: Error records appended to the stored list.
    """
    metadata_path = output_dir / "metadata.json"

    # Load existing metadata and merge
    existing = load_metadata(output_dir)
    recorded = existing.setdefault("files_processed", [])

    # Compare on the string form: values are written with ``default=str``, so
    # a Path in a new record must match the string read back from disk.
    seen = {str(item["file"]) for item in recorded}

    # Add new processed files
    for item in processed_files:
        file_key = str(item["file"])
        if file_key not in seen:
            # Track what was added so repeats within this batch are skipped.
            seen.add(file_key)
            recorded.append(item)

    # Add new errors
    existing.setdefault("errors", []).extend(errors)
    existing["total_processing_time"] = (
        existing.get("total_processing_time", 0) + processing_time
    )
    existing["last_updated"] = datetime.now().isoformat()

    with open(metadata_path, "w", encoding="utf-8") as f:
        json.dump(existing, f, indent=2, default=str)
@@ -0,0 +1,193 @@
1
+ Metadata-Version: 2.4
2
+ Name: gemini-ocr-cli
3
+ Version: 0.2.0
4
+ Summary: CLI tool for OCR processing using Google Gemini's vision capabilities
5
+ Project-URL: Homepage, https://github.com/r-uben/gemini-ocr-cli
6
+ Project-URL: Repository, https://github.com/r-uben/gemini-ocr-cli
7
+ Project-URL: Issues, https://github.com/r-uben/gemini-ocr-cli/issues
8
+ Author-email: Ruben Fernandez-Fuertes <fernandezfuertesruben@gmail.com>
9
+ License: MIT
10
+ License-File: LICENSE
11
+ Keywords: cli,document-processing,gemini,google,ocr,pdf,vision
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Classifier: Topic :: Text Processing :: General
23
+ Requires-Python: >=3.10
24
+ Requires-Dist: click>=8.1.0
25
+ Requires-Dist: google-genai>=1.0.0
26
+ Requires-Dist: pillow>=10.0.0
27
+ Requires-Dist: pydantic-settings>=2.0.0
28
+ Requires-Dist: pydantic>=2.0.0
29
+ Requires-Dist: pymupdf>=1.24.0
30
+ Requires-Dist: python-dotenv>=1.0.0
31
+ Requires-Dist: rich>=13.0.0
32
+ Provides-Extra: dev
33
+ Requires-Dist: mypy>=1.0.0; extra == 'dev'
34
+ Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
35
+ Requires-Dist: pytest>=8.0.0; extra == 'dev'
36
+ Requires-Dist: ruff>=0.8.0; extra == 'dev'
37
+ Description-Content-Type: text/markdown
38
+
39
+ # Gemini OCR CLI
40
+
41
+ Command-line tool for OCR processing using Google Gemini's vision capabilities. Extract text, tables, equations, and figures from PDFs and images with high accuracy.
42
+
43
+ ## Features
44
+
45
+ - **Native PDF upload**: Direct PDF processing via Gemini Files API (fast, single API call)
46
+ - **Multi-format support**: PDF and images (JPG, PNG, WEBP, GIF, BMP, TIFF)
47
+ - **High-quality OCR**: Leverages Gemini's advanced vision models
48
+ - **Structure preservation**: Maintains headings, tables, lists, equations
49
+ - **Figure analysis**: Generate detailed descriptions of charts and diagrams
50
+ - **Batch processing**: Process entire directories with progress tracking
51
+ - **Incremental processing**: Skip already-processed files
52
+ - **Automatic retry**: Exponential backoff for API rate limits
53
+ - **Markdown output**: Clean, structured output format
54
+
55
+ ## Installation
56
+
57
+ ### From PyPI (recommended)
58
+
59
+ ```bash
60
+ pip install gemini-ocr-cli
61
+ ```
62
+
63
+ ### Using pipx
64
+
65
+ ```bash
66
+ pipx install gemini-ocr-cli
67
+ ```
68
+
69
+ ### From source
70
+
71
+ ```bash
72
+ git clone https://github.com/r-uben/gemini-ocr-cli.git
73
+ cd gemini-ocr-cli
74
+ uv pip install -e .
75
+ ```
76
+
77
+ ## Quick Start
78
+
79
+ ### API Key Resolution
80
+
81
+ The CLI resolves your API key automatically from the sources listed below; no extra configuration is needed if one of the environment variables is already set:
82
+
83
+ **Priority order:**
84
+ 1. `--api-key` CLI argument (highest priority)
85
+ 2. `GEMINI_API_KEY` environment variable
86
+ 3. `GOOGLE_API_KEY` environment variable (fallback)
87
+ 4. `.env` file in current directory
88
+
89
+ ```bash
90
+ # Option 1: Set environment variable (recommended)
91
+ export GEMINI_API_KEY="your-api-key"
92
+
93
+ # Option 2: Use existing GOOGLE_API_KEY (auto-detected)
94
+ export GOOGLE_API_KEY="your-api-key"
95
+
96
+ # Option 3: Create a .env file
97
+ echo "GEMINI_API_KEY=your-api-key" > .env
98
+
99
+ # Option 4: Pass directly (not recommended: the key ends up in shell history and process listings)
100
+ gemini-ocr paper.pdf --api-key "your-api-key"
101
+ ```
102
+
103
+ ### Process documents
104
+
105
+ ```bash
106
+ # Single file
107
+ gemini-ocr paper.pdf
108
+
109
+ # Directory
110
+ gemini-ocr ./documents/ -o ./results/
111
+
112
+ # With custom model
113
+ gemini-ocr paper.pdf --model gemini-1.5-pro
114
+ ```
115
+
116
+ ### Describe figures
117
+
118
+ ```bash
119
+ # Analyze a chart/diagram
120
+ gemini-ocr describe chart.png
121
+
122
+ # Save to file
123
+ gemini-ocr describe figure.jpg -o description.md
124
+ ```
125
+
126
+ ## CLI Reference
127
+
128
+ ### `gemini-ocr process`
129
+
130
+ Process documents and images with OCR.
131
+
132
+ ```
133
+ Usage: gemini-ocr process [OPTIONS] INPUT_PATH
134
+
135
+ Options:
136
+ -o, --output-dir PATH Output directory for results
137
+ --api-key TEXT Gemini API key
138
+ --model TEXT Model to use (default: gemini-3.0-flash)
139
+ --task [convert|extract|table] OCR task type (default: convert)
140
+ --prompt TEXT Custom prompt for OCR
141
+ --include-images/--no-images Extract embedded images (default: True)
142
+ --save-originals/--no-save-originals
143
+ Save original input images (default: True)
144
+ --add-timestamp/--no-timestamp Add timestamp to output folder
145
+ --reprocess Reprocess existing files
146
+ --env-file PATH Path to .env file
147
+ -v, --verbose Enable verbose output
148
+ ```
149
+
150
+ ### `gemini-ocr describe`
151
+
152
+ Generate detailed descriptions of figures, charts, and diagrams.
153
+
154
+ ```
155
+ Usage: gemini-ocr describe [OPTIONS] IMAGE_PATH
156
+
157
+ Options:
158
+ --api-key TEXT Gemini API key
159
+ --model TEXT Model to use
160
+ -o, --output PATH Output file (default: stdout)
161
+ ```
162
+
163
+ ### `gemini-ocr info`
164
+
165
+ Show configuration and system information.
166
+
167
+ ## Output Format
168
+
169
+ Results are saved as Markdown files with:
170
+ - File metadata (original path, processing time)
171
+ - Extracted text (full document)
172
+ - Embedded image references (if enabled)
173
+ - A `metadata.json` file in the output directory, tracking all processed files
174
+
175
+ ## Models
176
+
177
+ | Model | Speed | Quality | Cost | Recommended For |
178
+ |-------|-------|---------|------|-----------------|
179
+ | `gemini-3.0-flash` | Fast | Good | Low | Default, most documents |
180
+ | `gemini-1.5-flash` | Fast | Good | Low | Simple documents |
181
+ | `gemini-1.5-pro` | Slower | Best | Higher | Complex layouts, equations |
182
+
183
+ ## Environment Variables
184
+
185
+ | Variable | Description | Default |
186
+ |----------|-------------|---------|
187
+ | `GEMINI_API_KEY` | Google Gemini API key | Required |
188
+ | `GOOGLE_API_KEY` | Fallback API key | - |
189
+ | `GEMINI_MODEL` | Default model | `gemini-3.0-flash` |
190
+
191
+ ## License
192
+
193
+ MIT
@@ -0,0 +1,12 @@
1
+ gemini_ocr/__init__.py,sha256=byxnwK8svMrrvH4nt_-IM-HE6V6PSO-7X2XP4JD1-aA,246
2
+ gemini_ocr/__main__.py,sha256=VcR6YUnMQeOJ2iNJN3-NcJuBbZh3eqRNlJZOlHnQEFU,117
3
+ gemini_ocr/cli.py,sha256=xcDVQy-Vu6GlTynTOfgqNiSTG2hD7UasF0aeZQcZ4a4,10566
4
+ gemini_ocr/config.py,sha256=J1dwYUCmYvPspQplvcFglx6-UXXFxRdmGCdkjX62IDQ,3148
5
+ gemini_ocr/processor.py,sha256=VxHU_tm8Y_y1GXr55jLa0J28nSzHWzj8_yJzVphRSBY,19105
6
+ gemini_ocr/retry.py,sha256=PyTXCQsgJNQBz7J5_Lb26nLx28ZpKwXN9PE1bQXchL4,3174
7
+ gemini_ocr/utils.py,sha256=GCe4BA_-uLYRY-O9_PvBIttjr3qj0ja5l0O3aCU1qfo,6073
8
+ gemini_ocr_cli-0.2.0.dist-info/METADATA,sha256=TrasZjXVPpMVXb3tbUZbf0TMFEtR4rcujgWi40LGjdE,5809
9
+ gemini_ocr_cli-0.2.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
10
+ gemini_ocr_cli-0.2.0.dist-info/entry_points.txt,sha256=uMDGvtr5S_VF8PyZAtftEjg5V9ziXVNV-4xWVanHv6U,51
11
+ gemini_ocr_cli-0.2.0.dist-info/licenses/LICENSE,sha256=ijREx9a6EP9kmqF9PfuDMsqa_D53H1NUNgDd65R86_o,1080
12
+ gemini_ocr_cli-0.2.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.28.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ gemini-ocr = gemini_ocr.cli:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Ruben Fernandez-Fuertes
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.