PyPI - gemini-ocr-cli - Versions diffs - 0.2.0__py3-none-any.whl - Mend

gemini-ocr-cli 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

gemini_ocr/__init__.py +8 -0
gemini_ocr/__main__.py +6 -0
gemini_ocr/cli.py +367 -0
gemini_ocr/config.py +106 -0
gemini_ocr/processor.py +560 -0
gemini_ocr/retry.py +104 -0
gemini_ocr/utils.py +193 -0
gemini_ocr_cli-0.2.0.dist-info/METADATA +193 -0
gemini_ocr_cli-0.2.0.dist-info/RECORD +12 -0
gemini_ocr_cli-0.2.0.dist-info/WHEEL +4 -0
gemini_ocr_cli-0.2.0.dist-info/entry_points.txt +2 -0
gemini_ocr_cli-0.2.0.dist-info/licenses/LICENSE +21 -0

gemini_ocr/utils.py ADDED Viewed

@@ -0,0 +1,193 @@
+"""Utility functions for Gemini OCR CLI."""
+import json
+import logging
+import re
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+import fitz  # PyMuPDF
+from PIL import Image
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Supported file extensions: lowercase and including the leading dot, so they
# can be compared directly against ``Path.suffix.lower()``.
SUPPORTED_IMAGES = {
    ".jpg",
    ".jpeg",
    ".png",
    ".webp",
    ".gif",
    ".bmp",
    ".tiff",
    ".tif",
}
SUPPORTED_DOCUMENTS = {".pdf"}
# Every extension the tool accepts (images plus documents).
SUPPORTED_EXTENSIONS = SUPPORTED_IMAGES.union(SUPPORTED_DOCUMENTS)
def setup_logging(level: str = "INFO", verbose: bool = False) -> None:
    """Configure logging for the application via ``logging.basicConfig``.

    Args:
        level: Name of a standard logging level, matched case-insensitively
            (e.g. ``"INFO"`` or ``"warning"``). Names that do not resolve to
            an integer level fall back to ``INFO``.
        verbose: When true, ``DEBUG`` is used regardless of ``level``.
    """
    if verbose:
        log_level = logging.DEBUG
    else:
        # getattr() can resolve to module attributes that are not levels
        # (``logging.BASIC_FORMAT`` is a string, for instance). Only accept
        # integer values so basicConfig() never receives a bogus level.
        candidate = getattr(logging, level.upper(), None)
        log_level = candidate if isinstance(candidate, int) else logging.INFO
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
def is_supported_file(file_path: Path) -> bool:
    """Return True when the path's extension is one the tool accepts.

    The extension is lower-cased before the lookup, so the check is
    case-insensitive.
    """
    extension = file_path.suffix.lower()
    return extension in SUPPORTED_EXTENSIONS
def is_image_file(file_path: Path) -> bool:
    """Return True when the path's extension is a supported image type.

    The extension is lower-cased before the lookup, so the check is
    case-insensitive.
    """
    extension = file_path.suffix.lower()
    return extension in SUPPORTED_IMAGES
def is_pdf_file(file_path: Path) -> bool:
    """Return True when the path's extension is a supported document type.

    The extension is lower-cased before the lookup, so the check is
    case-insensitive.
    """
    extension = file_path.suffix.lower()
    return extension in SUPPORTED_DOCUMENTS
def get_supported_files(directory: Path, recursive: bool = True) -> List[Path]:
    """Collect the supported files found under a directory.

    Args:
        directory: Directory to search.
        recursive: When true, descend into subdirectories; otherwise only
            direct children are considered.

    Returns:
        Sorted list of paths that are regular files with a supported
        extension.
    """
    if recursive:
        glob_pattern = "**/*"
    else:
        glob_pattern = "*"
    return sorted(
        candidate
        for candidate in directory.glob(glob_pattern)
        if candidate.is_file() and is_supported_file(candidate)
    )
def sanitize_filename(filename: str, max_length: Optional[int] = 200) -> str:
    """Sanitize a filename for safe filesystem usage.

    Reserved punctuation (``<>:"/\\|?*``), ASCII control characters and runs
    of whitespace are replaced by a single underscore; leading and trailing
    underscores are removed.

    Args:
        filename: Raw name to clean.
        max_length: Maximum length of the result. ``None`` or ``0`` disables
            truncation.

    Returns:
        The cleaned name, or ``"unnamed"`` when nothing usable remains
        (empty result, ``"."`` or ``".."``).
    """
    # Control characters (NUL, DEL, ...) are as unsafe as the reserved
    # punctuation, so they share the same replacement.
    sanitized = re.sub(r'[<>:"/\\|?*\x00-\x1f\x7f]', "_", filename)
    sanitized = re.sub(r"\s+", "_", sanitized)
    sanitized = re.sub(r"_+", "_", sanitized)
    sanitized = sanitized.strip("_")
    if max_length and len(sanitized) > max_length:
        # The cut may land right after an underscore; strip it again so the
        # result never ends with a dangling separator.
        sanitized = sanitized[:max_length].rstrip("_")
    # "." and ".." are directory references, not usable file names.
    if sanitized in {".", ".."}:
        return "unnamed"
    return sanitized or "unnamed"
def format_file_size(size_bytes: int) -> str:
    """Render a byte count as a human-readable string.

    The value is divided by 1024 until it drops below 1024 or the units
    B, KB, MB and GB are exhausted, in which case it is reported in TB.
    The number is always formatted with one decimal place.
    """
    units = ("B", "KB", "MB", "GB")
    value = size_bytes
    position = 0
    while position < len(units):
        if value < 1024:
            return f"{value:.1f} {units[position]}"
        value = value / 1024
        position += 1
    # Anything that survived four divisions is reported in terabytes.
    return f"{value:.1f} TB"
def determine_output_path(
    input_path: Path,
    output_path: Optional[Path] = None,
    add_timestamp: bool = False,
) -> Path:
    """Resolve and create the directory that results are written to.

    Args:
        input_path: File or directory being processed.
        output_path: Explicit output directory. When omitted, a
            ``gemini_ocr_output`` directory is placed next to an input file
            or inside an input directory.
        add_timestamp: When true, ``_YYYYmmdd_HHMMSS`` (current local time)
            is appended to the directory name.

    Returns:
        The output directory, which exists when the function returns.
    """
    default_name = "gemini_ocr_output"
    if output_path:
        target = output_path
    else:
        anchor = input_path.parent if input_path.is_file() else input_path
        target = anchor / default_name
    if add_timestamp:
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        target = target.parent / f"{target.name}_{stamp}"
    target.mkdir(parents=True, exist_ok=True)
    return target
def pdf_to_images(
    pdf_path: Path,
    dpi: int = 200,
    pages: Optional[List[int]] = None,
) -> List[Image.Image]:
    """Render PDF pages to PIL images.

    Args:
        pdf_path: Path of the PDF to open.
        dpi: Target rendering resolution; pages are scaled by ``dpi / 72``.
        pages: Zero-based page indices to render. ``None`` or an empty list
            renders every page. Indices at or beyond the page count are
            skipped with a warning.

    Returns:
        One ``"RGB"`` image per rendered page, in the order requested.
    """
    doc = fitz.open(pdf_path)
    # try/finally guarantees the document handle is released even when
    # rendering a page raises.
    try:
        page_count = len(doc)
        page_indices = pages if pages else range(page_count)
        # Calculate zoom factor for desired DPI (default PDF is 72 DPI).
        # Both values are loop-invariant, so compute them once.
        zoom = dpi / 72
        matrix = fitz.Matrix(zoom, zoom)
        images = []
        for page_idx in page_indices:
            if page_idx >= page_count:
                logger.warning(f"Page {page_idx} out of range, skipping")
                continue
            pix = doc[page_idx].get_pixmap(matrix=matrix)
            # Convert the rendered pixmap to a PIL Image.
            images.append(
                Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            )
        return images
    finally:
        doc.close()
def extract_pdf_images(pdf_path: Path) -> List[Dict[str, Any]]:
    """Extract embedded images from a PDF.

    Args:
        pdf_path: Path of the PDF to open.

    Returns:
        One dict per successfully extracted image with the keys ``page`` and
        ``index`` (both 1-based), ``data`` and ``ext`` (taken from the
        ``image`` and ``ext`` entries of the extraction result), and
        ``width`` / ``height`` (``None`` when the extraction result lacks
        them). Images whose extraction fails are logged and skipped.
    """
    doc = fitz.open(pdf_path)
    # try/finally guarantees the document handle is released even when
    # listing or extracting images raises unexpectedly.
    try:
        extracted = []
        for page_idx in range(len(doc)):
            page = doc[page_idx]
            for img_idx, img_info in enumerate(page.get_images()):
                # The first entry of each image record is its xref number.
                xref = img_info[0]
                try:
                    base_image = doc.extract_image(xref)
                    extracted.append(
                        {
                            "page": page_idx + 1,
                            "index": img_idx + 1,
                            "data": base_image["image"],
                            "ext": base_image["ext"],
                            "width": base_image.get("width"),
                            "height": base_image.get("height"),
                        }
                    )
                except Exception as e:
                    # Best effort: one bad image must not abort the rest.
                    logger.warning(f"Failed to extract image {img_idx} from page {page_idx}: {e}")
        return extracted
    finally:
        doc.close()
def load_metadata(output_dir: Path) -> Dict[str, Any]:
    """Load existing metadata from an output directory.

    Args:
        output_dir: Directory expected to contain ``metadata.json``.

    Returns:
        The parsed content of ``metadata.json``. When the file does not
        exist, a fresh dict with empty ``files_processed`` and ``errors``
        lists and a ``total_processing_time`` of 0 is returned. When the
        file holds a JSON object that lacks any of these keys, the missing
        ones are filled with the same defaults.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    # Built per call so callers can mutate the result safely.
    defaults: Dict[str, Any] = {
        "files_processed": [],
        "errors": [],
        "total_processing_time": 0,
    }
    metadata_path = output_dir / "metadata.json"
    if not metadata_path.exists():
        return defaults
    with open(metadata_path, "r", encoding="utf-8") as f:
        loaded = json.load(f)
    if isinstance(loaded, dict):
        # A partially written or hand-edited file may miss keys that
        # callers index unconditionally; supply them instead of failing.
        for key, value in defaults.items():
            loaded.setdefault(key, value)
    return loaded
def save_metadata(
    output_dir: Path,
    processed_files: List[Dict],
    processing_time: float,
    errors: List[Dict],
) -> None:
    """Merge processing results into ``metadata.json`` in the output directory.

    Existing metadata is loaded first. Entries of ``processed_files`` are
    appended only when their ``"file"`` value has not been recorded yet,
    ``errors`` are appended, ``processing_time`` is added to the running
    total, and ``last_updated`` is set to the current local time.

    Args:
        output_dir: Directory holding ``metadata.json``.
        processed_files: Records of newly processed files; each must have a
            ``"file"`` key.
        processing_time: Time to add to ``total_processing_time``.
        errors: Error records to append.
    """
    metadata_path = output_dir / "metadata.json"
    # Load existing metadata and merge
    existing = load_metadata(output_dir)
    seen_files = {item["file"] for item in existing["files_processed"]}
    # Add new processed files
    for item in processed_files:
        if item["file"] not in seen_files:
            existing["files_processed"].append(item)
            # Track it immediately so a file repeated within this batch is
            # not recorded twice.
            seen_files.add(item["file"])
    # Add new errors
    existing["errors"].extend(errors)
    existing["total_processing_time"] += processing_time
    existing["last_updated"] = datetime.now().isoformat()
    with open(metadata_path, "w", encoding="utf-8") as f:
        # default=str stringifies values json cannot encode natively.
        json.dump(existing, f, indent=2, default=str)

gemini_ocr_cli-0.2.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,193 @@
+Metadata-Version: 2.4
+Name: gemini-ocr-cli
+Version: 0.2.0
+Summary: CLI tool for OCR processing using Google Gemini's vision capabilities
+Project-URL: Homepage, https://github.com/r-uben/gemini-ocr-cli
+Project-URL: Repository, https://github.com/r-uben/gemini-ocr-cli
+Project-URL: Issues, https://github.com/r-uben/gemini-ocr-cli/issues
+Author-email: Ruben Fernandez-Fuertes <fernandezfuertesruben@gmail.com>
+License: MIT
+License-File: LICENSE
+Keywords: cli,document-processing,gemini,google,ocr,pdf,vision
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Topic :: Text Processing :: General
+Requires-Python: >=3.10
+Requires-Dist: click>=8.1.0
+Requires-Dist: google-genai>=1.0.0
+Requires-Dist: pillow>=10.0.0
+Requires-Dist: pydantic-settings>=2.0.0
+Requires-Dist: pydantic>=2.0.0
+Requires-Dist: pymupdf>=1.24.0
+Requires-Dist: python-dotenv>=1.0.0
+Requires-Dist: rich>=13.0.0
+Provides-Extra: dev
+Requires-Dist: mypy>=1.0.0; extra == 'dev'
+Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
+Requires-Dist: pytest>=8.0.0; extra == 'dev'
+Requires-Dist: ruff>=0.8.0; extra == 'dev'
+Description-Content-Type: text/markdown
+# Gemini OCR CLI
+Command-line tool for OCR processing using Google Gemini's vision capabilities. Extract text, tables, equations, and figures from PDFs and images with high accuracy.
+## Features
+- **Native PDF upload**: Direct PDF processing via Gemini Files API (fast, single API call)
+- **Multi-format support**: PDF and images (JPG, PNG, WEBP, GIF, BMP, TIFF)
+- **High-quality OCR**: Leverages Gemini's advanced vision models
+- **Structure preservation**: Maintains headings, tables, lists, equations
+- **Figure analysis**: Generate detailed descriptions of charts and diagrams
+- **Batch processing**: Process entire directories with progress tracking
+- **Incremental processing**: Skip already-processed files
+- **Automatic retry**: Exponential backoff for API rate limits
+- **Markdown output**: Clean, structured output format
+## Installation
+### From PyPI (recommended)
+```bash
+pip install gemini-ocr-cli
+```
+### Using pipx
+```bash
+pipx install gemini-ocr-cli
+```
+### From source
+```bash
+git clone https://github.com/r-uben/gemini-ocr-cli.git
+cd gemini-ocr-cli
+uv pip install -e .
+```
+## Quick Start
+### API Key Resolution
+The CLI resolves your API key automatically; if `GEMINI_API_KEY` or `GOOGLE_API_KEY` is already set in your environment, no further configuration is needed.
+**Priority order:**
+1. `--api-key` CLI argument (highest priority)
+2. `GEMINI_API_KEY` environment variable
+3. `GOOGLE_API_KEY` environment variable (fallback)
+4. `.env` file in current directory
+```bash
+# Option 1: Set environment variable (recommended)
+export GEMINI_API_KEY="your-api-key"
+# Option 2: Use existing GOOGLE_API_KEY (auto-detected)
+export GOOGLE_API_KEY="your-api-key"
+# Option 3: Create a .env file
+echo "GEMINI_API_KEY=your-api-key" > .env
+# Option 4: Pass directly (not recommended for security)
+gemini-ocr paper.pdf --api-key "your-api-key"
+```
+### Process documents
+```bash
+# Single file
+gemini-ocr paper.pdf
+# Directory
+gemini-ocr ./documents/ -o ./results/
+# With custom model
+gemini-ocr paper.pdf --model gemini-1.5-pro
+```
+### Describe figures
+```bash
+# Analyze a chart/diagram
+gemini-ocr describe chart.png
+# Save to file
+gemini-ocr describe figure.jpg -o description.md
+```
+## CLI Reference
+### `gemini-ocr process`
+Process documents and images with OCR.
+```
+Usage: gemini-ocr process [OPTIONS] INPUT_PATH
+Options:
+  -o, --output-dir PATH           Output directory for results
+  --api-key TEXT                  Gemini API key
+  --model TEXT                    Model to use (default: gemini-3.0-flash)
+  --task [convert|extract|table]  OCR task type (default: convert)
+  --prompt TEXT                   Custom prompt for OCR
+  --include-images/--no-images    Extract embedded images (default: True)
+  --save-originals/--no-save-originals
+                                  Save original input images (default: True)
+  --add-timestamp/--no-timestamp  Add timestamp to output folder
+  --reprocess                     Reprocess existing files
+  --env-file PATH                 Path to .env file
+  -v, --verbose                   Enable verbose output
+```
+### `gemini-ocr describe`
+Generate detailed descriptions of figures, charts, and diagrams.
+```
+Usage: gemini-ocr describe [OPTIONS] IMAGE_PATH
+Options:
+  --api-key TEXT    Gemini API key
+  --model TEXT      Model to use
+  -o, --output PATH Output file (default: stdout)
+```
+### `gemini-ocr info`
+Show configuration and system information.
+## Output Format
+Results are saved as Markdown files with:
+- File metadata (original path, processing time)
+- Extracted text (full document)
+- Embedded image references (if enabled)
+- `metadata.json` tracking all processed files
+## Models
+| Model | Speed | Quality | Cost | Recommended For |
+|-------|-------|---------|------|-----------------|
+| `gemini-3.0-flash` | Fast | Good | Low | Default, most documents |
+| `gemini-1.5-flash` | Fast | Good | Low | Simple documents |
+| `gemini-1.5-pro` | Slower | Best | Higher | Complex layouts, equations |
+## Environment Variables
+| Variable | Description | Default |
+|----------|-------------|---------|
+| `GEMINI_API_KEY` | Google Gemini API key | Required unless `GOOGLE_API_KEY` is set |
+| `GOOGLE_API_KEY` | Fallback API key, used when `GEMINI_API_KEY` is not set | - |
+| `GEMINI_MODEL` | Default model | `gemini-3.0-flash` |
+## License
+MIT

gemini_ocr_cli-0.2.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,12 @@
+gemini_ocr/__init__.py,sha256=byxnwK8svMrrvH4nt_-IM-HE6V6PSO-7X2XP4JD1-aA,246
+gemini_ocr/__main__.py,sha256=VcR6YUnMQeOJ2iNJN3-NcJuBbZh3eqRNlJZOlHnQEFU,117
+gemini_ocr/cli.py,sha256=xcDVQy-Vu6GlTynTOfgqNiSTG2hD7UasF0aeZQcZ4a4,10566
+gemini_ocr/config.py,sha256=J1dwYUCmYvPspQplvcFglx6-UXXFxRdmGCdkjX62IDQ,3148
+gemini_ocr/processor.py,sha256=VxHU_tm8Y_y1GXr55jLa0J28nSzHWzj8_yJzVphRSBY,19105
+gemini_ocr/retry.py,sha256=PyTXCQsgJNQBz7J5_Lb26nLx28ZpKwXN9PE1bQXchL4,3174
+gemini_ocr/utils.py,sha256=GCe4BA_-uLYRY-O9_PvBIttjr3qj0ja5l0O3aCU1qfo,6073
+gemini_ocr_cli-0.2.0.dist-info/METADATA,sha256=TrasZjXVPpMVXb3tbUZbf0TMFEtR4rcujgWi40LGjdE,5809
+gemini_ocr_cli-0.2.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+gemini_ocr_cli-0.2.0.dist-info/entry_points.txt,sha256=uMDGvtr5S_VF8PyZAtftEjg5V9ziXVNV-4xWVanHv6U,51
+gemini_ocr_cli-0.2.0.dist-info/licenses/LICENSE,sha256=ijREx9a6EP9kmqF9PfuDMsqa_D53H1NUNgDd65R86_o,1080
+gemini_ocr_cli-0.2.0.dist-info/RECORD,,

gemini_ocr_cli-0.2.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: hatchling 1.28.0
+Root-Is-Purelib: true
+Tag: py3-none-any

gemini_ocr_cli-0.2.0.dist-info/entry_points.txt ADDED Viewed

@@ -0,0 +1,2 @@
+[console_scripts]
+gemini-ocr = gemini_ocr.cli:main

gemini_ocr_cli-0.2.0.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2024 Ruben Fernandez-Fuertes
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.