gemini-ocr-cli 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gemini_ocr/__init__.py ADDED
@@ -0,0 +1,8 @@
1
+ """Gemini OCR CLI - Document processing using Google Gemini's vision capabilities."""
2
+
3
+ __version__ = "0.2.0"
4
+
5
+ from gemini_ocr.processor import OCRProcessor
6
+ from gemini_ocr.config import Config
7
+
8
+ __all__ = ["OCRProcessor", "Config", "__version__"]
gemini_ocr/__main__.py ADDED
@@ -0,0 +1,6 @@
1
+ """Allow running as python -m gemini_ocr."""
2
+
3
+ from gemini_ocr.cli import main
4
+
5
+ if __name__ == "__main__":
6
+ main()
gemini_ocr/cli.py ADDED
@@ -0,0 +1,367 @@
1
+ """Command-line interface for Gemini OCR."""
2
+
3
+ import os
4
+ import sys
5
+ from pathlib import Path
6
+ from typing import Optional
7
+
8
+ import click
9
+ from rich.console import Console
10
+ from rich.table import Table
11
+
12
+ from gemini_ocr import __version__
13
+ from gemini_ocr.config import Config
14
+ from gemini_ocr.processor import OCRProcessor
15
+ from gemini_ocr.utils import setup_logging
16
+
17
+ console = Console()
18
+
19
# Base directory for resolving relative CLI paths. GEMINI_OCR_CWD (intended for
# wrapper scripts) takes precedence; otherwise the current working directory
# at import time is used.
ORIGINAL_CWD = os.getenv("GEMINI_OCR_CWD", os.getcwd())
21
+
22
+
23
def print_banner() -> None:
    """Write the two-line banner (tool name with version, then attribution)."""
    banner_lines = (
        f"[bold blue]Gemini OCR[/bold blue] [dim]v{__version__}[/dim]",
        "[dim]Powered by Google Gemini[/dim]\n",
    )
    for banner_line in banner_lines:
        console.print(banner_line)
27
+
28
+
29
@click.group()
@click.version_option(version=__version__, prog_name="gemini-ocr")
@click.option("--verbose", "-v", is_flag=True, help="Enable verbose output")
@click.pass_context
def cli(ctx: click.Context, verbose: bool) -> None:
    """Gemini OCR - Document processing using Google Gemini's vision capabilities.

    Process PDF and image files with state-of-the-art OCR, extracting text,
    tables, equations, and figures with high accuracy.

    \b
    Examples:
      gemini-ocr process document.pdf
      gemini-ocr process ./papers/ --recursive
      gemini-ocr describe figure.png
    """
    # NOTE: the docstring above is rendered by click as the --help text.
    # The shared context object is a plain dict; subcommands read
    # ctx.obj["verbose"] from it.
    shared_state = ctx.ensure_object(dict)
    shared_state["verbose"] = verbose
    setup_logging(verbose=verbose)
48
+
49
+
50
def _option_was_given(ctx: click.Context, name: str) -> bool:
    """Return False only when click filled option *name* from its declared default."""
    return ctx.get_parameter_source(name) is not click.core.ParameterSource.DEFAULT


@cli.command()
@click.argument("input_path", type=click.Path(path_type=Path))
@click.option(
    "-o",
    "--output-dir",
    type=click.Path(path_type=Path),
    help="Output directory for results",
)
@click.option(
    "--api-key",
    type=str,
    envvar="GEMINI_API_KEY",
    help="Gemini API key (or set GEMINI_API_KEY env var)",
)
@click.option(
    "--model",
    type=str,
    default="gemini-3.0-flash",
    help="Gemini model to use (default: gemini-3.0-flash)",
)
@click.option(
    "--task",
    type=click.Choice(["convert", "extract", "table"]),
    default="convert",
    help="OCR task type (default: convert)",
)
@click.option(
    "--prompt",
    type=str,
    help="Custom prompt for OCR processing",
)
@click.option(
    "--include-images/--no-images",
    default=True,
    help="Extract embedded images (default: True)",
)
@click.option(
    "--save-originals/--no-save-originals",
    default=True,
    help="Save original input images (default: True)",
)
@click.option(
    "--add-timestamp/--no-timestamp",
    default=False,
    help="Add timestamp to output folder (default: False)",
)
@click.option(
    "--reprocess",
    is_flag=True,
    help="Reprocess files even if already done",
)
@click.option(
    "--env-file",
    type=click.Path(exists=True, path_type=Path),
    help="Path to .env file with configuration",
)
@click.pass_context
def process(
    ctx: click.Context,
    input_path: Path,
    output_dir: Optional[Path],
    api_key: Optional[str],
    model: str,
    task: str,
    prompt: Optional[str],
    include_images: bool,
    save_originals: bool,
    add_timestamp: bool,
    reprocess: bool,
    env_file: Optional[Path],
) -> None:
    """Process documents and images with OCR.

    INPUT_PATH can be a single file or directory.

    \b
    Supported formats:
      - Images: JPG, PNG, WEBP, GIF, BMP, TIFF
      - Documents: PDF

    \b
    Examples:
      # Process a single PDF
      gemini-ocr process paper.pdf

      # Process directory with custom output
      gemini-ocr process ./documents -o ./results

      # Use specific model
      gemini-ocr process doc.pdf --model gemini-1.5-pro

      # Custom OCR prompt
      gemini-ocr process form.jpg --prompt "Extract all form fields"
    """
    # NOTE: the docstring above is rendered by click as the --help text.
    #
    # Flow: resolve paths against ORIGINAL_CWD, build a Config, apply the
    # options the user actually supplied, then hand off to OCRProcessor.
    # Exits with status 1 on any error and 130 on Ctrl-C.
    print_banner()

    # Local import: rich is already a dependency of this module; escape() keeps
    # brackets in paths/exception text from being parsed as console markup.
    from rich.markup import escape

    try:
        base_dir = Path(ORIGINAL_CWD)

        if not input_path.is_absolute():
            input_path = base_dir / input_path

        if not input_path.exists():
            raise ValueError(f"Input path does not exist: {input_path}")

        if output_dir and not output_dir.is_absolute():
            output_dir = base_dir / output_dir

        # Export an explicit key *before* loading configuration so that it is
        # honoured with --env-file as well (previously it was silently dropped
        # in that case). NOTE(review): this relies on load_dotenv() not
        # overriding variables that are already set, which is its default.
        if api_key:
            os.environ["GEMINI_API_KEY"] = api_key
        config = Config.from_env(env_file)

        # Only override settings the user actually passed. Unconditionally
        # assigning the click defaults would discard values that Config loaded
        # from the environment or a .env file.
        if _option_was_given(ctx, "model"):
            config.model = model
        if _option_was_given(ctx, "include_images"):
            config.include_images = include_images
        if _option_was_given(ctx, "save_originals"):
            config.save_original_images = save_originals
        config.verbose = ctx.obj["verbose"]

        processor = OCRProcessor(config)
        processor.process(
            input_path,
            output_path=output_dir,
            task=task,
            custom_prompt=prompt,
            add_timestamp=add_timestamp,
            reprocess=reprocess,
        )

        console.print("\n[bold green]Done![/bold green]\n")

    except ValueError as e:
        # Validation-style failures: message only, no traceback.
        console.print(f"\n[red]Error:[/red] {escape(str(e))}\n")
        sys.exit(1)
    except KeyboardInterrupt:
        console.print("\n[yellow]Interrupted[/yellow]\n")
        sys.exit(130)
    except Exception as e:
        console.print(f"\n[red]Error:[/red] {escape(str(e))}\n")
        if ctx.obj["verbose"]:
            import traceback

            traceback.print_exc()
        sys.exit(1)
196
+
197
+
198
@cli.command()
@click.argument("image_path", type=click.Path(exists=True, path_type=Path))
@click.option(
    "--api-key",
    type=str,
    envvar="GEMINI_API_KEY",
    help="Gemini API key",
)
@click.option(
    "--model",
    type=str,
    default="gemini-2.0-flash",
    help="Gemini model to use",
)
@click.option(
    "-o",
    "--output",
    type=click.Path(path_type=Path),
    help="Output file for description (default: stdout)",
)
@click.pass_context
def describe(
    ctx: click.Context,
    image_path: Path,
    api_key: Optional[str],
    model: str,
    output: Optional[Path],
) -> None:
    """Generate detailed description of a figure/chart/diagram.

    Analyzes the image and provides structured description including:
    - Type of visualization
    - Axes, labels, components
    - Data/information conveyed
    - Key findings

    \b
    Examples:
      gemini-ocr describe chart.png
      gemini-ocr describe diagram.jpg -o description.md
    """
    # NOTE: the docstring above is rendered by click as the --help text.
    #
    # Flow: resolve the image path against ORIGINAL_CWD, build a Config, ask
    # OCRProcessor.describe_figure() for a description, then either write it to
    # --output or print it. Exits with status 1 on any error.
    print_banner()

    # Local import: rich is already a dependency of this module; escape() keeps
    # brackets in file names/exception text from being parsed as markup.
    from rich.markup import escape

    try:
        base_dir = Path(ORIGINAL_CWD)

        if not image_path.is_absolute():
            image_path = base_dir / image_path

        if api_key:
            os.environ["GEMINI_API_KEY"] = api_key

        config = Config.from_env()
        config.model = model
        config.verbose = ctx.obj["verbose"]

        processor = OCRProcessor(config)

        console.print(f"[blue]Analyzing:[/blue] {escape(image_path.name)}\n")

        description = processor.describe_figure(image_path)

        if output:
            if not output.is_absolute():
                output = base_dir / output
            # Create missing parent directories so "-o results/desc.md" works
            # on a fresh tree instead of failing with FileNotFoundError.
            output.parent.mkdir(parents=True, exist_ok=True)
            output.write_text(description, encoding="utf-8")
            console.print(f"[green]Saved to:[/green] {escape(str(output))}")
        else:
            console.print("[bold]Description:[/bold]\n")
            # The description is free-form model output (often Markdown with
            # square brackets); print it verbatim rather than letting Rich
            # interpret bracketed text as style tags.
            console.print(description, markup=False)

        console.print("\n[bold green]Done![/bold green]\n")

    except Exception as e:
        console.print(f"\n[red]Error:[/red] {escape(str(e))}\n")
        if ctx.obj["verbose"]:
            import traceback

            traceback.print_exc()
        sys.exit(1)
276
+
277
+
278
@cli.command()
def info() -> None:
    """Show configuration and system information."""
    # NOTE: the docstring above is rendered by click as the --help text.
    print_banner()

    # --- System table -----------------------------------------------------
    py = sys.version_info
    system_table = Table(title="System Information")
    system_table.add_column("Component", style="cyan")
    system_table.add_column("Value", style="green")
    for component, value in (
        ("Python", f"{py.major}.{py.minor}.{py.micro}"),
        ("Platform", sys.platform),
    ):
        system_table.add_row(component, value)

    console.print(system_table)
    console.print()

    # --- Configuration table + optional API probe -------------------------
    try:
        config = Config.from_env()

        settings_table = Table(title="Configuration")
        settings_table.add_column("Setting", style="cyan")
        settings_table.add_column("Value", style="yellow")

        # The key itself is never displayed, only whether one is present.
        key_status = "Set" if config.api_key else "[red]Not set[/red]"
        setting_rows = (
            ("API Key", key_status),
            ("Model", config.model),
            ("DPI", str(config.dpi)),
            ("Max File Size", f"{config.max_file_size_mb} MB"),
            ("Include Images", str(config.include_images)),
        )
        for setting, value in setting_rows:
            settings_table.add_row(setting, value)

        console.print(settings_table)
        console.print()

        if not config.api_key:
            console.print("[yellow]Set GEMINI_API_KEY to enable API features[/yellow]")
        else:
            console.print("[dim]Testing API connection...[/dim]")
            try:
                # Imported lazily, inside the try, so an import failure is
                # reported as a failed connection rather than crashing.
                from google import genai

                client = genai.Client(api_key=config.api_key)
                # Listing models is used as the connectivity check.
                available = list(client.models.list())
                console.print("[green]API connection successful[/green]")
                console.print(f"[dim]Available models: {len(available)}[/dim]")
            except Exception as e:
                console.print(f"[red]API connection failed:[/red] {e}")

    except Exception as e:
        console.print(f"[red]Error loading config:[/red] {e}")

    # --- Static footer ----------------------------------------------------
    console.print()
    console.print("[bold]Supported Formats:[/bold]")
    console.print("  Images: JPG, PNG, WEBP, GIF, BMP, TIFF")
    console.print("  Documents: PDF")
    console.print()
334
+
335
+
336
def main() -> None:
    """Console entry point with ``process`` shorthand support.

    ``gemini-ocr file.pdf`` is treated as ``gemini-ocr process file.pdf``: when
    the first non-option argument is not a known subcommand but names an
    existing path (relative paths are resolved against ``ORIGINAL_CWD``),
    ``process`` is inserted into ``sys.argv`` before the click group is
    invoked.

    The subcommand is inserted directly after any leading group-level flags
    rather than directly before the path. That way ``process`` options written
    ahead of the path (e.g. ``gemini-ocr --reprocess file.pdf``) reach the
    ``process`` command instead of being rejected by the group.
    """
    argv = sys.argv[1:]

    known_commands = {"process", "describe", "info"}
    # Flags owned by the top-level group; these must stay in front of the
    # injected subcommand.
    group_flags = {"-v", "--verbose", "--help", "--version"}

    # First argument that is not an option.
    candidate = next((arg for arg in argv if not arg.startswith("-")), None)

    if candidate is not None and candidate not in known_commands:
        candidate_path = Path(candidate)
        if not candidate_path.is_absolute():
            candidate_path = Path(ORIGINAL_CWD) / candidate_path

        # Only rewrite argv when the argument really is an existing path;
        # anything else is left for click to report as an unknown command.
        if candidate_path.exists():
            insert_at = 0
            while insert_at < len(argv) and argv[insert_at] in group_flags:
                insert_at += 1
            argv = argv[:insert_at] + ["process"] + argv[insert_at:]
            sys.argv = [sys.argv[0], *argv]

    cli(obj={})
364
+
365
+
366
+ if __name__ == "__main__":
367
+ main()
gemini_ocr/config.py ADDED
@@ -0,0 +1,106 @@
1
+ """Configuration management for Gemini OCR CLI."""
2
+
3
+ import os
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
+ from pydantic import Field, field_validator
8
+ from pydantic_settings import BaseSettings, SettingsConfigDict
9
+
10
+
11
class Config(BaseSettings):
    """Configuration settings for Gemini OCR.

    Values are read from ``GEMINI_``-prefixed environment variables and from a
    ``.env`` file (see ``model_config``); unknown keys are ignored.
    """

    model_config = SettingsConfigDict(
        env_prefix="GEMINI_",
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    # API key: an explicit value wins, then GEMINI_API_KEY, then GOOGLE_API_KEY.
    # validate_default=True is required for that fallback to work: pydantic
    # skips validators when a field takes its default, so without it
    # resolve_api_key() never ran when GEMINI_API_KEY was unset and
    # GOOGLE_API_KEY was silently ignored.
    api_key: str = Field(
        default="",
        validate_default=True,
        description="Google Gemini API key",
    )

    @field_validator("api_key", mode="before")
    @classmethod
    def resolve_api_key(cls, v: str) -> str:
        """Resolve API key from multiple sources.

        Priority:
        1. Explicitly passed value (non-empty)
        2. GEMINI_API_KEY environment variable
        3. GOOGLE_API_KEY environment variable (fallback)

        Returns an empty string when none of the sources provides a key.
        """
        if v:  # Explicitly provided value
            return v

        for env_name in ("GEMINI_API_KEY", "GOOGLE_API_KEY"):
            env_value = os.environ.get(env_name, "")
            if env_value:
                return env_value

        return ""

    # Model used for OCR requests.
    model: str = Field(
        default="gemini-3.0-flash",
        description="Gemini model to use for OCR",
    )

    # Processing options.
    include_images: bool = Field(
        default=True,
        description="Extract and save images from documents",
    )
    save_original_images: bool = Field(
        default=True,
        description="Save original input images alongside results",
    )
    dpi: int = Field(
        default=200,
        description="DPI for PDF rendering",
    )
    max_file_size_mb: float = Field(
        default=20.0,
        description="Maximum file size in MB",
    )

    # Output options.
    output_dir: Optional[Path] = Field(
        default=None,
        description="Default output directory",
    )

    # Runtime flags.
    verbose: bool = Field(default=False, description="Enable verbose output")

    @classmethod
    def from_env(cls, env_file: Optional[Path] = None) -> "Config":
        """Load configuration from environment or .env file.

        Args:
            env_file: Optional dotenv file. When given and present on disk it
                is loaded into the process environment before the settings are
                built; a missing file is ignored.

        Returns:
            A ``Config`` built from the resulting environment.
        """
        if env_file and env_file.exists():
            from dotenv import load_dotenv

            load_dotenv(env_file)

        return cls()

    def validate_api_key(self) -> None:
        """Validate that API key is set.

        Raises:
            ValueError: If ``api_key`` is empty.
        """
        if not self.api_key:
            raise ValueError(
                "Gemini API key not set. "
                "Set GEMINI_API_KEY environment variable or pass --api-key"
            )

    def validate_file_size(self, file_path: Path) -> None:
        """Validate file size is within limits.

        Args:
            file_path: File whose on-disk size is checked.

        Raises:
            ValueError: If the file is larger than ``max_file_size_mb``.
        """
        file_size_mb = file_path.stat().st_size / (1024 * 1024)
        if file_size_mb > self.max_file_size_mb:
            raise ValueError(
                f"File size ({file_size_mb:.2f} MB) exceeds maximum "
                f"allowed size ({self.max_file_size_mb} MB)"
            )