gemini-ocr-cli 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,560 @@
1
+ """Core OCR processing module using Google Gemini with native PDF support."""
2
+
3
+ import io
4
+ import logging
5
+ import shutil
6
+ import time
7
+ from dataclasses import dataclass, field
8
+ from pathlib import Path
9
+ from typing import Any, Dict, List, Optional
10
+
11
+ from google import genai
12
+ from google.genai import types
13
+ from PIL import Image
14
+ from rich.console import Console
15
+ from rich.progress import Progress, SpinnerColumn, TextColumn
16
+
17
+ from gemini_ocr.config import Config
18
+ from gemini_ocr.retry import retry, is_retryable_error
19
+ from gemini_ocr.utils import (
20
+ determine_output_path,
21
+ extract_pdf_images,
22
+ format_file_size,
23
+ get_supported_files,
24
+ is_image_file,
25
+ is_pdf_file,
26
+ load_metadata,
27
+ sanitize_filename,
28
+ save_metadata,
29
+ )
30
+
31
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Shared rich console used for the user-facing terminal output in this module.
console = Console()
33
+
34
+
35
# Prompt templates, one per supported OCR task. Each template is kept in its
# own constant so it can be read (and diffed) in isolation; OCR_PROMPTS below
# is the lookup table the processor actually uses.

# Full document -> structured markdown.
_CONVERT_PROMPT = """Extract all text from this document and convert it to clean markdown format.

Rules:
- Preserve the document structure (headings, paragraphs, lists, tables)
- Convert tables to markdown table format
- Preserve mathematical equations in LaTeX format where possible
- Include figure/image captions if present
- Do not describe images, just note their presence as [Figure X] or [Image]
- Output ONLY the extracted text in markdown, no commentary"""

# Verbatim text extraction without markdown restructuring.
_EXTRACT_PROMPT = """Extract all visible text from this document exactly as it appears.
Output only the extracted text, preserving line breaks and spacing."""

# Structured description of a single figure/chart/diagram.
_DESCRIBE_FIGURE_PROMPT = """Analyze this figure/chart/diagram in detail:
1. What type of visualization is this? (bar chart, line graph, flowchart, etc.)
2. What are the axes, labels, or key components?
3. What data or information does it convey?
4. What are the main findings or takeaways?

Provide a structured description."""

# Tables only, rendered as markdown tables.
_TABLE_PROMPT = """Extract all tables from this document and convert them to markdown format.
Preserve all data, headers, and structure. Output only the markdown tables."""

# Task name -> prompt text. Unknown task names fall back to "convert" at the
# call sites that look prompts up with ``OCR_PROMPTS.get(task, ...)``.
OCR_PROMPTS = {
    "convert": _CONVERT_PROMPT,
    "extract": _EXTRACT_PROMPT,
    "describe_figure": _DESCRIBE_FIGURE_PROMPT,
    "table": _TABLE_PROMPT,
}
58
+
59
+
60
@dataclass
class OCRResult:
    """Outcome of running OCR on one document.

    A result is produced for failures as well as successes: on failure
    ``success`` is False, ``text`` is empty and ``error`` holds the message.
    """

    # Source document that was processed.
    file_path: Path
    # Extracted text (empty string when processing failed).
    text: str
    # True when the document was processed without raising.
    success: bool
    # Error message for a failed run, otherwise None.
    error: Optional[str] = None
    # Wall-clock seconds spent processing the document.
    processing_time: float = 0.0
    # Token usage, when known.
    token_count: Optional[int] = None
    # Embedded images pulled out of the document (one dict per image).
    extracted_images: List[Dict[str, Any]] = field(default_factory=list)

    @property
    def total_pages(self) -> int:
        """Return a rough page-count estimate derived from the text length.

        Empty text yields 0; any non-empty text counts as at least one page.
        Kept for compatibility with callers that expect a page count.
        """
        chars_per_page = 3000  # rough heuristic, not a measured value
        if not self.text:
            return 0
        estimate = len(self.text) // chars_per_page
        return estimate if estimate > 1 else 1
77
+
78
+
79
class OCRProcessor:
    """OCR processor using the Google Gemini API with native PDF support.

    Images are sent inline as JPEG parts; PDFs are uploaded through the
    Files API and processed in a single request. Per-file outcomes are
    returned as ``OCRResult`` objects; batch bookkeeping is accumulated in
    ``self.processed_files`` and ``self.errors``.
    """

    # Upper bound on how long to wait for an uploaded file to leave the
    # PROCESSING state; without it a stuck upload would poll forever.
    UPLOAD_TIMEOUT_SECONDS = 300.0
    # Delay between two state polls of an uploaded file.
    UPLOAD_POLL_INTERVAL_SECONDS = 0.5
    # Output-token cap sent with every generation request.
    MAX_OUTPUT_TOKENS = 8192

    def __init__(self, config: Config):
        """Initialize the OCR processor.

        Args:
            config: Runtime configuration; its API key is validated before
                the Gemini client is created.
        """
        self.config = config
        config.validate_api_key()

        # Initialize Gemini client
        self.client = genai.Client(api_key=config.api_key)
        self.model_name = config.model

        # Bookkeeping shared by the single-file and directory entry points.
        self.errors: List[Dict] = []
        self.processed_files: List[Dict] = []

        logger.info("Initialized OCRProcessor with model: %s", config.model)

    def _delete_uploaded_file(self, uploaded: Any) -> None:
        """Best-effort removal of a previously uploaded file from the Files API.

        Cleanup failures are logged and swallowed on purpose: they must never
        replace the actual OCR outcome.
        """
        name = getattr(uploaded, "name", None)
        if not name:
            return
        try:
            self.client.files.delete(name=name)
        except Exception as exc:
            logger.warning("Failed to delete uploaded file %s: %s", name, exc)

    def _upload_file(self, file_path: Path) -> Any:
        """Upload file to Gemini Files API and wait until it is ready.

        Args:
            file_path: Path to the file to upload

        Returns:
            Uploaded file object from Gemini API

        Raises:
            TimeoutError: If the file is still being processed after
                ``UPLOAD_TIMEOUT_SECONDS``.
            RuntimeError: If the API reports the upload as failed.
        """
        if self.config.verbose:
            console.print(f"[dim]Uploading {file_path.name}...[/dim]")

        uploaded = self.client.files.upload(file=str(file_path))

        # Poll until the service has finished processing, but never forever.
        deadline = time.monotonic() + self.UPLOAD_TIMEOUT_SECONDS
        while uploaded.state == "PROCESSING":
            if time.monotonic() >= deadline:
                self._delete_uploaded_file(uploaded)
                raise TimeoutError(
                    f"Timed out waiting for uploaded file to be processed: {uploaded.name}"
                )
            time.sleep(self.UPLOAD_POLL_INTERVAL_SECONDS)
            uploaded = self.client.files.get(name=uploaded.name)

        if uploaded.state == "FAILED":
            self._delete_uploaded_file(uploaded)
            raise RuntimeError(f"File upload failed: {uploaded.name}")

        if self.config.verbose:
            console.print(f"[dim]Upload complete: {uploaded.name}[/dim]")

        return uploaded

    def _pil_to_part(self, image: Image.Image) -> types.Part:
        """Convert PIL Image to Gemini Part.

        The image is converted to RGB when necessary and encoded as a
        quality-95 JPEG, which is what gets sent to the API.
        """
        if image.mode != "RGB":
            image = image.convert("RGB")

        buffer = io.BytesIO()
        image.save(buffer, format="JPEG", quality=95)

        return types.Part.from_bytes(
            data=buffer.getvalue(),
            mime_type="image/jpeg",
        )

    @staticmethod
    def _warn_if_truncated(response: Any) -> None:
        """Log a warning when a response stopped because of the token cap.

        Long documents processed in a single call can exceed the output
        limit; without this warning the truncated text looks like a success.
        """
        for candidate in getattr(response, "candidates", None) or []:
            reason = getattr(candidate, "finish_reason", None)
            # finish_reason may be an enum member or a plain string.
            if getattr(reason, "name", reason) == "MAX_TOKENS":
                logger.warning(
                    "Model output hit the max_output_tokens limit; "
                    "extracted text is likely truncated"
                )
                return

    @retry(max_attempts=3, backoff_factor=2.0, initial_delay=1.0)
    def _generate_content(
        self,
        contents: List[Any],
        prompt: str,
    ) -> str:
        """Generate content with retry logic.

        The call is attempted up to three times with exponential backoff
        (see the ``retry`` decorator above).

        Args:
            contents: List of content parts (files, images, text)
            prompt: The prompt to send

        Returns:
            Generated text response, stripped; "" when the model returned
            no text.
        """
        response = self.client.models.generate_content(
            model=self.model_name,
            contents=[prompt, *contents],
            config=types.GenerateContentConfig(
                temperature=0.1,
                max_output_tokens=self.MAX_OUTPUT_TOKENS,
            ),
        )
        self._warn_if_truncated(response)

        text = response.text
        return text.strip() if text else ""

    def _failure_result(
        self,
        file_path: Path,
        exc: Exception,
        start_time: float,
    ) -> OCRResult:
        """Log a processing error and wrap it in a failed ``OCRResult``."""
        error_msg = str(exc)
        logger.error("Error processing %s: %s", file_path, error_msg)
        return OCRResult(
            file_path=file_path,
            text="",
            success=False,
            error=error_msg,
            processing_time=time.time() - start_time,
        )

    def process_image(
        self,
        image_path: Path,
        task: str = "convert",
        custom_prompt: Optional[str] = None,
    ) -> OCRResult:
        """Process a single image file.

        Any error (size validation, decoding, API failure) is reported through
        a failed ``OCRResult`` rather than raised.

        Args:
            image_path: Path to the image file
            task: OCR task type; unknown values fall back to "convert"
            custom_prompt: Optional custom prompt, overrides ``task``

        Returns:
            OCRResult with extracted text
        """
        start_time = time.time()

        try:
            self.config.validate_file_size(image_path)
            prompt = custom_prompt or OCR_PROMPTS.get(task, OCR_PROMPTS["convert"])

            # The context manager closes the underlying file handle as soon
            # as the image has been encoded.
            with Image.open(image_path) as image:
                image_part = self._pil_to_part(image)

            text = self._generate_content([image_part], prompt)

            return OCRResult(
                file_path=image_path,
                text=text,
                success=True,
                processing_time=time.time() - start_time,
            )
        except Exception as e:
            return self._failure_result(image_path, e, start_time)

    def process_pdf(
        self,
        pdf_path: Path,
        task: str = "convert",
        custom_prompt: Optional[str] = None,
        show_progress: bool = True,
    ) -> OCRResult:
        """Process a PDF file using native Gemini PDF support.

        The whole PDF is uploaded to Gemini's Files API and processed in a
        single API call. The uploaded copy is deleted again once processing
        has finished, whether it succeeded or not. Any error, including a
        failed size validation, is reported through a failed ``OCRResult``.

        Args:
            pdf_path: Path to the PDF file
            task: OCR task type; unknown values fall back to "convert"
            custom_prompt: Optional custom prompt, overrides ``task``
            show_progress: Whether to show progress indicator

        Returns:
            OCRResult with extracted text
        """
        start_time = time.time()
        uploaded_file = None

        try:
            # Validated inside the try so an oversized file yields a failed
            # result (like process_image) instead of aborting a batch run.
            self.config.validate_file_size(pdf_path)
            prompt = custom_prompt or OCR_PROMPTS.get(task, OCR_PROMPTS["convert"])

            if show_progress:
                with Progress(
                    SpinnerColumn(),
                    TextColumn("[progress.description]{task.description}"),
                    console=console,
                    transient=True,
                ) as progress:
                    task_id = progress.add_task("Uploading PDF...", total=None)
                    uploaded_file = self._upload_file(pdf_path)
                    progress.update(task_id, description="Processing...")
                    text = self._generate_content([uploaded_file], prompt)
            else:
                uploaded_file = self._upload_file(pdf_path)
                text = self._generate_content([uploaded_file], prompt)

            # Extract embedded images if configured; this is best-effort and
            # must not fail an otherwise successful OCR run.
            extracted_images: List[Dict[str, Any]] = []
            if self.config.include_images:
                try:
                    extracted_images = extract_pdf_images(pdf_path)
                except Exception as e:
                    logger.warning("Failed to extract embedded images: %s", e)

            return OCRResult(
                file_path=pdf_path,
                text=text,
                success=True,
                processing_time=time.time() - start_time,
                extracted_images=extracted_images,
            )
        except Exception as e:
            return self._failure_result(pdf_path, e, start_time)
        finally:
            if uploaded_file is not None:
                self._delete_uploaded_file(uploaded_file)

    def describe_figure(self, image_path: Path) -> str:
        """Generate a detailed description of a figure/chart.

        Unlike ``process_image`` this method does not catch errors; they
        propagate to the caller.

        Args:
            image_path: Path to the image file

        Returns:
            Detailed description of the figure
        """
        with Image.open(image_path) as image:
            image_part = self._pil_to_part(image)
        return self._generate_content([image_part], OCR_PROMPTS["describe_figure"])

    def process_file(
        self,
        file_path: Path,
        task: str = "convert",
        custom_prompt: Optional[str] = None,
        show_progress: bool = True,
    ) -> OCRResult:
        """Process a single file (image or PDF).

        Args:
            file_path: Path to the file
            task: OCR task type
            custom_prompt: Optional custom prompt
            show_progress: Whether to show progress (PDFs only)

        Returns:
            OCRResult with extracted text

        Raises:
            ValueError: If the file is neither a PDF nor a supported image.
        """
        if is_pdf_file(file_path):
            return self.process_pdf(
                file_path,
                task=task,
                custom_prompt=custom_prompt,
                show_progress=show_progress,
            )
        if is_image_file(file_path):
            return self.process_image(
                file_path,
                task=task,
                custom_prompt=custom_prompt,
            )
        raise ValueError(f"Unsupported file type: {file_path.suffix}")

    def save_results(
        self,
        result: OCRResult,
        output_dir: Path,
    ) -> Path:
        """Save OCR results to files.

        Writes ``<name>.md`` into ``output_dir`` and, depending on the
        configuration, a copy of the original image (``original_images/``)
        and any extracted embedded images (``extracted_images/``).

        Args:
            result: OCRResult to save
            output_dir: Directory to save results; created if missing

        Returns:
            Path to the saved markdown file
        """
        output_dir.mkdir(parents=True, exist_ok=True)

        base_name = sanitize_filename(result.file_path.stem)
        markdown_path = output_dir / f"{base_name}.md"

        # Save original image if configured
        if self.config.save_original_images and is_image_file(result.file_path):
            originals_dir = output_dir / "original_images"
            originals_dir.mkdir(parents=True, exist_ok=True)
            original_output = originals_dir / f"{base_name}{result.file_path.suffix}"
            shutil.copy2(result.file_path, original_output)

        # Build markdown content: header block first, then the body.
        content = [
            "# OCR Results\n",
            f"**Original File:** {result.file_path.name}\n",
            f"**Full Path:** `{result.file_path}`\n",
            f"**Processed:** {time.strftime('%Y-%m-%d %H:%M:%S')}\n",
            f"**Processing Time:** {result.processing_time:.2f}s\n",
            "\n---\n\n",
        ]

        # Add extracted content
        if result.success:
            content.append(result.text)
            content.append("\n\n")
        else:
            content.append(f"*[OCR Failed: {result.error}]*\n\n")

        # Save extracted images if any
        if result.extracted_images and self.config.include_images:
            images_dir = output_dir / "extracted_images"
            images_dir.mkdir(parents=True, exist_ok=True)

            content.append("## Extracted Images\n\n")

            for img_info in result.extracted_images:
                img_filename = (
                    f"{base_name}_page{img_info['page']}_img{img_info['index']}.{img_info['ext']}"
                )
                (images_dir / img_filename).write_bytes(img_info["data"])

                content.append(f"![Page {img_info['page']} Image {img_info['index']}]")
                content.append(f"(./extracted_images/{img_filename})\n\n")

        # Write markdown
        with open(markdown_path, "w", encoding="utf-8") as f:
            f.write("".join(content))

        if self.config.verbose:
            console.print(f"[green]Saved:[/green] {markdown_path}")

        return markdown_path

    def process(
        self,
        input_path: Path,
        output_path: Optional[Path] = None,
        task: str = "convert",
        custom_prompt: Optional[str] = None,
        add_timestamp: bool = False,
        reprocess: bool = False,
    ) -> None:
        """Process input path (file or directory).

        Args:
            input_path: Path to file or directory
            output_path: Optional output directory
            task: OCR task type
            custom_prompt: Optional custom prompt
            add_timestamp: Add timestamp to output folder
            reprocess: Reprocess already-processed files

        Raises:
            ValueError: If ``input_path`` is neither a file nor a directory.
        """
        if input_path.is_file():
            handler = self._process_single_file
        elif input_path.is_dir():
            handler = self._process_directory
        else:
            raise ValueError(f"Input path does not exist: {input_path}")

        handler(input_path, output_path, task, custom_prompt, add_timestamp, reprocess)

    def _record_success(
        self,
        file_path: Path,
        result: OCRResult,
        output_dir: Path,
    ) -> Path:
        """Save a successful result and append it to ``self.processed_files``."""
        output_file = self.save_results(result, output_dir)
        self.processed_files.append(
            {
                "file": str(file_path),
                "size": file_path.stat().st_size,
                "output": str(output_file),
                "pages": result.total_pages,
            }
        )
        return output_file

    def _process_single_file(
        self,
        file_path: Path,
        output_path: Optional[Path],
        task: str,
        custom_prompt: Optional[str],
        add_timestamp: bool,
        reprocess: bool,
    ) -> None:
        """Process a single file, skipping it when already recorded in metadata."""
        output_dir = determine_output_path(file_path, output_path, add_timestamp)

        # Check if already processed
        existing = load_metadata(output_dir)
        existing_files = {item["file"] for item in existing["files_processed"]}

        if str(file_path) in existing_files and not reprocess:
            console.print(f"[yellow]Already processed:[/yellow] {file_path.name}")
            console.print("[dim]Use --reprocess to force reprocessing[/dim]")
            return

        console.print(f"[blue]Processing:[/blue] {file_path}")
        console.print(f"[blue]Output:[/blue] {output_dir}\n")

        result = self.process_file(file_path, task=task, custom_prompt=custom_prompt)

        if not result.success:
            self.errors.append({"file": str(file_path), "error": result.error})
            console.print(f"\n[red]Failed to process file: {result.error}[/red]")
            return

        self._record_success(file_path, result, output_dir)
        save_metadata(output_dir, self.processed_files, result.processing_time, self.errors)
        console.print("\n[green]Success[/green]")
        console.print(f"[dim]Time: {result.processing_time:.2f}s[/dim]")

    def _process_directory(
        self,
        dir_path: Path,
        output_path: Optional[Path],
        task: str,
        custom_prompt: Optional[str],
        add_timestamp: bool,
        reprocess: bool,
    ) -> None:
        """Process all supported files in a directory and write batch metadata."""
        files = get_supported_files(dir_path)

        if not files:
            console.print("[yellow]No supported files found[/yellow]")
            return

        output_dir = determine_output_path(dir_path, output_path, add_timestamp)
        existing = load_metadata(output_dir)
        existing_files = {item["file"] for item in existing["files_processed"]}

        # Filter out files already recorded in metadata unless reprocessing.
        files_to_process = []
        for f in files:
            if str(f) in existing_files and not reprocess:
                if self.config.verbose:
                    console.print(f"[dim]Skipping: {f.name}[/dim]")
            else:
                files_to_process.append(f)

        if not files_to_process:
            console.print("[green]All files already processed[/green]")
            console.print("[dim]Use --reprocess to force reprocessing[/dim]")
            return

        console.print(f"[blue]Processing {len(files_to_process)} file(s)...[/blue]")
        console.print(f"[blue]Output:[/blue] {output_dir}\n")

        start_time = time.time()
        success_count = 0
        # Counted per run: self.errors may also hold entries from earlier
        # process() calls on the same instance.
        failure_count = 0

        for file_path in files_to_process:
            file_size = format_file_size(file_path.stat().st_size)
            console.print(f"[cyan]{file_path.name}[/cyan] ({file_size})")

            result = self.process_file(file_path, task=task, custom_prompt=custom_prompt)

            if result.success:
                self._record_success(file_path, result, output_dir)
                success_count += 1
                console.print(f" [green]OK[/green] ({result.processing_time:.1f}s)\n")
            else:
                self.errors.append({"file": str(file_path), "error": result.error})
                failure_count += 1
                console.print(" [red]FAILED[/red]\n")

        total_time = time.time() - start_time
        save_metadata(output_dir, self.processed_files, total_time, self.errors)

        console.print(f"\n[green]Completed:[/green] {success_count}/{len(files_to_process)} files")
        if failure_count:
            console.print(f"[red]Errors:[/red] {failure_count} file(s)")
        console.print(f"[dim]Total time: {total_time:.2f}s[/dim]")
gemini_ocr/retry.py ADDED
@@ -0,0 +1,104 @@
1
+ """Retry logic with exponential backoff for API calls."""
2
+
3
+ import logging
4
+ import time
5
+ from functools import wraps
6
+ from typing import Callable, Tuple, Type, TypeVar
7
+
8
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Return type of the decorated callable; used in the ``retry`` annotations so
# the wrapper is typed as returning the same thing as the wrapped function.
T = TypeVar("T")
11
+
12
+
13
class RetryError(Exception):
    """Signals that every retry attempt failed.

    Attributes:
        last_exception: The exception raised by the final attempt.
    """

    def __init__(self, message: str, last_exception: Exception):
        # Keep the final underlying failure available to callers that want
        # to inspect it; the exception text itself is just ``message``.
        self.last_exception = last_exception
        super().__init__(message)
19
+
20
+
21
def retry(
    max_attempts: int = 3,
    backoff_factor: float = 2.0,
    initial_delay: float = 1.0,
    max_delay: float = 60.0,
    exceptions: Tuple[Type[Exception], ...] = (Exception,),
) -> Callable[[Callable[..., T]], Callable[..., T]]:
    """Decorator for retrying functions with exponential backoff.

    The wrapped function is called up to ``max_attempts`` times. After each
    failed attempt except the last, the wrapper sleeps and then multiplies
    the delay by ``backoff_factor``, never exceeding ``max_delay``.

    Args:
        max_attempts: Maximum number of attempts (including first try)
        backoff_factor: Multiplier for delay between retries
        initial_delay: Initial delay in seconds
        max_delay: Maximum delay in seconds
        exceptions: Tuple of exception types to catch and retry; anything
            else propagates immediately

    Returns:
        Decorated function with retry logic

    Raises:
        ValueError: If ``max_attempts`` is smaller than 1 (the function
            would otherwise never be called).
        RetryError: From the wrapper, when every attempt failed. Its message
            includes the text of the last error and ``last_exception`` holds
            the exception itself.
    """
    if max_attempts < 1:
        raise ValueError(f"max_attempts must be at least 1, got {max_attempts}")

    def decorator(func: Callable[..., T]) -> Callable[..., T]:
        @wraps(func)
        def wrapper(*args, **kwargs) -> T:
            # The cap applies to the first sleep as well.
            delay = min(initial_delay, max_delay)

            for attempt in range(1, max_attempts + 1):
                try:
                    return func(*args, **kwargs)
                except exceptions as e:
                    if attempt == max_attempts:
                        logger.error(
                            "All %d attempts failed for %s: %s",
                            max_attempts,
                            func.__name__,
                            e,
                        )
                        # Include the cause in the message: callers commonly
                        # report only str(error) to the user.
                        raise RetryError(
                            f"Failed after {max_attempts} attempts: {e}", e
                        ) from e

                    logger.warning(
                        "Attempt %d/%d failed for %s: %s. Retrying in %.1fs...",
                        attempt,
                        max_attempts,
                        func.__name__,
                        e,
                        delay,
                    )
                    time.sleep(delay)
                    delay = min(delay * backoff_factor, max_delay)

            # With max_attempts >= 1 the loop always returns or raises.
            raise AssertionError("unreachable: retry loop exited without a result")

        return wrapper

    return decorator
73
+
74
+
75
def is_retryable_error(error: Exception) -> bool:
    """Check if an error is retryable.

    The decision is based on a numeric ``code`` / ``status_code`` attribute
    when the exception carries one, and otherwise on the lower-cased error
    message. Status codes and the word "rate" are matched as standalone
    tokens, so unrelated text such as "generate ... limit" or "1500 bytes"
    is not mistaken for a rate-limit or server error.

    Args:
        error: The exception to check

    Returns:
        True if the error is typically transient and retryable
    """
    import re  # local import keeps this function self-contained

    retryable_codes = (429, 500, 502, 503, 504)

    # Prefer structured information over message parsing when available.
    for attr in ("code", "status_code"):
        value = getattr(error, attr, None)
        if isinstance(value, int) and value in retryable_codes:
            return True

    error_str = str(error).lower()

    # Rate limit / server errors reported as a status code in the message.
    # The lookarounds reject digits embedded in larger numbers ("1500").
    if re.search(r"(?<![\d.])(?:429|500|502|503|504)(?!\d)", error_str):
        return True
    if "too many requests" in error_str:
        return True

    # "rate" must be its own word ("rate limit", "rate-limited", "rate_limit"),
    # not a fragment of another one ("generate", "moderate").
    mentions_rate = re.search(r"(?<![a-z])rate(?![a-z])", error_str) is not None
    mentions_limit = re.search(r"(?<![a-z])limit", error_str) is not None
    if (mentions_rate and mentions_limit) or "ratelimit" in error_str:
        return True

    # Server errors described in words
    if "internal" in error_str and "error" in error_str:
        return True

    # Connection errors
    if "timeout" in error_str or "timed out" in error_str:
        return True
    if "connection" in error_str:
        return True

    return False