gemini-ocr-cli 0.3.0__tar.gz → 0.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gemini_ocr_cli-0.3.0 → gemini_ocr_cli-0.3.2}/.env.example +2 -2
- {gemini_ocr_cli-0.3.0 → gemini_ocr_cli-0.3.2}/PKG-INFO +15 -3
- {gemini_ocr_cli-0.3.0 → gemini_ocr_cli-0.3.2}/README.md +14 -2
- {gemini_ocr_cli-0.3.0 → gemini_ocr_cli-0.3.2}/gemini_ocr/__init__.py +1 -1
- {gemini_ocr_cli-0.3.0 → gemini_ocr_cli-0.3.2}/gemini_ocr/cli.py +11 -8
- {gemini_ocr_cli-0.3.0 → gemini_ocr_cli-0.3.2}/gemini_ocr/config.py +1 -1
- {gemini_ocr_cli-0.3.0 → gemini_ocr_cli-0.3.2}/gemini_ocr/processor.py +70 -18
- gemini_ocr_cli-0.3.2/gemini_ocr/retry.py +104 -0
- {gemini_ocr_cli-0.3.0 → gemini_ocr_cli-0.3.2}/gemini_ocr/utils.py +4 -2
- {gemini_ocr_cli-0.3.0 → gemini_ocr_cli-0.3.2}/pyproject.toml +1 -1
- {gemini_ocr_cli-0.3.0 → gemini_ocr_cli-0.3.2}/tests/conftest.py +1 -1
- {gemini_ocr_cli-0.3.0 → gemini_ocr_cli-0.3.2}/tests/test_config.py +1 -1
- {gemini_ocr_cli-0.3.0 → gemini_ocr_cli-0.3.2}/tests/test_metadata.py +3 -3
- {gemini_ocr_cli-0.3.0 → gemini_ocr_cli-0.3.2}/.github/workflows/ci.yml +0 -0
- {gemini_ocr_cli-0.3.0 → gemini_ocr_cli-0.3.2}/.gitignore +0 -0
- {gemini_ocr_cli-0.3.0 → gemini_ocr_cli-0.3.2}/.pre-commit-config.yaml +0 -0
- {gemini_ocr_cli-0.3.0 → gemini_ocr_cli-0.3.2}/CHANGELOG.md +0 -0
- {gemini_ocr_cli-0.3.0 → gemini_ocr_cli-0.3.2}/LICENSE +0 -0
- {gemini_ocr_cli-0.3.0 → gemini_ocr_cli-0.3.2}/gemini_ocr/__main__.py +0 -0
- {gemini_ocr_cli-0.3.0 → gemini_ocr_cli-0.3.2}/gemini_ocr/metadata.py +0 -0
- {gemini_ocr_cli-0.3.0 → gemini_ocr_cli-0.3.2}/tests/__init__.py +0 -0
- {gemini_ocr_cli-0.3.0 → gemini_ocr_cli-0.3.2}/tests/test_cli.py +0 -0
- {gemini_ocr_cli-0.3.0 → gemini_ocr_cli-0.3.2}/tests/test_import.py +0 -0
- {gemini_ocr_cli-0.3.0 → gemini_ocr_cli-0.3.2}/tests/test_integration.py +0 -0
- {gemini_ocr_cli-0.3.0 → gemini_ocr_cli-0.3.2}/tests/test_processor.py +0 -0
- {gemini_ocr_cli-0.3.0 → gemini_ocr_cli-0.3.2}/tests/test_utils.py +0 -0
- {gemini_ocr_cli-0.3.0 → gemini_ocr_cli-0.3.2}/uv.lock +0 -0
|
@@ -5,8 +5,8 @@
|
|
|
5
5
|
# Get one at: https://aistudio.google.com/apikey
|
|
6
6
|
GEMINI_API_KEY=your-api-key-here
|
|
7
7
|
|
|
8
|
-
# Optional: Model to use (default: gemini-3
|
|
9
|
-
# GEMINI_MODEL=gemini-3
|
|
8
|
+
# Optional: Model to use (default: gemini-3-flash-preview)
|
|
9
|
+
# GEMINI_MODEL=gemini-3-flash-preview
|
|
10
10
|
|
|
11
11
|
# Optional: Maximum file size in MB (default: 50)
|
|
12
12
|
# GEMINI_MAX_FILE_SIZE_MB=50
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: gemini-ocr-cli
|
|
3
|
-
Version: 0.3.0
|
|
3
|
+
Version: 0.3.2
|
|
4
4
|
Summary: CLI tool for OCR processing using Google Gemini's vision capabilities
|
|
5
5
|
Project-URL: Homepage, https://github.com/r-uben/gemini-ocr-cli
|
|
6
6
|
Project-URL: Repository, https://github.com/r-uben/gemini-ocr-cli
|
|
@@ -45,6 +45,18 @@ Description-Content-Type: text/markdown
|
|
|
45
45
|
|
|
46
46
|
A command-line tool for OCR processing using Google Gemini's vision capabilities. Process PDFs and images to extract text, tables, equations, and figures.
|
|
47
47
|
|
|
48
|
+
## Choosing an OCR tool
|
|
49
|
+
|
|
50
|
+
This is one of five OCR CLI tools with a shared design: clean Markdown output, batch processing, and figure extraction. Pick based on your constraints:
|
|
51
|
+
|
|
52
|
+
| Tool | Engine | Runs | Cost | Best for |
|
|
53
|
+
|------|--------|------|------|----------|
|
|
54
|
+
| [deepseek-ocr-cli](https://github.com/r-uben/deepseek-ocr-cli) | DeepSeek vision | Local (Ollama / vLLM) | Free | General-purpose local OCR with multi-backend flexibility |
|
|
55
|
+
| **gemini-ocr-cli** (this repo) | Google Gemini | Cloud API | Free tier / Pay-per-use | Fast cloud OCR with concurrent processing |
|
|
56
|
+
| [marker-ocr-cli](https://github.com/r-uben/marker-ocr-cli) | Marker (Surya + Texify) | Local | Free | Academic papers with equations, tables, complex layouts |
|
|
57
|
+
| [mistral-ocr-cli](https://github.com/r-uben/mistral-ocr-cli) | Mistral OCR API | Cloud API | ~$1/1k pages | Structured extraction (tables, headers, footers) |
|
|
58
|
+
| [nougat-ocr-cli](https://github.com/r-uben/nougat-ocr-cli) | Meta Nougat | Local (GPU) | Free | Academic papers, GPU-accelerated batch processing |
|
|
59
|
+
|
|
48
60
|
## Installation
|
|
49
61
|
|
|
50
62
|
Requires Python 3.11+ and a [Google Gemini API key](https://aistudio.google.com/apikey).
|
|
@@ -88,7 +100,7 @@ Usage: gemini-ocr [OPTIONS] INPUT_PATH
|
|
|
88
100
|
Options:
|
|
89
101
|
-o, --output-dir PATH Output directory (default: <input_dir>/gemini_ocr_output/)
|
|
90
102
|
--api-key TEXT Gemini API key (or set GEMINI_API_KEY env var)
|
|
91
|
-
--model TEXT Model to use (default: gemini-3
|
|
103
|
+
--model TEXT Model to use (default: gemini-3-flash-preview)
|
|
92
104
|
--task [convert|extract|table|describe_figure]
|
|
93
105
|
OCR task type (default: convert)
|
|
94
106
|
--prompt TEXT Custom prompt for OCR processing
|
|
@@ -136,7 +148,7 @@ All CLI options can also be set via environment variables or a `.env` file:
|
|
|
136
148
|
| CLI flag | Environment variable | Default |
|
|
137
149
|
|----------|---------------------|---------|
|
|
138
150
|
| `--api-key` | `GEMINI_API_KEY` | (required) |
|
|
139
|
-
| `--model` | `GEMINI_MODEL` | `gemini-3
|
|
151
|
+
| `--model` | `GEMINI_MODEL` | `gemini-3-flash-preview` |
|
|
140
152
|
| `--include-images` | `GEMINI_INCLUDE_IMAGES` | `true` |
|
|
141
153
|
| `--save-originals` | `GEMINI_SAVE_ORIGINAL_IMAGES` | `true` |
|
|
142
154
|
| `--workers` | `GEMINI_MAX_WORKERS` | `1` |
|
|
@@ -7,6 +7,18 @@
|
|
|
7
7
|
|
|
8
8
|
A command-line tool for OCR processing using Google Gemini's vision capabilities. Process PDFs and images to extract text, tables, equations, and figures.
|
|
9
9
|
|
|
10
|
+
## Choosing an OCR tool
|
|
11
|
+
|
|
12
|
+
This is one of five OCR CLI tools with a shared design: clean Markdown output, batch processing, and figure extraction. Pick based on your constraints:
|
|
13
|
+
|
|
14
|
+
| Tool | Engine | Runs | Cost | Best for |
|
|
15
|
+
|------|--------|------|------|----------|
|
|
16
|
+
| [deepseek-ocr-cli](https://github.com/r-uben/deepseek-ocr-cli) | DeepSeek vision | Local (Ollama / vLLM) | Free | General-purpose local OCR with multi-backend flexibility |
|
|
17
|
+
| **gemini-ocr-cli** (this repo) | Google Gemini | Cloud API | Free tier / Pay-per-use | Fast cloud OCR with concurrent processing |
|
|
18
|
+
| [marker-ocr-cli](https://github.com/r-uben/marker-ocr-cli) | Marker (Surya + Texify) | Local | Free | Academic papers with equations, tables, complex layouts |
|
|
19
|
+
| [mistral-ocr-cli](https://github.com/r-uben/mistral-ocr-cli) | Mistral OCR API | Cloud API | ~$1/1k pages | Structured extraction (tables, headers, footers) |
|
|
20
|
+
| [nougat-ocr-cli](https://github.com/r-uben/nougat-ocr-cli) | Meta Nougat | Local (GPU) | Free | Academic papers, GPU-accelerated batch processing |
|
|
21
|
+
|
|
10
22
|
## Installation
|
|
11
23
|
|
|
12
24
|
Requires Python 3.11+ and a [Google Gemini API key](https://aistudio.google.com/apikey).
|
|
@@ -50,7 +62,7 @@ Usage: gemini-ocr [OPTIONS] INPUT_PATH
|
|
|
50
62
|
Options:
|
|
51
63
|
-o, --output-dir PATH Output directory (default: <input_dir>/gemini_ocr_output/)
|
|
52
64
|
--api-key TEXT Gemini API key (or set GEMINI_API_KEY env var)
|
|
53
|
-
--model TEXT Model to use (default: gemini-3
|
|
65
|
+
--model TEXT Model to use (default: gemini-3-flash-preview)
|
|
54
66
|
--task [convert|extract|table|describe_figure]
|
|
55
67
|
OCR task type (default: convert)
|
|
56
68
|
--prompt TEXT Custom prompt for OCR processing
|
|
@@ -98,7 +110,7 @@ All CLI options can also be set via environment variables or a `.env` file:
|
|
|
98
110
|
| CLI flag | Environment variable | Default |
|
|
99
111
|
|----------|---------------------|---------|
|
|
100
112
|
| `--api-key` | `GEMINI_API_KEY` | (required) |
|
|
101
|
-
| `--model` | `GEMINI_MODEL` | `gemini-3
|
|
113
|
+
| `--model` | `GEMINI_MODEL` | `gemini-3-flash-preview` |
|
|
102
114
|
| `--include-images` | `GEMINI_INCLUDE_IMAGES` | `true` |
|
|
103
115
|
| `--save-originals` | `GEMINI_SAVE_ORIGINAL_IMAGES` | `true` |
|
|
104
116
|
| `--workers` | `GEMINI_MAX_WORKERS` | `1` |
|
|
@@ -23,7 +23,8 @@ from gemini_ocr.utils import (
|
|
|
23
23
|
console = Console()
|
|
24
24
|
|
|
25
25
|
# Get original working directory if set (for wrapper scripts)
|
|
26
|
-
|
|
26
|
+
_cwd_override = os.environ.get("GEMINI_OCR_CWD", "")
|
|
27
|
+
ORIGINAL_CWD = _cwd_override if _cwd_override and Path(_cwd_override).is_absolute() else os.getcwd()
|
|
27
28
|
|
|
28
29
|
|
|
29
30
|
def _resolve_path(path: Path) -> Path:
|
|
@@ -50,8 +51,8 @@ def _resolve_path(path: Path) -> Path:
|
|
|
50
51
|
@click.option(
|
|
51
52
|
"--model",
|
|
52
53
|
type=str,
|
|
53
|
-
default="gemini-3
|
|
54
|
-
help="Gemini model to use (default: gemini-3
|
|
54
|
+
default="gemini-3-flash-preview",
|
|
55
|
+
help="Gemini model to use (default: gemini-3-flash-preview)",
|
|
55
56
|
)
|
|
56
57
|
@click.option(
|
|
57
58
|
"--task",
|
|
@@ -174,10 +175,12 @@ def cli(
|
|
|
174
175
|
if env_file:
|
|
175
176
|
config = Config.from_env(env_file)
|
|
176
177
|
else:
|
|
177
|
-
if api_key:
|
|
178
|
-
os.environ["GEMINI_API_KEY"] = api_key
|
|
179
178
|
config = Config.from_env()
|
|
180
179
|
|
|
180
|
+
# Pass CLI api_key directly to config (don't pollute os.environ)
|
|
181
|
+
if api_key:
|
|
182
|
+
config.api_key = api_key
|
|
183
|
+
|
|
181
184
|
# Override with CLI options
|
|
182
185
|
config.model = model
|
|
183
186
|
config.include_images = include_images
|
|
@@ -213,7 +216,7 @@ def cli(
|
|
|
213
216
|
if verbose:
|
|
214
217
|
import traceback
|
|
215
218
|
|
|
216
|
-
traceback.print_exc()
|
|
219
|
+
traceback.print_exc(file=sys.stderr)
|
|
217
220
|
sys.exit(1)
|
|
218
221
|
|
|
219
222
|
|
|
@@ -260,9 +263,9 @@ def _show_info(api_key: str | None = None) -> None:
|
|
|
260
263
|
console.print()
|
|
261
264
|
|
|
262
265
|
try:
|
|
263
|
-
if api_key:
|
|
264
|
-
os.environ["GEMINI_API_KEY"] = api_key
|
|
265
266
|
config = Config.from_env()
|
|
267
|
+
if api_key:
|
|
268
|
+
config.api_key = api_key
|
|
266
269
|
|
|
267
270
|
config_table = Table(title="Configuration")
|
|
268
271
|
config_table.add_column("Setting", style="cyan")
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
import io
|
|
4
4
|
import logging
|
|
5
|
+
import re
|
|
5
6
|
import shutil
|
|
6
7
|
import threading
|
|
7
8
|
import time
|
|
@@ -35,15 +36,12 @@ console = Console()
|
|
|
35
36
|
|
|
36
37
|
# OCR prompts for different tasks
|
|
37
38
|
OCR_PROMPTS = {
|
|
38
|
-
"convert": """
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
-
|
|
42
|
-
-
|
|
43
|
-
-
|
|
44
|
-
- Include figure/image captions if present
|
|
45
|
-
- Do not describe images, just note their presence as [Figure X] or [Image]
|
|
46
|
-
- Output ONLY the extracted text in markdown, no commentary""",
|
|
39
|
+
"convert": """Convert this document into well-structured markdown.
|
|
40
|
+
|
|
41
|
+
- Maintain headings, paragraphs, lists, and tables (use markdown table format).
|
|
42
|
+
- Represent equations in LaTeX syntax.
|
|
43
|
+
- Preserve figure captions as [Figure N: <caption>]. Do not describe figure contents.
|
|
44
|
+
- Output only the resulting markdown, no commentary.""",
|
|
47
45
|
"extract": """Extract all visible text from this document exactly as it appears.
|
|
48
46
|
Output only the extracted text, preserving line breaks and spacing.""",
|
|
49
47
|
"describe_figure": """Analyze this figure/chart/diagram in detail:
|
|
@@ -101,23 +99,67 @@ class OCRProcessor:
|
|
|
101
99
|
error_str = str(error).lower()
|
|
102
100
|
return "429" in error_str or "rate limit" in error_str or "quota" in error_str
|
|
103
101
|
|
|
102
|
+
# Gemini 3.x Flash models use thinking architecture and need explicit config
|
|
103
|
+
# to avoid empty responses (thinking stalls at low temperature).
|
|
104
|
+
# Does NOT match: gemini-2.x (different thinking API), gemini-3-pro (not Flash)
|
|
105
|
+
_GEMINI_3_FLASH_RE = re.compile(r"gemini-3(?:\.\d+)?-flash")
|
|
106
|
+
|
|
107
|
+
def _build_generation_config(self) -> types.GenerateContentConfig:
|
|
108
|
+
"""Build GenerateContentConfig, adding thinking config for Gemini 3 Flash models."""
|
|
109
|
+
kwargs: dict[str, Any] = {"temperature": 0.1}
|
|
110
|
+
|
|
111
|
+
if self._GEMINI_3_FLASH_RE.search(self.model_name):
|
|
112
|
+
kwargs["thinking_config"] = types.ThinkingConfig(
|
|
113
|
+
thinking_level="MINIMAL",
|
|
114
|
+
)
|
|
115
|
+
|
|
116
|
+
return types.GenerateContentConfig(**kwargs)
|
|
117
|
+
|
|
118
|
+
@staticmethod
|
|
119
|
+
def _extract_text(response: Any) -> str:
|
|
120
|
+
"""Extract text from a GenerateContentResponse by walking parts explicitly.
|
|
121
|
+
|
|
122
|
+
The `.text` shortcut returns None when parts include thought summaries,
|
|
123
|
+
non-text parts, or when finish_reason != STOP — which is common with
|
|
124
|
+
Gemini 3.x thinking models. Walking parts is the reliable path.
|
|
125
|
+
"""
|
|
126
|
+
candidates = getattr(response, "candidates", None) or []
|
|
127
|
+
if not candidates:
|
|
128
|
+
feedback = getattr(response, "prompt_feedback", None)
|
|
129
|
+
raise RuntimeError(f"Empty response: no candidates (prompt_feedback={feedback})")
|
|
130
|
+
|
|
131
|
+
candidate = candidates[0]
|
|
132
|
+
content = getattr(candidate, "content", None)
|
|
133
|
+
parts = getattr(content, "parts", None) or []
|
|
134
|
+
text = "".join(
|
|
135
|
+
p.text for p in parts if getattr(p, "text", None) and not getattr(p, "thought", False)
|
|
136
|
+
).strip()
|
|
137
|
+
|
|
138
|
+
if not text:
|
|
139
|
+
finish = getattr(candidate, "finish_reason", None)
|
|
140
|
+
safety = getattr(candidate, "safety_ratings", None)
|
|
141
|
+
part_types = [type(p).__name__ for p in parts]
|
|
142
|
+
raise RuntimeError(
|
|
143
|
+
f"Empty response: finish_reason={finish}, "
|
|
144
|
+
f"len(parts)={len(parts)}, part_types={part_types}, "
|
|
145
|
+
f"safety_ratings={safety}"
|
|
146
|
+
)
|
|
147
|
+
return text
|
|
148
|
+
|
|
104
149
|
def _call_with_retry(self, contents: list[Any], prompt: str) -> str:
|
|
105
150
|
"""Call generate_content with exponential backoff on transient errors."""
|
|
106
151
|
max_attempts = self.config.max_retries + 1
|
|
107
152
|
base_delay = self.config.retry_base_delay
|
|
153
|
+
config = self._build_generation_config()
|
|
108
154
|
|
|
109
155
|
for attempt in range(max_attempts):
|
|
110
156
|
try:
|
|
111
157
|
response = self.client.models.generate_content(
|
|
112
158
|
model=self.model_name,
|
|
113
159
|
contents=[prompt, *contents],
|
|
114
|
-
config=types.GenerateContentConfig(
|
|
115
|
-
temperature=0.1,
|
|
116
|
-
),
|
|
160
|
+
config=config,
|
|
117
161
|
)
|
|
118
|
-
|
|
119
|
-
return response.text.strip()
|
|
120
|
-
return ""
|
|
162
|
+
return self._extract_text(response)
|
|
121
163
|
except Exception as e:
|
|
122
164
|
is_last = attempt == max_attempts - 1
|
|
123
165
|
if is_last or not self._is_retryable(e):
|
|
@@ -204,6 +246,7 @@ class OCRProcessor:
|
|
|
204
246
|
start_time = time.time()
|
|
205
247
|
self.config.validate_file_size(pdf_path)
|
|
206
248
|
|
|
249
|
+
uploaded_file = None
|
|
207
250
|
try:
|
|
208
251
|
if show_progress and not self.config.quiet:
|
|
209
252
|
with Progress(
|
|
@@ -245,6 +288,13 @@ class OCRProcessor:
|
|
|
245
288
|
error=str(e),
|
|
246
289
|
processing_time=time.time() - start_time,
|
|
247
290
|
)
|
|
291
|
+
finally:
|
|
292
|
+
# Clean up uploaded file from Gemini Files API (48hr retention)
|
|
293
|
+
if uploaded_file is not None:
|
|
294
|
+
try:
|
|
295
|
+
self.client.files.delete(name=uploaded_file.name)
|
|
296
|
+
except Exception as del_err:
|
|
297
|
+
logger.debug(f"Failed to delete uploaded file: {del_err}")
|
|
248
298
|
|
|
249
299
|
def process_file(
|
|
250
300
|
self,
|
|
@@ -281,9 +331,11 @@ class OCRProcessor:
|
|
|
281
331
|
shutil.copy2(result.file_path, original_output)
|
|
282
332
|
|
|
283
333
|
# Write clean markdown — just the OCR text, no headers
|
|
284
|
-
|
|
285
|
-
result.text
|
|
286
|
-
|
|
334
|
+
if result.success:
|
|
335
|
+
markdown_path.write_text(result.text, encoding="utf-8")
|
|
336
|
+
else:
|
|
337
|
+
# Sanitize error: don't leak raw exception details to output files
|
|
338
|
+
markdown_path.write_text("*[OCR Failed]*", encoding="utf-8")
|
|
287
339
|
|
|
288
340
|
# Save extracted images
|
|
289
341
|
if result.extracted_images and self.config.include_images:
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""Retry logic with exponential backoff for API calls."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import time
|
|
5
|
+
from functools import wraps
|
|
6
|
+
from typing import Callable, Tuple, Type, TypeVar
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
T = TypeVar("T")
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class RetryError(Exception):
|
|
14
|
+
"""Raised when all retry attempts are exhausted."""
|
|
15
|
+
|
|
16
|
+
def __init__(self, message: str, last_exception: Exception):
|
|
17
|
+
super().__init__(message)
|
|
18
|
+
self.last_exception = last_exception
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def retry(
|
|
22
|
+
max_attempts: int = 3,
|
|
23
|
+
backoff_factor: float = 2.0,
|
|
24
|
+
initial_delay: float = 1.0,
|
|
25
|
+
max_delay: float = 60.0,
|
|
26
|
+
exceptions: Tuple[Type[Exception], ...] = (Exception,),
|
|
27
|
+
) -> Callable[[Callable[..., T]], Callable[..., T]]:
|
|
28
|
+
"""Decorator for retrying functions with exponential backoff.
|
|
29
|
+
|
|
30
|
+
Args:
|
|
31
|
+
max_attempts: Maximum number of attempts (including first try)
|
|
32
|
+
backoff_factor: Multiplier for delay between retries
|
|
33
|
+
initial_delay: Initial delay in seconds
|
|
34
|
+
max_delay: Maximum delay in seconds
|
|
35
|
+
exceptions: Tuple of exception types to catch and retry
|
|
36
|
+
|
|
37
|
+
Returns:
|
|
38
|
+
Decorated function with retry logic
|
|
39
|
+
"""
|
|
40
|
+
|
|
41
|
+
def decorator(func: Callable[..., T]) -> Callable[..., T]:
|
|
42
|
+
@wraps(func)
|
|
43
|
+
def wrapper(*args, **kwargs) -> T:
|
|
44
|
+
delay = initial_delay
|
|
45
|
+
last_exception = None
|
|
46
|
+
|
|
47
|
+
for attempt in range(1, max_attempts + 1):
|
|
48
|
+
try:
|
|
49
|
+
return func(*args, **kwargs)
|
|
50
|
+
except exceptions as e:
|
|
51
|
+
last_exception = e
|
|
52
|
+
if attempt == max_attempts:
|
|
53
|
+
logger.error(
|
|
54
|
+
f"All {max_attempts} attempts failed for {func.__name__}: {e}"
|
|
55
|
+
)
|
|
56
|
+
raise RetryError(
|
|
57
|
+
f"Failed after {max_attempts} attempts", last_exception
|
|
58
|
+
) from e
|
|
59
|
+
|
|
60
|
+
logger.warning(
|
|
61
|
+
f"Attempt {attempt}/{max_attempts} failed for {func.__name__}: {e}. "
|
|
62
|
+
f"Retrying in {delay:.1f}s..."
|
|
63
|
+
)
|
|
64
|
+
time.sleep(delay)
|
|
65
|
+
delay = min(delay * backoff_factor, max_delay)
|
|
66
|
+
|
|
67
|
+
# Should not reach here, but for type safety
|
|
68
|
+
raise RetryError(f"Failed after {max_attempts} attempts", last_exception)
|
|
69
|
+
|
|
70
|
+
return wrapper
|
|
71
|
+
|
|
72
|
+
return decorator
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def is_retryable_error(error: Exception) -> bool:
|
|
76
|
+
"""Check if an error is retryable.
|
|
77
|
+
|
|
78
|
+
Args:
|
|
79
|
+
error: The exception to check
|
|
80
|
+
|
|
81
|
+
Returns:
|
|
82
|
+
True if the error is typically transient and retryable
|
|
83
|
+
"""
|
|
84
|
+
error_str = str(error).lower()
|
|
85
|
+
|
|
86
|
+
# Rate limit errors
|
|
87
|
+
if "rate" in error_str and "limit" in error_str:
|
|
88
|
+
return True
|
|
89
|
+
if "429" in error_str or "too many requests" in error_str:
|
|
90
|
+
return True
|
|
91
|
+
|
|
92
|
+
# Server errors
|
|
93
|
+
if "500" in error_str or "502" in error_str or "503" in error_str:
|
|
94
|
+
return True
|
|
95
|
+
if "internal" in error_str and "error" in error_str:
|
|
96
|
+
return True
|
|
97
|
+
|
|
98
|
+
# Connection errors
|
|
99
|
+
if "timeout" in error_str:
|
|
100
|
+
return True
|
|
101
|
+
if "connection" in error_str:
|
|
102
|
+
return True
|
|
103
|
+
|
|
104
|
+
return False
|
|
@@ -56,10 +56,12 @@ def get_supported_files(directory: Path, recursive: bool = True) -> list[Path]:
|
|
|
56
56
|
|
|
57
57
|
def sanitize_filename(filename: str, max_length: int | None = 200) -> str:
|
|
58
58
|
"""Sanitize filename for safe filesystem usage."""
|
|
59
|
-
|
|
59
|
+
# Strip null bytes and leading dots (prevent hidden files / path tricks)
|
|
60
|
+
sanitized = filename.replace("\x00", "")
|
|
61
|
+
sanitized = re.sub(r'[<>:"/\\|?*]', "_", sanitized)
|
|
60
62
|
sanitized = re.sub(r"\s+", "_", sanitized)
|
|
61
63
|
sanitized = re.sub(r"_+", "_", sanitized)
|
|
62
|
-
sanitized = sanitized.strip("_")
|
|
64
|
+
sanitized = sanitized.strip("_.")
|
|
63
65
|
if max_length and len(sanitized) > max_length:
|
|
64
66
|
sanitized = sanitized[:max_length]
|
|
65
67
|
return sanitized or "unnamed"
|
|
@@ -66,7 +66,7 @@ def mock_config():
|
|
|
66
66
|
with patch.dict(os.environ, {"GEMINI_API_KEY": "test-api-key"}):
|
|
67
67
|
config = Config()
|
|
68
68
|
config.api_key = "test-api-key"
|
|
69
|
-
config.model = "gemini-3
|
|
69
|
+
config.model = "gemini-3-flash-preview"
|
|
70
70
|
config.verbose = False
|
|
71
71
|
config.quiet = False
|
|
72
72
|
config.max_workers = 1
|
|
@@ -55,7 +55,7 @@ class TestConfigDefaults:
|
|
|
55
55
|
def test_default_model(self):
|
|
56
56
|
with patch.dict(os.environ, {"GEMINI_API_KEY": "test"}, clear=True):
|
|
57
57
|
config = Config()
|
|
58
|
-
assert config.model == "gemini-3
|
|
58
|
+
assert config.model == "gemini-3-flash-preview"
|
|
59
59
|
|
|
60
60
|
def test_default_max_file_size(self):
|
|
61
61
|
with patch.dict(os.environ, {"GEMINI_API_KEY": "test"}, clear=True):
|
|
@@ -40,7 +40,7 @@ class TestMetadataManager:
|
|
|
40
40
|
f.write_bytes(b"fake pdf content")
|
|
41
41
|
|
|
42
42
|
meta = MetadataManager(tmp_path)
|
|
43
|
-
meta.record(f, processing_time=1.5, model="gemini-3
|
|
43
|
+
meta.record(f, processing_time=1.5, model="gemini-3-flash-preview", output_path="test/test.md")
|
|
44
44
|
|
|
45
45
|
assert meta.is_processed(f)
|
|
46
46
|
|
|
@@ -98,12 +98,12 @@ class TestMetadataManager:
|
|
|
98
98
|
f.write_bytes(b"data")
|
|
99
99
|
|
|
100
100
|
meta = MetadataManager(tmp_path)
|
|
101
|
-
meta.record(f, processing_time=2.5, model="gemini-3
|
|
101
|
+
meta.record(f, processing_time=2.5, model="gemini-3-flash-preview", output_path="test/test.md")
|
|
102
102
|
|
|
103
103
|
entry = meta.files["test.pdf"]
|
|
104
104
|
assert entry["status"] == "completed"
|
|
105
105
|
assert entry["processing_time"] == 2.5
|
|
106
|
-
assert entry["model"] == "gemini-3
|
|
106
|
+
assert entry["model"] == "gemini-3-flash-preview"
|
|
107
107
|
assert entry["output_path"] == "test/test.md"
|
|
108
108
|
assert "checksum" in entry
|
|
109
109
|
assert "timestamp" in entry
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|