visual-parser 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,396 @@
1
+ """
2
+ text_extractor.py — Two text-extraction engines for PDF pages.
3
+
4
+ Nougat (default)
5
+ Uses the Facebook Nougat transformer to OCR each page rendered as an image.
6
+ Best for: scanned PDFs, PDFs with equations, complex layouts.
7
+ Requires: nougat_engine.NougatInitializer() result to be passed in.
8
+
9
+ Lightweight (fast)
10
+ Uses PyMuPDF's native text layer (fitz.Page.get_text) with a PyPDFLoader
11
+ fallback. Also extracts embedded equations via regex + pytesseract.
12
+ Best for: born-digital PDFs where text is already machine-readable.
13
+ Requires: only PyMuPDF + langchain_community; no GPU.
14
+
15
+ Both engines:
16
+ • Walk only the PDFs listed in *only_process_these*
17
+ • Skip PDFs already recorded in 04_processed_pdfs.txt
18
+ • Chunk extracted text with RecursiveCharacterTextSplitter
19
+ • Write chunks to 01_chunks_kb.jsonl
20
+ • Return a ``(summary, successful_basenames, failed_basenames, chunks_written)`` tuple
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ import io
26
+ import logging
27
+ import os
28
+ import re
29
+ import time
30
+ from concurrent.futures import ThreadPoolExecutor, as_completed
31
+ from typing import Dict, List, Optional, Tuple
32
+
33
+ import fitz # PyMuPDF
34
+ from PIL import Image
35
+
36
+ from visual_parser.jsonl_writer import append_to_jsonl, make_document_id
37
+ from visual_parser.pdf_tracker import load_processed_pdfs
38
+
39
+ logger = logging.getLogger(__name__)
40
+
41
+
42
+ # ---------------------------------------------------------------------------
43
+ # Shared text-splitter factory
44
+ # ---------------------------------------------------------------------------
45
+
46
def _make_splitter(chunk_size: int = 500, chunk_overlap: int = 100):
    """Build the recursive character splitter shared by both extraction engines.

    The splitter class is imported lazily: the standalone
    ``langchain_text_splitters`` package is tried first, and the legacy
    ``langchain.text_splitter`` module is used when that import fails.

    Args:
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between adjacent chunks, in the same unit.

    Returns:
        A configured ``RecursiveCharacterTextSplitter`` instance.
    """
    try:
        from langchain_text_splitters import (
            RecursiveCharacterTextSplitter as splitter_cls,
        )
    except ImportError:
        # Legacy fallback for installations that only ship ``langchain``.
        from langchain.text_splitter import (
            RecursiveCharacterTextSplitter as splitter_cls,
        )

    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        "separators": ["\n\n", "\n", " ", ""],
    }
    return splitter_cls(**splitter_options)
57
+
58
+
59
+ # ===========================================================================
60
+ # ENGINE 1 — Nougat (transformer-based OCR)
61
+ # ===========================================================================
62
+
63
def nougat_extract_pdfs(
    only_process_these: List[str],
    output_dir: str,
    processor,
    model,
    device: str,
    chunk_size: int = 500,
    chunk_overlap: int = 100,
    max_workers: int = 4,
) -> Tuple[str, List[str], List[str], int]:
    """
    Extract text from each PDF in *only_process_these* using the Nougat model,
    chunk it, and append chunks to ``01_chunks_kb.jsonl`` in *output_dir*.

    PDFs whose basename is already listed in ``04_processed_pdfs.txt`` (inside
    *output_dir*) are skipped. The registry is only read here; this function
    does not itself add entries to it.

    A PDF counts as failed when rasterization yields no pages, when any page
    raises, or when no chunks are produced. Chunks belonging to a failed PDF
    are discarded, so a PDF is either written completely or not at all.

    Args:
        only_process_these: Full paths of PDFs to attempt.
        output_dir:         Directory where JSONL files are written.
        processor:          Nougat AutoProcessor instance.
        model:              Nougat VisionEncoderDecoderModel instance.
        device:             ``'cuda'`` or ``'cpu'``.
        chunk_size:         Characters per chunk.
        chunk_overlap:      Overlap between adjacent chunks.
        max_workers:        Thread-pool size for parallel PDF processing.

    Returns:
        (summary_message, successful_basenames, failed_basenames, chunks_written_this_run)
    """
    registry_path = os.path.join(output_dir, "04_processed_pdfs.txt")
    processed_set = set(load_processed_pdfs(registry_path))
    pdfs_to_run = [p for p in only_process_these if os.path.basename(p) not in processed_set]

    if not pdfs_to_run:
        return "No new PDFs to process (Nougat).", [], [], 0

    # Heavy optional dependencies are imported only once there is real work,
    # so the "nothing to do" path above stays cheap.
    from transformers import StoppingCriteriaList

    from visual_parser.nougat_engine import RasterizePaper, StoppingCriteriaScores

    text_splitter = _make_splitter(chunk_size, chunk_overlap)

    # NOTE(review): every worker thread shares the same processor/model
    # objects - confirm this is safe for the configured device.
    def _process_one(pdf_path: str) -> Tuple[List[Dict], bool]:
        """Run Nougat over one PDF and return ``(chunks, succeeded)``."""
        chunks: List[Dict] = []
        pdf_name = os.path.basename(pdf_path)
        document_id = make_document_id(pdf_name)

        try:
            images = RasterizePaper(pdf=pdf_path, return_pil=True)
            if not images:
                logger.warning("No images rasterized for %s", pdf_name)
                return [], False

            for page_num, image_bytes_obj in enumerate(images):
                image = Image.open(io.BytesIO(image_bytes_obj.getvalue()))
                try:
                    pixel_values = processor(
                        images=image,
                        return_tensors="pt",
                        do_crop_margin=False,
                    ).pixel_values.to(device)
                except TypeError:
                    # Presumably a processor build that rejects the
                    # ``do_crop_margin`` keyword; retry without it.
                    pixel_values = processor(
                        images=image,
                        return_tensors="pt",
                    ).pixel_values.to(device)

                outputs = model.generate(
                    pixel_values,
                    min_length=1,
                    max_length=3584,
                    bad_words_ids=[[processor.tokenizer.unk_token_id]],
                    return_dict_in_generate=True,
                    output_scores=True,
                    stopping_criteria=StoppingCriteriaList([StoppingCriteriaScores()]),
                )

                # outputs[0] is the first field of the generate() result
                # (presumably the generated token sequences).
                generated_text = processor.batch_decode(outputs[0], skip_special_tokens=True)[0]
                # post_process_generation was removed in newer tokenizers builds;
                # fall back to the raw decoded string when the method is absent.
                try:
                    generated_text = processor.post_process_generation(
                        generated_text, fix_markdown=False
                    )
                except AttributeError:
                    pass

                page_label = page_num + 1  # pages are reported 1-based
                for i, chunk_text in enumerate(text_splitter.split_text(generated_text)):
                    chunks.append({
                        "source":      pdf_name,
                        "page":        page_label,
                        "content":     chunk_text,
                        "chunk_index": i,
                        "document_id": document_id,
                        "chunk_id":    f"{document_id}:p{page_label}:c{i}",
                        "extractor":   "nougat",
                    })

        except Exception as exc:
            # Keep the traceback: model failures are hard to diagnose from
            # the message alone.
            logger.error("Nougat failed on %s: %s", pdf_name, exc, exc_info=True)
            return [], False

        if not chunks:
            logger.error("Nougat produced no chunks for %s.", pdf_name)
            return [], False

        return chunks, True

    all_chunks: List[Dict] = []
    processed_basenames: List[str] = []
    failed_basenames: List[str] = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(_process_one, p): p for p in pdfs_to_run}
        # Results are collected in completion order, not submission order.
        for future in as_completed(futures):
            pdf_basename = os.path.basename(futures[future])
            try:
                result, succeeded = future.result()
            except Exception as exc:
                logger.error(
                    "Error collecting result for %s: %s", futures[future], exc, exc_info=True
                )
                failed_basenames.append(pdf_basename)
                continue

            if succeeded and result:
                all_chunks.extend(result)
                processed_basenames.append(pdf_basename)
            else:
                failed_basenames.append(pdf_basename)

    if all_chunks:
        chunks_path = os.path.join(output_dir, "01_chunks_kb.jsonl")
        append_to_jsonl(chunks_path, all_chunks)

    summary = (
        f"Nougat extraction complete. "
        f"{len(processed_basenames)} PDF(s) processed -> {len(all_chunks)} chunks. "
        f"{len(failed_basenames)} PDF(s) failed."
    )
    return summary, processed_basenames, failed_basenames, len(all_chunks)
204
+
205
+
206
+ # ===========================================================================
207
+ # ENGINE 2 — Lightweight (PyMuPDF native text layer)
208
+ # ===========================================================================
209
+
210
# (compiled pattern, wrap_in_dollars) pairs, compiled once at import time.
# When ``wrap_in_dollars`` is true the captured text is re-wrapped as
# ``$$ ... $$``; otherwise the raw match is kept.
_EQUATION_PATTERNS = tuple(
    (re.compile(pattern, re.DOTALL), wrap_in_dollars)
    for pattern, wrap_in_dollars in (
        (r'\$\$(.*?)\$\$', True),
        # Inline math: each delimiter must be a *single* dollar sign, so the
        # delimiters of a ``$$...$$`` block are not re-matched as empty
        # inline equations.
        (r'(?<!\$)\$(?!\$)(.*?)(?<!\$)\$(?!\$)', True),
        (r'\w+\^\w+|\w+\^\{.*?\}', False),
        (r'\\frac\{.*?\}\{.*?\}', False),
        (r'\\int_.*?\^.*? ', False),
        (r'\\log\(.*?\)|\\ln\(.*?\)', False),
        (r'\\begin\{.*?matrix\}(.*?)\\end\{.*?matrix\}', True),
        (r'\\sum_.*?\^.*?', False),
        (r'\\prod_.*?\^.*?', False),
        (r'\\frac{d.*?}{d.*?}|\\partial.*?', False),
        (r'\\[a-zA-Z]+', False),
        (r'\\lim_.*?', False),
        (r'\\vec{.*?}|\\mathbf{.*?}', False),
        (r'\\in|\\cup|\\cap|\\forall|\\exists', False),
        (r'\\langle.*?\\rangle', False),
    )
)


def _find_equations(text: str) -> List[str]:
    """Return every non-blank equation-like match found in *text*."""
    found: List[str] = []
    for compiled, wrap_in_dollars in _EQUATION_PATTERNS:
        for match in compiled.findall(text):
            if not match.strip():
                # An empty capture (e.g. ``$$$$``) carries no information.
                continue
            found.append(f"$$ {match} $$" if wrap_in_dollars else match)
    return found


def _extract_equations(text: str, images: list) -> List[str]:
    """
    Extract LaTeX/math-like patterns from *text* and (optionally) OCR *images*.

    Matches are returned pattern by pattern, first for *text* and then for
    the OCR output of each image in order. The same span may be reported by
    more than one pattern; results are not de-duplicated.

    OCR is best-effort: when ``pytesseract`` cannot be imported the images
    are ignored, and an image whose OCR call raises is skipped.

    Args:
        text:   Page text to scan.
        images: Images to pass to ``pytesseract.image_to_string``; may be empty.

    Returns:
        List of equation strings, possibly empty.
    """
    equations = _find_equations(text)

    if not images:
        return equations

    try:
        import pytesseract
    except ImportError:
        # pytesseract is optional; without it only the text layer is scanned.
        logger.debug("pytesseract not available; skipping image OCR for equations.")
        return equations

    for img in images:
        try:
            ocr_text = pytesseract.image_to_string(img)
        except Exception as exc:
            # One unreadable image (or a missing tesseract binary) must not
            # abort extraction for the rest of the page.
            logger.debug("Equation OCR failed for an image: %s", exc)
            continue
        equations.extend(_find_equations(ocr_text))

    return equations
247
+
248
+
249
def lightweight_extract_pdfs(
    only_process_these: List[str],
    output_dir: str,
    chunk_size: int = 500,
    chunk_overlap: int = 100,
    max_workers: int = 4,
) -> Tuple[str, List[str], List[str], int]:
    """
    Extract text from each PDF in *only_process_these* using PyMuPDF's native
    text layer (fast, no GPU required), chunk it, and append to
    ``01_chunks_kb.jsonl`` in *output_dir*.

    Falls back to LangChain's PyPDFLoader when PyMuPDF fails on a file. Any
    chunks produced by PyMuPDF before it failed are discarded first, so a
    document is never written with a mix of both extractors.

    PDFs whose basename is already listed in ``04_processed_pdfs.txt`` (inside
    *output_dir*) are skipped. The registry is only read here; this function
    does not itself add entries to it.

    Args:
        only_process_these: Full paths of PDFs to attempt.
        output_dir:         Directory where JSONL files are written.
        chunk_size:         Characters per chunk.
        chunk_overlap:      Overlap between adjacent chunks.
        max_workers:        Thread-pool size for parallel PDF processing.

    Returns:
        (summary_message, successful_basenames, failed_basenames, chunks_written_this_run)
    """
    registry_path = os.path.join(output_dir, "04_processed_pdfs.txt")
    processed_set = set(load_processed_pdfs(registry_path))
    pdfs_to_run = [p for p in only_process_these if os.path.basename(p) not in processed_set]

    if not pdfs_to_run:
        return "No new PDFs to process (lightweight).", [], [], 0

    text_splitter = _make_splitter(chunk_size, chunk_overlap)

    def _extract_images_from_page(page) -> list:
        """Return the page's embedded images as PIL images (best-effort)."""
        imgs = []
        for img_info in page.get_images(full=True):
            xref = img_info[0]
            try:
                base_image = page.parent.extract_image(xref)
                image_bytes = base_image.get("image", b"")
                if image_bytes:
                    imgs.append(Image.open(io.BytesIO(image_bytes)))
            except Exception as exc:
                # Images only feed the optional equation OCR, so an
                # unreadable one is skipped rather than failing the page.
                logger.debug("Skipping unreadable embedded image (xref=%s): %s", xref, exc)
        return imgs

    def _chunk_records(
        page_text: str,
        page_number: int,
        pdf_name: str,
        document_id: str,
        extractor: str,
    ) -> List[Dict]:
        """Split *page_text* and wrap each piece in a chunk record."""
        return [
            {
                "source":      pdf_name,
                "page":        page_number,
                "content":     chunk_text,
                "chunk_index": i,
                "document_id": document_id,
                "chunk_id":    f"{document_id}:p{page_number}:c{i}",
                "extractor":   extractor,
            }
            for i, chunk_text in enumerate(text_splitter.split_text(page_text))
        ]

    def _process_one(pdf_path: str) -> Tuple[List[Dict], bool]:
        """Extract one PDF and return ``(chunks, succeeded)``."""
        chunks: List[Dict] = []
        pdf_name = os.path.basename(pdf_path)
        document_id = make_document_id(pdf_name)
        start = time.time()

        try:
            # The context manager closes the document even when a page raises.
            with fitz.open(pdf_path) as doc:
                for page_num in range(len(doc)):
                    page = doc.load_page(page_num)
                    text = page.get_text("text")
                    images = _extract_images_from_page(page)
                    try:
                        equations = _extract_equations(text, images)
                    finally:
                        # Release the decoded images once OCR has seen them.
                        for img in images:
                            img.close()

                    eq_text = "\n\n".join(equations)
                    full_text = (
                        f"{text.strip()}\n\nExtracted Equations:\n{eq_text}"
                        if equations else text.strip()
                    )
                    chunks.extend(
                        _chunk_records(full_text, page_num + 1, pdf_name, document_id, "lightweight")
                    )

        except Exception as exc:
            logger.warning("PyMuPDF failed on %s (%s) — trying PyPDFLoader.", pdf_name, exc)
            # Start over: pages already chunked by PyMuPDF would otherwise be
            # duplicated by the fallback and reuse the same chunk_ids.
            chunks = []
            try:
                from langchain_community.document_loaders import PyPDFLoader
                loader = PyPDFLoader(file_path=pdf_path)
                for page_num, doc_obj in enumerate(loader.load(), start=1):
                    chunks.extend(
                        _chunk_records(
                            doc_obj.page_content or "", page_num, pdf_name, document_id, "pypdf"
                        )
                    )
            except Exception as exc2:
                logger.error(
                    "Both extractors failed on %s: %s", pdf_name, exc2, exc_info=True
                )
                return [], False

        elapsed = time.time() - start
        if not chunks:
            logger.error(
                "Lightweight extraction produced no chunks for %s. "
                "This usually means the PDF is image-only/scanned and has no usable text layer.",
                pdf_name,
            )
            return [], False
        logger.info("Lightweight: %s processed in %.1f s (%d chunks)", pdf_name, elapsed, len(chunks))
        return chunks, True

    all_chunks: List[Dict] = []
    processed_basenames: List[str] = []
    failed_basenames: List[str] = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(_process_one, p): p for p in pdfs_to_run}
        # Results are collected in completion order, not submission order.
        for future in as_completed(futures):
            pdf_basename = os.path.basename(futures[future])
            try:
                result, succeeded = future.result()
            except Exception as exc:
                logger.error(
                    "Error collecting result for %s: %s", futures[future], exc, exc_info=True
                )
                failed_basenames.append(pdf_basename)
                continue

            if succeeded and result:
                all_chunks.extend(result)
                processed_basenames.append(pdf_basename)
            else:
                failed_basenames.append(pdf_basename)

    if all_chunks:
        chunks_path = os.path.join(output_dir, "01_chunks_kb.jsonl")
        append_to_jsonl(chunks_path, all_chunks)

    summary = (
        f"Lightweight extraction complete. "
        f"{len(processed_basenames)} PDF(s) processed -> {len(all_chunks)} chunks. "
        f"{len(failed_basenames)} PDF(s) failed."
    )
    return summary, processed_basenames, failed_basenames, len(all_chunks)
@@ -0,0 +1,269 @@
1
+ """
2
+ vision_llm.py — Thin, cb-free wrapper around OpenAI and Google Gemini vision APIs.
3
+
4
+ Model routing
5
+ -------------
6
+ When the user picks provider "gpt" or "gemini" without specifying a model,
7
+ the pipeline defaults to the most capable current model for each provider:
8
+
9
+ gpt -> gpt-5.4
10
+ (also accepts: gpt-5.5, gpt-5.3-chat-latest, gpt-5.2, gpt-5.1,
11
+ gpt-5, gpt-4o, gpt-4.1)
12
+ gemini → gemini-3-pro-preview
13
+ (also accepts: gemini-2.5-flash, gemini-1.5-pro)
14
+
15
+ GPT-5.x models
16
+ --------------
17
+ GPT-5 reasoning models support a ``reasoning_effort`` parameter instead of
18
+ temperature. This wrapper detects those models and adds the parameter
19
+ automatically. ``gpt-5.3-chat-latest`` is accepted, but follows the non-
20
+ reasoning path used by the main RADIANT-LLM app.
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ import base64
26
+ import io
27
+ import logging
28
+ import os
29
+ from typing import List, Literal, Optional
30
+
31
+ from PIL import Image
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
# Accepted values for the image ``detail`` hint and for ``reasoning_effort``.
DetailLevel = Literal["low", "high", "auto"]
ReasoningEffort = Literal["minimal", "none", "low", "medium", "high", "xhigh"]

# Default model used per provider when the caller does not name one.
LATEST_GPT_MODEL = "gpt-5.4"
LATEST_GEMINI_MODEL = "gemini-3-pro-preview"

# reasoning_effort values this wrapper will forward, keyed by model name.
_GPT_REASONING_EFFORT_OPTIONS = {
    "gpt-5": {"minimal", "low", "medium", "high"},
    "gpt-5.1": {"none", "low", "medium", "high"},
    "gpt-5.2": {"none", "low", "medium", "high", "xhigh"},
    "gpt-5.4": {"none", "low", "medium", "high", "xhigh"},
    "gpt-5.5": {"none", "low", "medium", "high", "xhigh"},
}

# GPT-family models with explicit reasoning_effort support: exactly the
# models that have an entry in the options table above.
_GPT_REASONING_MODELS = set(_GPT_REASONING_EFFORT_OPTIONS)

# GPT-family models accepted by the app but not documented here with
# reasoning_effort support.
_GPT_NO_REASONING_MODELS = {"gpt-5.3-chat-latest"}
56
+
57
+
58
def _supports_reasoning_effort(model: str) -> bool:
    """Tell whether *model* is listed in ``_GPT_REASONING_MODELS``.

    The lookup is case-insensitive: *model* is lower-cased first.
    """
    lowered_name = model.lower()
    return lowered_name in _GPT_REASONING_MODELS
61
+
62
+
63
def _is_gpt5_chat_latest(model: str) -> bool:
    """Tell whether *model* is listed in ``_GPT_NO_REASONING_MODELS``.

    Those are the accepted GPT-5-era models that this wrapper calls without
    ``reasoning_effort``. The lookup is case-insensitive.
    """
    lowered_name = model.lower()
    return lowered_name in _GPT_NO_REASONING_MODELS
66
+
67
+
68
def _normalize_reasoning_effort(
    model: str,
    reasoning_effort: Optional[ReasoningEffort],
) -> Optional[str]:
    """
    Return the lower-cased effort when *model* accepts it, otherwise ``None``.

    A falsy *reasoning_effort* yields ``None`` immediately. A value that is
    not allowed for a model with an entry in ``_GPT_REASONING_EFFORT_OPTIONS``
    is dropped with a warning; for a model without an entry it is dropped
    silently.
    """
    if not reasoning_effort:
        return None

    model_key = model.lower()
    effort = reasoning_effort.lower()
    permitted = _GPT_REASONING_EFFORT_OPTIONS.get(model_key, set())

    if effort in permitted:
        return effort

    if not permitted:
        # Model has no table entry: nothing useful to report.
        return None

    logger.warning(
        "Ignoring unsupported reasoning_effort=%s for model=%s. Allowed: %s",
        reasoning_effort,
        model,
        ", ".join(sorted(permitted)),
    )
    return None
92
+
93
+
94
+ # ---------------------------------------------------------------------------
95
+ # OpenAI / GPT
96
+ # ---------------------------------------------------------------------------
97
+
98
def call_vision_llm_gpt(
    images: List[bytes],
    prompt: str,
    api_key: str,
    model: str = LATEST_GPT_MODEL,
    detail: DetailLevel = "low",
    reasoning_effort: Optional[ReasoningEffort] = "medium",
) -> str:
    """
    Send *images* (PNG bytes) and *prompt* to an OpenAI vision model.

    For supported GPT-5 reasoning models, the ``reasoning_effort`` parameter
    is passed to the API instead of temperature. ``gpt-5.3-chat-latest`` is
    accepted without ``reasoning_effort`` and is called with temperature 1.0;
    every other model is called with temperature 0.

    Args:
        images:           List of raw PNG byte strings.
        prompt:           Text instruction for the model.
        api_key:          OpenAI API key.
        model:            Vision-capable model name.
        detail:           Image resolution hint ('low', 'high', or 'auto').
        reasoning_effort: Reasoning depth for supported GPT-5.x models.
                          Ignored for gpt-5.3-chat-latest and older models
                          such as gpt-4o and gpt-4.1.

    Returns:
        Model response as a plain string, stripped of surrounding whitespace.

    Raises:
        RuntimeError: If the ``openai`` package is missing, *api_key* is
            empty, the API call fails, or the response carries no text.
    """
    try:
        from openai import OpenAI
    except ImportError as exc:
        raise RuntimeError("openai package not installed. Run: pip install openai") from exc

    if not api_key:
        raise RuntimeError("OpenAI API key is not set.")

    client = OpenAI(api_key=api_key)

    # Build the multimodal message content: the prompt first, then one
    # base64 data-URL entry per image.
    content = [{"type": "text", "text": prompt}]
    for img_bytes in images:
        b64 = base64.b64encode(img_bytes).decode("ascii")
        content.append({
            "type": "image_url",
            "image_url": {
                "url": f"data:image/png;base64,{b64}",
                "detail": detail,
            },
        })

    # Build API call kwargs
    call_kwargs: dict = {
        "model": model,
        "messages": [{"role": "user", "content": content}],
    }

    if _supports_reasoning_effort(model):
        # GPT-5 reasoning models: use reasoning_effort; temperature is not supported.
        normalized_effort = _normalize_reasoning_effort(model, reasoning_effort)
        if normalized_effort:
            call_kwargs["reasoning_effort"] = normalized_effort
            logger.info("[GPT-5 reasoning] Using model=%s reasoning_effort=%s", model, normalized_effort)
    elif _is_gpt5_chat_latest(model):
        # Keep parity with the main RADIANT-LLM app for gpt-5.3-chat-latest.
        call_kwargs["temperature"] = 1.0
        logger.info("[GPT-5 chat-latest] Using model=%s temperature=1.0", model)
    else:
        # Older models: standard temperature
        call_kwargs["temperature"] = 0

    # Only the network call is wrapped, so a malformed response is reported
    # by the explicit checks below instead of as an opaque attribute error.
    try:
        response = client.chat.completions.create(**call_kwargs)
    except Exception as exc:
        raise RuntimeError(f"OpenAI vision call failed (model={model}): {exc}") from exc

    choices = getattr(response, "choices", None)
    if not choices:
        raise RuntimeError(
            f"OpenAI vision call failed (model={model}): response contained no choices"
        )

    text = choices[0].message.content
    if text is None:
        # The message can come back without text content.
        raise RuntimeError(
            f"OpenAI vision call failed (model={model}): response contained no text content"
        )
    return text.strip()
173
+
174
+
175
+ # ---------------------------------------------------------------------------
176
+ # Google Gemini
177
+ # ---------------------------------------------------------------------------
178
+
179
def call_vision_llm_gemini(
    images: List[bytes],
    prompt: str,
    api_key: str,
    model: str = LATEST_GEMINI_MODEL,
) -> str:
    """
    Send *prompt* followed by *images* to a Google Gemini vision model.

    Each image is decoded with Pillow and converted to RGB before being
    handed to ``generate_content``.

    Args:
        images:  List of raw PNG byte strings.
        prompt:  Text instruction for the model.
        api_key: Gemini API key.
        model:   Gemini model name.

    Returns:
        The ``text`` of the model response.

    Raises:
        RuntimeError: If ``google-generativeai`` is missing, *api_key* is
            empty, or the generation call fails.
    """
    try:
        import google.generativeai as genai
    except ImportError as exc:
        raise RuntimeError(
            "google-generativeai package not installed. "
            "Run: pip install google-generativeai"
        ) from exc

    if not api_key:
        raise RuntimeError("Gemini API key is not set.")

    genai.configure(api_key=api_key)
    vision_model = genai.GenerativeModel(model)

    # Decode every payload up front; decoding happens outside the try below.
    decoded_images = []
    for raw_bytes in images:
        decoded_images.append(Image.open(io.BytesIO(raw_bytes)).convert("RGB"))
    logger.info("[GEMINI] Using model=%s", model)

    request_parts = [prompt, *decoded_images]
    try:
        reply = vision_model.generate_content(request_parts)
        return reply.text
    except Exception as exc:
        raise RuntimeError(f"Gemini vision call failed (model={model}): {exc}") from exc
219
+
220
+
221
+ # ---------------------------------------------------------------------------
222
+ # Unified dispatcher
223
+ # ---------------------------------------------------------------------------
224
+
225
def call_vision_llm(
    images: List[bytes],
    prompt: str,
    provider: str,
    api_key: str,
    model: str,
    detail: DetailLevel = "low",
    reasoning_effort: Optional[ReasoningEffort] = "medium",
) -> str:
    """
    Dispatch a vision request to the backend selected by *provider*.

    A falsy *model* is replaced by the provider's default
    (``LATEST_GPT_MODEL`` or ``LATEST_GEMINI_MODEL``).

    Args:
        images:           List of raw PNG byte strings.
        prompt:           Text instruction for the model.
        provider:         ``'gpt'`` or ``'gemini'`` (matched exactly).
        api_key:          API key for the chosen provider.
        model:            Model name string.
        detail:           Image detail level; forwarded to the GPT backend only.
        reasoning_effort: Reasoning depth; forwarded to the GPT backend only.

    Returns:
        Model response as a plain string.

    Raises:
        RuntimeError: If *provider* is neither ``'gpt'`` nor ``'gemini'``.
    """
    # Guard clause: reject anything that is not a known provider.
    if provider not in ("gpt", "gemini"):
        raise RuntimeError(
            f"Unknown vision provider: {provider!r}. Must be 'gpt' or 'gemini'."
        )

    if provider == "gpt":
        return call_vision_llm_gpt(
            images,
            prompt,
            api_key,
            model=model or LATEST_GPT_MODEL,
            detail=detail,
            reasoning_effort=reasoning_effort,
        )

    return call_vision_llm_gemini(
        images,
        prompt,
        api_key,
        model=model or LATEST_GEMINI_MODEL,
    )