visual-parser 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- visual_parser/__init__.py +20 -0
- visual_parser/__main__.py +8 -0
- visual_parser/cli.py +230 -0
- visual_parser/cli_main.py +223 -0
- visual_parser/config.py +168 -0
- visual_parser/figure_describer.py +218 -0
- visual_parser/jsonl_writer.py +102 -0
- visual_parser/metadata_extractor.py +94 -0
- visual_parser/nougat_engine.py +222 -0
- visual_parser/pdf_tracker.py +105 -0
- visual_parser/pipeline.py +255 -0
- visual_parser/prompts.py +98 -0
- visual_parser/text_extractor.py +396 -0
- visual_parser/vision_llm.py +269 -0
- visual_parser-1.0.0.dist-info/METADATA +191 -0
- visual_parser-1.0.0.dist-info/RECORD +19 -0
- visual_parser-1.0.0.dist-info/WHEEL +5 -0
- visual_parser-1.0.0.dist-info/entry_points.txt +2 -0
- visual_parser-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,396 @@
|
|
|
1
|
+
"""
|
|
2
|
+
text_extractor.py — Two text-extraction engines for PDF pages.
|
|
3
|
+
|
|
4
|
+
Nougat (default)
|
|
5
|
+
Uses the Facebook Nougat transformer to OCR each page rendered as an image.
|
|
6
|
+
Best for: scanned PDFs, PDFs with equations, complex layouts.
|
|
7
|
+
Requires: nougat_engine.NougatInitializer() result to be passed in.
|
|
8
|
+
|
|
9
|
+
Lightweight (fast)
|
|
10
|
+
Uses PyMuPDF's native text layer (fitz.Page.get_text) with a PyPDFLoader
|
|
11
|
+
fallback. Also extracts embedded equations via regex + pytesseract.
|
|
12
|
+
Best for: born-digital PDFs where text is already machine-readable.
|
|
13
|
+
Requires: only PyMuPDF + langchain_community; no GPU.
|
|
14
|
+
|
|
15
|
+
Both engines:
|
|
16
|
+
• Walk only the PDFs listed in *only_process_these*
|
|
17
|
+
• Skip PDFs already recorded in 04_processed_pdfs.txt
|
|
18
|
+
• Chunk extracted text with RecursiveCharacterTextSplitter
|
|
19
|
+
• Write chunks to 01_chunks_kb.jsonl
|
|
20
|
+
• Return a (summary message, processed basenames, failed basenames, chunks written) tuple
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
import io
|
|
26
|
+
import logging
|
|
27
|
+
import os
|
|
28
|
+
import re
|
|
29
|
+
import time
|
|
30
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
31
|
+
from typing import Dict, List, Optional, Tuple
|
|
32
|
+
|
|
33
|
+
import fitz # PyMuPDF
|
|
34
|
+
from PIL import Image
|
|
35
|
+
|
|
36
|
+
from visual_parser.jsonl_writer import append_to_jsonl, make_document_id
|
|
37
|
+
from visual_parser.pdf_tracker import load_processed_pdfs
|
|
38
|
+
|
|
39
|
+
logger = logging.getLogger(__name__)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# ---------------------------------------------------------------------------
|
|
43
|
+
# Shared text-splitter factory
|
|
44
|
+
# ---------------------------------------------------------------------------
|
|
45
|
+
|
|
46
|
+
def _make_splitter(chunk_size: int = 500, chunk_overlap: int = 100):
    """
    Build the recursive character splitter shared by both extraction engines.

    The splitter class is imported lazily: ``langchain_text_splitters`` is
    tried first and the legacy ``langchain.text_splitter`` module is used
    when that import fails.

    Args:
        chunk_size:    Target chunk size, measured with ``len``.
        chunk_overlap: Overlap between adjacent chunks, measured with ``len``.

    Returns:
        A configured ``RecursiveCharacterTextSplitter`` instance.
    """
    try:
        from langchain_text_splitters import RecursiveCharacterTextSplitter as splitter_cls
    except ImportError:
        # Legacy fallback for older LangChain layouts.
        from langchain.text_splitter import RecursiveCharacterTextSplitter as splitter_cls

    # Separators are tried coarsest-first: paragraph, line, word, character.
    settings = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        "separators": ["\n\n", "\n", " ", ""],
    }
    return splitter_cls(**settings)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# ===========================================================================
|
|
60
|
+
# ENGINE 1 — Nougat (transformer-based OCR)
|
|
61
|
+
# ===========================================================================
|
|
62
|
+
|
|
63
|
+
def nougat_extract_pdfs(
    only_process_these: List[str],
    output_dir: str,
    processor,
    model,
    device: str,
    chunk_size: int = 500,
    chunk_overlap: int = 100,
    max_workers: int = 4,
) -> Tuple[str, List[str], List[str], int]:
    """
    Extract text from each PDF in *only_process_these* using the Nougat model,
    chunk it, and append chunks to ``01_chunks_kb.jsonl`` in *output_dir*.

    PDFs whose basename is already listed in ``04_processed_pdfs.txt`` are
    skipped. This function only reads that registry; it does not update it.
    A PDF is reported as failed when rasterization yields no images, when any
    page raises during processing, or when no chunks are produced.

    Args:
        only_process_these: Full paths of PDFs to attempt.
        output_dir:         Directory where JSONL files are written.
        processor:          Nougat AutoProcessor instance.
        model:              Nougat VisionEncoderDecoderModel instance.
        device:             ``'cuda'`` or ``'cpu'``.
        chunk_size:         Characters per chunk.
        chunk_overlap:      Overlap between adjacent chunks.
        max_workers:        Thread-pool size for parallel PDF processing.

    Returns:
        (summary_message, successful_basenames, failed_basenames, chunks_written_this_run)
    """
    from transformers import StoppingCriteriaList

    from visual_parser.nougat_engine import RasterizePaper, StoppingCriteriaScores

    registry_path = os.path.join(output_dir, "04_processed_pdfs.txt")
    processed_set = set(load_processed_pdfs(registry_path))
    pdfs_to_run = [p for p in only_process_these if os.path.basename(p) not in processed_set]

    if not pdfs_to_run:
        return "No new PDFs to process (Nougat).", [], [], 0

    text_splitter = _make_splitter(chunk_size, chunk_overlap)

    # -----------------------------------------------------------------------
    def _process_one(pdf_path: str) -> Tuple[List[Dict], bool]:
        """Run Nougat over every page of one PDF; return (chunks, succeeded)."""
        chunks: List[Dict] = []
        pdf_name = os.path.basename(pdf_path)
        document_id = make_document_id(pdf_name)

        try:
            images = RasterizePaper(pdf=pdf_path, return_pil=True)
            if not images:
                logger.warning("No images rasterized for %s", pdf_name)
                return [], False

            for page_num, image_bytes_obj in enumerate(images):
                # The context manager releases the decoded image as soon as the
                # processor has turned it into a tensor.
                with Image.open(io.BytesIO(image_bytes_obj.getvalue())) as image:
                    try:
                        pixel_values = processor(
                            images=image,
                            return_tensors="pt",
                            do_crop_margin=False,
                        ).pixel_values.to(device)
                    except TypeError:
                        # Retry without do_crop_margin for processors that
                        # reject that keyword.
                        pixel_values = processor(
                            images=image,
                            return_tensors="pt",
                        ).pixel_values.to(device)

                outputs = model.generate(
                    pixel_values,
                    min_length=1,
                    max_length=3584,
                    bad_words_ids=[[processor.tokenizer.unk_token_id]],
                    return_dict_in_generate=True,
                    output_scores=True,
                    stopping_criteria=StoppingCriteriaList([StoppingCriteriaScores()]),
                )

                generated_text = processor.batch_decode(outputs[0], skip_special_tokens=True)[0]
                # post_process_generation was removed in newer tokenizers builds;
                # fall back to the raw decoded string when the method is absent.
                try:
                    generated_text = processor.post_process_generation(
                        generated_text, fix_markdown=False
                    )
                except AttributeError:
                    pass

                for i, chunk_text in enumerate(text_splitter.split_text(generated_text)):
                    chunks.append({
                        "source": pdf_name,
                        "page": page_num + 1,
                        "content": chunk_text,
                        "chunk_index": i,
                        "document_id": document_id,
                        "chunk_id": f"{document_id}:p{page_num+1}:c{i}",
                        "extractor": "nougat",
                    })

        except Exception as exc:
            # Any page-level failure discards the whole PDF so that no
            # partially extracted document is written.
            logger.error("Nougat failed on %s: %s", pdf_name, exc)
            return [], False

        if not chunks:
            logger.error("Nougat produced no chunks for %s.", pdf_name)
            return [], False

        return chunks, True
    # -----------------------------------------------------------------------

    all_chunks: List[Dict] = []
    processed_basenames: List[str] = []
    failed_basenames: List[str] = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(_process_one, p): p for p in pdfs_to_run}
        for future in as_completed(futures):
            pdf_path = futures[future]
            try:
                result, succeeded = future.result()
                if succeeded and result:
                    all_chunks.extend(result)
                    processed_basenames.append(os.path.basename(pdf_path))
                else:
                    failed_basenames.append(os.path.basename(pdf_path))
            except Exception as exc:
                logger.error("Error collecting result for %s: %s", pdf_path, exc)
                failed_basenames.append(os.path.basename(pdf_path))

    if all_chunks:
        chunks_path = os.path.join(output_dir, "01_chunks_kb.jsonl")
        append_to_jsonl(chunks_path, all_chunks)

    # The summary is built once; the earlier duplicate assignment was
    # overwritten before use.
    summary = (
        f"Nougat extraction complete. "
        f"{len(processed_basenames)} PDF(s) processed -> {len(all_chunks)} chunks. "
        f"{len(failed_basenames)} PDF(s) failed."
    )
    return summary, processed_basenames, failed_basenames, len(all_chunks)
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
# ===========================================================================
|
|
207
|
+
# ENGINE 2 — Lightweight (PyMuPDF native text layer)
|
|
208
|
+
# ===========================================================================
|
|
209
|
+
|
|
210
|
+
def _extract_equations(text: str, images: list) -> List[str]:
|
|
211
|
+
"""
|
|
212
|
+
Extract LaTeX/math-like patterns from *text* and (optionally) OCR *images*.
|
|
213
|
+
"""
|
|
214
|
+
patterns = [
|
|
215
|
+
(r'\$\$(.*?)\$\$', True),
|
|
216
|
+
(r'\$(.*?)\$', True),
|
|
217
|
+
(r'\w+\^\w+|\w+\^\{.*?\}', False),
|
|
218
|
+
(r'\\frac\{.*?\}\{.*?\}', False),
|
|
219
|
+
(r'\\int_.*?\^.*? ', False),
|
|
220
|
+
(r'\\log\(.*?\)|\\ln\(.*?\)', False),
|
|
221
|
+
(r'\\begin\{.*?matrix\}(.*?)\\end\{.*?matrix\}', True),
|
|
222
|
+
(r'\\sum_.*?\^.*?', False),
|
|
223
|
+
(r'\\prod_.*?\^.*?', False),
|
|
224
|
+
(r'\\frac{d.*?}{d.*?}|\\partial.*?', False),
|
|
225
|
+
(r'\\[a-zA-Z]+', False),
|
|
226
|
+
(r'\\lim_.*?', False),
|
|
227
|
+
(r'\\vec{.*?}|\\mathbf{.*?}', False),
|
|
228
|
+
(r'\\in|\\cup|\\cap|\\forall|\\exists', False),
|
|
229
|
+
(r'\\langle.*?\\rangle', False),
|
|
230
|
+
]
|
|
231
|
+
equations: List[str] = []
|
|
232
|
+
for pattern, is_latex in patterns:
|
|
233
|
+
for match in re.findall(pattern, text, re.DOTALL):
|
|
234
|
+
equations.append(f"$$ {match} $$" if is_latex else match)
|
|
235
|
+
|
|
236
|
+
for img in images:
|
|
237
|
+
try:
|
|
238
|
+
import pytesseract
|
|
239
|
+
ocr_text = pytesseract.image_to_string(img)
|
|
240
|
+
for pattern, is_latex in patterns:
|
|
241
|
+
for match in re.findall(pattern, ocr_text, re.DOTALL):
|
|
242
|
+
equations.append(f"$$ {match} $$" if is_latex else match)
|
|
243
|
+
except Exception:
|
|
244
|
+
pass # pytesseract is optional
|
|
245
|
+
|
|
246
|
+
return equations
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def lightweight_extract_pdfs(
    only_process_these: List[str],
    output_dir: str,
    chunk_size: int = 500,
    chunk_overlap: int = 100,
    max_workers: int = 4,
) -> Tuple[str, List[str], List[str], int]:
    """
    Extract text from each PDF in *only_process_these* using PyMuPDF's native
    text layer (fast, no GPU required), chunk it, and append to
    ``01_chunks_kb.jsonl`` in *output_dir*.

    Falls back to LangChain's PyPDFLoader when PyMuPDF fails on a file. Chunks
    gathered by PyMuPDF before the failure are discarded, so each document's
    chunks come from a single extractor. PDFs whose basename is already listed
    in ``04_processed_pdfs.txt`` are skipped. This function only reads that
    registry; it does not update it.

    Args:
        only_process_these: Full paths of PDFs to attempt.
        output_dir:         Directory where JSONL files are written.
        chunk_size:         Characters per chunk.
        chunk_overlap:      Overlap between adjacent chunks.
        max_workers:        Thread-pool size for parallel PDF processing.

    Returns:
        (summary_message, successful_basenames, failed_basenames, chunks_written_this_run)
    """
    from io import BytesIO

    registry_path = os.path.join(output_dir, "04_processed_pdfs.txt")
    processed_set = set(load_processed_pdfs(registry_path))
    pdfs_to_run = [p for p in only_process_these if os.path.basename(p) not in processed_set]

    if not pdfs_to_run:
        return "No new PDFs to process (lightweight).", [], [], 0

    text_splitter = _make_splitter(chunk_size, chunk_overlap)

    # -----------------------------------------------------------------------
    def _extract_images_from_page(page) -> list:
        """Return the page's embedded images as PIL images; unreadable ones are skipped."""
        imgs = []
        for img_info in page.get_images(full=True):
            xref = img_info[0]
            try:
                base_image = page.parent.extract_image(xref)
                image_bytes = base_image.get("image", b"")
                if image_bytes:
                    imgs.append(Image.open(BytesIO(image_bytes)))
            except Exception:
                # Best-effort: the images only feed optional equation OCR.
                pass
        return imgs

    def _process_one(pdf_path: str) -> Tuple[List[Dict], bool]:
        """Extract and chunk one PDF; return (chunks, succeeded)."""
        chunks: List[Dict] = []
        pdf_name = os.path.basename(pdf_path)
        document_id = make_document_id(pdf_name)
        start = time.perf_counter()

        try:
            # The context manager closes the document even if a page raises.
            with fitz.open(pdf_path) as doc:
                for page_num in range(len(doc)):
                    page = doc.load_page(page_num)
                    text = page.get_text("text")
                    images = _extract_images_from_page(page)
                    equations = _extract_equations(text, images)

                    eq_text = "\n\n".join(equations)
                    full_text = (
                        f"{text.strip()}\n\nExtracted Equations:\n{eq_text}"
                        if equations else text.strip()
                    )

                    for i, chunk_text in enumerate(text_splitter.split_text(full_text)):
                        chunks.append({
                            "source": pdf_name,
                            "page": page_num + 1,
                            "content": chunk_text,
                            "chunk_index": i,
                            "document_id": document_id,
                            "chunk_id": f"{document_id}:p{page_num+1}:c{i}",
                            "extractor": "lightweight",
                        })

        except Exception as exc:
            logger.warning("PyMuPDF failed on %s (%s) — trying PyPDFLoader.", pdf_name, exc)
            # Drop the partial PyMuPDF output: the fallback re-reads every
            # page, so keeping it would duplicate the early pages.
            chunks = []
            try:
                from langchain_community.document_loaders import PyPDFLoader
                loader = PyPDFLoader(file_path=pdf_path)
                for page_num, doc_obj in enumerate(loader.load(), start=1):
                    for i, chunk_text in enumerate(
                        text_splitter.split_text(doc_obj.page_content or "")
                    ):
                        chunks.append({
                            "source": pdf_name,
                            "page": page_num,
                            "content": chunk_text,
                            "chunk_index": i,
                            "document_id": document_id,
                            "chunk_id": f"{document_id}:p{page_num}:c{i}",
                            "extractor": "pypdf",
                        })
            except Exception as exc2:
                logger.error("Both extractors failed on %s: %s", pdf_name, exc2)
                return [], False

        elapsed = time.perf_counter() - start
        if not chunks:
            logger.error(
                "Lightweight extraction produced no chunks for %s. "
                "This usually means the PDF is image-only/scanned and has no usable text layer.",
                pdf_name,
            )
            return [], False
        logger.info("Lightweight: %s processed in %.1f s (%d chunks)", pdf_name, elapsed, len(chunks))
        return chunks, True
    # -----------------------------------------------------------------------

    all_chunks: List[Dict] = []
    processed_basenames: List[str] = []
    failed_basenames: List[str] = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(_process_one, p): p for p in pdfs_to_run}
        for future in as_completed(futures):
            pdf_path = futures[future]
            try:
                result, succeeded = future.result()
                if succeeded and result:
                    all_chunks.extend(result)
                    processed_basenames.append(os.path.basename(pdf_path))
                else:
                    failed_basenames.append(os.path.basename(pdf_path))
            except Exception as exc:
                logger.error("Error collecting result for %s: %s", pdf_path, exc)
                failed_basenames.append(os.path.basename(pdf_path))

    if all_chunks:
        chunks_path = os.path.join(output_dir, "01_chunks_kb.jsonl")
        append_to_jsonl(chunks_path, all_chunks)

    # The summary is built once; the earlier duplicate assignment was
    # overwritten before use.
    summary = (
        f"Lightweight extraction complete. "
        f"{len(processed_basenames)} PDF(s) processed -> {len(all_chunks)} chunks. "
        f"{len(failed_basenames)} PDF(s) failed."
    )
    return summary, processed_basenames, failed_basenames, len(all_chunks)
|
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
"""
|
|
2
|
+
vision_llm.py — Thin, cb-free wrapper around OpenAI and Google Gemini vision APIs.
|
|
3
|
+
|
|
4
|
+
Model routing
|
|
5
|
+
-------------
|
|
6
|
+
When the user picks provider "gpt" or "gemini" without specifying a model,
|
|
7
|
+
the pipeline defaults to the most capable current model for each provider:
|
|
8
|
+
|
|
9
|
+
gpt -> gpt-5.4
|
|
10
|
+
(also accepts: gpt-5.5, gpt-5.3-chat-latest, gpt-5.2, gpt-5.1,
|
|
11
|
+
gpt-5, gpt-4o, gpt-4.1)
|
|
12
|
+
gemini -> gemini-3-pro-preview
|
|
13
|
+
(also accepts: gemini-2.5-flash, gemini-1.5-pro)
|
|
14
|
+
|
|
15
|
+
GPT-5.x models
|
|
16
|
+
--------------
|
|
17
|
+
GPT-5 reasoning models support a ``reasoning_effort`` parameter instead of
|
|
18
|
+
temperature. This wrapper detects those models and adds the parameter
|
|
19
|
+
automatically. ``gpt-5.3-chat-latest`` is accepted, but follows the non-
|
|
20
|
+
reasoning path used by the main RADIANT-LLM app.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
import base64
|
|
26
|
+
import io
|
|
27
|
+
import logging
|
|
28
|
+
import os
|
|
29
|
+
from typing import List, Literal, Optional
|
|
30
|
+
|
|
31
|
+
from PIL import Image
|
|
32
|
+
|
|
33
|
+
logger = logging.getLogger(__name__)
|
|
34
|
+
|
|
35
|
+
DetailLevel = Literal["low", "high", "auto"]
|
|
36
|
+
ReasoningEffort = Literal["minimal", "none", "low", "medium", "high", "xhigh"]
|
|
37
|
+
|
|
38
|
+
# GPT-family models with explicit reasoning_effort support.
|
|
39
|
+
_GPT_REASONING_MODELS = {"gpt-5", "gpt-5.1", "gpt-5.2", "gpt-5.4", "gpt-5.5"}
|
|
40
|
+
|
|
41
|
+
# GPT-family models accepted by the app but not documented here with
|
|
42
|
+
# reasoning_effort support.
|
|
43
|
+
_GPT_NO_REASONING_MODELS = {"gpt-5.3-chat-latest"}
|
|
44
|
+
|
|
45
|
+
_GPT_REASONING_EFFORT_OPTIONS = {
|
|
46
|
+
"gpt-5": {"minimal", "low", "medium", "high"},
|
|
47
|
+
"gpt-5.1": {"none", "low", "medium", "high"},
|
|
48
|
+
"gpt-5.2": {"none", "low", "medium", "high", "xhigh"},
|
|
49
|
+
"gpt-5.4": {"none", "low", "medium", "high", "xhigh"},
|
|
50
|
+
"gpt-5.5": {"none", "low", "medium", "high", "xhigh"},
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
# Latest default model per provider
|
|
54
|
+
LATEST_GPT_MODEL = "gpt-5.4"
|
|
55
|
+
LATEST_GEMINI_MODEL = "gemini-3-pro-preview"
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _supports_reasoning_effort(model: str) -> bool:
    """Report whether this wrapper sends ``reasoning_effort`` for *model*.

    The lookup is case-insensitive and requires an exact name match against
    ``_GPT_REASONING_MODELS``.
    """
    model_key = model.lower()
    return model_key in _GPT_REASONING_MODELS
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _is_gpt5_chat_latest(model: str) -> bool:
    """Report whether *model* is an accepted GPT-5-era model without reasoning_effort.

    The lookup is case-insensitive and requires an exact name match against
    ``_GPT_NO_REASONING_MODELS``.
    """
    model_key = model.lower()
    return model_key in _GPT_NO_REASONING_MODELS
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _normalize_reasoning_effort(
    model: str,
    reasoning_effort: Optional[ReasoningEffort],
) -> Optional[str]:
    """
    Return the lower-cased effort value if *model* accepts it, else ``None``.

    A warning is logged only when the model has a known set of allowed
    values and the requested one is not among them. A missing effort, or a
    model with no entry in the options table, yields ``None`` silently.
    """
    if not reasoning_effort:
        return None

    permitted = _GPT_REASONING_EFFORT_OPTIONS.get(model.lower(), set())
    effort = reasoning_effort.lower()

    if not permitted:
        # Model has no entry in the options table: nothing to validate against.
        return None

    if effort not in permitted:
        logger.warning(
            "Ignoring unsupported reasoning_effort=%s for model=%s. Allowed: %s",
            reasoning_effort,
            model,
            ", ".join(sorted(permitted)),
        )
        return None

    return effort
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
# ---------------------------------------------------------------------------
|
|
95
|
+
# OpenAI / GPT
|
|
96
|
+
# ---------------------------------------------------------------------------
|
|
97
|
+
|
|
98
|
+
def call_vision_llm_gpt(
    images: List[bytes],
    prompt: str,
    api_key: str,
    model: str = LATEST_GPT_MODEL,
    detail: DetailLevel = "low",
    reasoning_effort: Optional[ReasoningEffort] = "medium",
) -> str:
    """
    Send *images* (PNG bytes) and *prompt* to an OpenAI vision model.

    For supported GPT-5 reasoning models, the ``reasoning_effort`` parameter
    is passed to the API instead of temperature. ``gpt-5.3-chat-latest`` is
    accepted without ``reasoning_effort`` and is called with temperature 1.0;
    every other model is called with temperature 0.

    Args:
        images:           List of raw PNG byte strings.
        prompt:           Text instruction for the model.
        api_key:          OpenAI API key.
        model:            Vision-capable model name.
        detail:           Image resolution hint ('low', 'high', or 'auto').
        reasoning_effort: Reasoning depth for supported GPT-5.x models.
                          Ignored for gpt-5.3-chat-latest and older models
                          such as gpt-4o and gpt-4.1.

    Returns:
        Model response as a plain string, stripped of surrounding whitespace.

    Raises:
        RuntimeError: If the ``openai`` package is missing, *api_key* is
            empty, the API call fails, or the response carries no text.
    """
    try:
        from openai import OpenAI
    except ImportError as exc:
        raise RuntimeError("openai package not installed. Run: pip install openai") from exc

    if not api_key:
        raise RuntimeError("OpenAI API key is not set.")

    client = OpenAI(api_key=api_key)

    # Build the multimodal message content: the prompt first, then one
    # base64 data-URL part per image.
    content = [{"type": "text", "text": prompt}]
    for img_bytes in images:
        b64 = base64.b64encode(img_bytes).decode("ascii")
        content.append({
            "type": "image_url",
            "image_url": {
                "url": f"data:image/png;base64,{b64}",
                "detail": detail,
            },
        })

    # Build API call kwargs
    call_kwargs: dict = {
        "model": model,
        "messages": [{"role": "user", "content": content}],
    }

    if _supports_reasoning_effort(model):
        # GPT-5 reasoning models: use reasoning_effort; temperature is not supported.
        normalized_effort = _normalize_reasoning_effort(model, reasoning_effort)
        if normalized_effort:
            call_kwargs["reasoning_effort"] = normalized_effort
        logger.info("[GPT-5 reasoning] Using model=%s reasoning_effort=%s", model, normalized_effort)
    elif _is_gpt5_chat_latest(model):
        # Keep parity with the main RADIANT-LLM app for gpt-5.3-chat-latest.
        call_kwargs["temperature"] = 1.0
        logger.info("[GPT-5 chat-latest] Using model=%s temperature=1.0", model)
    else:
        # Older models: standard temperature
        call_kwargs["temperature"] = 0

    try:
        response = client.chat.completions.create(**call_kwargs)
    except Exception as exc:
        raise RuntimeError(f"OpenAI vision call failed (model={model}): {exc}") from exc

    # The message content can be None (for example on a refusal) and the
    # choices list can be empty; report that explicitly instead of failing
    # on ``None.strip()``.
    choices = getattr(response, "choices", None) or []
    text = choices[0].message.content if choices else None
    if text is None:
        raise RuntimeError(
            f"OpenAI vision call failed (model={model}): response contained no text content"
        )
    return text.strip()
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
# ---------------------------------------------------------------------------
|
|
176
|
+
# Google Gemini
|
|
177
|
+
# ---------------------------------------------------------------------------
|
|
178
|
+
|
|
179
|
+
def call_vision_llm_gemini(
    images: List[bytes],
    prompt: str,
    api_key: str,
    model: str = LATEST_GEMINI_MODEL,
) -> str:
    """
    Send *prompt* followed by *images* (PNG bytes) to a Google Gemini vision model.

    Each image is decoded with PIL and converted to RGB before the request
    is made.

    Args:
        images:  List of raw PNG byte strings.
        prompt:  Text instruction for the model.
        api_key: Gemini API key.
        model:   Gemini model name.

    Returns:
        Model response as a plain string.

    Raises:
        RuntimeError: If ``google-generativeai`` is missing, *api_key* is
            empty, or the generation call fails.
    """
    try:
        import google.generativeai as genai
    except ImportError as exc:
        raise RuntimeError(
            "google-generativeai package not installed. "
            "Run: pip install google-generativeai"
        ) from exc

    if not api_key:
        raise RuntimeError("Gemini API key is not set.")

    genai.configure(api_key=api_key)
    gemini_client = genai.GenerativeModel(model)

    # Decode outside the try block: image decoding errors propagate unwrapped.
    decoded_images = []
    for raw_png in images:
        decoded_images.append(Image.open(io.BytesIO(raw_png)).convert("RGB"))
    logger.info("[GEMINI] Using model=%s", model)

    request_parts = [prompt, *decoded_images]
    try:
        reply = gemini_client.generate_content(request_parts)
        return reply.text
    except Exception as exc:
        raise RuntimeError(f"Gemini vision call failed (model={model}): {exc}") from exc
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
# ---------------------------------------------------------------------------
|
|
222
|
+
# Unified dispatcher
|
|
223
|
+
# ---------------------------------------------------------------------------
|
|
224
|
+
|
|
225
|
+
def call_vision_llm(
    images: List[bytes],
    prompt: str,
    provider: str,
    api_key: str,
    model: str,
    detail: DetailLevel = "low",
    reasoning_effort: Optional[ReasoningEffort] = "medium",
) -> str:
    """
    Route a vision request to the GPT or Gemini backend.

    When *model* is empty or None, the provider's latest default model is
    used instead. *provider* is matched exactly (case-sensitive).

    Args:
        images:           List of raw PNG byte strings.
        prompt:           Text instruction for the model.
        provider:         ``'gpt'`` or ``'gemini'``.
        api_key:          API key for the chosen provider.
        model:            Model name string.
        detail:           Image detail level (GPT only; ignored for Gemini).
        reasoning_effort: Reasoning depth for GPT-5.x (ignored for older GPT
                          models and all Gemini models).

    Returns:
        Model response as a plain string.

    Raises:
        RuntimeError: If *provider* is neither ``'gpt'`` nor ``'gemini'``.
    """
    # Pick the model: an explicit name wins, otherwise the provider default.
    if model:
        resolved_model = model
    elif provider == "gpt":
        resolved_model = LATEST_GPT_MODEL
    else:
        resolved_model = LATEST_GEMINI_MODEL

    if provider == "gemini":
        return call_vision_llm_gemini(images, prompt, api_key, model=resolved_model)

    if provider != "gpt":
        raise RuntimeError(
            f"Unknown vision provider: {provider!r}. Must be 'gpt' or 'gemini'."
        )

    return call_vision_llm_gpt(
        images, prompt, api_key,
        model=resolved_model,
        detail=detail,
        reasoning_effort=reasoning_effort,
    )
|