cat-stack 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2087 @@
1
+ import warnings
2
+
3
+ from .text_functions import _detect_model_source
4
+ from .calls.pdf_stepback import get_pdf_stepback_insight
5
+
6
+ # Exported names (excludes deprecated pdf_multi_class)
7
+ __all__ = [
8
+ "_load_pdf_files",
9
+ "_get_pdf_pages",
10
+ "_extract_page_as_pdf_bytes",
11
+ "_extract_page_as_image_bytes",
12
+ "_encode_bytes_to_base64",
13
+ "_extract_page_text",
14
+ "explore_pdf_categories",
15
+ ]
16
+ from .calls.pdf_CoVe import (
17
+ pdf_chain_of_verification_openai,
18
+ pdf_chain_of_verification_anthropic,
19
+ pdf_chain_of_verification_google,
20
+ pdf_chain_of_verification_mistral
21
+ )
22
+
23
+
24
def _anthropic_supports_pdf(model_name):
    """Report whether an Anthropic model name is treated as PDF-capable.

    The decision is a case-insensitive substring heuristic on the name:
    any name containing "haiku" is rejected, any other name containing
    "sonnet" or "opus" is accepted, and everything else is rejected.

    Args:
        model_name: Model identifier, e.g. "claude-3-5-sonnet-20241022".

    Returns:
        bool: True when the name matches a family assumed to accept native
        PDF input, False otherwise (including for None / non-string input).
    """
    # A missing or non-string name is an "unknown model": answer False
    # instead of failing on .lower().
    if not isinstance(model_name, str):
        return False

    model_lower = model_name.lower()

    # Haiku is checked first so it wins over any other family keyword.
    if "haiku" in model_lower:
        return False

    # Unknown families fall through to False to stay on the safe side.
    return any(family in model_lower for family in ("sonnet", "opus"))
39
+
40
+
41
def _load_pdf_files(pdf_input):
    """Resolve a PDF input specification into a list of PDF paths.

    Args:
        pdf_input: One of
            - a list of paths, returned unchanged;
            - the path of a single existing file, returned as a one-item list;
            - the path of a directory, whose entries ending in ".pdf"
              (any letter case) are returned, sorted by file name.

    Returns:
        list: PDF paths. For a directory each path is the directory joined
        with the entry name.

    Raises:
        FileNotFoundError: If ``pdf_input`` is neither a list, an existing
            file, nor an existing directory.
    """
    import os

    if isinstance(pdf_input, list):
        pdf_files = pdf_input
        print(f"Provided a list of {len(pdf_input)} PDFs.")
    elif os.path.isfile(pdf_input):
        # Single file path
        pdf_files = [pdf_input]
        print("Provided 1 PDF file.")
    elif os.path.isdir(pdf_input):
        # Scan the directory once and compare the extension case-insensitively.
        # Each entry appears exactly once, so no de-duplication is needed:
        # de-duplicating on the lower-cased path would drop distinct files
        # such as "a.pdf" and "A.PDF" on case-sensitive filesystems.
        # Listing (rather than globbing) also keeps directory names that
        # contain glob metacharacters like "[" working.
        # Dot-files are skipped, matching what a "*.pdf" glob would return.
        pdf_files = [
            os.path.join(pdf_input, entry)
            for entry in sorted(os.listdir(pdf_input))
            if not entry.startswith(".") and entry.lower().endswith(".pdf")
        ]
        print(f"Found {len(pdf_files)} PDFs in directory.")
    else:
        raise FileNotFoundError(f"PDF input not found: {pdf_input}")

    return pdf_files
70
+
71
+
72
def _get_pdf_pages(pdf_path):
    """
    Enumerate the pages of a PDF as lightweight page descriptors.

    Returns a list of tuples: [(pdf_path, page_index, page_label), ...]

    For 'document.pdf' with 3 pages:
    [('document.pdf', 0, "document_p1"),
     ('document.pdf', 1, "document_p2"),
     ('document.pdf', 2, "document_p3")]

    Only the page count is read here; the actual page data is extracted
    later based on provider needs.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        list: One (pdf_path, zero-based index, label) tuple per page. An
        empty list is returned (after printing a message) when the PDF has
        no pages or cannot be opened.

    Raises:
        ImportError: If PyMuPDF (``fitz``) is not installed.
    """
    from pathlib import Path

    try:
        import fitz  # PyMuPDF
    except ImportError as exc:
        raise ImportError(
            "PyMuPDF is required for PDF processing. "
            "Install it with: pip install PyMuPDF"
        ) from exc

    pdf_name = Path(pdf_path).stem  # filename without extension

    try:
        # The context manager closes the document even if len() raises.
        with fitz.open(pdf_path) as doc:
            page_count = len(doc)
    except Exception as e:
        # Best effort: an unreadable PDF contributes no pages instead of
        # aborting the whole batch.
        print(f"Error reading PDF {pdf_path}: {e}")
        return []

    if page_count == 0:
        print(f"Warning: {pdf_path} has no pages")
        return []

    # Labels are 1-based ("_p1") while the stored index stays 0-based.
    return [(pdf_path, i, f"{pdf_name}_p{i + 1}") for i in range(page_count)]
114
+
115
+
116
def _extract_page_as_pdf_bytes(pdf_path, page_index):
    """
    Extract a single page from a PDF as the bytes of a one-page PDF.
    Used for providers with native PDF support (Anthropic, Google).

    Args:
        pdf_path: Path of the source PDF.
        page_index: Zero-based page number. A negative value counts from
            the end (-1 is the last page).

    Returns:
        tuple: ``(pdf_bytes, True)`` on success, ``(None, False)`` when the
        page could not be extracted (the error is printed).
    """
    import fitz  # PyMuPDF

    try:
        # Context managers close both documents even when extraction fails.
        with fitz.open(pdf_path) as doc:
            page_count = len(doc)

            # insert_pdf() treats negative page numbers as "use the default"
            # (from_page=-1 / to_page=-1 copies the whole document), so a
            # negative index must be resolved to a real page number first.
            index = page_index + page_count if page_index < 0 else page_index
            if not 0 <= index < page_count:
                raise IndexError("page not in document")

            # Create a new PDF with just this page
            with fitz.open() as new_doc:
                new_doc.insert_pdf(doc, from_page=index, to_page=index)
                pdf_bytes = new_doc.tobytes()

        return pdf_bytes, True

    except Exception as e:
        print(f"Error extracting page {page_index} from {pdf_path}: {e}")
        return None, False
140
+
141
+
142
def _extract_page_as_image_bytes(pdf_path, page_index, dpi=150):
    """
    Render a single PDF page to PNG image bytes.
    Used for providers without native PDF support (OpenAI, Mistral, etc.).

    Args:
        pdf_path: Path of the source PDF.
        page_index: Index of the page to render.
        dpi: Target resolution; the page is scaled by ``dpi / 72``.

    Returns:
        tuple: ``(png_bytes, True)`` on success, ``(None, False)`` when the
        page could not be rendered (the error is printed).
    """
    import fitz  # PyMuPDF

    try:
        # The context manager closes the document even if rendering raises.
        with fitz.open(pdf_path) as doc:
            page = doc[page_index]

            # Render page to image; 72 is the default PDF DPI, so dpi / 72
            # is the zoom factor applied on both axes.
            zoom = dpi / 72
            pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))

            # Get PNG bytes
            image_bytes = pix.tobytes("png")

        return image_bytes, True

    except Exception as e:
        print(f"Error rendering page {page_index} from {pdf_path}: {e}")
        return None, False
166
+
167
+
168
def _encode_bytes_to_base64(data_bytes):
    """Return the standard base64 encoding of ``data_bytes`` as a str."""
    import base64

    encoded = base64.b64encode(data_bytes)
    return str(encoded, "utf-8")
172
+
173
+
174
def _extract_page_text(pdf_path, page_index):
    """
    Extract the text content of a single PDF page.
    Used for text-based processing mode.

    Args:
        pdf_path: Path of the source PDF.
        page_index: Index of the page to read.

    Returns:
        tuple: ``(text, True, None)`` with surrounding whitespace stripped
        on success; ``(None, False, message)`` when the page has no
        extractable text or reading fails (failures are also printed).
    """
    import fitz  # PyMuPDF

    try:
        # The context manager closes the document even if extraction raises.
        with fitz.open(pdf_path) as doc:
            text = doc[page_index].get_text("text")
    except Exception as e:
        print(f"Error extracting text from page {page_index} of {pdf_path}: {e}")
        return None, False, str(e)

    stripped = text.strip()
    if not stripped:
        # Whitespace-only pages (e.g. scanned images) count as "no text".
        return None, False, "Page contains no extractable text"

    return stripped, True, None
195
+
196
+
197
+ # PDF multi-class (binary) function
198
+ def pdf_multi_class(
199
+ pdf_description,
200
+ pdf_input,
201
+ categories,
202
+ api_key,
203
+ user_model="gpt-4o",
204
+ mode="image",
205
+ creativity=None,
206
+ safety=False,
207
+ chain_of_verification=False,
208
+ chain_of_thought=True,
209
+ step_back_prompt=False,
210
+ context_prompt=False,
211
+ thinking_budget=0,
212
+ example1=None,
213
+ example2=None,
214
+ example3=None,
215
+ example4=None,
216
+ example5=None,
217
+ example6=None,
218
+ filename=None,
219
+ save_directory=None,
220
+ model_source="auto",
221
+ progress_callback=None
222
+ ):
223
+ """
224
+ Categorize PDF pages using LLMs with multi-label classification.
225
+
226
+ Each page of each PDF is processed separately, with output labeled as
227
+ {pdf_name}_p{page_number} (e.g., "report_p1", "report_p2").
228
+
229
+ Args:
230
+ pdf_description (str): Description of the PDF documents being categorized.
231
+ pdf_input (str or list): Directory path containing PDFs, or list of PDF file paths.
232
+ categories (list or "auto"): List of category names for classification,
233
+ or "auto" to automatically extract categories from the PDFs first.
234
+ api_key (str): API key for the model provider.
235
+ user_model (str): Model name to use. Default "gpt-4o".
236
+ mode (str): How to process PDF pages. Options:
237
+ - "image": Render pages as images (best for visual elements like charts/tables)
238
+ - "text": Extract text only (best for text-heavy documents, faster/cheaper)
239
+ - "both": Send both text and image (most comprehensive but slower/costlier)
240
+ Default is "image".
241
+ creativity (float): Temperature setting. None uses model default.
242
+ safety (bool): If True, saves progress after each page.
243
+ chain_of_verification (bool): Enable Chain of Verification for accuracy.
244
+ chain_of_thought (bool): Enable step-by-step reasoning. Default True.
245
+ step_back_prompt (bool): Enable step-back prompting for abstract thinking.
246
+ context_prompt (bool): Add expert context to prompts.
247
+ thinking_budget (int): Token budget for thinking (Google models).
248
+ example1-6 (str): Example categorizations for few-shot learning.
249
+ filename (str): Output filename for CSV.
250
+ save_directory (str): Directory to save results.
251
+ model_source (str): Provider - "auto", "openai", "anthropic", "google",
252
+ "mistral", "perplexity", "huggingface", "xai".
253
+
254
+ Returns:
255
+ pd.DataFrame: Results with columns:
256
+ - pdf_input: Page label (e.g., "report_p1")
257
+ - model_response: Raw model response
258
+ - json: Extracted JSON
259
+ - category_1, category_2, ...: Binary category assignments
260
+ - processing_status: "success" or "error"
261
+
262
+ Example:
263
+ >>> import cat_stack as cat
264
+ >>> # Image mode (default) - good for documents with charts/tables
265
+ >>> results = cat.pdf_multi_class(
266
+ ... pdf_description="financial reports",
267
+ ... pdf_input="/path/to/pdfs/",
268
+ ... categories=["has_chart", "has_table", "is_summary"],
269
+ ... api_key="your-api-key",
270
+ ... mode="image"
271
+ ... )
272
+ >>> # Text mode - good for text-heavy documents, faster and cheaper
273
+ >>> results = cat.pdf_multi_class(
274
+ ... pdf_description="research papers",
275
+ ... pdf_input="/path/to/pdfs/",
276
+ ... categories=["discusses_methodology", "has_results"],
277
+ ... api_key="your-api-key",
278
+ ... mode="text"
279
+ ... )
280
+
281
+ .. deprecated::
282
+ Use :func:`cat_stack.classify` instead. This function will be removed in a future version.
283
+ """
284
+ warnings.warn(
285
+ "pdf_multi_class() is deprecated and will be removed in a future version. "
286
+ "Use cat_stack.classify() instead, which auto-detects PDF input.",
287
+ DeprecationWarning,
288
+ stacklevel=2,
289
+ )
290
+
291
+ import os
292
+ import json
293
+ import pandas as pd
294
+ import regex
295
+ import time
296
+ from tqdm import tqdm
297
+
298
+ if save_directory is not None and not os.path.isdir(save_directory):
299
+ raise FileNotFoundError(f"Directory {save_directory} doesn't exist")
300
+
301
+ # Validate mode parameter
302
+ mode = mode.lower()
303
+ if mode not in {"image", "text", "both"}:
304
+ raise ValueError(f"mode must be 'image', 'text', or 'both', got: {mode}")
305
+
306
+ model_source = _detect_model_source(user_model, model_source)
307
+
308
+ # Providers with native PDF support (only used in image/both modes)
309
+ native_pdf_providers = {"anthropic", "google"}
310
+
311
+ print(f"Processing mode: {mode}")
312
+
313
+ # Load PDF files
314
+ pdf_files = _load_pdf_files(pdf_input)
315
+
316
+ # Extract all pages from all PDFs
317
+ all_pages = [] # List of (pdf_path, page_index, page_label)
318
+ for pdf_path in pdf_files:
319
+ pages = _get_pdf_pages(pdf_path)
320
+ all_pages.extend(pages)
321
+
322
+ print(f"Total pages to process: {len(all_pages)}")
323
+
324
+ # Handle "auto" categories - extract categories first
325
+ if categories == "auto":
326
+ if not pdf_description:
327
+ raise ValueError("pdf_description is required when using categories='auto'")
328
+
329
+ print("\nAuto-extracting categories from PDFs...")
330
+ auto_result = explore_pdf_categories(
331
+ pdf_input=pdf_input,
332
+ api_key=api_key,
333
+ pdf_description=pdf_description,
334
+ user_model=user_model,
335
+ mode=mode,
336
+ model_source=model_source,
337
+ creativity=creativity
338
+ )
339
+ categories = auto_result["top_categories"]
340
+ print(f"Extracted {len(categories)} categories: {categories}\n")
341
+
342
+ categories_str = "\n".join(f"{i + 1}. {cat}" for i, cat in enumerate(categories))
343
+ cat_num = len(categories)
344
+ category_dict = {str(i+1): "0" for i in range(cat_num)}
345
+ example_JSON = json.dumps(category_dict, indent=4)
346
+
347
+ print(f"\nCategories to classify by {model_source} {user_model}:")
348
+ for i, cat in enumerate(categories, 1):
349
+ print(f"{i}. {cat}")
350
+
351
+ # Build examples text from provided examples
352
+ examples = [example1, example2, example3, example4, example5, example6]
353
+ examples = [ex for ex in examples if ex is not None]
354
+ if examples:
355
+ examples_text = "Here are some examples of how to categorize:\n" + "\n".join(examples)
356
+ else:
357
+ examples_text = ""
358
+
359
+ # Helper function for CoVe
360
def remove_numbering(line):
    """Strip a leading list marker from one line of text.

    Recognised markers are a "- " or "• " bullet, and a run of digits
    followed directly by "." or ")". The remainder is returned with
    surrounding whitespace removed; a line without a recognised marker is
    returned stripped but otherwise unchanged.
    """
    text = line.strip()

    # Bullet markers are exactly two characters long.
    if text.startswith(('- ', '• ')):
        return text[2:].strip()

    if not text or not text[0].isdigit():
        return text

    # Skip over the leading digits, then require "." or ")" right after.
    pos = 1
    while pos < len(text) and text[pos].isdigit():
        pos += 1
    if pos < len(text) and text[pos] in ('.', ')'):
        return text[pos + 1:].strip()

    return text
375
+
376
+ # Step-back insight initialization
377
+ if step_back_prompt:
378
+ stepback = f"""What are the key content patterns or elements that typically indicate the presence of these categories in document pages showing "{pdf_description}"?
379
+
380
+ Categories to consider:
381
+ {categories_str}
382
+
383
+ Provide a brief analysis of what content cues to look for when categorizing such document pages."""
384
+
385
+ stepback_insight, step_back_added = get_pdf_stepback_insight(
386
+ model_source, stepback, api_key, user_model, creativity
387
+ )
388
+ else:
389
+ stepback_insight = None
390
+ step_back_added = False
391
+
392
+ page_labels = []
393
+ link1 = []
394
+ extracted_jsons = []
395
+
396
def _build_base_prompt_text(page_text=None):
    """Assemble the classification prompt text for one page.

    The wording depends on the enclosing call's ``mode``; the step-by-step
    section is included only when ``chain_of_thought`` is set, the extracted
    page text is appended for "text"/"both" modes, and an expert-context
    preamble is prepended when ``context_prompt`` is set.

    Args:
        page_text: Text extracted from the page, or None when unavailable.

    Returns:
        str: The complete prompt text.
    """
    # Mode-specific opening instruction; anything else is image mode.
    instruction_by_mode = {
        "text": "Examine the following text extracted from a PDF page",
        "both": "Examine the attached PDF page image AND the extracted text below",
    }
    examine_instruction = instruction_by_mode.get(mode, "Examine the attached PDF page")

    task_section = (
        f"You are a document-tagging assistant.\n"
        f"Task ► {examine_instruction} and decide, **for each category below**, "
        f"whether it is PRESENT (1) or NOT PRESENT (0).\n\n"
        f"Document page is expected to contain: {pdf_description}\n\n"
        f"Categories:\n{categories_str}\n\n"
    )

    # The reasoning steps are the only difference between the two variants.
    if chain_of_thought:
        reasoning_section = (
            "Let's analyze step by step:\n"
            "1. First, identify the key content elements in the document page\n"
            "2. Then, match each element to the relevant categories\n"
            "3. Finally, assign 1 to matching categories and 0 to non-matching categories\n\n"
        )
    else:
        reasoning_section = ""

    output_section = (
        f"{examples_text}\n\n"
        f"Output format ► Respond with **only** a JSON object whose keys are the "
        f"quoted category numbers ('1', '2', …) and whose values are 1 or 0. "
        f"No additional keys, comments, or text.\n\n"
        f"Example (three categories):\n"
        f"{example_JSON}"
    )

    base_text = task_section + reasoning_section + output_section

    # Add extracted text for text and both modes
    if page_text and mode in ("text", "both"):
        base_text += f"\n\n--- EXTRACTED TEXT FROM PAGE ---\n{page_text}\n--- END OF EXTRACTED TEXT ---"

    if context_prompt:
        expert_context = (
            "You are an expert document analyst specializing in page categorization. "
            "Apply multi-label classification based on explicit and implicit content cues. "
            "When uncertain, prioritize precision over recall.\n\n"
        )
        base_text = expert_context + base_text

    return base_text
452
+
453
def _build_cove_prompts(base_prompt_text):
    """Build chain of verification prompts for PDF pages.

    Args:
        base_prompt_text: The classification prompt text; it is embedded in
            the step 2 and step 4 templates as the "Original task".

    Returns:
        tuple: ``(step2_prompt, step3_prompt, step4_prompt)`` template
        strings. They contain the literal markers ``<<INITIAL_REPLY>>``,
        ``<<QUESTION>>`` and ``<<VERIFICATION_QA>>`` — presumably filled in
        by the chain-of-verification helpers; confirm against those helpers.
    """
    # Step 2: ask the model for verification questions about its first answer.
    step2_prompt = f"""You provided this initial categorization:
<<INITIAL_REPLY>>

Original task: {base_prompt_text}

Generate a focused list of 3-5 verification questions to fact-check your categorization. Each question should:
- Be concise and specific (one sentence)
- Address a distinct content element or category assignment
- Be answerable by re-examining the document page

Focus on verifying:
- Whether each category assignment matches what's visible in the page
- Whether any content elements were missed or misinterpreted
- Whether there are any logical inconsistencies

Provide only the verification questions as a numbered list."""

    # Step 3: answer a single verification question against the page.
    step3_prompt = f"""Re-examine the attached document page and answer the following verification question.

Document description: {pdf_description}

Verification question: <<QUESTION>>

Provide a brief, direct answer (1-2 sentences maximum) based on what you observe in the page.

Answer:"""

    # Step 4: produce the final categorization from the collected Q&A.
    step4_prompt = f"""Original task: {base_prompt_text}
Initial categorization:
<<INITIAL_REPLY>>
Verification questions and answers:
<<VERIFICATION_QA>>
Based on this verification, provide the final corrected categorization.
If no categories are present, assign "0" to all categories.
Provide the final categorization in the same JSON format:"""

    return step2_prompt, step3_prompt, step4_prompt
492
+
493
def _build_prompt_openai_mistral(encoded_image, base_text):
    """Return the OpenAI/Mistral-style content list for a page rendered as PNG.

    The list holds the prompt text followed by the base64 image wrapped in
    a ``data:image/png`` URL requested at "high" detail.
    """
    text_part = {"type": "text", "text": base_text}
    image_part = {
        "type": "image_url",
        "image_url": {
            "url": "data:image/png;base64," + f"{encoded_image}",
            "detail": "high",
        },
    }
    return [text_part, image_part]
500
+
501
def _build_prompt_anthropic_pdf(encoded_pdf, base_text):
    """Return the Anthropic-style content list carrying a base64 PDF document.

    The prompt text comes first, followed by a "document" part whose source
    is the base64 data tagged as ``application/pdf``.
    """
    pdf_source = {
        "type": "base64",
        "media_type": "application/pdf",
        "data": encoded_pdf,
    }
    text_part = {"type": "text", "text": base_text}
    document_part = {"type": "document", "source": pdf_source}
    return [text_part, document_part]
514
+
515
def _build_prompt_anthropic_image(encoded_image, base_text):
    """Return the Anthropic-style content list carrying a base64 PNG image.

    Counterpart of the PDF variant for models sent an image instead (for
    Haiku and other non-PDF models): the prompt text is followed by an
    "image" part tagged as ``image/png``.
    """
    png_source = {
        "type": "base64",
        "media_type": "image/png",
        "data": encoded_image,
    }
    text_part = {"type": "text", "text": base_text}
    image_part = {"type": "image", "source": png_source}
    return [text_part, image_part]
528
+
529
def _build_prompt_google_pdf(encoded_pdf, base_text):
    """Return the prompt data dict used for Google calls with a base64 PDF.

    Keys: ``text_prompt`` (the prompt text), ``pdf_data`` (the base64
    payload) and ``mime_type`` (always ``application/pdf``).
    """
    prompt_data = dict(text_prompt=base_text, pdf_data=encoded_pdf)
    prompt_data["mime_type"] = "application/pdf"
    return prompt_data
536
+
537
def _call_openai_compatible(prompt, step2_prompt, step3_prompt, step4_prompt, pdf_content):
    """Handle OpenAI-compatible API calls (OpenAI, Perplexity, HuggingFace, xAI).

    Uses direct HTTP requests instead of OpenAI SDK for lighter dependencies.
    The request is attempted up to 8 times with exponential backoff
    (2s, 4s, 8s, ...) for HTTP 400 and 500/502/503/504 responses.

    Args:
        prompt: Message content for the user turn (text plus page image).
        step2_prompt, step3_prompt, step4_prompt: Chain-of-verification
            templates, used only when ``chain_of_verification`` is enabled.
        pdf_content: Page payload forwarded to the verification helper.

    Returns:
        tuple: ``(reply, None)`` on success, or ``('{"1":"e"}', message)``
        when the request ultimately fails.

    Raises:
        ValueError: If the endpoint answers HTTP 404 (model not found).
    """
    import requests as req

    # Determine the base URL based on model source
    if model_source == "huggingface":
        from cat_stack.text_functions import _detect_huggingface_endpoint
        base_url = _detect_huggingface_endpoint(api_key, user_model)
    elif model_source == "huggingface-together":
        base_url = "https://router.huggingface.co/together/v1"
    elif model_source == "perplexity":
        base_url = "https://api.perplexity.ai"
    elif model_source == "xai":
        base_url = "https://api.x.ai/v1"
    else:
        base_url = "https://api.openai.com/v1"

    endpoint = f"{base_url}/chat/completions"

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    max_retries = 8
    delay = 2

    for attempt in range(max_retries):
        try:
            # Build messages with optional stepback
            messages = []
            if step_back_prompt and step_back_added:
                messages.append({'role': 'user', 'content': stepback})
                messages.append({'role': 'assistant', 'content': stepback_insight})
            messages.append({'role': 'user', 'content': prompt})

            payload = {
                "model": user_model,
                "messages": messages,
            }
            if creativity is not None:
                payload["temperature"] = creativity

            response = req.post(endpoint, headers=headers, json=payload, timeout=120)
            response.raise_for_status()
            result = response.json()
            reply = result["choices"][0]["message"]["content"]

            if chain_of_verification:
                reply = pdf_chain_of_verification_openai(
                    initial_reply=reply,
                    step2_prompt=step2_prompt,
                    step3_prompt=step3_prompt,
                    step4_prompt=step4_prompt,
                    client=None,  # Not used anymore - CoVe will use requests
                    user_model=user_model,
                    creativity=creativity,
                    remove_numbering=remove_numbering,
                    pdf_content=pdf_content,
                    api_key=api_key,
                    base_url=base_url
                )

            return reply, None

        except req.exceptions.HTTPError as e:
            # Compare against None explicitly: a requests.Response is falsy
            # for every 4xx/5xx status, so a plain truthiness test would
            # always yield None here and disable all branches below.
            status_code = e.response.status_code if e.response is not None else None
            # NOTE(review): 400 is retried as written by the original
            # author; confirm that retrying bad requests is really wanted.
            if status_code == 400 and attempt < max_retries - 1:
                wait_time = delay * (2 ** attempt)
                print(f"⚠️ Bad request. Attempt {attempt + 1}/{max_retries}")
                print(f"Retrying in {wait_time}s...")
                time.sleep(wait_time)
            elif status_code == 404:
                raise ValueError(f"❌ Model '{user_model}' on {model_source} not found.") from e
            elif status_code in [500, 502, 503, 504] and attempt < max_retries - 1:
                wait_time = delay * (2 ** attempt)
                print(f"Attempt {attempt + 1} failed with error: {e}")
                print(f"Retrying in {wait_time}s...")
                time.sleep(wait_time)
            else:
                print(f"❌ Failed after {max_retries} attempts: {e}")
                return """{"1":"e"}""", f"Error processing input: {e}"

        except Exception as e:
            # Non-HTTP failures are retried only when the message mentions
            # a 500/504 status.
            if ("500" in str(e) or "504" in str(e)) and attempt < max_retries - 1:
                wait_time = delay * (2 ** attempt)
                print(f"Attempt {attempt + 1} failed with error: {e}")
                print(f"Retrying in {wait_time}s...")
                time.sleep(wait_time)
            else:
                print(f"❌ Failed after {max_retries} attempts: {e}")
                return """{"1":"e"}""", f"Error processing input: {e}"

    return """{"1":"e"}""", "Max retries exceeded"
634
+
635
def _call_anthropic(prompt, step2_prompt, step3_prompt, step4_prompt, pdf_content):
    """Handle Anthropic API calls with native PDF support using direct HTTP requests.

    A single request is made (no retries). The reply is taken from the
    first content block of the response when that block is of type "text".

    Args:
        prompt: Message content for the user turn.
        step2_prompt, step3_prompt, step4_prompt: Chain-of-verification
            templates, used only when ``chain_of_verification`` is enabled.
        pdf_content: Page payload forwarded to the verification helper.

    Returns:
        tuple: ``(reply, None)`` on success, or ``('{"1":"e"}', message)``
        on failure.

    Raises:
        ValueError: If the API answers HTTP 404 (model not found).
    """
    import requests as req

    request_headers = {
        "Content-Type": "application/json",
        "x-api-key": api_key,
        "anthropic-version": "2023-06-01"
    }

    try:
        # Optional step-back exchange first, then the actual page prompt.
        conversation = []
        if step_back_prompt and step_back_added:
            conversation += [
                {'role': 'user', 'content': stepback},
                {'role': 'assistant', 'content': stepback_insight},
            ]
        conversation.append({'role': 'user', 'content': prompt})

        body = {"model": user_model, "max_tokens": 1024, "messages": conversation}
        if creativity is not None:
            body["temperature"] = creativity

        response = req.post(
            "https://api.anthropic.com/v1/messages",
            headers=request_headers,
            json=body,
            timeout=120,
        )
        response.raise_for_status()
        blocks = response.json().get("content", [])

        # Only a leading text block counts as an answer.
        if not blocks or blocks[0].get("type") != "text":
            return """{"1":"e"}""", "No text content in response"
        reply = blocks[0].get("text", "")

        if chain_of_verification:
            reply = pdf_chain_of_verification_anthropic(
                initial_reply=reply,
                step2_prompt=step2_prompt,
                step3_prompt=step3_prompt,
                step4_prompt=step4_prompt,
                client=None,  # No longer using SDK client
                user_model=user_model,
                creativity=creativity,
                remove_numbering=remove_numbering,
                pdf_content=pdf_content,
                api_key=api_key  # Pass api_key for HTTP calls
            )

        return reply, None

    except req.exceptions.HTTPError as http_err:
        not_found = http_err.response is not None and http_err.response.status_code == 404
        if not_found:
            raise ValueError(f"❌ Model '{user_model}' on {model_source} not found.") from http_err
        print(f"An error occurred: {http_err}")
        return """{"1":"e"}""", f"Error processing input: {http_err}"
    except Exception as err:
        print(f"An error occurred: {err}")
        return """{"1":"e"}""", f"Error processing input: {err}"
696
+
697
def _call_google(prompt_data, step2_prompt, step3_prompt, step4_prompt, base_prompt_text):
    """Handle Google API calls with native PDF support.

    Args:
        prompt_data: Dict with "text_prompt", "pdf_data" (base64) and
            "mime_type" keys.
        step2_prompt, step3_prompt, step4_prompt: Chain-of-verification
            templates, used only when ``chain_of_verification`` is enabled.
        base_prompt_text: Prompt text forwarded to the verification helper.

    Returns:
        tuple: ``(reply, None)`` on success, ``("No response generated",
        None)`` when the response holds no candidates, or
        ``('{"1":"e"}', message)`` on failure.

    Raises:
        ValueError: On HTTP 404 (model not found) or 401/403
            (authentication failed).
    """
    import requests

    def make_google_request(url, headers, payload, max_retries=8):
        """POST ``payload`` and return the decoded JSON response.

        Retries 429/500/502/503/504 with exponential backoff (starting at
        10s for rate limits, 2s otherwise); any other HTTP error, or the
        last failed attempt, is re-raised.
        """
        for attempt in range(max_retries):
            try:
                # Bound the wait like the other providers do (120s); without
                # a timeout a stalled connection would block forever.
                response = requests.post(url, headers=headers, json=payload, timeout=120)
                response.raise_for_status()
                return response.json()
            except requests.exceptions.HTTPError as e:
                status_code = e.response.status_code
                retryable_errors = [429, 500, 502, 503, 504]

                if status_code in retryable_errors and attempt < max_retries - 1:
                    wait_time = 10 * (2 ** attempt) if status_code == 429 else 2 * (2 ** attempt)
                    error_type = "Rate limited" if status_code == 429 else f"Server error {status_code}"
                    print(f"⚠️ {error_type}. Attempt {attempt + 1}/{max_retries}")
                    print(f"Retrying in {wait_time}s...")
                    time.sleep(wait_time)
                else:
                    raise

    url = f"https://generativelanguage.googleapis.com/v1beta/models/{user_model}:generateContent"
    headers = {
        "x-goog-api-key": api_key,
        "Content-Type": "application/json"
    }

    # Build parts with optional stepback context
    parts = []
    if step_back_prompt and step_back_added:
        parts.append({"text": f"Context from step-back analysis:\n{stepback_insight}\n\n"})
    parts.append({"text": prompt_data["text_prompt"]})
    parts.append({
        "inline_data": {
            "mime_type": prompt_data["mime_type"],
            "data": prompt_data["pdf_data"]
        }
    })

    # temperature / thinkingConfig are only sent when explicitly configured.
    payload = {
        "contents": [{"parts": parts}],
        "generationConfig": {
            "responseMimeType": "application/json",
            **({"temperature": creativity} if creativity is not None else {}),
            **({"thinkingConfig": {"thinkingBudget": thinking_budget}} if thinking_budget else {})
        }
    }

    try:
        result = make_google_request(url, headers, payload)

        if "candidates" in result and result["candidates"]:
            reply = result["candidates"][0]["content"]["parts"][0]["text"]
        else:
            return "No response generated", None

        if chain_of_verification:
            reply = pdf_chain_of_verification_google(
                initial_reply=reply,
                prompt=base_prompt_text,
                step2_prompt=step2_prompt,
                step3_prompt=step3_prompt,
                step4_prompt=step4_prompt,
                url=url,
                headers=headers,
                creativity=creativity,
                remove_numbering=remove_numbering,
                make_google_request=make_google_request,
                pdf_data=prompt_data["pdf_data"],
                mime_type=prompt_data["mime_type"]
            )

        return reply, None

    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 404:
            raise ValueError(f"❌ Model '{user_model}' not found.") from e
        elif e.response.status_code in [401, 403]:
            raise ValueError(f"❌ Authentication failed.") from e
        else:
            print(f"HTTP error occurred: {e}")
            return """{"1":"e"}""", f"Error processing input: {e}"
    except Exception as e:
        # Includes timeouts and connection errors raised by the request.
        print(f"An error occurred: {e}")
        return """{"1":"e"}""", f"Error processing input: {e}"
784
+
785
def _call_mistral(prompt, step2_prompt, step3_prompt, step4_prompt, pdf_content):
    """Handle Mistral API calls (PDF converted to image).

    Uses direct HTTP requests instead of Mistral SDK for lighter dependencies.
    The request is attempted up to 8 times with exponential backoff
    (2s, 4s, 8s, ...) for HTTP 500/502/503/504 responses.

    Args:
        prompt: Message content for the user turn (text plus page image).
        step2_prompt, step3_prompt, step4_prompt: Chain-of-verification
            templates, used only when ``chain_of_verification`` is enabled.
        pdf_content: Page payload forwarded to the verification helper.

    Returns:
        tuple: ``(reply, None)`` on success, or ``('{"1":"e"}', message)``
        when the request ultimately fails.

    Raises:
        ValueError: On HTTP 404 (model not found) or 401/403
            (authentication failed).
    """
    import requests as req

    endpoint = "https://api.mistral.ai/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    max_retries = 8
    delay = 2

    for attempt in range(max_retries):
        try:
            # Build messages with optional stepback
            messages = []
            if step_back_prompt and step_back_added:
                messages.append({'role': 'user', 'content': stepback})
                messages.append({'role': 'assistant', 'content': stepback_insight})
            messages.append({'role': 'user', 'content': prompt})

            payload = {
                "model": user_model,
                "messages": messages,
            }
            if creativity is not None:
                payload["temperature"] = creativity

            response = req.post(endpoint, headers=headers, json=payload, timeout=120)
            response.raise_for_status()
            result = response.json()
            reply = result["choices"][0]["message"]["content"]

            if chain_of_verification:
                reply = pdf_chain_of_verification_mistral(
                    initial_reply=reply,
                    step2_prompt=step2_prompt,
                    step3_prompt=step3_prompt,
                    step4_prompt=step4_prompt,
                    client=None,  # Not used - CoVe will use requests
                    user_model=user_model,
                    creativity=creativity,
                    remove_numbering=remove_numbering,
                    pdf_content=pdf_content,
                    api_key=api_key
                )

            return reply, None

        except req.exceptions.HTTPError as e:
            # Compare against None explicitly: a requests.Response is falsy
            # for every 4xx/5xx status, so a plain truthiness test would
            # always yield None here and disable all branches below.
            status_code = e.response.status_code if e.response is not None else None
            if status_code == 404:
                raise ValueError(f"❌ Model '{user_model}' not found.") from e
            elif status_code in [401, 403]:
                raise ValueError(f"❌ Authentication failed.") from e
            elif status_code in [500, 502, 503, 504] and attempt < max_retries - 1:
                wait_time = delay * (2 ** attempt)
                print(f"⚠️ Server error {status_code}. Attempt {attempt + 1}/{max_retries}")
                print(f"Retrying in {wait_time}s...")
                time.sleep(wait_time)
            else:
                print(f"❌ Failed after {max_retries} attempts: {e}")
                return """{"1":"e"}""", f"Error processing input: {e}"

        except Exception as e:
            # Non-HTTP failures (timeouts, malformed responses) are not retried.
            print(f"❌ Unexpected error: {e}")
            return """{"1":"e"}""", f"Error processing input: {e}"

    return """{"1":"e"}""", "Max retries exceeded"
858
+
859
def _build_prompt_text_only(base_text):
    """Return a one-element content list holding only the prompt text (no attachment)."""
    text_part = dict(type="text", text=base_text)
    return [text_part]
862
+
863
def _call_openai_text_only(prompt_text, step2_prompt, step3_prompt, step4_prompt):
    """Handle OpenAI-compatible API calls with text-only prompt.

    Uses direct HTTP requests instead of OpenAI SDK for lighter dependencies.
    The base URL is chosen from ``model_source``; any source other than the
    Hugging Face, Perplexity and xAI ones falls back to the OpenAI endpoint.

    HTTP 400 and 500/502/503/504 responses are retried with exponential
    backoff (2s, 4s, 8s, ...) for up to 8 attempts.

    Args:
        prompt_text: Content of the final user message.
        step2_prompt: Not used by this function.
        step3_prompt: Not used by this function.
        step4_prompt: Not used by this function.

    Returns:
        tuple: ``(reply, None)`` on success, otherwise
        ``('{"1":"e"}', error_message)``.

    Raises:
        ValueError: If the endpoint answers with HTTP 404.
    """
    import requests as req

    # Determine the base URL based on model source
    if model_source == "huggingface":
        from cat_stack.text_functions import _detect_huggingface_endpoint
        base_url = _detect_huggingface_endpoint(api_key, user_model)
    elif model_source == "huggingface-together":
        base_url = "https://router.huggingface.co/together/v1"
    elif model_source == "perplexity":
        base_url = "https://api.perplexity.ai"
    elif model_source == "xai":
        base_url = "https://api.x.ai/v1"
    else:
        base_url = "https://api.openai.com/v1"

    endpoint = f"{base_url}/chat/completions"

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    max_retries = 8
    delay = 2

    for attempt in range(max_retries):
        try:
            messages = []
            # Replay the step-back exchange first so the model sees it as context.
            if step_back_prompt and step_back_added:
                messages.append({'role': 'user', 'content': stepback})
                messages.append({'role': 'assistant', 'content': stepback_insight})
            messages.append({'role': 'user', 'content': prompt_text})

            payload = {
                "model": user_model,
                "messages": messages,
            }
            if creativity is not None:
                payload["temperature"] = creativity

            response = req.post(endpoint, headers=headers, json=payload, timeout=120)
            response.raise_for_status()
            result = response.json()
            reply = result["choices"][0]["message"]["content"]
            return reply, None

        except req.exceptions.HTTPError as e:
            # A requests.Response is falsy for every 4xx/5xx status (its truth
            # value is Response.ok), so compare against None explicitly;
            # otherwise status_code is always None and no branch below can match.
            status_code = e.response.status_code if e.response is not None else None
            if status_code == 400 and attempt < max_retries - 1:
                wait_time = delay * (2 ** attempt)
                print(f"⚠️ Bad request. Attempt {attempt + 1}/{max_retries}")
                print(f"Retrying in {wait_time}s...")
                time.sleep(wait_time)
            elif status_code == 404:
                raise ValueError(f"❌ Model '{user_model}' on {model_source} not found.") from e
            elif status_code in [500, 502, 503, 504] and attempt < max_retries - 1:
                wait_time = delay * (2 ** attempt)
                print(f"Attempt {attempt + 1} failed with error: {e}")
                print(f"Retrying in {wait_time}s...")
                time.sleep(wait_time)
            else:
                print(f"❌ Failed after {max_retries} attempts: {e}")
                return """{"1":"e"}""", f"Error processing input: {e}"

        except Exception as e:
            # Non-HTTP failures are retried only when the message mentions 500/504.
            if ("500" in str(e) or "504" in str(e)) and attempt < max_retries - 1:
                wait_time = delay * (2 ** attempt)
                print(f"Attempt {attempt + 1} failed with error: {e}")
                print(f"Retrying in {wait_time}s...")
                time.sleep(wait_time)
            else:
                print(f"❌ Failed after {max_retries} attempts: {e}")
                return """{"1":"e"}""", f"Error processing input: {e}"

    return """{"1":"e"}""", "Max retries exceeded"
943
+
944
def _call_anthropic_text_only(prompt_text, step2_prompt, step3_prompt, step4_prompt):
    """Send a text-only prompt to the Anthropic Messages endpoint over plain HTTP.

    A single request is made (no retry loop). The three step prompts are not
    used by this function.

    Returns:
        tuple: ``(reply, None)`` when the first content block of the response
        is text, otherwise ``('{"1":"e"}', error_message)``.

    Raises:
        ValueError: If the endpoint answers with HTTP 404.
    """
    import requests as req

    error_json = """{"1":"e"}"""
    api_url = "https://api.anthropic.com/v1/messages"
    request_headers = {
        "Content-Type": "application/json",
        "x-api-key": api_key,
        "anthropic-version": "2023-06-01"
    }

    try:
        conversation = []
        # Replay the step-back exchange ahead of the real prompt when available.
        if step_back_prompt and step_back_added:
            conversation += [
                {'role': 'user', 'content': stepback},
                {'role': 'assistant', 'content': stepback_insight},
            ]
        conversation.append({'role': 'user', 'content': prompt_text})

        body = {
            "model": user_model,
            "max_tokens": 1024,
            "messages": conversation,
        }
        if creativity is not None:
            body["temperature"] = creativity

        http_response = req.post(api_url, headers=request_headers, json=body, timeout=120)
        http_response.raise_for_status()

        blocks = http_response.json().get("content", [])
        if not blocks or blocks[0].get("type") != "text":
            return error_json, "No text content in response"
        return blocks[0].get("text", ""), None

    except req.exceptions.HTTPError as e:
        model_missing = e.response is not None and e.response.status_code == 404
        if model_missing:
            raise ValueError(f"❌ Model '{user_model}' on {model_source} not found.") from e
        print(f"An error occurred: {e}")
        return error_json, f"Error processing input: {e}"
    except Exception as e:
        print(f"An error occurred: {e}")
        return error_json, f"Error processing input: {e}"
988
+
989
def _call_google_text_only(prompt_text, step2_prompt, step3_prompt, step4_prompt):
    """Handle Google API calls with text-only prompt.

    Posts to the ``generateContent`` endpoint for ``user_model`` and asks for a
    JSON response. When a step-back insight is available it is prepended as a
    separate text part. The three step prompts are not used by this function.

    Returns:
        tuple: ``(reply, None)`` on success, ``("No response generated", None)``
        when the response carries no candidates, otherwise
        ``('{"1":"e"}', error_message)``.

    Raises:
        ValueError: If the endpoint answers with HTTP 404 (model not found) or
            401/403 (authentication failed).
    """
    import requests

    def make_google_request(url, headers, payload, max_retries=8):
        """POST the payload, retrying 429 and 5xx responses with backoff."""
        for attempt in range(max_retries):
            try:
                # Bound the wait like the other provider callers do; without a
                # timeout a stalled connection would block indefinitely.
                response = requests.post(url, headers=headers, json=payload, timeout=120)
                response.raise_for_status()
                return response.json()
            except requests.exceptions.HTTPError as e:
                status_code = e.response.status_code
                retryable_errors = [429, 500, 502, 503, 504]

                if status_code in retryable_errors and attempt < max_retries - 1:
                    # Rate limits back off from 10s, server errors from 2s.
                    wait_time = 10 * (2 ** attempt) if status_code == 429 else 2 * (2 ** attempt)
                    error_type = "Rate limited" if status_code == 429 else f"Server error {status_code}"
                    print(f"⚠️ {error_type}. Attempt {attempt + 1}/{max_retries}")
                    print(f"Retrying in {wait_time}s...")
                    time.sleep(wait_time)
                else:
                    # Non-retryable status or retries exhausted: let the caller decide.
                    raise

    url = f"https://generativelanguage.googleapis.com/v1beta/models/{user_model}:generateContent"
    headers = {
        "x-goog-api-key": api_key,
        "Content-Type": "application/json"
    }

    parts = []
    if step_back_prompt and step_back_added:
        parts.append({"text": f"Context from step-back analysis:\n{stepback_insight}\n\n"})
    parts.append({"text": prompt_text})

    payload = {
        "contents": [{"parts": parts}],
        "generationConfig": {
            "responseMimeType": "application/json",
            # Optional settings are only sent when they were actually provided.
            **({"temperature": creativity} if creativity is not None else {}),
            **({"thinkingConfig": {"thinkingBudget": thinking_budget}} if thinking_budget else {})
        }
    }

    try:
        result = make_google_request(url, headers, payload)

        if "candidates" in result and result["candidates"]:
            reply = result["candidates"][0]["content"]["parts"][0]["text"]
        else:
            return "No response generated", None

        return reply, None

    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 404:
            raise ValueError(f"❌ Model '{user_model}' not found.") from e
        elif e.response.status_code in [401, 403]:
            raise ValueError(f"❌ Authentication failed.") from e
        else:
            print(f"HTTP error occurred: {e}")
            return """{"1":"e"}""", f"Error processing input: {e}"
    except Exception as e:
        # Includes timeouts, connection errors and unexpected response shapes.
        print(f"An error occurred: {e}")
        return """{"1":"e"}""", f"Error processing input: {e}"
1053
+
1054
def _call_mistral_text_only(prompt_text, step2_prompt, step3_prompt, step4_prompt):
    """Handle Mistral API calls with text-only prompt.

    Uses direct HTTP requests instead of Mistral SDK for lighter dependencies.
    HTTP 500/502/503/504 responses are retried with exponential backoff
    (2s, 4s, 8s, ...) for up to 8 attempts. The three step prompts are not
    used by this function.

    Returns:
        tuple: ``(reply, None)`` on success, otherwise
        ``('{"1":"e"}', error_message)``.

    Raises:
        ValueError: If the endpoint answers with HTTP 404 (model not found) or
            401/403 (authentication failed).
    """
    import requests as req

    endpoint = "https://api.mistral.ai/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    max_retries = 8
    delay = 2

    for attempt in range(max_retries):
        try:
            messages = []
            # Replay the step-back exchange first so the model sees it as context.
            if step_back_prompt and step_back_added:
                messages.append({'role': 'user', 'content': stepback})
                messages.append({'role': 'assistant', 'content': stepback_insight})
            messages.append({'role': 'user', 'content': prompt_text})

            payload = {
                "model": user_model,
                "messages": messages,
            }
            if creativity is not None:
                payload["temperature"] = creativity

            response = req.post(endpoint, headers=headers, json=payload, timeout=120)
            response.raise_for_status()
            result = response.json()
            reply = result["choices"][0]["message"]["content"]
            return reply, None

        except req.exceptions.HTTPError as e:
            # A requests.Response is falsy for every 4xx/5xx status (its truth
            # value is Response.ok), so compare against None explicitly;
            # otherwise status_code is always None and the 404, auth and retry
            # branches below can never run.
            status_code = e.response.status_code if e.response is not None else None
            if status_code == 404:
                raise ValueError(f"❌ Model '{user_model}' not found.") from e
            elif status_code in [401, 403]:
                raise ValueError(f"❌ Authentication failed.") from e
            elif status_code in [500, 502, 503, 504] and attempt < max_retries - 1:
                wait_time = delay * (2 ** attempt)
                print(f"⚠️ Server error {status_code}. Attempt {attempt + 1}/{max_retries}")
                print(f"Retrying in {wait_time}s...")
                time.sleep(wait_time)
            else:
                print(f"❌ Failed after {max_retries} attempts: {e}")
                return """{"1":"e"}""", f"Error processing input: {e}"

        except Exception as e:
            print(f"❌ Unexpected error: {e}")
            return """{"1":"e"}""", f"Error processing input: {e}"

    return """{"1":"e"}""", "Max retries exceeded"
1111
+
1112
def _process_single_page(pdf_path, page_index, page_label):
    """Process a single PDF page and return (reply, error_msg).

    In "text" mode the extracted page text is sent without any attachment.
    In the other modes the page is attached as a PDF (Anthropic models that
    pass ``_anthropic_supports_pdf``, and Google) or as a PNG image (everything
    else). ``page_label`` is accepted but not used here.

    Returns:
        tuple: Whatever the selected provider caller returns, or
        ``(None, error_message)`` when the page text, PDF bytes or image
        cannot be extracted.

    Raises:
        ValueError: If ``model_source`` is not one of the handled providers.
    """
    # Extract text if needed for text or both modes
    page_text = None
    if mode in ("text", "both"):
        page_text, text_valid, text_error = _extract_page_text(pdf_path, page_index)
        if mode == "text" and not text_valid:
            # Text mode requires text - fail if extraction failed
            return None, f"Failed to extract text: {text_error}"
        # For "both" mode, we continue even if text extraction fails

    # Build prompt with text if available
    base_prompt_text = _build_base_prompt_text(page_text)

    if chain_of_verification:
        step2_prompt, step3_prompt, step4_prompt = _build_cove_prompts(base_prompt_text)
    else:
        step2_prompt = step3_prompt = step4_prompt = None

    # TEXT-ONLY MODE: No image/PDF attachment needed
    if mode == "text":
        if model_source == "anthropic":
            return _call_anthropic_text_only(base_prompt_text, step2_prompt, step3_prompt, step4_prompt)
        elif model_source == "google":
            return _call_google_text_only(base_prompt_text, step2_prompt, step3_prompt, step4_prompt)
        # "huggingface-together" is routed here because _call_openai_text_only
        # has a dedicated base URL for it; leaving it out made that source
        # fail with "Unknown source!" in text mode.
        elif model_source in ["openai", "perplexity", "huggingface", "huggingface-together", "xai"]:
            return _call_openai_text_only(base_prompt_text, step2_prompt, step3_prompt, step4_prompt)
        elif model_source == "mistral":
            return _call_mistral_text_only(base_prompt_text, step2_prompt, step3_prompt, step4_prompt)
        else:
            raise ValueError(f"Unknown source! Choose from OpenAI, Anthropic, Perplexity, Google, xAI, Huggingface, or Mistral")

    # IMAGE or BOTH MODE: Include image/PDF attachment
    # Handle providers with native PDF support
    if model_source == "anthropic":
        # Check if model supports native PDF (Haiku doesn't)
        if _anthropic_supports_pdf(user_model):
            pdf_bytes, is_valid = _extract_page_as_pdf_bytes(pdf_path, page_index)
            if not is_valid:
                return None, "Failed to extract PDF page"

            encoded_pdf = _encode_bytes_to_base64(pdf_bytes)
            prompt = _build_prompt_anthropic_pdf(encoded_pdf, base_prompt_text)
            pdf_content = {
                "type": "document",
                "source": {
                    "type": "base64",
                    "media_type": "application/pdf",
                    "data": encoded_pdf
                }
            }
        else:
            # Haiku and other non-PDF models: convert to image
            image_bytes, is_valid = _extract_page_as_image_bytes(pdf_path, page_index)
            if not is_valid:
                return None, "Failed to render PDF page to image"

            encoded_image = _encode_bytes_to_base64(image_bytes)
            prompt = _build_prompt_anthropic_image(encoded_image, base_prompt_text)
            pdf_content = {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/png",
                    "data": encoded_image
                }
            }
        return _call_anthropic(prompt, step2_prompt, step3_prompt, step4_prompt, pdf_content)

    elif model_source == "google":
        pdf_bytes, is_valid = _extract_page_as_pdf_bytes(pdf_path, page_index)
        if not is_valid:
            return None, "Failed to extract PDF page"

        encoded_pdf = _encode_bytes_to_base64(pdf_bytes)
        prompt_data = _build_prompt_google_pdf(encoded_pdf, base_prompt_text)
        return _call_google(prompt_data, step2_prompt, step3_prompt, step4_prompt, base_prompt_text)

    # Handle providers requiring image conversion
    else:
        image_bytes, is_valid = _extract_page_as_image_bytes(pdf_path, page_index)
        if not is_valid:
            return None, "Failed to render PDF page to image"

        encoded_image = _encode_bytes_to_base64(image_bytes)
        prompt = _build_prompt_openai_mistral(encoded_image, base_prompt_text)

        # PDF content for CoVe (as image)
        encoded_image_url = f"data:image/png;base64,{encoded_image}"
        pdf_content = {"type": "image_url", "image_url": {"url": encoded_image_url, "detail": "high"}}

        # NOTE(review): "huggingface-together" is still rejected in this branch;
        # confirm whether _call_openai_compatible supports it before adding it.
        if model_source in ["openai", "perplexity", "huggingface", "xai"]:
            return _call_openai_compatible(prompt, step2_prompt, step3_prompt, step4_prompt, pdf_content)
        elif model_source == "mistral":
            return _call_mistral(prompt, step2_prompt, step3_prompt, step4_prompt, pdf_content)
        else:
            raise ValueError(f"Unknown source! Choose from OpenAI, Anthropic, Perplexity, Google, xAI, Huggingface, or Mistral")
1210
+
1211
+ def _extract_json(reply):
1212
+ """Extract JSON from model reply."""
1213
+ if reply is None:
1214
+ return """{"1":"e"}"""
1215
+
1216
+ extracted_json = regex.findall(r'\{(?:[^{}]|(?R))*\}', reply, regex.DOTALL)
1217
+ if extracted_json:
1218
+ return extracted_json[0].replace('[', '').replace(']', '').replace('\n', '').replace(" ", '').replace(" ", '')
1219
+ else:
1220
+ print("""{"1":"e"}""")
1221
+ return """{"1":"e"}"""
1222
+
1223
+ # Main processing loop
1224
+ total_pages = len(all_pages)
1225
+ for idx, (pdf_path, page_index, page_label) in enumerate(tqdm(all_pages, desc="Categorizing PDF pages")):
1226
+ # Call progress callback if provided
1227
+ if progress_callback:
1228
+ progress_callback(idx, total_pages, page_label)
1229
+
1230
+ page_labels.append(page_label)
1231
+
1232
+ reply, error_msg = _process_single_page(pdf_path, page_index, page_label)
1233
+
1234
+ if error_msg:
1235
+ link1.append(error_msg)
1236
+ extracted_jsons.append("""{"1":"e"}""")
1237
+ else:
1238
+ link1.append(reply)
1239
+ extracted_jsons.append(_extract_json(reply))
1240
+
1241
+ # --- Safety Save ---
1242
+ if safety:
1243
+ if filename is None:
1244
+ raise TypeError("filename is required when using safety. Please provide the filename.")
1245
+
1246
+ normalized_data_list = []
1247
+ for json_str in extracted_jsons:
1248
+ try:
1249
+ parsed_obj = json.loads(json_str)
1250
+ normalized_data_list.append(pd.json_normalize(parsed_obj))
1251
+ except json.JSONDecodeError:
1252
+ normalized_data_list.append(pd.DataFrame({"1": ["e"]}))
1253
+ normalized_data = pd.concat(normalized_data_list, ignore_index=True)
1254
+
1255
+ temp_df = pd.DataFrame({
1256
+ 'pdf_input': page_labels,
1257
+ 'model_response': link1,
1258
+ 'json': extracted_jsons
1259
+ })
1260
+ temp_df = pd.concat([temp_df, normalized_data], axis=1)
1261
+
1262
+ save_path = os.path.join(save_directory, filename) if save_directory else filename
1263
+ temp_df.to_csv(save_path, index=False)
1264
+
1265
+ # --- Final DataFrame ---
1266
+ normalized_data_list = []
1267
+ for json_str in extracted_jsons:
1268
+ try:
1269
+ parsed_obj = json.loads(json_str)
1270
+ normalized_data_list.append(pd.json_normalize(parsed_obj))
1271
+ except json.JSONDecodeError:
1272
+ normalized_data_list.append(pd.DataFrame({"1": ["e"]}))
1273
+ normalized_data = pd.concat(normalized_data_list, ignore_index=True)
1274
+
1275
+ categorized_data = pd.DataFrame({
1276
+ 'pdf_input': pd.Series(page_labels),
1277
+ 'model_response': pd.Series(link1).reset_index(drop=True),
1278
+ 'json': pd.Series(extracted_jsons).reset_index(drop=True)
1279
+ })
1280
+ categorized_data = pd.concat([categorized_data, normalized_data], axis=1)
1281
+ categorized_data = categorized_data.rename(columns=lambda x: f'category_{x}' if str(x).isdigit() else x)
1282
+
1283
+ # Identify rows with invalid strings (like "e")
1284
+ cat_cols = [col for col in categorized_data.columns if col.startswith('category_')]
1285
+ has_invalid_strings = categorized_data[cat_cols].apply(
1286
+ lambda col: pd.to_numeric(col, errors='coerce').isna() & col.notna()
1287
+ ).any(axis=1)
1288
+
1289
+ categorized_data['processing_status'] = (~has_invalid_strings).map({True: 'success', False: 'error'})
1290
+ categorized_data.loc[has_invalid_strings, cat_cols] = pd.NA
1291
+
1292
+ for col in cat_cols:
1293
+ categorized_data[col] = pd.to_numeric(categorized_data[col], errors='coerce')
1294
+
1295
+ categorized_data.loc[~has_invalid_strings, cat_cols] = (
1296
+ categorized_data.loc[~has_invalid_strings, cat_cols].fillna(0)
1297
+ )
1298
+ categorized_data[cat_cols] = categorized_data[cat_cols].astype('Int64')
1299
+
1300
+ # Create categories_id (comma-separated binary values for each category)
1301
+ categorized_data['categories_id'] = categorized_data[cat_cols].apply(
1302
+ lambda x: ','.join(x.dropna().astype(int).astype(str)), axis=1
1303
+ )
1304
+
1305
+ if filename:
1306
+ save_path = os.path.join(save_directory, filename) if save_directory else filename
1307
+ categorized_data.to_csv(save_path, index=False)
1308
+
1309
+ return categorized_data
1310
+
1311
+
1312
+ def explore_pdf_categories(
1313
+ pdf_input,
1314
+ api_key,
1315
+ pdf_description="",
1316
+ max_categories=12,
1317
+ categories_per_chunk=10,
1318
+ divisions=5,
1319
+ user_model="gpt-4o",
1320
+ creativity=None,
1321
+ specificity="broad",
1322
+ research_question=None,
1323
+ mode="text",
1324
+ filename=None,
1325
+ model_source="auto",
1326
+ iterations=3,
1327
+ random_state=None,
1328
+ progress_callback=None,
1329
+ ):
1330
+ """
1331
+ Explore and extract common categories from PDF pages.
1332
+
1333
+ Modes:
1334
+ - "text" (default): Extracts text from pages, concatenates pages within
1335
+ each chunk, and sends combined text to identify categories. Similar to
1336
+ how explore_common_categories works with text responses. Best for
1337
+ text-heavy documents.
1338
+
1339
+ - "image": Samples random pages from the full pool of all pages across
1340
+ all PDFs and sends them as images to a vision model. Best for visual
1341
+ documents where layout matters.
1342
+
1343
+ - "both": Samples random pages, uses vision model to describe each page's
1344
+ content (text + visual elements), then extracts categories from those
1345
+ descriptions. Best for documents with mixed text and visual content
1346
+ (charts, diagrams, scanned documents).
1347
+
1348
+ Args:
1349
+ pdf_input: Path to PDF file, directory of PDFs, or list of PDF paths
1350
+ api_key: API key for the model provider
1351
+ pdf_description: Description of what the PDFs contain
1352
+ max_categories: Maximum number of final categories to return
1353
+ categories_per_chunk: Categories to extract per chunk of pages
1354
+ divisions: Number of chunks to divide pages into
1355
+ user_model: Model to use (vision model required for image/both modes)
1356
+ creativity: Temperature setting (None for default)
1357
+ specificity: "broad" or "specific" category granularity
1358
+ research_question: Optional research context
1359
+ mode: "text", "image", or "both"
1360
+ filename: Optional CSV filename to save results
1361
+ model_source: "auto", "openai", "anthropic", "google", "mistral"
1362
+ iterations: Number of passes over the data
1363
+ random_state: Random seed for reproducibility
1364
+ progress_callback: Optional callback function for progress updates.
1365
+ Called as progress_callback(current_step, total_steps, step_label).
1366
+
1367
+ Returns:
1368
+ dict with keys:
1369
+ - counts_df: DataFrame of categories with counts
1370
+ - top_categories: List of top category names
1371
+ - raw_top_text: Raw model output from final merge step
1372
+ """
1373
+ import os
1374
+ import re
1375
+ import pandas as pd
1376
+ import numpy as np
1377
+ from tqdm import tqdm
1378
+
1379
+ model_source = _detect_model_source(user_model, model_source)
1380
+
1381
+ # Load all PDF pages
1382
+ pdf_files = _load_pdf_files(pdf_input)
1383
+ if not pdf_files:
1384
+ raise ValueError("No PDF files found in the specified input.")
1385
+
1386
+ all_pages = []
1387
+ for pdf_path in pdf_files:
1388
+ pages = _get_pdf_pages(pdf_path)
1389
+ all_pages.extend(pages)
1390
+
1391
+ n = len(all_pages)
1392
+ if n == 0:
1393
+ raise ValueError("No pages found in the PDF files.")
1394
+
1395
+ # Auto-adjust divisions for small datasets
1396
+ # PDF pages can have multiple categories each, so we can use fewer divisions
1397
+ original_divisions = divisions
1398
+ divisions = min(divisions, max(1, n // 2)) # At least 2 pages per chunk
1399
+ if divisions != original_divisions:
1400
+ print(f"Auto-adjusted divisions from {original_divisions} to {divisions} for {n} pages.")
1401
+
1402
+ # Chunk sizing - PDF pages often contain multiple categories each
1403
+ chunk_size = int(round(max(1, n / divisions), 0))
1404
+ # Don't reduce categories_per_chunk as aggressively for PDFs since each page can yield many categories
1405
+ if chunk_size < 2:
1406
+ # Only reduce if we have very few pages
1407
+ old_categories_per_chunk = categories_per_chunk
1408
+ categories_per_chunk = max(5, chunk_size * 4)
1409
+ print(f"Auto-adjusted categories_per_chunk from {old_categories_per_chunk} to {categories_per_chunk} for chunk size {chunk_size}.")
1410
+
1411
+ print(
1412
+ f"Exploring categories in PDFs: '{pdf_description}'.\n"
1413
+ f" {n} total pages, {categories_per_chunk * divisions} categories to extract, "
1414
+ f"{max_categories} final categories. Mode: {mode}\n"
1415
+ )
1416
+
1417
+ # RNG for reproducible sampling
1418
+ rng = np.random.default_rng(random_state)
1419
+
1420
+ # Initialize client/config based on model source
1421
+ # For OpenAI-compatible APIs (including Mistral), we use requests directly instead of SDK
1422
+ import requests as http_client
1423
+
1424
+ if model_source in ["openai", "huggingface", "huggingface-together", "xai", "perplexity"]:
1425
+ # Determine base URL for OpenAI-compatible APIs
1426
+ if model_source == "huggingface":
1427
+ from cat_stack.text_functions import _detect_huggingface_endpoint
1428
+ openai_base_url = _detect_huggingface_endpoint(api_key, user_model)
1429
+ elif model_source == "huggingface-together":
1430
+ openai_base_url = "https://router.huggingface.co/together/v1"
1431
+ elif model_source == "xai":
1432
+ openai_base_url = "https://api.x.ai/v1"
1433
+ elif model_source == "perplexity":
1434
+ openai_base_url = "https://api.perplexity.ai"
1435
+ else:
1436
+ openai_base_url = "https://api.openai.com/v1"
1437
+ client = None # We'll use requests directly
1438
+ elif model_source == "anthropic":
1439
+ # Using direct HTTP requests instead of Anthropic SDK
1440
+ client = None
1441
+ openai_base_url = None
1442
+ elif model_source == "google":
1443
+ client = None
1444
+ openai_base_url = None
1445
+ elif model_source == "mistral":
1446
+ # Mistral API is OpenAI-compatible, use requests directly
1447
+ openai_base_url = "https://api.mistral.ai/v1"
1448
+ client = None
1449
+ else:
1450
+ raise ValueError(f"Unsupported model_source: {model_source}")
1451
+
1452
def make_text_prompt(text_blob: str) -> str:
    """Compose the category-discovery prompt for text mode.

    The page text is embedded between triple backticks; the research context
    sentence is included only when ``research_question`` is set.
    """
    if research_question:
        research_clause = 'Research context: ' + research_question + '. '
    else:
        research_clause = ''

    segments = [
        f"Identify {categories_per_chunk} {specificity} categories of content found in this document text. ",
        f"The document is: {pdf_description}. ",
        research_clause,
        f"The text is contained within triple backticks: ```{text_blob}``` ",
        f"Number your categories from 1 through {categories_per_chunk} and provide concise labels only (no descriptions).",
    ]
    return "".join(segments)
1461
+
1462
def make_image_prompt() -> str:
    """Compose the category-discovery prompt that accompanies a single page image.

    The research context is appended after the document description only when
    ``research_question`` is set.
    """
    if research_question:
        research_clause = 'Research context: ' + research_question
    else:
        research_clause = ''

    opening = (
        f"Identify {categories_per_chunk} {specificity} categories of content found in this PDF page. "
        f"The document is: {pdf_description}. "
    )
    closing = (
        f"Number your categories from 1 through {categories_per_chunk} and provide concise labels only (no descriptions)."
    )
    return opening + research_clause + "\n\n" + closing
1470
+
1471
def make_describe_prompt() -> str:
    """Compose the prompt asking the model to describe a page ('both' mode).

    The research context is appended after the document description only when
    ``research_question`` is set.
    """
    if research_question:
        research_clause = 'Research context: ' + research_question
    else:
        research_clause = ''

    pieces = [
        "Describe the content of this PDF page in detail. ",
        "Include all text, images, charts, diagrams, tables, and layout elements. ",
        f"The document is: {pdf_description}. ",
        research_clause,
        "\n\n",
        "Provide a comprehensive text description that captures both visual and textual content.",
    ]
    return "".join(pieces)
1480
+
1481
def describe_page_with_vision(pdf_path, page_index):
    """Ask the vision model for a text description of one PDF page.

    Anthropic models that pass ``_anthropic_supports_pdf`` and Google models
    receive the page as a base64 PDF; every other case receives the page as a
    base64 image declared as ``image/png``.

    Returns:
        The model's text reply, or ``None`` when the page cannot be extracted,
        the response carries no usable text, the provider is not one of the
        handled sources, or any exception is raised (the exception is printed
        rather than propagated).
    """
    prompt_text = make_describe_prompt()
    # Providers reached through an OpenAI-style /chat/completions endpoint.
    chat_completion_sources = [
        "openai", "huggingface", "huggingface-together", "xai", "perplexity", "mistral",
    ]

    try:
        if model_source == "anthropic":
            # Pick the attachment form the model can accept: native PDF or image.
            if _anthropic_supports_pdf(user_model):
                raw_page, extracted = _extract_page_as_pdf_bytes(pdf_path, page_index)
                if not extracted:
                    return None
                attachment = {
                    "type": "document",
                    "source": {
                        "type": "base64",
                        "media_type": "application/pdf",
                        "data": _encode_bytes_to_base64(raw_page)
                    }
                }
            else:
                raw_page, extracted = _extract_page_as_image_bytes(pdf_path, page_index)
                if not extracted:
                    return None
                attachment = {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/png",
                        "data": _encode_bytes_to_base64(raw_page)
                    }
                }

            request_body = {
                "model": user_model,
                "max_tokens": 4096,
                "messages": [{
                    "role": "user",
                    "content": [{"type": "text", "text": prompt_text}, attachment]
                }],
            }
            if creativity is not None:
                request_body["temperature"] = creativity

            reply = http_client.post(
                "https://api.anthropic.com/v1/messages",
                headers={
                    "Content-Type": "application/json",
                    "x-api-key": api_key,
                    "anthropic-version": "2023-06-01"
                },
                json=request_body,
                timeout=120,
            )
            reply.raise_for_status()
            blocks = reply.json().get("content", [])
            if blocks and blocks[0].get("type") == "text":
                return blocks[0].get("text", "")
            return None

        if model_source == "google":
            raw_page, extracted = _extract_page_as_pdf_bytes(pdf_path, page_index)
            if not extracted:
                return None
            encoded_pdf = _encode_bytes_to_base64(raw_page)
            request_body = {
                "contents": [{
                    "parts": [
                        {"text": prompt_text},
                        {"inline_data": {"mime_type": "application/pdf", "data": encoded_pdf}}
                    ]
                }],
                "generationConfig": {**({"temperature": creativity} if creativity is not None else {})}
            }
            reply = http_client.post(
                f"https://generativelanguage.googleapis.com/v1beta/models/{user_model}:generateContent",
                headers={"x-goog-api-key": api_key, "Content-Type": "application/json"},
                json=request_body,
                timeout=120,
            )
            reply.raise_for_status()
            data = reply.json()
            if "candidates" in data and data["candidates"]:
                return data["candidates"][0]["content"]["parts"][0]["text"]
            return None

        # Every remaining provider gets the page as an image.
        raw_page, extracted = _extract_page_as_image_bytes(pdf_path, page_index)
        if not extracted:
            return None
        encoded_image = _encode_bytes_to_base64(raw_page)

        if model_source not in chat_completion_sources:
            return None

        request_body = {
            "model": user_model,
            "messages": [{
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt_text},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded_image}"}}
                ]
            }]
        }
        if creativity is not None:
            request_body["temperature"] = creativity
        reply = http_client.post(
            f"{openai_base_url}/chat/completions",
            headers={
                "Content-Type": "application/json",
                "Authorization": f"Bearer {api_key}"
            },
            json=request_body,
            timeout=120,
        )
        reply.raise_for_status()
        return reply.json()["choices"][0]["message"]["content"]

    except Exception as e:
        print(f"Error describing page {page_index}: {e}")
        return None
1633
+
1634
def call_model_with_text(prompt_text):
    """Send a plain-text prompt to the configured provider and return its reply.

    Returns:
        The model's text reply, or ``None`` when the response carries no usable
        text, the provider is not one of the handled sources, or any exception
        is raised (the exception is printed rather than propagated).
    """
    # Providers reached through an OpenAI-style /chat/completions endpoint.
    chat_completion_sources = [
        "openai", "huggingface", "huggingface-together", "xai", "perplexity", "mistral",
    ]

    try:
        if model_source in chat_completion_sources:
            request_body = {
                "model": user_model,
                "messages": [{"role": "user", "content": prompt_text}]
            }
            if creativity is not None:
                request_body["temperature"] = creativity
            reply = http_client.post(
                f"{openai_base_url}/chat/completions",
                headers={
                    "Content-Type": "application/json",
                    "Authorization": f"Bearer {api_key}"
                },
                json=request_body,
                timeout=120,
            )
            reply.raise_for_status()
            return reply.json()["choices"][0]["message"]["content"]

        if model_source == "anthropic":
            request_body = {
                "model": user_model,
                "max_tokens": 2048,
                "messages": [{"role": "user", "content": prompt_text}],
            }
            if creativity is not None:
                request_body["temperature"] = creativity
            reply = http_client.post(
                "https://api.anthropic.com/v1/messages",
                headers={
                    "Content-Type": "application/json",
                    "x-api-key": api_key,
                    "anthropic-version": "2023-06-01"
                },
                json=request_body,
                timeout=120,
            )
            reply.raise_for_status()
            blocks = reply.json().get("content", [])
            if blocks and blocks[0].get("type") == "text":
                return blocks[0].get("text", "")
            return None

        if model_source == "google":
            request_body = {
                "contents": [{"parts": [{"text": prompt_text}]}],
                "generationConfig": {**({"temperature": creativity} if creativity is not None else {})}
            }
            reply = http_client.post(
                f"https://generativelanguage.googleapis.com/v1beta/models/{user_model}:generateContent",
                headers={"x-goog-api-key": api_key, "Content-Type": "application/json"},
                json=request_body,
                timeout=120,
            )
            reply.raise_for_status()
            data = reply.json()
            if "candidates" in data and data["candidates"]:
                return data["candidates"][0]["content"]["parts"][0]["text"]
            return None

    except Exception as e:
        print(f"Error in text mode: {e}")
        return None
1710
+
1711
def call_model_with_image(pdf_path, page_index, prompt_text):
    """Send one PDF page together with a text prompt to the configured provider.

    How the page is attached depends on ``model_source`` (read from the
    enclosing scope):

    * ``"anthropic"`` with ``_anthropic_supports_pdf(user_model)`` true: the
      page is attached as a base64 ``application/pdf`` document.
    * ``"anthropic"`` otherwise: the page is attached as a base64
      ``image/png`` image.
    * ``"google"``: the page is attached as ``application/pdf`` inline data.
    * Any other source: the page is rendered via
      ``_extract_page_as_image_bytes`` and sent as an ``image/png`` data URL
      to a ``/chat/completions`` endpoint.

    Args:
        pdf_path: Path of the PDF, passed through to the page-extraction helpers.
        page_index: Index of the page to send, passed through to the helpers.
        prompt_text: Instruction text placed before the page attachment.

    Returns:
        The reply text. ``None`` is returned when page extraction reports the
        page as invalid, when the request raises (the error is printed), when
        the reply carries no usable text, or when ``model_source`` is not a
        handled provider.
    """
    # Providers reached through a "/chat/completions" endpoint with a Bearer token.
    bearer_sources = (
        "openai", "huggingface", "huggingface-together", "xai", "perplexity", "mistral",
    )

    def ask_anthropic(attachment):
        # Post the prompt plus a single attachment block to the Anthropic
        # messages endpoint and return the first content block's text, if any.
        body = {
            "model": user_model,
            "max_tokens": 2048,
            "messages": [{
                "role": "user",
                "content": [{"type": "text", "text": prompt_text}, attachment],
            }],
        }
        if creativity is not None:
            body["temperature"] = creativity
        http_reply = http_client.post(
            "https://api.anthropic.com/v1/messages",
            headers={
                "Content-Type": "application/json",
                "x-api-key": api_key,
                "anthropic-version": "2023-06-01",
            },
            json=body,
            timeout=120,
        )
        http_reply.raise_for_status()
        blocks = http_reply.json().get("content", [])
        if blocks and blocks[0].get("type") == "text":
            return blocks[0].get("text", "")
        return None

    try:
        if model_source == "anthropic":
            if _anthropic_supports_pdf(user_model):
                page_pdf, page_ok = _extract_page_as_pdf_bytes(pdf_path, page_index)
                if not page_ok:
                    return None
                return ask_anthropic({
                    "type": "document",
                    "source": {
                        "type": "base64",
                        "media_type": "application/pdf",
                        "data": _encode_bytes_to_base64(page_pdf),
                    },
                })

            # Model without native PDF input: fall back to a page image.
            page_image, page_ok = _extract_page_as_image_bytes(pdf_path, page_index)
            if not page_ok:
                return None
            return ask_anthropic({
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/png",
                    "data": _encode_bytes_to_base64(page_image),
                },
            })

        if model_source == "google":
            page_pdf, page_ok = _extract_page_as_pdf_bytes(pdf_path, page_index)
            if not page_ok:
                return None
            pdf_b64 = _encode_bytes_to_base64(page_pdf)
            # Temperature is only included when the caller supplied one.
            generation_config = {} if creativity is None else {"temperature": creativity}
            http_reply = http_client.post(
                f"https://generativelanguage.googleapis.com/v1beta/models/{user_model}:generateContent",
                headers={"x-goog-api-key": api_key, "Content-Type": "application/json"},
                json={
                    "contents": [{"parts": [
                        {"text": prompt_text},
                        {"inline_data": {"mime_type": "application/pdf", "data": pdf_b64}},
                    ]}],
                    "generationConfig": generation_config,
                },
                timeout=120,
            )
            http_reply.raise_for_status()
            data = http_reply.json()
            if "candidates" in data and data["candidates"]:
                return data["candidates"][0]["content"]["parts"][0]["text"]
            return None

        # Every remaining provider gets the page as an image; the page is
        # extracted before the provider is checked.
        page_image, page_ok = _extract_page_as_image_bytes(pdf_path, page_index)
        if not page_ok:
            return None
        image_b64 = _encode_bytes_to_base64(page_image)

        if model_source not in bearer_sources:
            return None

        body = {
            "model": user_model,
            "messages": [{
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt_text},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
                ],
            }],
        }
        if creativity is not None:
            body["temperature"] = creativity
        http_reply = http_client.post(
            f"{openai_base_url}/chat/completions",
            headers={
                "Content-Type": "application/json",
                "Authorization": f"Bearer {api_key}",
            },
            json=body,
            timeout=120,
        )
        http_reply.raise_for_status()
        return http_reply.json()["choices"][0]["message"]["content"]

    except Exception as e:
        print(f"Error processing page {page_index}: {e}")
        return None
1861
+
1862
+ # Parse numbered list pattern
1863
+ line_pat = re.compile(r"^\s*\d+\s*[\.\)\-]\s*(.+)$")
1864
+
1865
+ all_items = []
1866
+
1867
+ # Calculate total steps for progress tracking: (iterations * divisions) + 1 for final merge
1868
+ total_steps = (iterations * divisions) + 1
1869
+ current_step = 0
1870
+
1871
+ for pass_idx in range(iterations):
1872
+ # Shuffle page indices for this pass
1873
+ page_indices = list(range(n))
1874
+ rng.shuffle(page_indices)
1875
+
1876
+ # Create chunks
1877
+ chunks = [page_indices[i:i + chunk_size] for i in range(0, len(page_indices), chunk_size)][:divisions]
1878
+
1879
+ for chunk_idx, chunk in enumerate(tqdm(chunks, desc=f"Processing chunks (pass {pass_idx+1}/{iterations})")):
1880
+ if not chunk:
1881
+ continue
1882
+
1883
+ if mode == "text":
1884
+ # TEXT MODE: Extract and concatenate text from all pages in chunk
1885
+ chunk_texts = []
1886
+ for idx in chunk:
1887
+ page_tuple = all_pages[idx]
1888
+ pdf_path, page_index, page_label = page_tuple
1889
+ text, is_valid, _ = _extract_page_text(pdf_path, page_index)
1890
+ if is_valid and text:
1891
+ chunk_texts.append(text)
1892
+
1893
+ if not chunk_texts:
1894
+ continue
1895
+
1896
+ # Concatenate texts with separator
1897
+ combined_text = "\n---\n".join(chunk_texts)
1898
+ prompt = make_text_prompt(combined_text)
1899
+ reply = call_model_with_text(prompt)
1900
+
1901
+ elif mode == "image":
1902
+ # IMAGE MODE: Sample one random page from the full pool
1903
+ random_idx = rng.choice(page_indices)
1904
+ page_tuple = all_pages[random_idx]
1905
+ pdf_path, page_index, _ = page_tuple
1906
+ prompt = make_image_prompt()
1907
+ reply = call_model_with_image(pdf_path, page_index, prompt)
1908
+
1909
+ elif mode == "both":
1910
+ # BOTH MODE: Sample random page, describe with vision, then extract categories from description
1911
+ random_idx = rng.choice(page_indices)
1912
+ page_tuple = all_pages[random_idx]
1913
+ pdf_path, page_index, _ = page_tuple
1914
+
1915
+ # Step 1: Get text description of the page using vision
1916
+ page_description = describe_page_with_vision(pdf_path, page_index)
1917
+ if not page_description:
1918
+ continue
1919
+
1920
+ # Step 2: Extract categories from the description
1921
+ prompt = make_text_prompt(page_description)
1922
+ reply = call_model_with_text(prompt)
1923
+
1924
+ else:
1925
+ raise ValueError(f"Invalid mode: {mode}. Must be 'text', 'image', or 'both'.")
1926
+
1927
+ if reply:
1928
+ # Extract numbered items
1929
+ items = []
1930
+ for raw_line in reply.splitlines():
1931
+ m = line_pat.match(raw_line.strip())
1932
+ if m:
1933
+ items.append(m.group(1).strip())
1934
+ # Fallback for unnumbered lines
1935
+ if not items:
1936
+ for raw_line in reply.splitlines():
1937
+ s = raw_line.strip()
1938
+ if s:
1939
+ items.append(s)
1940
+ all_items.extend(items)
1941
+
1942
+ # Progress callback
1943
+ current_step += 1
1944
+ if progress_callback:
1945
+ progress_callback(current_step, total_steps, f"Pass {pass_idx+1}/{iterations}, chunk {chunk_idx+1}/{len(chunks)}")
1946
+
1947
+ # Normalize and count
1948
def normalize_category(cat):
    """Return an order- and case-insensitive key for a category label.

    The label is split on "/", each part is stripped and lower-cased, the
    parts are sorted, and they are re-joined with "/". "Cost / Access" and
    "access/cost" therefore share the key "access/cost".

    Empty parts (from a leading, trailing or doubled "/") are dropped so that
    labels such as "Housing/" and "Housing" produce the same key instead of
    "/housing" and "housing".

    Args:
        cat: Category label; converted with ``str()`` before processing.

    Returns:
        The normalized key, or "" when the label contains no non-empty part.
    """
    cleaned = (part.strip().lower() for part in str(cat).split("/"))
    return "/".join(sorted(term for term in cleaned if term))
1951
+
1952
+ flat_list = [str(x).strip() for x in all_items if str(x).strip()]
1953
+ if not flat_list:
1954
+ raise ValueError("No categories were extracted from the PDF pages.")
1955
+
1956
+ df = pd.DataFrame(flat_list, columns=["Category"])
1957
+ df["normalized"] = df["Category"].map(normalize_category)
1958
+
1959
+ result = (
1960
+ df.groupby("normalized")
1961
+ .agg(Category=("Category", lambda x: x.value_counts().index[0]),
1962
+ counts=("Category", "size"))
1963
+ .sort_values("counts", ascending=False)
1964
+ .reset_index(drop=True)
1965
+ )
1966
+
1967
+ # Second-pass semantic merge
1968
+ seed_list = result["Category"].head(max_categories * 3).tolist()
1969
+
1970
+ second_prompt = f"""
1971
+ You are a data analyst reviewing categorized document data.
1972
+
1973
+ Task: From the provided categories, identify and return the top {max_categories} CONCEPTUALLY UNIQUE categories.
1974
+
1975
+ Critical Instructions:
1976
+ 1) Exact duplicates are already removed.
1977
+ 2) Merge SEMANTIC duplicates (same concept, different wording).
1978
+ 3) When merging:
1979
+ - Combine frequencies mentally
1980
+ - Keep the most frequent OR clearest label
1981
+ - Each concept appears ONLY ONCE
1982
+ 4) Keep category names {specificity}.
1983
+ 5) Return ONLY a numbered list of {max_categories} categories. No extra text.
1984
+
1985
+ Pre-processed Categories (sorted by frequency, top sample):
1986
+ {seed_list}
1987
+
1988
+ Output:
1989
+ 1. category
1990
+ 2. category
1991
+ ...
1992
+ {max_categories}. category
1993
+ """.strip()
1994
+
1995
# Second-pass merge request: send `second_prompt` to the configured provider
# and capture its raw reply in `top_categories_text`.
#
# `top_categories_text` is bound up front so it is always defined afterwards,
# including when `model_source` matches none of the handled providers.
top_categories_text = ""
try:
    if model_source in ("openai", "huggingface", "huggingface-together", "xai", "perplexity", "mistral"):
        # All of these are reached through a "/chat/completions" endpoint
        # with a Bearer token; requests are used directly, not a vendor SDK.
        endpoint = f"{openai_base_url}/chat/completions"
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}"
        }
        payload = {
            "model": user_model,
            "messages": [{"role": "user", "content": second_prompt}]
        }
        if creativity is not None:
            payload["temperature"] = creativity
        resp2 = http_client.post(endpoint, headers=headers, json=payload, timeout=120)
        resp2.raise_for_status()
        top_categories_text = resp2.json()["choices"][0]["message"]["content"]
    elif model_source == "anthropic":
        endpoint = "https://api.anthropic.com/v1/messages"
        headers = {
            "Content-Type": "application/json",
            "x-api-key": api_key,
            "anthropic-version": "2023-06-01"
        }
        payload = {
            "model": user_model,
            "max_tokens": 2048,
            "messages": [{"role": "user", "content": second_prompt}],
        }
        if creativity is not None:
            payload["temperature"] = creativity
        resp2 = http_client.post(endpoint, headers=headers, json=payload, timeout=120)
        resp2.raise_for_status()
        # Keep the parsed reply in its own name: `result` holds the category
        # counts DataFrame that is written to CSV and returned afterwards.
        merge_reply = resp2.json()
        resp_content = merge_reply.get("content", [])
        if resp_content and resp_content[0].get("type") == "text":
            top_categories_text = resp_content[0].get("text", "")
        else:
            top_categories_text = ""
    elif model_source == "google":
        url = f"https://generativelanguage.googleapis.com/v1beta/models/{user_model}:generateContent"
        headers = {"x-goog-api-key": api_key, "Content-Type": "application/json"}
        payload = {
            "contents": [{"parts": [{"text": second_prompt}]}],
            # Temperature is only included when the caller supplied one.
            "generationConfig": {**({"temperature": creativity} if creativity is not None else {})}
        }
        response = http_client.post(url, headers=headers, json=payload, timeout=120)
        response.raise_for_status()
        res = response.json()
        top_categories_text = res["candidates"][0]["content"]["parts"][0]["text"]
except Exception as e:
    # Best effort: a failed merge leaves an empty reply rather than aborting.
    print(f"Error in second-pass merge: {e}")
    top_categories_text = ""
2064
+
2065
+ # Final progress callback for the merge step
2066
+ if progress_callback:
2067
+ progress_callback(total_steps, total_steps, "Merging categories")
2068
+
2069
+ # Parse final list
2070
+ final = []
2071
+ for line in top_categories_text.splitlines():
2072
+ m = line_pat.match(line.strip())
2073
+ if m:
2074
+ final.append(m.group(1).strip())
2075
+ if not final:
2076
+ final = [l.strip("-*• ").strip() for l in top_categories_text.splitlines() if l.strip()]
2077
+
2078
+ print("\nTop categories:\n" + "\n".join(f"{i+1}. {c}" for i, c in enumerate(final[:max_categories])))
2079
+
2080
+ if filename:
2081
+ result.to_csv(filename, index=False)
2082
+
2083
+ return {
2084
+ "counts_df": result,
2085
+ "top_categories": final[:max_categories],
2086
+ "raw_top_text": top_categories_text
2087
+ }