cat-stack 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cat_stack/__about__.py +10 -0
- cat_stack/__init__.py +128 -0
- cat_stack/_batch.py +1388 -0
- cat_stack/_category_analysis.py +348 -0
- cat_stack/_chunked.py +424 -0
- cat_stack/_embeddings.py +189 -0
- cat_stack/_formatter.py +169 -0
- cat_stack/_providers.py +1048 -0
- cat_stack/_tiebreaker.py +277 -0
- cat_stack/_utils.py +512 -0
- cat_stack/_web_fetch.py +194 -0
- cat_stack/calls/CoVe.py +287 -0
- cat_stack/calls/__init__.py +25 -0
- cat_stack/calls/all_calls.py +622 -0
- cat_stack/calls/image_CoVe.py +386 -0
- cat_stack/calls/image_stepback.py +210 -0
- cat_stack/calls/pdf_CoVe.py +386 -0
- cat_stack/calls/pdf_stepback.py +210 -0
- cat_stack/calls/stepback.py +180 -0
- cat_stack/calls/top_n.py +217 -0
- cat_stack/classify.py +682 -0
- cat_stack/explore.py +111 -0
- cat_stack/extract.py +218 -0
- cat_stack/image_functions.py +2078 -0
- cat_stack/images/circle.png +0 -0
- cat_stack/images/cube.png +0 -0
- cat_stack/images/diamond.png +0 -0
- cat_stack/images/overlapping_pentagons.png +0 -0
- cat_stack/images/rectangles.png +0 -0
- cat_stack/model_reference_list.py +94 -0
- cat_stack/pdf_functions.py +2087 -0
- cat_stack/summarize.py +290 -0
- cat_stack/text_functions.py +1358 -0
- cat_stack/text_functions_ensemble.py +3644 -0
- cat_stack-0.1.0.dist-info/METADATA +150 -0
- cat_stack-0.1.0.dist-info/RECORD +38 -0
- cat_stack-0.1.0.dist-info/WHEEL +4 -0
- cat_stack-0.1.0.dist-info/licenses/LICENSE +672 -0
|
@@ -0,0 +1,2087 @@
|
|
|
1
|
+
import warnings
|
|
2
|
+
|
|
3
|
+
from .text_functions import _detect_model_source
|
|
4
|
+
from .calls.pdf_stepback import get_pdf_stepback_insight
|
|
5
|
+
|
|
6
|
+
# Exported names (excludes deprecated pdf_multi_class)
|
|
7
|
+
__all__ = [
|
|
8
|
+
"_load_pdf_files",
|
|
9
|
+
"_get_pdf_pages",
|
|
10
|
+
"_extract_page_as_pdf_bytes",
|
|
11
|
+
"_extract_page_as_image_bytes",
|
|
12
|
+
"_encode_bytes_to_base64",
|
|
13
|
+
"_extract_page_text",
|
|
14
|
+
"explore_pdf_categories",
|
|
15
|
+
]
|
|
16
|
+
from .calls.pdf_CoVe import (
|
|
17
|
+
pdf_chain_of_verification_openai,
|
|
18
|
+
pdf_chain_of_verification_anthropic,
|
|
19
|
+
pdf_chain_of_verification_google,
|
|
20
|
+
pdf_chain_of_verification_mistral
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _anthropic_supports_pdf(model_name):
|
|
25
|
+
"""Check if the Anthropic model supports native PDF input.
|
|
26
|
+
|
|
27
|
+
PDF support is available for Claude 3.5 Sonnet, Claude 3 Opus, and Claude 3 Sonnet,
|
|
28
|
+
but NOT for Claude 3 Haiku.
|
|
29
|
+
"""
|
|
30
|
+
model_lower = model_name.lower()
|
|
31
|
+
# Haiku models don't support PDF
|
|
32
|
+
if "haiku" in model_lower:
|
|
33
|
+
return False
|
|
34
|
+
# Sonnet, Opus support PDF
|
|
35
|
+
if any(x in model_lower for x in ["sonnet", "opus"]):
|
|
36
|
+
return True
|
|
37
|
+
# Default to False for unknown models to be safe
|
|
38
|
+
return False
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _load_pdf_files(pdf_input):
|
|
42
|
+
"""Load PDF files from directory path, single file path, or return list as-is."""
|
|
43
|
+
import os
|
|
44
|
+
import glob
|
|
45
|
+
|
|
46
|
+
if isinstance(pdf_input, list):
|
|
47
|
+
pdf_files = pdf_input
|
|
48
|
+
print(f"Provided a list of {len(pdf_input)} PDFs.")
|
|
49
|
+
elif os.path.isfile(pdf_input):
|
|
50
|
+
# Single file path
|
|
51
|
+
pdf_files = [pdf_input]
|
|
52
|
+
print(f"Provided 1 PDF file.")
|
|
53
|
+
elif os.path.isdir(pdf_input):
|
|
54
|
+
# Directory path - glob for PDFs
|
|
55
|
+
pdf_files = glob.glob(os.path.join(pdf_input, '*.pdf'))
|
|
56
|
+
pdf_files.extend(glob.glob(os.path.join(pdf_input, '*.PDF')))
|
|
57
|
+
# Remove duplicates (case-insensitive systems)
|
|
58
|
+
seen = set()
|
|
59
|
+
unique_files = []
|
|
60
|
+
for f in pdf_files:
|
|
61
|
+
if f.lower() not in seen:
|
|
62
|
+
seen.add(f.lower())
|
|
63
|
+
unique_files.append(f)
|
|
64
|
+
pdf_files = unique_files
|
|
65
|
+
print(f"Found {len(pdf_files)} PDFs in directory.")
|
|
66
|
+
else:
|
|
67
|
+
raise FileNotFoundError(f"PDF input not found: {pdf_input}")
|
|
68
|
+
|
|
69
|
+
return pdf_files
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _get_pdf_pages(pdf_path):
    """
    Enumerate the pages of a PDF without extracting any page content.

    Returns list of tuples: [(pdf_path, page_index, page_label), ...]

    For 'document.pdf' with 3 pages:
    [("document.pdf", 0, "document_p1"),
     ("document.pdf", 1, "document_p2"),
     ("document.pdf", 2, "document_p3")]

    The actual page data is extracted later based on provider needs.

    Args:
        pdf_path (str): Path of the PDF to inspect.

    Returns:
        list: One tuple per page; an empty list when the PDF has no pages
        or cannot be read (the problem is printed, not raised).

    Raises:
        ImportError: If PyMuPDF is not installed.
    """
    from pathlib import Path

    try:
        import fitz  # PyMuPDF
    except ImportError as exc:
        raise ImportError(
            "PyMuPDF is required for PDF processing. "
            "Install it with: pip install PyMuPDF"
        ) from exc

    pdf_name = Path(pdf_path).stem  # filename without extension

    try:
        doc = fitz.open(pdf_path)
        try:
            page_count = len(doc)
        finally:
            # Release the file handle even if counting the pages fails.
            doc.close()
    except Exception as e:
        print(f"Error reading PDF {pdf_path}: {e}")
        return []

    if page_count == 0:
        print(f"Warning: {pdf_path} has no pages")
        return []

    # Labels are 1-based for readability; page_index stays 0-based.
    return [(pdf_path, i, f"{pdf_name}_p{i + 1}") for i in range(page_count)]
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _extract_page_as_pdf_bytes(pdf_path, page_index):
    """
    Extract a single page from a PDF as PDF bytes.
    Used for providers with native PDF support (Anthropic, Google).

    Args:
        pdf_path (str): Path of the source PDF.
        page_index (int): Index of the page to extract.

    Returns:
        tuple: ``(pdf_bytes, True)`` on success, ``(None, False)`` when any
        step fails (the error is printed, not raised).
    """
    import fitz  # PyMuPDF

    try:
        doc = fitz.open(pdf_path)
        try:
            # Kept from the original flow: load the page before copying, so
            # a bad page_index presumably fails here rather than inside
            # insert_pdf.
            doc[page_index]

            # Create a new PDF with just this page
            new_doc = fitz.open()
            try:
                new_doc.insert_pdf(doc, from_page=page_index, to_page=page_index)
                pdf_bytes = new_doc.tobytes()
            finally:
                new_doc.close()
        finally:
            # Both documents are closed on the failure path as well, so a
            # bad page no longer leaks an open file handle.
            doc.close()

        return pdf_bytes, True

    except Exception as e:
        print(f"Error extracting page {page_index} from {pdf_path}: {e}")
        return None, False
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def _extract_page_as_image_bytes(pdf_path, page_index, dpi=150):
    """
    Extract a single page from a PDF as PNG image bytes.
    Used for providers without native PDF support (OpenAI, Mistral, etc.).

    Args:
        pdf_path (str): Path of the source PDF.
        page_index (int): Index of the page to render.
        dpi (int): Target resolution; the page is scaled by ``dpi / 72``.

    Returns:
        tuple: ``(image_bytes, True)`` on success, ``(None, False)`` when
        any step fails (the error is printed, not raised).
    """
    import fitz  # PyMuPDF

    try:
        doc = fitz.open(pdf_path)
        try:
            page = doc[page_index]

            # Render page to image
            zoom = dpi / 72  # 72 is default PDF DPI
            pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))

            # Get PNG bytes
            image_bytes = pix.tobytes("png")
        finally:
            # Close the document on the failure path too.
            doc.close()

        return image_bytes, True

    except Exception as e:
        print(f"Error rendering page {page_index} from {pdf_path}: {e}")
        return None, False
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _encode_bytes_to_base64(data_bytes):
|
|
169
|
+
"""Encode bytes to base64 string."""
|
|
170
|
+
import base64
|
|
171
|
+
return base64.b64encode(data_bytes).decode("utf-8")
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def _extract_page_text(pdf_path, page_index):
    """
    Extract text content from a single PDF page.
    Used for text-based processing mode.

    Args:
        pdf_path (str): Path of the source PDF.
        page_index (int): Index of the page to read.

    Returns:
        tuple: ``(text, True, None)`` with the stripped page text on
        success; ``(None, False, message)`` when the page has no
        extractable text or when reading fails (the error is also printed).
    """
    import fitz  # PyMuPDF

    try:
        doc = fitz.open(pdf_path)
        try:
            text = doc[page_index].get_text("text")
        finally:
            # Close the document on the failure path too.
            doc.close()

        stripped = text.strip()
        if not stripped:
            return None, False, "Page contains no extractable text"

        return stripped, True, None

    except Exception as e:
        print(f"Error extracting text from page {page_index} of {pdf_path}: {e}")
        return None, False, str(e)
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
# PDF multi-class (binary) function
|
|
198
|
+
def pdf_multi_class(
|
|
199
|
+
pdf_description,
|
|
200
|
+
pdf_input,
|
|
201
|
+
categories,
|
|
202
|
+
api_key,
|
|
203
|
+
user_model="gpt-4o",
|
|
204
|
+
mode="image",
|
|
205
|
+
creativity=None,
|
|
206
|
+
safety=False,
|
|
207
|
+
chain_of_verification=False,
|
|
208
|
+
chain_of_thought=True,
|
|
209
|
+
step_back_prompt=False,
|
|
210
|
+
context_prompt=False,
|
|
211
|
+
thinking_budget=0,
|
|
212
|
+
example1=None,
|
|
213
|
+
example2=None,
|
|
214
|
+
example3=None,
|
|
215
|
+
example4=None,
|
|
216
|
+
example5=None,
|
|
217
|
+
example6=None,
|
|
218
|
+
filename=None,
|
|
219
|
+
save_directory=None,
|
|
220
|
+
model_source="auto",
|
|
221
|
+
progress_callback=None
|
|
222
|
+
):
|
|
223
|
+
"""
|
|
224
|
+
Categorize PDF pages using LLMs with multi-label classification.
|
|
225
|
+
|
|
226
|
+
Each page of each PDF is processed separately, with output labeled as
|
|
227
|
+
{pdf_name}_p{page_number} (e.g., "report_p1", "report_p2").
|
|
228
|
+
|
|
229
|
+
Args:
|
|
230
|
+
pdf_description (str): Description of the PDF documents being categorized.
|
|
231
|
+
pdf_input (str or list): Directory path containing PDFs, or list of PDF file paths.
|
|
232
|
+
categories (list or "auto"): List of category names for classification,
|
|
233
|
+
or "auto" to automatically extract categories from the PDFs first.
|
|
234
|
+
api_key (str): API key for the model provider.
|
|
235
|
+
user_model (str): Model name to use. Default "gpt-4o".
|
|
236
|
+
mode (str): How to process PDF pages. Options:
|
|
237
|
+
- "image": Render pages as images (best for visual elements like charts/tables)
|
|
238
|
+
- "text": Extract text only (best for text-heavy documents, faster/cheaper)
|
|
239
|
+
- "both": Send both text and image (most comprehensive but slower/costlier)
|
|
240
|
+
Default is "image".
|
|
241
|
+
creativity (float): Temperature setting. None uses model default.
|
|
242
|
+
safety (bool): If True, saves progress after each page.
|
|
243
|
+
chain_of_verification (bool): Enable Chain of Verification for accuracy.
|
|
244
|
+
chain_of_thought (bool): Enable step-by-step reasoning. Default True.
|
|
245
|
+
step_back_prompt (bool): Enable step-back prompting for abstract thinking.
|
|
246
|
+
context_prompt (bool): Add expert context to prompts.
|
|
247
|
+
thinking_budget (int): Token budget for thinking (Google models).
|
|
248
|
+
example1-6 (str): Example categorizations for few-shot learning.
|
|
249
|
+
filename (str): Output filename for CSV.
|
|
250
|
+
save_directory (str): Directory to save results.
|
|
251
|
+
model_source (str): Provider - "auto", "openai", "anthropic", "google",
|
|
252
|
+
"mistral", "perplexity", "huggingface", "xai".
|
|
253
|
+
|
|
254
|
+
Returns:
|
|
255
|
+
pd.DataFrame: Results with columns:
|
|
256
|
+
- pdf_input: Page label (e.g., "report_p1")
|
|
257
|
+
- model_response: Raw model response
|
|
258
|
+
- json: Extracted JSON
|
|
259
|
+
- category_1, category_2, ...: Binary category assignments
|
|
260
|
+
- processing_status: "success" or "error"
|
|
261
|
+
|
|
262
|
+
Example:
|
|
263
|
+
>>> import cat_stack as cat
|
|
264
|
+
>>> # Image mode (default) - good for documents with charts/tables
|
|
265
|
+
>>> results = cat.pdf_multi_class(
|
|
266
|
+
... pdf_description="financial reports",
|
|
267
|
+
... pdf_input="/path/to/pdfs/",
|
|
268
|
+
... categories=["has_chart", "has_table", "is_summary"],
|
|
269
|
+
... api_key="your-api-key",
|
|
270
|
+
... mode="image"
|
|
271
|
+
... )
|
|
272
|
+
>>> # Text mode - good for text-heavy documents, faster and cheaper
|
|
273
|
+
>>> results = cat.pdf_multi_class(
|
|
274
|
+
... pdf_description="research papers",
|
|
275
|
+
... pdf_input="/path/to/pdfs/",
|
|
276
|
+
... categories=["discusses_methodology", "has_results"],
|
|
277
|
+
... api_key="your-api-key",
|
|
278
|
+
... mode="text"
|
|
279
|
+
... )
|
|
280
|
+
|
|
281
|
+
.. deprecated::
|
|
282
|
+
Use :func:`cat_stack.classify` instead. This function will be removed in a future version.
|
|
283
|
+
"""
|
|
284
|
+
warnings.warn(
|
|
285
|
+
"pdf_multi_class() is deprecated and will be removed in a future version. "
|
|
286
|
+
"Use cat_stack.classify() instead, which auto-detects PDF input.",
|
|
287
|
+
DeprecationWarning,
|
|
288
|
+
stacklevel=2,
|
|
289
|
+
)
|
|
290
|
+
|
|
291
|
+
import os
|
|
292
|
+
import json
|
|
293
|
+
import pandas as pd
|
|
294
|
+
import regex
|
|
295
|
+
import time
|
|
296
|
+
from tqdm import tqdm
|
|
297
|
+
|
|
298
|
+
if save_directory is not None and not os.path.isdir(save_directory):
|
|
299
|
+
raise FileNotFoundError(f"Directory {save_directory} doesn't exist")
|
|
300
|
+
|
|
301
|
+
# Validate mode parameter
|
|
302
|
+
mode = mode.lower()
|
|
303
|
+
if mode not in {"image", "text", "both"}:
|
|
304
|
+
raise ValueError(f"mode must be 'image', 'text', or 'both', got: {mode}")
|
|
305
|
+
|
|
306
|
+
model_source = _detect_model_source(user_model, model_source)
|
|
307
|
+
|
|
308
|
+
# Providers with native PDF support (only used in image/both modes)
|
|
309
|
+
native_pdf_providers = {"anthropic", "google"}
|
|
310
|
+
|
|
311
|
+
print(f"Processing mode: {mode}")
|
|
312
|
+
|
|
313
|
+
# Load PDF files
|
|
314
|
+
pdf_files = _load_pdf_files(pdf_input)
|
|
315
|
+
|
|
316
|
+
# Extract all pages from all PDFs
|
|
317
|
+
all_pages = [] # List of (pdf_path, page_index, page_label)
|
|
318
|
+
for pdf_path in pdf_files:
|
|
319
|
+
pages = _get_pdf_pages(pdf_path)
|
|
320
|
+
all_pages.extend(pages)
|
|
321
|
+
|
|
322
|
+
print(f"Total pages to process: {len(all_pages)}")
|
|
323
|
+
|
|
324
|
+
# Handle "auto" categories - extract categories first
|
|
325
|
+
if categories == "auto":
|
|
326
|
+
if not pdf_description:
|
|
327
|
+
raise ValueError("pdf_description is required when using categories='auto'")
|
|
328
|
+
|
|
329
|
+
print("\nAuto-extracting categories from PDFs...")
|
|
330
|
+
auto_result = explore_pdf_categories(
|
|
331
|
+
pdf_input=pdf_input,
|
|
332
|
+
api_key=api_key,
|
|
333
|
+
pdf_description=pdf_description,
|
|
334
|
+
user_model=user_model,
|
|
335
|
+
mode=mode,
|
|
336
|
+
model_source=model_source,
|
|
337
|
+
creativity=creativity
|
|
338
|
+
)
|
|
339
|
+
categories = auto_result["top_categories"]
|
|
340
|
+
print(f"Extracted {len(categories)} categories: {categories}\n")
|
|
341
|
+
|
|
342
|
+
categories_str = "\n".join(f"{i + 1}. {cat}" for i, cat in enumerate(categories))
|
|
343
|
+
cat_num = len(categories)
|
|
344
|
+
category_dict = {str(i+1): "0" for i in range(cat_num)}
|
|
345
|
+
example_JSON = json.dumps(category_dict, indent=4)
|
|
346
|
+
|
|
347
|
+
print(f"\nCategories to classify by {model_source} {user_model}:")
|
|
348
|
+
for i, cat in enumerate(categories, 1):
|
|
349
|
+
print(f"{i}. {cat}")
|
|
350
|
+
|
|
351
|
+
# Build examples text from provided examples
|
|
352
|
+
examples = [example1, example2, example3, example4, example5, example6]
|
|
353
|
+
examples = [ex for ex in examples if ex is not None]
|
|
354
|
+
if examples:
|
|
355
|
+
examples_text = "Here are some examples of how to categorize:\n" + "\n".join(examples)
|
|
356
|
+
else:
|
|
357
|
+
examples_text = ""
|
|
358
|
+
|
|
359
|
+
# Helper function for CoVe
|
|
360
|
+
def remove_numbering(line):
    """Strip a leading list marker from ``line``.

    Recognised markers are "- ", "• ", and a run of digits immediately
    followed by "." or ")". The result is whitespace-stripped; a line
    without a recognised marker is returned stripped but otherwise intact.
    """
    text = line.strip()

    for bullet in ("- ", "• "):
        if text.startswith(bullet):
            return text[len(bullet):].strip()

    # Count the leading digit characters.
    digits = 0
    for ch in text:
        if not ch.isdigit():
            break
        digits += 1

    # Slicing yields "" when the digits run to the end of the string, so a
    # bare number such as "42" is left alone.
    if digits and text[digits:digits + 1] in (".", ")"):
        return text[digits + 1:].strip()

    return text
|
|
375
|
+
|
|
376
|
+
# Step-back insight initialization
|
|
377
|
+
if step_back_prompt:
|
|
378
|
+
stepback = f"""What are the key content patterns or elements that typically indicate the presence of these categories in document pages showing "{pdf_description}"?
|
|
379
|
+
|
|
380
|
+
Categories to consider:
|
|
381
|
+
{categories_str}
|
|
382
|
+
|
|
383
|
+
Provide a brief analysis of what content cues to look for when categorizing such document pages."""
|
|
384
|
+
|
|
385
|
+
stepback_insight, step_back_added = get_pdf_stepback_insight(
|
|
386
|
+
model_source, stepback, api_key, user_model, creativity
|
|
387
|
+
)
|
|
388
|
+
else:
|
|
389
|
+
stepback_insight = None
|
|
390
|
+
step_back_added = False
|
|
391
|
+
|
|
392
|
+
page_labels = []
|
|
393
|
+
link1 = []
|
|
394
|
+
extracted_jsons = []
|
|
395
|
+
|
|
396
|
+
def _build_base_prompt_text(page_text=None):
    """Assemble the classification prompt text for one page.

    Reads ``mode``, ``chain_of_thought``, ``context_prompt``,
    ``pdf_description``, ``categories_str``, ``examples_text`` and
    ``example_JSON`` from the enclosing scope.

    Args:
        page_text (str, optional): Text extracted from the page. It is
            appended to the prompt only when truthy and ``mode`` is
            "text" or "both".

    Returns:
        str: The prompt text, prefixed with an expert-context paragraph
        when ``context_prompt`` is set.
    """
    # Any mode other than "text"/"both" gets the image-mode wording.
    instruction_by_mode = {
        "text": "Examine the following text extracted from a PDF page",
        "both": "Examine the attached PDF page image AND the extracted text below",
    }
    examine_instruction = instruction_by_mode.get(mode, "Examine the attached PDF page")

    task_section = (
        "You are a document-tagging assistant.\n"
        f"Task ► {examine_instruction} and decide, **for each category below**, "
        "whether it is PRESENT (1) or NOT PRESENT (0).\n\n"
        f"Document page is expected to contain: {pdf_description}\n\n"
        f"Categories:\n{categories_str}\n\n"
    )

    # The step-by-step block is the only difference between the
    # chain-of-thought prompt and the plain one.
    if chain_of_thought:
        reasoning_section = (
            "Let's analyze step by step:\n"
            "1. First, identify the key content elements in the document page\n"
            "2. Then, match each element to the relevant categories\n"
            "3. Finally, assign 1 to matching categories and 0 to non-matching categories\n\n"
        )
    else:
        reasoning_section = ""

    output_section = (
        f"{examples_text}\n\n"
        "Output format ► Respond with **only** a JSON object whose keys are the "
        "quoted category numbers ('1', '2', …) and whose values are 1 or 0. "
        "No additional keys, comments, or text.\n\n"
        "Example (three categories):\n"
        f"{example_JSON}"
    )

    prompt_text = task_section + reasoning_section + output_section

    # Add extracted text for text and both modes
    if page_text and mode in ("text", "both"):
        prompt_text += (
            f"\n\n--- EXTRACTED TEXT FROM PAGE ---\n{page_text}\n--- END OF EXTRACTED TEXT ---"
        )

    if not context_prompt:
        return prompt_text

    expert_context = (
        "You are an expert document analyst specializing in page categorization. "
        "Apply multi-label classification based on explicit and implicit content cues. "
        "When uncertain, prioritize precision over recall.\n\n"
    )
    return expert_context + prompt_text
|
|
452
|
+
|
|
453
|
+
def _build_cove_prompts(base_prompt_text):
    """Build the three chain-of-verification prompt templates for a page.

    The templates keep the literal placeholders ``<<INITIAL_REPLY>>``,
    ``<<QUESTION>>`` and ``<<VERIFICATION_QA>>``; nothing in this function
    substitutes them. ``pdf_description`` is read from the enclosing scope.

    Args:
        base_prompt_text (str): The original classification prompt,
            embedded in the step-2 and step-4 templates.

    Returns:
        tuple: ``(step2_prompt, step3_prompt, step4_prompt)``.
    """
    # Step 2: ask for verification questions about the first answer.
    step2_prompt = "\n".join([
        "You provided this initial categorization:",
        "<<INITIAL_REPLY>>",
        "",
        f"Original task: {base_prompt_text}",
        "",
        "Generate a focused list of 3-5 verification questions to fact-check your categorization. Each question should:",
        "- Be concise and specific (one sentence)",
        "- Address a distinct content element or category assignment",
        "- Be answerable by re-examining the document page",
        "",
        "Focus on verifying:",
        "- Whether each category assignment matches what's visible in the page",
        "- Whether any content elements were missed or misinterpreted",
        "- Whether there are any logical inconsistencies",
        "",
        "Provide only the verification questions as a numbered list.",
    ])

    # Step 3: answer a single verification question against the page.
    step3_prompt = "\n".join([
        "Re-examine the attached document page and answer the following verification question.",
        "",
        f"Document description: {pdf_description}",
        "",
        "Verification question: <<QUESTION>>",
        "",
        "Provide a brief, direct answer (1-2 sentences maximum) based on what you observe in the page.",
        "",
        "Answer:",
    ])

    # Step 4: produce the corrected categorization from the Q&A transcript.
    step4_prompt = "\n".join([
        f"Original task: {base_prompt_text}",
        "Initial categorization:",
        "<<INITIAL_REPLY>>",
        "Verification questions and answers:",
        "<<VERIFICATION_QA>>",
        "Based on this verification, provide the final corrected categorization.",
        'If no categories are present, assign "0" to all categories.',
        "Provide the final categorization in the same JSON format:",
    ])

    return step2_prompt, step3_prompt, step4_prompt
|
|
492
|
+
|
|
493
|
+
def _build_prompt_openai_mistral(encoded_image, base_text):
|
|
494
|
+
"""Build prompt for OpenAI/Mistral format (PDF converted to image)."""
|
|
495
|
+
encoded_image_url = f"data:image/png;base64,{encoded_image}"
|
|
496
|
+
return [
|
|
497
|
+
{"type": "text", "text": base_text},
|
|
498
|
+
{"type": "image_url", "image_url": {"url": encoded_image_url, "detail": "high"}},
|
|
499
|
+
]
|
|
500
|
+
|
|
501
|
+
def _build_prompt_anthropic_pdf(encoded_pdf, base_text):
|
|
502
|
+
"""Build prompt for Anthropic format with native PDF support."""
|
|
503
|
+
return [
|
|
504
|
+
{"type": "text", "text": base_text},
|
|
505
|
+
{
|
|
506
|
+
"type": "document",
|
|
507
|
+
"source": {
|
|
508
|
+
"type": "base64",
|
|
509
|
+
"media_type": "application/pdf",
|
|
510
|
+
"data": encoded_pdf
|
|
511
|
+
}
|
|
512
|
+
}
|
|
513
|
+
]
|
|
514
|
+
|
|
515
|
+
def _build_prompt_anthropic_image(encoded_image, base_text):
|
|
516
|
+
"""Build prompt for Anthropic format with image (for Haiku and other non-PDF models)."""
|
|
517
|
+
return [
|
|
518
|
+
{"type": "text", "text": base_text},
|
|
519
|
+
{
|
|
520
|
+
"type": "image",
|
|
521
|
+
"source": {
|
|
522
|
+
"type": "base64",
|
|
523
|
+
"media_type": "image/png",
|
|
524
|
+
"data": encoded_image
|
|
525
|
+
}
|
|
526
|
+
}
|
|
527
|
+
]
|
|
528
|
+
|
|
529
|
+
def _build_prompt_google_pdf(encoded_pdf, base_text):
|
|
530
|
+
"""Build prompt data for Google format with native PDF support."""
|
|
531
|
+
return {
|
|
532
|
+
"text_prompt": base_text,
|
|
533
|
+
"pdf_data": encoded_pdf,
|
|
534
|
+
"mime_type": "application/pdf"
|
|
535
|
+
}
|
|
536
|
+
|
|
537
|
+
def _call_openai_compatible(prompt, step2_prompt, step3_prompt, step4_prompt, pdf_content):
    """Handle OpenAI-compatible API calls (OpenAI, Perplexity, HuggingFace, xAI).

    Uses direct HTTP requests instead of OpenAI SDK for lighter dependencies.

    Reads ``model_source``, ``api_key``, ``user_model``, ``creativity``,
    ``chain_of_verification``, the step-back variables, ``remove_numbering``
    and ``time`` from the enclosing scope.

    Args:
        prompt: Message content for the classification request.
        step2_prompt, step3_prompt, step4_prompt (str): Chain-of-verification
            templates, forwarded only when ``chain_of_verification`` is set.
        pdf_content: Page content forwarded to the verification helper.

    Returns:
        tuple: ``(reply, None)`` on success, or ``('{"1":"e"}', message)``
        when the request ultimately fails.

    Raises:
        ValueError: If the endpoint answers 404 (model not found).
    """
    import requests as req

    # Determine the base URL based on model source
    if model_source == "huggingface":
        from cat_stack.text_functions import _detect_huggingface_endpoint
        base_url = _detect_huggingface_endpoint(api_key, user_model)
    else:
        base_url = {
            "huggingface-together": "https://router.huggingface.co/together/v1",
            "perplexity": "https://api.perplexity.ai",
            "xai": "https://api.x.ai/v1",
        }.get(model_source, "https://api.openai.com/v1")

    endpoint = f"{base_url}/chat/completions"

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    # The request body is identical on every attempt, so build it once.
    messages = []
    if step_back_prompt and step_back_added:
        messages.append({'role': 'user', 'content': stepback})
        messages.append({'role': 'assistant', 'content': stepback_insight})
    messages.append({'role': 'user', 'content': prompt})

    payload = {
        "model": user_model,
        "messages": messages,
    }
    if creativity is not None:
        payload["temperature"] = creativity

    max_retries = 8
    delay = 2
    error_reply = """{"1":"e"}"""
    # Rate limiting and transient server errors are worth another attempt.
    retryable_statuses = (429, 500, 502, 503, 504)

    for attempt in range(max_retries):
        can_retry = attempt < max_retries - 1
        wait_time = delay * (2 ** attempt)  # exponential backoff

        try:
            response = req.post(endpoint, headers=headers, json=payload, timeout=120)
            response.raise_for_status()
            result = response.json()
            reply = result["choices"][0]["message"]["content"]

            if chain_of_verification:
                reply = pdf_chain_of_verification_openai(
                    initial_reply=reply,
                    step2_prompt=step2_prompt,
                    step3_prompt=step3_prompt,
                    step4_prompt=step4_prompt,
                    client=None,  # Not used anymore - CoVe will use requests
                    user_model=user_model,
                    creativity=creativity,
                    remove_numbering=remove_numbering,
                    pdf_content=pdf_content,
                    api_key=api_key,
                    base_url=base_url
                )

            return reply, None

        except req.exceptions.HTTPError as e:
            # A requests.Response is falsy for every 4xx/5xx status, so the
            # response must be compared against None; a plain truthiness
            # test would discard the status code of exactly the errors
            # handled below.
            status_code = e.response.status_code if e.response is not None else None

            if status_code == 404:
                raise ValueError(f"❌ Model '{user_model}' on {model_source} not found.") from e

            if status_code == 400 and can_retry:
                print(f"⚠️ Bad request. Attempt {attempt + 1}/{max_retries}")
                print(f"Retrying in {wait_time}s...")
                time.sleep(wait_time)
            elif status_code in retryable_statuses and can_retry:
                print(f"Attempt {attempt + 1} failed with error: {e}")
                print(f"Retrying in {wait_time}s...")
                time.sleep(wait_time)
            else:
                print(f"❌ Failed after {max_retries} attempts: {e}")
                return error_reply, f"Error processing input: {e}"

        except Exception as e:
            # Non-HTTP failures are retried only when the message mentions
            # a 500/504.
            if ("500" in str(e) or "504" in str(e)) and can_retry:
                print(f"Attempt {attempt + 1} failed with error: {e}")
                print(f"Retrying in {wait_time}s...")
                time.sleep(wait_time)
            else:
                print(f"❌ Failed after {max_retries} attempts: {e}")
                return error_reply, f"Error processing input: {e}"

    return error_reply, "Max retries exceeded"
|
|
634
|
+
|
|
635
|
+
def _call_anthropic(prompt, step2_prompt, step3_prompt, step4_prompt, pdf_content):
    """Send one classification request to the Anthropic Messages endpoint.

    Uses direct HTTP requests. Reads ``api_key``, ``user_model``,
    ``creativity``, ``chain_of_verification``, ``model_source``, the
    step-back variables and ``remove_numbering`` from the enclosing scope.

    Args:
        prompt: Message content for the classification request.
        step2_prompt, step3_prompt, step4_prompt (str): Chain-of-verification
            templates, forwarded only when ``chain_of_verification`` is set.
        pdf_content: Page content forwarded to the verification helper.

    Returns:
        tuple: ``(reply, None)`` on success, or ``('{"1":"e"}', message)``
        when the request fails or the first content block is not text.

    Raises:
        ValueError: If the endpoint answers 404 (model not found).
    """
    import requests as req

    failure_json = """{"1":"e"}"""
    request_headers = {
        "Content-Type": "application/json",
        "x-api-key": api_key,
        "anthropic-version": "2023-06-01"
    }

    try:
        # Optional step-back exchange first, then the actual request.
        conversation = []
        if step_back_prompt and step_back_added:
            conversation += [
                {'role': 'user', 'content': stepback},
                {'role': 'assistant', 'content': stepback_insight},
            ]
        conversation.append({'role': 'user', 'content': prompt})

        body = {
            "model": user_model,
            "max_tokens": 1024,
            "messages": conversation,
        }
        if creativity is not None:
            body["temperature"] = creativity

        response = req.post(
            "https://api.anthropic.com/v1/messages",
            headers=request_headers,
            json=body,
            timeout=120,
        )
        response.raise_for_status()
        blocks = response.json().get("content", [])

        # Only the first content block is inspected.
        if not blocks or blocks[0].get("type") != "text":
            return failure_json, "No text content in response"
        reply = blocks[0].get("text", "")

        if chain_of_verification:
            reply = pdf_chain_of_verification_anthropic(
                initial_reply=reply,
                step2_prompt=step2_prompt,
                step3_prompt=step3_prompt,
                step4_prompt=step4_prompt,
                client=None,  # No longer using SDK client
                user_model=user_model,
                creativity=creativity,
                remove_numbering=remove_numbering,
                pdf_content=pdf_content,
                api_key=api_key  # Pass api_key for HTTP calls
            )

        return reply, None

    except req.exceptions.HTTPError as e:
        if e.response is not None and e.response.status_code == 404:
            raise ValueError(f"❌ Model '{user_model}' on {model_source} not found.") from e
        print(f"An error occurred: {e}")
        return failure_json, f"Error processing input: {e}"
    except Exception as e:
        print(f"An error occurred: {e}")
        return failure_json, f"Error processing input: {e}"
|
|
696
|
+
|
|
697
|
+
def _call_google(prompt_data, step2_prompt, step3_prompt, step4_prompt, base_prompt_text):
    """Handle Google API calls with native PDF support.

    The model name, API key, temperature, thinking budget, step-back state and
    chain-of-verification flag are read from the enclosing scope.

    Args:
        prompt_data: Mapping providing ``"text_prompt"``, ``"mime_type"`` and
            ``"pdf_data"``; the last two are sent as an ``inline_data`` part.
        step2_prompt: Chain-of-verification prompt (forwarded only when
            ``chain_of_verification`` is truthy).
        step3_prompt: Chain-of-verification prompt (same condition).
        step4_prompt: Chain-of-verification prompt (same condition).
        base_prompt_text: Prompt text forwarded to the verification step.

    Returns:
        Tuple ``(reply, error_msg)``. On success ``error_msg`` is ``None``.
        When the response holds no candidates the reply is the literal
        ``"No response generated"`` with no error message. On a handled
        failure the reply is the ``{"1":"e"}`` sentinel and ``error_msg``
        describes the problem.

    Raises:
        ValueError: On HTTP 404 (model not found) or 401/403 (authentication).
    """
    import requests

    def make_google_request(url, headers, payload, max_retries=8, timeout=120):
        """POST *payload* to *url* and return the decoded JSON body.

        HTTP 429/500/502/503/504 responses are retried with exponential
        backoff (10s base for 429, 2s base otherwise). Any other HTTP error,
        or a retryable one on the final attempt, is re-raised.
        """
        retryable_errors = [429, 500, 502, 503, 504]

        for attempt in range(max_retries):
            try:
                # An explicit timeout keeps a stalled connection from blocking
                # the whole run; the other provider calls use the same 120s.
                response = requests.post(url, headers=headers, json=payload, timeout=timeout)
                response.raise_for_status()
                return response.json()
            except requests.exceptions.HTTPError as e:
                status_code = e.response.status_code if e.response is not None else None

                if status_code in retryable_errors and attempt < max_retries - 1:
                    wait_time = 10 * (2 ** attempt) if status_code == 429 else 2 * (2 ** attempt)
                    error_type = "Rate limited" if status_code == 429 else f"Server error {status_code}"
                    print(f"⚠️ {error_type}. Attempt {attempt + 1}/{max_retries}")
                    print(f"Retrying in {wait_time}s...")
                    time.sleep(wait_time)
                else:
                    raise

    url = f"https://generativelanguage.googleapis.com/v1beta/models/{user_model}:generateContent"
    headers = {
        "x-goog-api-key": api_key,
        "Content-Type": "application/json"
    }

    # Build parts with optional stepback context
    parts = []
    if step_back_prompt and step_back_added:
        parts.append({"text": f"Context from step-back analysis:\n{stepback_insight}\n\n"})
    parts.append({"text": prompt_data["text_prompt"]})
    parts.append({
        "inline_data": {
            "mime_type": prompt_data["mime_type"],
            "data": prompt_data["pdf_data"]
        }
    })

    payload = {
        "contents": [{"parts": parts}],
        "generationConfig": {
            "responseMimeType": "application/json",
            # Optional settings are only sent when the caller configured them
            **({"temperature": creativity} if creativity is not None else {}),
            **({"thinkingConfig": {"thinkingBudget": thinking_budget}} if thinking_budget else {})
        }
    }

    try:
        result = make_google_request(url, headers, payload)

        if "candidates" in result and result["candidates"]:
            reply = result["candidates"][0]["content"]["parts"][0]["text"]
        else:
            return "No response generated", None

        if chain_of_verification:
            reply = pdf_chain_of_verification_google(
                initial_reply=reply,
                prompt=base_prompt_text,
                step2_prompt=step2_prompt,
                step3_prompt=step3_prompt,
                step4_prompt=step4_prompt,
                url=url,
                headers=headers,
                creativity=creativity,
                remove_numbering=remove_numbering,
                make_google_request=make_google_request,
                pdf_data=prompt_data["pdf_data"],
                mime_type=prompt_data["mime_type"]
            )

        return reply, None

    except requests.exceptions.HTTPError as e:
        status_code = e.response.status_code if e.response is not None else None
        if status_code == 404:
            raise ValueError(f"❌ Model '{user_model}' not found.") from e
        elif status_code in [401, 403]:
            raise ValueError("❌ Authentication failed.") from e
        else:
            print(f"HTTP error occurred: {e}")
            return """{"1":"e"}""", f"Error processing input: {e}"
    except Exception as e:
        # Includes request timeouts and unexpected response shapes
        print(f"An error occurred: {e}")
        return """{"1":"e"}""", f"Error processing input: {e}"
|
|
784
|
+
|
|
785
|
+
def _call_mistral(prompt, step2_prompt, step3_prompt, step4_prompt, pdf_content):
    """Handle Mistral API calls (PDF converted to image).

    Uses direct HTTP requests instead of Mistral SDK for lighter dependencies.
    The model name, API key, temperature, step-back state and
    chain-of-verification flag are read from the enclosing scope.

    Args:
        prompt: Content of the final user message, sent as-is.
        step2_prompt: Chain-of-verification prompt (forwarded only when
            ``chain_of_verification`` is truthy).
        step3_prompt: Chain-of-verification prompt (same condition).
        step4_prompt: Chain-of-verification prompt (same condition).
        pdf_content: Page attachment forwarded to the verification step.

    Returns:
        Tuple ``(reply, error_msg)``. On success ``error_msg`` is ``None``;
        on a handled failure the reply is the ``{"1":"e"}`` sentinel.

    Raises:
        ValueError: On HTTP 404 (model not found) or 401/403 (authentication).
    """
    import requests as req

    endpoint = "https://api.mistral.ai/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    max_retries = 8
    delay = 2

    for attempt in range(max_retries):
        try:
            # Build messages with optional stepback
            messages = []
            if step_back_prompt and step_back_added:
                messages.append({'role': 'user', 'content': stepback})
                messages.append({'role': 'assistant', 'content': stepback_insight})
            messages.append({'role': 'user', 'content': prompt})

            payload = {
                "model": user_model,
                "messages": messages,
            }
            if creativity is not None:
                payload["temperature"] = creativity

            response = req.post(endpoint, headers=headers, json=payload, timeout=120)
            response.raise_for_status()
            result = response.json()
            reply = result["choices"][0]["message"]["content"]

            if chain_of_verification:
                reply = pdf_chain_of_verification_mistral(
                    initial_reply=reply,
                    step2_prompt=step2_prompt,
                    step3_prompt=step3_prompt,
                    step4_prompt=step4_prompt,
                    client=None,  # Not used - CoVe will use requests
                    user_model=user_model,
                    creativity=creativity,
                    remove_numbering=remove_numbering,
                    pdf_content=pdf_content,
                    api_key=api_key
                )

            return reply, None

        except req.exceptions.HTTPError as e:
            # A Response is falsy whenever its status is 4xx/5xx (its truth
            # value mirrors ``Response.ok``), so it must be compared against
            # None explicitly or the status code is lost for every error.
            status_code = e.response.status_code if e.response is not None else None
            if status_code == 404:
                raise ValueError(f"❌ Model '{user_model}' not found.") from e
            elif status_code in [401, 403]:
                raise ValueError("❌ Authentication failed.") from e
            elif status_code in [500, 502, 503, 504] and attempt < max_retries - 1:
                # Exponential backoff before the next attempt
                wait_time = delay * (2 ** attempt)
                print(f"⚠️ Server error {status_code}. Attempt {attempt + 1}/{max_retries}")
                print(f"Retrying in {wait_time}s...")
                time.sleep(wait_time)
            else:
                print(f"❌ Failed after {max_retries} attempts: {e}")
                return """{"1":"e"}""", f"Error processing input: {e}"

        except Exception as e:
            print(f"❌ Unexpected error: {e}")
            return """{"1":"e"}""", f"Error processing input: {e}"

    return """{"1":"e"}""", "Max retries exceeded"
|
|
858
|
+
|
|
859
|
+
def _build_prompt_text_only(base_text):
|
|
860
|
+
"""Build text-only prompt for providers (no image attachment)."""
|
|
861
|
+
return [{"type": "text", "text": base_text}]
|
|
862
|
+
|
|
863
|
+
def _call_openai_text_only(prompt_text, step2_prompt, step3_prompt, step4_prompt):
    """Handle OpenAI-compatible API calls with text-only prompt.

    Uses direct HTTP requests instead of OpenAI SDK for lighter dependencies.
    The provider (``model_source``), model name, API key, temperature and
    step-back state are read from the enclosing scope.

    Args:
        prompt_text: Content of the final user message.
        step2_prompt: Unused in this function.
        step3_prompt: Unused in this function.
        step4_prompt: Unused in this function.

    Returns:
        Tuple ``(reply, error_msg)``. On success ``error_msg`` is ``None``;
        on a handled failure the reply is the ``{"1":"e"}`` sentinel.

    Raises:
        ValueError: On HTTP 404 (model not found).
    """
    import requests as req

    # Determine the base URL based on model source
    if model_source == "huggingface":
        from cat_stack.text_functions import _detect_huggingface_endpoint
        base_url = _detect_huggingface_endpoint(api_key, user_model)
    elif model_source == "huggingface-together":
        base_url = "https://router.huggingface.co/together/v1"
    elif model_source == "perplexity":
        base_url = "https://api.perplexity.ai"
    elif model_source == "xai":
        base_url = "https://api.x.ai/v1"
    else:
        base_url = "https://api.openai.com/v1"

    endpoint = f"{base_url}/chat/completions"

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    max_retries = 8
    delay = 2

    for attempt in range(max_retries):
        try:
            messages = []
            if step_back_prompt and step_back_added:
                messages.append({'role': 'user', 'content': stepback})
                messages.append({'role': 'assistant', 'content': stepback_insight})
            messages.append({'role': 'user', 'content': prompt_text})

            payload = {
                "model": user_model,
                "messages": messages,
            }
            if creativity is not None:
                payload["temperature"] = creativity

            response = req.post(endpoint, headers=headers, json=payload, timeout=120)
            response.raise_for_status()
            result = response.json()
            reply = result["choices"][0]["message"]["content"]
            return reply, None

        except req.exceptions.HTTPError as e:
            # A Response is falsy whenever its status is 4xx/5xx (its truth
            # value mirrors ``Response.ok``), so it must be compared against
            # None explicitly or none of the status branches below can match.
            status_code = e.response.status_code if e.response is not None else None
            if status_code == 400 and attempt < max_retries - 1:
                wait_time = delay * (2 ** attempt)
                print(f"⚠️ Bad request. Attempt {attempt + 1}/{max_retries}")
                print(f"Retrying in {wait_time}s...")
                time.sleep(wait_time)
            elif status_code == 404:
                raise ValueError(f"❌ Model '{user_model}' on {model_source} not found.") from e
            elif status_code in [500, 502, 503, 504] and attempt < max_retries - 1:
                wait_time = delay * (2 ** attempt)
                print(f"Attempt {attempt + 1} failed with error: {e}")
                print(f"Retrying in {wait_time}s...")
                time.sleep(wait_time)
            else:
                print(f"❌ Failed after {max_retries} attempts: {e}")
                return """{"1":"e"}""", f"Error processing input: {e}"

        except Exception as e:
            # Non-HTTP failures are retried only when the message mentions 500/504
            if ("500" in str(e) or "504" in str(e)) and attempt < max_retries - 1:
                wait_time = delay * (2 ** attempt)
                print(f"Attempt {attempt + 1} failed with error: {e}")
                print(f"Retrying in {wait_time}s...")
                time.sleep(wait_time)
            else:
                print(f"❌ Failed after {max_retries} attempts: {e}")
                return """{"1":"e"}""", f"Error processing input: {e}"

    return """{"1":"e"}""", "Max retries exceeded"
|
|
943
|
+
|
|
944
|
+
def _call_anthropic_text_only(prompt_text, step2_prompt, step3_prompt, step4_prompt):
    """Send a text-only prompt to the Anthropic Messages endpoint over plain HTTP.

    The model name, API key, temperature and step-back state are read from
    the enclosing scope. The three step prompts are unused in this function.

    Returns:
        Tuple ``(reply, error_msg)``. ``error_msg`` is ``None`` on success;
        otherwise the reply is the ``{"1":"e"}`` sentinel.

    Raises:
        ValueError: On HTTP 404 (model not found).
    """
    import requests as req

    api_url = "https://api.anthropic.com/v1/messages"
    request_headers = {
        "Content-Type": "application/json",
        "x-api-key": api_key,
        "anthropic-version": "2023-06-01"
    }

    try:
        # Optional step-back exchange goes first, then the actual question
        conversation = []
        if step_back_prompt and step_back_added:
            conversation.extend([
                {'role': 'user', 'content': stepback},
                {'role': 'assistant', 'content': stepback_insight},
            ])
        conversation.append({'role': 'user', 'content': prompt_text})

        body = {"model": user_model, "max_tokens": 1024, "messages": conversation}
        if creativity is not None:
            body["temperature"] = creativity

        http_response = req.post(api_url, headers=request_headers, json=body, timeout=120)
        http_response.raise_for_status()

        # Only the first content block is inspected, and it must be text
        blocks = http_response.json().get("content", [])
        if not blocks or blocks[0].get("type") != "text":
            return """{"1":"e"}""", "No text content in response"
        return blocks[0].get("text", ""), None

    except req.exceptions.HTTPError as http_err:
        model_missing = http_err.response is not None and http_err.response.status_code == 404
        if model_missing:
            raise ValueError(f"❌ Model '{user_model}' on {model_source} not found.") from http_err
        print(f"An error occurred: {http_err}")
        return """{"1":"e"}""", f"Error processing input: {http_err}"
    except Exception as err:
        print(f"An error occurred: {err}")
        return """{"1":"e"}""", f"Error processing input: {err}"
|
|
988
|
+
|
|
989
|
+
def _call_google_text_only(prompt_text, step2_prompt, step3_prompt, step4_prompt):
    """Handle Google API calls with text-only prompt.

    The model name, API key, temperature, thinking budget and step-back state
    are read from the enclosing scope.

    Args:
        prompt_text: Prompt sent as the final text part.
        step2_prompt: Unused in this function.
        step3_prompt: Unused in this function.
        step4_prompt: Unused in this function.

    Returns:
        Tuple ``(reply, error_msg)``. On success ``error_msg`` is ``None``.
        When the response holds no candidates the reply is the literal
        ``"No response generated"`` with no error message. On a handled
        failure the reply is the ``{"1":"e"}`` sentinel.

    Raises:
        ValueError: On HTTP 404 (model not found) or 401/403 (authentication).
    """
    import requests

    def make_google_request(url, headers, payload, max_retries=8, timeout=120):
        """POST *payload* to *url* and return the decoded JSON body.

        HTTP 429/500/502/503/504 responses are retried with exponential
        backoff (10s base for 429, 2s base otherwise). Any other HTTP error,
        or a retryable one on the final attempt, is re-raised.
        """
        retryable_errors = [429, 500, 502, 503, 504]

        for attempt in range(max_retries):
            try:
                # An explicit timeout keeps a stalled connection from blocking
                # the whole run; the other provider calls use the same 120s.
                response = requests.post(url, headers=headers, json=payload, timeout=timeout)
                response.raise_for_status()
                return response.json()
            except requests.exceptions.HTTPError as e:
                status_code = e.response.status_code if e.response is not None else None

                if status_code in retryable_errors and attempt < max_retries - 1:
                    wait_time = 10 * (2 ** attempt) if status_code == 429 else 2 * (2 ** attempt)
                    error_type = "Rate limited" if status_code == 429 else f"Server error {status_code}"
                    print(f"⚠️ {error_type}. Attempt {attempt + 1}/{max_retries}")
                    print(f"Retrying in {wait_time}s...")
                    time.sleep(wait_time)
                else:
                    raise

    url = f"https://generativelanguage.googleapis.com/v1beta/models/{user_model}:generateContent"
    headers = {
        "x-goog-api-key": api_key,
        "Content-Type": "application/json"
    }

    parts = []
    if step_back_prompt and step_back_added:
        parts.append({"text": f"Context from step-back analysis:\n{stepback_insight}\n\n"})
    parts.append({"text": prompt_text})

    payload = {
        "contents": [{"parts": parts}],
        "generationConfig": {
            "responseMimeType": "application/json",
            # Optional settings are only sent when the caller configured them
            **({"temperature": creativity} if creativity is not None else {}),
            **({"thinkingConfig": {"thinkingBudget": thinking_budget}} if thinking_budget else {})
        }
    }

    try:
        result = make_google_request(url, headers, payload)

        if "candidates" in result and result["candidates"]:
            reply = result["candidates"][0]["content"]["parts"][0]["text"]
        else:
            return "No response generated", None

        return reply, None

    except requests.exceptions.HTTPError as e:
        status_code = e.response.status_code if e.response is not None else None
        if status_code == 404:
            raise ValueError(f"❌ Model '{user_model}' not found.") from e
        elif status_code in [401, 403]:
            raise ValueError("❌ Authentication failed.") from e
        else:
            print(f"HTTP error occurred: {e}")
            return """{"1":"e"}""", f"Error processing input: {e}"
    except Exception as e:
        # Includes request timeouts and unexpected response shapes
        print(f"An error occurred: {e}")
        return """{"1":"e"}""", f"Error processing input: {e}"
|
|
1053
|
+
|
|
1054
|
+
def _call_mistral_text_only(prompt_text, step2_prompt, step3_prompt, step4_prompt):
    """Handle Mistral API calls with text-only prompt.

    Uses direct HTTP requests instead of Mistral SDK for lighter dependencies.
    The model name, API key, temperature and step-back state are read from
    the enclosing scope.

    Args:
        prompt_text: Content of the final user message.
        step2_prompt: Unused in this function.
        step3_prompt: Unused in this function.
        step4_prompt: Unused in this function.

    Returns:
        Tuple ``(reply, error_msg)``. On success ``error_msg`` is ``None``;
        on a handled failure the reply is the ``{"1":"e"}`` sentinel.

    Raises:
        ValueError: On HTTP 404 (model not found) or 401/403 (authentication).
    """
    import requests as req

    endpoint = "https://api.mistral.ai/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    max_retries = 8
    delay = 2

    for attempt in range(max_retries):
        try:
            messages = []
            if step_back_prompt and step_back_added:
                messages.append({'role': 'user', 'content': stepback})
                messages.append({'role': 'assistant', 'content': stepback_insight})
            messages.append({'role': 'user', 'content': prompt_text})

            payload = {
                "model": user_model,
                "messages": messages,
            }
            if creativity is not None:
                payload["temperature"] = creativity

            response = req.post(endpoint, headers=headers, json=payload, timeout=120)
            response.raise_for_status()
            result = response.json()
            reply = result["choices"][0]["message"]["content"]
            return reply, None

        except req.exceptions.HTTPError as e:
            # A Response is falsy whenever its status is 4xx/5xx (its truth
            # value mirrors ``Response.ok``), so it must be compared against
            # None explicitly or the status code is lost for every error.
            status_code = e.response.status_code if e.response is not None else None
            if status_code == 404:
                raise ValueError(f"❌ Model '{user_model}' not found.") from e
            elif status_code in [401, 403]:
                raise ValueError("❌ Authentication failed.") from e
            elif status_code in [500, 502, 503, 504] and attempt < max_retries - 1:
                # Exponential backoff before the next attempt
                wait_time = delay * (2 ** attempt)
                print(f"⚠️ Server error {status_code}. Attempt {attempt + 1}/{max_retries}")
                print(f"Retrying in {wait_time}s...")
                time.sleep(wait_time)
            else:
                print(f"❌ Failed after {max_retries} attempts: {e}")
                return """{"1":"e"}""", f"Error processing input: {e}"

        except Exception as e:
            print(f"❌ Unexpected error: {e}")
            return """{"1":"e"}""", f"Error processing input: {e}"

    return """{"1":"e"}""", "Max retries exceeded"
|
|
1111
|
+
|
|
1112
|
+
def _process_single_page(pdf_path, page_index, page_label):
    """Process a single PDF page and return (reply, error_msg).

    The processing ``mode``, ``model_source``, ``user_model`` and the
    ``chain_of_verification`` flag are read from the enclosing scope.

    Args:
        pdf_path: Path of the PDF containing the page.
        page_index: Index of the page, passed to the extraction helpers.
        page_label: Label of the page (not used inside this function).

    Returns:
        The ``(reply, error_msg)`` tuple of the provider call, or
        ``(None, message)`` when the page text, PDF bytes or image could not
        be extracted.

    Raises:
        ValueError: When ``model_source`` is not one of the handled providers.
    """
    # Extract text if needed for text or both modes
    page_text = None
    if mode in ("text", "both"):
        page_text, text_valid, text_error = _extract_page_text(pdf_path, page_index)
        if mode == "text" and not text_valid:
            # Text mode requires text - fail if extraction failed
            return None, f"Failed to extract text: {text_error}"
        # For "both" mode, we continue even if text extraction fails

    # Build prompt with text if available
    base_prompt_text = _build_base_prompt_text(page_text)

    if chain_of_verification:
        step2_prompt, step3_prompt, step4_prompt = _build_cove_prompts(base_prompt_text)
    else:
        step2_prompt = step3_prompt = step4_prompt = None

    # TEXT-ONLY MODE: No image/PDF attachment needed
    if mode == "text":
        if model_source == "anthropic":
            return _call_anthropic_text_only(base_prompt_text, step2_prompt, step3_prompt, step4_prompt)
        elif model_source == "google":
            return _call_google_text_only(base_prompt_text, step2_prompt, step3_prompt, step4_prompt)
        # "huggingface-together" is included because _call_openai_text_only
        # resolves a dedicated base URL for it; without it that branch could
        # never be reached and the source was rejected as unknown.
        elif model_source in ["openai", "perplexity", "huggingface", "huggingface-together", "xai"]:
            return _call_openai_text_only(base_prompt_text, step2_prompt, step3_prompt, step4_prompt)
        elif model_source == "mistral":
            return _call_mistral_text_only(base_prompt_text, step2_prompt, step3_prompt, step4_prompt)
        else:
            raise ValueError("Unknown source! Choose from OpenAI, Anthropic, Perplexity, Google, xAI, Huggingface, or Mistral")

    # IMAGE or BOTH MODE: Include image/PDF attachment
    # Handle providers with native PDF support
    if model_source == "anthropic":
        # Check if model supports native PDF (Haiku doesn't)
        if _anthropic_supports_pdf(user_model):
            pdf_bytes, is_valid = _extract_page_as_pdf_bytes(pdf_path, page_index)
            if not is_valid:
                return None, "Failed to extract PDF page"

            encoded_pdf = _encode_bytes_to_base64(pdf_bytes)
            prompt = _build_prompt_anthropic_pdf(encoded_pdf, base_prompt_text)
            pdf_content = {
                "type": "document",
                "source": {
                    "type": "base64",
                    "media_type": "application/pdf",
                    "data": encoded_pdf
                }
            }
        else:
            # Haiku and other non-PDF models: convert to image
            image_bytes, is_valid = _extract_page_as_image_bytes(pdf_path, page_index)
            if not is_valid:
                return None, "Failed to render PDF page to image"

            encoded_image = _encode_bytes_to_base64(image_bytes)
            prompt = _build_prompt_anthropic_image(encoded_image, base_prompt_text)
            pdf_content = {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/png",
                    "data": encoded_image
                }
            }
        return _call_anthropic(prompt, step2_prompt, step3_prompt, step4_prompt, pdf_content)

    elif model_source == "google":
        pdf_bytes, is_valid = _extract_page_as_pdf_bytes(pdf_path, page_index)
        if not is_valid:
            return None, "Failed to extract PDF page"

        encoded_pdf = _encode_bytes_to_base64(pdf_bytes)
        prompt_data = _build_prompt_google_pdf(encoded_pdf, base_prompt_text)
        return _call_google(prompt_data, step2_prompt, step3_prompt, step4_prompt, base_prompt_text)

    # Handle providers requiring image conversion
    else:
        image_bytes, is_valid = _extract_page_as_image_bytes(pdf_path, page_index)
        if not is_valid:
            return None, "Failed to render PDF page to image"

        encoded_image = _encode_bytes_to_base64(image_bytes)
        prompt = _build_prompt_openai_mistral(encoded_image, base_prompt_text)

        # PDF content for CoVe (as image)
        encoded_image_url = f"data:image/png;base64,{encoded_image}"
        pdf_content = {"type": "image_url", "image_url": {"url": encoded_image_url, "detail": "high"}}

        # NOTE(review): "huggingface-together" is not accepted here; confirm
        # whether _call_openai_compatible supports it before adding it.
        if model_source in ["openai", "perplexity", "huggingface", "xai"]:
            return _call_openai_compatible(prompt, step2_prompt, step3_prompt, step4_prompt, pdf_content)
        elif model_source == "mistral":
            return _call_mistral(prompt, step2_prompt, step3_prompt, step4_prompt, pdf_content)
        else:
            raise ValueError("Unknown source! Choose from OpenAI, Anthropic, Perplexity, Google, xAI, Huggingface, or Mistral")
|
|
1210
|
+
|
|
1211
|
+
def _extract_json(reply):
|
|
1212
|
+
"""Extract JSON from model reply."""
|
|
1213
|
+
if reply is None:
|
|
1214
|
+
return """{"1":"e"}"""
|
|
1215
|
+
|
|
1216
|
+
extracted_json = regex.findall(r'\{(?:[^{}]|(?R))*\}', reply, regex.DOTALL)
|
|
1217
|
+
if extracted_json:
|
|
1218
|
+
return extracted_json[0].replace('[', '').replace(']', '').replace('\n', '').replace(" ", '').replace(" ", '')
|
|
1219
|
+
else:
|
|
1220
|
+
print("""{"1":"e"}""")
|
|
1221
|
+
return """{"1":"e"}"""
|
|
1222
|
+
|
|
1223
|
+
# Main processing loop
|
|
1224
|
+
total_pages = len(all_pages)
|
|
1225
|
+
for idx, (pdf_path, page_index, page_label) in enumerate(tqdm(all_pages, desc="Categorizing PDF pages")):
|
|
1226
|
+
# Call progress callback if provided
|
|
1227
|
+
if progress_callback:
|
|
1228
|
+
progress_callback(idx, total_pages, page_label)
|
|
1229
|
+
|
|
1230
|
+
page_labels.append(page_label)
|
|
1231
|
+
|
|
1232
|
+
reply, error_msg = _process_single_page(pdf_path, page_index, page_label)
|
|
1233
|
+
|
|
1234
|
+
if error_msg:
|
|
1235
|
+
link1.append(error_msg)
|
|
1236
|
+
extracted_jsons.append("""{"1":"e"}""")
|
|
1237
|
+
else:
|
|
1238
|
+
link1.append(reply)
|
|
1239
|
+
extracted_jsons.append(_extract_json(reply))
|
|
1240
|
+
|
|
1241
|
+
# --- Safety Save ---
|
|
1242
|
+
if safety:
|
|
1243
|
+
if filename is None:
|
|
1244
|
+
raise TypeError("filename is required when using safety. Please provide the filename.")
|
|
1245
|
+
|
|
1246
|
+
normalized_data_list = []
|
|
1247
|
+
for json_str in extracted_jsons:
|
|
1248
|
+
try:
|
|
1249
|
+
parsed_obj = json.loads(json_str)
|
|
1250
|
+
normalized_data_list.append(pd.json_normalize(parsed_obj))
|
|
1251
|
+
except json.JSONDecodeError:
|
|
1252
|
+
normalized_data_list.append(pd.DataFrame({"1": ["e"]}))
|
|
1253
|
+
normalized_data = pd.concat(normalized_data_list, ignore_index=True)
|
|
1254
|
+
|
|
1255
|
+
temp_df = pd.DataFrame({
|
|
1256
|
+
'pdf_input': page_labels,
|
|
1257
|
+
'model_response': link1,
|
|
1258
|
+
'json': extracted_jsons
|
|
1259
|
+
})
|
|
1260
|
+
temp_df = pd.concat([temp_df, normalized_data], axis=1)
|
|
1261
|
+
|
|
1262
|
+
save_path = os.path.join(save_directory, filename) if save_directory else filename
|
|
1263
|
+
temp_df.to_csv(save_path, index=False)
|
|
1264
|
+
|
|
1265
|
+
# --- Final DataFrame ---
|
|
1266
|
+
normalized_data_list = []
|
|
1267
|
+
for json_str in extracted_jsons:
|
|
1268
|
+
try:
|
|
1269
|
+
parsed_obj = json.loads(json_str)
|
|
1270
|
+
normalized_data_list.append(pd.json_normalize(parsed_obj))
|
|
1271
|
+
except json.JSONDecodeError:
|
|
1272
|
+
normalized_data_list.append(pd.DataFrame({"1": ["e"]}))
|
|
1273
|
+
normalized_data = pd.concat(normalized_data_list, ignore_index=True)
|
|
1274
|
+
|
|
1275
|
+
categorized_data = pd.DataFrame({
|
|
1276
|
+
'pdf_input': pd.Series(page_labels),
|
|
1277
|
+
'model_response': pd.Series(link1).reset_index(drop=True),
|
|
1278
|
+
'json': pd.Series(extracted_jsons).reset_index(drop=True)
|
|
1279
|
+
})
|
|
1280
|
+
categorized_data = pd.concat([categorized_data, normalized_data], axis=1)
|
|
1281
|
+
categorized_data = categorized_data.rename(columns=lambda x: f'category_{x}' if str(x).isdigit() else x)
|
|
1282
|
+
|
|
1283
|
+
# Identify rows with invalid strings (like "e")
|
|
1284
|
+
cat_cols = [col for col in categorized_data.columns if col.startswith('category_')]
|
|
1285
|
+
has_invalid_strings = categorized_data[cat_cols].apply(
|
|
1286
|
+
lambda col: pd.to_numeric(col, errors='coerce').isna() & col.notna()
|
|
1287
|
+
).any(axis=1)
|
|
1288
|
+
|
|
1289
|
+
categorized_data['processing_status'] = (~has_invalid_strings).map({True: 'success', False: 'error'})
|
|
1290
|
+
categorized_data.loc[has_invalid_strings, cat_cols] = pd.NA
|
|
1291
|
+
|
|
1292
|
+
for col in cat_cols:
|
|
1293
|
+
categorized_data[col] = pd.to_numeric(categorized_data[col], errors='coerce')
|
|
1294
|
+
|
|
1295
|
+
categorized_data.loc[~has_invalid_strings, cat_cols] = (
|
|
1296
|
+
categorized_data.loc[~has_invalid_strings, cat_cols].fillna(0)
|
|
1297
|
+
)
|
|
1298
|
+
categorized_data[cat_cols] = categorized_data[cat_cols].astype('Int64')
|
|
1299
|
+
|
|
1300
|
+
# Create categories_id (comma-separated binary values for each category)
|
|
1301
|
+
categorized_data['categories_id'] = categorized_data[cat_cols].apply(
|
|
1302
|
+
lambda x: ','.join(x.dropna().astype(int).astype(str)), axis=1
|
|
1303
|
+
)
|
|
1304
|
+
|
|
1305
|
+
if filename:
|
|
1306
|
+
save_path = os.path.join(save_directory, filename) if save_directory else filename
|
|
1307
|
+
categorized_data.to_csv(save_path, index=False)
|
|
1308
|
+
|
|
1309
|
+
return categorized_data
|
|
1310
|
+
|
|
1311
|
+
|
|
1312
|
+
def explore_pdf_categories(
|
|
1313
|
+
pdf_input,
|
|
1314
|
+
api_key,
|
|
1315
|
+
pdf_description="",
|
|
1316
|
+
max_categories=12,
|
|
1317
|
+
categories_per_chunk=10,
|
|
1318
|
+
divisions=5,
|
|
1319
|
+
user_model="gpt-4o",
|
|
1320
|
+
creativity=None,
|
|
1321
|
+
specificity="broad",
|
|
1322
|
+
research_question=None,
|
|
1323
|
+
mode="text",
|
|
1324
|
+
filename=None,
|
|
1325
|
+
model_source="auto",
|
|
1326
|
+
iterations=3,
|
|
1327
|
+
random_state=None,
|
|
1328
|
+
progress_callback=None,
|
|
1329
|
+
):
|
|
1330
|
+
"""
|
|
1331
|
+
Explore and extract common categories from PDF pages.
|
|
1332
|
+
|
|
1333
|
+
Modes:
|
|
1334
|
+
- "text" (default): Extracts text from pages, concatenates pages within
|
|
1335
|
+
each chunk, and sends combined text to identify categories. Similar to
|
|
1336
|
+
how explore_common_categories works with text responses. Best for
|
|
1337
|
+
text-heavy documents.
|
|
1338
|
+
|
|
1339
|
+
- "image": Samples random pages from the full pool of all pages across
|
|
1340
|
+
all PDFs and sends them as images to a vision model. Best for visual
|
|
1341
|
+
documents where layout matters.
|
|
1342
|
+
|
|
1343
|
+
- "both": Samples random pages, uses vision model to describe each page's
|
|
1344
|
+
content (text + visual elements), then extracts categories from those
|
|
1345
|
+
descriptions. Best for documents with mixed text and visual content
|
|
1346
|
+
(charts, diagrams, scanned documents).
|
|
1347
|
+
|
|
1348
|
+
Args:
|
|
1349
|
+
pdf_input: Path to PDF file, directory of PDFs, or list of PDF paths
|
|
1350
|
+
api_key: API key for the model provider
|
|
1351
|
+
pdf_description: Description of what the PDFs contain
|
|
1352
|
+
max_categories: Maximum number of final categories to return
|
|
1353
|
+
categories_per_chunk: Categories to extract per chunk of pages
|
|
1354
|
+
divisions: Number of chunks to divide pages into
|
|
1355
|
+
user_model: Model to use (vision model required for image/both modes)
|
|
1356
|
+
creativity: Temperature setting (None for default)
|
|
1357
|
+
specificity: "broad" or "specific" category granularity
|
|
1358
|
+
research_question: Optional research context
|
|
1359
|
+
mode: "text", "image", or "both"
|
|
1360
|
+
filename: Optional CSV filename to save results
|
|
1361
|
+
model_source: "auto", "openai", "anthropic", "google", "mistral"
|
|
1362
|
+
iterations: Number of passes over the data
|
|
1363
|
+
random_state: Random seed for reproducibility
|
|
1364
|
+
progress_callback: Optional callback function for progress updates.
|
|
1365
|
+
Called as progress_callback(current_step, total_steps, step_label).
|
|
1366
|
+
|
|
1367
|
+
Returns:
|
|
1368
|
+
dict with keys:
|
|
1369
|
+
- counts_df: DataFrame of categories with counts
|
|
1370
|
+
- top_categories: List of top category names
|
|
1371
|
+
- raw_top_text: Raw model output from final merge step
|
|
1372
|
+
"""
|
|
1373
|
+
import os
|
|
1374
|
+
import re
|
|
1375
|
+
import pandas as pd
|
|
1376
|
+
import numpy as np
|
|
1377
|
+
from tqdm import tqdm
|
|
1378
|
+
|
|
1379
|
+
model_source = _detect_model_source(user_model, model_source)
|
|
1380
|
+
|
|
1381
|
+
# Load all PDF pages
|
|
1382
|
+
pdf_files = _load_pdf_files(pdf_input)
|
|
1383
|
+
if not pdf_files:
|
|
1384
|
+
raise ValueError("No PDF files found in the specified input.")
|
|
1385
|
+
|
|
1386
|
+
all_pages = []
|
|
1387
|
+
for pdf_path in pdf_files:
|
|
1388
|
+
pages = _get_pdf_pages(pdf_path)
|
|
1389
|
+
all_pages.extend(pages)
|
|
1390
|
+
|
|
1391
|
+
n = len(all_pages)
|
|
1392
|
+
if n == 0:
|
|
1393
|
+
raise ValueError("No pages found in the PDF files.")
|
|
1394
|
+
|
|
1395
|
+
# Auto-adjust divisions for small datasets
|
|
1396
|
+
# PDF pages can have multiple categories each, so we can use fewer divisions
|
|
1397
|
+
original_divisions = divisions
|
|
1398
|
+
divisions = min(divisions, max(1, n // 2)) # At least 2 pages per chunk
|
|
1399
|
+
if divisions != original_divisions:
|
|
1400
|
+
print(f"Auto-adjusted divisions from {original_divisions} to {divisions} for {n} pages.")
|
|
1401
|
+
|
|
1402
|
+
# Chunk sizing - PDF pages often contain multiple categories each
|
|
1403
|
+
chunk_size = int(round(max(1, n / divisions), 0))
|
|
1404
|
+
# Don't reduce categories_per_chunk as aggressively for PDFs since each page can yield many categories
|
|
1405
|
+
if chunk_size < 2:
|
|
1406
|
+
# Only reduce if we have very few pages
|
|
1407
|
+
old_categories_per_chunk = categories_per_chunk
|
|
1408
|
+
categories_per_chunk = max(5, chunk_size * 4)
|
|
1409
|
+
print(f"Auto-adjusted categories_per_chunk from {old_categories_per_chunk} to {categories_per_chunk} for chunk size {chunk_size}.")
|
|
1410
|
+
|
|
1411
|
+
print(
|
|
1412
|
+
f"Exploring categories in PDFs: '{pdf_description}'.\n"
|
|
1413
|
+
f" {n} total pages, {categories_per_chunk * divisions} categories to extract, "
|
|
1414
|
+
f"{max_categories} final categories. Mode: {mode}\n"
|
|
1415
|
+
)
|
|
1416
|
+
|
|
1417
|
+
# RNG for reproducible sampling
|
|
1418
|
+
rng = np.random.default_rng(random_state)
|
|
1419
|
+
|
|
1420
|
+
# Initialize client/config based on model source
|
|
1421
|
+
# For OpenAI-compatible APIs (including Mistral), we use requests directly instead of SDK
|
|
1422
|
+
import requests as http_client
|
|
1423
|
+
|
|
1424
|
+
if model_source in ["openai", "huggingface", "huggingface-together", "xai", "perplexity"]:
|
|
1425
|
+
# Determine base URL for OpenAI-compatible APIs
|
|
1426
|
+
if model_source == "huggingface":
|
|
1427
|
+
from cat_stack.text_functions import _detect_huggingface_endpoint
|
|
1428
|
+
openai_base_url = _detect_huggingface_endpoint(api_key, user_model)
|
|
1429
|
+
elif model_source == "huggingface-together":
|
|
1430
|
+
openai_base_url = "https://router.huggingface.co/together/v1"
|
|
1431
|
+
elif model_source == "xai":
|
|
1432
|
+
openai_base_url = "https://api.x.ai/v1"
|
|
1433
|
+
elif model_source == "perplexity":
|
|
1434
|
+
openai_base_url = "https://api.perplexity.ai"
|
|
1435
|
+
else:
|
|
1436
|
+
openai_base_url = "https://api.openai.com/v1"
|
|
1437
|
+
client = None # We'll use requests directly
|
|
1438
|
+
elif model_source == "anthropic":
|
|
1439
|
+
# Using direct HTTP requests instead of Anthropic SDK
|
|
1440
|
+
client = None
|
|
1441
|
+
openai_base_url = None
|
|
1442
|
+
elif model_source == "google":
|
|
1443
|
+
client = None
|
|
1444
|
+
openai_base_url = None
|
|
1445
|
+
elif model_source == "mistral":
|
|
1446
|
+
# Mistral API is OpenAI-compatible, use requests directly
|
|
1447
|
+
openai_base_url = "https://api.mistral.ai/v1"
|
|
1448
|
+
client = None
|
|
1449
|
+
else:
|
|
1450
|
+
raise ValueError(f"Unsupported model_source: {model_source}")
|
|
1451
|
+
|
|
1452
|
+
def make_text_prompt(text_blob: str) -> str:
    """Compose the category-extraction prompt used when page content is plain text.

    Args:
        text_blob: Text to analyse; it is embedded between triple backticks.

    Returns:
        A single prompt string asking for ``categories_per_chunk`` numbered,
        label-only categories. A "Research context" sentence is included only
        when ``research_question`` is truthy.
    """
    # Optional sentence; collapses to nothing when no research question is set.
    if research_question:
        context_sentence = "Research context: " + research_question + ". "
    else:
        context_sentence = ""

    segments = [
        f"Identify {categories_per_chunk} {specificity} categories of content found in this document text. ",
        f"The document is: {pdf_description}. ",
        context_sentence,
        f"The text is contained within triple backticks: ```{text_blob}``` ",
        f"Number your categories from 1 through {categories_per_chunk} and provide concise labels only (no descriptions).",
    ]
    return "".join(segments)
|
|
1461
|
+
|
|
1462
|
+
def make_image_prompt() -> str:
    """Compose the category-extraction prompt that accompanies a single PDF page.

    Returns:
        A prompt string asking for ``categories_per_chunk`` numbered,
        label-only categories. The "Research context" text is included only
        when ``research_question`` is truthy; the blank line before the final
        instruction is always present.
    """
    if research_question:
        context_text = "Research context: " + research_question
    else:
        context_text = ""

    opening = f"Identify {categories_per_chunk} {specificity} categories of content found in this PDF page. "
    document_line = f"The document is: {pdf_description}. "
    closing = f"Number your categories from 1 through {categories_per_chunk} and provide concise labels only (no descriptions)."
    return opening + document_line + context_text + "\n\n" + closing
|
|
1470
|
+
|
|
1471
|
+
def make_describe_prompt() -> str:
    """Compose the prompt asking a vision model to describe one PDF page (used by 'both' mode).

    Returns:
        A prompt string requesting a detailed description of the page. The
        "Research context" text is included only when ``research_question``
        is truthy; the blank line before the final instruction is always
        present.
    """
    context_text = ("Research context: " + research_question) if research_question else ""

    pieces = (
        "Describe the content of this PDF page in detail. ",
        "Include all text, images, charts, diagrams, tables, and layout elements. ",
        f"The document is: {pdf_description}. ",
        context_text,
        "\n\n",
        "Provide a comprehensive text description that captures both visual and textual content.",
    )
    return "".join(pieces)
|
|
1480
|
+
|
|
1481
|
+
def describe_page_with_vision(pdf_path, page_index):
    """Ask the configured model for a text description of one PDF page.

    The page is sent as a single-page PDF to Google and to Anthropic models
    for which ``_anthropic_supports_pdf`` returns true. Every other provider
    (including the remaining Anthropic models) receives the page as an image
    declared as ``image/png``.

    Args:
        pdf_path: Path of the PDF that contains the page.
        page_index: Index of the page inside that PDF.

    Returns:
        The description text returned by the model, or ``None`` when the page
        could not be extracted, the reply carried no usable text, the
        provider is not handled here, or any exception was raised (the error
        is printed rather than propagated).
    """
    prompt_text = make_describe_prompt()

    def _ask_anthropic(attachment):
        """POST the prompt plus one attachment block to the Anthropic Messages API."""
        payload = {
            "model": user_model,
            "max_tokens": 4096,
            "messages": [{
                "role": "user",
                "content": [{"type": "text", "text": prompt_text}, attachment],
            }],
        }
        if creativity is not None:
            payload["temperature"] = creativity
        resp = http_client.post(
            "https://api.anthropic.com/v1/messages",
            headers={
                "Content-Type": "application/json",
                "x-api-key": api_key,
                "anthropic-version": "2023-06-01",
            },
            json=payload,
            timeout=120,
        )
        resp.raise_for_status()
        blocks = resp.json().get("content", [])
        # Only the first content block is inspected, and only if it is text.
        if blocks and blocks[0].get("type") == "text":
            return blocks[0].get("text", "")
        return None

    def _ask_openai_compatible(encoded_image):
        """POST the prompt plus a data-URL image to an OpenAI-style chat endpoint."""
        payload = {
            "model": user_model,
            "messages": [{
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt_text},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded_image}"}},
                ],
            }],
        }
        if creativity is not None:
            payload["temperature"] = creativity
        resp = http_client.post(
            f"{openai_base_url}/chat/completions",
            headers={
                "Content-Type": "application/json",
                "Authorization": f"Bearer {api_key}",
            },
            json=payload,
            timeout=120,
        )
        resp.raise_for_status()
        return resp.json()["choices"][0]["message"]["content"]

    try:
        if model_source == "anthropic":
            if _anthropic_supports_pdf(user_model):
                # Native PDF input: send the single page as a document block.
                pdf_bytes, is_valid = _extract_page_as_pdf_bytes(pdf_path, page_index)
                if not is_valid:
                    return None
                return _ask_anthropic({
                    "type": "document",
                    "source": {
                        "type": "base64",
                        "media_type": "application/pdf",
                        "data": _encode_bytes_to_base64(pdf_bytes),
                    },
                })
            # Model without PDF support: fall back to an image of the page.
            image_bytes, is_valid = _extract_page_as_image_bytes(pdf_path, page_index)
            if not is_valid:
                return None
            return _ask_anthropic({
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/png",
                    "data": _encode_bytes_to_base64(image_bytes),
                },
            })

        if model_source == "google":
            # Google accepts the PDF page inline.
            pdf_bytes, is_valid = _extract_page_as_pdf_bytes(pdf_path, page_index)
            if not is_valid:
                return None
            payload = {
                "contents": [{"parts": [
                    {"text": prompt_text},
                    {"inline_data": {"mime_type": "application/pdf", "data": _encode_bytes_to_base64(pdf_bytes)}},
                ]}],
                "generationConfig": {"temperature": creativity} if creativity is not None else {},
            }
            response = http_client.post(
                f"https://generativelanguage.googleapis.com/v1beta/models/{user_model}:generateContent",
                headers={"x-goog-api-key": api_key, "Content-Type": "application/json"},
                json=payload,
                timeout=120,
            )
            response.raise_for_status()
            result = response.json()
            if "candidates" in result and result["candidates"]:
                return result["candidates"][0]["content"]["parts"][0]["text"]
            return None

        # Remaining providers: the page is extracted as an image first, then
        # sent through the shared OpenAI-style request (Mistral uses the same
        # request shape against its own base URL).
        image_bytes, is_valid = _extract_page_as_image_bytes(pdf_path, page_index)
        if not is_valid:
            return None
        encoded_image = _encode_bytes_to_base64(image_bytes)
        if model_source in ("openai", "huggingface", "huggingface-together", "xai", "perplexity", "mistral"):
            return _ask_openai_compatible(encoded_image)
        return None

    except Exception as e:
        # Best-effort: a failed page is reported and skipped by the caller.
        print(f"Error describing page {page_index}: {e}")
        return None
|
|
1633
|
+
|
|
1634
|
+
def call_model_with_text(prompt_text):
    """Send a text-only prompt to the configured provider and return its reply.

    Args:
        prompt_text: The complete prompt to send as a single user message.

    Returns:
        The reply text, or ``None`` when the reply carried no usable text,
        the provider is not handled here, or any exception was raised (the
        error is printed rather than propagated).
    """
    try:
        # OpenAI-style chat endpoints; Mistral uses the same request shape
        # against its own base URL, so it shares this branch.
        if model_source in ("openai", "huggingface", "huggingface-together", "xai", "perplexity", "mistral"):
            payload = {
                "model": user_model,
                "messages": [{"role": "user", "content": prompt_text}],
            }
            if creativity is not None:
                payload["temperature"] = creativity
            resp = http_client.post(
                f"{openai_base_url}/chat/completions",
                headers={
                    "Content-Type": "application/json",
                    "Authorization": f"Bearer {api_key}",
                },
                json=payload,
                timeout=120,
            )
            resp.raise_for_status()
            return resp.json()["choices"][0]["message"]["content"]

        if model_source == "anthropic":
            payload = {
                "model": user_model,
                "max_tokens": 2048,
                "messages": [{"role": "user", "content": prompt_text}],
            }
            if creativity is not None:
                payload["temperature"] = creativity
            resp = http_client.post(
                "https://api.anthropic.com/v1/messages",
                headers={
                    "Content-Type": "application/json",
                    "x-api-key": api_key,
                    "anthropic-version": "2023-06-01",
                },
                json=payload,
                timeout=120,
            )
            resp.raise_for_status()
            blocks = resp.json().get("content", [])
            # Only the first content block is inspected, and only if it is text.
            if blocks and blocks[0].get("type") == "text":
                return blocks[0].get("text", "")
            return None

        if model_source == "google":
            payload = {
                "contents": [{"parts": [{"text": prompt_text}]}],
                "generationConfig": {"temperature": creativity} if creativity is not None else {},
            }
            response = http_client.post(
                f"https://generativelanguage.googleapis.com/v1beta/models/{user_model}:generateContent",
                headers={"x-goog-api-key": api_key, "Content-Type": "application/json"},
                json=payload,
                timeout=120,
            )
            response.raise_for_status()
            result = response.json()
            if "candidates" in result and result["candidates"]:
                return result["candidates"][0]["content"]["parts"][0]["text"]
            return None

    except Exception as e:
        # Best-effort: a failed chunk is reported and skipped by the caller.
        print(f"Error in text mode: {e}")
        return None

    # Provider not handled by any branch above.
    return None
|
|
1710
|
+
|
|
1711
|
+
def call_model_with_image(pdf_path, page_index, prompt_text):
    """Send one PDF page together with a prompt to the configured provider.

    The page is sent as a single-page PDF to Google and to Anthropic models
    for which ``_anthropic_supports_pdf`` returns true. Every other provider
    (including the remaining Anthropic models) receives the page as an image
    declared as ``image/png``.

    Args:
        pdf_path: Path of the PDF that contains the page.
        page_index: Index of the page inside that PDF.
        prompt_text: Instruction text sent alongside the page.

    Returns:
        The reply text, or ``None`` when the page could not be extracted, the
        reply carried no usable text, the provider is not handled here, or
        any exception was raised (the error is printed rather than
        propagated).
    """

    def _ask_anthropic(attachment):
        """POST the prompt plus one attachment block to the Anthropic Messages API."""
        payload = {
            "model": user_model,
            "max_tokens": 2048,
            "messages": [{
                "role": "user",
                "content": [{"type": "text", "text": prompt_text}, attachment],
            }],
        }
        if creativity is not None:
            payload["temperature"] = creativity
        resp = http_client.post(
            "https://api.anthropic.com/v1/messages",
            headers={
                "Content-Type": "application/json",
                "x-api-key": api_key,
                "anthropic-version": "2023-06-01",
            },
            json=payload,
            timeout=120,
        )
        resp.raise_for_status()
        blocks = resp.json().get("content", [])
        # Only the first content block is inspected, and only if it is text.
        if blocks and blocks[0].get("type") == "text":
            return blocks[0].get("text", "")
        return None

    def _ask_openai_compatible(encoded_image):
        """POST the prompt plus a data-URL image to an OpenAI-style chat endpoint."""
        payload = {
            "model": user_model,
            "messages": [{
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt_text},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded_image}"}},
                ],
            }],
        }
        if creativity is not None:
            payload["temperature"] = creativity
        resp = http_client.post(
            f"{openai_base_url}/chat/completions",
            headers={
                "Content-Type": "application/json",
                "Authorization": f"Bearer {api_key}",
            },
            json=payload,
            timeout=120,
        )
        resp.raise_for_status()
        return resp.json()["choices"][0]["message"]["content"]

    try:
        if model_source == "anthropic":
            if _anthropic_supports_pdf(user_model):
                # Native PDF input: send the single page as a document block.
                pdf_bytes, is_valid = _extract_page_as_pdf_bytes(pdf_path, page_index)
                if not is_valid:
                    return None
                return _ask_anthropic({
                    "type": "document",
                    "source": {
                        "type": "base64",
                        "media_type": "application/pdf",
                        "data": _encode_bytes_to_base64(pdf_bytes),
                    },
                })
            # Model without PDF support: fall back to an image of the page.
            image_bytes, is_valid = _extract_page_as_image_bytes(pdf_path, page_index)
            if not is_valid:
                return None
            return _ask_anthropic({
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/png",
                    "data": _encode_bytes_to_base64(image_bytes),
                },
            })

        if model_source == "google":
            # Google accepts the PDF page inline.
            pdf_bytes, is_valid = _extract_page_as_pdf_bytes(pdf_path, page_index)
            if not is_valid:
                return None
            payload = {
                "contents": [{"parts": [
                    {"text": prompt_text},
                    {"inline_data": {"mime_type": "application/pdf", "data": _encode_bytes_to_base64(pdf_bytes)}},
                ]}],
                "generationConfig": {"temperature": creativity} if creativity is not None else {},
            }
            response = http_client.post(
                f"https://generativelanguage.googleapis.com/v1beta/models/{user_model}:generateContent",
                headers={"x-goog-api-key": api_key, "Content-Type": "application/json"},
                json=payload,
                timeout=120,
            )
            response.raise_for_status()
            result = response.json()
            if "candidates" in result and result["candidates"]:
                return result["candidates"][0]["content"]["parts"][0]["text"]
            return None

        # Remaining providers: the page is extracted as an image first, then
        # sent through the shared OpenAI-style request (Mistral uses the same
        # request shape against its own base URL).
        image_bytes, is_valid = _extract_page_as_image_bytes(pdf_path, page_index)
        if not is_valid:
            return None
        encoded_image = _encode_bytes_to_base64(image_bytes)
        if model_source in ("openai", "huggingface", "huggingface-together", "xai", "perplexity", "mistral"):
            return _ask_openai_compatible(encoded_image)
        return None

    except Exception as e:
        # Best-effort: a failed page is reported and skipped by the caller.
        print(f"Error processing page {page_index}: {e}")
        return None
|
|
1861
|
+
|
|
1862
|
+
# Parse numbered list pattern
|
|
1863
|
+
line_pat = re.compile(r"^\s*\d+\s*[\.\)\-]\s*(.+)$")
|
|
1864
|
+
|
|
1865
|
+
all_items = []
|
|
1866
|
+
|
|
1867
|
+
# Calculate total steps for progress tracking: (iterations * divisions) + 1 for final merge
|
|
1868
|
+
total_steps = (iterations * divisions) + 1
|
|
1869
|
+
current_step = 0
|
|
1870
|
+
|
|
1871
|
+
for pass_idx in range(iterations):
|
|
1872
|
+
# Shuffle page indices for this pass
|
|
1873
|
+
page_indices = list(range(n))
|
|
1874
|
+
rng.shuffle(page_indices)
|
|
1875
|
+
|
|
1876
|
+
# Create chunks
|
|
1877
|
+
chunks = [page_indices[i:i + chunk_size] for i in range(0, len(page_indices), chunk_size)][:divisions]
|
|
1878
|
+
|
|
1879
|
+
for chunk_idx, chunk in enumerate(tqdm(chunks, desc=f"Processing chunks (pass {pass_idx+1}/{iterations})")):
|
|
1880
|
+
if not chunk:
|
|
1881
|
+
continue
|
|
1882
|
+
|
|
1883
|
+
if mode == "text":
|
|
1884
|
+
# TEXT MODE: Extract and concatenate text from all pages in chunk
|
|
1885
|
+
chunk_texts = []
|
|
1886
|
+
for idx in chunk:
|
|
1887
|
+
page_tuple = all_pages[idx]
|
|
1888
|
+
pdf_path, page_index, page_label = page_tuple
|
|
1889
|
+
text, is_valid, _ = _extract_page_text(pdf_path, page_index)
|
|
1890
|
+
if is_valid and text:
|
|
1891
|
+
chunk_texts.append(text)
|
|
1892
|
+
|
|
1893
|
+
if not chunk_texts:
|
|
1894
|
+
continue
|
|
1895
|
+
|
|
1896
|
+
# Concatenate texts with separator
|
|
1897
|
+
combined_text = "\n---\n".join(chunk_texts)
|
|
1898
|
+
prompt = make_text_prompt(combined_text)
|
|
1899
|
+
reply = call_model_with_text(prompt)
|
|
1900
|
+
|
|
1901
|
+
elif mode == "image":
|
|
1902
|
+
# IMAGE MODE: Sample one random page from the full pool
|
|
1903
|
+
random_idx = rng.choice(page_indices)
|
|
1904
|
+
page_tuple = all_pages[random_idx]
|
|
1905
|
+
pdf_path, page_index, _ = page_tuple
|
|
1906
|
+
prompt = make_image_prompt()
|
|
1907
|
+
reply = call_model_with_image(pdf_path, page_index, prompt)
|
|
1908
|
+
|
|
1909
|
+
elif mode == "both":
|
|
1910
|
+
# BOTH MODE: Sample random page, describe with vision, then extract categories from description
|
|
1911
|
+
random_idx = rng.choice(page_indices)
|
|
1912
|
+
page_tuple = all_pages[random_idx]
|
|
1913
|
+
pdf_path, page_index, _ = page_tuple
|
|
1914
|
+
|
|
1915
|
+
# Step 1: Get text description of the page using vision
|
|
1916
|
+
page_description = describe_page_with_vision(pdf_path, page_index)
|
|
1917
|
+
if not page_description:
|
|
1918
|
+
continue
|
|
1919
|
+
|
|
1920
|
+
# Step 2: Extract categories from the description
|
|
1921
|
+
prompt = make_text_prompt(page_description)
|
|
1922
|
+
reply = call_model_with_text(prompt)
|
|
1923
|
+
|
|
1924
|
+
else:
|
|
1925
|
+
raise ValueError(f"Invalid mode: {mode}. Must be 'text', 'image', or 'both'.")
|
|
1926
|
+
|
|
1927
|
+
if reply:
|
|
1928
|
+
# Extract numbered items
|
|
1929
|
+
items = []
|
|
1930
|
+
for raw_line in reply.splitlines():
|
|
1931
|
+
m = line_pat.match(raw_line.strip())
|
|
1932
|
+
if m:
|
|
1933
|
+
items.append(m.group(1).strip())
|
|
1934
|
+
# Fallback for unnumbered lines
|
|
1935
|
+
if not items:
|
|
1936
|
+
for raw_line in reply.splitlines():
|
|
1937
|
+
s = raw_line.strip()
|
|
1938
|
+
if s:
|
|
1939
|
+
items.append(s)
|
|
1940
|
+
all_items.extend(items)
|
|
1941
|
+
|
|
1942
|
+
# Progress callback
|
|
1943
|
+
current_step += 1
|
|
1944
|
+
if progress_callback:
|
|
1945
|
+
progress_callback(current_step, total_steps, f"Pass {pass_idx+1}/{iterations}, chunk {chunk_idx+1}/{len(chunks)}")
|
|
1946
|
+
|
|
1947
|
+
# Normalize and count
|
|
1948
|
+
def normalize_category(cat):
    """Return an order- and case-insensitive key for a category label.

    The label is split on "/", each part is stripped and lower-cased, empty
    parts are discarded, and the remaining parts are sorted and re-joined
    with "/". This makes "Tables/Charts", "charts / tables" and
    "Charts/Tables/" all map to the same key.

    Args:
        cat: Category label; non-string values are converted with ``str``.

    Returns:
        The normalized key (an empty string if no non-empty part remains).
    """
    parts = (piece.strip().lower() for piece in str(cat).split("/"))
    # Skip empty parts so a trailing or doubled "/" does not create a
    # distinct key for what is the same category.
    return "/".join(sorted(part for part in parts if part))
|
|
1951
|
+
|
|
1952
|
+
flat_list = [str(x).strip() for x in all_items if str(x).strip()]
|
|
1953
|
+
if not flat_list:
|
|
1954
|
+
raise ValueError("No categories were extracted from the PDF pages.")
|
|
1955
|
+
|
|
1956
|
+
df = pd.DataFrame(flat_list, columns=["Category"])
|
|
1957
|
+
df["normalized"] = df["Category"].map(normalize_category)
|
|
1958
|
+
|
|
1959
|
+
result = (
|
|
1960
|
+
df.groupby("normalized")
|
|
1961
|
+
.agg(Category=("Category", lambda x: x.value_counts().index[0]),
|
|
1962
|
+
counts=("Category", "size"))
|
|
1963
|
+
.sort_values("counts", ascending=False)
|
|
1964
|
+
.reset_index(drop=True)
|
|
1965
|
+
)
|
|
1966
|
+
|
|
1967
|
+
# Second-pass semantic merge
|
|
1968
|
+
seed_list = result["Category"].head(max_categories * 3).tolist()
|
|
1969
|
+
|
|
1970
|
+
second_prompt = f"""
|
|
1971
|
+
You are a data analyst reviewing categorized document data.
|
|
1972
|
+
|
|
1973
|
+
Task: From the provided categories, identify and return the top {max_categories} CONCEPTUALLY UNIQUE categories.
|
|
1974
|
+
|
|
1975
|
+
Critical Instructions:
|
|
1976
|
+
1) Exact duplicates are already removed.
|
|
1977
|
+
2) Merge SEMANTIC duplicates (same concept, different wording).
|
|
1978
|
+
3) When merging:
|
|
1979
|
+
- Combine frequencies mentally
|
|
1980
|
+
- Keep the most frequent OR clearest label
|
|
1981
|
+
- Each concept appears ONLY ONCE
|
|
1982
|
+
4) Keep category names {specificity}.
|
|
1983
|
+
5) Return ONLY a numbered list of {max_categories} categories. No extra text.
|
|
1984
|
+
|
|
1985
|
+
Pre-processed Categories (sorted by frequency, top sample):
|
|
1986
|
+
{seed_list}
|
|
1987
|
+
|
|
1988
|
+
Output:
|
|
1989
|
+
1. category
|
|
1990
|
+
2. category
|
|
1991
|
+
...
|
|
1992
|
+
{max_categories}. category
|
|
1993
|
+
""".strip()
|
|
1994
|
+
|
|
1995
|
+
try:
|
|
1996
|
+
if model_source in ["openai", "huggingface", "huggingface-together", "xai", "perplexity"]:
|
|
1997
|
+
# Use requests directly instead of OpenAI SDK
|
|
1998
|
+
endpoint = f"{openai_base_url}/chat/completions"
|
|
1999
|
+
headers = {
|
|
2000
|
+
"Content-Type": "application/json",
|
|
2001
|
+
"Authorization": f"Bearer {api_key}"
|
|
2002
|
+
}
|
|
2003
|
+
payload = {
|
|
2004
|
+
"model": user_model,
|
|
2005
|
+
"messages": [{"role": "user", "content": second_prompt}]
|
|
2006
|
+
}
|
|
2007
|
+
if creativity is not None:
|
|
2008
|
+
payload["temperature"] = creativity
|
|
2009
|
+
resp2 = http_client.post(endpoint, headers=headers, json=payload, timeout=120)
|
|
2010
|
+
resp2.raise_for_status()
|
|
2011
|
+
top_categories_text = resp2.json()["choices"][0]["message"]["content"]
|
|
2012
|
+
elif model_source == "anthropic":
|
|
2013
|
+
endpoint = "https://api.anthropic.com/v1/messages"
|
|
2014
|
+
headers = {
|
|
2015
|
+
"Content-Type": "application/json",
|
|
2016
|
+
"x-api-key": api_key,
|
|
2017
|
+
"anthropic-version": "2023-06-01"
|
|
2018
|
+
}
|
|
2019
|
+
payload = {
|
|
2020
|
+
"model": user_model,
|
|
2021
|
+
"max_tokens": 2048,
|
|
2022
|
+
"messages": [{"role": "user", "content": second_prompt}],
|
|
2023
|
+
}
|
|
2024
|
+
if creativity is not None:
|
|
2025
|
+
payload["temperature"] = creativity
|
|
2026
|
+
resp2 = http_client.post(endpoint, headers=headers, json=payload, timeout=120)
|
|
2027
|
+
resp2.raise_for_status()
|
|
2028
|
+
result = resp2.json()
|
|
2029
|
+
resp_content = result.get("content", [])
|
|
2030
|
+
if resp_content and resp_content[0].get("type") == "text":
|
|
2031
|
+
top_categories_text = resp_content[0].get("text", "")
|
|
2032
|
+
else:
|
|
2033
|
+
top_categories_text = ""
|
|
2034
|
+
elif model_source == "google":
|
|
2035
|
+
url = f"https://generativelanguage.googleapis.com/v1beta/models/{user_model}:generateContent"
|
|
2036
|
+
headers = {"x-goog-api-key": api_key, "Content-Type": "application/json"}
|
|
2037
|
+
payload = {
|
|
2038
|
+
"contents": [{"parts": [{"text": second_prompt}]}],
|
|
2039
|
+
"generationConfig": {**({"temperature": creativity} if creativity is not None else {})}
|
|
2040
|
+
}
|
|
2041
|
+
response = http_client.post(url, headers=headers, json=payload, timeout=120)
|
|
2042
|
+
response.raise_for_status()
|
|
2043
|
+
res = response.json()
|
|
2044
|
+
top_categories_text = res["candidates"][0]["content"]["parts"][0]["text"]
|
|
2045
|
+
elif model_source == "mistral":
|
|
2046
|
+
# Use requests directly instead of Mistral SDK
|
|
2047
|
+
endpoint = f"{openai_base_url}/chat/completions"
|
|
2048
|
+
headers = {
|
|
2049
|
+
"Content-Type": "application/json",
|
|
2050
|
+
"Authorization": f"Bearer {api_key}"
|
|
2051
|
+
}
|
|
2052
|
+
payload = {
|
|
2053
|
+
"model": user_model,
|
|
2054
|
+
"messages": [{"role": "user", "content": second_prompt}]
|
|
2055
|
+
}
|
|
2056
|
+
if creativity is not None:
|
|
2057
|
+
payload["temperature"] = creativity
|
|
2058
|
+
resp2 = http_client.post(endpoint, headers=headers, json=payload, timeout=120)
|
|
2059
|
+
resp2.raise_for_status()
|
|
2060
|
+
top_categories_text = resp2.json()["choices"][0]["message"]["content"]
|
|
2061
|
+
except Exception as e:
|
|
2062
|
+
print(f"Error in second-pass merge: {e}")
|
|
2063
|
+
top_categories_text = ""
|
|
2064
|
+
|
|
2065
|
+
# Final progress callback for the merge step
|
|
2066
|
+
if progress_callback:
|
|
2067
|
+
progress_callback(total_steps, total_steps, "Merging categories")
|
|
2068
|
+
|
|
2069
|
+
# Parse final list
|
|
2070
|
+
final = []
|
|
2071
|
+
for line in top_categories_text.splitlines():
|
|
2072
|
+
m = line_pat.match(line.strip())
|
|
2073
|
+
if m:
|
|
2074
|
+
final.append(m.group(1).strip())
|
|
2075
|
+
if not final:
|
|
2076
|
+
final = [l.strip("-*• ").strip() for l in top_categories_text.splitlines() if l.strip()]
|
|
2077
|
+
|
|
2078
|
+
print("\nTop categories:\n" + "\n".join(f"{i+1}. {c}" for i, c in enumerate(final[:max_categories])))
|
|
2079
|
+
|
|
2080
|
+
if filename:
|
|
2081
|
+
result.to_csv(filename, index=False)
|
|
2082
|
+
|
|
2083
|
+
return {
|
|
2084
|
+
"counts_df": result,
|
|
2085
|
+
"top_categories": final[:max_categories],
|
|
2086
|
+
"raw_top_text": top_categories_text
|
|
2087
|
+
}
|