cat-llm 0.0.25__py3-none-any.whl → 0.0.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,692 @@
+# image multi-class (binary) function
+def extract_image_multi_class(
+    image_description,
+    image_input,
+    categories,
+    api_key,
+    columns="numbered",
+    user_model="gpt-4o",
+    creativity=0,
+    to_csv=False,
+    safety=False,
+    filename="categorized_data.csv",
+    save_directory=None,
+    model_source="OpenAI"
+):
+    import os
+    import json
+    import pandas as pd
+    import regex
+    from tqdm import tqdm
+    import glob
+    import base64
+    from pathlib import Path
+
+    if save_directory is not None and not os.path.isdir(save_directory):
+        # Directory doesn't exist - raise an exception to halt execution
+        raise FileNotFoundError(f"Directory {save_directory} doesn't exist")
+
+    image_extensions = [
+        '*.png', '*.jpg', '*.jpeg',
+        '*.gif', '*.webp', '*.svg', '*.svgz', '*.avif', '*.apng',
+        '*.tif', '*.tiff', '*.bmp',
+        '*.heif', '*.heic', '*.ico',
+        '*.psd'
+    ]
+
+    if not isinstance(image_input, list):
+        # If image_input is a directory path (string), glob for image files
+        image_files = []
+        for ext in image_extensions:
+            image_files.extend(glob.glob(os.path.join(image_input, ext)))
+
+        print(f"Found {len(image_files)} images.")
+    else:
+        # If image_input is already a list of file paths
+        image_files = image_input
+        print(f"Provided a list of {len(image_input)} images.")
+
+    categories_str = "\n".join(f"{i + 1}. {cat}" for i, cat in enumerate(categories))
+    cat_num = len(categories)
+    category_dict = {str(i+1): "0" for i in range(cat_num)}
+    example_JSON = json.dumps(category_dict, indent=4)
+
+    # confirm with the user that these are the intended categories
+    print("Categories to classify:")
+    for i, cat in enumerate(categories, 1):
+        print(f"{i}. {cat}")
+
+    link1 = []
+    extracted_jsons = []
+
+    for i, img_path in enumerate(tqdm(image_files, desc="Categorising images"), start=0):
+        # Check validity first
+        if img_path is None or not os.path.exists(img_path):
+            link1.append("Skipped NaN input or invalid path")
+            extracted_jsons.append("""{"no_valid_image": 1}""")
+            continue  # Skip the rest of the loop iteration
+
+        # Only open the file if the path is valid
+        with open(img_path, "rb") as f:
+            encoded = base64.b64encode(f.read()).decode("utf-8")
+
+        # Handle the extension safely
+        ext = Path(img_path).suffix.lstrip(".").lower()
+        reply = None  # make sure reply exists even if the API call below fails
+        if model_source != "Anthropic":
+            # OpenAI-style content blocks; also used for Mistral so prompt is always defined
+            encoded_image = f"data:image/{ext};base64,{encoded}"
+            prompt = [
+                {
+                    "type": "text",
+                    "text": (
+                        f"You are an image-tagging assistant.\n"
+                        f"Task ► Examine the attached image and decide, **for each category below**, "
+                        f"whether it is PRESENT (1) or NOT PRESENT (0).\n\n"
+                        f"Image is expected to show: {image_description}\n\n"
+                        f"Categories:\n{categories_str}\n\n"
+                        f"Output format ► Respond with **only** a JSON object whose keys are the "
+                        f"quoted category numbers ('1', '2', …) and whose values are 1 or 0. "
+                        f"No additional keys, comments, or text.\n\n"
+                        f"Example (three categories):\n"
+                        f"{example_JSON}"
+                    ),
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {"url": encoded_image, "detail": "high"},
+                },
+            ]
+
+        if model_source == "Anthropic":
+            encoded_image = f"data:image/{ext};base64,{encoded}"
+            prompt = [
+                {"type": "text",
+                 "text": (
+                     f"You are an image-tagging assistant.\n"
+                     f"Task ► Examine the attached image and decide, **for each category below**, "
+                     f"whether it is PRESENT (1) or NOT PRESENT (0).\n\n"
+                     f"Image is expected to show: {image_description}\n\n"
+                     f"Categories:\n{categories_str}\n\n"
+                     f"Output format ► Respond with **only** a JSON object whose keys are the "
+                     f"quoted category numbers ('1', '2', …) and whose values are 1 or 0. "
+                     f"No additional keys, comments, or text.\n\n"
+                     f"Example (three categories):\n"
+                     f"{example_JSON}"
+                 ),
+                 },
+                {
+                    "type": "image",
+                    "source": {
+                        "type": "base64",
+                        "media_type": "image/jpeg",
+                        "data": encoded
+                    }
+                }
+            ]
+        if model_source == "OpenAI":
+            from openai import OpenAI
+            client = OpenAI(api_key=api_key)
+            try:
+                response_obj = client.chat.completions.create(
+                    model=user_model,
+                    messages=[{'role': 'user', 'content': prompt}],
+                    temperature=creativity
+                )
+                reply = response_obj.choices[0].message.content
+                link1.append(reply)
+            except Exception as e:
+                print(f"An error occurred: {e}")
+                link1.append(f"Error processing input: {e}")
+
+        elif model_source == "Anthropic":
+            import anthropic
+            client = anthropic.Anthropic(api_key=api_key)
+            try:
+                message = client.messages.create(
+                    model=user_model,
+                    max_tokens=1024,
+                    temperature=creativity,
+                    messages=[{"role": "user", "content": prompt}]
+                )
+                reply = message.content[0].text
+                link1.append(reply)
+            except Exception as e:
+                print(f"An error occurred: {e}")
+                link1.append(f"Error processing input: {e}")
+
+        elif model_source == "Mistral":
+            from mistralai import Mistral
+            client = Mistral(api_key=api_key)
+            try:
+                response = client.chat.complete(
+                    model=user_model,
+                    messages=[
+                        {'role': 'user', 'content': prompt}
+                    ],
+                    temperature=creativity
+                )
+                reply = response.choices[0].message.content
+                link1.append(reply)
+            except Exception as e:
+                print(f"An error occurred: {e}")
+                link1.append(f"Error processing input: {e}")
+        else:
+            raise ValueError("Unknown source! Choose from OpenAI, Anthropic, or Mistral")
+        # in case no JSON object is found in the reply
+        if reply is not None:
+            extracted_json = regex.findall(r'\{(?:[^{}]|(?R))*\}', reply, regex.DOTALL)
+            if extracted_json:
+                cleaned_json = extracted_json[0].replace('[', '').replace(']', '').replace('\n', '').replace(" ", '')
+                extracted_jsons.append(cleaned_json)
+                #print(cleaned_json)
+            else:
+                error_message = """{"1":"e"}"""
+                extracted_jsons.append(error_message)
+                print(error_message)
+        else:
+            error_message = """{"1":"e"}"""
+            extracted_jsons.append(error_message)
+            #print(error_message)
+
+        # --- Safety Save ---
+        if safety:
+            #print(f"Saving CSV to: {save_directory}")
+            # Save progress so far
+            temp_df = pd.DataFrame({
+                'image_input': image_files[:i+1],
+                'link1': link1,
+                'json': extracted_jsons
+            })
+            # Normalize the JSON extracted so far
+            normalized_data_list = []
+            for json_str in extracted_jsons:
+                try:
+                    parsed_obj = json.loads(json_str)
+                    normalized_data_list.append(pd.json_normalize(parsed_obj))
+                except json.JSONDecodeError:
+                    normalized_data_list.append(pd.DataFrame({"1": ["e"]}))
+            normalized_data = pd.concat(normalized_data_list, ignore_index=True)
+            temp_df = pd.concat([temp_df, normalized_data], axis=1)
+            # Save to CSV
+            if save_directory is None:
+                save_directory = os.getcwd()
+            temp_df.to_csv(os.path.join(save_directory, filename), index=False)
+
+    # --- Final DataFrame ---
+    normalized_data_list = []
+    for json_str in extracted_jsons:
+        try:
+            parsed_obj = json.loads(json_str)
+            normalized_data_list.append(pd.json_normalize(parsed_obj))
+        except json.JSONDecodeError:
+            normalized_data_list.append(pd.DataFrame({"1": ["e"]}))
+    normalized_data = pd.concat(normalized_data_list, ignore_index=True)
+
+    categorized_data = pd.DataFrame({
+        'image_input': image_files,
+        'link1': pd.Series(link1).reset_index(drop=True),
+        'json': pd.Series(extracted_jsons).reset_index(drop=True)
+    })
+    categorized_data = pd.concat([categorized_data, normalized_data], axis=1)
+
+    if columns != "numbered":  # if the user wants text column names
+        categorized_data.columns = list(categorized_data.columns[:3]) + categories[:len(categorized_data.columns) - 3]
+
+    if to_csv:
+        if save_directory is None:
+            save_directory = os.getcwd()
+        categorized_data.to_csv(os.path.join(save_directory, filename), index=False)
+
+    return categorized_data
+
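For orientation, here is a minimal usage sketch for extract_image_multi_class. It is illustrative only and not part of the packaged file; the directory, category labels, and API key are placeholders.

# Hypothetical call - folder, categories, and key are placeholders.
results = extract_image_multi_class(
    image_description="street-level photos of shop fronts",
    image_input="/path/to/images",        # a folder to glob, or a list of file paths
    categories=["person visible", "price sign", "window display"],
    api_key="YOUR_OPENAI_KEY",
    columns="text",                       # anything other than "numbered" renames columns to the category labels
    to_csv=True,
    filename="shopfront_tags.csv",
    model_source="OpenAI",
)
# results is a pandas DataFrame: image path, raw model reply, extracted JSON,
# plus one 0/1 column per category.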
+# image score function
+def extract_image_score(
+    reference_image_description,
+    image_input,
+    reference_image,
+    api_key,
+    columns="numbered",
+    user_model="gpt-4o-2024-11-20",
+    creativity=0,
+    to_csv=False,
+    safety=False,
+    filename="categorized_data.csv",
+    save_directory=None,
+    model_source="OpenAI"
+):
+    import os
+    import json
+    import pandas as pd
+    import regex
+    from tqdm import tqdm
+    import glob
+    import base64
+    from pathlib import Path
+
+    if save_directory is not None and not os.path.isdir(save_directory):
+        # Directory doesn't exist - raise an exception to halt execution
+        raise FileNotFoundError(f"Directory {save_directory} doesn't exist")
+
+    image_extensions = [
+        '*.png', '*.jpg', '*.jpeg',
+        '*.gif', '*.webp', '*.svg', '*.svgz', '*.avif', '*.apng',
+        '*.tif', '*.tiff', '*.bmp',
+        '*.heif', '*.heic', '*.ico',
+        '*.psd'
+    ]
+
+    if not isinstance(image_input, list):
+        # If image_input is a directory path (string), glob for image files
+        image_files = []
+        for ext in image_extensions:
+            image_files.extend(glob.glob(os.path.join(image_input, ext)))
+
+        print(f"Found {len(image_files)} images.")
+    else:
+        # If image_input is already a list of file paths
+        image_files = image_input
+        print(f"Provided a list of {len(image_input)} images.")
+
+    # encode the reference image once and reuse it for every comparison
+    with open(reference_image, 'rb') as f:
+        reference_image = f"data:image/{reference_image.split('.')[-1]};base64,{base64.b64encode(f.read()).decode('utf-8')}"
+
+    link1 = []
+    extracted_jsons = []
+
+    for i, img_path in enumerate(tqdm(image_files, desc="Categorising images"), start=0):
+        # Check validity first
+        if img_path is None or not os.path.exists(img_path):
+            link1.append("Skipped NaN input or invalid path")
+            extracted_jsons.append("""{"no_valid_image": 1}""")
+            continue  # Skip the rest of the loop iteration
+
+        # Only open the file if the path is valid
+        with open(img_path, "rb") as f:
+            encoded = base64.b64encode(f.read()).decode("utf-8")
+
+        # Handle the extension safely
+        ext = Path(img_path).suffix.lstrip(".").lower()
+        encoded_image = f"data:image/{ext};base64,{encoded}"
+
+        reply = None  # make sure reply exists even if the API call below fails
+        prompt = [
+            {
+                "type": "text",
+                "text": (
+                    f"You are a visual similarity assessment system.\n"
+                    f"Task ► Compare these two images:\n"
+                    f"1. REFERENCE (left): {reference_image_description}\n"
+                    f"2. INPUT (right): User-provided drawing\n\n"
+                    f"Rating criteria:\n"
+                    f"1: No meaningful similarity (fundamentally different)\n"
+                    f"2: Barely recognizable similarity (25% match)\n"
+                    f"3: Partial match (50% key features)\n"
+                    f"4: Strong alignment (75% features)\n"
+                    f"5: Near-perfect match (90%+ similarity)\n\n"
+                    f"Output format ► Return ONLY:\n"
+                    "{\n"
+                    ' "score": [1-5],\n'
+                    ' "summary": "reason you scored"\n'
+                    "}\n\n"
+                    f"Critical rules:\n"
+                    f"- Score must reflect shape, proportions, and key details\n"
+                    f"- List only concrete matching elements from reference\n"
+                    f"- No markdown or additional text"
+                ),
+            },
+            {
+                "type": "image_url",
+                "image_url": {"url": reference_image, "detail": "high"}
+            },
+            {
+                "type": "image_url",
+                "image_url": {"url": encoded_image, "detail": "high"},
+            },
+        ]
+        if model_source == "OpenAI":
+            from openai import OpenAI
+            client = OpenAI(api_key=api_key)
+            try:
+                response_obj = client.chat.completions.create(
+                    model=user_model,
+                    messages=[{'role': 'user', 'content': prompt}],
+                    temperature=creativity
+                )
+                reply = response_obj.choices[0].message.content
+                link1.append(reply)
+            except Exception as e:
+                print(f"An error occurred: {e}")
+                link1.append(f"Error processing input: {e}")
+
+        elif model_source == "Perplexity":
+            from openai import OpenAI
+            client = OpenAI(api_key=api_key, base_url="https://api.perplexity.ai")
+            try:
+                response_obj = client.chat.completions.create(
+                    model=user_model,
+                    messages=[{'role': 'user', 'content': prompt}],
+                    temperature=creativity
+                )
+                reply = response_obj.choices[0].message.content
+                link1.append(reply)
+            except Exception as e:
+                print(f"An error occurred: {e}")
+                link1.append(f"Error processing input: {e}")
+        elif model_source == "Anthropic":
+            import anthropic
+            client = anthropic.Anthropic(api_key=api_key)
+            try:
+                message = client.messages.create(
+                    model=user_model,
+                    max_tokens=1024,
+                    temperature=creativity,
+                    messages=[{"role": "user", "content": prompt}]
+                )
+                reply = message.content[0].text  # Anthropic returns content as a list
+                link1.append(reply)
+            except Exception as e:
+                print(f"An error occurred: {e}")
+                link1.append(f"Error processing input: {e}")
+        elif model_source == "Mistral":
+            from mistralai import Mistral
+            client = Mistral(api_key=api_key)
+            try:
+                response = client.chat.complete(
+                    model=user_model,
+                    messages=[
+                        {'role': 'user', 'content': prompt}
+                    ],
+                    temperature=creativity
+                )
+                reply = response.choices[0].message.content
+                link1.append(reply)
+            except Exception as e:
+                print(f"An error occurred: {e}")
+                link1.append(f"Error processing input: {e}")
+        else:
+            raise ValueError("Unknown source! Choose from OpenAI, Anthropic, Perplexity, or Mistral")
+        # in case no JSON object is found in the reply
+        if reply is not None:
+            extracted_json = regex.findall(r'\{(?:[^{}]|(?R))*\}', reply, regex.DOTALL)
+            if extracted_json:
+                cleaned_json = extracted_json[0].replace('[', '').replace(']', '').replace('\n', '').replace(" ", '')
+                extracted_jsons.append(cleaned_json)
+                #print(cleaned_json)
+            else:
+                error_message = """{"1":"e"}"""
+                extracted_jsons.append(error_message)
+                print(error_message)
+        else:
+            error_message = """{"1":"e"}"""
+            extracted_jsons.append(error_message)
+            #print(error_message)
+
+        # --- Safety Save ---
+        if safety:
+            # Save progress so far
+            temp_df = pd.DataFrame({
+                'image_input': image_files[:i+1],
+                'link1': link1,
+                'json': extracted_jsons
+            })
+            # Normalize the JSON extracted so far
+            normalized_data_list = []
+            for json_str in extracted_jsons:
+                try:
+                    parsed_obj = json.loads(json_str)
+                    normalized_data_list.append(pd.json_normalize(parsed_obj))
+                except json.JSONDecodeError:
+                    normalized_data_list.append(pd.DataFrame({"1": ["e"]}))
+            normalized_data = pd.concat(normalized_data_list, ignore_index=True)
+            temp_df = pd.concat([temp_df, normalized_data], axis=1)
+            # Save to CSV
+            if save_directory is None:
+                save_directory = os.getcwd()
+            temp_df.to_csv(os.path.join(save_directory, filename), index=False)
+
+    # --- Final DataFrame ---
+    normalized_data_list = []
+    for json_str in extracted_jsons:
+        try:
+            parsed_obj = json.loads(json_str)
+            normalized_data_list.append(pd.json_normalize(parsed_obj))
+        except json.JSONDecodeError:
+            normalized_data_list.append(pd.DataFrame({"1": ["e"]}))
+    normalized_data = pd.concat(normalized_data_list, ignore_index=True)
+
+    categorized_data = pd.DataFrame({
+        'image_input': image_files,
+        'link1': pd.Series(link1).reset_index(drop=True),
+        'json': pd.Series(extracted_jsons).reset_index(drop=True)
+    })
+    categorized_data = pd.concat([categorized_data, normalized_data], axis=1)
+
+    if to_csv:
+        if save_directory is None:
+            save_directory = os.getcwd()
+        categorized_data.to_csv(os.path.join(save_directory, filename), index=False)
+
+    return categorized_data
+
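Similarly, a sketch for extract_image_score (illustrative only; paths and the key are placeholders). The reference image is encoded once and sent alongside each input image, and the model returns a 1-5 similarity score with a short summary.

# Hypothetical call - paths and key are placeholders.
scores = extract_image_score(
    reference_image_description="a hand-drawn five-pointed star",
    image_input=["/path/drawing_01.png", "/path/drawing_02.png"],  # a list of files, or a folder to glob
    reference_image="/path/reference_star.png",
    api_key="YOUR_OPENAI_KEY",
    model_source="OpenAI",
)
# Each row carries the raw reply plus "score" and "summary" columns parsed from the returned JSON.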
+# image features function
+def extract_image_features(
+    image_description,
+    image_input,
+    features_to_extract,
+    api_key,
+    columns="numbered",
+    user_model="gpt-4o-2024-11-20",
+    creativity=0,
+    to_csv=False,
+    safety=False,
+    filename="categorized_data.csv",
+    save_directory=None,
+    model_source="OpenAI"
+):
+    import os
+    import json
+    import pandas as pd
+    import regex
+    from tqdm import tqdm
+    import glob
+    import base64
+    from pathlib import Path
+
+    if save_directory is not None and not os.path.isdir(save_directory):
+        # Directory doesn't exist - raise an exception to halt execution
+        raise FileNotFoundError(f"Directory {save_directory} doesn't exist")
+
+    image_extensions = [
+        '*.png', '*.jpg', '*.jpeg',
+        '*.gif', '*.webp', '*.svg', '*.svgz', '*.avif', '*.apng',
+        '*.tif', '*.tiff', '*.bmp',
+        '*.heif', '*.heic', '*.ico',
+        '*.psd'
+    ]
+
+    if not isinstance(image_input, list):
+        # If image_input is a directory path (string), glob for image files
+        image_files = []
+        for ext in image_extensions:
+            image_files.extend(glob.glob(os.path.join(image_input, ext)))
+
+        print(f"Found {len(image_files)} images.")
+    else:
+        # If image_input is already a list of file paths
+        image_files = image_input
+        print(f"Provided a list of {len(image_input)} images.")
+
+    categories_str = "\n".join(f"{i + 1}. {cat}" for i, cat in enumerate(features_to_extract))
+    cat_num = len(features_to_extract)
+    category_dict = {str(i+1): "0" for i in range(cat_num)}
+    example_JSON = json.dumps(category_dict, indent=4)
+
+    # confirm with the user that these are the intended features
+    print("\nThe image features to be extracted are:")
+    for i, cat in enumerate(features_to_extract, 1):
+        print(f"{i}. {cat}")
+
+    link1 = []
+    extracted_jsons = []
+
+    for i, img_path in enumerate(
+            tqdm(image_files, desc="Categorising images"), start=0):
+        # encode this specific image once
+        with open(img_path, "rb") as f:
+            encoded = base64.b64encode(f.read()).decode("utf-8")
+        ext = Path(img_path).suffix.lstrip(".").lower()
+        encoded_image = f"data:image/{ext};base64,{encoded}"
+
+        reply = None  # make sure reply exists even if the API call below fails
+        prompt = [
+            {
+                "type": "text",
+                "text": (
+                    f"You are a visual question answering assistant.\n"
+                    f"Task ► Analyze the attached image and answer these specific questions:\n\n"
+                    f"Image context: {image_description}\n\n"
+                    f"Questions to answer:\n{categories_str}\n\n"
+                    f"Output format ► Return **only** a JSON object where:\n"
+                    f"- Keys are question numbers ('1', '2', ...)\n"
+                    f"- Values are concise answers (numbers, short phrases)\n\n"
+                    f"Example for 3 questions:\n"
+                    "{\n"
+                    ' "1": "4",\n'
+                    ' "2": "blue",\n'
+                    ' "3": "yes"\n'
+                    "}\n\n"
+                    f"Important rules:\n"
+                    f"1. Answer directly - no explanations\n"
+                    f"2. Use exact numerical values when possible\n"
+                    f"3. For yes/no questions, use 'yes' or 'no'\n"
+                    f"4. Never add extra keys or formatting"
+                ),
+            },
+            {
+                "type": "image_url",
+                "image_url": {"url": encoded_image, "detail": "high"},
+            },
+        ]
+        if model_source == "OpenAI":
+            from openai import OpenAI
+            client = OpenAI(api_key=api_key)
+            try:
+                response_obj = client.chat.completions.create(
+                    model=user_model,
+                    messages=[{'role': 'user', 'content': prompt}],
+                    temperature=creativity
+                )
+                reply = response_obj.choices[0].message.content
+                link1.append(reply)
+            except Exception as e:
+                print(f"An error occurred: {e}")
+                link1.append(f"Error processing input: {e}")
+
+        elif model_source == "Perplexity":
+            from openai import OpenAI
+            client = OpenAI(api_key=api_key, base_url="https://api.perplexity.ai")
+            try:
+                response_obj = client.chat.completions.create(
+                    model=user_model,
+                    messages=[{'role': 'user', 'content': prompt}],
+                    temperature=creativity
+                )
+                reply = response_obj.choices[0].message.content
+                link1.append(reply)
+            except Exception as e:
+                print(f"An error occurred: {e}")
+                link1.append(f"Error processing input: {e}")
+        elif model_source == "Anthropic":
+            import anthropic
+            client = anthropic.Anthropic(api_key=api_key)
+            try:
+                message = client.messages.create(
+                    model=user_model,
+                    max_tokens=1024,
+                    temperature=creativity,
+                    messages=[{"role": "user", "content": prompt}]
+                )
+                reply = message.content[0].text  # Anthropic returns content as a list
+                link1.append(reply)
+            except Exception as e:
+                print(f"An error occurred: {e}")
+                link1.append(f"Error processing input: {e}")
+        elif model_source == "Mistral":
+            from mistralai import Mistral
+            client = Mistral(api_key=api_key)
+            try:
+                response = client.chat.complete(
+                    model=user_model,
+                    messages=[
+                        {'role': 'user', 'content': prompt}
+                    ],
+                    temperature=creativity
+                )
+                reply = response.choices[0].message.content
+                link1.append(reply)
+            except Exception as e:
+                print(f"An error occurred: {e}")
+                link1.append(f"Error processing input: {e}")
+        else:
+            raise ValueError("Unknown source! Choose from OpenAI, Anthropic, Perplexity, or Mistral")
+        # in case no JSON object is found in the reply
+        if reply is not None:
+            extracted_json = regex.findall(r'\{(?:[^{}]|(?R))*\}', reply, regex.DOTALL)
+            if extracted_json:
+                cleaned_json = extracted_json[0].replace('[', '').replace(']', '').replace('\n', '').replace(" ", '')
+                extracted_jsons.append(cleaned_json)
+                #print(cleaned_json)
+            else:
+                error_message = """{"1":"e"}"""
+                extracted_jsons.append(error_message)
+                print(error_message)
+        else:
+            error_message = """{"1":"e"}"""
+            extracted_jsons.append(error_message)
+            #print(error_message)
+
+        # --- Safety Save ---
+        if safety:
+            #print(f"Saving CSV to: {save_directory}")
+            # Save progress so far
+            temp_df = pd.DataFrame({
+                'image_input': image_files[:i+1],
+                'link1': link1,
+                'json': extracted_jsons
+            })
+            # Normalize the JSON extracted so far
+            normalized_data_list = []
+            for json_str in extracted_jsons:
+                try:
+                    parsed_obj = json.loads(json_str)
+                    normalized_data_list.append(pd.json_normalize(parsed_obj))
+                except json.JSONDecodeError:
+                    normalized_data_list.append(pd.DataFrame({"1": ["e"]}))
+            normalized_data = pd.concat(normalized_data_list, ignore_index=True)
+            temp_df = pd.concat([temp_df, normalized_data], axis=1)
+            # Save to CSV
+            if save_directory is None:
+                save_directory = os.getcwd()
+            temp_df.to_csv(os.path.join(save_directory, filename), index=False)
+
+    # --- Final DataFrame ---
+    normalized_data_list = []
+    for json_str in extracted_jsons:
+        try:
+            parsed_obj = json.loads(json_str)
+            normalized_data_list.append(pd.json_normalize(parsed_obj))
+        except json.JSONDecodeError:
+            normalized_data_list.append(pd.DataFrame({"1": ["e"]}))
+    normalized_data = pd.concat(normalized_data_list, ignore_index=True)
+
+    categorized_data = pd.DataFrame({
+        'image_input': image_files,
+        'link1': pd.Series(link1).reset_index(drop=True),
+        'json': pd.Series(extracted_jsons).reset_index(drop=True)
+    })
+    categorized_data = pd.concat([categorized_data, normalized_data], axis=1)
+
+    if to_csv:
+        if save_directory is None:
+            save_directory = os.getcwd()
+        categorized_data.to_csv(os.path.join(save_directory, filename), index=False)
+
+    return categorized_data
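
Finally, a sketch for extract_image_features (illustrative only; the folder, questions, and key are placeholders). The entries in features_to_extract are phrased as questions, and each answer comes back as its own column in the returned DataFrame.

# Hypothetical call - folder, questions, and key are placeholders.
features = extract_image_features(
    image_description="interior photos of rental apartments",
    image_input="/path/to/apartment_photos",
    features_to_extract=[
        "How many windows are visible?",
        "What is the dominant wall colour?",
        "Is a bed visible? (yes/no)",
    ],
    api_key="YOUR_OPENAI_KEY",
    safety=True,          # write partial results to CSV after every image
    model_source="OpenAI",
)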