PyPI - cat-llm - Versions diffs - 0.0.42__tar.gz → 0.0.50__tar.gz - Mend

cat-llm 0.0.42__tar.gz → 0.0.50__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

{cat_llm-0.0.42 → cat_llm-0.0.50}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: cat-llm
-Version: 0.0.42
+Version: 0.0.50
 Summary: A tool for categorizing text data and images using LLMs and vision models
 Project-URL: Documentation, https://github.com/chrissoria/cat-llm#readme
 Project-URL: Issues, https://github.com/chrissoria/cat-llm/issues

{cat_llm-0.0.42 → cat_llm-0.0.50}/src/catllm/CERAD_functions.py RENAMED Viewed

@@ -21,6 +21,7 @@ Areas for improvement:
 10. Test variety: expanding or adding functions to score more tests relevant for cognitive assessment, such as the MMSE.
 11. Error handling: improving error handling to better manage unexpected inputs or model failures.
 """
 def cerad_drawn_score(
     shape,
     image_input,
@@ -265,8 +266,11 @@ def cerad_drawn_score(
                 reply = response_obj.choices[0].message.content
                 link1.append(reply)
             except Exception as e:
-                print("An error occurred: {e}")
-                link1.append("Error processing input: {e}")
+                if "model" in str(e).lower():
+                    raise ValueError(f"Invalid OpenAI model '{user_model}': {e}")
+                else:
+                    print("An error occurred: {e}")
+                    link1.append("Error processing input: {e}")
         elif model_source == "Anthropic"  and valid_image:
             import anthropic
@@ -281,8 +285,11 @@ def cerad_drawn_score(
                 reply = message.content[0].text  # Anthropic returns content as list
                 link1.append(reply)
             except Exception as e:
-                print("An error occurred: {e}")
-                link1.append("Error processing input: {e}")
+                if "model" in str(e).lower():
+                    raise ValueError(f"Invalid OpenAI model '{user_model}': {e}")
+                else:
+                    print("An error occurred: {e}")
+                    link1.append("Error processing input: {e}")
         elif model_source == "Mistral"  and valid_image:
             from mistralai import Mistral
@@ -299,9 +306,11 @@ def cerad_drawn_score(
                 reply = response.choices[0].message.content
                 link1.append(reply)
             except Exception as e:
-                reply = None
-                print("An error occurred: {e}")
-                link1.append("Error processing input: {e}")
+                if "model" in str(e).lower():
+                    raise ValueError(f"Invalid OpenAI model '{user_model}': {e}")
+                else:
+                    print("An error occurred: {e}")
+                    link1.append("Error processing input: {e}")
         #if no valid image path is provided
         elif  valid_image == False:
             reply = "invalid image path"
@@ -365,7 +374,10 @@ def cerad_drawn_score(
     normalized_data = pd.concat(normalized_data_list, ignore_index=True)
     categorized_data = pd.DataFrame({
-        'image_input': image_files,
+        'image_input': (
+            image_files.reset_index(drop=True) if isinstance(image_files, (pd.DataFrame, pd.Series))
+            else pd.Series(image_files)
+        ),
         'link1': pd.Series(link1).reset_index(drop=True),
         'json': pd.Series(extracted_jsons).reset_index(drop=True)
     })

{cat_llm-0.0.42 → cat_llm-0.0.50}/src/catllm/__about__.py RENAMED Viewed

@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: 2025-present Christopher Soria <chrissoria@berkeley.edu>
 #
 # SPDX-License-Identifier: MIT
-__version__ = "0.0.42"
+__version__ = "0.0.50"
 __author__ = "Chris Soria"
 __email__ = "chrissoria@berkeley.edu"
 __title__ = "cat-llm"

cat_llm-0.0.50/src/catllm/build_web_research.py ADDED Viewed

@@ -0,0 +1,169 @@
+#build dataset classification
+def build_web_research_dataset(
+    search_question,
+    search_input,
+    api_key,
+    answer_format = "concise",
+    additional_instructions = "",
+    categories = ['Answer','URL'],
+    user_model="claude-3-7-sonnet-20250219",
+    creativity=0,
+    safety=False,
+    filename="categorized_data.csv",
+    save_directory=None,
+    model_source="Anthropic",
+    time_delay=5
+):
+    """Build a dataset by asking a web-search-enabled model one question per input item.
+
+    For every non-missing item in ``search_input`` a prompt asking for
+    "<item>'s <search_question>" is sent to the Anthropic Messages API with the
+    web-search tool enabled. The first JSON object found in the reply is kept
+    and later expanded into columns.
+
+    Args:
+        search_question: Text interpolated into the prompt after "<item>'s".
+        search_input: Sliceable collection (list, Series, ...) of items to look up.
+            Items for which ``pd.isna`` is true are skipped without an API call.
+        api_key: Key passed to ``anthropic.Anthropic``.
+        answer_format: Text interpolated into the "Provide your answer as ..." rule.
+        additional_instructions: Extra text appended to the prompt's rule list.
+        categories: Labels printed as a numbered list; their count sets the number
+            of keys in the placeholder JSON stored for skipped (NaN) items.
+        user_model: Model name passed to ``messages.create``.
+        creativity: Passed as the request ``temperature``.
+        safety: When true, the progress so far is written to CSV after every item.
+        filename: CSV file name used by the safety save.
+        save_directory: Directory for the safety save; the current working
+            directory is used when this is None.
+        model_source: Provider name; only "Anthropic" is supported.
+        time_delay: Seconds slept before the first item and after every API call.
+
+    Returns:
+        A DataFrame with the columns ``survey_response``, ``link1`` (raw reply or
+        error/skip message) and ``json`` (extracted JSON string), followed by one
+        column per key of the normalized JSON objects.
+
+    Raises:
+        ValueError: If ``model_source`` is not "Anthropic".
+    """
+    import os
+    import json
+    import pandas as pd
+    import regex
+    from tqdm import tqdm
+    import time
+    # Fail fast: reject unsupported providers before sleeping or calling any API.
+    if model_source != "Anthropic":
+        raise ValueError("Unknown source! Currently this function only supports 'Anthropic' as model_source.")
+    def _as_column(values):
+        """Return values as a Series/frame with a fresh 0..n-1 index so rows align by position."""
+        if isinstance(values, (pd.DataFrame, pd.Series)):
+            return values.reset_index(drop=True)
+        return pd.Series(values)
+    def _normalize(json_strings):
+        """Expand each JSON string into one row; unparsable strings become the row {"1": "e"}."""
+        frames = []
+        for json_str in json_strings:
+            try:
+                frames.append(pd.json_normalize(json.loads(json_str)))
+            except json.JSONDecodeError:
+                frames.append(pd.DataFrame({"1": ["e"]}))
+        # pd.concat raises ValueError on an empty list, so empty input needs its own branch.
+        if not frames:
+            return pd.DataFrame()
+        return pd.concat(frames, ignore_index=True)
+    categories_str = "\n".join(f"{i + 1}. {cat}" for i, cat in enumerate(categories))
+    print(categories_str)
+    cat_num = len(categories)
+    category_dict = {str(i+1): "0" for i in range(cat_num)}
+    # NOTE(review): this placeholder uses numeric keys ("1", "2", ...) while the
+    # error placeholder below uses "answer"/"url", so skipped rows add extra
+    # columns to the output - confirm whether that is intended.
+    example_JSON = json.dumps(category_dict, indent=4)
+    # Placeholder stored when no JSON can be recovered from a reply.
+    error_json = json.dumps({"answer": "e", "url": "e"})
+    client = None  # created on first use so all-NaN input never needs the SDK
+    link1 = []
+    extracted_jsons = []
+    for idx, item in enumerate(tqdm(search_input, desc="Building dataset")):
+        if idx == 0:  # delay the first item just to be safe
+            time.sleep(time_delay)
+        reply = None
+        if pd.isna(item):
+            link1.append("Skipped NaN input")
+            extracted_jsons.append(example_JSON)
+        else:
+            prompt = f"""<role>You are a research assistant specializing in finding current, factual information.</role>
+            <task>Find information about {item}'s {search_question}</task>
+            <rules>
+            - Search for the most current and authoritative information available
+            - Provide your answer as {answer_format}
+            - Prioritize official sources when possible
+            - If information is not found, state "Information not found"
+            - Include exactly one source URL where you found the information
+            - Do not include any explanatory text or commentary beyond the JSON
+                {additional_instructions}
+            </rules>
+            <format>
+            Return your response as valid JSON with this exact structure:
+            {{
+            "answer": "Your factual answer or 'Information not found'",
+            "url": "Source URL or 'No source available'"
+        }}
+        </format>"""
+            if client is None:
+                import anthropic
+                client = anthropic.Anthropic(api_key=api_key)
+            try:
+                message = client.messages.create(
+                    model=user_model,
+                    max_tokens=1024,
+                    temperature=creativity,
+                    messages=[{"role": "user", "content": prompt}],
+                    tools=[{
+                        "type": "web_search_20250305",
+                        "name": "web_search"
+                    }]
+                )
+                # Keep only the blocks whose type is "text" and join them into one reply.
+                reply = " ".join(
+                    block.text
+                    for block in message.content
+                    if getattr(block, "type", "") == "text"
+                ).strip()
+                link1.append(reply)
+                time.sleep(time_delay)
+                print(reply)
+            except Exception as e:
+                # Best effort: record the failure for this item and keep going.
+                print(f"An error occurred: {e}")
+                link1.append(f"Error processing input: {e}")
+                time.sleep(time_delay)
+            # in situation that no JSON is found
+            if reply is not None:
+                # Recursive pattern: matches a balanced {...} object, nested braces included.
+                extracted_json = regex.findall(r'\{(?:[^{}]|(?R))*\}', reply, regex.DOTALL)
+                if extracted_json:
+                    raw_json = extracted_json[0].strip()  # Only strip leading/trailing whitespace
+                    try:
+                        # Parse to validate, then re-serialize for consistent formatting
+                        extracted_jsons.append(json.dumps(json.loads(raw_json)))
+                    except json.JSONDecodeError as e:
+                        print(f"JSON parsing error: {e}")
+                        # Fallback to raw extraction if parsing fails
+                        extracted_jsons.append(raw_json)
+                else:
+                    # Use consistent schema for errors
+                    extracted_jsons.append(error_json)
+                    print(error_json)
+            else:
+                # The API call failed, so there is no reply to parse
+                extracted_jsons.append(error_json)
+        # --- Safety Save ---
+        if safety:
+            # Save progress so far. The input slice gets a fresh positional index
+            # so a Series with a non-default index still lines up with the
+            # normalized JSON columns when concatenated side by side.
+            temp_df = pd.DataFrame({
+                'survey_response': _as_column(search_input[:idx+1]),
+                'link1': link1,
+                'json': extracted_jsons
+            })
+            temp_df = pd.concat([temp_df, _normalize(extracted_jsons)], axis=1)
+            # Save to CSV
+            if save_directory is None:
+                save_directory = os.getcwd()
+            temp_df.to_csv(os.path.join(save_directory, filename), index=False)
+    # --- Final DataFrame ---
+    categorized_data = pd.DataFrame({
+        'survey_response': _as_column(search_input),
+        'link1': pd.Series(link1).reset_index(drop=True),
+        'json': pd.Series(extracted_jsons).reset_index(drop=True)
+    })
+    categorized_data = pd.concat([categorized_data, _normalize(extracted_jsons)], axis=1)
+    return categorized_data

{cat_llm-0.0.42 → cat_llm-0.0.50}/src/catllm/image_functions.py RENAMED Viewed

@@ -148,8 +148,11 @@ def image_multi_class(
                 reply = response_obj.choices[0].message.content
                 link1.append(reply)
             except Exception as e:
-                print(f"An error occurred: {e}")
-                link1.append(f"Error processing input: {e}")
+                if "model" in str(e).lower():
+                    raise ValueError(f"Invalid OpenAI model '{user_model}': {e}")
+                else:
+                    print("An error occurred: {e}")
+                    link1.append("Error processing input: {e}")
         elif model_source == "Anthropic":
             import anthropic
@@ -165,8 +168,11 @@ def image_multi_class(
                 reply = message.content[0].text
                 link1.append(reply)
             except Exception as e:
-                print(f"An error occurred: {e}")
-                link1.append(f"Error processing input: {e}")
+                if "model" in str(e).lower():
+                    raise ValueError(f"Invalid OpenAI model '{user_model}': {e}")
+                else:
+                    print("An error occurred: {e}")
+                    link1.append("Error processing input: {e}")
         elif model_source == "Mistral":
             from mistralai import Mistral
@@ -182,8 +188,11 @@ def image_multi_class(
                 reply = response.choices[0].message.content
                 link1.append(reply)
             except Exception as e:
-                print(f"An error occurred: {e}")
-                link1.append(f"Error processing input: {e}")
+                if "model" in str(e).lower():
+                    raise ValueError(f"Invalid OpenAI model '{user_model}': {e}")
+                else:
+                    print("An error occurred: {e}")
+                    link1.append("Error processing input: {e}")
         #if no valid image path is provided
         elif  valid_image == False:
             reply = "invalid image path"
@@ -243,9 +252,11 @@ def image_multi_class(
         except json.JSONDecodeError:
             normalized_data_list.append(pd.DataFrame({"1": ["e"]}))
     normalized_data = pd.concat(normalized_data_list, ignore_index=True)
     categorized_data = pd.DataFrame({
-        'image_input': image_files,
+        'image_input': (
+            image_files.reset_index(drop=True) if isinstance(image_files, (pd.DataFrame, pd.Series))
+            else pd.Series(image_files)
+        ),
         'link1': pd.Series(link1).reset_index(drop=True),
         'json': pd.Series(extracted_jsons).reset_index(drop=True)
     })
@@ -436,8 +447,11 @@ def image_score_drawing(
                 reply = response_obj.choices[0].message.content
                 link1.append(reply)
             except Exception as e:
-                print(f"An error occurred: {e}")
-                link1.append(f"Error processing input: {e}")
+                if "model" in str(e).lower():
+                    raise ValueError(f"Invalid OpenAI model '{user_model}': {e}")
+                else:
+                    print("An error occurred: {e}")
+                    link1.append("Error processing input: {e}")
         elif model_source == "Anthropic":
             import anthropic
@@ -452,8 +466,11 @@ def image_score_drawing(
                 reply = message.content[0].text  # Anthropic returns content as list
                 link1.append(reply)
             except Exception as e:
-                print(f"An error occurred: {e}")
-                link1.append(f"Error processing input: {e}")
+                if "model" in str(e).lower():
+                    raise ValueError(f"Invalid OpenAI model '{user_model}': {e}")
+                else:
+                    print("An error occurred: {e}")
+                    link1.append("Error processing input: {e}")
         elif model_source == "Mistral":
             from mistralai import Mistral
@@ -469,8 +486,11 @@ def image_score_drawing(
                 reply = response.choices[0].message.content
                 link1.append(reply)
             except Exception as e:
-                print(f"An error occurred: {e}")
-                link1.append(f"Error processing input: {e}")
+                if "model" in str(e).lower():
+                    raise ValueError(f"Invalid OpenAI model '{user_model}': {e}")
+                else:
+                    print("An error occurred: {e}")
+                    link1.append("Error processing input: {e}")
         #if no valid image path is provided
         elif  valid_image == False:
             reply = "invalid image path"
@@ -531,7 +551,10 @@ def image_score_drawing(
     normalized_data = pd.concat(normalized_data_list, ignore_index=True)
     categorized_data = pd.DataFrame({
-        'image_input': image_files,
+        'image_input': (
+            image_files.reset_index(drop=True) if isinstance(image_files, (pd.DataFrame, pd.Series))
+            else pd.Series(image_files)
+        ),
         'link1': pd.Series(link1).reset_index(drop=True),
         'json': pd.Series(extracted_jsons).reset_index(drop=True)
     })
@@ -567,10 +590,6 @@ def image_features(
     import base64
     from pathlib import Path
-    if save_directory is not None and not os.path.isdir(save_directory):
-    # Directory doesn't exist - raise an exception to halt execution
-        raise FileNotFoundError(f"Directory {save_directory} doesn't exist")
     image_extensions = [
     '*.png', '*.jpg', '*.jpeg',
     '*.gif', '*.webp', '*.svg', '*.svgz', '*.avif', '*.apng',
@@ -595,26 +614,35 @@ def image_features(
     cat_num = len(features_to_extract)
     category_dict = {str(i+1): "0" for i in range(cat_num)}
     example_JSON = json.dumps(category_dict, indent=4)
-    # ensure number of categories is what user wants
-    print("\nThe image features to be extracted are:")
-    for i, cat in enumerate(features_to_extract, 1):
-        print(f"{i}. {cat}")
     link1 = []
     extracted_jsons = []
-    for i, img_path in enumerate(
-        tqdm(image_files, desc="Categorising images"), start=0):
+    for i, img_path in enumerate(tqdm(image_files, desc="Scoring images"), start=0):
+    # Check validity first
         if img_path is None or not os.path.exists(img_path):
             link1.append("Skipped NaN input or invalid path")
             extracted_jsons.append("""{"no_valid_image": 1}""")
             continue  # Skip the rest of the loop iteration
-    # encode this specific image once
-        with open(img_path, "rb") as f:
-            encoded = base64.b64encode(f.read()).decode("utf-8")
-        ext = Path(img_path).suffix.lstrip(".").lower()
-        encoded_image = f"data:image/{ext};base64,{encoded}"
+    # Only open the file if path is valid
+        if os.path.isdir(img_path):
+            encoded = "Not a Valid Image, contains file path"
+        else:
+            try:
+                with open(img_path, "rb") as f:
+                    encoded = base64.b64encode(f.read()).decode("utf-8")
+            except Exception as e:
+                    encoded = f"Error: {str(e)}"
+    # Handle extension safely
+        if encoded.startswith("Error:") or encoded == "Not a Valid Image, contains file path":
+            encoded_image = encoded
+            valid_image = False
+        else:
+            ext = Path(img_path).suffix.lstrip(".").lower()
+            encoded_image = f"data:image/{ext};base64,{encoded}"
+            valid_image = True
         if model_source == "OpenAI" or model_source == "Mistral":
             prompt = [
@@ -692,8 +720,11 @@ def image_features(
                 reply = response_obj.choices[0].message.content
                 link1.append(reply)
             except Exception as e:
-                print(f"An error occurred: {e}")
-                link1.append(f"Error processing input: {e}")
+                if "model" in str(e).lower():
+                    raise ValueError(f"Invalid OpenAI model '{user_model}': {e}")
+                else:
+                    print("An error occurred: {e}")
+                    link1.append("Error processing input: {e}")
         elif model_source == "Perplexity":
             from openai import OpenAI
@@ -707,8 +738,12 @@ def image_features(
                 reply = response_obj.choices[0].message.content
                 link1.append(reply)
             except Exception as e:
-                print(f"An error occurred: {e}")
-                link1.append(f"Error processing input: {e}")
+                if "model" in str(e).lower():
+                    raise ValueError(f"Invalid OpenAI model '{user_model}': {e}")
+                else:
+                    print("An error occurred: {e}")
+                    link1.append("Error processing input: {e}")
         elif model_source == "Anthropic":
             import anthropic
             client = anthropic.Anthropic(api_key=api_key)
@@ -722,8 +757,12 @@ def image_features(
                 reply = message.content[0].text  # Anthropic returns content as list
                 link1.append(reply)
             except Exception as e:
-                print(f"An error occurred: {e}")
-                link1.append(f"Error processing input: {e}")
+                if "model" in str(e).lower():
+                    raise ValueError(f"Invalid OpenAI model '{user_model}': {e}")
+                else:
+                    print("An error occurred: {e}")
+                    link1.append("Error processing input: {e}")
         elif model_source == "Mistral":
             from mistralai import Mistral
             client = Mistral(api_key=api_key)
@@ -738,8 +777,12 @@ def image_features(
                 reply = response.choices[0].message.content
                 link1.append(reply)
             except Exception as e:
-                print(f"An error occurred: {e}")
-                link1.append(f"Error processing input: {e}")
+                if "model" in str(e).lower():
+                    raise ValueError(f"Invalid OpenAI model '{user_model}': {e}")
+                else:
+                    print("An error occurred: {e}")
+                    link1.append("Error processing input: {e}")
         elif  valid_image == False:
             print("Skipped NaN input or invalid path")
             reply = None
@@ -797,7 +840,10 @@ def image_features(
     normalized_data = pd.concat(normalized_data_list, ignore_index=True)
     categorized_data = pd.DataFrame({
-        'image_input': image_files,
+        'image_input': (
+            image_files.reset_index(drop=True) if isinstance(image_files, (pd.DataFrame, pd.Series))
+            else pd.Series(image_files)
+        ),
         'link1': pd.Series(link1).reset_index(drop=True),
         'json': pd.Series(extracted_jsons).reset_index(drop=True)
     })

{cat_llm-0.0.42 → cat_llm-0.0.50}/src/catllm/text_functions.py RENAMED Viewed

@@ -373,20 +373,14 @@ Provide your work in JSON format where the number belonging to each category is
         except json.JSONDecodeError:
             normalized_data_list.append(pd.DataFrame({"1": ["e"]}))
     normalized_data = pd.concat(normalized_data_list, ignore_index=True)
     categorized_data = pd.DataFrame({
-        'survey_response': survey_input.reset_index(drop=True),
+        'image_input': (
+            survey_input.reset_index(drop=True) if isinstance(survey_input, (pd.DataFrame, pd.Series))
+            else pd.Series(survey_input)
+        ),
         'link1': pd.Series(link1).reset_index(drop=True),
         'json': pd.Series(extracted_jsons).reset_index(drop=True)
     })
     categorized_data = pd.concat([categorized_data, normalized_data], axis=1)
-    if columns != "numbered": #if user wants text columns
-        categorized_data.columns = list(categorized_data.columns[:3]) + categories[:len(categorized_data.columns) - 3]
-    if to_csv:
-        if save_directory is None:
-            save_directory = os.getcwd()
-        categorized_data.to_csv(os.path.join(save_directory, filename), index=False)
     return categorized_data