cat-llm 0.0.43__tar.gz → 0.0.51__tar.gz

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: cat-llm
-Version: 0.0.43
+Version: 0.0.51
 Summary: A tool for categorizing text data and images using LLMs and vision models
 Project-URL: Documentation, https://github.com/chrissoria/cat-llm#readme
 Project-URL: Issues, https://github.com/chrissoria/cat-llm/issues
@@ -374,7 +374,10 @@ def cerad_drawn_score(
     normalized_data = pd.concat(normalized_data_list, ignore_index=True)
 
     categorized_data = pd.DataFrame({
-        'image_input': image_files,
+        'image_input': (
+            image_files.reset_index(drop=True) if isinstance(image_files, (pd.DataFrame, pd.Series))
+            else pd.Series(image_files)
+        ),
         'link1': pd.Series(link1).reset_index(drop=True),
         'json': pd.Series(extracted_jsons).reset_index(drop=True)
     })
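
The same guard recurs throughout this release: here in cerad_drawn_score and again below in image_multi_class, image_score_drawing, image_features, and the text-categorization function, the input column is coerced to a clean zero-based index before the other columns are attached. A minimal standalone sketch (illustrative data, not from the package) of the misalignment the old code allowed:

import pandas as pd

# e.g. a Series that kept gaps in its index after an upstream dropna()
image_files = pd.Series(["a.png", "b.png"], index=[5, 9])
links = ["url1", "url2"]

# Old behavior: mixing the [5, 9]-indexed Series with 0-based Series inside
# pd.DataFrame(...) aligns on the index union, yielding 4 NaN-padded rows.
misaligned = pd.DataFrame({
    "image_input": image_files,
    "link1": pd.Series(links),
})
print(len(misaligned))  # 4

# New behavior, mirroring the diff: reset pandas indexes, wrap plain lists.
normalized = (
    image_files.reset_index(drop=True)
    if isinstance(image_files, (pd.DataFrame, pd.Series))
    else pd.Series(image_files)
)
aligned = pd.DataFrame({
    "image_input": normalized,
    "link1": pd.Series(links).reset_index(drop=True),
})
print(len(aligned))  # 2
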
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: 2025-present Christopher Soria <chrissoria@berkeley.edu>
 #
 # SPDX-License-Identifier: MIT
-__version__ = "0.0.43"
+__version__ = "0.0.51"
 __author__ = "Chris Soria"
 __email__ = "chrissoria@berkeley.edu"
 __title__ = "cat-llm"
@@ -13,4 +13,5 @@ from .__about__ import (
 
 from .text_functions import *
 from .CERAD_functions import *
-from .image_functions import *
+from .image_functions import *
+from .build_web_research import *
@@ -0,0 +1,169 @@
+#build dataset classification
+def build_web_research_dataset(
+    search_question,
+    search_input,
+    api_key,
+    answer_format = "concise",
+    additional_instructions = "",
+    categories = ['Answer','URL'],
+    user_model="claude-3-7-sonnet-20250219",
+    creativity=0,
+    safety=False,
+    filename="categorized_data.csv",
+    save_directory=None,
+    model_source="Anthropic",
+    time_delay=5
+):
+    import os
+    import json
+    import pandas as pd
+    import regex
+    from tqdm import tqdm
+    import time
+
+    categories_str = "\n".join(f"{i + 1}. {cat}" for i, cat in enumerate(categories))
+    print(categories_str)
+    cat_num = len(categories)
+    category_dict = {str(i+1): "0" for i in range(cat_num)}
+    example_JSON = json.dumps(category_dict, indent=4)
+
+    # ensure number of categories is what user wants
+    #print("\nThe information to be extracted:")
+    #for i, cat in enumerate(categories, 1):
+        #print(f"{i}. {cat}")
+
+    link1 = []
+    extracted_jsons = []
+
+    for idx, item in enumerate(tqdm(search_input, desc="Building dataset")):
+        if idx == 0: # delay the first item just to be safe
+            time.sleep(time_delay)
+        reply = None
+
+        if pd.isna(item):
+            link1.append("Skipped NaN input")
+            default_json = example_JSON
+            extracted_jsons.append(default_json)
+            #print(f"Skipped NaN input.")
+        else:
+            prompt = f"""<role>You are a research assistant specializing in finding current, factual information.</role>
+
+<task>Find information about {item}'s {search_question}</task>
+
+<rules>
+- Search for the most current and authoritative information available
+- Provide your answer as {answer_format}
+- Prioritize official sources when possible
+- If information is not found, state "Information not found"
+- Include exactly one source URL where you found the information
+- Do not include any explanatory text or commentary beyond the JSON
+{additional_instructions}
+</rules>
+
+<format>
+Return your response as valid JSON with this exact structure:
+{{
+    "answer": "Your factual answer or 'Information not found'",
+    "url": "Source URL or 'No source available'"
+}}
+</format>"""
+            #print(prompt)
+            if model_source == "Anthropic":
+                import anthropic
+                client = anthropic.Anthropic(api_key=api_key)
+                try:
+                    message = client.messages.create(
+                        model=user_model,
+                        max_tokens=1024,
+                        temperature=creativity,
+                        messages=[{"role": "user", "content": prompt}],
+                        tools=[{
+                            "type": "web_search_20250305",
+                            "name": "web_search"
+                        }]
+                    )
+                    reply = " ".join(
+                        block.text
+                        for block in message.content
+                        if getattr(block, "type", "") == "text"
+                    ).strip()
+                    link1.append(reply)
+                    time.sleep(time_delay)
+                    print(reply)
+
+                except Exception as e:
+                    print(f"An error occurred: {e}")
+                    link1.append(f"Error processing input: {e}")
+                    time.sleep(time_delay)
+            else:
+                raise ValueError("Unknown source! Currently this function only supports 'Anthropic' as model_source.")
+        # in situation that no JSON is found
+        if reply is not None:
+            extracted_json = regex.findall(r'\{(?:[^{}]|(?R))*\}', reply, regex.DOTALL)
+            if extracted_json:
+                raw_json = extracted_json[0].strip() # Only strip leading/trailing whitespace
+                try:
+                    # Parse to validate JSON structure
+                    parsed_obj = json.loads(raw_json)
+                    # Re-serialize for consistent formatting (optional)
+                    cleaned_json = json.dumps(parsed_obj)
+                    extracted_jsons.append(cleaned_json)
+                except json.JSONDecodeError as e:
+                    print(f"JSON parsing error: {e}")
+                    # Fallback to raw extraction if parsing fails
+                    extracted_jsons.append(raw_json)
+            else:
+                # Use consistent schema for errors
+                error_message = json.dumps({"answer": "e", "url": "e"})
+                extracted_jsons.append(error_message)
+                print(error_message)
+        else:
+            # Handle None reply case
+            error_message = json.dumps({"answer": "e", "url": "e"})
+            extracted_jsons.append(error_message)
+            #print(error_message)
+
+        # --- Safety Save ---
+        if safety:
+            # Save progress so far
+            temp_df = pd.DataFrame({
+                'survey_response': search_input[:idx+1],
+                'link1': link1,
+                'json': extracted_jsons
+            })
+            # Normalize processed jsons so far
+            normalized_data_list = []
+            for json_str in extracted_jsons:
+                try:
+                    parsed_obj = json.loads(json_str)
+                    normalized_data_list.append(pd.json_normalize(parsed_obj))
+                except json.JSONDecodeError:
+                    normalized_data_list.append(pd.DataFrame({"1": ["e"]}))
+            normalized_data = pd.concat(normalized_data_list, ignore_index=True)
+            temp_df = pd.concat([temp_df, normalized_data], axis=1)
+            # Save to CSV
+            if save_directory is None:
+                save_directory = os.getcwd()
+            temp_df.to_csv(os.path.join(save_directory, filename), index=False)
+
+    # --- Final DataFrame ---
+    normalized_data_list = []
+    for json_str in extracted_jsons:
+        try:
+            parsed_obj = json.loads(json_str)
+            normalized_data_list.append(pd.json_normalize(parsed_obj))
+        except json.JSONDecodeError:
+            normalized_data_list.append(pd.DataFrame({"1": ["e"]}))
+    normalized_data = pd.concat(normalized_data_list, ignore_index=True)
+
+    categorized_data = pd.DataFrame({
+        'survey_response': (
+            search_input.reset_index(drop=True) if isinstance(search_input, (pd.DataFrame, pd.Series))
+            else pd.Series(search_input)
+        ),
+        'link1': pd.Series(link1).reset_index(drop=True),
+        'json': pd.Series(extracted_jsons).reset_index(drop=True)
+    })
+    categorized_data = pd.concat([categorized_data, normalized_data], axis=1)
+
+    return categorized_data
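
The centerpiece of 0.0.51 is this new build_web_research.py module, which loops over a list of entities, asks Claude (via the Anthropic web-search tool) one question per entity, and collects the JSON answers into a DataFrame. A hypothetical invocation, assuming the package imports as catllm and with placeholder argument values:

import pandas as pd
from catllm import build_web_research_dataset  # import name assumed

entities = pd.Series(["University of California, Berkeley", "Stanford University"])

df = build_web_research_dataset(
    search_question="current number of enrolled students",  # illustrative
    search_input=entities,
    api_key="sk-ant-...",  # placeholder Anthropic key
    safety=True,           # checkpoint progress to CSV after each item
    time_delay=5,          # seconds to sleep between web-search calls
)
# Expected columns: 'survey_response', 'link1' (raw model reply), 'json',
# plus the normalized 'answer' and 'url' fields extracted from the JSON.
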
@@ -252,9 +252,11 @@ def image_multi_class(
         except json.JSONDecodeError:
             normalized_data_list.append(pd.DataFrame({"1": ["e"]}))
     normalized_data = pd.concat(normalized_data_list, ignore_index=True)
-
     categorized_data = pd.DataFrame({
-        'image_input': image_files,
+        'image_input': (
+            image_files.reset_index(drop=True) if isinstance(image_files, (pd.DataFrame, pd.Series))
+            else pd.Series(image_files)
+        ),
         'link1': pd.Series(link1).reset_index(drop=True),
         'json': pd.Series(extracted_jsons).reset_index(drop=True)
     })
@@ -549,7 +551,10 @@ def image_score_drawing(
     normalized_data = pd.concat(normalized_data_list, ignore_index=True)
 
     categorized_data = pd.DataFrame({
-        'image_input': image_files,
+        'image_input': (
+            image_files.reset_index(drop=True) if isinstance(image_files, (pd.DataFrame, pd.Series))
+            else pd.Series(image_files)
+        ),
         'link1': pd.Series(link1).reset_index(drop=True),
         'json': pd.Series(extracted_jsons).reset_index(drop=True)
     })
@@ -835,7 +840,10 @@ def image_features(
     normalized_data = pd.concat(normalized_data_list, ignore_index=True)
 
     categorized_data = pd.DataFrame({
-        'image_input': image_files,
+        'image_input': (
+            image_files.reset_index(drop=True) if isinstance(image_files, (pd.DataFrame, pd.Series))
+            else pd.Series(image_files)
+        ),
         'link1': pd.Series(link1).reset_index(drop=True),
         'json': pd.Series(extracted_jsons).reset_index(drop=True)
     })
@@ -373,20 +373,14 @@ Provide your work in JSON format where the number belonging to each category is
         except json.JSONDecodeError:
             normalized_data_list.append(pd.DataFrame({"1": ["e"]}))
     normalized_data = pd.concat(normalized_data_list, ignore_index=True)
-
     categorized_data = pd.DataFrame({
-        'survey_response': survey_input.reset_index(drop=True),
+        'image_input': (
+            survey_input.reset_index(drop=True) if isinstance(survey_input, (pd.DataFrame, pd.Series))
+            else pd.Series(survey_input)
+        ),
         'link1': pd.Series(link1).reset_index(drop=True),
         'json': pd.Series(extracted_jsons).reset_index(drop=True)
     })
     categorized_data = pd.concat([categorized_data, normalized_data], axis=1)
 
-    if columns != "numbered": #if user wants text columns
-        categorized_data.columns = list(categorized_data.columns[:3]) + categories[:len(categorized_data.columns) - 3]
-
-    if to_csv:
-        if save_directory is None:
-            save_directory = os.getcwd()
-        categorized_data.to_csv(os.path.join(save_directory, filename), index=False)
-
     return categorized_data
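
One detail shared by the new module and the existing functions: the first JSON object is pulled out of the model reply with a recursive pattern, which requires the third-party regex package (the stdlib re module has no (?R)). A quick standalone check of that pattern:

import json
import regex  # third-party: pip install regex

reply = 'Sure! {"answer": "Information not found", "url": "No source available"} Hope that helps.'
matches = regex.findall(r'\{(?:[^{}]|(?R))*\}', reply, regex.DOTALL)
print(json.loads(matches[0]))
# {'answer': 'Information not found', 'url': 'No source available'}
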