cat-llm 0.0.24__tar.gz → 0.0.26__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cat_llm-0.0.24 → cat_llm-0.0.26}/PKG-INFO +1 -1
- {cat_llm-0.0.24 → cat_llm-0.0.26}/src/catllm/CERAD_functions.py +30 -9
- {cat_llm-0.0.24 → cat_llm-0.0.26}/src/catllm/__about__.py +1 -1
- {cat_llm-0.0.24 → cat_llm-0.0.26}/src/catllm/__init__.py +2 -1
- cat_llm-0.0.26/src/catllm/cat_llm.py +395 -0
- cat_llm-0.0.26/src/catllm/image_functions.py +689 -0
- cat_llm-0.0.24/src/catllm/cat_llm.py +0 -1403
- {cat_llm-0.0.24 → cat_llm-0.0.26}/LICENSE +0 -0
- {cat_llm-0.0.24 → cat_llm-0.0.26}/README.md +0 -0
- {cat_llm-0.0.24 → cat_llm-0.0.26}/pyproject.toml +0 -0
{cat_llm-0.0.24 → cat_llm-0.0.26}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: cat-llm
-Version: 0.0.24
+Version: 0.0.26
 Summary: A tool for categorizing text data and images using LLMs and vision models
 Project-URL: Documentation, https://github.com/chrissoria/cat-llm#readme
 Project-URL: Issues, https://github.com/chrissoria/cat-llm/issues

{cat_llm-0.0.24 → cat_llm-0.0.26}/src/catllm/CERAD_functions.py
@@ -1,10 +1,31 @@
 # a function specifically for CERAD Constructional Praxis test
 # specifically for pictures of drawings of shapes like circles, diamonds, rectangles, and cubes
+
+"""
+Key features:
+1. Shape-specific scoring: the function can handle different shapes (circle, diamond, rectangles, cube) and provides tailored categories for each shape.
+2. Image input handling: it accepts image inputs either as file paths or a list of images.
+3. Model flexibility: the function allows users to specify different models (OpenAI, Anthropic, Perplexity, Mistral) for image analysis.
+4. Safety and progress saving: it can save progress to a CSV file, which is useful for long-running tasks or when processing many images.
+
+Areas for improvement:
+1. Prompt refinement: adjusting the prompt so that it produces a more accurate score.
+2. Image preprocessing: adjusting the images so that they are easier for the models to analyze.
+3. Model selection: using a different model that is better suited for image analysis.
+4. Model ensembling: using multiple models and combining their scores to produce a more accurate score.
+5. Prompt ensembling: using multiple prompts and combining their scores to produce a more accurate score.
+6. Post-processing: adjusting the way scores are calculated after the model has output its assessment.
+7. Efficiency: optimizing the code to run faster, cheaper, and more efficiently.
+8. Drawn-format versatility: making the function more versatile to handle different scenarios, such as shapes drawn on tablets.
+9. Image input flexibility: allowing the function to accept images in various formats, such as URLs or raw image data.
+10. Test variety: expanding or adding functions to score more tests relevant to cognitive assessment, such as the MMSE.
+11. Error handling: improving error handling to better manage unexpected inputs or model failures.
+"""
 def cerad_drawn_score(
     shape,
     image_input,
     api_key,
-    user_model="gpt-4o
+    user_model="gpt-4o",
     creativity=0,
     reference_in_image=False,
     provide_reference=False,
@@ -25,8 +46,8 @@ def cerad_drawn_score(
 
     if shape == "circle":
         categories = ["The image contains a drawing that clearly represents a circle",
-                      "The
-                      "The drawing resembles a circle",
+                      "The image does NOT contain any drawing that resembles a circle",
+                      "The image contains a drawing that resembles a circle",
                       "The circle is closed",
                       "The circle is almost closed",
                       "The circle is circular",
@@ -43,12 +64,12 @@ def cerad_drawn_score(
                       "None of the above descriptions apply"]
     elif shape == "rectangles" or shape == "overlapping rectangles":
         categories = ["The image contains a drawing that clearly represents overlapping rectangles",
-                      "
-                      "
-                      "
-                      "
-                      "The rectangles are overlapping",
-                      "The rectangles overlap
+                      "The image does NOT contain any drawing that resembles overlapping rectangles",
+                      "The image contains a drawing that resembles overlapping rectangles",
+                      "If rectangle 1 is present it has 4 sides",
+                      "If rectangle 2 is present it has 4 sides",
+                      "The drawn rectangles are overlapping",
+                      "The drawn rectangles overlap to form a longer vertical rectangle with top and bottom sticking out",
                       "None of the above descriptions apply"]
     elif shape == "cube":
         categories = ["The image contains a drawing that clearly represents a cube (3D box shape)",
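
A minimal usage sketch for the updated signature. The parameter names come from the diff above; the import path, file path, and API key are illustrative assumptions, not part of the release:

from catllm.CERAD_functions import cerad_drawn_score  # assumed import path

# Score a drawing of a circle against the CERAD category checklist.
# "drawing.png" and the key are placeholders.
result = cerad_drawn_score(
    shape="circle",
    image_input="drawing.png",  # per the docstring, file paths or a list of images are accepted
    api_key="sk-...",
    user_model="gpt-4o",        # new default introduced in this release
    creativity=0,
)
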
cat_llm-0.0.26/src/catllm/cat_llm.py (new file)
@@ -0,0 +1,395 @@
+#extract categories from corpus
+def explore_corpus(
+    survey_question,
+    survey_input,
+    api_key,
+    research_question=None,
+    specificity="broad",
+    cat_num=10,
+    divisions=5,
+    user_model="gpt-4o-2024-11-20",
+    creativity=0,
+    filename="corpus_exploration.csv",
+    model_source="OpenAI"
+):
+    import os
+    import pandas as pd
+    import random
+    from openai import OpenAI, BadRequestError
+    from tqdm import tqdm
+
+    print(f"Exploring class for question: '{survey_question}'.\n {cat_num * divisions} unique categories to be extracted.")
+    print()
+
+    chunk_size = round(max(1, len(survey_input) / divisions), 0)
+    chunk_size = int(chunk_size)
+
+    if chunk_size < (cat_num / 2):
+        raise ValueError(f"Cannot extract {cat_num} {specificity} categories from chunks of only {chunk_size} responses. \n"
+                         f"Choose one solution: \n"
+                         f"(1) Reduce 'divisions' parameter (currently {divisions}) to create larger chunks, or \n"
+                         f"(2) Reduce 'cat_num' parameter (currently {cat_num}) to extract fewer categories per chunk.")
+
+    random_chunks = []
+    for i in range(divisions):
+        chunk = survey_input.sample(n=chunk_size).tolist()
+        random_chunks.append(chunk)
+
+    responses = []
+    responses_list = []
+
+    for i in tqdm(range(divisions), desc="Processing chunks"):
+        survey_participant_chunks = '; '.join(random_chunks[i])
+        prompt = f"""Identify {cat_num} {specificity} categories of responses to the question "{survey_question}" in the following list of responses. \
+Responses are each separated by a semicolon. \
+Responses are contained within triple backticks here: ```{survey_participant_chunks}``` \
+Number your categories from 1 through {cat_num} and be concise with the category labels and provide no description of the categories."""
+
+        if model_source == "OpenAI":
+            client = OpenAI(api_key=api_key)
+            try:
+                response_obj = client.chat.completions.create(
+                    model=user_model,
+                    messages=[
+                        {'role': 'system', 'content': f"""You are a helpful assistant that extracts categories from survey responses. \
+The specific task is to identify {specificity} categories of responses to a survey question. \
+The research question is: {research_question}""" if research_question else "You are a helpful assistant."},
+                        {'role': 'user', 'content': prompt}
+                    ],
+                    temperature=creativity
+                )
+                reply = response_obj.choices[0].message.content
+                responses.append(reply)
+            except BadRequestError as e:
+                if "context_length_exceeded" in str(e) or "maximum context length" in str(e):
+                    error_msg = (f"Token limit exceeded for model {user_model}. "
+                                 f"Try increasing the 'divisions' parameter to create smaller chunks.")
+                    raise ValueError(error_msg)
+                else:
+                    print(f"OpenAI API error: {e}")
+            except Exception as e:
+                print(f"An error occurred: {e}")
+        else:
+            raise ValueError(f"Unsupported model_source: {model_source}")
+
+        # Extract just the text as a list
+        items = []
+        for line in responses[i].split('\n'):
+            if '. ' in line:
+                try:
+                    items.append(line.split('. ', 1)[1])
+                except IndexError:
+                    pass
+
+        responses_list.append(items)
+
+    flat_list = [item.lower() for sublist in responses_list for item in sublist]
+
+    #convert flat_list to a df
+    df = pd.DataFrame(flat_list, columns=['Category'])
+    counts = pd.Series(flat_list).value_counts()  # Use original list before conversion
+    df['counts'] = df['Category'].map(counts)
+    df = df.sort_values(by='counts', ascending=False).reset_index(drop=True)
+    df = df.drop_duplicates(subset='Category', keep='first').reset_index(drop=True)
+
+    if filename is not None:
+        df.to_csv(filename, index=False)
+
+    return df
+
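
A minimal usage sketch for explore_corpus, assuming it is importable from the package and that survey_input is a pandas Series (the function calls .sample() and .tolist() on it). The file and column names are hypothetical. Note the guard above: with 200 responses and divisions=5, chunk_size is 40, which comfortably exceeds cat_num/2 = 5.

import pandas as pd
from catllm import explore_corpus  # assumed import path

df = pd.read_csv("responses.csv")  # hypothetical survey file
cats = explore_corpus(
    survey_question="What worries you most about aging?",
    survey_input=df["response"],   # pandas Series of free-text answers
    api_key="sk-...",
    cat_num=10,
    divisions=5,
)
print(cats.head())  # 'Category' and 'counts' columns, sorted by frequency
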
+#extract top categories from corpus
+def explore_common_categories(
+    survey_question,
+    survey_input,
+    api_key,
+    top_n=10,
+    cat_num=10,
+    divisions=5,
+    user_model="gpt-4o-2024-11-20",
+    creativity=0,
+    specificity="broad",
+    research_question=None,
+    filename=None,
+    model_source="OpenAI"
+):
+    import os
+    import pandas as pd
+    import random
+    from openai import OpenAI, BadRequestError
+    from tqdm import tqdm
+
+    print(f"Exploring class for question: '{survey_question}'.\n {cat_num * divisions} unique categories to be extracted and {top_n} to be identified as the most common.")
+    print()
+
+    chunk_size = round(max(1, len(survey_input) / divisions), 0)
+    chunk_size = int(chunk_size)
+
+    if chunk_size < (cat_num / 2):
+        raise ValueError(f"Cannot extract {cat_num} categories from chunks of only {chunk_size} responses. \n"
+                         f"Choose one solution: \n"
+                         f"(1) Reduce 'divisions' parameter (currently {divisions}) to create larger chunks, or \n"
+                         f"(2) Reduce 'cat_num' parameter (currently {cat_num}) to extract fewer categories per chunk.")
+
+    random_chunks = []
+    for i in range(divisions):
+        chunk = survey_input.sample(n=chunk_size).tolist()
+        random_chunks.append(chunk)
+
+    responses = []
+    responses_list = []
+
+    for i in tqdm(range(divisions), desc="Processing chunks"):
+        survey_participant_chunks = '; '.join(random_chunks[i])
+        prompt = f"""Identify {cat_num} {specificity} categories of responses to the question "{survey_question}" in the following list of responses. \
+Responses are each separated by a semicolon. \
+Responses are contained within triple backticks here: ```{survey_participant_chunks}``` \
+Number your categories from 1 through {cat_num} and be concise with the category labels and provide no description of the categories."""
+
+        if model_source == "OpenAI":
+            client = OpenAI(api_key=api_key)
+            try:
+                response_obj = client.chat.completions.create(
+                    model=user_model,
+                    messages=[
+                        {'role': 'system', 'content': f"""You are a helpful assistant that extracts categories from survey responses. \
+The specific task is to identify {specificity} categories of responses to a survey question. \
+The research question is: {research_question}""" if research_question else "You are a helpful assistant."},
+                        {'role': 'user', 'content': prompt}
+                    ],
+                    temperature=creativity
+                )
+                reply = response_obj.choices[0].message.content
+                responses.append(reply)
+            except BadRequestError as e:
+                if "context_length_exceeded" in str(e) or "maximum context length" in str(e):
+                    error_msg = (f"Token limit exceeded for model {user_model}. "
+                                 f"Try increasing the 'divisions' parameter to create smaller chunks.")
+                    raise ValueError(error_msg)
+                else:
+                    print(f"OpenAI API error: {e}")
+            except Exception as e:
+                print(f"An error occurred: {e}")
+        else:
+            raise ValueError(f"Unsupported model_source: {model_source}")
+
+        # Extract just the text as a list
+        items = []
+        for line in responses[i].split('\n'):
+            if '. ' in line:
+                try:
+                    items.append(line.split('. ', 1)[1])
+                except IndexError:
+                    pass
+
+        responses_list.append(items)
+
+    flat_list = [item.lower() for sublist in responses_list for item in sublist]
+
+    #convert flat_list to a df
+    df = pd.DataFrame(flat_list, columns=['Category'])
+    counts = pd.Series(flat_list).value_counts()  # Use original list before conversion
+    df['counts'] = df['Category'].map(counts)
+    df = df.sort_values(by='counts', ascending=False).reset_index(drop=True)
+    df = df.drop_duplicates(subset='Category', keep='first').reset_index(drop=True)
+
+    second_prompt = f"""From this list of categories, extract the top {top_n} most common categories. \
+The categories are contained within triple backticks here: ```{df['Category'].tolist()}``` \
+Return the top {top_n} categories as a numbered list sorted from the most to least common and keep the categories {specificity}, with no additional text or explanation."""
+
+    if model_source == "OpenAI":
+        client = OpenAI(api_key=api_key)
+        response_obj = client.chat.completions.create(
+            model=user_model,
+            messages=[{'role': 'user', 'content': second_prompt}],
+            temperature=creativity
+        )
+        top_categories = response_obj.choices[0].message.content
+        print(top_categories)
+
+    top_categories_final = []
+    for line in top_categories.split('\n'):
+        if '. ' in line:
+            try:
+                top_categories_final.append(line.split('. ', 1)[1])
+            except IndexError:
+                pass
+
+    return top_categories_final
+
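
A companion sketch for explore_common_categories, which repeats the chunked extraction and then asks the model to reduce the pooled labels to the top_n most frequent. It returns a plain Python list rather than a DataFrame (same hypothetical inputs as the sketch above):

top10 = explore_common_categories(
    survey_question="What worries you most about aging?",
    survey_input=df["response"],  # same pandas Series as above
    api_key="sk-...",
    top_n=10,
    cat_num=10,
    divisions=5,
)
print(top10)  # e.g. a list of ten category labels
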
+#multi-class text classification
+def extract_multi_class(
+    survey_question,
+    survey_input,
+    categories,
+    api_key,
+    columns="numbered",
+    user_model="gpt-4o-2024-11-20",
+    creativity=0,
+    to_csv=False,
+    safety=False,
+    filename="categorized_data.csv",
+    save_directory=None,
+    model_source="OpenAI"
+):
+    import os
+    import json
+    import pandas as pd
+    import regex
+    from tqdm import tqdm
+
+    categories_str = "\n".join(f"{i + 1}. {cat}" for i, cat in enumerate(categories))
+    cat_num = len(categories)
+    category_dict = {str(i+1): "0" for i in range(cat_num)}
+    example_JSON = json.dumps(category_dict, indent=4)
+
+    # ensure number of categories is what user wants
+    print("\nThe categories you entered:")
+    for i, cat in enumerate(categories, 1):
+        print(f"{i}. {cat}")
+
+    link1 = []
+    extracted_jsons = []
+
+    for idx, response in enumerate(tqdm(survey_input, desc="Categorizing responses")):
+        reply = None
+
+        if pd.isna(response):
+            link1.append("Skipped NaN input")
+            default_json = example_JSON
+            extracted_jsons.append(default_json)
+            #print(f"Skipped NaN input.")
+        else:
+            prompt = f"""A respondent was asked: {survey_question}. \
+Categorize this survey response "{response}" into the following categories that apply: \
+{categories_str} \
+Provide your work in JSON format where the number belonging to each category is the key and a 1 if the category is present and a 0 if it is not present as key values."""
+            #print(prompt)
+            if model_source == ("OpenAI"):
+                from openai import OpenAI
+                client = OpenAI(api_key=api_key)
+                try:
+                    response_obj = client.chat.completions.create(
+                        model=user_model,
+                        messages=[{'role': 'user', 'content': prompt}],
+                        temperature=creativity
+                    )
+                    reply = response_obj.choices[0].message.content
+                    link1.append(reply)
+                except Exception as e:
+                    print(f"An error occurred: {e}")
+                    link1.append(f"Error processing input: {e}")
+            elif model_source == "Perplexity":
+                from openai import OpenAI
+                client = OpenAI(api_key=api_key, base_url="https://api.perplexity.ai")
+                try:
+                    response_obj = client.chat.completions.create(
+                        model=user_model,
+                        messages=[{'role': 'user', 'content': prompt}],
+                        temperature=creativity
+                    )
+                    reply = response_obj.choices[0].message.content
+                    link1.append(reply)
+                except Exception as e:
+                    print(f"An error occurred: {e}")
+                    link1.append(f"Error processing input: {e}")
+            elif model_source == "Anthropic":
+                import anthropic
+                client = anthropic.Anthropic(api_key=api_key)
+                try:
+                    message = client.messages.create(
+                        model=user_model,
+                        max_tokens=1024,
+                        temperature=creativity,
+                        messages=[{"role": "user", "content": prompt}]
+                    )
+                    reply = message.content[0].text  # Anthropic returns content as list
+                    link1.append(reply)
+                except Exception as e:
+                    print(f"An error occurred: {e}")
+                    link1.append(f"Error processing input: {e}")
+            elif model_source == "Mistral":
+                from mistralai import Mistral
+                client = Mistral(api_key=api_key)
+                try:
+                    response = client.chat.complete(
+                        model=user_model,
+                        messages=[
+                            {'role': 'user', 'content': prompt}
+                        ],
+                        temperature=creativity
+                    )
+                    reply = response.choices[0].message.content
+                    link1.append(reply)
+                except Exception as e:
+                    print(f"An error occurred: {e}")
+                    link1.append(f"Error processing input: {e}")
+            else:
+                raise ValueError("Unknown source! Choose from OpenAI, Anthropic, Perplexity, or Mistral")
+            # in situation that no JSON is found
+            if reply is not None:
+                extracted_json = regex.findall(r'\{(?:[^{}]|(?R))*\}', reply, regex.DOTALL)
+                if extracted_json:
+                    cleaned_json = extracted_json[0].replace('[', '').replace(']', '').replace('\n', '').replace(" ", '').replace(" ", '')
+                    extracted_jsons.append(cleaned_json)
+                    #print(cleaned_json)
+                else:
+                    error_message = """{"1":"e"}"""
+                    extracted_jsons.append(error_message)
+                    print(error_message)
+            else:
+                error_message = """{"1":"e"}"""
+                extracted_jsons.append(error_message)
+                #print(error_message)
+
+        # --- Safety Save ---
+        if safety:
+            # Save progress so far
+            temp_df = pd.DataFrame({
+                'survey_response': survey_input[:idx+1],
+                'link1': link1,
+                'json': extracted_jsons
+            })
+            # Normalize processed jsons so far
+            normalized_data_list = []
+            for json_str in extracted_jsons:
+                try:
+                    parsed_obj = json.loads(json_str)
+                    normalized_data_list.append(pd.json_normalize(parsed_obj))
+                except json.JSONDecodeError:
+                    normalized_data_list.append(pd.DataFrame({"1": ["e"]}))
+            normalized_data = pd.concat(normalized_data_list, ignore_index=True)
+            temp_df = pd.concat([temp_df, normalized_data], axis=1)
+            # Save to CSV
+            if save_directory is None:
+                save_directory = os.getcwd()
+            temp_df.to_csv(os.path.join(save_directory, filename), index=False)
+
+    # --- Final DataFrame ---
+    normalized_data_list = []
+    for json_str in extracted_jsons:
+        try:
+            parsed_obj = json.loads(json_str)
+            normalized_data_list.append(pd.json_normalize(parsed_obj))
+        except json.JSONDecodeError:
+            normalized_data_list.append(pd.DataFrame({"1": ["e"]}))
+    normalized_data = pd.concat(normalized_data_list, ignore_index=True)
+
+    categorized_data = pd.DataFrame({
+        'survey_response': survey_input.reset_index(drop=True),
+        'link1': pd.Series(link1).reset_index(drop=True),
+        'json': pd.Series(extracted_jsons).reset_index(drop=True)
+    })
+    categorized_data = pd.concat([categorized_data, normalized_data], axis=1)
+
+    if columns != "numbered":  # if user wants text columns
+        categorized_data.columns = list(categorized_data.columns[:3]) + categories[:len(categorized_data.columns) - 3]
+
+    if to_csv:
+        if save_directory is None:
+            save_directory = os.getcwd()
+        categorized_data.to_csv(os.path.join(save_directory, filename), index=False)
+
+    return categorized_data
+
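
Finally, a hedged sketch of extract_multi_class. The function prompts for a JSON object keyed by category number, pulls it out of the reply with the recursive pattern regex.findall(r'\{(?:[^{}]|(?R))*\}', ...) (this requires the third-party regex package; the stdlib re module does not support (?R)), and returns a DataFrame with one 0/1 indicator column per category. The question, categories, and key below are hypothetical:

categorized = extract_multi_class(
    survey_question="What worries you most about aging?",
    survey_input=df["response"],                      # pandas Series of responses
    categories=["health", "finances", "loneliness"],  # your own label set
    api_key="sk-...",
    columns="text",   # any value other than "numbered" renames indicator columns to the category labels
    safety=True,      # re-save progress to categorized_data.csv after every response
)
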