PyPI - cat-llm - Versions diffs - 0.0.74__py3-none-any.whl → 0.0.76__py3-none-any.whl - Mend

cat-llm 0.0.74__py3-none-any.whl → 0.0.76__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

{cat_llm-0.0.74.dist-info → cat_llm-0.0.76.dist-info}/METADATA +1 -1
{cat_llm-0.0.74.dist-info → cat_llm-0.0.76.dist-info}/RECORD +7 -7
catllm/__about__.py +1 -1
catllm/calls/all_calls.py +74 -0
catllm/text_functions.py +128 -28
{cat_llm-0.0.74.dist-info → cat_llm-0.0.76.dist-info}/WHEEL +0 -0
{cat_llm-0.0.74.dist-info → cat_llm-0.0.76.dist-info}/licenses/LICENSE +0 -0

{cat_llm-0.0.74.dist-info → cat_llm-0.0.76.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: cat-llm
-Version: 0.0.74
+Version: 0.0.76
 Summary: A tool for categorizing text data and images using LLMs and vision models
 Project-URL: Documentation, https://github.com/chrissoria/cat-llm#readme
 Project-URL: Issues, https://github.com/chrissoria/cat-llm/issues

{cat_llm-0.0.74.dist-info → cat_llm-0.0.76.dist-info}/RECORD RENAMED Viewed

@@ -1,19 +1,19 @@
 catllm/CERAD_functions.py,sha256=q4HbP5e2Yu8NnZZ-2eX4sImyj6u3i8xWcq0pYU81iis,22676
-catllm/__about__.py,sha256=E0enlOPQDj7XaMZv62lffULZGOUEAqpRIyZ12A6f3zk,430
+catllm/__about__.py,sha256=yXzP4t-1ifCb-n2qXIRLt4j8v0AsNmsQgXS3fMChAzo,430
 catllm/__init__.py,sha256=sf02zp7N0NW0mAQi7eQ4gliWR1EwoqvXkHN2HwwjcTE,372
 catllm/build_web_research.py,sha256=880dfE2bEQb-FrXP-42JoLLtyc9ox_sBULDr38xiTiQ,22655
 catllm/image_functions.py,sha256=8_FftRU285x1HT-AgNkaobefQVD-5q7ZY_t7JFdL3Sg,36177
 catllm/model_reference_list.py,sha256=37pWwMcgnf4biE3BVRluH5oz2P6ccdJJiCVNHodBH8k,2307
-catllm/text_functions.py,sha256=O6wfDh50Xtc0JvQtjWb9L9PgtBP6cjxWBw-PCNmbiaE,33371
+catllm/text_functions.py,sha256=XF6aGuUyihnCKwGnGyLM1PbFQg3fF6nhJ_PoSX2zLaY,36101
 catllm/calls/CoVe.py,sha256=Y9OGJbaeJ3Odwira92cPXUlnm_ADFqvpOSFSNjFzMMU,10847
 catllm/calls/__init__.py,sha256=fWuMwLeSGa6zXJYd4s8IyNblsD62G-1NMUsOKrNIkoI,725
-catllm/calls/all_calls.py,sha256=E25KpZ_MakMDeCpNCOOM8kQvlfex6UMjnGN1wHkA4AI,14356
+catllm/calls/all_calls.py,sha256=AeN1QocOvL3Z36lDkq6bO0LB3ruz6pXyedvdci0YCxQ,16627
 catllm/images/circle.png,sha256=JWujAWAh08-TajAoEr_TAeFNLlfbryOLw6cgIBREBuQ,86202
 catllm/images/cube.png,sha256=nFec3e5bmRe4zrBCJ8QK-HcJLrG7u7dYdKhmdMfacfE,77275
 catllm/images/diamond.png,sha256=rJDZKtsnBGRO8FPA0iHuA8FvHFGi9PkI_DWSFdw6iv0,99568
 catllm/images/overlapping_pentagons.png,sha256=VO5plI6eoVRnjfqinn1nNzsCP2WQhuQy71V0EASouW4,71208
 catllm/images/rectangles.png,sha256=2XM16HO9EYWj2yHgN4bPXaCwPfl7iYQy0tQUGaJX9xg,40692
-cat_llm-0.0.74.dist-info/METADATA,sha256=DYaL_OFgi9MuFpWLd1DHgqVi_osTwK1DJH-E5Q2kaa8,23214
-cat_llm-0.0.74.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-cat_llm-0.0.74.dist-info/licenses/LICENSE,sha256=Vje2sS5WV4TnIwY5uQHrF4qnBAM3YOk1pGpdH0ot-2o,34969
-cat_llm-0.0.74.dist-info/RECORD,,
+cat_llm-0.0.76.dist-info/METADATA,sha256=EGlOhrerEtwgdk98DPhSCSshmOKnhXHw67-25V8wrJs,23214
+cat_llm-0.0.76.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+cat_llm-0.0.76.dist-info/licenses/LICENSE,sha256=Vje2sS5WV4TnIwY5uQHrF4qnBAM3YOk1pGpdH0ot-2o,34969
+cat_llm-0.0.76.dist-info/RECORD,,

catllm/__about__.py CHANGED Viewed

@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: 2025-present Christopher Soria <chrissoria@berkeley.edu>
 #
 # SPDX-License-Identifier: MIT
-__version__ = "0.0.74"
+__version__ = "0.0.76"
 __author__ = "Chris Soria"
 __email__ = "chrissoria@berkeley.edu"
 __title__ = "cat-llm"

catllm/calls/all_calls.py CHANGED Viewed

@@ -431,3 +431,77 @@ def chain_of_verification_mistral(
         print(f"ERROR in Chain of Verification: {str(e)}")
         print("Falling back to initial response.\n")
         return initial_reply
+# openai explore corpus call
+def get_openai_top_n(
+    prompt,
+    user_model,
+    specificity,
+    model_source,
+    api_key,
+    research_question,
+    creativity
+):
+    """
+    Get response from OpenAI API with system message.
+    """
+    from openai import OpenAI
+    base_url = (
+        "https://api.perplexity.ai" if model_source == "perplexity"
+        else "https://router.huggingface.co/v1" if model_source == "huggingface"
+        else None
+    )
+    client = OpenAI(api_key=api_key, base_url=base_url)
+    response_obj = client.chat.completions.create(
+        model=user_model,
+        messages=[
+            {'role': 'system', 'content': f"""You are a helpful assistant that extracts categories from survey responses. \
+                                        The specific task is to identify {specificity} categories of responses to a survey question. \
+             The research question is: {research_question}""" if research_question else "You are a helpful assistant."},
+            {'role': 'user', 'content': prompt}
+        ],
+        **({"temperature": creativity} if creativity is not None else {})
+    )
+    return response_obj.choices[0].message.content
+# anthropic explore corpus call
+def get_anthropic_top_n(
+    prompt,
+    user_model,
+    model_source,
+    specificity,
+    api_key,
+    research_question,
+    creativity
+):
+    """
+    Get response from Anthropic API with system prompt.
+    """
+    import anthropic
+    client = anthropic.Anthropic(api_key=api_key)
+    # build system prompt
+    if research_question:
+        system_content = (f"You are a helpful assistant that extracts categories from survey responses. "
+                        f"The specific task is to identify {specificity} categories of responses to a survey question. "
+                        f"The research question is: {research_question}")
+    else:
+        system_content = "You are a helpful assistant."
+    response_obj = client.messages.create(
+        model=user_model,
+        max_tokens=4096,
+        system=system_content,
+        messages=[
+            {'role': 'user', 'content': prompt}
+        ],
+        **({"temperature": creativity} if creativity is not None else {})
+    )
+    return response_obj.content[0].text

catllm/text_functions.py CHANGED Viewed

@@ -6,7 +6,9 @@ from .calls.all_calls import (
     chain_of_verification_openai,
     chain_of_verification_google,
     chain_of_verification_anthropic,
-    chain_of_verification_mistral
+    chain_of_verification_mistral,
+    get_openai_top_n,
+    get_anthropic_top_n,
 )
@@ -117,9 +119,9 @@ def explore_common_categories(
     survey_question,
     survey_input,
     api_key,
-    top_n=10,
+    top_n=12,
     cat_num=10,
-    divisions=5,
+    divisions=10,
     user_model="gpt-5",
     creativity=None,
     specificity="broad",
@@ -164,20 +166,19 @@ Responses are contained within triple backticks here: ```{survey_participant_chu
 Number your categories from 1 through {cat_num} and be concise with the category labels and provide no description of the categories."""
         if model_source == "openai":
-            client = OpenAI(api_key=api_key)
             try:
-                response_obj = client.chat.completions.create(
-                    model=user_model,
-                    messages=[
-                        {'role': 'system', 'content': f"""You are a helpful assistant that extracts categories from survey responses. \
-                                                    The specific task is to identify {specificity} categories of responses to a survey question. \
-                         The research question is: {research_question}""" if research_question else "You are a helpful assistant."},
-                        {'role': 'user', 'content': prompt}
-                    ],
-                    **({"temperature": creativity} if creativity is not None else {})
+                reply = get_openai_top_n(
+                    prompt=prompt,
+                    user_model=user_model,
+                    specificity=specificity,
+                    api_key=api_key,
+                    model_source=model_source,
+                    research_question=research_question,
+                    creativity=creativity
                 )
-                reply = response_obj.choices[0].message.content
                 responses.append(reply)
             except BadRequestError as e:
                 if "context_length_exceeded" in str(e) or "maximum context length" in str(e):
                     error_msg = (f"Token limit exceeded for model {user_model}. "
@@ -187,6 +188,20 @@ Number your categories from 1 through {cat_num} and be concise with the category
                     print(f"OpenAI API error: {e}")
             except Exception as e:
                 print(f"An error occurred: {e}")
+        elif model_source == "anthropic":
+            reply = get_anthropic_top_n(
+                prompt=prompt,
+                user_model=user_model,
+                specificity=specificity,
+                model_source=model_source,
+                api_key=api_key,
+                research_question=research_question,
+                creativity=creativity
+            )
+            responses.append(reply)
         else:
             raise ValueError(f"Unsupported model_source: {model_source}")
@@ -204,24 +219,87 @@ Number your categories from 1 through {cat_num} and be concise with the category
     flat_list = [item.lower() for sublist in responses_list for item in sublist]
     #convert flat_list to a df
+    def normalize_category(cat):
+        if pd.isna(cat):
+            return cat
+        terms = sorted([term.strip().lower() for term in str(cat).split('/')])
+        return '/'.join(terms)
+    # normalized column
     df = pd.DataFrame(flat_list, columns=['Category'])
-    counts = pd.Series(flat_list).value_counts()  # Use original list before conversion
-    df['counts'] = df['Category'].map(counts)
-    df = df.sort_values(by='counts', ascending=False).reset_index(drop=True)
-    df = df.drop_duplicates(subset='Category', keep='first').reset_index(drop=True)
+    df['normalized'] = df['Category'].apply(normalize_category)
+    # group by normalized, count, and keep most frequent original
+    result = (df.groupby('normalized')
+            .agg(Category=('Category', lambda x: x.value_counts().index[0]),
+                counts=('Category', 'size'))
+            .sort_values('counts', ascending=False)
+            .reset_index(drop=True))
+    df = result
+    second_prompt = f"""You are a data analyst reviewing categorized survey data.
+    Task: From the provided categories, identify and return the top {top_n} CONCEPTUALLY UNIQUE categories.
+    Critical Instructions:
+    1. The categories have already been deduplicated for exact string matches
+    2. However, some categories may still be SEMANTICALLY DUPLICATES (same concept, different wording):
+        - "closer to work" and "commute/proximity to work" mean the same thing
+        - "breakup/household conflict" and "relationship problems" mean the same thing
+    3. When you identify semantic duplicates:
+        - Combine their frequencies mentally
+        - Keep the version that appears most frequently OR is most clearly worded
+        - Each concept should appear ONLY ONCE in your final list
+    4. Keep category names {specificity}
+    5. Return ONLY a numbered list of {top_n} conceptually unique categories
+    6. No additional text, explanations, or commentary
+    Pre-processed Categories (sorted by frequency):
+    {df['Category'].head(top_n * 3).tolist()}
+    Note: More categories than needed are provided so you can identify and merge semantic duplicates.
+    Output Format:
+    1. category name
+    2. category name
+    3. category name
+    Top {top_n} Conceptually Unique Categories:"""
-    second_prompt = f"""From this list of categories, extract the top {top_n} most common categories. \
-The categories are contained within triple backticks here: ```{df['Category'].tolist()}``` \
-Return the top {top_n} categories as a numbered list sorted from the most to least common and keep the categories {specificity}, with no additional text or explanation."""
     if model_source == "openai":
-        client = OpenAI(api_key=api_key)
+        base_url = (
+        "https://api.perplexity.ai" if model_source == "perplexity"
+        else "https://router.huggingface.co/v1" if model_source == "huggingface"
+        else None
+        )
+        client = OpenAI(api_key=api_key, base_url=base_url)
         response_obj = client.chat.completions.create(
             model=user_model,
             messages=[{'role': 'user', 'content': second_prompt}],
-            temperature=creativity
+            **({"temperature": creativity} if creativity is not None else {})
         )
-    top_categories = response_obj.choices[0].message.content
+        top_categories = response_obj.choices[0].message.content
+    elif model_source == "anthropic":
+        import anthropic
+        client = anthropic.Anthropic(api_key=api_key)
+        response_obj = client.messages.create(
+            model=user_model,
+            max_tokens=4096,
+            messages=[{'role': 'user', 'content': second_prompt}],
+            **({"temperature": creativity} if creativity is not None else {})
+        )
+        top_categories = response_obj.content[0].text
     print(top_categories)
     top_categories_final = []
@@ -263,6 +341,10 @@ def multi_class(
     chain_of_thought = True,
     step_back_prompt = False,
     context_prompt = False,
+    top_n = 12,
+    cat_num = 10,
+    divisions = 10,
+    research_question = None,
     filename = "categorized_data.csv",
     save_directory = None,
     model_source = "auto"
@@ -273,6 +355,7 @@ def multi_class(
     import regex
     from tqdm import tqdm
+    #used in chain of verification
     def remove_numbering(line):
         line = line.strip()
@@ -321,16 +404,33 @@ def multi_class(
             raise ValueError(f"❌ Could not auto-detect model source from '{user_model}'. Please specify model_source explicitly: OpenAI, Anthropic, Perplexity, Google, Huggingface, or Mistral")
     else:
         model_source = model_source.lower()
+    if categories == "auto":
+        if survey_question == "": # step back requires the survey question to function well
+            raise TypeError("survey_question is required when using step_back_prompt. Please provide the survey question you are analyzing.")
+        categories = explore_common_categories(
+            survey_question=survey_question,
+            survey_input=survey_input,
+            research_question=research_question,
+            api_key=api_key,
+            top_n=top_n,
+            cat_num=cat_num,
+            divisions=divisions
+        )
     categories_str = "\n".join(f"{i + 1}. {cat}" for i, cat in enumerate(categories))
     cat_num = len(categories)
     category_dict = {str(i+1): "0" for i in range(cat_num)}
     example_JSON = json.dumps(category_dict, indent=4)
-    # ensure number of categories is what user wants
     print(f"\nThe categories you entered to be coded by {model_source} {user_model}:")
-    for i, cat in enumerate(categories, 1):
-        print(f"{i}. {cat}")
+    if categories != "auto":
+    # ensure number of categories is what user wants
+        for i, cat in enumerate(categories, 1):
+            print(f"{i}. {cat}")
     link1 = []
     extracted_jsons = []

{cat_llm-0.0.74.dist-info → cat_llm-0.0.76.dist-info}/WHEEL RENAMED Viewed

File without changes

{cat_llm-0.0.74.dist-info → cat_llm-0.0.76.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

cat-llm 0.0.74__py3-none-any.whl → 0.0.76__py3-none-any.whl

cat-llm 0.0.74__py3-none-any.whl → 0.0.76__py3-none-any.whl