cat-llm 0.0.67__py3-none-any.whl → 0.0.69__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
catllm/text_functions.py CHANGED
@@ -1,3 +1,15 @@
+ from .calls.all_calls import (
+ get_stepback_insight_openai,
+ get_stepback_insight_anthropic,
+ get_stepback_insight_google,
+ get_stepback_insight_mistral,
+ chain_of_verification_openai,
+ chain_of_verification_google,
+ chain_of_verification_anthropic,
+ chain_of_verification_mistral
+ )
+
+
  #extract categories from corpus
  def explore_corpus(
  survey_question,
@@ -232,24 +244,27 @@ Return the top {top_n} categories as a numbered list sorted from the most to lea
  # GOAL: enable step-back prompting
  # GOAL 2: enable self-consistency
  def multi_class(
- survey_question,
  survey_input,
  categories,
  api_key,
  user_model="gpt-5",
  user_prompt = None,
+ survey_question = "",
  example1 = None,
  example2 = None,
  example3 = None,
  example4 = None,
  example5 = None,
  example6 = None,
- creativity=None,
- safety=False,
- to_csv=False,
- filename="categorized_data.csv",
- save_directory=None,
- model_source="OpenAI"
+ creativity = None,
+ safety = False,
+ to_csv = False,
+ chain_of_verification = False,
+ step_back_prompt = False,
+ context_prompt = False,
+ filename = "categorized_data.csv",
+ save_directory = None,
+ model_source = "auto"
  ):
  import os
  import json
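
For orientation, a hypothetical call using the updated signature above; the survey question, categories, and API key are invented placeholders, and the return value (the categorized DataFrame built later in this file) is not shown in this hunk:

    from catllm.text_functions import multi_class

    results = multi_class(
        survey_input=["I ride my bike to work", "Parking downtown is too expensive"],
        categories=["cost", "environment", "convenience"],
        api_key="YOUR_API_KEY",                 # placeholder
        user_model="gpt-5",
        survey_question="Why do you commute the way you do?",
        chain_of_verification=True,             # added in this diff
        step_back_prompt=True,                  # added in this diff
        context_prompt=True,                    # added in this diff
        model_source="auto",                    # new default; detected from user_model
    )
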
@@ -257,7 +272,54 @@ def multi_class(
  import regex
  from tqdm import tqdm

+ def remove_numbering(line):
+ line = line.strip()
+
+ # Handle bullet points
+ if line.startswith('- '):
+ return line[2:].strip()
+ if line.startswith('• '):
+ return line[2:].strip()
+
+ # Handle numbered lists "1.", "10.", etc.
+ if line and line[0].isdigit():
+ # Find where the number ends
+ i = 0
+ while i < len(line) and line[i].isdigit():
+ i += 1
+
+ # Check if followed by '.' or ')'
+ if i < len(line) and line[i] in '.':
+ return line[i+1:].strip()
+ elif i < len(line) and line[i] in ')':
+ return line[i+1:].strip()
+
+ return line
+
  model_source = model_source.lower() # eliminating case sensitivity
+
+ # auto-detect model source if not provided
+ if model_source is None or model_source == "auto":
+ user_model_lower = user_model.lower()
+
+ if "gpt" in user_model_lower:
+ model_source = "openai"
+ elif "claude" in user_model_lower:
+ model_source = "anthropic"
+ elif "gemini" in user_model_lower or "gemma" in user_model_lower:
+ model_source = "google"
+ elif "llama" in user_model_lower or "meta" in user_model_lower:
+ model_source = "huggingface"
+ elif "mistral" in user_model_lower or "mixtral" in user_model_lower:
+ model_source = "mistral"
+ elif "sonar" in user_model_lower or "pplx" in user_model_lower:
+ model_source = "perplexity"
+ elif "deepseek" in user_model_lower or "qwen" in user_model_lower:
+ model_source = "huggingface"
+ else:
+ raise ValueError(f"❌ Could not auto-detect model source from '{user_model}'. Please specify model_source explicitly: OpenAI, Anthropic, Perplexity, Google, Huggingface, or Mistral")
+ else:
+ model_source = model_source.lower()

  categories_str = "\n".join(f"{i + 1}. {cat}" for i, cat in enumerate(categories))
  cat_num = len(categories)
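
The auto-detection above routes on substrings of the model name. Restated as a standalone helper purely for illustration (the example model names are not an official list):

    def detect_source(user_model: str) -> str:
        # Mirrors the substring checks added above, in the same order.
        m = user_model.lower()
        if "gpt" in m:
            return "openai"
        if "claude" in m:
            return "anthropic"
        if "gemini" in m or "gemma" in m:
            return "google"
        if "llama" in m or "meta" in m:
            return "huggingface"
        if "mistral" in m or "mixtral" in m:
            return "mistral"
        if "sonar" in m or "pplx" in m:
            return "perplexity"
        if "deepseek" in m or "qwen" in m:
            return "huggingface"
        raise ValueError(f"Could not auto-detect model source from '{user_model}'")

    assert detect_source("gpt-5") == "openai"
    assert detect_source("claude-sonnet-4-5") == "anthropic"
    assert detect_source("gemini-2.0-flash") == "google"
    assert detect_source("sonar-pro") == "perplexity"
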
@@ -265,17 +327,66 @@ def multi_class(
  example_JSON = json.dumps(category_dict, indent=4)

  # ensure number of categories is what user wants
- print("\nThe categories you entered:")
+ print(f"\nThe categories you entered to be coded by {model_source} {user_model}:")
  for i, cat in enumerate(categories, 1):
  print(f"{i}. {cat}")

  link1 = []
  extracted_jsons = []
+
  #handling example inputs
  examples = [example1, example2, example3, example4, example5, example6]
  examples_text = "\n".join(
  f"Example {i}: {ex}" for i, ex in enumerate(examples, 1) if ex is not None
  )
+ # allowing users to contextualize the survey question
+ if survey_question != None:
+ survey_question_context = f"A respondent was asked: {survey_question}."
+ else:
+ survey_question_context = ""
+
+ # step back insight initialization
+ if step_back_prompt:
+ if survey_question == "": # step back requires the survey question to function well
+ raise TypeError("survey_question is required when using step_back_prompt. Please provide the survey question you are analyzing.")
+
+ stepback = f"""What are the underlying factors or dimensions that explain how people typically answer "{survey_question}"?"""
+
+ if model_source in ["openai", "perplexity", "huggingface"]:
+ stepback_insight, step_back_added = get_stepback_insight_openai(
+ stepback=stepback,
+ api_key=api_key,
+ user_model=user_model,
+ model_source=model_source,
+ creativity=creativity
+ )
+ elif model_source == "anthropic":
+ stepback_insight, step_back_added = get_stepback_insight_anthropic(
+ stepback=stepback,
+ api_key=api_key,
+ user_model=user_model,
+ model_source=model_source,
+ creativity=creativity
+ )
+ elif model_source == "google":
+ stepback_insight, step_back_added = get_stepback_insight_google(
+ stepback=stepback,
+ api_key=api_key,
+ user_model=user_model,
+ model_source=model_source,
+ creativity=creativity
+ )
+ elif model_source == "mistral":
+ stepback_insight, step_back_added = get_stepback_insight_mistral(
+ stepback=stepback,
+ api_key=api_key,
+ user_model=user_model,
+ model_source=model_source,
+ creativity=creativity
+ )
+ else:
+ stepback_insight = None
+ step_back_added = False

  for idx, response in enumerate(tqdm(survey_input, desc="Categorizing responses")):
  reply = None
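
The step-back branch above builds a single "step-back" question from the survey question and asks the provider for background insight before categorizing. A small sketch of the question it constructs, using an invented survey item:

    # Hypothetical survey question; the template matches the f-string added above.
    survey_question = "Why did you choose your current mode of commuting?"
    stepback = (
        "What are the underlying factors or dimensions that explain how people "
        f'typically answer "{survey_question}"?'
    )
    print(stepback)

    # Note: multi_class(..., step_back_prompt=True) raises TypeError if
    # survey_question is left at its new default of "".
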
@@ -287,58 +398,166 @@ def multi_class(
  #print(f"Skipped NaN input.")
  else:

- prompt = f"""A respondent was asked: {survey_question}. \
+ prompt = f"""{survey_question_context} \
  Categorize this survey response "{response}" into the following categories that apply: \
  {categories_str}
  {examples_text}
- Provide your work in JSON format..."""
+ Provide your work in JSON format where the number belonging to each category is the key and a 1 if the category is present and a 0 if it is not present as key values."""
+
+ if context_prompt:
+ context = """You are an expert researcher in survey data categorization.
+ Apply multi-label classification and base decisions on explicit and implicit meanings.
+ When uncertain, prioritize precision over recall."""
+
+ prompt = context + prompt
+ print(prompt)
+
+ if chain_of_verification:
+ step2_prompt = f"""You provided this initial categorization:
+ <<INITIAL_REPLY>>
+
+ Original task: {prompt}
+
+ Generate a focused list of 3-5 verification questions to fact-check your categorization. Each question should:
+ - Be concise and specific (one sentence)
+ - Address a distinct aspect of the categorization
+ - Be answerable independently

- if model_source == ("openai"):
+ Focus on verifying:
+ - Whether each category assignment is accurate
+ - Whether the categories match the criteria in the original task
+ - Whether there are any logical inconsistencies
+
+ Provide only the verification questions as a numbered list."""
+
+ step3_prompt = f"""Answer the following verification question based on the survey response provided.
+
+ Survey response: {response}
+
+ Verification question: <<QUESTION>>
+
+ Provide a brief, direct answer (1-2 sentences maximum).
+
+ Answer:"""
+
+
+ step4_prompt = f"""Original task: {prompt}
+ Initial categorization:
+ <<INITIAL_REPLY>>
+ Verification questions and answers:
+ <<VERIFICATION_QA>>
+ If no categories are present, assign "0" to all categories.
+ Provide the final corrected categorization in the same JSON format:"""
+
+ # Main model interaction
+ if model_source in ["openai", "perplexity", "huggingface"]:
  from openai import OpenAI
- client = OpenAI(api_key=api_key)
+ from openai import OpenAI, BadRequestError, AuthenticationError
+ # conditional base_url setting based on model source
+ base_url = (
+ "https://api.perplexity.ai" if model_source == "perplexity"
+ else "https://router.huggingface.co/v1" if model_source == "huggingface"
+ else None # default
+ )
+
+ client = OpenAI(api_key=api_key, base_url=base_url)
+
  try:
+ messages = [
+ *([{'role': 'user', 'content': stepback}] if step_back_prompt and step_back_added else []), # only if step back is enabled and successful
+ *([{'role': 'assistant', 'content': stepback_insight}] if step_back_added else {}), # include insight if step back succeeded
+ {'role': 'user', 'content': prompt}
+ ]
+
  response_obj = client.chat.completions.create(
  model=user_model,
- messages=[{'role': 'user', 'content': prompt}],
+ messages=messages,
  **({"temperature": creativity} if creativity is not None else {})
- )
- reply = response_obj.choices[0].message.content
- link1.append(reply)
- except Exception as e:
- print(f"An error occurred: {e}")
- link1.append(f"Error processing input: {e}")
- elif model_source == "perplexity":
- from openai import OpenAI
- client = OpenAI(api_key=api_key, base_url="https://api.perplexity.ai")
- try:
- response_obj = client.chat.completions.create(
- model=user_model,
- messages=[{'role': 'user', 'content': prompt}],
- **({"temperature": creativity} if creativity is not None else {})
  )
+
  reply = response_obj.choices[0].message.content
- link1.append(reply)
+
+ if chain_of_verification:
+ reply = chain_of_verification_openai(
+ initial_reply=reply,
+ step2_prompt=step2_prompt,
+ step3_prompt=step3_prompt,
+ step4_prompt=step4_prompt,
+ client=client,
+ user_model=user_model,
+ creativity=creativity,
+ remove_numbering=remove_numbering
+ )
+
+ link1.append(reply)
+ else:
+ #if chain of verification is not enabled, just append initial reply
+ link1.append(reply)
+
+ except BadRequestError as e:
+ # Model doesn't exist - halt immediately
+ raise ValueError(f"❌ Model '{user_model}' on {model_source} not found. Please check the model name and try again.") from e
  except Exception as e:
  print(f"An error occurred: {e}")
  link1.append(f"Error processing input: {e}")
+
  elif model_source == "anthropic":
+
  import anthropic
  client = anthropic.Anthropic(api_key=api_key)
+
  try:
- message = client.messages.create(
+ response_obj = client.messages.create(
  model=user_model,
- max_tokens=1024,
- **({"temperature": creativity} if creativity is not None else {}),
- messages=[{"role": "user", "content": prompt}]
- )
- reply = message.content[0].text # Anthropic returns content as list
- link1.append(reply)
+ max_tokens=4096,
+ messages=[{'role': 'user', 'content': prompt}],
+ **({"temperature": creativity} if creativity is not None else {})
+ )
+
+ reply = response_obj.content[0].text
+
+ if chain_of_verification:
+ reply = chain_of_verification_anthropic(
+ initial_reply=reply,
+ step2_prompt=step2_prompt,
+ step3_prompt=step3_prompt,
+ step4_prompt=step4_prompt,
+ client=client,
+ user_model=user_model,
+ creativity=creativity,
+ remove_numbering=remove_numbering
+ )
+
+ link1.append(reply)
+ else:
+ #if chain of verification is not enabled, just append initial reply
+ link1.append(reply)
+
+ except anthropic.NotFoundError as e:
+ # Model doesn't exist - halt immediately
+ raise ValueError(f"❌ Model '{user_model}' on {model_source} not found. Please check the model name and try again.") from e
  except Exception as e:
  print(f"An error occurred: {e}")
  link1.append(f"Error processing input: {e}")

  elif model_source == "google":
  import requests
+
+ def make_google_request(url, headers, payload, max_retries=3):
+ """Make Google API request with exponential backoff on 429 errors"""
+ for attempt in range(max_retries):
+ try:
+ response = requests.post(url, headers=headers, json=payload)
+ response.raise_for_status()
+ return response.json()
+ except requests.exceptions.HTTPError as e:
+ if e.response.status_code == 429 and attempt < max_retries - 1:
+ wait_time = 10 * (2 ** attempt)
+ print(f"⚠️ Rate limited. Waiting {wait_time}s...")
+ time.sleep(wait_time)
+ else:
+ raise
+
  url = f"https://generativelanguage.googleapis.com/v1beta/models/{user_model}:generateContent"
  try:
  headers = {
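
The chain_of_verification_* helpers called in the branches above are imported from catllm/calls/all_calls.py and are not part of this diff. The sketch below is only a generic illustration, under assumption, of how the <<INITIAL_REPLY>>, <<QUESTION>>, and <<VERIFICATION_QA>> placeholders in the step prompts are typically filled in; ask is a hypothetical stand-in for a single model call, not a function in the package:

    def chain_of_verification_sketch(initial_reply, step2_prompt, step3_prompt,
                                     step4_prompt, ask, remove_numbering):
        # Step 2: ask the model to draft verification questions about its own answer.
        questions_text = ask(step2_prompt.replace("<<INITIAL_REPLY>>", initial_reply))
        questions = [remove_numbering(q) for q in questions_text.splitlines() if q.strip()]

        # Step 3: answer each verification question independently.
        qa_pairs = []
        for q in questions:
            answer = ask(step3_prompt.replace("<<QUESTION>>", q))
            qa_pairs.append(f"Q: {q}\nA: {answer}")

        # Step 4: request the final, corrected categorization in the same JSON format.
        final_prompt = (
            step4_prompt
            .replace("<<INITIAL_REPLY>>", initial_reply)
            .replace("<<VERIFICATION_QA>>", "\n".join(qa_pairs))
        )
        return ask(final_prompt)
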
@@ -353,22 +572,49 @@ def multi_class(
  **({"generationConfig": {"temperature": creativity}} if creativity is not None else {})
  }

- response = requests.post(url, headers=headers, json=payload)
- response.raise_for_status() # Raise exception for HTTP errors
- result = response.json()
+ result = make_google_request(url, headers, payload)

  if "candidates" in result and result["candidates"]:
  reply = result["candidates"][0]["content"]["parts"][0]["text"]
  else:
  reply = "No response generated"

- link1.append(reply)
+ if chain_of_verification:
+ reply = chain_of_verification_google(
+ initial_reply=reply,
+ prompt=prompt,
+ step2_prompt=step2_prompt,
+ step3_prompt=step3_prompt,
+ step4_prompt=step4_prompt,
+ url=url,
+ headers=headers,
+ creativity=creativity,
+ remove_numbering=remove_numbering,
+ make_google_request=make_google_request
+ )
+
+ link1.append(reply)
+
+ else:
+ # if chain of verification is not enabled, just append initial reply
+ link1.append(reply)
+
+ except requests.exceptions.HTTPError as e:
+ if e.response.status_code == 404:
+ raise ValueError(f"❌ Model '{user_model}' not found. Please check the model name and try again.") from e
+ elif e.response.status_code == 401 or e.response.status_code == 403:
+ raise ValueError(f"❌ Authentication failed. Please check your Google API key.") from e
+ else:
+ print(f"HTTP error occurred: {e}")
+ link1.append(f"Error processing input: {e}")
  except Exception as e:
  print(f"An error occurred: {e}")
  link1.append(f"Error processing input: {e}")

  elif model_source == "mistral":
  from mistralai import Mistral
+ from mistralai.models import SDKError
+
  client = Mistral(api_key=api_key)
  try:
  response = client.chat.complete(
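
The make_google_request helper introduced above waits 10 * (2 ** attempt) seconds between attempts and only retries HTTP 429 responses; with the default max_retries=3, the schedule works out to:

    # Waits before the second and third attempts; the final failure is re-raised.
    waits = [10 * (2 ** attempt) for attempt in range(3 - 1)]
    assert waits == [10, 20]
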
@@ -379,12 +625,40 @@ def multi_class(
  **({"temperature": creativity} if creativity is not None else {})
  )
  reply = response.choices[0].message.content
- link1.append(reply)
+
+ if chain_of_verification:
+ reply = chain_of_verification_mistral(
+ initial_reply=reply,
+ step2_prompt=step2_prompt,
+ step3_prompt=step3_prompt,
+ step4_prompt=step4_prompt,
+ client=client,
+ user_model=user_model,
+ creativity=creativity,
+ remove_numbering=remove_numbering
+ )
+
+ link1.append(reply)
+
+ else:
+ #if chain of verification is not enabled, just append initial reply
+ link1.append(reply)
+
+ except SDKError as e:
+ error_str = str(e).lower()
+ if "invalid_model" in error_str or "invalid model" in error_str:
+ raise ValueError(f"❌ Model '{user_model}' not found.") from e
+ elif "401" in str(e) or "unauthorized" in str(e).lower():
+ raise ValueError(f"❌ Authentication failed. Please check your Mistral API key.") from e
+ else:
+ print(f"An error occurred: {e}")
+ link1.append(f"Error processing input: {e}")
  except Exception as e:
- print(f"An error occurred: {e}")
+ print(f"An unexpected error occurred: {e}")
  link1.append(f"Error processing input: {e}")
+
  else:
- raise ValueError("Unknown source! Choose from OpenAI, Anthropic, Perplexity, or Mistral")
+ raise ValueError("Unknown source! Choose from OpenAI, Anthropic, Perplexity, Google, Huggingface, or Mistral")
  # in situation that no JSON is found
  if reply is not None:
  extracted_json = regex.findall(r'\{(?:[^{}]|(?R))*\}', reply, regex.DOTALL)
@@ -442,6 +716,25 @@ def multi_class(
  'json': pd.Series(extracted_jsons).reset_index(drop=True)
  })
  categorized_data = pd.concat([categorized_data, normalized_data], axis=1)
+ categorized_data = categorized_data.rename(columns=lambda x: f'category_{x}' if str(x).isdigit() else x)
+
+ #converting to numeric
+ cat_cols = [col for col in categorized_data.columns if col.startswith('category_')]
+
+ categorized_data['processing_status'] = np.where(
+ categorized_data[cat_cols].isna().all(axis=1),
+ 'error',
+ 'success'
+ )
+
+ categorized_data.loc[categorized_data[cat_cols].apply(pd.to_numeric, errors='coerce').isna().any(axis=1), cat_cols] = np.nan
+ categorized_data[cat_cols] = categorized_data[cat_cols].astype('Int64')
+
+ categorized_data['categories_present'] = categorized_data[cat_cols].apply(
+ lambda x: ','.join(x.dropna().astype(str)), axis=1
+ )
+
+ categorized_data['categories_counted'] = categorized_data[cat_cols].count(axis=1)

  if to_csv:
  if save_directory is None:
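
To see what the post-processing in this last hunk adds to the output, a small self-contained sketch applying the same column handling to an invented two-row frame (only the renaming, status flag, and count are reproduced here):

    import numpy as np
    import pandas as pd

    # Toy stand-in for the normalized JSON output: one column per category key.
    toy = pd.DataFrame({"1": [1, None], "2": [0, None]})
    toy = toy.rename(columns=lambda x: f"category_{x}" if str(x).isdigit() else x)
    cat_cols = [c for c in toy.columns if c.startswith("category_")]

    # Rows whose category columns are all missing are flagged as errors.
    toy["processing_status"] = np.where(toy[cat_cols].isna().all(axis=1), "error", "success")
    toy["categories_counted"] = toy[cat_cols].count(axis=1)

    # processing_status -> ["success", "error"]; categories_counted -> [2, 0]
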