themefinder 0.6.2__py3-none-any.whl → 0.6.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

themefinder/__init__.py CHANGED
@@ -5,6 +5,8 @@ from .core import (
     theme_generation,
     theme_mapping,
     theme_refinement,
+    theme_target_alignment,
+    detail_detection,
 )
 
 __all__ = [
@@ -13,6 +15,8 @@ __all__ = [
     "theme_generation",
     "theme_condensation",
     "theme_refinement",
+    "theme_target_alignment",
     "theme_mapping",
+    "detail_detection",
 ]
 __version__ = "0.1.0"
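
With the expanded __all__, the two new pipeline stages are importable directly from the package alongside the existing ones. A minimal sketch of the new surface (names taken from the export list above; their signatures appear in core.py below):

    from themefinder import find_themes, theme_target_alignment, detail_detection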
themefinder/core.py CHANGED
@@ -3,10 +3,17 @@ from pathlib import Path
 
 import pandas as pd
 from langchain_core.prompts import PromptTemplate
-from langchain_core.runnables import Runnable
+from langchain.schema.runnable import RunnableWithFallbacks
 
 from .llm_batch_processor import batch_and_run, load_prompt_from_file
-from .models import SentimentAnalysisOutput, ThemeMappingOutput
+from .models import (
+    SentimentAnalysisResponses,
+    ThemeGenerationResponses,
+    ThemeCondensationResponses,
+    ThemeRefinementResponses,
+    ThemeMappingResponses,
+    DetailDetectionResponses,
+)
 from .themefinder_logging import logger
 
 CONSULTATION_SYSTEM_PROMPT = load_prompt_from_file("consultation_system_prompt")
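
The llm argument is now annotated as RunnableWithFallbacks rather than a plain Runnable. A hedged sketch of constructing such an object with LangChain's with_fallbacks helper (the model names are placeholders, not something this package prescribes):

    from langchain_openai import ChatOpenAI

    primary = ChatOpenAI(model="gpt-4o", temperature=0)
    backup = ChatOpenAI(model="gpt-4o-mini", temperature=0)
    llm = primary.with_fallbacks([backup])  # a RunnableWithFallbacks, as the new type hints expect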
@@ -14,11 +21,12 @@ CONSULTATION_SYSTEM_PROMPT = load_prompt_from_file("consultation_system_prompt")
 
 async def find_themes(
     responses_df: pd.DataFrame,
-    llm: Runnable,
+    llm: RunnableWithFallbacks,
     question: str,
     target_n_themes: int | None = None,
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
     verbose: bool = True,
+    concurrency: int = 10,
 ) -> dict[str, str | pd.DataFrame]:
     """Process survey responses through a multi-stage theme analysis pipeline.
 
@@ -32,7 +40,7 @@ async def find_themes(
 
     Args:
         responses_df (pd.DataFrame): DataFrame containing survey responses
-        llm (Runnable): Language model instance for text analysis
+        llm (RunnableWithFallbacks): Language model instance for text analysis
         question (str): The survey question
         target_n_themes (int | None, optional): Target number of themes to consolidate to.
             If None, skip theme target alignment step. Defaults to None.
@@ -40,6 +48,7 @@
             Defaults to CONSULTATION_SYSTEM_PROMPT.
         verbose (bool): Whether to show information messages during processing.
             Defaults to True.
+        concurrency (int): Number of concurrent API calls to make. Defaults to 10.
 
     Returns:
         dict[str, str | pd.DataFrame]: Dictionary containing results from each pipeline stage:
@@ -56,21 +65,28 @@
         llm,
         question=question,
         system_prompt=system_prompt,
+        concurrency=concurrency,
     )
     theme_df, _ = await theme_generation(
         sentiment_df,
         llm,
         question=question,
         system_prompt=system_prompt,
+        concurrency=concurrency,
     )
     condensed_theme_df, _ = await theme_condensation(
-        theme_df, llm, question=question, system_prompt=system_prompt
+        theme_df,
+        llm,
+        question=question,
+        system_prompt=system_prompt,
+        concurrency=concurrency,
     )
     refined_theme_df, _ = await theme_refinement(
         condensed_theme_df,
         llm,
         question=question,
         system_prompt=system_prompt,
+        concurrency=concurrency,
     )
     if target_n_themes is not None:
         refined_theme_df, _ = await theme_target_alignment(
@@ -79,6 +95,7 @@
             question=question,
             target_n_themes=target_n_themes,
             system_prompt=system_prompt,
+            concurrency=concurrency,
         )
     mapping_df, mapping_unprocessables = await theme_mapping(
         sentiment_df[["response_id", "response"]],
@@ -86,6 +103,14 @@
         question=question,
         refined_themes_df=refined_theme_df,
         system_prompt=system_prompt,
+        concurrency=concurrency,
+    )
+    detailed_df, _ = await detail_detection(
+        responses_df[["response_id", "response"]],
+        llm,
+        question=question,
+        system_prompt=system_prompt,
+        concurrency=concurrency,
     )
 
     logger.info("Finished finding themes")
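
Taken together, these hunks thread a single concurrency limit through every stage and add a final detail_detection pass; the returned dictionary (extended in the next hunk) gains a "detailed_responses" key. A minimal usage sketch, assuming placeholder data and a fallback-wrapped chat model like the one sketched earlier:

    import asyncio

    import pandas as pd
    from langchain_openai import ChatOpenAI
    from themefinder import find_themes

    llm = ChatOpenAI(model="gpt-4o").with_fallbacks([ChatOpenAI(model="gpt-4o-mini")])
    responses_df = pd.DataFrame(
        {"response_id": [1, 2], "response": ["Cut waiting times.", "Hire more staff."]}
    )

    results = asyncio.run(
        find_themes(
            responses_df,
            llm,
            question="How should the service improve?",
            target_n_themes=5,
            concurrency=4,  # new in this release: caps simultaneous LLM calls
        )
    )
    print(results["themes"])
    print(results["detailed_responses"])  # new key returned by this release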
@@ -97,17 +122,19 @@
         "sentiment": sentiment_df,
         "themes": refined_theme_df,
         "mapping": mapping_df,
+        "detailed_responses": detailed_df,
         "unprocessables": pd.concat([sentiment_unprocessables, mapping_unprocessables]),
     }
 
 
 async def sentiment_analysis(
     responses_df: pd.DataFrame,
-    llm: Runnable,
+    llm: RunnableWithFallbacks,
     question: str,
     batch_size: int = 20,
     prompt_template: str | Path | PromptTemplate = "sentiment_analysis",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
+    concurrency: int = 10,
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Perform sentiment analysis on survey responses using an LLM.
 
@@ -117,7 +144,7 @@ async def sentiment_analysis(
     Args:
         responses_df (pd.DataFrame): DataFrame containing survey responses to analyze.
             Must contain 'response_id' and 'response' columns.
-        llm (Runnable): Language model instance to use for sentiment analysis.
+        llm (RunnableWithFallbacks): Language model instance to use for sentiment analysis.
         question (str): The survey question.
         batch_size (int, optional): Number of responses to process in each batch.
             Defaults to 20.
@@ -126,6 +153,7 @@
             or PromptTemplate instance. Defaults to "sentiment_analysis".
         system_prompt (str): System prompt to guide the LLM's behavior.
             Defaults to CONSULTATION_SYSTEM_PROMPT.
+        concurrency (int): Number of concurrent API calls to make. Defaults to 10.
 
     Returns:
         tuple[pd.DataFrame, pd.DataFrame]:
@@ -134,32 +162,33 @@
             - The second DataFrame contains the rows that could not be processed by the LLM
 
     Note:
-        The function uses validation_check to ensure responses maintain
+        The function uses integrity_check to ensure responses maintain
         their original order and association after processing.
     """
     logger.info(f"Running sentiment analysis on {len(responses_df)} responses")
-    processed_rows, unprocessable_rows = await batch_and_run(
+    sentiment, unprocessable = await batch_and_run(
         responses_df,
         prompt_template,
-        llm,
+        llm.with_structured_output(SentimentAnalysisResponses),
         batch_size=batch_size,
         question=question,
-        validation_check=True,
-        task_validation_model=SentimentAnalysisOutput,
+        integrity_check=True,
         system_prompt=system_prompt,
+        concurrency=concurrency,
     )
 
-    return processed_rows, unprocessable_rows
+    return sentiment, unprocessable
 
 
 async def theme_generation(
     responses_df: pd.DataFrame,
-    llm: Runnable,
+    llm: RunnableWithFallbacks,
     question: str,
     batch_size: int = 50,
     partition_key: str | None = "position",
     prompt_template: str | Path | PromptTemplate = "theme_generation",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
+    concurrency: int = 10,
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Generate themes from survey responses using an LLM.
 
@@ -168,7 +197,7 @@
     Args:
         responses_df (pd.DataFrame): DataFrame containing survey responses.
             Must include 'response_id' and 'response' columns.
-        llm (Runnable): Language model instance to use for theme generation.
+        llm (RunnableWithFallbacks): Language model instance to use for theme generation.
         question (str): The survey question.
         batch_size (int, optional): Number of responses to process in each batch.
             Defaults to 50.
@@ -181,6 +210,7 @@
             or PromptTemplate instance. Defaults to "theme_generation".
         system_prompt (str): System prompt to guide the LLM's behavior.
             Defaults to CONSULTATION_SYSTEM_PROMPT.
+        concurrency (int): Number of concurrent API calls to make. Defaults to 10.
 
     Returns:
         tuple[pd.DataFrame, pd.DataFrame]:
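
The *Responses models imported at the top of core.py are not part of this diff, but the way they are used (llm.with_structured_output(SentimentAnalysisResponses) above, and all_results["responses"] / all_results.responses in call_llm further down) implies a Pydantic wrapper holding a responses list. A purely illustrative sketch of that shape; field and class names beyond those visible in the diff are guesses, and the real definitions live in themefinder/models.py:

    from pydantic import BaseModel


    class SentimentAnalysisResponse(BaseModel):  # hypothetical per-row record
        response_id: int
        position: str  # field names assumed, not shown in this diff


    class SentimentAnalysisResponses(BaseModel):
        # The wrapper must expose a "responses" collection: call_llm reads
        # llm_response.dict()["responses"] after structured-output parsing.
        responses: list[SentimentAnalysisResponse]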
@@ -193,22 +223,24 @@
     generated_themes, _ = await batch_and_run(
         responses_df,
         prompt_template,
-        llm,
+        llm.with_structured_output(ThemeGenerationResponses),
         batch_size=batch_size,
         partition_key=partition_key,
         question=question,
         system_prompt=system_prompt,
+        concurrency=concurrency,
     )
     return generated_themes, _
 
 
 async def theme_condensation(
     themes_df: pd.DataFrame,
-    llm: Runnable,
+    llm: RunnableWithFallbacks,
     question: str,
     batch_size: int = 75,
     prompt_template: str | Path | PromptTemplate = "theme_condensation",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
+    concurrency: int = 10,
     **kwargs,
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Condense and combine similar themes identified from survey responses.
@@ -219,7 +251,7 @@
     Args:
         themes_df (pd.DataFrame): DataFrame containing the initial themes identified
             from survey responses.
-        llm (Runnable): Language model instance to use for theme condensation.
+        llm (RunnableWithFallbacks): Language model instance to use for theme condensation.
         question (str): The survey question.
         batch_size (int, optional): Number of themes to process in each batch.
             Defaults to 100.
@@ -228,6 +260,7 @@
             or PromptTemplate instance. Defaults to "theme_condensation".
         system_prompt (str): System prompt to guide the LLM's behavior.
             Defaults to CONSULTATION_SYSTEM_PROMPT.
+        concurrency (int): Number of concurrent API calls to make. Defaults to 10.
 
     Returns:
         tuple[pd.DataFrame, pd.DataFrame]:
@@ -247,10 +280,11 @@
     themes_df, _ = await batch_and_run(
         themes_df,
         prompt_template,
-        llm,
+        llm.with_structured_output(ThemeCondensationResponses),
         batch_size=batch_size,
         question=question,
         system_prompt=system_prompt,
+        concurrency=concurrency,
         **kwargs,
     )
     themes_df = themes_df.sample(frac=1).reset_index(drop=True)
@@ -263,10 +297,11 @@
     themes_df, _ = await batch_and_run(
         themes_df,
         prompt_template,
-        llm,
+        llm.with_structured_output(ThemeCondensationResponses),
         batch_size=batch_size,
         question=question,
         system_prompt=system_prompt,
+        concurrency=concurrency,
         **kwargs,
     )
 
@@ -276,11 +311,12 @@
 
 async def theme_refinement(
     condensed_themes_df: pd.DataFrame,
-    llm: Runnable,
+    llm: RunnableWithFallbacks,
     question: str,
     batch_size: int = 10000,
     prompt_template: str | Path | PromptTemplate = "theme_refinement",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
+    concurrency: int = 10,
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Refine and standardize condensed themes using an LLM.
 
@@ -292,7 +328,7 @@
     Args:
         condensed_themes (pd.DataFrame): DataFrame containing the condensed themes
             from the previous pipeline stage.
-        llm (Runnable): Language model instance to use for theme refinement.
+        llm (RunnableWithFallbacks): Language model instance to use for theme refinement.
         question (str): The survey question.
         batch_size (int, optional): Number of themes to process in each batch.
             Defaults to 10000.
@@ -301,6 +337,7 @@
             or PromptTemplate instance. Defaults to "theme_refinement".
         system_prompt (str): System prompt to guide the LLM's behavior.
             Defaults to CONSULTATION_SYSTEM_PROMPT.
+        concurrency (int): Number of concurrent API calls to make. Defaults to 10.
 
     Returns:
         tuple[pd.DataFrame, pd.DataFrame]:
@@ -319,22 +356,24 @@
     refined_themes, _ = await batch_and_run(
         condensed_themes_df,
         prompt_template,
-        llm,
+        llm.with_structured_output(ThemeRefinementResponses),
         batch_size=batch_size,
         question=question,
         system_prompt=system_prompt,
+        concurrency=concurrency,
     )
     return refined_themes, _
 
 
 async def theme_target_alignment(
     refined_themes_df: pd.DataFrame,
-    llm: Runnable,
+    llm: RunnableWithFallbacks,
     question: str,
     target_n_themes: int = 10,
     batch_size: int = 10000,
     prompt_template: str | Path | PromptTemplate = "theme_target_alignment",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
+    concurrency: int = 10,
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Align themes to target number using an LLM.
 
@@ -346,7 +385,7 @@
     Args:
         refined_themes_df (pd.DataFrame): DataFrame containing the refined themes
             from the previous pipeline stage.
-        llm (Runnable): Language model instance to use for theme alignment.
+        llm (RunnableWithFallbacks): Language model instance to use for theme alignment.
         question (str): The survey question.
         target_n_themes (int, optional): Target number of themes to consolidate to.
             Defaults to 10.
@@ -357,6 +396,7 @@
             or PromptTemplate instance. Defaults to "theme_target_alignment".
         system_prompt (str): System prompt to guide the LLM's behavior.
             Defaults to CONSULTATION_SYSTEM_PROMPT.
+        concurrency (int): Number of concurrent API calls to make. Defaults to 10.
 
     Returns:
         tuple[pd.DataFrame, pd.DataFrame]:
@@ -376,23 +416,25 @@
     aligned_themes, _ = await batch_and_run(
         refined_themes_df,
         prompt_template,
-        llm,
+        llm.with_structured_output(ThemeRefinementResponses),
         batch_size=batch_size,
         question=question,
         system_prompt=system_prompt,
         target_n_themes=target_n_themes,
+        concurrency=concurrency,
     )
     return aligned_themes, _
 
 
 async def theme_mapping(
     responses_df: pd.DataFrame,
-    llm: Runnable,
+    llm: RunnableWithFallbacks,
     question: str,
     refined_themes_df: pd.DataFrame,
     batch_size: int = 20,
     prompt_template: str | Path | PromptTemplate = "theme_mapping",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
+    concurrency: int = 10,
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Map survey responses to refined themes using an LLM.
 
@@ -402,7 +444,7 @@
     Args:
         responses_df (pd.DataFrame): DataFrame containing survey responses.
             Must include 'response_id' and 'response' columns.
-        llm (Runnable): Language model instance to use for theme mapping.
+        llm (RunnableWithFallbacks): Language model instance to use for theme mapping.
         question (str): The survey question.
         refined_themes_df (pd.DataFrame): Single-row DataFrame where each column
             represents a theme (from theme_refinement stage).
@@ -413,6 +455,7 @@
             or PromptTemplate instance. Defaults to "theme_mapping".
         system_prompt (str): System prompt to guide the LLM's behavior.
             Defaults to CONSULTATION_SYSTEM_PROMPT.
+        concurrency (int): Number of concurrent API calls to make. Defaults to 10.
 
     Returns:
         tuple[pd.DataFrame, pd.DataFrame]:
@@ -432,17 +475,70 @@
         )
         return transposed_df
 
-    mapping, _ = await batch_and_run(
+    mapping, unprocessable = await batch_and_run(
         responses_df,
         prompt_template,
-        llm,
+        llm.with_structured_output(ThemeMappingResponses),
         batch_size=batch_size,
         question=question,
         refined_themes=transpose_refined_themes(refined_themes_df).to_dict(
             orient="records"
         ),
-        validation_check=True,
-        task_validation_model=ThemeMappingOutput,
+        integrity_check=True,
+        system_prompt=system_prompt,
+        concurrency=concurrency,
+    )
+    return mapping, unprocessable
+
+
+async def detail_detection(
+    responses_df: pd.DataFrame,
+    llm: RunnableWithFallbacks,
+    question: str,
+    batch_size: int = 20,
+    prompt_template: str | Path | PromptTemplate = "detail_detection",
+    system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
+    concurrency: int = 10,
+) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """Identify responses that provide high-value detailed evidence.
+
+    This function processes survey responses in batches to analyze their level of detail
+    and evidence using a language model. It identifies responses that contain specific
+    examples, data, or detailed reasoning that provide strong supporting evidence.
+
+    Args:
+        responses_df (pd.DataFrame): DataFrame containing survey responses to analyze.
+            Must contain 'response_id' and 'response' columns.
+        llm (RunnableWithFallbacks): Language model instance to use for detail detection.
+        question (str): The survey question.
+        batch_size (int, optional): Number of responses to process in each batch.
+            Defaults to 20.
+        prompt_template (str | Path | PromptTemplate, optional): Template for structuring
+            the prompt to the LLM. Can be a string identifier, path to template file,
+            or PromptTemplate instance. Defaults to "detail_detection".
+        system_prompt (str): System prompt to guide the LLM's behavior.
+            Defaults to CONSULTATION_SYSTEM_PROMPT.
+        concurrency (int): Number of concurrent API calls to make. Defaults to 10.
+
+    Returns:
+        tuple[pd.DataFrame, pd.DataFrame]:
+            A tuple containing two DataFrames:
+            - The first DataFrame contains the rows that were successfully processed by the LLM
+            - The second DataFrame contains the rows that could not be processed by the LLM
+
+    Note:
+        The function uses response_id_integrity_check to ensure responses maintain
+        their original order and association after processing.
+    """
+    logger.info(f"Running detail detection on {len(responses_df)} responses")
+    detailed, _ = await batch_and_run(
+        responses_df,
+        prompt_template,
+        llm.with_structured_output(DetailDetectionResponses),
+        batch_size=batch_size,
+        question=question,
+        integrity_check=True,
         system_prompt=system_prompt,
+        concurrency=concurrency,
     )
-    return mapping, _
+    return detailed, _
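
The new stage can also be run on its own. A sketch with placeholder data; the llm is again a fallback-wrapped chat model, and nothing here beyond the detail_detection signature above comes from the package itself:

    import asyncio

    import pandas as pd
    from langchain_openai import ChatOpenAI
    from themefinder import detail_detection

    llm = ChatOpenAI(model="gpt-4o").with_fallbacks([ChatOpenAI(model="gpt-4o-mini")])
    responses_df = pd.DataFrame(
        {"response_id": [1, 2], "response": ["Ward 3 waits averaged 6 hours last winter.", "Make it better."]}
    )

    detailed_df, unprocessable_df = asyncio.run(
        detail_detection(
            responses_df,
            llm,
            question="How should the service improve?",
            concurrency=4,
        )
    )
    print(detailed_df)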
themefinder/llm_batch_processor.py CHANGED
@@ -1,17 +1,16 @@
 import asyncio
-import json
 import logging
 import os
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Optional, Type
+from typing import Any, Optional
 
 import openai
 import pandas as pd
 import tiktoken
 from langchain_core.prompts import PromptTemplate
 from langchain_core.runnables import Runnable
-from pydantic import BaseModel, ValidationError
+from pydantic import ValidationError
 from tenacity import (
     before,
     before_sleep_log,
@@ -35,8 +34,8 @@ async def batch_and_run(
     llm: Runnable,
     batch_size: int = 10,
     partition_key: str | None = None,
-    validation_check: bool = False,
-    task_validation_model: Type[BaseModel] = None,
+    integrity_check: bool = False,
+    concurrency: int = 10,
     **kwargs: Any,
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Process a DataFrame of responses in batches using an LLM.
@@ -51,11 +50,11 @@
             Defaults to 10.
         partition_key (str | None, optional): Optional column name to group input rows
             before batching. Defaults to None.
-        validation_check (bool, optional): If True, verifies that all input
-            response IDs are present in LLM output and validates the rows against the validation model,
-            failed rows are retried individually.
+        integrity_check (bool, optional): If True, verifies that all input
+            response IDs are present in LLM output.
             If False, no integrity checking or retrying occurs. Defaults to False.
-        task_validation_model (Type[BaseModel]): the pydanctic model to validate each row against
+        concurrency (int, optional): Maximum number of simultaneous LLM calls allowed.
+            Defaults to 10.
         **kwargs (Any): Additional keyword arguments to pass to the prompt template.
 
     Returns:
@@ -80,8 +79,8 @@
     processed_rows, failed_ids = await call_llm(
         batch_prompts=batch_prompts,
         llm=llm,
-        validation_check=validation_check,
-        task_validation_model=task_validation_model,
+        integrity_check=integrity_check,
+        concurrency=concurrency,
     )
     processed_results = process_llm_responses(processed_rows, input_df)
 
@@ -93,8 +92,8 @@
         retry_results, unprocessable_ids = await call_llm(
             batch_prompts=retry_prompts,
             llm=llm,
-            validation_check=validation_check,
-            task_validation_model=task_validation_model,
+            integrity_check=integrity_check,
+            concurrency=concurrency,
         )
         retry_processed_results = process_llm_responses(retry_results, retry_df)
         unprocessable_df = retry_df.loc[retry_df["response_id"].isin(unprocessable_ids)]
@@ -287,32 +286,9 @@
     batch_prompts: list[BatchPrompt],
     llm: Runnable,
     concurrency: int = 10,
-    validation_check: bool = False,
-    task_validation_model: Optional[Type[BaseModel]] = None,
+    integrity_check: bool = False,
 ) -> tuple[list[dict], list[int]]:
-    """Process multiple batches of prompts concurrently through an LLM with retry logic.
-
-    Args:
-        batch_prompts (list[BatchPrompt]): List of BatchPrompt objects, each containing a
-            prompt string and associated response IDs to be processed.
-        llm (Runnable): LangChain Runnable instance that will process the prompts.
-        concurrency (int, optional): Maximum number of simultaneous LLM calls allowed.
-            Defaults to 10.
-        validation_check (bool, optional): If True, verifies that all input
-            response IDs are present in the LLM output. Failed batches are discarded and
-            their IDs are returned for retry. Defaults to False.
-        task_validation_model (Type[BaseModel]): The Pydantic model to check the LLM outputs against
-
-    Returns:
-        tuple[list[dict[str, Any]], set[str]]: A tuple containing:
-            - list of successful LLM responses as dictionaries
-            - set of failed response IDs (empty if no failures or integrity check is False)
-
-    Notes:
-        - Uses exponential backoff retry strategy with up to 6 attempts per batch
-        - Failed batches (when integrity check fails) return None and are filtered out
-        - Concurrency is managed via asyncio.Semaphore to prevent overwhelming the LLM
-    """
+    """Process multiple batches of prompts concurrently through an LLM with retry logic."""
     semaphore = asyncio.Semaphore(concurrency)
 
     @retry(
@@ -326,24 +302,30 @@
         async with semaphore:
             try:
                 llm_response = await llm.ainvoke(batch_prompt.prompt_string)
-                all_results = json.loads(llm_response.content)
-            except (openai.BadRequestError, json.JSONDecodeError) as e:
-                failed_ids = batch_prompt.response_ids
+                all_results = (
+                    llm_response.dict()
+                    if hasattr(llm_response, "dict")
+                    else llm_response
+                )
+                responses = (
+                    all_results["responses"]
+                    if isinstance(all_results, dict)
+                    else all_results.responses
+                )
+            except (openai.BadRequestError, ValueError) as e:
                 logger.warning(e)
-                return [], failed_ids
+                return [], batch_prompt.response_ids
+            except ValidationError as e:
+                logger.warning(e)
+                return [], batch_prompt.response_ids
 
-            if validation_check:
+            if integrity_check:
                 failed_ids = get_missing_response_ids(
                     batch_prompt.response_ids, all_results
                 )
-                validated_results, invalid_rows = validate_task_data(
-                    all_results["responses"], task_validation_model
-                )
-                failed_ids.extend([r["response_id"] for r in invalid_rows])
-                return validated_results, failed_ids
+                return responses, failed_ids
             else:
-                # Flatten the list to align with valid output format
-                return [r for r in all_results["responses"]], []
+                return responses, []
 
     results = await asyncio.gather(
         *[async_llm_call(batch_prompt) for batch_prompt in batch_prompts]
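
The concurrency value plumbed through from core.py ends up here as the size of an asyncio.Semaphore that each batch call must acquire before the results are gathered. A stripped-down illustration of that pattern, not the library's own code:

    import asyncio

    async def bounded_gather(coro_factories, concurrency: int = 10):
        # At most `concurrency` coroutines run at once; the rest wait on the semaphore.
        semaphore = asyncio.Semaphore(concurrency)

        async def run_one(factory):
            async with semaphore:
                return await factory()

        return await asyncio.gather(*(run_one(f) for f in coro_factories))

    async def fake_llm_call(i: int) -> int:
        await asyncio.sleep(0.1)  # stand-in for one batched API round trip
        return i

    results = asyncio.run(
        bounded_gather([lambda i=i: fake_llm_call(i) for i in range(25)], concurrency=5)
    )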
@@ -458,33 +440,3 @@ def build_prompt(
     )
     response_ids = input_batch["response_id"].astype(int).to_list()
     return BatchPrompt(prompt_string=prompt, response_ids=response_ids)
-
-
-def validate_task_data(
-    task_data: pd.DataFrame | list[dict], task_validation_model: Type[BaseModel] = None
-) -> tuple[list[dict], list[dict]]:
-    """
-    Validate each row in task_output against the provided Pydantic model.
-
-    Returns:
-        valid: a list of validated records (dicts).
-        invalid: a list of records (dicts) that failed validation.
-    """
-
-    records = (
-        task_data.to_dict(orient="records")
-        if isinstance(task_data, pd.DataFrame)
-        else task_data
-    )
-
-    if task_validation_model:
-        valid_records, invalid_records = [], []
-        for record in records:
-            try:
-                task_validation_model(**record)
-                valid_records.append(record)
-            except ValidationError as e:
-                invalid_records.append(record)
-                logger.info(f"Failed Validation: {e}")
-        return valid_records, invalid_records
-    return records, []