themefinder 0.5.3__tar.gz → 0.6.2__tar.gz
- {themefinder-0.5.3 → themefinder-0.6.2}/PKG-INFO +3 -2
- {themefinder-0.5.3 → themefinder-0.6.2}/README.md +2 -2
- {themefinder-0.5.3 → themefinder-0.6.2}/pyproject.toml +1 -1
- {themefinder-0.5.3 → themefinder-0.6.2}/src/themefinder/core.py +86 -75
- themefinder-0.6.2/src/themefinder/llm_batch_processor.py +490 -0
- themefinder-0.6.2/src/themefinder/models.py +138 -0
- {themefinder-0.5.3 → themefinder-0.6.2}/src/themefinder/prompts/sentiment_analysis.txt +8 -5
- themefinder-0.6.2/src/themefinder/prompts/theme_condensation.txt +50 -0
- {themefinder-0.5.3 → themefinder-0.6.2}/src/themefinder/prompts/theme_mapping.txt +3 -3
- {themefinder-0.5.3 → themefinder-0.6.2}/src/themefinder/prompts/theme_refinement.txt +15 -28
- themefinder-0.5.3/src/themefinder/llm_batch_processor.py +0 -310
- themefinder-0.5.3/src/themefinder/prompts/theme_condensation.txt +0 -37
- {themefinder-0.5.3 → themefinder-0.6.2}/LICENCE +0 -0
- {themefinder-0.5.3 → themefinder-0.6.2}/src/themefinder/__init__.py +2 -2
- {themefinder-0.5.3 → themefinder-0.6.2}/src/themefinder/prompts/consultation_system_prompt.txt +0 -0
- {themefinder-0.5.3 → themefinder-0.6.2}/src/themefinder/prompts/theme_generation.txt +0 -0
- {themefinder-0.5.3 → themefinder-0.6.2}/src/themefinder/prompts/theme_target_alignment.txt +0 -0
- {themefinder-0.5.3 → themefinder-0.6.2}/src/themefinder/themefinder_logging.py +0 -0
{themefinder-0.5.3 → themefinder-0.6.2}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: themefinder
-Version: 0.5.3
+Version: 0.6.2
 Summary: A topic modelling Python package designed for analysing one-to-many question-answer data eg free-text survey responses.
 License: MIT
 Author: i.AI
@@ -100,7 +100,7 @@ system_prompt = "You are an AI evaluation tool analyzing survey responses about
 # Run the function to find themes
 # We use asyncio to query LLM endpoints asynchronously, so we need to await our function
 async def main():
-    result = await find_themes(responses_df, llm, question, system_prompt)
+    result = await find_themes(responses_df, llm, question, system_prompt=system_prompt)
     print(result)

 if __name__ == "__main__":
@@ -155,3 +155,4 @@ The documentation is [© Crown copyright](https://www.nationalarchives.gov.uk/in
 ## Feedback

 If you have feedback on this package, please fill in our [feedback form](https://forms.gle/85xUSMvxGzSSKQ499) or contact us with questions or feedback at packages@cabinetoffice.gov.uk.
+
{themefinder-0.5.3 → themefinder-0.6.2}/README.md

@@ -69,7 +69,7 @@ system_prompt = "You are an AI evaluation tool analyzing survey responses about
 # Run the function to find themes
 # We use asyncio to query LLM endpoints asynchronously, so we need to await our function
 async def main():
-    result = await find_themes(responses_df, llm, question, system_prompt)
+    result = await find_themes(responses_df, llm, question, system_prompt=system_prompt)
     print(result)

 if __name__ == "__main__":
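Note on the change above: in 0.6.2 `find_themes` takes `target_n_themes` ahead of `system_prompt` (see the core.py hunks below), so passing the system prompt positionally would bind it to the wrong parameter; the README now passes it by keyword. A minimal end-to-end sketch of the new call shape, with an illustrative model choice and made-up data (neither is taken from this diff):

    import asyncio

    import pandas as pd
    from langchain_openai import ChatOpenAI  # any LangChain Runnable should work as `llm`

    from themefinder import find_themes

    # Hypothetical survey data; the pipeline expects response_id and response columns
    responses_df = pd.DataFrame(
        {"response_id": [1, 2], "response": ["More cycle lanes", "Cheaper bus fares"]}
    )
    question = "How should we improve local transport?"
    system_prompt = "You are an AI evaluation tool analyzing survey responses about transport."
    llm = ChatOpenAI(model="gpt-4o")

    async def main():
        # system_prompt is keyword-passed so it is not mistaken for target_n_themes
        result = await find_themes(responses_df, llm, question, system_prompt=system_prompt)
        print(result["themes"])

    asyncio.run(main())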
@@ -123,4 +123,4 @@ The documentation is [© Crown copyright](https://www.nationalarchives.gov.uk/in

 ## Feedback

-If you have feedback on this package, please fill in our [feedback form](https://forms.gle/85xUSMvxGzSSKQ499) or contact us with questions or feedback at packages@cabinetoffice.gov.uk.
+If you have feedback on this package, please fill in our [feedback form](https://forms.gle/85xUSMvxGzSSKQ499) or contact us with questions or feedback at packages@cabinetoffice.gov.uk.
{themefinder-0.5.3 → themefinder-0.6.2}/pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "themefinder"
-version = "0.5.3"
+version = "0.6.2"
 description = "A topic modelling Python package designed for analysing one-to-many question-answer data eg free-text survey responses."
 authors = ["i.AI <packages@cabinetoffice.gov.uk>"]
 packages = [{include = "themefinder", from = "src"}]
{themefinder-0.5.3 → themefinder-0.6.2}/src/themefinder/core.py

@@ -6,6 +6,7 @@ from langchain_core.prompts import PromptTemplate
 from langchain_core.runnables import Runnable

 from .llm_batch_processor import batch_and_run, load_prompt_from_file
+from .models import SentimentAnalysisOutput, ThemeMappingOutput
 from .themefinder_logging import logger

 CONSULTATION_SYSTEM_PROMPT = load_prompt_from_file("consultation_system_prompt")
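The `models` module imported here is new in 0.6.2 (listed above with 138 added lines) but its body is not part of this diff. Since `SentimentAnalysisOutput` and `ThemeMappingOutput` are passed below as `task_validation_model` alongside `validation_check=True`, they are presumably per-row output schemas. A purely hypothetical sketch of the kind of model this implies; the field names are guesses, not the package's actual definitions:

    from pydantic import BaseModel

    class SentimentAnalysisOutput(BaseModel):
        # Hypothetical fields: response_id is implied by the pipeline's row tracking,
        # and a "position" column by theme_generation's partition_key="position" default.
        response_id: int
        position: str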
@@ -18,7 +19,7 @@ async def find_themes(
     target_n_themes: int | None = None,
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
     verbose: bool = True,
-) -> dict[str, pd.DataFrame]:
+) -> dict[str, str | pd.DataFrame]:
     """Process survey responses through a multi-stage theme analysis pipeline.

     This pipeline performs sequential analysis steps:
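Because the return type widens to `dict[str, str | pd.DataFrame]` and the per-stage keys are consolidated (see the hunks below), callers need a small migration: `condensed_themes` and `refined_themes` are gone, the final themes live under `themes`, and a new `unprocessables` key collects the rows the LLM failed on. A hedged sketch of updated caller code, to be run inside an async function:

    result = await find_themes(responses_df, llm, question, system_prompt=system_prompt)

    themes_df = result["themes"]          # previously result["refined_themes"]
    mapping_df = result["mapping"]
    failed_df = result["unprocessables"]  # new: concatenated unprocessable rows

    if not failed_df.empty:
        print(f"{len(failed_df)} responses could not be processed by the LLM")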
@@ -41,47 +42,46 @@ async def find_themes(
             Defaults to True.

     Returns:
-        dict[str, pd.DataFrame]: Dictionary containing results from each pipeline stage:
-            - question: The survey question
+        dict[str, str | pd.DataFrame]: Dictionary containing results from each pipeline stage:
+            - question: The survey question string
             - sentiment: DataFrame with sentiment analysis results
-            -
-            - condensed_topics: DataFrame with combined similar themes
-            - refined_topics: DataFrame with refined theme definitions
+            - themes: DataFrame with the final themes output
             - mapping: DataFrame mapping responses to final themes
+            - unprocessables: Dataframe containing the inputs that could not be processed by the LLM
     """
     logger.setLevel(logging.INFO if verbose else logging.CRITICAL)

-    sentiment_df = await sentiment_analysis(
+    sentiment_df, sentiment_unprocessables = await sentiment_analysis(
         responses_df,
         llm,
         question=question,
         system_prompt=system_prompt,
     )
-    theme_df = await theme_generation(
+    theme_df, _ = await theme_generation(
         sentiment_df,
         llm,
         question=question,
         system_prompt=system_prompt,
     )
-    condensed_theme_df = await theme_condensation(
+    condensed_theme_df, _ = await theme_condensation(
         theme_df, llm, question=question, system_prompt=system_prompt
     )
-    refined_theme_df = await theme_refinement(
+    refined_theme_df, _ = await theme_refinement(
         condensed_theme_df,
         llm,
         question=question,
         system_prompt=system_prompt,
     )
     if target_n_themes is not None:
-        refined_theme_df = await theme_target_alignment(
+        refined_theme_df, _ = await theme_target_alignment(
             refined_theme_df,
             llm,
             question=question,
             target_n_themes=target_n_themes,
             system_prompt=system_prompt,
         )
-    mapping_df = await theme_mapping(
-        sentiment_df,
+    mapping_df, mapping_unprocessables = await theme_mapping(
+        sentiment_df[["response_id", "response"]],
         llm,
         question=question,
         refined_themes_df=refined_theme_df,
@@ -95,10 +95,9 @@ async def find_themes(
     return {
         "question": question,
         "sentiment": sentiment_df,
-        "themes":
-        "condensed_themes": condensed_theme_df,
-        "refined_themes": refined_theme_df,
+        "themes": refined_theme_df,
         "mapping": mapping_df,
+        "unprocessables": pd.concat([sentiment_unprocessables, mapping_unprocessables]),
     }

@@ -109,7 +108,7 @@ async def sentiment_analysis(
     batch_size: int = 20,
     prompt_template: str | Path | PromptTemplate = "sentiment_analysis",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
-) -> pd.DataFrame:
+) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Perform sentiment analysis on survey responses using an LLM.

     This function processes survey responses in batches to analyze their sentiment
@@ -129,24 +128,29 @@ async def sentiment_analysis(
             Defaults to CONSULTATION_SYSTEM_PROMPT.

     Returns:
-        pd.DataFrame
-
+        tuple[pd.DataFrame, pd.DataFrame]:
+            A tuple containing two DataFrames:
+            - The first DataFrame contains the rows that were successfully processed by the LLM
+            - The second DataFrame contains the rows that could not be processed by the LLM

     Note:
-        The function uses
+        The function uses validation_check to ensure responses maintain
         their original order and association after processing.
     """
     logger.info(f"Running sentiment analysis on {len(responses_df)} responses")
-
+    processed_rows, unprocessable_rows = await batch_and_run(
         responses_df,
         prompt_template,
         llm,
         batch_size=batch_size,
         question=question,
-
+        validation_check=True,
+        task_validation_model=SentimentAnalysisOutput,
         system_prompt=system_prompt,
     )

+    return processed_rows, unprocessable_rows
+

 async def theme_generation(
     responses_df: pd.DataFrame,
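Calling a stage directly now means unpacking a `(processed, unprocessable)` pair of DataFrames rather than receiving a single one. For example, with the call shape inferred from the new signature (inside an async context):

    sentiment_df, failed_df = await sentiment_analysis(
        responses_df, llm, question=question, system_prompt=system_prompt
    )
    print(f"{len(sentiment_df)} rows processed, {len(failed_df)} unprocessable")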
@@ -156,7 +160,7 @@ async def theme_generation(
     partition_key: str | None = "position",
     prompt_template: str | Path | PromptTemplate = "theme_generation",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
-) -> pd.DataFrame:
+) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Generate themes from survey responses using an LLM.

     This function processes batches of survey responses to identify common themes or topics.
@@ -179,10 +183,14 @@ async def theme_generation(
             Defaults to CONSULTATION_SYSTEM_PROMPT.

     Returns:
-        pd.DataFrame
+        tuple[pd.DataFrame, pd.DataFrame]:
+            A tuple containing two DataFrames:
+            - The first DataFrame contains the rows that were successfully processed by the LLM
+            - The second DataFrame contains the rows that could not be processed by the LLM
+
     """
     logger.info(f"Running theme generation on {len(responses_df)} responses")
-
+    generated_themes, _ = await batch_and_run(
         responses_df,
         prompt_template,
         llm,
@@ -191,17 +199,18 @@ async def theme_generation(
         question=question,
         system_prompt=system_prompt,
     )
+    return generated_themes, _


 async def theme_condensation(
     themes_df: pd.DataFrame,
     llm: Runnable,
     question: str,
-    batch_size: int =
+    batch_size: int = 75,
     prompt_template: str | Path | PromptTemplate = "theme_condensation",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
     **kwargs,
-) -> pd.DataFrame:
+) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Condense and combine similar themes identified from survey responses.

     This function processes the initially identified themes to combine similar or
@@ -221,18 +230,21 @@ async def theme_condensation(
             Defaults to CONSULTATION_SYSTEM_PROMPT.

     Returns:
-        pd.DataFrame
-
+        tuple[pd.DataFrame, pd.DataFrame]:
+            A tuple containing two DataFrames:
+            - The first DataFrame contains the rows that were successfully processed by the LLM
+            - The second DataFrame contains the rows that could not be processed by the LLM
+
     """
     logger.info(f"Running theme condensation on {len(themes_df)} themes")
-    themes_df["response_id"] =
+    themes_df["response_id"] = themes_df.index + 1

     n_themes = themes_df.shape[0]
     while n_themes > batch_size:
         logger.info(
             f"{n_themes} larger than batch size, using recursive theme condensation"
         )
-        themes_df = await batch_and_run(
+        themes_df, _ = await batch_and_run(
             themes_df,
             prompt_template,
             llm,
@@ -242,13 +254,13 @@ async def theme_condensation(
             **kwargs,
         )
         themes_df = themes_df.sample(frac=1).reset_index(drop=True)
-        themes_df["response_id"] =
+        themes_df["response_id"] = themes_df.index + 1
         if len(themes_df) == n_themes:
             logger.info("Themes no longer being condensed")
             break
         n_themes = themes_df.shape[0]

-    themes_df = await batch_and_run(
+    themes_df, _ = await batch_and_run(
         themes_df,
         prompt_template,
         llm,
@@ -259,7 +271,7 @@ async def theme_condensation(
     )

     logger.info(f"Final number of condensed themes: {themes_df.shape[0]}")
-    return themes_df
+    return themes_df, _


 async def theme_refinement(
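The condensation hunks above are easiest to read as an algorithm: with the default `batch_size` now 75, a theme set larger than one batch is condensed, reshuffled, renumbered, and condensed again until either the count stops shrinking or it fits in a single batch, after which one final condensation pass runs. A simplified restatement of that control flow, where `condense_once` is a hypothetical stand-in for the awaited `batch_and_run` call:

    n_themes = len(themes_df)
    while n_themes > batch_size:  # default batch_size is now 75
        themes_df = condense_once(themes_df)  # stand-in for batch_and_run(...)
        themes_df = themes_df.sample(frac=1).reset_index(drop=True)  # reshuffle between passes
        themes_df["response_id"] = themes_df.index + 1  # renumber rows 1..n
        if len(themes_df) == n_themes:  # nothing merged this pass; stop recursing
            break
        n_themes = len(themes_df)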
@@ -269,7 +281,7 @@ async def theme_refinement(
     batch_size: int = 10000,
     prompt_template: str | Path | PromptTemplate = "theme_refinement",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
-) -> pd.DataFrame:
+) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Refine and standardize condensed themes using an LLM.

     This function processes previously condensed themes to create clear, standardized
@@ -286,15 +298,15 @@ async def theme_refinement(
             Defaults to 10000.
         prompt_template (str | Path | PromptTemplate, optional): Template for structuring
             the prompt to the LLM. Can be a string identifier, path to template file,
-            or PromptTemplate instance. Defaults to "
+            or PromptTemplate instance. Defaults to "theme_refinement".
         system_prompt (str): System prompt to guide the LLM's behavior.
             Defaults to CONSULTATION_SYSTEM_PROMPT.

     Returns:
-        pd.DataFrame
-
-
-
+        tuple[pd.DataFrame, pd.DataFrame]:
+            A tuple containing two DataFrames:
+            - The first DataFrame contains the rows that were successfully processed by the LLM
+            - The second DataFrame contains the rows that could not be processed by the LLM

     Note:
         The function adds sequential response_ids to the input DataFrame and
@@ -302,16 +314,9 @@ async def theme_refinement(
         processing.
     """
    logger.info(f"Running theme refinement on {len(condensed_themes_df)} responses")
-    condensed_themes_df["response_id"] =
+    condensed_themes_df["response_id"] = condensed_themes_df.index + 1

-
-        """Transpose topics for increased legibility."""
-        transposed_df = pd.DataFrame(
-            [refined_themes["topic"].to_numpy()], columns=refined_themes["topic_id"]
-        )
-        return transposed_df
-
-    refined_themes = await batch_and_run(
+    refined_themes, _ = await batch_and_run(
         condensed_themes_df,
         prompt_template,
         llm,
@@ -319,7 +324,7 @@ async def theme_refinement(
         question=question,
         system_prompt=system_prompt,
     )
-    return
+    return refined_themes, _


 async def theme_target_alignment(
@@ -330,7 +335,7 @@ async def theme_target_alignment(
     batch_size: int = 10000,
     prompt_template: str | Path | PromptTemplate = "theme_target_alignment",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
-) -> pd.DataFrame:
+) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Align themes to target number using an LLM.

     This function processes refined themes to consolidate them into a target number of
@@ -354,10 +359,10 @@ async def theme_target_alignment(
             Defaults to CONSULTATION_SYSTEM_PROMPT.

     Returns:
-        pd.DataFrame
-
-
-
+        tuple[pd.DataFrame, pd.DataFrame]:
+            A tuple containing two DataFrames:
+            - The first DataFrame contains the rows that were successfully processed by the LLM
+            - The second DataFrame contains the rows that could not be processed by the LLM

     Note:
         The function adds sequential response_ids to the input DataFrame and
@@ -365,19 +370,10 @@ async def theme_target_alignment(
         processing.
     """
     logger.info(
-        f"Running theme target alignment on {len(refined_themes_df
+        f"Running theme target alignment on {len(refined_themes_df)} themes compressing to {target_n_themes} themes"
     )
-    refined_themes_df = refined_themes_df.
-
-
-    def transpose_aligned_themes(aligned_themes: pd.DataFrame):
-        """Transpose topics for increased legibility."""
-        transposed_df = pd.DataFrame(
-            [aligned_themes["topic"].to_numpy()], columns=aligned_themes["topic_id"]
-        )
-        return transposed_df
-
-    aligned_themes = await batch_and_run(
+    refined_themes_df["response_id"] = refined_themes_df.index + 1
+    aligned_themes, _ = await batch_and_run(
         refined_themes_df,
         prompt_template,
         llm,
@@ -386,7 +382,7 @@ async def theme_target_alignment(
         system_prompt=system_prompt,
         target_n_themes=target_n_themes,
     )
-    return
+    return aligned_themes, _


 async def theme_mapping(
@@ -397,7 +393,7 @@ async def theme_mapping(
     batch_size: int = 20,
     prompt_template: str | Path | PromptTemplate = "theme_mapping",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
-) -> pd.DataFrame:
+) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Map survey responses to refined themes using an LLM.

     This function analyzes each survey response and determines which of the refined
@@ -419,19 +415,34 @@ async def theme_mapping(
             Defaults to CONSULTATION_SYSTEM_PROMPT.

     Returns:
-        pd.DataFrame
-
+        tuple[pd.DataFrame, pd.DataFrame]:
+            A tuple containing two DataFrames:
+            - The first DataFrame contains the rows that were successfully processed by the LLM
+            - The second DataFrame contains the rows that could not be processed by the LLM
+
     """
     logger.info(
-        f"Running theme mapping on {len(responses_df)} responses using {len(refined_themes_df
+        f"Running theme mapping on {len(responses_df)} responses using {len(refined_themes_df)} themes"
     )
-
+
+    def transpose_refined_themes(refined_themes: pd.DataFrame):
+        """Transpose topics for increased legibility."""
+        transposed_df = pd.DataFrame(
+            [refined_themes["topic"].to_numpy()], columns=refined_themes["topic_id"]
+        )
+        return transposed_df
+
+    mapping, _ = await batch_and_run(
         responses_df,
         prompt_template,
         llm,
         batch_size=batch_size,
         question=question,
-        refined_themes=refined_themes_df.to_dict(
-
+        refined_themes=transpose_refined_themes(refined_themes_df).to_dict(
+            orient="records"
+        ),
+        validation_check=True,
+        task_validation_model=ThemeMappingOutput,
         system_prompt=system_prompt,
     )
+    return mapping, _
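The `transpose_refined_themes` helper now lives in `theme_mapping` (0.5.3 defined equivalent helpers inside `theme_refinement` and `theme_target_alignment`, removed above). It pivots the themes into a single row keyed by `topic_id`, which is the shape the mapping prompt receives via `to_dict(orient="records")`. A toy illustration with made-up data:

    import pandas as pd

    refined = pd.DataFrame(
        {"topic_id": ["A", "B"], "topic": ["Cycling infrastructure", "Bus affordability"]}
    )
    transposed = pd.DataFrame([refined["topic"].to_numpy()], columns=refined["topic_id"])
    # One row, one column per topic_id:
    # to_dict(orient="records") -> [{"A": "Cycling infrastructure", "B": "Bus affordability"}]
    print(transposed.to_dict(orient="records"))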