PyPI - themefinder - Versions diffs - 0.7.1__tar.gz → 0.7.2__tar.gz - Mend

themefinder 0.7.1.tar.gz → 0.7.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of themefinder might be problematic. Click here for more details.

Files changed (19)

{themefinder-0.7.1 → themefinder-0.7.2}/PKG-INFO RENAMED Viewed

@@ -1,8 +1,9 @@
-Metadata-Version: 2.3
+Metadata-Version: 2.4
 Name: themefinder
-Version: 0.7.1
+Version: 0.7.2
 Summary: A topic modelling Python package designed for analysing one-to-many question-answer data eg free-text survey responses.
 License: MIT
+License-File: LICENCE
 Author: i.AI
 Author-email: packages@cabinetoffice.gov.uk
 Requires-Python: >=3.10,<3.13

{themefinder-0.7.1 → themefinder-0.7.2}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "themefinder"
-version = "0.7.1"
+version = "0.7.2"
 description = "A topic modelling Python package designed for analysing one-to-many question-answer data eg free-text survey responses."
 authors = ["i.AI <packages@cabinetoffice.gov.uk>"]
 packages = [{include = "themefinder", from = "src"}]

{themefinder-0.7.1 → themefinder-0.7.2}/src/themefinder/core.py RENAMED Viewed

@@ -186,7 +186,7 @@ async def theme_generation(
     llm: RunnableWithFallbacks,
     question: str,
     batch_size: int = 50,
-    partition_key: str | None = "position",
+    partition_key: str | None = None,
     prompt_template: str | Path | PromptTemplate = "theme_generation",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
     concurrency: int = 10,
@@ -317,6 +317,7 @@ def theme_clustering(
     target_themes: int = 10,
     significance_percentage: float = 10.0,
     return_all_themes: bool = False,
+    system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Perform hierarchical clustering of themes using an agentic approach.
@@ -340,6 +341,8 @@ def theme_clustering(
             selecting significant themes. Defaults to 10.0.
         return_all_themes (bool, optional): If True, returns all clustered themes.
             If False, returns only significant themes. Defaults to False.
+        system_prompt (str): System prompt to guide the LLM's behavior.
+            Defaults to CONSULTATION_SYSTEM_PROMPT.
     Returns:
         tuple[pd.DataFrame, pd.DataFrame]:
@@ -362,7 +365,10 @@ def theme_clustering(
     # Initialize clustering agent with structured output LLM
     agent = ThemeClusteringAgent(
-        llm.with_structured_output(HierarchicalClusteringResponse), initial_themes
+        llm.with_structured_output(HierarchicalClusteringResponse),
+        initial_themes,
+        system_prompt,
+        target_themes,
     )
     # Perform clustering
@@ -444,6 +450,32 @@ async def theme_refinement(
         system_prompt=system_prompt,
         concurrency=concurrency,
     )
+    def assign_sequential_topic_ids(df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Assigns sequential alphabetic topic_ids (A, B, ..., Z, AA, AB, ...) to the DataFrame.
+        """
+        def alpha_ids(n: int) -> list[str]:
+            ids = []
+            for i in range(n):
+                s = ""
+                x = i
+                while True:
+                    x, r = divmod(x, 26)
+                    s = chr(65 + r) + s
+                    if x == 0:
+                        break
+                    x -= 1
+                ids.append(s)
+            return ids
+        if not df.empty:
+            df["topic_id"] = alpha_ids(len(df))
+        return df
+    refined_themes = assign_sequential_topic_ids(refined_themes)
     return refined_themes, _

{themefinder-0.7.1 → themefinder-0.7.2}/src/themefinder/models.py RENAMED Viewed

@@ -217,9 +217,6 @@ class ThemeCondensationResponses(ValidatedModel):
 class RefinedTheme(ValidatedModel):
     """Model for a single refined theme"""
-    topic_id: str = Field(
-        ..., description="Single uppercase letter ID (A-Z, then AA, AB, etc.)"
-    )
     topic: str = Field(
         ..., description="Topic label and description combined with a colon separator"
     )
@@ -231,19 +228,9 @@ class RefinedTheme(ValidatedModel):
     def run_validations(self) -> "RefinedTheme":
         """Run all validations for RefinedTheme"""
         self.validate_non_empty_fields()
-        self.validate_topic_id_format()
         self.validate_topic_format()
         return self
-    def validate_topic_id_format(self) -> "RefinedTheme":
-        """
-        Validate that topic_id follows the expected format (A-Z, then AA, AB, etc.).
-        """
-        topic_id = self.topic_id.strip()
-        if not topic_id.isupper() or not topic_id.isalpha():
-            raise ValueError(f"topic_id must be uppercase letters only: {topic_id}")
-        return self
     def validate_topic_format(self) -> "RefinedTheme":
         """
         Validate that topic contains a label and description separated by a colon.
@@ -273,9 +260,6 @@ class ThemeRefinementResponses(ValidatedModel):
     def run_validations(self) -> "ThemeRefinementResponses":
         """Ensure there are no duplicate themes"""
         self.validate_non_empty_fields()
-        topic_ids = [theme.topic_id for theme in self.responses]
-        if len(topic_ids) != len(set(topic_ids)):
-            raise ValueError("Duplicate topic_ids detected")
         topics = [theme.topic.lower().strip() for theme in self.responses]
         if len(topics) != len(set(topics)):
             raise ValueError("Duplicate topics detected")
@@ -288,10 +272,6 @@ class ThemeMappingOutput(ValidatedModel):
     response_id: int = Field(gt=0, description="Response ID, must be greater than 0")
     labels: List[str] = Field(..., description="List of theme labels")
-    reasons: List[str] = Field(..., description="List of reasons for mapping")
-    stances: List[Stance] = Field(
-        ..., description="List of stances (POSITIVE or NEGATIVE)"
-    )
     @model_validator(mode="after")
     def run_validations(self) -> "ThemeMappingOutput":
@@ -299,7 +279,6 @@ class ThemeMappingOutput(ValidatedModel):
         Run all validations for ThemeMappingOutput.
         """
         self.validate_non_empty_fields()
-        self.validate_equal_lengths("stances", "labels", "reasons")
         self.validate_unique_items("labels")
         return self

{themefinder-0.7.1 → themefinder-0.7.2}/src/themefinder/prompts/agentic_theme_clustering.txt RENAMED Viewed

@@ -1,3 +1,5 @@
+{system_prompt}
 Analyze these topics and identify which ones should be merged based on semantic similarity.
 Your goal is to significantly reduce the number of topics by creating meaningful parent topics.
 Be aggressive in finding opportunities to merge topics that share any semantic relationship.
@@ -22,10 +24,11 @@ Guidelines:
 - source_topic_count must be the sum of all child topic counts
 - children must be a list of valid topic_ids from the input
 - should_terminate should only be true if ALL of these conditions are met:
-    * There are fewer than 10 active topics remaining
+    * There are fewer than {target_themes} active topics remaining
     * The remaining topics are fundamentally incompatible semantically
     * Any further merging would create meaninglessly broad categories
 If no topics should be merged in this iteration but future iterations might still yield meaningful merges, set should_terminate to false with an empty parent_themes list.
+If no topics should be merged and the termination conditions are met, set should_terminate to true with an empty parent_themes list.
-If no topics should be merged and the termination conditions are met, set should_terminate to true with an empty parent_themes list.
+N.B. Under no circumstances should you create a parent theme with a single child. You do not need to return all of the original themes, if they don't belong to a newly created parent feel free to omit them.

{themefinder-0.7.1 → themefinder-0.7.2}/src/themefinder/prompts/theme_condensation.txt RENAMED Viewed

@@ -1,11 +1,15 @@
 {system_prompt}
-Below is a question and a list of topics extracted from answers to that question. Each topic has a topic_label, topic_description, and may have a source_topic_count field indicating how many original topics it represents.
+Below is a question and a list of topics extracted from answers to that question.
+This list contains a large number of duplicate and redundant topics that present the same concept with different phrasing.
+Each topic has a topic_label, topic_description, and may have a source_topic_count field indicating how many original topics it represents.
 Your task is to analyze these topics and produce a refined list that:
-1. Identifies and preserves core themes that appear frequently
-2. Combines redundant topics while maintaining nuanced differences
-3. Ensures the final list represents the full spectrum of viewpoints present in the original data
+1. Significantly reduces the total number of topics
+2. Identifies and preserves core themes that appear frequently
+3. Combines redundant topics
 4. Tracks the total number of original topics combined into each new topic
 Guidelines for Topic Analysis:

{themefinder-0.7.1 → themefinder-0.7.2}/src/themefinder/prompts/theme_mapping.txt RENAMED Viewed

@@ -16,10 +16,6 @@ Your task is to analyze each response and decide which topics are present. Guide
     - Each response can be assigned to multiple topics if it matches more than one topic from the TOPIC LIST.
     - Each topic can only be assigned once per response, if the topic is mentioned more than once use the first mention for reasoning and stance.
     - There is no limit on how many topics can be assigned to a response.
-    - For each assignment provide a single rationale for why you have chosen the label.
-    - For each topic identified in a response, indicate whether the response expresses a positive or negative stance toward that topic (options: 'POSITIVE' or 'NEGATIVE')
-    - You MUST use either 'POSITIVE' or 'NEGATIVE'
-    - The order of reasons and stances must align with the order of labels (e.g., stance_a applies to topic_a)
 You MUST include every response ID in the output.
 If the response can not be labelled return empty sections where appropriate but you MUST return an entry

{themefinder-0.7.1 → themefinder-0.7.2}/src/themefinder/prompts/theme_refinement.txt RENAMED Viewed

@@ -7,10 +7,9 @@ You will receive a list of TOPICS. These topics explicitly tie opinions to wheth
 ## Output
 You will produce a list of CLEAR STANCE TOPICS based on the input. Each topic should have four parts:
-1. A topic_id that is an uppercase letter (starting from 'A', for the 27th element use AA)
-2. A brief, clear topic label (3-7 words)
-3. A more detailed topic description (1-2 sentences)
-4. The source_topic_count field should be included for each topic and should reflect the number of original source topics that were merged to create this refined topic. If multiple source topics were combined, sum their individual counts. If only one source topic was used, simply retain its original count value.
+1. A brief, clear topic label (3-7 words)
+2. A more detailed topic description (1-2 sentences)
+3. The source_topic_count field should be included for each topic and should reflect the number of original source topics that were merged to create this refined topic. If multiple source topics were combined, sum their individual counts. If only one source topic was used, simply retain its original count value.
 ## Guidelines
@@ -46,11 +45,10 @@ You will produce a list of CLEAR STANCE TOPICS based on the input. Each topic sh
 2. Group closely related topics together.
 3. For each group or individual topic:
    a. Distill the core concept, removing any bias or opinion.
-   b. Create a neutral, concise topic label.
+   b. Create a concise topic label.
    c. Write a more detailed description that provides context without taking sides.
 4. Review the entire list to ensure distinctiveness and adjust as needed.
-5. Assign each output topic a topic_id that is an uppercase letter (starting from 'A', for the 27th element use AA)
-6. Combine the topic label and description with a colon separator
+5. Combine the topic label and description with a colon separator
 TOPICS:
 {responses}

{themefinder-0.7.1 → themefinder-0.7.2}/src/themefinder/theme_clustering_agent.py RENAMED Viewed

@@ -22,6 +22,8 @@ from .models import ThemeNode
 from .llm_batch_processor import load_prompt_from_file
 from .themefinder_logging import logger
+CONSULTATION_SYSTEM_PROMPT = load_prompt_from_file("consultation_system_prompt")
 class ThemeClusteringAgent:
     """Agent for performing hierarchical clustering of topics using language models.
@@ -37,13 +39,21 @@ class ThemeClusteringAgent:
         current_iteration: Current iteration number in the clustering process
     """
-    def __init__(self, llm: Runnable, themes: List[ThemeNode]) -> None:
+    def __init__(
+        self,
+        llm: Runnable,
+        themes: List[ThemeNode],
+        system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
+        target_themes: int = 10,
+    ) -> None:
         """Initialize the clustering agent with an LLM and initial themes.
         Args:
             llm: Language model instance configured with structured output
                 for HierarchicalClusteringResponse
             themes: List of ThemeNode objects to be clustered
+            system_prompt: System prompt to guide the LLM's behavior
+            target_themes: Target number of themes to cluster down to (default 10)
         """
         self.llm = llm
         self.themes: Dict[str, ThemeNode] = {}
@@ -51,6 +61,8 @@ class ThemeClusteringAgent:
             self.themes[theme.topic_id] = theme
         self.active_themes = set(self.themes.keys())
         self.current_iteration = 0
+        self.system_prompt = system_prompt
+        self.target_themes = target_themes
     def _format_prompt(self) -> str:
         """Format the clustering prompt with current active themes.
@@ -74,7 +86,10 @@ class ThemeClusteringAgent:
         # Load the clustering prompt template
         prompt_template = load_prompt_from_file("agentic_theme_clustering")
         return prompt_template.format(
-            themes_json=themes_json, iteration=self.current_iteration
+            themes_json=themes_json,
+            iteration=self.current_iteration,
+            system_prompt=self.system_prompt,
+            target_themes=self.target_themes,
         )
     @retry(
@@ -102,11 +117,20 @@ class ThemeClusteringAgent:
         """
         prompt = self._format_prompt()
         response = self.llm.invoke(prompt)
-        # The response is already a parsed dictionary when using with_structured_output
-        result = response
-        for i, parent in enumerate(result["parent_themes"]):
-            new_theme_id = f"{chr(65 + i)}_{self.current_iteration}"
-            children = [c for c in parent["children"] if c in self.active_themes]
+        for i, parent in enumerate(response.parent_themes):
+            def to_alpha(idx: int) -> str:
+                """Convert 0-based integer to Excel-style column name (A, B, ..., Z, AA, AB, ...) without divmod."""
+                idx += 1  # 1-based for Excel logic
+                result = []
+                while idx > 0:
+                    rem = (idx - 1) % 26
+                    result.append(chr(65 + rem))
+                    idx = (idx - 1) // 26
+                return "".join(reversed(result))
+            new_theme_id = f"{to_alpha(i)}_{self.current_iteration}"
+            children = [c for c in parent.children if c in self.active_themes]
             for child in children:
                 self.themes[child].parent_id = new_theme_id
             total_source_count = sum(
@@ -114,8 +138,8 @@ class ThemeClusteringAgent:
             )
             new_theme = ThemeNode(
                 topic_id=new_theme_id,
-                topic_label=parent["topic_label"],
-                topic_description=parent["topic_description"],
+                topic_label=parent.topic_label,
+                topic_description=parent.topic_description,
                 source_topic_count=total_source_count,
                 children=children,
             )