themefinder 0.6.3__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of themefinder might be problematic. Click here for more details.

themefinder/__init__.py CHANGED
@@ -1,6 +1,7 @@
1
1
  from .core import (
2
2
  find_themes,
3
3
  sentiment_analysis,
4
+ theme_clustering,
4
5
  theme_condensation,
5
6
  theme_generation,
6
7
  theme_mapping,
@@ -12,11 +13,12 @@ from .core import (
12
13
  __all__ = [
13
14
  "find_themes",
14
15
  "sentiment_analysis",
15
- "theme_generation",
16
+ "theme_clustering",
16
17
  "theme_condensation",
18
+ "theme_generation",
19
+ "theme_mapping",
17
20
  "theme_refinement",
18
21
  "theme_target_alignment",
19
- "theme_mapping",
20
22
  "detail_detection",
21
23
  ]
22
24
  __version__ = "0.1.0"
themefinder/core.py CHANGED
@@ -5,16 +5,19 @@ import pandas as pd
5
5
  from langchain_core.prompts import PromptTemplate
6
6
  from langchain.schema.runnable import RunnableWithFallbacks
7
7
 
8
- from .llm_batch_processor import batch_and_run, load_prompt_from_file
9
- from .models import (
8
+ from themefinder.llm_batch_processor import batch_and_run, load_prompt_from_file
9
+ from themefinder.models import (
10
10
  SentimentAnalysisResponses,
11
11
  ThemeGenerationResponses,
12
12
  ThemeCondensationResponses,
13
13
  ThemeRefinementResponses,
14
14
  ThemeMappingResponses,
15
15
  DetailDetectionResponses,
16
+ HierarchicalClusteringResponse,
17
+ ThemeNode,
16
18
  )
17
- from .themefinder_logging import logger
19
+ from themefinder.theme_clustering_agent import ThemeClusteringAgent
20
+ from themefinder.themefinder_logging import logger
18
21
 
19
22
  CONSULTATION_SYSTEM_PROMPT = load_prompt_from_file("consultation_system_prompt")
20
23
 
@@ -114,9 +117,7 @@ async def find_themes(
114
117
  )
115
118
 
116
119
  logger.info("Finished finding themes")
117
- logger.info(
118
- "Provide feedback or report bugs: https://forms.gle/85xUSMvxGzSSKQ499 or packages@cabinetoffice.gov.uk"
119
- )
120
+ logger.info("Provide feedback or report bugs: packages@cabinetoffice.gov.uk")
120
121
  return {
121
122
  "question": question,
122
123
  "sentiment": sentiment_df,
@@ -309,6 +310,87 @@ async def theme_condensation(
309
310
  return themes_df, _
310
311
 
311
312
 
313
def theme_clustering(
    themes_df: pd.DataFrame,
    llm: RunnableWithFallbacks,
    max_iterations: int = 5,
    target_themes: int = 10,
    significance_percentage: float = 10.0,
    return_all_themes: bool = False,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Cluster themes hierarchically with a ThemeClusteringAgent.

    Each row of ``themes_df`` becomes a ThemeNode; the agent then merges the
    nodes iteratively and, unless ``return_all_themes`` is set, the result is
    narrowed to the themes the agent considers significant.

    Args:
        themes_df (pd.DataFrame): One row per theme. The columns read are:
            - topic_id
            - topic_label
            - topic_description
            - source_topic_count
        llm (RunnableWithFallbacks): Language model; it is bound to the
            HierarchicalClusteringResponse schema via
            ``with_structured_output`` before being handed to the agent.
        max_iterations (int, optional): Forwarded to
            ``ThemeClusteringAgent.cluster_themes``. Defaults to 5.
        target_themes (int, optional): Forwarded to
            ``ThemeClusteringAgent.cluster_themes``. Defaults to 10.
        significance_percentage (float, optional): Forwarded to
            ``ThemeClusteringAgent.select_themes`` when
            ``return_all_themes`` is False. Defaults to 10.0.
        return_all_themes (bool, optional): When True, return every
            clustered theme instead of only the selected ones.
            Defaults to False.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: The clustered (or selected)
        themes, followed by an always-empty DataFrame that keeps the
        return shape in line with the other pipeline stages.
    """
    logger.info(f"Starting hierarchical clustering of {len(themes_df)} themes")

    # One ThemeNode per input row, built from the four required columns.
    node_columns = (
        "topic_id",
        "topic_label",
        "topic_description",
        "source_topic_count",
    )
    initial_themes = []
    for _, row in themes_df.iterrows():
        node_kwargs = {column: row[column] for column in node_columns}
        initial_themes.append(ThemeNode(**node_kwargs))

    # The agent expects an LLM that already emits the clustering schema.
    structured_llm = llm.with_structured_output(HierarchicalClusteringResponse)
    agent = ThemeClusteringAgent(structured_llm, initial_themes)

    logger.info(
        f"Clustering themes with max_iterations={max_iterations}, target_themes={target_themes}"
    )
    all_themes_df = agent.cluster_themes(
        max_iterations=max_iterations, target_themes=target_themes
    )

    # Caller asked for the full hierarchy: skip the significance filter.
    if return_all_themes:
        logger.info(
            f"Clustering complete: returning all {len(all_themes_df)} clustered themes"
        )
        return all_themes_df, pd.DataFrame()

    logger.info(
        f"Selecting themes with significance_percentage={significance_percentage}%"
    )
    selected_themes_df = agent.select_themes(significance_percentage)
    logger.info(
        f"Clustering complete: returning {len(selected_themes_df)} significant themes"
    )
    return selected_themes_df, pd.DataFrame()
392
+
393
+
312
394
  async def theme_refinement(
313
395
  condensed_themes_df: pd.DataFrame,
314
396
  llm: RunnableWithFallbacks,
@@ -19,7 +19,7 @@ from tenacity import (
19
19
  wait_random_exponential,
20
20
  )
21
21
 
22
- from .themefinder_logging import logger
22
+ from themefinder.themefinder_logging import logger
23
23
 
24
24
 
25
25
  @dataclass
themefinder/models.py CHANGED
@@ -349,3 +349,67 @@ class DetailDetectionResponses(ValidatedModel):
349
349
  if len(response_ids) != len(set(response_ids)):
350
350
  raise ValueError("Response IDs must be unique")
351
351
  return self
352
+
353
+
354
class ThemeNode(ValidatedModel):
    """A single topic in the clustering hierarchy.

    Leaf nodes have an empty ``children`` list; merged (parent) nodes list
    the topic_ids they absorbed.
    """

    # Identifier of this topic.
    topic_id: str = Field(
        ...,
        description="Short alphabetic ID (e.g. 'A', 'B', 'C') - iteration prefix will be added automatically",
    )
    # Short human-readable label.
    topic_label: str = Field(
        ..., description="4-5 word label encompassing merged child topics"
    )
    # Longer free-text description.
    topic_description: str = Field(
        ..., description="1-2 sentences combining key aspects of child topics"
    )
    # Must be strictly positive (gt=0).
    source_topic_count: int = Field(gt=0, description="Sum of all child topic counts")
    # None until a parent is assigned.
    parent_id: Optional[str] = Field(
        default=None,
        description="Internal field: ID of parent topic node, managed by clustering agent, not set by LLM",
    )
    # Empty for leaf topics.
    children: List[str] = Field(
        default_factory=list, description="List of topic_ids of merged child topics"
    )

    @model_validator(mode="after")
    def run_validations(self) -> "ThemeNode":
        """Check the children list of a merged node.

        Returns:
            ThemeNode: ``self``, unchanged, when validation passes.

        Raises:
            ValueError: If ``children`` is non-empty but holds fewer than two
                IDs, or holds the same ID more than once.
        """
        child_ids = self.children

        # Leaf nodes carry no children and need no further checks.
        if not child_ids:
            return self

        # A merge of a single topic is not a merge.
        if len(child_ids) < 2:
            raise ValueError("Each topic node must have at least 2 children")

        # The same child may not be listed twice under one parent.
        if len(set(child_ids)) != len(child_ids):
            raise ValueError("Child topic IDs must be unique")

        return self
388
+
389
+
390
class HierarchicalClusteringResponse(ValidatedModel):
    """Structured output of one clustering round.

    Holds the parent themes proposed in the round plus a flag saying whether
    further clustering is considered worthwhile.
    """

    # Parents proposed in this round; defaults to an empty list.
    parent_themes: List[ThemeNode] = Field(
        default=[],
        description="List of parent themes created by merging similar themes",
    )
    # Required flag.
    should_terminate: bool = Field(
        ...,
        description="True if no more meaningful clustering is possible, false otherwise",
    )

    @model_validator(mode="after")
    def run_validations(self) -> "HierarchicalClusteringResponse":
        """Validate the response as a whole.

        Returns:
            HierarchicalClusteringResponse: ``self`` when validation passes.

        Raises:
            ValueError: If any child topic ID is listed more than once
                across the proposed parents.
        """
        # NOTE(review): validate_non_empty_fields is inherited and not visible
        # here - confirm it accepts an empty parent_themes list, which the
        # clustering prompt explicitly allows.
        self.validate_non_empty_fields()

        # Flatten every parent's children into one list; any repeat means a
        # child was assigned to more than one parent (or listed twice).
        child_ids = [
            child_id for parent in self.parent_themes for child_id in parent.children
        ]
        if len(set(child_ids)) != len(child_ids):
            raise ValueError("Each child theme can have at most one parent")

        return self
@@ -0,0 +1,31 @@
1
+ Analyze these topics and identify which ones should be merged based on semantic similarity.
2
+ Your goal is to significantly reduce the number of topics by creating meaningful parent topics.
3
+ Be aggressive in finding opportunities to merge topics that share any semantic relationship.
4
+
5
+ TOPICS:
6
+ {themes_json}
7
+
8
+ For each group of similar topics that should be merged, create a new parent topic.
9
+
10
+ Guidelines:
11
+ - Each parent topic must have at least 2 children, it can have more than 2 if appropriate
12
+ - Each child topic can have at most 1 parent
13
+ - topic_id should be a simple alphabetic ID (e.g. 'A', 'B', 'C') - the iteration prefix will be added automatically
14
+ - Be creative and look for higher-level abstractions that can combine seemingly different topics
15
+ - When creating parent topics, follow these naming rules:
16
+ * The label should read naturally as a single coherent topic
17
+ * Choose labels that can encompass broader categories of topics
18
+ * If merging different topics, the topic with the higher source_topic_count should dominate the label
19
+ * Never combine different topics with "and" or "/" in the label
20
+ - topic_description must be 1 or 2 sentences that:
21
+ * preserves key information from the child topics
22
+ - source_topic_count must be the sum of all child topic counts
23
+ - children must be a list of valid topic_ids from the input
24
+ - should_terminate should only be true if ALL of these conditions are met:
25
+ * There are fewer than 10 active topics remaining
26
+ * The remaining topics are fundamentally incompatible semantically
27
+ * Any further merging would create meaninglessly broad categories
28
+
29
+ If no topics should be merged in this iteration but future iterations might still yield meaningful merges, set should_terminate to false with an empty parent_themes list.
30
+
31
+ If no topics should be merged and the termination conditions are met, set should_terminate to true with an empty parent_themes list.
@@ -6,10 +6,11 @@ You are tasked with refining a list of topics generated from responses to a ques
6
6
  You will receive a list of TOPICS. These topics explicitly tie opinions to whether a person agrees or disagrees with the question.
7
7
 
8
8
  ## Output
9
- You will produce a list of CLEAR STANCE TOPICS based on the input. Each topic should have two parts:
10
- 1. A brief, clear topic label (3-7 words)
11
- 2. A more detailed topic description (1-2 sentences)
12
- 3. The source_topic_count field should be included for each topic and should reflect the number of original source topics that were merged to create this refined topic. If multiple source topics were combined, sum their individual counts. If only one source topic was used, simply retain its original count value.
9
+ You will produce a list of CLEAR STANCE TOPICS based on the input. Each topic should have four parts:
10
+ 1. A topic_id that is an uppercase letter (starting from 'A', for the 27th element use AA)
11
+ 2. A brief, clear topic label (3-7 words)
12
+ 3. A more detailed topic description (1-2 sentences)
13
+ 4. The source_topic_count field should be included for each topic and should reflect the number of original source topics that were merged to create this refined topic. If multiple source topics were combined, sum their individual counts. If only one source topic was used, simply retain its original count value.
13
14
 
14
15
 
15
16
  ## Guidelines
@@ -48,7 +49,7 @@ You will produce a list of CLEAR STANCE TOPICS based on the input. Each topic sh
48
49
  b. Create a neutral, concise topic label.
49
50
  c. Write a more detailed description that provides context without taking sides.
50
51
  4. Review the entire list to ensure distinctiveness and adjust as needed.
51
- 5. Assign each output topic a topic_id a single uppercase letters (starting from 'A', for the 27th element use AA)
52
+ 5. Assign each output topic a topic_id that is an uppercase letter (starting from 'A', for the 27th element use AA)
52
53
  6. Combine the topic label and description with a colon separator
53
54
 
54
55
  TOPICS:
@@ -0,0 +1,332 @@
1
+ """Theme clustering agent for hierarchical topic organization.
2
+
3
+ This module provides the ThemeClusteringAgent class for performing iterative
4
+ hierarchical clustering of topics using a language model.
5
+ """
6
+
7
+ import json
8
+ import logging
9
+ from typing import Dict, List, Any
10
+
11
+ import pandas as pd
12
+ from langchain.schema.runnable import Runnable
13
+ from tenacity import (
14
+ before,
15
+ before_sleep_log,
16
+ retry,
17
+ stop_after_attempt,
18
+ wait_random_exponential,
19
+ )
20
+
21
+ from .models import ThemeNode
22
+ from .llm_batch_processor import load_prompt_from_file
23
+ from .themefinder_logging import logger
24
+
25
+
26
class ThemeClusteringAgent:
    """Agent for performing hierarchical clustering of topics using language models.

    The agent repeatedly asks an LLM which of the currently active themes
    should be merged, records each merge as a new parent ThemeNode, and
    finally hangs whatever is still active under a synthetic root node.

    Attributes:
        llm: Language model configured with structured output for clustering
        themes: Dictionary mapping topic IDs to ThemeNode objects
        active_themes: Set of topic IDs that are currently active for clustering
        current_iteration: Number of clustering iterations completed so far
    """

    # Topic ID reserved for the synthetic root created by cluster_themes().
    ROOT_ID = "0"

    def __init__(self, llm: Runnable, themes: List[ThemeNode]) -> None:
        """Initialize the clustering agent with an LLM and initial themes.

        Args:
            llm: Language model instance configured with structured output
                for HierarchicalClusteringResponse
            themes: List of ThemeNode objects to be clustered
        """
        self.llm = llm
        self.themes: Dict[str, ThemeNode] = {
            theme.topic_id: theme for theme in themes
        }
        self.active_themes = set(self.themes)
        self.current_iteration = 0

    @staticmethod
    def _summarize(node: ThemeNode) -> Dict[str, Any]:
        """Return the id/name/value dictionary used for selected nodes."""
        return {
            "id": node.topic_id,
            "name": node.topic_label,
            "value": node.source_topic_count,
        }

    def _root(self) -> ThemeNode:
        """Return the root node.

        Raises:
            KeyError: If cluster_themes() has not created the root yet.
        """
        try:
            return self.themes[self.ROOT_ID]
        except KeyError:
            raise KeyError(
                f"Root node '{self.ROOT_ID}' not found: call cluster_themes() first"
            ) from None

    def _format_prompt(self) -> str:
        """Format the clustering prompt with current active themes.

        Active themes are serialised to JSON in sorted topic_id order so the
        prompt is identical from run to run (iterating the raw set is not).
        source_topic_count is included because the prompt's labelling and
        summing rules refer to it.

        Returns:
            str: Formatted prompt string ready for LLM processing
        """
        themes_for_prompt = [
            {
                "topic_id": theme.topic_id,
                "topic_label": theme.topic_label,
                "topic_description": theme.topic_description,
                "source_topic_count": theme.source_topic_count,
            }
            for theme in (
                self.themes[active_id] for active_id in sorted(self.active_themes)
            )
        ]
        themes_json = json.dumps(themes_for_prompt, indent=2)

        # Load the clustering prompt template
        prompt_template = load_prompt_from_file("agentic_theme_clustering")
        return prompt_template.format(
            themes_json=themes_json, iteration=self.current_iteration
        )

    @retry(
        wait=wait_random_exponential(min=1, max=2),
        stop=stop_after_attempt(3),
        before=before.before_log(logger=logger, log_level=logging.DEBUG),
        before_sleep=before_sleep_log(logger, logging.ERROR),
        reraise=True,
    )
    def cluster_iteration(self) -> None:
        """Perform one iteration of hierarchical theme clustering.

        Asks the LLM which active themes to merge and applies the merges.
        All proposed parents are built and validated *before* any state is
        changed, so an exception (and the retry it triggers) never leaves
        the hierarchy half-updated. The call is retried up to 3 times.

        Proposed parents that keep fewer than two usable children (children
        that are unknown, inactive, repeated, or already claimed by an
        earlier parent in the same response) are skipped with a warning.

        Side Effects:
            - Creates new parent ThemeNode objects in self.themes
            - Updates parent_id relationships for child themes
            - Modifies self.active_themes set
            - Increments self.current_iteration
        """
        prompt = self._format_prompt()
        response = self.llm.invoke(prompt)
        # Depending on how structured output is configured the response may be
        # a pydantic model or a plain dict; normalise to dict-style access.
        result = response.model_dump() if hasattr(response, "model_dump") else response
        # NOTE(review): result["should_terminate"] is not consulted; the loop
        # in cluster_themes() stops only on iteration count or theme count.

        # Phase 1: build every new parent without touching agent state.
        new_parents: List[ThemeNode] = []
        claimed: set = set()
        for i, parent in enumerate(result["parent_themes"]):
            new_theme_id = f"{chr(65 + i)}_{self.current_iteration}"
            children = [
                child_id
                for child_id in dict.fromkeys(parent["children"])  # de-duplicate
                if child_id in self.active_themes and child_id not in claimed
            ]
            if len(children) < 2:
                logger.warning(
                    f"Skipping proposed parent {new_theme_id}: fewer than 2 valid children"
                )
                continue
            new_parents.append(
                ThemeNode(
                    topic_id=new_theme_id,
                    topic_label=parent["topic_label"],
                    topic_description=parent["topic_description"],
                    # Recomputed locally rather than trusting the LLM's sum.
                    source_topic_count=sum(
                        self.themes[child_id].source_topic_count
                        for child_id in children
                    ),
                    children=children,
                )
            )
            claimed.update(children)

        # Phase 2: commit. Nothing below is expected to raise.
        for new_theme in new_parents:
            self.themes[new_theme.topic_id] = new_theme
            self.active_themes.add(new_theme.topic_id)
            for child_id in new_theme.children:
                self.themes[child_id].parent_id = new_theme.topic_id
                self.active_themes.discard(child_id)
        self.current_iteration += 1

    def cluster_themes(
        self, max_iterations: int = 5, target_themes: int = 5
    ) -> pd.DataFrame:
        """Perform hierarchical clustering to reduce themes to target number.

        Runs cluster_iteration() until ``max_iterations`` iterations have
        completed or no more than ``target_themes`` themes remain active,
        then attaches all remaining active themes to a root node.

        Args:
            max_iterations: Maximum number of clustering iterations to perform
            target_themes: Target number of themes to cluster down to

        Returns:
            pd.DataFrame: DataFrame containing all theme nodes (excluding root)
                with their hierarchical relationships and metadata
        """
        logger.info(f"Starting clustering with {len(self.active_themes)} active themes")
        # current_iteration counts completed iterations, so a strict "<"
        # yields exactly max_iterations rounds.
        while (
            self.current_iteration < max_iterations
            and len(self.active_themes) > target_themes
        ):
            self.cluster_iteration()
            logger.info(
                f"After {self.current_iteration} iterations {len(self.active_themes)} active themes remaining"
            )

        top_level_ids = sorted(self.active_themes)
        # The root is a bookkeeping node, not a merge proposed by the LLM, so
        # it is built without validation: it may legitimately have fewer than
        # two children (or a zero count when there are no themes at all).
        root_node = ThemeNode.model_construct(
            topic_id=self.ROOT_ID,
            topic_label="All Topics",
            topic_description="",
            source_topic_count=sum(
                self.themes[theme_id].source_topic_count for theme_id in top_level_ids
            ),
            children=top_level_ids,
        )
        self.themes[self.ROOT_ID] = root_node
        for theme_id in top_level_ids:
            self.themes[theme_id].parent_id = self.ROOT_ID

        # Convert all themes (except root) to DataFrame
        theme_nodes_dicts = [
            node.model_dump()
            for node in self.themes.values()
            if node.topic_id != self.ROOT_ID
        ]
        return pd.DataFrame(theme_nodes_dicts)

    def convert_themes_to_tree_json(self) -> str:
        """Convert themes into a hierarchical JSON structure for visualization.

        Builds a nested structure starting from the root node (ID '0'); each
        entry carries id, name, description, value and its children.

        Returns:
            str: JSON string representing the hierarchical tree structure

        Raises:
            KeyError: If cluster_themes() has not been called yet.
        """

        def build_tree(node: ThemeNode) -> Dict[str, Any]:
            return {
                "id": node.topic_id,
                "name": node.topic_label,
                "description": node.topic_description,
                "value": node.source_topic_count,
                "children": [
                    build_tree(self.themes[child_id])
                    for child_id in node.children
                    if child_id in self.themes
                ],
            }

        return json.dumps(build_tree(self._root()), indent=2)

    def select_significant_themes(
        self, significance_threshold: int, total_responses: int
    ) -> Dict[str, Any]:
        """Select significant themes using depth-first traversal.

        Walks the hierarchy from the root and collects nodes according to
        the rules described in _traverse_tree().

        Args:
            significance_threshold: Minimum source_topic_count for significance
            total_responses: Total number of responses; echoed back unchanged
                in the result

        Returns:
            Dict with keys ``selected_nodes`` (list of id/name/value dicts)
            and ``total_responses``

        Raises:
            KeyError: If cluster_themes() has not been called yet.
        """
        selected_nodes: List[Dict[str, Any]] = []
        self._traverse_tree(self._root(), selected_nodes, significance_threshold)
        return {"selected_nodes": selected_nodes, "total_responses": total_responses}

    def _traverse_tree(
        self,
        node: ThemeNode,
        selected_nodes: List[Dict[str, Any]],
        significance_threshold: int,
    ) -> bool:
        """Recursively traverse theme tree to select significant nodes.

        Selection rules:
        1. A leaf node is always selected.
        2. A parent with no child at or above the threshold is selected
           itself and its subtree is not explored.
        3. Otherwise every known child is traversed; the parent is selected
           only if none of those traversals selected anything.

        Args:
            node: Current ThemeNode being processed
            selected_nodes: List to accumulate selected theme dictionaries
            significance_threshold: Minimum source_topic_count for significance

        Returns:
            bool: True if this node or descendants were selected, False otherwise
        """
        known_children = [
            self.themes[child_id]
            for child_id in node.children
            if child_id in self.themes
        ]

        # Rules 1 and 2: leaf, or nothing significant underneath.
        if not node.children or not any(
            child.source_topic_count >= significance_threshold
            for child in known_children
        ):
            selected_nodes.append(self._summarize(node))
            return True

        # Rule 3: visit every child (a list, not a generator, so no child is
        # skipped by short-circuiting).
        child_results = [
            self._traverse_tree(child, selected_nodes, significance_threshold)
            for child in known_children
        ]
        if not any(child_results):
            selected_nodes.append(self._summarize(node))
        return True

    def select_themes(self, significance_percentage: float) -> pd.DataFrame:
        """Select themes that meet the significance threshold.

        The threshold is ``significance_percentage`` percent of the root
        node's source_topic_count, truncated to an integer. Every node in
        the hierarchy (at any level) whose count reaches the threshold is
        returned, except the root.

        Args:
            significance_percentage: Percentage (0-100) of total responses
                required for a theme to be considered significant

        Returns:
            pd.DataFrame: DataFrame containing significant theme data,
                excluding the root node (topic_id='0')

        Raises:
            KeyError: If cluster_themes() has not been called yet.
        """
        total_responses = self._root().source_topic_count
        # Convert percentage to absolute threshold
        significance_threshold = int(total_responses * (significance_percentage / 100))

        theme_nodes_dicts = [
            node.model_dump()
            for node in self.themes.values()
            if node.topic_id != self.ROOT_ID
            and node.source_topic_count >= significance_threshold
        ]
        return pd.DataFrame(theme_nodes_dicts)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: themefinder
3
- Version: 0.6.3
3
+ Version: 0.7.0
4
4
  Summary: A topic modelling Python package designed for analysing one-to-many question-answer data eg free-text survey responses.
5
5
  License: MIT
6
6
  Author: i.AI
@@ -169,5 +169,5 @@ The documentation is [© Crown copyright](https://www.nationalarchives.gov.uk/in
169
169
 
170
170
  ## Feedback
171
171
 
172
- If you have feedback on this package, please fill in our [feedback form](https://forms.gle/85xUSMvxGzSSKQ499) or contact us with questions or feedback at packages@cabinetoffice.gov.uk.
172
+ Contact us with questions or feedback at packages@cabinetoffice.gov.uk.
173
173
 
@@ -0,0 +1,19 @@
1
+ themefinder/__init__.py,sha256=k3D3TpAvRdcXXZbHc_Lb7DsB53JwoGA0S4Ap5iX7PEw,477
2
+ themefinder/core.py,sha256=mqToJ-ggx8JyholNMUwFDcAT35dWX8Hnt3BJzdaNgS0,26219
3
+ themefinder/llm_batch_processor.py,sha256=Z9jm9Kr-6GD8g8kLkgdW97onjUbLLQ2M1YKwok39Q6Y,17652
4
+ themefinder/models.py,sha256=JopmD4F23Mteh60m6WDpsuTs58dRc0tUbVX-d-L8Gv8,14680
5
+ themefinder/prompts/agentic_theme_clustering.txt,sha256=6bHLpgZUQEaZXpLUB7EcMEbtXGqQ_1yniqZ6ZBJHFn0,1917
6
+ themefinder/prompts/consultation_system_prompt.txt,sha256=_A07oY_an4hnRx-9pQ0y-TLXJz0dd8vDI-MZne7Mdb4,89
7
+ themefinder/prompts/detail_detection.txt,sha256=6Vr_oN7rF5BCFipnCIHTSF8MmjerGyCixRWRT3vni1U,941
8
+ themefinder/prompts/sentiment_analysis.txt,sha256=vYCDhtEsG5I9xixwVhZbvKPJGU1Gqpw4-xAqGz72xhU,1671
9
+ themefinder/prompts/theme_condensation.txt,sha256=pHWuCtfU58gdtP2BfGZWOTvcb0MnTpb9OhOCGtkJv8U,1672
10
+ themefinder/prompts/theme_generation.txt,sha256=QRKW7DtcMSb2olT6j5jmdEPcXPMeZgogM-NYddEIKRk,1871
11
+ themefinder/prompts/theme_mapping.txt,sha256=HtGuStm-622TIEaqdb9LTaBs9xE-n9lvmcGQTG2_JOQ,2042
12
+ themefinder/prompts/theme_refinement.txt,sha256=evWMCIEdeZCJ8zn4SBNgP6bmfAb0vzKiR5C5wfAjkUk,2649
13
+ themefinder/prompts/theme_target_alignment.txt,sha256=g7AVZLiP_xIH010X5SIZyG3q7gA6OBAplPv3xvmstOY,855
14
+ themefinder/theme_clustering_agent.py,sha256=Ie-5MFvIo7ukeeDXNpLawJXqLqBb6kvUGgSH6uTGL20,12826
15
+ themefinder/themefinder_logging.py,sha256=n5SUQovEZLC4skEbxicjz_fOGF9mOk3S-Wpj5uXsaL8,314
16
+ themefinder-0.7.0.dist-info/LICENCE,sha256=C9ULIN0ctF60ZxUWH_hw1H434bDLg49Z-Qzn6BUHgqs,1060
17
+ themefinder-0.7.0.dist-info/METADATA,sha256=-PRjz0RTxp-yJsuavj8tw5NwtC1amsw12JyKNOitxZw,6737
18
+ themefinder-0.7.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
19
+ themefinder-0.7.0.dist-info/RECORD,,
@@ -1,17 +0,0 @@
1
- themefinder/__init__.py,sha256=yfIyHWPMM59u23m79igHSllT-w3r4l_euLCDZygo22Q,431
2
- themefinder/core.py,sha256=J4BJZO8BNN9xbX3LsKah4ZOGkW6YJcg_iYB9HCH7UR0,22768
3
- themefinder/llm_batch_processor.py,sha256=zdrQH1bvMR9FHWDaDp1tvdiADTHTaNDg_Z-3QQ0771k,17641
4
- themefinder/models.py,sha256=RN_7WzucXgKWSVXEoizijTgAM63rMVvXW6vdGD3o6Z8,12332
5
- themefinder/prompts/consultation_system_prompt.txt,sha256=_A07oY_an4hnRx-9pQ0y-TLXJz0dd8vDI-MZne7Mdb4,89
6
- themefinder/prompts/detail_detection.txt,sha256=6Vr_oN7rF5BCFipnCIHTSF8MmjerGyCixRWRT3vni1U,941
7
- themefinder/prompts/sentiment_analysis.txt,sha256=vYCDhtEsG5I9xixwVhZbvKPJGU1Gqpw4-xAqGz72xhU,1671
8
- themefinder/prompts/theme_condensation.txt,sha256=pHWuCtfU58gdtP2BfGZWOTvcb0MnTpb9OhOCGtkJv8U,1672
9
- themefinder/prompts/theme_generation.txt,sha256=QRKW7DtcMSb2olT6j5jmdEPcXPMeZgogM-NYddEIKRk,1871
10
- themefinder/prompts/theme_mapping.txt,sha256=HtGuStm-622TIEaqdb9LTaBs9xE-n9lvmcGQTG2_JOQ,2042
11
- themefinder/prompts/theme_refinement.txt,sha256=va9SPBbuR6F5th78Nx4lCREXDFltSO80JUsShR0FRgE,2556
12
- themefinder/prompts/theme_target_alignment.txt,sha256=g7AVZLiP_xIH010X5SIZyG3q7gA6OBAplPv3xvmstOY,855
13
- themefinder/themefinder_logging.py,sha256=n5SUQovEZLC4skEbxicjz_fOGF9mOk3S-Wpj5uXsaL8,314
14
- themefinder-0.6.3.dist-info/LICENCE,sha256=C9ULIN0ctF60ZxUWH_hw1H434bDLg49Z-Qzn6BUHgqs,1060
15
- themefinder-0.6.3.dist-info/METADATA,sha256=RtE3wRVnyr-DaHo4XFFsQtEIYJCByCmo5PsIMD0Tzh0,6850
16
- themefinder-0.6.3.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
17
- themefinder-0.6.3.dist-info/RECORD,,