themefinder 0.7.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,356 @@
1
+ """Theme clustering agent for hierarchical topic organization.
2
+
3
+ This module provides the ThemeClusteringAgent class for performing iterative
4
+ hierarchical clustering of topics using a language model.
5
+ """
6
+
7
+ import json
8
+ import logging
9
+ from typing import Dict, List, Any
10
+
11
+ import pandas as pd
12
+ from langchain_core.runnables import Runnable
13
+ from tenacity import (
14
+ before,
15
+ before_sleep_log,
16
+ retry,
17
+ stop_after_attempt,
18
+ wait_random_exponential,
19
+ )
20
+
21
+ from themefinder.models import ThemeNode
22
+ from themefinder.llm_batch_processor import load_prompt_from_file
23
+ from themefinder.themefinder_logging import logger
24
+
25
# Default system prompt for ThemeClusteringAgent (used when the caller passes none); loaded once at import time.
+ CONSULTATION_SYSTEM_PROMPT = load_prompt_from_file("consultation_system_prompt")
26
+
27
+
28
class ThemeClusteringAgent:
    """Agent for performing hierarchical clustering of topics using language models.

    The agent repeatedly asks an LLM to group the currently active themes
    under new parent themes, recording the resulting parent/child links so
    that the full hierarchy can later be exported or filtered.

    Attributes:
        llm: Language model configured with structured output for clustering
        themes: Dictionary mapping topic IDs to ThemeNode objects
        active_themes: Set of topic IDs that are currently active for clustering
        current_iteration: Number of clustering iterations completed so far
        system_prompt: System prompt interpolated into the clustering prompt
        target_themes: Target theme count interpolated into the clustering prompt
    """

    # Topic ID reserved for the synthetic root node created by cluster_themes().
    _ROOT_ID = "0"

    def __init__(
        self,
        llm: Runnable,
        themes: List[ThemeNode],
        system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
        target_themes: int = 10,
    ) -> None:
        """Initialize the clustering agent with an LLM and initial themes.

        Args:
            llm: Language model instance configured with structured output
                for HierarchicalClusteringResponse
            themes: List of ThemeNode objects to be clustered. If two nodes
                share a topic_id, the later one replaces the earlier one.
            system_prompt: System prompt to guide the LLM's behavior
            target_themes: Target number of themes communicated to the LLM
                through the prompt (default 10)
        """
        self.llm = llm
        self.themes: Dict[str, ThemeNode] = {theme.topic_id: theme for theme in themes}
        self.active_themes = set(self.themes)
        self.current_iteration = 0
        self.system_prompt = system_prompt
        self.target_themes = target_themes

    def _format_prompt(self) -> str:
        """Format the clustering prompt with current active themes.

        Serializes the ID, label and description of every active theme to
        JSON and substitutes it, together with the iteration counter, the
        system prompt and the target theme count, into the
        "agentic_theme_clustering" prompt template.

        Returns:
            str: Formatted prompt string ready for LLM processing
        """
        themes_for_prompt = []
        for active_id in self.active_themes:
            theme = self.themes[active_id]
            themes_for_prompt.append(
                {
                    "topic_id": theme.topic_id,
                    "topic_label": theme.topic_label,
                    "topic_description": theme.topic_description,
                }
            )
        themes_json = json.dumps(themes_for_prompt, indent=2)

        # Load the clustering prompt template
        prompt_template = load_prompt_from_file("agentic_theme_clustering")
        return prompt_template.format(
            themes_json=themes_json,
            iteration=self.current_iteration,
            system_prompt=self.system_prompt,
            target_themes=self.target_themes,
        )

    @staticmethod
    def _to_alpha(idx: int) -> str:
        """Convert a 0-based index to an Excel-style column name (A, ..., Z, AA, AB, ...)."""
        idx += 1  # switch to 1-based so that 26 maps to "Z" rather than "BA"
        letters = []
        while idx > 0:
            rem = (idx - 1) % 26
            letters.append(chr(65 + rem))
            idx = (idx - 1) // 26
        return "".join(reversed(letters))

    @retry(
        wait=wait_random_exponential(min=1, max=2),
        stop=stop_after_attempt(3),
        before=before.before_log(logger=logger, log_level=logging.DEBUG),
        before_sleep=before_sleep_log(logger, logging.ERROR),
        reraise=True,
    )
    def cluster_iteration(self) -> None:
        """Perform one iteration of hierarchical theme clustering.

        Sends the active themes to the LLM and merges the groups it returns
        into new parent themes named "<letters>_<iteration>".

        The whole method is wrapped in a retry (up to 3 attempts, random
        exponential wait between 1 and 2 seconds, last error re-raised).
        Because a retry re-runs this method from the top, all new parent
        nodes are built *before* any agent state is modified, so a failure
        while interpreting the response cannot leave a half-applied merge
        behind for the next attempt.

        Child IDs returned by the LLM are ignored when they are not active,
        when they were already claimed by an earlier parent in the same
        response, or when they are repeated within one parent. Parents left
        with no valid children are skipped instead of being added as empty
        themes.

        Side Effects:
            - Creates new parent ThemeNode objects in self.themes
            - Updates parent_id relationships for child themes
            - Modifies self.active_themes set
            - Increments self.current_iteration
        """
        prompt = self._format_prompt()
        response = self.llm.invoke(prompt)

        # Phase 1: interpret the response without touching agent state.
        unclaimed = set(self.active_themes)
        planned = []  # (new_theme_id, children, node) triples
        for i, parent in enumerate(response.parent_themes):
            children = []
            for child_id in parent.children:
                if child_id in unclaimed:
                    # Claiming the ID here also drops duplicates within one parent.
                    unclaimed.remove(child_id)
                    children.append(child_id)
            if not children:
                # Nothing valid to merge; an empty parent would only add noise.
                continue

            new_theme_id = f"{self._to_alpha(i)}_{self.current_iteration}"
            total_source_count = sum(
                self.themes[child_id].source_topic_count for child_id in children
            )
            new_theme = ThemeNode(
                topic_id=new_theme_id,
                topic_label=parent.topic_label,
                topic_description=parent.topic_description,
                source_topic_count=total_source_count,
                children=children,
            )
            planned.append((new_theme_id, children, new_theme))

        # Phase 2: apply the merges.
        for new_theme_id, children, new_theme in planned:
            for child_id in children:
                self.themes[child_id].parent_id = new_theme_id
            self.themes[new_theme_id] = new_theme
            self.active_themes.add(new_theme_id)
            self.active_themes.difference_update(children)
        self.current_iteration += 1

    def cluster_themes(
        self, max_iterations: int = 5, target_themes: int = 5
    ) -> pd.DataFrame:
        """Perform hierarchical clustering to reduce themes to target number.

        Runs cluster_iteration() until either `max_iterations` iterations
        have been completed (counted by self.current_iteration, which is
        not reset between calls) or the number of active themes is no
        longer greater than `target_themes`. Afterwards a root node with
        topic ID '0' is stored in self.themes and every remaining active
        theme is attached to it.

        Note that `target_themes` here only controls when the loop stops;
        the value placed in the LLM prompt is the `target_themes` given to
        the constructor.

        Args:
            max_iterations: Maximum number of clustering iterations to perform
            target_themes: Target number of themes to cluster down to

        Returns:
            pd.DataFrame: DataFrame containing all theme nodes (excluding root)
                with their hierarchical relationships and metadata
        """
        logger.info(f"Starting clustering with {len(self.active_themes)} active themes")
        # current_iteration starts at 0, so a strict comparison yields exactly
        # max_iterations rounds.
        while (
            self.current_iteration < max_iterations
            and len(self.active_themes) > target_themes
        ):
            self.cluster_iteration()
            logger.info(
                f"After {self.current_iteration} iterations {len(self.active_themes)} active themes remaining"
            )

        root_node = ThemeNode(
            topic_id=self._ROOT_ID,
            topic_label="All Topics",
            topic_description="",
            source_topic_count=sum(
                self.themes[theme_id].source_topic_count
                for theme_id in self.active_themes
            ),
            children=list(self.active_themes),
        )
        self.themes[self._ROOT_ID] = root_node
        for theme_id in self.active_themes:
            self.themes[theme_id].parent_id = self._ROOT_ID

        # Convert all themes (except root) to DataFrame
        theme_nodes_dicts = [
            node.model_dump()
            for node in self.themes.values()
            if node.topic_id != self._ROOT_ID
        ]
        return pd.DataFrame(theme_nodes_dicts)

    def convert_themes_to_tree_json(self) -> str:
        """Convert themes into a hierarchical JSON structure for visualization.

        Builds a nested structure starting from the root node (ID '0'),
        where each node carries "id", "name", "description", "value" and a
        "children" list. Child IDs missing from self.themes are skipped.

        Returns:
            str: JSON string (indent=2) representing the hierarchical tree

        Raises:
            KeyError: If the root node does not exist yet, i.e.
                cluster_themes() has not been called.
        """

        def build_tree(node: ThemeNode) -> Dict[str, Any]:
            return {
                "id": node.topic_id,
                "name": node.topic_label,
                "description": node.topic_description,
                "value": node.source_topic_count,
                "children": [
                    build_tree(self.themes[child_id])
                    for child_id in node.children
                    if child_id in self.themes
                ],
            }

        tree_data = build_tree(self.themes[self._ROOT_ID])
        return json.dumps(tree_data, indent=2)

    def select_significant_themes(
        self, significance_threshold: int, total_responses: int
    ) -> Dict[str, Any]:
        """Select significant themes using depth-first traversal.

        Walks the hierarchy from the root node (ID '0') with
        _traverse_tree() and collects the selected nodes.

        Args:
            significance_threshold: Minimum source_topic_count for significance
            total_responses: Total number of responses across all themes;
                passed through unchanged into the result

        Returns:
            Dict with keys "selected_nodes" (list of {"id", "name", "value"}
            dictionaries) and "total_responses"

        Raises:
            KeyError: If the root node does not exist yet, i.e.
                cluster_themes() has not been called.
        """
        selected_nodes: List[Dict[str, Any]] = []
        self._traverse_tree(
            self.themes[self._ROOT_ID], selected_nodes, significance_threshold
        )
        return {"selected_nodes": selected_nodes, "total_responses": total_responses}

    @staticmethod
    def _node_summary(node: ThemeNode) -> Dict[str, Any]:
        """Return the compact {"id", "name", "value"} record used for selected nodes."""
        return {
            "id": node.topic_id,
            "name": node.topic_label,
            "value": node.source_topic_count,
        }

    def _traverse_tree(
        self,
        node: ThemeNode,
        selected_nodes: List[Dict[str, Any]],
        significance_threshold: int,
    ) -> bool:
        """Recursively traverse theme tree to select significant nodes.

        Selection rules:
        1. A node with no children, or whose children (those present in
           self.themes) all fall below the threshold, is selected itself.
        2. Otherwise every child present in self.themes is traversed in
           order, including children that are below the threshold.

        Args:
            node: Current ThemeNode being processed
            selected_nodes: List to accumulate selected theme dictionaries
            significance_threshold: Minimum source_topic_count for significance

        Returns:
            bool: True if this node or descendants were selected. Every path
                through the rules above selects at least one node, so the
                result is always True; the return value is kept for
                backward compatibility.
        """
        known_children = [
            self.themes[child_id]
            for child_id in node.children
            if child_id in self.themes
        ]
        has_significant_children = any(
            child.source_topic_count >= significance_threshold
            for child in known_children
        )

        if not has_significant_children:
            # Covers leaves as well: an empty child list has no significant child.
            selected_nodes.append(self._node_summary(node))
            return True

        for child in known_children:
            self._traverse_tree(child, selected_nodes, significance_threshold)
        return True

    def select_themes(self, significance_percentage: float) -> pd.DataFrame:
        """Select themes that meet the significance threshold.

        The threshold is `significance_percentage` percent of the root
        node's source_topic_count, rounded down to an integer. All nodes in
        self.themes (at any level of the hierarchy) whose source_topic_count
        meets or exceeds it are returned.

        Args:
            significance_percentage: Percentage (0-100) of total responses
                required for a theme to be considered significant

        Returns:
            pd.DataFrame: DataFrame containing significant theme data,
                excluding the root node (topic_id='0')

        Raises:
            KeyError: If the root node does not exist yet, i.e.
                cluster_themes() has not been called.
        """
        total_responses = self.themes[self._ROOT_ID].source_topic_count
        # Multiply before dividing: `100 * (29 / 100)` evaluates to
        # 28.999999999999996 and would truncate to 28, whereas
        # `100 * 29 / 100` is exactly 29.0.
        significance_threshold = int(total_responses * significance_percentage / 100)

        theme_nodes_dicts = [
            node.model_dump()
            for node in self.themes.values()
            if node.source_topic_count >= significance_threshold
            and node.topic_id != self._ROOT_ID
        ]
        return pd.DataFrame(theme_nodes_dicts)