themefinder 0.7.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- themefinder/__init__.py +24 -0
- themefinder/advanced_tasks/__init__.py +0 -0
- themefinder/advanced_tasks/cross_cutting_themes_agent.py +404 -0
- themefinder/advanced_tasks/theme_clustering_agent.py +356 -0
- themefinder/llm_batch_processor.py +442 -0
- themefinder/models.py +438 -0
- themefinder/prompts/agentic_theme_clustering.txt +34 -0
- themefinder/prompts/consultation_system_prompt.txt +1 -0
- themefinder/prompts/cross_cutting_identification.txt +16 -0
- themefinder/prompts/cross_cutting_mapping.txt +19 -0
- themefinder/prompts/cross_cutting_refinement.txt +15 -0
- themefinder/prompts/detail_detection.txt +31 -0
- themefinder/prompts/sentiment_analysis.txt +41 -0
- themefinder/prompts/theme_condensation.txt +34 -0
- themefinder/prompts/theme_generation.txt +38 -0
- themefinder/prompts/theme_mapping.txt +36 -0
- themefinder/prompts/theme_refinement.txt +54 -0
- themefinder/prompts/theme_target_alignment.txt +18 -0
- themefinder/tasks.py +656 -0
- themefinder/themefinder_logging.py +12 -0
- themefinder-0.7.4.dist-info/METADATA +174 -0
- themefinder-0.7.4.dist-info/RECORD +24 -0
- themefinder-0.7.4.dist-info/WHEEL +4 -0
- themefinder-0.7.4.dist-info/licenses/LICENCE +21 -0
|
@@ -0,0 +1,356 @@
|
|
|
1
|
+
"""Theme clustering agent for hierarchical topic organization.
|
|
2
|
+
|
|
3
|
+
This module provides the ThemeClusteringAgent class for performing iterative
|
|
4
|
+
hierarchical clustering of topics using a language model.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
import logging
|
|
9
|
+
from typing import Dict, List, Any
|
|
10
|
+
|
|
11
|
+
import pandas as pd
|
|
12
|
+
from langchain_core.runnables import Runnable
|
|
13
|
+
from tenacity import (
|
|
14
|
+
before,
|
|
15
|
+
before_sleep_log,
|
|
16
|
+
retry,
|
|
17
|
+
stop_after_attempt,
|
|
18
|
+
wait_random_exponential,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
from themefinder.models import ThemeNode
|
|
22
|
+
from themefinder.llm_batch_processor import load_prompt_from_file
|
|
23
|
+
from themefinder.themefinder_logging import logger
|
|
24
|
+
|
|
25
|
+
# Default system prompt for ThemeClusteringAgent. Resolved once, at import
# time, via load_prompt_from_file("consultation_system_prompt").
# NOTE(review): presumably this reads the packaged
# themefinder/prompts/consultation_system_prompt.txt file - confirm against
# load_prompt_from_file in llm_batch_processor.
CONSULTATION_SYSTEM_PROMPT = load_prompt_from_file("consultation_system_prompt")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class ThemeClusteringAgent:
    """Agent for performing hierarchical clustering of topics using language models.

    This class manages the iterative process of merging similar topics into a
    hierarchical structure using an LLM to identify semantic relationships and
    create meaningful parent-child topic relationships.

    Attributes:
        llm: Language model configured with structured output for clustering
        themes: Dictionary mapping topic IDs to ThemeNode objects
        active_themes: Set of topic IDs that are currently active for clustering
        current_iteration: Current iteration number in the clustering process
        system_prompt: System prompt interpolated into the clustering prompt
        target_themes: Target theme count interpolated into the clustering prompt
    """

    def __init__(
        self,
        llm: Runnable,
        themes: List[ThemeNode],
        system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
        target_themes: int = 10,
    ) -> None:
        """Initialize the clustering agent with an LLM and initial themes.

        Args:
            llm: Language model instance configured with structured output
                for HierarchicalClusteringResponse
            themes: List of ThemeNode objects to be clustered
            system_prompt: System prompt to guide the LLM's behavior
            target_themes: Target number of themes communicated to the LLM in
                the prompt (default 10)
        """
        self.llm = llm
        # If two input themes share a topic_id, the later one wins.
        self.themes: Dict[str, ThemeNode] = {
            theme.topic_id: theme for theme in themes
        }
        self.active_themes = set(self.themes)
        self.current_iteration = 0
        self.system_prompt = system_prompt
        self.target_themes = target_themes

    @staticmethod
    def _to_alpha(idx: int) -> str:
        """Convert a 0-based index to an Excel-style column name (A..Z, AA, AB, ...)."""
        idx += 1  # switch to 1-based, as in spreadsheet column numbering
        letters: List[str] = []
        while idx > 0:
            letters.append(chr(65 + (idx - 1) % 26))
            idx = (idx - 1) // 26
        return "".join(reversed(letters))

    @staticmethod
    def _node_summary(node: ThemeNode) -> Dict[str, Any]:
        """Return the id/name/value summary dict used for selected nodes."""
        return {
            "id": node.topic_id,
            "name": node.topic_label,
            "value": node.source_topic_count,
        }

    def _format_prompt(self) -> str:
        """Format the clustering prompt with current active themes.

        Serializes the ID, label and description of every active theme to
        JSON and substitutes it, together with the iteration number, system
        prompt and target theme count, into the "agentic_theme_clustering"
        prompt template.

        Returns:
            str: Formatted prompt string ready for LLM processing
        """
        themes_for_prompt = [
            {
                "topic_id": self.themes[active_id].topic_id,
                "topic_label": self.themes[active_id].topic_label,
                "topic_description": self.themes[active_id].topic_description,
            }
            for active_id in self.active_themes
        ]
        themes_json = json.dumps(themes_for_prompt, indent=2)

        # Load the clustering prompt template
        prompt_template = load_prompt_from_file("agentic_theme_clustering")
        return prompt_template.format(
            themes_json=themes_json,
            iteration=self.current_iteration,
            system_prompt=self.system_prompt,
            target_themes=self.target_themes,
        )

    @retry(
        wait=wait_random_exponential(min=1, max=2),
        stop=stop_after_attempt(3),
        before=before.before_log(logger=logger, log_level=logging.DEBUG),
        before_sleep=before_sleep_log(logger, logging.ERROR),
        reraise=True,
    )
    def cluster_iteration(self) -> None:
        """Perform one iteration of hierarchical theme clustering.

        Sends the current active themes to the LLM and merges the groups it
        proposes into new parent themes. New parents get the ID
        ``<letters>_<iteration>`` where ``<letters>`` is the Excel-style
        column name of the parent's position in the response.

        The call is wrapped in a tenacity retry: up to 3 attempts in total
        with randomized exponential backoff, re-raising the last error. To
        keep retries safe, all merges are staged first and only applied once
        the whole response has been processed, so a failure part-way through
        leaves the agent's state untouched.

        Children proposed by the LLM are ignored when they are not currently
        active, are listed more than once, or were already claimed by an
        earlier parent in the same response. A parent left with no valid
        children is skipped rather than added as an empty theme.

        Side Effects:
            - Creates new parent ThemeNode objects in self.themes
            - Updates parent_id relationships for child themes
            - Modifies self.active_themes set
            - Increments self.current_iteration
        """
        prompt = self._format_prompt()
        response = self.llm.invoke(prompt)

        # Stage 1: validate the response and build the new nodes without
        # touching agent state.
        claimed = set()
        staged = []
        for i, parent in enumerate(response.parent_themes):
            # dict.fromkeys drops duplicate IDs while preserving order.
            children = [
                child_id
                for child_id in dict.fromkeys(parent.children)
                if child_id in self.active_themes and child_id not in claimed
            ]
            if not children:
                logger.warning(
                    "Skipping proposed parent theme %r: no valid active children",
                    parent.topic_label,
                )
                continue
            claimed.update(children)
            new_theme = ThemeNode(
                topic_id=f"{self._to_alpha(i)}_{self.current_iteration}",
                topic_label=parent.topic_label,
                topic_description=parent.topic_description,
                source_topic_count=sum(
                    self.themes[child_id].source_topic_count
                    for child_id in children
                ),
                children=children,
            )
            staged.append((new_theme, children))

        # Stage 2: commit. Nothing below is expected to fail.
        for new_theme, children in staged:
            for child_id in children:
                self.themes[child_id].parent_id = new_theme.topic_id
                self.active_themes.discard(child_id)
            self.themes[new_theme.topic_id] = new_theme
            self.active_themes.add(new_theme.topic_id)
        self.current_iteration += 1

    def cluster_themes(
        self, max_iterations: int = 5, target_themes: int = 5
    ) -> pd.DataFrame:
        """Perform hierarchical clustering to reduce themes to target number.

        Iteratively merges similar themes using the clustering agent until
        either the maximum iterations is reached or the target number of
        themes is achieved. Creates a root node (ID '0') to represent the
        complete hierarchy and makes it the parent of all remaining active
        themes.

        Note:
            The ``target_themes`` argument here only controls when the loop
            stops. The value shown to the LLM in the prompt is the
            ``target_themes`` passed to the constructor; the two are
            independent.

        Args:
            max_iterations: Maximum number of clustering iterations to perform
            target_themes: Stop once the number of active themes is at or
                below this value

        Returns:
            pd.DataFrame: DataFrame containing all theme nodes (excluding root)
                with their hierarchical relationships and metadata
        """
        logger.info(f"Starting clustering with {len(self.active_themes)} active themes")
        # current_iteration counts completed iterations, so a strict "<"
        # performs at most max_iterations iterations.
        while (
            self.current_iteration < max_iterations
            and len(self.active_themes) > target_themes
        ):
            self.cluster_iteration()
            logger.info(
                f"After {self.current_iteration} iterations {len(self.active_themes)} active themes remaining"
            )

        root_node = ThemeNode(
            topic_id="0",
            topic_label="All Topics",
            topic_description="",
            source_topic_count=sum(
                self.themes[theme_id].source_topic_count
                for theme_id in self.active_themes
            ),
            children=list(self.active_themes),
        )
        self.themes["0"] = root_node
        for theme_id in self.active_themes:
            self.themes[theme_id].parent_id = "0"

        # Convert all themes (except root) to DataFrame
        theme_nodes_dicts = [
            node.model_dump() for node in self.themes.values() if node.topic_id != "0"
        ]
        return pd.DataFrame(theme_nodes_dicts)

    def convert_themes_to_tree_json(self) -> str:
        """Convert themes into a hierarchical JSON structure for visualization.

        Creates a nested JSON structure starting from the root node (ID '0')
        that represents the complete theme hierarchy. Each node carries its
        id, name, description, value (source_topic_count) and children.
        Child IDs that are not present in self.themes are omitted.

        Returns:
            str: JSON string representing the hierarchical tree structure
                suitable for JavaScript tree visualization libraries

        Raises:
            KeyError: If no theme with ID '0' exists; the root is created by
                cluster_themes().
        """

        def build_tree(node: ThemeNode) -> Dict[str, Any]:
            return {
                "id": node.topic_id,
                "name": node.topic_label,
                "description": node.topic_description,
                "value": node.source_topic_count,
                "children": [
                    build_tree(self.themes[child_id])
                    for child_id in node.children
                    if child_id in self.themes
                ],
            }

        return json.dumps(build_tree(self.themes["0"]), indent=2)

    def select_significant_themes(
        self, significance_threshold: int, total_responses: int
    ) -> Dict[str, Any]:
        """Select significant themes using depth-first traversal.

        Walks the hierarchy from the root node (ID '0'). A node's children
        are expanded only when at least one of them meets the significance
        threshold; otherwise the node itself is selected. Leaf nodes that
        are reached are always selected.

        Args:
            significance_threshold: Minimum source_topic_count for significance
            total_responses: Total number of responses across all themes;
                passed through unchanged into the result

        Returns:
            Dict with keys "selected_nodes" (list of id/name/value dicts) and
            "total_responses"

        Raises:
            KeyError: If no theme with ID '0' exists; the root is created by
                cluster_themes().
        """
        selected_nodes: List[Dict[str, Any]] = []
        self._traverse_tree(self.themes["0"], selected_nodes, significance_threshold)
        return {"selected_nodes": selected_nodes, "total_responses": total_responses}

    def _traverse_tree(
        self,
        node: ThemeNode,
        selected_nodes: List[Dict[str, Any]],
        significance_threshold: int,
    ) -> bool:
        """Recursively traverse theme tree to select significant nodes.

        Selection rules:
        1. A leaf node is always selected.
        2. A parent with no significant child is selected itself and its
           subtree is not explored further.
        3. A parent with at least one significant child is not selected;
           instead every child present in self.themes is traversed (not only
           the significant ones).

        Args:
            node: Current ThemeNode being processed
            selected_nodes: List to accumulate selected theme dictionaries
            significance_threshold: Minimum source_topic_count for significance

        Returns:
            bool: True if this node or descendants were selected. Every path
                through the traversal selects at least one node, so this is
                always True; the return value is kept for interface
                compatibility.
        """
        # Only inspect children when there are any; a leaf has none to check.
        has_significant_children = bool(node.children) and any(
            self.themes[child_id].source_topic_count >= significance_threshold
            for child_id in node.children
            if child_id in self.themes
        )

        if not has_significant_children:
            selected_nodes.append(self._node_summary(node))
            return True

        # At least one known child is significant, so at least one recursive
        # call runs, and each call selects something.
        for child_id in node.children:
            if child_id in self.themes:
                self._traverse_tree(
                    self.themes[child_id], selected_nodes, significance_threshold
                )
        return True

    def select_themes(self, significance_percentage: float) -> pd.DataFrame:
        """Select themes that meet the significance threshold.

        Converts the percentage into an absolute threshold based on the root
        node's source_topic_count (truncated to an int) and returns every
        non-root theme whose source_topic_count meets or exceeds it.

        Args:
            significance_percentage: Percentage (0-100) of total responses
                required for a theme to be considered significant

        Returns:
            pd.DataFrame: DataFrame containing significant theme data,
                excluding the root node (topic_id='0')

        Raises:
            KeyError: If no theme with ID '0' exists; the root is created by
                cluster_themes().
        """
        total_responses = self.themes["0"].source_topic_count
        # Convert percentage to absolute threshold
        significance_threshold = int(total_responses * (significance_percentage / 100))

        theme_nodes_dicts = [
            theme_node.model_dump()
            for theme_node in self.themes.values()
            if theme_node.source_topic_count >= significance_threshold
            and theme_node.topic_id != "0"
        ]
        return pd.DataFrame(theme_nodes_dicts)
|