themefinder 0.7.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- themefinder/__init__.py +24 -0
- themefinder/advanced_tasks/__init__.py +0 -0
- themefinder/advanced_tasks/cross_cutting_themes_agent.py +404 -0
- themefinder/advanced_tasks/theme_clustering_agent.py +356 -0
- themefinder/llm_batch_processor.py +442 -0
- themefinder/models.py +438 -0
- themefinder/prompts/agentic_theme_clustering.txt +34 -0
- themefinder/prompts/consultation_system_prompt.txt +1 -0
- themefinder/prompts/cross_cutting_identification.txt +16 -0
- themefinder/prompts/cross_cutting_mapping.txt +19 -0
- themefinder/prompts/cross_cutting_refinement.txt +15 -0
- themefinder/prompts/detail_detection.txt +31 -0
- themefinder/prompts/sentiment_analysis.txt +41 -0
- themefinder/prompts/theme_condensation.txt +34 -0
- themefinder/prompts/theme_generation.txt +38 -0
- themefinder/prompts/theme_mapping.txt +36 -0
- themefinder/prompts/theme_refinement.txt +54 -0
- themefinder/prompts/theme_target_alignment.txt +18 -0
- themefinder/tasks.py +656 -0
- themefinder/themefinder_logging.py +12 -0
- themefinder-0.7.4.dist-info/METADATA +174 -0
- themefinder-0.7.4.dist-info/RECORD +24 -0
- themefinder-0.7.4.dist-info/WHEEL +4 -0
- themefinder-0.7.4.dist-info/licenses/LICENCE +21 -0
themefinder/__init__.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""Public API for the themefinder package.

Re-exports the task entry points from :mod:`themefinder.tasks` so callers
can simply ``from themefinder import find_themes``.
"""

from .tasks import (
    find_themes,
    sentiment_analysis,
    theme_clustering,
    theme_condensation,
    theme_generation,
    theme_mapping,
    theme_refinement,
    detail_detection,
    cross_cutting_themes,
)

__all__ = [
    "find_themes",
    "sentiment_analysis",
    "theme_clustering",
    "theme_condensation",
    "theme_generation",
    "theme_mapping",
    "theme_refinement",
    "detail_detection",
    "cross_cutting_themes",
]

# NOTE(review): the previous hard-coded value "0.1.0" was stale — this
# distribution is published as 0.7.4 (wheel name and dist-info). Keep this
# in sync with the version declared in the packaging metadata.
__version__ = "0.7.4"
|
|
File without changes
|
|
@@ -0,0 +1,404 @@
|
|
|
1
|
+
"""Cross-cutting themes analysis agent for theme analysis.
|
|
2
|
+
|
|
3
|
+
This module provides the CrossCuttingThemesAgent class for identifying
|
|
4
|
+
high-level cross-cutting themes across multiple questions using a language model.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import logging
|
|
8
|
+
from typing import Dict, List, Any, Optional
|
|
9
|
+
|
|
10
|
+
import pandas as pd
|
|
11
|
+
from langchain_core.runnables import Runnable
|
|
12
|
+
from tenacity import (
|
|
13
|
+
before,
|
|
14
|
+
before_sleep_log,
|
|
15
|
+
retry,
|
|
16
|
+
stop_after_attempt,
|
|
17
|
+
wait_random_exponential,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
from themefinder.models import (
|
|
21
|
+
CrossCuttingThemeIdentificationResponse,
|
|
22
|
+
CrossCuttingThemeMappingResponse,
|
|
23
|
+
)
|
|
24
|
+
from themefinder.llm_batch_processor import load_prompt_from_file
|
|
25
|
+
from themefinder.themefinder_logging import logger
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class CrossCuttingThemesAgent:
    """Agent for identifying cross-cutting themes across multiple questions.

    This class manages the process of identifying high-level themes that
    span across different questions, mapping individual themes to those
    cross-cutting themes, and refining cross-cutting theme descriptions
    based on assigned themes.

    Attributes:
        llm: Language model instance for cross-cutting theme identification and refinement
        questions_themes: Dictionary mapping question numbers to theme DataFrames
        question_strings: Dictionary mapping question IDs to question text
        n_concepts: Number of high-level cross-cutting themes to identify
        concepts: List of identified cross-cutting themes with names and descriptions
        concept_assignments: Dictionary mapping cross-cutting theme names to assigned themes
        concept_descriptions: Enhanced descriptions for each cross-cutting theme
        total_themes: Total number of themes across all questions
    """

    def __init__(
        self,
        llm: Runnable,
        questions_themes: Dict[int, pd.DataFrame],
        question_strings: Optional[Dict[str, str]] = None,
        n_concepts: int = 5,
    ) -> None:
        """Initialize the cross-cutting themes agent.

        Args:
            llm: Language model instance for text generation
            questions_themes: Dictionary mapping question numbers to theme DataFrames
            question_strings: Optional dictionary mapping question IDs to question text
            n_concepts: Number of high-level cross-cutting themes to identify

        Raises:
            ValueError: If questions_themes is empty
        """
        # Fail fast on invalid input before doing any other work.
        if not questions_themes:
            raise ValueError("questions_themes cannot be empty")

        self.llm = llm
        self.questions_themes = questions_themes
        self.question_strings = question_strings or {}
        self.n_concepts = n_concepts
        self.concepts: List[Dict[str, str]] = []
        self.concept_assignments: Dict[str, List[Dict[str, Any]]] = {}
        self.concept_descriptions: Dict[str, str] = {}

        # Count total themes for statistics
        self.total_themes = sum(len(df) for df in questions_themes.values())

        logger.info(
            f"Initialized CrossCuttingThemesAgent with {len(questions_themes)} questions, "
            f"{self.total_themes} total themes"
        )

    def _format_questions_and_themes(self) -> str:
        """Format all questions and themes for cross-cutting theme identification.

        Each question becomes a single line of the form
        ``Question: <text>, theme list: [...]`` drawn from the DataFrame's
        ``topic`` column.

        Returns:
            Formatted string with all questions and their themes, one per line
        """
        formatted_lines = []

        for q_id, themes_df in self.questions_themes.items():
            # Fall back to a generic label when no question text was supplied.
            question_text = self.question_strings.get(str(q_id), f"Question {q_id}")

            # Get theme list
            theme_list = themes_df["topic"].to_list()

            # Format as single line
            formatted_lines.append(
                f"Question: {question_text}, theme list: {theme_list}"
            )

        return "\n".join(formatted_lines)

    @retry(
        wait=wait_random_exponential(min=1, max=2),
        stop=stop_after_attempt(3),
        before=before.before_log(logger=logger, log_level=logging.DEBUG),
        before_sleep=before_sleep_log(logger, logging.ERROR),
        reraise=True,
    )
    def identify_concepts(self) -> List[Dict[str, str]]:
        """Identify high-level cross-cutting themes across all questions.

        Uses a single LLM call (structured output) to identify cross-cutting
        themes that unify individual themes across multiple questions in the
        consultation. Results are cached on ``self.concepts``.

        Returns:
            List of cross-cutting theme dictionaries with 'name' and 'description' keys
        """
        logger.info(f"Identifying {self.n_concepts} high-level cross-cutting themes")

        # Format all questions and themes
        questions_and_themes = self._format_questions_and_themes()

        # Load prompt template
        prompt_template = load_prompt_from_file("cross_cutting_identification")

        # Create the prompt
        prompt = prompt_template.format(
            n_concepts=self.n_concepts, questions_and_themes=questions_and_themes
        )

        # Use structured output to get concepts
        structured_llm = self.llm.with_structured_output(
            CrossCuttingThemeIdentificationResponse
        )
        result = structured_llm.invoke(prompt)

        # Some providers return a plain dict instead of the pydantic model.
        if isinstance(result, dict):
            result = CrossCuttingThemeIdentificationResponse(**result)

        # Convert to our expected format
        concepts = [
            {"name": theme.name, "description": theme.description}
            for theme in result.themes
        ]

        self.concepts = concepts
        logger.info(f"Identified {len(concepts)} cross-cutting themes")
        return concepts

    @retry(
        wait=wait_random_exponential(min=1, max=2),
        stop=stop_after_attempt(3),
        before=before.before_log(logger=logger, log_level=logging.DEBUG),
        before_sleep=before_sleep_log(logger, logging.ERROR),
        reraise=True,
    )
    def map_themes_to_concepts(self) -> Dict[str, List[Dict[str, Any]]]:
        """Map themes to identified cross-cutting themes using semantic similarity.

        This uses one LLM call per question to classify which themes belong
        to which cross-cutting themes. Mappings whose theme name is not a
        known concept are silently dropped. Results are cached on
        ``self.concept_assignments``.

        Returns:
            Dictionary mapping cross-cutting theme names to lists of assigned
            themes, each as ``{"question_id", "theme_id", "theme_text"}``

        Raises:
            ValueError: If cross-cutting themes have not been identified yet
        """
        if not self.concepts:
            raise ValueError("Must call identify_concepts() first")

        logger.info("Mapping themes to cross-cutting themes for all questions")

        all_assignments: Dict[str, List[Dict[str, Any]]] = {
            concept["name"]: [] for concept in self.concepts
        }

        # The concept list and the prompt template are identical for every
        # question, so format/load them once outside the loop.
        concepts_text = "\n".join(
            f"{concept['name']}: {concept['description']}"
            for concept in self.concepts
        )
        prompt_template = load_prompt_from_file("cross_cutting_mapping")

        # Process each question
        for q_id, themes_df in self.questions_themes.items():
            logger.info(f"Processing question {q_id}")

            # Get question text
            question_text = self.question_strings.get(str(q_id), f"Question {q_id}")

            # Create theme dictionary (topic_id -> topic text)
            theme_dict = dict(zip(themes_df["topic_id"], themes_df["topic"]))

            # Format question input
            question_input = (
                f"Question {q_id}: {question_text}, theme dictionary: {theme_dict}"
            )

            # Create mapping prompt
            prompt = prompt_template.format(
                question_input=question_input, concepts_text=concepts_text
            )

            # Use structured output to get mappings
            structured_llm = self.llm.with_structured_output(
                CrossCuttingThemeMappingResponse
            )
            result = structured_llm.invoke(prompt)

            # Some providers return a plain dict instead of the pydantic model.
            if isinstance(result, dict):
                result = CrossCuttingThemeMappingResponse(**result)

            # Record each mapped theme under its concept; ignore mappings to
            # concept names the LLM invented (not in all_assignments).
            for mapping in result.mappings:
                if mapping.theme_name not in all_assignments:
                    continue
                for theme_id in mapping.theme_ids:
                    all_assignments[mapping.theme_name].append(
                        {
                            "question_id": q_id,
                            "theme_id": theme_id,
                            # Empty string when the LLM returns an unknown id.
                            "theme_text": theme_dict.get(theme_id, ""),
                        }
                    )

        self.concept_assignments = all_assignments
        logger.info(
            f"Completed theme mapping for {len(self.questions_themes)} questions"
        )

        return all_assignments

    @retry(
        wait=wait_random_exponential(min=1, max=2),
        stop=stop_after_attempt(3),
        before=before.before_log(logger=logger, log_level=logging.DEBUG),
        before_sleep=before_sleep_log(logger, logging.ERROR),
        reraise=True,
    )
    def refine_concept_descriptions(self) -> Dict[str, str]:
        """Refine cross-cutting theme descriptions based on their assigned themes.

        Creates enhanced descriptions that capture insights and details from
        the themes actually assigned to each cross-cutting theme. Concepts
        with no assigned themes keep their original description. Results are
        cached on ``self.concept_descriptions``.

        Returns:
            Dictionary mapping cross-cutting theme names to refined descriptions

        Raises:
            ValueError: If theme mapping has not been performed yet
        """
        if not self.concept_assignments:
            raise ValueError("Must call map_themes_to_concepts() first")

        logger.info(
            "Refining cross-cutting theme descriptions based on assigned themes"
        )

        refined_descriptions: Dict[str, str] = {}

        # Same template for every concept — load it once, not per iteration.
        prompt_template = load_prompt_from_file("cross_cutting_refinement")

        for concept_name, assignments in self.concept_assignments.items():
            if not assignments:
                # Keep original description if no themes assigned.
                original = next(
                    (
                        c["description"]
                        for c in self.concepts
                        if c["name"] == concept_name
                    ),
                    "",
                )
                refined_descriptions[concept_name] = original
                continue

            # Format assigned themes for the prompt
            theme_lines = [
                f"Question {assignment['question_id']}, "
                f"Theme {assignment['theme_id']}: {assignment['theme_text']}"
                for assignment in assignments
            ]

            # Create refinement prompt
            prompt = prompt_template.format(
                concept_name=concept_name, theme_lines="\n".join(theme_lines)
            )

            # Get refined description (plain text, no structured output).
            response = self.llm.invoke(prompt)
            content = (
                response.content if hasattr(response, "content") else str(response)
            )

            refined_descriptions[concept_name] = content.strip()
            logger.info(f"Refined description for '{concept_name}'")

        self.concept_descriptions = refined_descriptions
        return refined_descriptions

    def analyze(self) -> Dict[str, Any]:
        """Run the cross-cutting theme identification and mapping process.

        This orchestrates the analysis workflow:
        1. Identify high-level cross-cutting themes across all questions
        2. Map individual themes to the identified cross-cutting themes

        Note: description refinement is NOT run here; call
        :meth:`refine_concept_descriptions` separately if needed.

        Returns:
            Dictionary with analysis results including cross-cutting themes and assignments
        """
        concepts = self.identify_concepts()
        assignments = self.map_themes_to_concepts()

        return {"concepts": concepts, "assignments": assignments}

    def get_results_as_dataframe(self) -> pd.DataFrame:
        """Convert results to DataFrame format for compatibility.

        Returns:
            DataFrame with concepts and their assigned themes, compatible
            with other themefinder output formats. Empty DataFrame when the
            analysis has not been run yet.
        """
        if not self.concepts or not self.concept_assignments:
            return pd.DataFrame()

        df_data = []

        for concept in self.concepts:
            concept_name = concept["name"]

            # Group assigned theme ids by question id.
            themes_by_question: Dict[Any, List[Any]] = {}
            for assignment in self.concept_assignments.get(concept_name, []):
                themes_by_question.setdefault(assignment["question_id"], []).append(
                    assignment["theme_id"]
                )

            # Use refined description if available, otherwise original
            description = self.concept_descriptions.get(
                concept_name, concept["description"]
            )

            df_data.append(
                {
                    "name": concept_name,
                    "description": description,
                    "themes": themes_by_question,
                    "n_themes": sum(
                        len(themes) for themes in themes_by_question.values()
                    ),
                    "n_questions": len(themes_by_question),
                }
            )

        return pd.DataFrame(df_data)

    def get_statistics(self) -> Dict[str, Any]:
        """Get statistics about the concept identification analysis.

        Returns:
            Dictionary with analysis statistics including theme utilization,
            concept coverage, and processing metrics
        """
        # Count distinct (question_id, theme_id) pairs assigned to any concept.
        assigned_themes = set()
        for assignments in self.concept_assignments.values():
            for assignment in assignments:
                assigned_themes.add((assignment["question_id"], assignment["theme_id"]))

        used_count = len(assigned_themes)

        return {
            "total_themes": self.total_themes,
            "used_themes": used_count,
            "unused_themes": self.total_themes - used_count,
            "utilization_rate": used_count / self.total_themes
            if self.total_themes > 0
            else 0,
            "n_concepts": len(self.concepts),
            "n_questions": len(self.questions_themes),
            "concepts_with_themes": sum(
                1 for assignments in self.concept_assignments.values() if assignments
            ),
        }
|