themefinder 0.7.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,24 @@
1
+ from .tasks import (
2
+ find_themes,
3
+ sentiment_analysis,
4
+ theme_clustering,
5
+ theme_condensation,
6
+ theme_generation,
7
+ theme_mapping,
8
+ theme_refinement,
9
+ detail_detection,
10
+ cross_cutting_themes,
11
+ )
12
+
13
# Public API of the themefinder package: names re-exported from .tasks.
__all__ = [
    "find_themes",
    "sentiment_analysis",
    "theme_clustering",
    "theme_condensation",
    "theme_generation",
    "theme_mapping",
    "theme_refinement",
    "detail_detection",
    "cross_cutting_themes",
]
# NOTE(review): this version string ("0.1.0") disagrees with the distribution
# metadata (the wheel is published as 0.7.4) — confirm which is authoritative.
__version__ = "0.1.0"
File without changes
@@ -0,0 +1,404 @@
1
+ """Cross-cutting themes analysis agent for theme analysis.
2
+
3
+ This module provides the CrossCuttingThemesAgent class for identifying
4
+ high-level cross-cutting themes across multiple questions using a language model.
5
+ """
6
+
7
+ import logging
8
+ from typing import Dict, List, Any, Optional
9
+
10
+ import pandas as pd
11
+ from langchain_core.runnables import Runnable
12
+ from tenacity import (
13
+ before,
14
+ before_sleep_log,
15
+ retry,
16
+ stop_after_attempt,
17
+ wait_random_exponential,
18
+ )
19
+
20
+ from themefinder.models import (
21
+ CrossCuttingThemeIdentificationResponse,
22
+ CrossCuttingThemeMappingResponse,
23
+ )
24
+ from themefinder.llm_batch_processor import load_prompt_from_file
25
+ from themefinder.themefinder_logging import logger
26
+
27
+
28
+ class CrossCuttingThemesAgent:
29
+ """Agent for identifying cross-cutting themes across multiple questions.
30
+
31
+ This class manages the process of identifying high-level themes that
32
+ span across different questions, mapping individual themes to those cross-cutting themes, and
33
+ refining cross-cutting theme descriptions based on assigned themes.
34
+
35
+ Attributes:
36
+ llm: Language model instance for cross-cutting theme identification and refinement
37
+ questions_themes: Dictionary mapping question numbers to theme DataFrames
38
+ question_strings: Dictionary mapping question IDs to question text
39
+ n_concepts: Number of high-level cross-cutting themes to identify
40
+ concepts: List of identified cross-cutting themes with names and descriptions
41
+ concept_assignments: Dictionary mapping cross-cutting theme names to assigned themes
42
+ concept_descriptions: Enhanced descriptions for each cross-cutting theme
43
+ total_themes: Total number of themes across all questions
44
+ """
45
+
46
+ def __init__(
47
+ self,
48
+ llm: Runnable,
49
+ questions_themes: Dict[int, pd.DataFrame],
50
+ question_strings: Optional[Dict[str, str]] = None,
51
+ n_concepts: int = 5,
52
+ ) -> None:
53
+ """Initialize the cross-cutting themes agent.
54
+
55
+ Args:
56
+ llm: Language model instance for text generation
57
+ questions_themes: Dictionary mapping question numbers to theme DataFrames
58
+ question_strings: Optional dictionary mapping question IDs to question text
59
+ n_concepts: Number of high-level cross-cutting themes to identify
60
+
61
+ Raises:
62
+ ValueError: If questions_themes is empty
63
+ """
64
+ self.llm = llm
65
+ self.questions_themes = questions_themes
66
+ self.question_strings = question_strings or {}
67
+ self.n_concepts = n_concepts
68
+ self.concepts: List[Dict[str, str]] = []
69
+ self.concept_assignments: Dict[str, List[Dict[str, Any]]] = {}
70
+ self.concept_descriptions: Dict[str, str] = {}
71
+
72
+ # Validate input
73
+ if not questions_themes:
74
+ raise ValueError("questions_themes cannot be empty")
75
+
76
+ # Count total themes for statistics
77
+ self.total_themes = sum(len(df) for df in questions_themes.values())
78
+
79
+ logger.info(
80
+ f"Initialized CrossCuttingThemesAgent with {len(questions_themes)} questions, "
81
+ f"{self.total_themes} total themes"
82
+ )
83
+
84
+ def _format_questions_and_themes(self) -> str:
85
+ """Format all questions and themes for cross-cutting theme identification.
86
+
87
+ Returns:
88
+ Formatted string with all questions and their themes
89
+ """
90
+ formatted_lines = []
91
+
92
+ for q_id, themes_df in self.questions_themes.items():
93
+ # Get question text if available
94
+ question_text = self.question_strings.get(str(q_id), f"Question {q_id}")
95
+
96
+ # Get theme list
97
+ theme_list = themes_df["topic"].to_list()
98
+
99
+ # Format as single line
100
+ formatted_lines.append(
101
+ f"Question: {question_text}, theme list: {theme_list}"
102
+ )
103
+
104
+ return "\\n".join(formatted_lines)
105
+
106
+ @retry(
107
+ wait=wait_random_exponential(min=1, max=2),
108
+ stop=stop_after_attempt(3),
109
+ before=before.before_log(logger=logger, log_level=logging.DEBUG),
110
+ before_sleep=before_sleep_log(logger, logging.ERROR),
111
+ reraise=True,
112
+ )
113
+ def identify_concepts(self) -> List[Dict[str, str]]:
114
+ """Identify high-level cross-cutting themes across all questions.
115
+
116
+ Uses a single LLM call to identify cross-cutting themes that unify individual themes
117
+ across multiple questions in the consultation.
118
+
119
+ Returns:
120
+ List of cross-cutting theme dictionaries with 'name' and 'description' keys
121
+ """
122
+ logger.info(f"Identifying {self.n_concepts} high-level cross-cutting themes")
123
+
124
+ # Format all questions and themes
125
+ questions_and_themes = self._format_questions_and_themes()
126
+
127
+ # Load prompt template
128
+ prompt_template = load_prompt_from_file("cross_cutting_identification")
129
+
130
+ # Create the prompt
131
+ prompt = prompt_template.format(
132
+ n_concepts=self.n_concepts, questions_and_themes=questions_and_themes
133
+ )
134
+
135
+ # Use structured output to get concepts
136
+ structured_llm = self.llm.with_structured_output(
137
+ CrossCuttingThemeIdentificationResponse
138
+ )
139
+ result = structured_llm.invoke(prompt)
140
+
141
+ if isinstance(result, dict):
142
+ result = CrossCuttingThemeIdentificationResponse(**result)
143
+
144
+ # Convert to our expected format
145
+ concepts = []
146
+ for theme in result.themes:
147
+ concepts.append({"name": theme.name, "description": theme.description})
148
+
149
+ self.concepts = concepts
150
+ logger.info(f"Identified {len(concepts)} cross-cutting themes")
151
+ return concepts
152
+
153
+ @retry(
154
+ wait=wait_random_exponential(min=1, max=2),
155
+ stop=stop_after_attempt(3),
156
+ before=before.before_log(logger=logger, log_level=logging.DEBUG),
157
+ before_sleep=before_sleep_log(logger, logging.ERROR),
158
+ reraise=True,
159
+ )
160
+ def map_themes_to_concepts(self) -> Dict[str, List[Dict[str, Any]]]:
161
+ """Map themes to identified cross-cutting themes using semantic similarity.
162
+
163
+ This uses individual LLM calls per question to classify which themes
164
+ belong to which cross-cutting themes.
165
+
166
+ Returns:
167
+ Dictionary mapping cross-cutting theme names to lists of assigned themes
168
+
169
+ Raises:
170
+ ValueError: If cross-cutting themes have not been identified yet
171
+ """
172
+ if not self.concepts:
173
+ raise ValueError("Must call identify_concepts() first")
174
+
175
+ logger.info("Mapping themes to cross-cutting themes for all questions")
176
+
177
+ all_assignments = {}
178
+ for concept in self.concepts:
179
+ all_assignments[concept["name"]] = []
180
+
181
+ # Process each question
182
+ for q_id, themes_df in self.questions_themes.items():
183
+ logger.info(f"Processing question {q_id}")
184
+
185
+ # Get question text
186
+ question_text = self.question_strings.get(str(q_id), f"Question {q_id}")
187
+
188
+ # Create theme dictionary
189
+ theme_dict = dict(zip(themes_df["topic_id"], themes_df["topic"]))
190
+
191
+ # Format question input
192
+ question_input = (
193
+ f"Question {q_id}: {question_text}, theme dictionary: {theme_dict}"
194
+ )
195
+
196
+ # Format cross-cutting themes
197
+ concepts_text = "\\n".join(
198
+ [
199
+ f"{concept['name']}: {concept['description']}"
200
+ for concept in self.concepts
201
+ ]
202
+ )
203
+
204
+ # Load prompt template
205
+ prompt_template = load_prompt_from_file("cross_cutting_mapping")
206
+
207
+ # Create mapping prompt
208
+ prompt = prompt_template.format(
209
+ question_input=question_input, concepts_text=concepts_text
210
+ )
211
+
212
+ # Use structured output to get mappings
213
+ structured_llm = self.llm.with_structured_output(
214
+ CrossCuttingThemeMappingResponse
215
+ )
216
+ result = structured_llm.invoke(prompt)
217
+
218
+ if isinstance(result, dict):
219
+ result = CrossCuttingThemeMappingResponse(**result)
220
+
221
+ # Convert to our expected format
222
+ question_assignments = {}
223
+ for mapping in result.mappings:
224
+ if mapping.theme_name in all_assignments:
225
+ question_assignments[mapping.theme_name] = mapping.theme_ids
226
+
227
+ # Add to overall assignments
228
+ for concept_name, theme_ids in question_assignments.items():
229
+ if concept_name in all_assignments:
230
+ for theme_id in theme_ids:
231
+ all_assignments[concept_name].append(
232
+ {
233
+ "question_id": q_id,
234
+ "theme_id": theme_id,
235
+ "theme_text": theme_dict.get(theme_id, ""),
236
+ }
237
+ )
238
+
239
+ self.concept_assignments = all_assignments
240
+ logger.info(
241
+ f"Completed theme mapping for {len(self.questions_themes)} questions"
242
+ )
243
+
244
+ return all_assignments
245
+
246
+ @retry(
247
+ wait=wait_random_exponential(min=1, max=2),
248
+ stop=stop_after_attempt(3),
249
+ before=before.before_log(logger=logger, log_level=logging.DEBUG),
250
+ before_sleep=before_sleep_log(logger, logging.ERROR),
251
+ reraise=True,
252
+ )
253
+ def refine_concept_descriptions(self) -> Dict[str, str]:
254
+ """Refine cross-cutting theme descriptions based on their assigned themes.
255
+
256
+ Creates enhanced descriptions that capture insights and details
257
+ from the themes actually assigned to each cross-cutting theme.
258
+
259
+ Returns:
260
+ Dictionary mapping cross-cutting theme names to refined descriptions
261
+
262
+ Raises:
263
+ ValueError: If theme mapping has not been performed yet
264
+ """
265
+ if not self.concept_assignments:
266
+ raise ValueError("Must call map_themes_to_concepts() first")
267
+
268
+ logger.info(
269
+ "Refining cross-cutting theme descriptions based on assigned themes"
270
+ )
271
+
272
+ refined_descriptions = {}
273
+
274
+ for concept_name, assignments in self.concept_assignments.items():
275
+ if not assignments:
276
+ # Keep original description if no themes assigned
277
+ original = next(
278
+ (
279
+ c["description"]
280
+ for c in self.concepts
281
+ if c["name"] == concept_name
282
+ ),
283
+ "",
284
+ )
285
+ refined_descriptions[concept_name] = original
286
+ continue
287
+
288
+ # Format assigned themes for the prompt
289
+ theme_lines = []
290
+ for assignment in assignments:
291
+ theme_lines.append(
292
+ f"Question {assignment['question_id']}, "
293
+ f"Theme {assignment['theme_id']}: {assignment['theme_text']}"
294
+ )
295
+
296
+ # Load prompt template
297
+ prompt_template = load_prompt_from_file("cross_cutting_refinement")
298
+
299
+ # Create refinement prompt
300
+ prompt = prompt_template.format(
301
+ concept_name=concept_name, theme_lines=chr(10).join(theme_lines)
302
+ )
303
+
304
+ # Get refined description
305
+ response = self.llm.invoke(prompt)
306
+ content = (
307
+ response.content if hasattr(response, "content") else str(response)
308
+ )
309
+
310
+ refined_descriptions[concept_name] = content.strip()
311
+ logger.info(f"Refined description for '{concept_name}'")
312
+
313
+ self.concept_descriptions = refined_descriptions
314
+ return refined_descriptions
315
+
316
+ def analyze(self) -> Dict[str, Any]:
317
+ """Run the cross-cutting theme identification and mapping process.
318
+
319
+ This orchestrates the analysis workflow:
320
+ 1. Identify high-level cross-cutting themes across all questions
321
+ 2. Map individual themes to the identified cross-cutting themes
322
+
323
+ Returns:
324
+ Dictionary with analysis results including cross-cutting themes and assignments
325
+ """
326
+
327
+ concepts = self.identify_concepts()
328
+ assignments = self.map_themes_to_concepts()
329
+
330
+ return {"concepts": concepts, "assignments": assignments}
331
+
332
+ def get_results_as_dataframe(self) -> pd.DataFrame:
333
+ """Convert results to DataFrame format for compatibility.
334
+
335
+ Returns:
336
+ DataFrame with concepts and their assigned themes, compatible
337
+ with other themefinder output formats
338
+ """
339
+ if not self.concepts or not self.concept_assignments:
340
+ return pd.DataFrame()
341
+
342
+ df_data = []
343
+
344
+ for concept in self.concepts:
345
+ concept_name = concept["name"]
346
+
347
+ # Group themes by question
348
+ themes_by_question = {}
349
+ if concept_name in self.concept_assignments:
350
+ for assignment in self.concept_assignments[concept_name]:
351
+ q_id = assignment["question_id"]
352
+ theme_id = assignment["theme_id"]
353
+
354
+ if q_id not in themes_by_question:
355
+ themes_by_question[q_id] = []
356
+ themes_by_question[q_id].append(theme_id)
357
+
358
+ # Use refined description if available, otherwise original
359
+ description = self.concept_descriptions.get(
360
+ concept_name, concept["description"]
361
+ )
362
+
363
+ df_data.append(
364
+ {
365
+ "name": concept_name,
366
+ "description": description,
367
+ "themes": themes_by_question,
368
+ "n_themes": sum(
369
+ len(themes) for themes in themes_by_question.values()
370
+ ),
371
+ "n_questions": len(themes_by_question),
372
+ }
373
+ )
374
+
375
+ return pd.DataFrame(df_data)
376
+
377
+ def get_statistics(self) -> Dict[str, Any]:
378
+ """Get statistics about the concept identification analysis.
379
+
380
+ Returns:
381
+ Dictionary with analysis statistics including theme utilization,
382
+ concept coverage, and processing metrics
383
+ """
384
+ # Count assigned themes
385
+ assigned_themes = set()
386
+ for assignments in self.concept_assignments.values():
387
+ for assignment in assignments:
388
+ assigned_themes.add((assignment["question_id"], assignment["theme_id"]))
389
+
390
+ used_count = len(assigned_themes)
391
+
392
+ return {
393
+ "total_themes": self.total_themes,
394
+ "used_themes": used_count,
395
+ "unused_themes": self.total_themes - used_count,
396
+ "utilization_rate": used_count / self.total_themes
397
+ if self.total_themes > 0
398
+ else 0,
399
+ "n_concepts": len(self.concepts),
400
+ "n_questions": len(self.questions_themes),
401
+ "concepts_with_themes": sum(
402
+ 1 for assignments in self.concept_assignments.values() if assignments
403
+ ),
404
+ }