themefinder 0.7.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
themefinder/tasks.py ADDED
@@ -0,0 +1,656 @@
1
+ import logging
2
+ from pathlib import Path
3
+
4
+ import pandas as pd
5
+ from langchain_core.prompts import PromptTemplate
6
+ from langchain_core.runnables import RunnableWithFallbacks
7
+
8
+ from themefinder.advanced_tasks.cross_cutting_themes_agent import (
9
+ CrossCuttingThemesAgent,
10
+ )
11
+ from themefinder.advanced_tasks.theme_clustering_agent import ThemeClusteringAgent
12
+ from themefinder.llm_batch_processor import batch_and_run, load_prompt_from_file
13
+ from themefinder.models import (
14
+ DetailDetectionResponses,
15
+ HierarchicalClusteringResponse,
16
+ SentimentAnalysisResponses,
17
+ ThemeCondensationResponses,
18
+ ThemeGenerationResponses,
19
+ ThemeMappingResponses,
20
+ ThemeNode,
21
+ ThemeRefinementResponses,
22
+ )
23
+ from themefinder.themefinder_logging import logger
24
+
25
# Default system prompt, loaded by name via load_prompt_from_file; it is the
# default value of the `system_prompt` parameter of the task functions below.
+ CONSULTATION_SYSTEM_PROMPT = load_prompt_from_file("consultation_system_prompt")
26
+
27
+
28
async def find_themes(
    responses_df: pd.DataFrame,
    llm: RunnableWithFallbacks,
    question: str,
    system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
    verbose: bool = True,
    concurrency: int = 10,
) -> dict[str, str | pd.DataFrame]:
    """Run the full theme-analysis pipeline for a single survey question.

    The stages are awaited one after another, each feeding the next:
        1. Sentiment analysis of ``responses_df``
        2. Theme generation from the sentiment output
        3. Theme condensation
        4. Theme refinement
        5. Mapping of the sentiment-stage responses onto the refined themes
        6. Detail detection on the original ``responses_df``

    Args:
        responses_df (pd.DataFrame): Survey responses. The 'response_id' and
            'response' columns are selected from it for detail detection.
        llm (RunnableWithFallbacks): Language model passed to every stage.
        question (str): The survey question.
        system_prompt (str): System prompt forwarded to every stage.
            Defaults to CONSULTATION_SYSTEM_PROMPT.
        verbose (bool): When True the module logger is set to INFO, otherwise
            to CRITICAL. Defaults to True.
        concurrency (int): Concurrency value forwarded to every stage.
            Defaults to 10.

    Returns:
        dict[str, str | pd.DataFrame]: Results keyed by stage:
            - question: The survey question string
            - sentiment: Output of the sentiment analysis stage
            - themes: Output of the theme refinement stage
            - mapping: Output of the theme mapping stage
            - detailed_responses: Output of the detail detection stage
            - unprocessables: Concatenation of the unprocessable rows reported
              by the sentiment and mapping stages only; the unprocessable
              outputs of the other stages are discarded.

    Note:
        The logger level is changed on the shared module logger, so the
        ``verbose`` setting persists after this call returns.
    """
    logger.setLevel(logging.INFO if verbose else logging.CRITICAL)

    # Keyword arguments that every stage receives unchanged.
    shared = {
        "question": question,
        "system_prompt": system_prompt,
        "concurrency": concurrency,
    }

    sentiment_df, sentiment_unprocessables = await sentiment_analysis(
        responses_df, llm, **shared
    )
    theme_df, _ = await theme_generation(sentiment_df, llm, **shared)
    condensed_theme_df, _ = await theme_condensation(theme_df, llm, **shared)
    refined_theme_df, _ = await theme_refinement(condensed_theme_df, llm, **shared)

    id_and_text = ["response_id", "response"]
    mapping_df, mapping_unprocessables = await theme_mapping(
        sentiment_df[id_and_text],
        llm,
        refined_themes_df=refined_theme_df,
        **shared,
    )
    detailed_df, _ = await detail_detection(responses_df[id_and_text], llm, **shared)

    logger.info("Finished finding themes")
    logger.info("Provide feedback or report bugs: packages@cabinetoffice.gov.uk")

    unprocessables = pd.concat([sentiment_unprocessables, mapping_unprocessables])
    return {
        "question": question,
        "sentiment": sentiment_df,
        "themes": refined_theme_df,
        "mapping": mapping_df,
        "detailed_responses": detailed_df,
        "unprocessables": unprocessables,
    }
123
+
124
+
125
async def sentiment_analysis(
    responses_df: pd.DataFrame,
    llm: RunnableWithFallbacks,
    question: str,
    batch_size: int = 20,
    prompt_template: str | Path | PromptTemplate = "sentiment_analysis",
    system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
    concurrency: int = 10,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Run batched sentiment analysis over survey responses with an LLM.

    The LLM is wrapped with ``with_structured_output(SentimentAnalysisResponses)``
    and the work is delegated to ``batch_and_run`` with ``integrity_check=True``.

    Args:
        responses_df (pd.DataFrame): Survey responses to analyse. Expected to
            contain 'response_id' and 'response' columns.
        llm (RunnableWithFallbacks): Language model used for the analysis.
        question (str): The survey question.
        batch_size (int, optional): Number of responses per batch.
            Defaults to 20.
        prompt_template (str | Path | PromptTemplate, optional): Prompt template
            identifier, path to a template file, or PromptTemplate instance.
            Defaults to "sentiment_analysis".
        system_prompt (str): System prompt to guide the LLM's behavior.
            Defaults to CONSULTATION_SYSTEM_PROMPT.
        concurrency (int): Number of concurrent API calls to make.
            Defaults to 10.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: The two values returned by
        ``batch_and_run``, in order:
            - rows that were successfully processed by the LLM
            - rows that could not be processed by the LLM
    """
    logger.info(f"Running sentiment analysis on {len(responses_df)} responses")

    structured_llm = llm.with_structured_output(SentimentAnalysisResponses)
    processed_df, failed_df = await batch_and_run(
        responses_df,
        prompt_template,
        structured_llm,
        batch_size=batch_size,
        question=question,
        integrity_check=True,
        system_prompt=system_prompt,
        concurrency=concurrency,
    )
    return processed_df, failed_df
176
+
177
+
178
async def theme_generation(
    responses_df: pd.DataFrame,
    llm: RunnableWithFallbacks,
    question: str,
    batch_size: int = 50,
    partition_key: str | None = None,
    prompt_template: str | Path | PromptTemplate = "theme_generation",
    system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
    concurrency: int = 10,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Generate candidate themes from batches of survey responses with an LLM.

    The LLM is wrapped with ``with_structured_output(ThemeGenerationResponses)``
    and the work is delegated to ``batch_and_run``.

    Args:
        responses_df (pd.DataFrame): Survey responses. Expected to include
            'response_id' and 'response' columns.
        llm (RunnableWithFallbacks): Language model used for theme generation.
        question (str): The survey question.
        batch_size (int, optional): Number of responses per batch.
            Defaults to 50.
        partition_key (str | None, optional): Forwarded unchanged to
            ``batch_and_run`` as the column used to group responses into
            batches. Defaults to None.
        prompt_template (str | Path | PromptTemplate, optional): Prompt template
            identifier, path to a template file, or PromptTemplate instance.
            Defaults to "theme_generation".
        system_prompt (str): System prompt to guide the LLM's behavior.
            Defaults to CONSULTATION_SYSTEM_PROMPT.
        concurrency (int): Number of concurrent API calls to make.
            Defaults to 10.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: The two values returned by
        ``batch_and_run``, in order:
            - rows that were successfully processed by the LLM
            - rows that could not be processed by the LLM
    """
    logger.info(f"Running theme generation on {len(responses_df)} responses")

    structured_llm = llm.with_structured_output(ThemeGenerationResponses)
    generated_df, failed_df = await batch_and_run(
        responses_df,
        prompt_template,
        structured_llm,
        batch_size=batch_size,
        partition_key=partition_key,
        question=question,
        system_prompt=system_prompt,
        concurrency=concurrency,
    )
    return generated_df, failed_df
229
+
230
+
231
async def theme_condensation(
    themes_df: pd.DataFrame,
    llm: RunnableWithFallbacks,
    question: str,
    batch_size: int = 75,
    prompt_template: str | Path | PromptTemplate = "theme_condensation",
    system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
    concurrency: int = 10,
    **kwargs,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Condense and combine similar themes identified from survey responses.

    While there are more themes than ``batch_size``, the themes are condensed
    batch-wise, shuffled and renumbered, and the pass is repeated. The loop
    stops early when a pass does not change the number of themes. A final
    condensation pass is always run afterwards.

    Args:
        themes_df (pd.DataFrame): Initial themes identified from survey
            responses. It is not modified; a copy carrying a sequential
            'response_id' column (index + 1) is used internally.
        llm (RunnableWithFallbacks): Language model used for theme condensation.
        question (str): The survey question.
        batch_size (int, optional): Number of themes per batch. Defaults to 75.
        prompt_template (str | Path | PromptTemplate, optional): Prompt template
            identifier, path to a template file, or PromptTemplate instance.
            Defaults to "theme_condensation".
        system_prompt (str): System prompt to guide the LLM's behavior.
            Defaults to CONSULTATION_SYSTEM_PROMPT.
        concurrency (int): Number of concurrent API calls to make.
            Defaults to 10.
        **kwargs: Extra keyword arguments forwarded unchanged to every
            ``batch_and_run`` call.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: The two values returned by the final
        ``batch_and_run`` call, in order:
            - rows that were successfully processed by the LLM
            - rows that could not be processed by the LLM
    """
    logger.info(f"Running theme condensation on {len(themes_df)} themes")

    # assign() returns a new DataFrame, so the caller's frame is left untouched
    # (the original wrote the response_id column into the argument in place).
    themes_df = themes_df.assign(response_id=themes_df.index + 1)

    # The structured-output wrapper does not depend on the loop state.
    condenser = llm.with_structured_output(ThemeCondensationResponses)

    async def condense_once(current_df: pd.DataFrame):
        """Run one batched condensation pass over ``current_df``."""
        return await batch_and_run(
            current_df,
            prompt_template,
            condenser,
            batch_size=batch_size,
            question=question,
            system_prompt=system_prompt,
            concurrency=concurrency,
            **kwargs,
        )

    n_themes = themes_df.shape[0]
    while n_themes > batch_size:
        logger.info(
            f"{n_themes} larger than batch size, using recursive theme condensation"
        )
        themes_df, _ = await condense_once(themes_df)
        # Shuffle so the next pass groups different themes into each batch,
        # then renumber to keep response_id sequential.
        themes_df = themes_df.sample(frac=1).reset_index(drop=True)
        themes_df["response_id"] = themes_df.index + 1
        if len(themes_df) == n_themes:
            # No reduction in this pass: stop to avoid looping forever.
            logger.info("Themes no longer being condensed")
            break
        n_themes = themes_df.shape[0]

    themes_df, unprocessable = await condense_once(themes_df)

    logger.info(f"Final number of condensed themes: {themes_df.shape[0]}")
    return themes_df, unprocessable
305
+
306
+
307
def theme_clustering(
    themes_df: pd.DataFrame,
    llm: RunnableWithFallbacks,
    max_iterations: int = 5,
    target_themes: int = 10,
    significance_percentage: float = 10.0,
    return_all_themes: bool = False,
    system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Cluster themes hierarchically with a ThemeClusteringAgent.

    Each row of ``themes_df`` becomes a ThemeNode; the agent clusters the nodes
    and either every clustered theme or only the themes selected by the agent
    for the given significance percentage are returned.

    Args:
        themes_df (pd.DataFrame): Themes with the columns:
            - topic_id
            - topic_label
            - topic_description
            - source_topic_count
        llm (RunnableWithFallbacks): Language model instance. This function
            applies ``with_structured_output(HierarchicalClusteringResponse)``
            itself before handing it to the agent.
        max_iterations (int, optional): Maximum number of clustering
            iterations, passed to the agent. Defaults to 5.
        target_themes (int, optional): Target number of themes, passed to the
            agent. Defaults to 10.
        significance_percentage (float, optional): Threshold passed to the
            agent's ``select_themes``. Defaults to 10.0.
        return_all_themes (bool, optional): If True, return the full
            clustering result; otherwise return only the selected themes.
            Defaults to False.
        system_prompt (str): System prompt passed to the agent.
            Defaults to CONSULTATION_SYSTEM_PROMPT.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]:
            - The clustered themes (all, or only the selected ones)
            - An empty DataFrame (for consistency with other functions)
    """
    logger.info(f"Starting hierarchical clustering of {len(themes_df)} themes")

    # One ThemeNode per input row, built from the four expected columns.
    node_fields = (
        "topic_id",
        "topic_label",
        "topic_description",
        "source_topic_count",
    )
    initial_themes = []
    for _, row in themes_df.iterrows():
        initial_themes.append(ThemeNode(**{name: row[name] for name in node_fields}))

    structured_llm = llm.with_structured_output(HierarchicalClusteringResponse)
    agent = ThemeClusteringAgent(
        structured_llm,
        initial_themes,
        system_prompt,
        target_themes,
    )

    logger.info(
        f"Clustering themes with max_iterations={max_iterations}, target_themes={target_themes}"
    )
    all_themes_df = agent.cluster_themes(
        max_iterations=max_iterations, target_themes=target_themes
    )

    no_unprocessables = pd.DataFrame()
    if return_all_themes:
        logger.info(
            f"Clustering complete: returning all {len(all_themes_df)} clustered themes"
        )
        return all_themes_df, no_unprocessables

    logger.info(
        f"Selecting themes with significance_percentage={significance_percentage}%"
    )
    selected_themes_df = agent.select_themes(significance_percentage)
    logger.info(
        f"Clustering complete: returning {len(selected_themes_df)} significant themes"
    )
    return selected_themes_df, no_unprocessables
392
+
393
+
394
async def theme_refinement(
    condensed_themes_df: pd.DataFrame,
    llm: RunnableWithFallbacks,
    question: str,
    batch_size: int = 10000,
    prompt_template: str | Path | PromptTemplate = "theme_refinement",
    system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
    concurrency: int = 10,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Refine and standardize condensed themes using an LLM.

    The condensed themes are sent through ``batch_and_run`` with the LLM wrapped
    by ``with_structured_output(ThemeRefinementResponses)``. The refined themes
    are then labelled with sequential alphabetic ``topic_id`` values
    (A, B, ..., Z, AA, AB, ...). No transposition is performed here.

    Args:
        condensed_themes_df (pd.DataFrame): Condensed themes from the previous
            pipeline stage. It is not modified; a copy carrying a sequential
            'response_id' column (index + 1) is sent to the LLM.
        llm (RunnableWithFallbacks): Language model used for theme refinement.
        question (str): The survey question.
        batch_size (int, optional): Number of themes per batch.
            Defaults to 10000.
        prompt_template (str | Path | PromptTemplate, optional): Prompt template
            identifier, path to a template file, or PromptTemplate instance.
            Defaults to "theme_refinement".
        system_prompt (str): System prompt to guide the LLM's behavior.
            Defaults to CONSULTATION_SYSTEM_PROMPT.
        concurrency (int): Number of concurrent API calls to make.
            Defaults to 10.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]:
            - The refined themes, with a 'topic_id' column added unless the
              result is empty
            - The second value returned by ``batch_and_run`` (rows that could
              not be processed by the LLM)
    """
    logger.info(f"Running theme refinement on {len(condensed_themes_df)} responses")

    # assign() returns a new DataFrame, so the caller's frame is left untouched
    # (the original wrote the response_id column into the argument in place).
    numbered_themes_df = condensed_themes_df.assign(
        response_id=condensed_themes_df.index + 1
    )

    refined_themes, unprocessable = await batch_and_run(
        numbered_themes_df,
        prompt_template,
        llm.with_structured_output(ThemeRefinementResponses),
        batch_size=batch_size,
        question=question,
        system_prompt=system_prompt,
        concurrency=concurrency,
    )

    def alpha_id(position: int) -> str:
        """Return the spreadsheet-style label for a zero-based position.

        0 -> "A", 25 -> "Z", 26 -> "AA", 27 -> "AB", ...
        """
        label = ""
        remaining = position
        while True:
            remaining, letter = divmod(remaining, 26)
            label = chr(ord("A") + letter) + label
            if remaining == 0:
                return label
            # There is no "zero" letter in this numbering, so shift by one
            # before extracting the next, more significant, letter.
            remaining -= 1

    # An empty result is returned as-is, without a topic_id column.
    if not refined_themes.empty:
        refined_themes["topic_id"] = [
            alpha_id(position) for position in range(len(refined_themes))
        ]

    return refined_themes, unprocessable
474
+
475
+
476
async def theme_mapping(
    responses_df: pd.DataFrame,
    llm: RunnableWithFallbacks,
    question: str,
    refined_themes_df: pd.DataFrame,
    batch_size: int = 20,
    prompt_template: str | Path | PromptTemplate = "theme_mapping",
    system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
    concurrency: int = 10,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Map survey responses onto the refined themes using an LLM.

    The refined themes are reshaped into a one-row table whose columns are the
    ``topic_id`` values and whose cells are the ``topic`` texts. That table is
    converted to a list of records and passed to ``batch_and_run`` as
    ``refined_themes``, together with ``integrity_check=True``.

    Args:
        responses_df (pd.DataFrame): Survey responses. Expected to include
            'response_id' and 'response' columns.
        llm (RunnableWithFallbacks): Language model used for theme mapping.
        question (str): The survey question.
        refined_themes_df (pd.DataFrame): Refined themes with one row per theme
            and the columns 'topic_id' and 'topic'.
        batch_size (int, optional): Number of responses per batch.
            Defaults to 20.
        prompt_template (str | Path | PromptTemplate, optional): Prompt template
            identifier, path to a template file, or PromptTemplate instance.
            Defaults to "theme_mapping".
        system_prompt (str): System prompt to guide the LLM's behavior.
            Defaults to CONSULTATION_SYSTEM_PROMPT.
        concurrency (int): Number of concurrent API calls to make.
            Defaults to 10.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: The two values returned by
        ``batch_and_run``, in order:
            - rows that were successfully processed by the LLM
            - rows that could not be processed by the LLM
    """
    logger.info(
        f"Running theme mapping on {len(responses_df)} responses using {len(refined_themes_df)} themes"
    )

    structured_llm = llm.with_structured_output(ThemeMappingResponses)

    # Transpose topics for increased legibility: one row, one column per theme.
    themes_wide_df = pd.DataFrame(
        [refined_themes_df["topic"].to_numpy()],
        columns=refined_themes_df["topic_id"],
    )
    themes_records = themes_wide_df.to_dict(orient="records")

    mapped_df, failed_df = await batch_and_run(
        responses_df,
        prompt_template,
        structured_llm,
        batch_size=batch_size,
        question=question,
        refined_themes=themes_records,
        integrity_check=True,
        system_prompt=system_prompt,
        concurrency=concurrency,
    )
    return mapped_df, failed_df
539
+
540
+
541
async def detail_detection(
    responses_df: pd.DataFrame,
    llm: RunnableWithFallbacks,
    question: str,
    batch_size: int = 20,
    prompt_template: str | Path | PromptTemplate = "detail_detection",
    system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
    concurrency: int = 10,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Identify responses that provide high-value detailed evidence.

    Responses are processed in batches by an LLM wrapped with
    ``with_structured_output(DetailDetectionResponses)``; the work is delegated
    to ``batch_and_run`` with ``integrity_check=True``.

    Args:
        responses_df (pd.DataFrame): Survey responses to analyse. Expected to
            contain 'response_id' and 'response' columns.
        llm (RunnableWithFallbacks): Language model used for detail detection.
        question (str): The survey question.
        batch_size (int, optional): Number of responses per batch.
            Defaults to 20.
        prompt_template (str | Path | PromptTemplate, optional): Prompt template
            identifier, path to a template file, or PromptTemplate instance.
            Defaults to "detail_detection".
        system_prompt (str): System prompt to guide the LLM's behavior.
            Defaults to CONSULTATION_SYSTEM_PROMPT.
        concurrency (int): Number of concurrent API calls to make.
            Defaults to 10.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: The two values returned by
        ``batch_and_run``, in order:
            - rows that were successfully processed by the LLM
            - rows that could not be processed by the LLM
    """
    logger.info(f"Running detail detection on {len(responses_df)} responses")

    structured_llm = llm.with_structured_output(DetailDetectionResponses)
    detailed_df, failed_df = await batch_and_run(
        responses_df,
        prompt_template,
        structured_llm,
        batch_size=batch_size,
        question=question,
        integrity_check=True,
        system_prompt=system_prompt,
        concurrency=concurrency,
    )
    return detailed_df, failed_df
592
+
593
+
594
def cross_cutting_themes(
    questions_themes: dict[int, pd.DataFrame],
    llm: RunnableWithFallbacks,
    n_concepts: int = 5,
    min_themes: int = 5,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Identify themes that cut across several questions.

    A CrossCuttingThemesAgent is built from the per-question themes, its
    ``analyze`` method is run, and its results are fetched as a DataFrame.
    Result rows whose 'n_themes' value is below ``min_themes`` are dropped.

    Args:
        questions_themes (dict[int, pd.DataFrame]): Mapping from question number
            to that question's refined themes DataFrame. Each DataFrame should
            have the columns:
            - topic_id: Theme identifier (e.g., 'A', 'B', 'C')
            - topic: String in format "topic_name: topic_description"
        llm (RunnableWithFallbacks): Language model instance handed to the agent.
        n_concepts (int): Target number of cross-cutting themes, passed to the
            agent. Defaults to 5.
        min_themes (int): Minimum 'n_themes' value a result row needs in order
            to be kept. The filter is skipped when this is 0 or negative.
            Defaults to 5.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]:
            - The agent's results after filtering, with the index reset
            - An empty DataFrame (for consistency with other core functions)

    Raises:
        ValueError: If questions_themes is empty.
    """
    if not questions_themes:
        raise ValueError("questions_themes cannot be empty")

    agent = CrossCuttingThemesAgent(
        llm=llm, questions_themes=questions_themes, n_concepts=n_concepts
    )
    agent.analyze()
    results_df = agent.get_results_as_dataframe()

    # Keep only the cross-cutting themes backed by enough individual themes.
    if min_themes > 0:
        has_enough_themes = results_df["n_themes"] >= min_themes
        results_df = results_df[has_enough_themes]

    # The second element is an empty placeholder for the unprocessed rows.
    return results_df.reset_index(drop=True), pd.DataFrame()
@@ -0,0 +1,12 @@
1
+ import logging
2
+ import sys
3
+
4
+
5
# Logger shared by the themefinder task functions.
logger = logging.getLogger("theme_finder.tasks")

# Write records of level INFO and above to stdout, formatted as
# "<timestamp> <LEVEL>: <message>".
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s %(levelname)s: %(message)s")
handler.setFormatter(formatter)

logger.setLevel(logging.INFO)
logger.addHandler(handler)