themefinder 0.7.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- themefinder/__init__.py +24 -0
- themefinder/advanced_tasks/__init__.py +0 -0
- themefinder/advanced_tasks/cross_cutting_themes_agent.py +404 -0
- themefinder/advanced_tasks/theme_clustering_agent.py +356 -0
- themefinder/llm_batch_processor.py +442 -0
- themefinder/models.py +438 -0
- themefinder/prompts/agentic_theme_clustering.txt +34 -0
- themefinder/prompts/consultation_system_prompt.txt +1 -0
- themefinder/prompts/cross_cutting_identification.txt +16 -0
- themefinder/prompts/cross_cutting_mapping.txt +19 -0
- themefinder/prompts/cross_cutting_refinement.txt +15 -0
- themefinder/prompts/detail_detection.txt +31 -0
- themefinder/prompts/sentiment_analysis.txt +41 -0
- themefinder/prompts/theme_condensation.txt +34 -0
- themefinder/prompts/theme_generation.txt +38 -0
- themefinder/prompts/theme_mapping.txt +36 -0
- themefinder/prompts/theme_refinement.txt +54 -0
- themefinder/prompts/theme_target_alignment.txt +18 -0
- themefinder/tasks.py +656 -0
- themefinder/themefinder_logging.py +12 -0
- themefinder-0.7.4.dist-info/METADATA +174 -0
- themefinder-0.7.4.dist-info/RECORD +24 -0
- themefinder-0.7.4.dist-info/WHEEL +4 -0
- themefinder-0.7.4.dist-info/licenses/LICENCE +21 -0
themefinder/tasks.py
ADDED
@@ -0,0 +1,656 @@
import logging
from pathlib import Path

import pandas as pd
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableWithFallbacks

from themefinder.advanced_tasks.cross_cutting_themes_agent import (
    CrossCuttingThemesAgent,
)
from themefinder.advanced_tasks.theme_clustering_agent import ThemeClusteringAgent
from themefinder.llm_batch_processor import batch_and_run, load_prompt_from_file
from themefinder.models import (
    DetailDetectionResponses,
    HierarchicalClusteringResponse,
    SentimentAnalysisResponses,
    ThemeCondensationResponses,
    ThemeGenerationResponses,
    ThemeMappingResponses,
    ThemeNode,
    ThemeRefinementResponses,
)
from themefinder.themefinder_logging import logger

CONSULTATION_SYSTEM_PROMPT = load_prompt_from_file("consultation_system_prompt")

async def find_themes(
    responses_df: pd.DataFrame,
    llm: RunnableWithFallbacks,
    question: str,
    system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
    verbose: bool = True,
    concurrency: int = 10,
) -> dict[str, str | pd.DataFrame]:
    """Process survey responses through a multi-stage theme analysis pipeline.

    This pipeline performs sequential analysis steps:
    1. Sentiment analysis of responses
    2. Initial theme generation
    3. Theme condensation (combining similar themes)
    4. Theme refinement
    5. Mapping responses to refined themes
    6. Detail detection (flagging responses with detailed evidence)

    Args:
        responses_df (pd.DataFrame): DataFrame containing survey responses
        llm (RunnableWithFallbacks): Language model instance for text analysis
        question (str): The survey question
        system_prompt (str): System prompt to guide the LLM's behavior.
            Defaults to CONSULTATION_SYSTEM_PROMPT.
        verbose (bool): Whether to show information messages during processing.
            Defaults to True.
        concurrency (int): Number of concurrent API calls to make. Defaults to 10.

    Returns:
        dict[str, str | pd.DataFrame]: Dictionary containing results from each pipeline stage:
            - question: The survey question string
            - sentiment: DataFrame with sentiment analysis results
            - themes: DataFrame with the final themes output
            - mapping: DataFrame mapping responses to final themes
            - detailed_responses: DataFrame flagging responses with detailed evidence
            - unprocessables: DataFrame containing the inputs that could not be processed by the LLM
    """
    logger.setLevel(logging.INFO if verbose else logging.CRITICAL)

    sentiment_df, sentiment_unprocessables = await sentiment_analysis(
        responses_df,
        llm,
        question=question,
        system_prompt=system_prompt,
        concurrency=concurrency,
    )
    theme_df, _ = await theme_generation(
        sentiment_df,
        llm,
        question=question,
        system_prompt=system_prompt,
        concurrency=concurrency,
    )
    condensed_theme_df, _ = await theme_condensation(
        theme_df,
        llm,
        question=question,
        system_prompt=system_prompt,
        concurrency=concurrency,
    )
    refined_theme_df, _ = await theme_refinement(
        condensed_theme_df,
        llm,
        question=question,
        system_prompt=system_prompt,
        concurrency=concurrency,
    )

    mapping_df, mapping_unprocessables = await theme_mapping(
        sentiment_df[["response_id", "response"]],
        llm,
        question=question,
        refined_themes_df=refined_theme_df,
        system_prompt=system_prompt,
        concurrency=concurrency,
    )
    detailed_df, _ = await detail_detection(
        responses_df[["response_id", "response"]],
        llm,
        question=question,
        system_prompt=system_prompt,
        concurrency=concurrency,
    )

    logger.info("Finished finding themes")
    logger.info("Provide feedback or report bugs: packages@cabinetoffice.gov.uk")
    return {
        "question": question,
        "sentiment": sentiment_df,
        "themes": refined_theme_df,
        "mapping": mapping_df,
        "detailed_responses": detailed_df,
        "unprocessables": pd.concat([sentiment_unprocessables, mapping_unprocessables]),
    }

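For orientation, a minimal usage sketch (editorial, not part of the released file). The model choice and input rows are illustrative assumptions; any langchain chat model wrapped with .with_fallbacks() satisfies the RunnableWithFallbacks type.

import asyncio

import pandas as pd
from langchain_openai import AzureChatOpenAI  # illustrative model choice

from themefinder.tasks import find_themes

# Wrap a primary model with a fallback to get a RunnableWithFallbacks
llm = AzureChatOpenAI(azure_deployment="gpt-4o").with_fallbacks(
    [AzureChatOpenAI(azure_deployment="gpt-4o-mini")]
)

responses_df = pd.DataFrame(
    {
        "response_id": [1, 2],
        "response": [
            "Cycle lanes feel unsafe at busy junctions.",
            "More benches and greenery in the square, please.",
        ],
    }
)
question = "How should we improve the town centre?"

results = asyncio.run(find_themes(responses_df, llm, question=question))
print(results["themes"])   # final refined themes
print(results["mapping"])  # responses mapped onto those themes
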
async def sentiment_analysis(
    responses_df: pd.DataFrame,
    llm: RunnableWithFallbacks,
    question: str,
    batch_size: int = 20,
    prompt_template: str | Path | PromptTemplate = "sentiment_analysis",
    system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
    concurrency: int = 10,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Perform sentiment analysis on survey responses using an LLM.

    This function processes survey responses in batches to analyze their sentiment
    using a language model. It maintains response integrity by checking response IDs.

    Args:
        responses_df (pd.DataFrame): DataFrame containing survey responses to analyze.
            Must contain 'response_id' and 'response' columns.
        llm (RunnableWithFallbacks): Language model instance to use for sentiment analysis.
        question (str): The survey question.
        batch_size (int, optional): Number of responses to process in each batch.
            Defaults to 20.
        prompt_template (str | Path | PromptTemplate, optional): Template for structuring
            the prompt to the LLM. Can be a string identifier, path to template file,
            or PromptTemplate instance. Defaults to "sentiment_analysis".
        system_prompt (str): System prompt to guide the LLM's behavior.
            Defaults to CONSULTATION_SYSTEM_PROMPT.
        concurrency (int): Number of concurrent API calls to make. Defaults to 10.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]:
            A tuple containing two DataFrames:
            - The first DataFrame contains the rows that were successfully processed by the LLM
            - The second DataFrame contains the rows that could not be processed by the LLM

    Note:
        The function uses integrity_check to ensure responses maintain
        their original order and association after processing.
    """
    logger.info(f"Running sentiment analysis on {len(responses_df)} responses")
    sentiment, unprocessable = await batch_and_run(
        responses_df,
        prompt_template,
        llm.with_structured_output(SentimentAnalysisResponses),
        batch_size=batch_size,
        question=question,
        integrity_check=True,
        system_prompt=system_prompt,
        concurrency=concurrency,
    )

    return sentiment, unprocessable

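Each stage can also be run on its own. A sketch reusing the llm, responses_df, and question objects from the earlier sketch; the second tuple element collects the rows the model failed to return intact.

sentiment_df, failed_df = asyncio.run(
    sentiment_analysis(responses_df, llm, question=question)
)
print(len(sentiment_df), "processed,", len(failed_df), "unprocessable")
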
async def theme_generation(
    responses_df: pd.DataFrame,
    llm: RunnableWithFallbacks,
    question: str,
    batch_size: int = 50,
    partition_key: str | None = None,
    prompt_template: str | Path | PromptTemplate = "theme_generation",
    system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
    concurrency: int = 10,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Generate themes from survey responses using an LLM.

    This function processes batches of survey responses to identify common themes or topics.

    Args:
        responses_df (pd.DataFrame): DataFrame containing survey responses.
            Must include 'response_id' and 'response' columns.
        llm (RunnableWithFallbacks): Language model instance to use for theme generation.
        question (str): The survey question.
        batch_size (int, optional): Number of responses to process in each batch.
            Defaults to 50.
        partition_key (str | None, optional): Column name to use for batching related
            responses together, e.g. "position" for sentiment-enriched responses.
            Defaults to None, which batches responses sequentially.
        prompt_template (str | Path | PromptTemplate, optional): Template for structuring
            the prompt to the LLM. Can be a string identifier, path to template file,
            or PromptTemplate instance. Defaults to "theme_generation".
        system_prompt (str): System prompt to guide the LLM's behavior.
            Defaults to CONSULTATION_SYSTEM_PROMPT.
        concurrency (int): Number of concurrent API calls to make. Defaults to 10.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]:
            A tuple containing two DataFrames:
            - The first DataFrame contains the rows that were successfully processed by the LLM
            - The second DataFrame contains the rows that could not be processed by the LLM
    """
    logger.info(f"Running theme generation on {len(responses_df)} responses")
    generated_themes, _ = await batch_and_run(
        responses_df,
        prompt_template,
        llm.with_structured_output(ThemeGenerationResponses),
        batch_size=batch_size,
        partition_key=partition_key,
        question=question,
        system_prompt=system_prompt,
        concurrency=concurrency,
    )
    return generated_themes, _

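A sketch of the grouped-batching option, continuing from the sentiment sketch above: passing partition_key batches rows that share a column value, here the sentiment-derived "position" column named in the docstring.

theme_df, _ = asyncio.run(
    theme_generation(sentiment_df, llm, question=question, partition_key="position")
)
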
async def theme_condensation(
    themes_df: pd.DataFrame,
    llm: RunnableWithFallbacks,
    question: str,
    batch_size: int = 75,
    prompt_template: str | Path | PromptTemplate = "theme_condensation",
    system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
    concurrency: int = 10,
    **kwargs,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Condense and combine similar themes identified from survey responses.

    This function processes the initially identified themes to combine similar or
    overlapping topics into more cohesive, broader categories using an LLM.

    Args:
        themes_df (pd.DataFrame): DataFrame containing the initial themes identified
            from survey responses.
        llm (RunnableWithFallbacks): Language model instance to use for theme condensation.
        question (str): The survey question.
        batch_size (int, optional): Number of themes to process in each batch.
            Defaults to 75.
        prompt_template (str | Path | PromptTemplate, optional): Template for structuring
            the prompt to the LLM. Can be a string identifier, path to template file,
            or PromptTemplate instance. Defaults to "theme_condensation".
        system_prompt (str): System prompt to guide the LLM's behavior.
            Defaults to CONSULTATION_SYSTEM_PROMPT.
        concurrency (int): Number of concurrent API calls to make. Defaults to 10.
        **kwargs: Additional keyword arguments forwarded to batch_and_run.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]:
            A tuple containing two DataFrames:
            - The first DataFrame contains the rows that were successfully processed by the LLM
            - The second DataFrame contains the rows that could not be processed by the LLM
    """
    logger.info(f"Running theme condensation on {len(themes_df)} themes")
    themes_df["response_id"] = themes_df.index + 1

    n_themes = themes_df.shape[0]
    while n_themes > batch_size:
        logger.info(
            f"{n_themes} larger than batch size, using recursive theme condensation"
        )
        themes_df, _ = await batch_and_run(
            themes_df,
            prompt_template,
            llm.with_structured_output(ThemeCondensationResponses),
            batch_size=batch_size,
            question=question,
            system_prompt=system_prompt,
            concurrency=concurrency,
            **kwargs,
        )
        # Shuffle so different themes share a batch on the next pass
        themes_df = themes_df.sample(frac=1).reset_index(drop=True)
        themes_df["response_id"] = themes_df.index + 1
        if len(themes_df) == n_themes:
            logger.info("Themes no longer being condensed")
            break
        n_themes = themes_df.shape[0]

    # Final pass once the remaining themes fit in a single batch
    themes_df, _ = await batch_and_run(
        themes_df,
        prompt_template,
        llm.with_structured_output(ThemeCondensationResponses),
        batch_size=batch_size,
        question=question,
        system_prompt=system_prompt,
        concurrency=concurrency,
        **kwargs,
    )

    logger.info(f"Final number of condensed themes: {themes_df.shape[0]}")
    return themes_df, _

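The loop above condenses recursively: up to batch_size themes are merged per call, the survivors are reshuffled so different themes meet on the next pass, and iteration stops once the count fits one batch or stops shrinking. A sketch with a deliberately small batch, continuing from the theme_generation sketch:

condensed_df, _ = asyncio.run(
    theme_condensation(theme_df, llm, question=question, batch_size=25)
)
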
def theme_clustering(
    themes_df: pd.DataFrame,
    llm: RunnableWithFallbacks,
    max_iterations: int = 5,
    target_themes: int = 10,
    significance_percentage: float = 10.0,
    return_all_themes: bool = False,
    system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Perform hierarchical clustering of themes using an agentic approach.

    This function takes a DataFrame of themes and uses the ThemeClusteringAgent
    to iteratively merge similar themes into a hierarchical structure, then
    selects the most significant themes based on a threshold.

    Args:
        themes_df (pd.DataFrame): DataFrame containing themes with columns:
            - topic_id: Unique identifier for each theme
            - topic_label: Short descriptive label for the theme
            - topic_description: Detailed description of the theme
            - source_topic_count: Number of source responses for this theme
        llm (RunnableWithFallbacks): Language model instance; it is wrapped with
            structured output for HierarchicalClusteringResponse internally
        max_iterations (int, optional): Maximum number of clustering iterations.
            Defaults to 5.
        target_themes (int, optional): Target number of themes to cluster down to.
            Defaults to 10.
        significance_percentage (float, optional): Percentage threshold for
            selecting significant themes. Defaults to 10.0.
        return_all_themes (bool, optional): If True, returns all clustered themes.
            If False, returns only significant themes. Defaults to False.
        system_prompt (str): System prompt to guide the LLM's behavior.
            Defaults to CONSULTATION_SYSTEM_PROMPT.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]:
            A tuple containing:
            - DataFrame of clustered themes (all or significant based on return_all_themes)
            - Empty DataFrame (for consistency with other functions)
    """
    logger.info(f"Starting hierarchical clustering of {len(themes_df)} themes")

    # Convert DataFrame to ThemeNode objects
    initial_themes = [
        ThemeNode(
            topic_id=row["topic_id"],
            topic_label=row["topic_label"],
            topic_description=row["topic_description"],
            source_topic_count=row["source_topic_count"],
        )
        for _, row in themes_df.iterrows()
    ]

    # Initialize clustering agent with structured output LLM
    agent = ThemeClusteringAgent(
        llm.with_structured_output(HierarchicalClusteringResponse),
        initial_themes,
        system_prompt,
        target_themes,
    )

    # Perform clustering
    logger.info(
        f"Clustering themes with max_iterations={max_iterations}, target_themes={target_themes}"
    )
    all_themes_df = agent.cluster_themes(
        max_iterations=max_iterations, target_themes=target_themes
    )

    # Return appropriate themes based on parameter
    if return_all_themes:
        logger.info(
            f"Clustering complete: returning all {len(all_themes_df)} clustered themes"
        )
        return all_themes_df, pd.DataFrame()
    else:
        # Select significant themes
        logger.info(
            f"Selecting themes with significance_percentage={significance_percentage}%"
        )
        selected_themes_df = agent.select_themes(significance_percentage)
        logger.info(
            f"Clustering complete: returning {len(selected_themes_df)} significant themes"
        )
        return selected_themes_df, pd.DataFrame()

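theme_clustering is synchronous and expects the four columns named in its docstring. A sketch with toy values, reusing the llm from the first sketch:

themes_df = pd.DataFrame(
    {
        "topic_id": ["A", "B", "C"],
        "topic_label": ["Road safety", "Public transport", "Green space"],
        "topic_description": [
            "Concerns about junction design and speeding.",
            "Requests for more frequent bus services.",
            "Calls for parks, trees, and planting.",
        ],
        "source_topic_count": [40, 25, 8],
    }
)
significant_df, _ = theme_clustering(themes_df, llm, target_themes=2)
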
async def theme_refinement(
    condensed_themes_df: pd.DataFrame,
    llm: RunnableWithFallbacks,
    question: str,
    batch_size: int = 10000,
    prompt_template: str | Path | PromptTemplate = "theme_refinement",
    system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
    concurrency: int = 10,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Refine and standardize condensed themes using an LLM.

    This function processes previously condensed themes to create clear, standardized
    theme descriptions, then assigns each refined theme a sequential alphabetic
    topic_id for easier downstream reference.

    Args:
        condensed_themes_df (pd.DataFrame): DataFrame containing the condensed themes
            from the previous pipeline stage.
        llm (RunnableWithFallbacks): Language model instance to use for theme refinement.
        question (str): The survey question.
        batch_size (int, optional): Number of themes to process in each batch.
            Defaults to 10000.
        prompt_template (str | Path | PromptTemplate, optional): Template for structuring
            the prompt to the LLM. Can be a string identifier, path to template file,
            or PromptTemplate instance. Defaults to "theme_refinement".
        system_prompt (str): System prompt to guide the LLM's behavior.
            Defaults to CONSULTATION_SYSTEM_PROMPT.
        concurrency (int): Number of concurrent API calls to make. Defaults to 10.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]:
            A tuple containing two DataFrames:
            - The first DataFrame contains the rows that were successfully processed by the LLM
            - The second DataFrame contains the rows that could not be processed by the LLM

    Note:
        The function adds sequential response_ids to the input DataFrame and
        sequential alphabetic topic_ids (A, B, ..., Z, AA, AB, ...) to the output
        for easier downstream processing.
    """
    logger.info(f"Running theme refinement on {len(condensed_themes_df)} responses")
    condensed_themes_df["response_id"] = condensed_themes_df.index + 1

    refined_themes, _ = await batch_and_run(
        condensed_themes_df,
        prompt_template,
        llm.with_structured_output(ThemeRefinementResponses),
        batch_size=batch_size,
        question=question,
        system_prompt=system_prompt,
        concurrency=concurrency,
    )

    def assign_sequential_topic_ids(df: pd.DataFrame) -> pd.DataFrame:
        """
        Assigns sequential alphabetic topic_ids (A, B, ..., Z, AA, AB, ...) to the DataFrame.
        """

        def alpha_ids(n: int) -> list[str]:
            ids = []
            for i in range(n):
                s = ""
                x = i
                while True:
                    x, r = divmod(x, 26)
                    s = chr(65 + r) + s
                    if x == 0:
                        break
                    x -= 1
                ids.append(s)
            return ids

        if not df.empty:
            df["topic_id"] = alpha_ids(len(df))
        return df

    refined_themes = assign_sequential_topic_ids(refined_themes)

    return refined_themes, _

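The nested alpha_ids helper above produces bijective base-26 labels, like spreadsheet column names. A standalone spot-check of the sequence (the helper is copied verbatim so this runs on its own):

def alpha_ids(n: int) -> list[str]:
    ids = []
    for i in range(n):
        s = ""
        x = i
        while True:
            x, r = divmod(x, 26)
            s = chr(65 + r) + s
            if x == 0:
                break
            x -= 1
        ids.append(s)
    return ids

# 0 -> "A", 25 -> "Z", then 26 -> "AA", 27 -> "AB"
assert alpha_ids(28) == [chr(65 + i) for i in range(26)] + ["AA", "AB"]
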
async def theme_mapping(
    responses_df: pd.DataFrame,
    llm: RunnableWithFallbacks,
    question: str,
    refined_themes_df: pd.DataFrame,
    batch_size: int = 20,
    prompt_template: str | Path | PromptTemplate = "theme_mapping",
    system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
    concurrency: int = 10,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Map survey responses to refined themes using an LLM.

    This function analyzes each survey response and determines which of the refined
    themes best matches its content. Multiple themes can be assigned to a single response.

    Args:
        responses_df (pd.DataFrame): DataFrame containing survey responses.
            Must include 'response_id' and 'response' columns.
        llm (RunnableWithFallbacks): Language model instance to use for theme mapping.
        question (str): The survey question.
        refined_themes_df (pd.DataFrame): DataFrame of refined themes (from the
            theme_refinement stage); it is transposed internally into a single-row
            frame with one column per theme before prompting.
        batch_size (int, optional): Number of responses to process in each batch.
            Defaults to 20.
        prompt_template (str | Path | PromptTemplate, optional): Template for structuring
            the prompt to the LLM. Can be a string identifier, path to template file,
            or PromptTemplate instance. Defaults to "theme_mapping".
        system_prompt (str): System prompt to guide the LLM's behavior.
            Defaults to CONSULTATION_SYSTEM_PROMPT.
        concurrency (int): Number of concurrent API calls to make. Defaults to 10.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]:
            A tuple containing two DataFrames:
            - The first DataFrame contains the rows that were successfully processed by the LLM
            - The second DataFrame contains the rows that could not be processed by the LLM
    """
    logger.info(
        f"Running theme mapping on {len(responses_df)} responses using {len(refined_themes_df)} themes"
    )

    def transpose_refined_themes(refined_themes: pd.DataFrame):
        """Transpose topics for increased legibility."""
        transposed_df = pd.DataFrame(
            [refined_themes["topic"].to_numpy()], columns=refined_themes["topic_id"]
        )
        return transposed_df

    mapping, unprocessable = await batch_and_run(
        responses_df,
        prompt_template,
        llm.with_structured_output(ThemeMappingResponses),
        batch_size=batch_size,
        question=question,
        refined_themes=transpose_refined_themes(refined_themes_df).to_dict(
            orient="records"
        ),
        integrity_check=True,
        system_prompt=system_prompt,
        concurrency=concurrency,
    )
    return mapping, unprocessable

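The internal transpose_refined_themes turns the themes table into a single-row frame keyed by topic_id, which is what reaches the prompt as refined_themes. A sketch of that reshaping with toy values:

refined = pd.DataFrame(
    {
        "topic_id": ["A", "B"],
        "topic": ["Road safety: junction design concerns", "Green space: calls for planting"],
    }
)
row = pd.DataFrame([refined["topic"].to_numpy()], columns=refined["topic_id"])
print(row.to_dict(orient="records"))
# [{'A': 'Road safety: junction design concerns', 'B': 'Green space: calls for planting'}]
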
async def detail_detection(
    responses_df: pd.DataFrame,
    llm: RunnableWithFallbacks,
    question: str,
    batch_size: int = 20,
    prompt_template: str | Path | PromptTemplate = "detail_detection",
    system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
    concurrency: int = 10,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Identify responses that provide high-value detailed evidence.

    This function processes survey responses in batches to analyze their level of detail
    and evidence using a language model. It identifies responses that contain specific
    examples, data, or detailed reasoning that provide strong supporting evidence.

    Args:
        responses_df (pd.DataFrame): DataFrame containing survey responses to analyze.
            Must contain 'response_id' and 'response' columns.
        llm (RunnableWithFallbacks): Language model instance to use for detail detection.
        question (str): The survey question.
        batch_size (int, optional): Number of responses to process in each batch.
            Defaults to 20.
        prompt_template (str | Path | PromptTemplate, optional): Template for structuring
            the prompt to the LLM. Can be a string identifier, path to template file,
            or PromptTemplate instance. Defaults to "detail_detection".
        system_prompt (str): System prompt to guide the LLM's behavior.
            Defaults to CONSULTATION_SYSTEM_PROMPT.
        concurrency (int): Number of concurrent API calls to make. Defaults to 10.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]:
            A tuple containing two DataFrames:
            - The first DataFrame contains the rows that were successfully processed by the LLM
            - The second DataFrame contains the rows that could not be processed by the LLM

    Note:
        The function uses integrity_check to ensure responses maintain
        their original order and association after processing.
    """
    logger.info(f"Running detail detection on {len(responses_df)} responses")
    detailed, _ = await batch_and_run(
        responses_df,
        prompt_template,
        llm.with_structured_output(DetailDetectionResponses),
        batch_size=batch_size,
        question=question,
        integrity_check=True,
        system_prompt=system_prompt,
        concurrency=concurrency,
    )
    return detailed, _

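Note that find_themes feeds this stage the raw responses_df rather than the sentiment-enriched frame. Run standalone it mirrors the sentiment sketch; the output schema comes from DetailDetectionResponses in models.py:

detailed_df, _ = asyncio.run(
    detail_detection(responses_df, llm, question=question)
)
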
def cross_cutting_themes(
    questions_themes: dict[int, pd.DataFrame],
    llm: RunnableWithFallbacks,
    n_concepts: int = 5,
    min_themes: int = 5,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Identify cross-cutting themes using a single-pass agent approach.

    This function analyzes refined themes from multiple questions to identify semantic
    patterns that span across different questions, creating cross-cutting theme
    categories that represent common concerns or policy areas.

    The analysis uses a single-pass process:
    1. Identify high-level cross-cutting themes across all questions
    2. Map individual themes to the identified cross-cutting themes
    3. Refine descriptions based on assigned themes

    Args:
        questions_themes (dict[int, pd.DataFrame]): Dictionary mapping question numbers
            to their refined themes DataFrames. Each DataFrame should have columns:
            - topic_id: Theme identifier (e.g., 'A', 'B', 'C')
            - topic: String in format "topic_name: topic_description"
        llm (RunnableWithFallbacks): Language model instance configured for
            structured output
        n_concepts (int): The target number of cross-cutting themes to generate
        min_themes (int): Minimum number of themes required for a valid
            cross-cutting theme group. Groups with fewer themes will be discarded.
            Defaults to 5.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing:
            - DataFrame with cross-cutting themes with columns:
                - name: Name of the cross-cutting theme
                - description: Description of what this theme represents
                - themes: Dictionary mapping question_number to list of theme_keys,
                  e.g., {1: ["A", "B"], 3: ["C"]}
                - n_themes: Number of mapped themes (used for the min_themes filter)
            - Empty DataFrame (for consistency with other core functions)

    Raises:
        ValueError: If questions_themes is empty or contains invalid data
        KeyError: If required columns are missing from themes DataFrames
    """
    # Validate input
    if not questions_themes:
        raise ValueError("questions_themes cannot be empty")

    # Use the CrossCuttingThemesAgent with external prompt files
    agent = CrossCuttingThemesAgent(
        llm=llm, questions_themes=questions_themes, n_concepts=n_concepts
    )

    # Run the analysis
    agent.analyze()

    # Get results as DataFrame using the agent's method
    df_results = agent.get_results_as_dataframe()

    # Apply minimum themes filter
    if min_themes > 0:
        df_results = df_results[df_results["n_themes"] >= min_themes]

    # Return filtered results plus an empty unprocessed DataFrame for consistency
    return df_results.reset_index(drop=True), pd.DataFrame()

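A sketch of the cross-question input shape, with one refined-theme frame per question number (toy labels, reusing the llm from the first sketch):

questions_themes = {
    1: pd.DataFrame(
        {
            "topic_id": ["A", "B"],
            "topic": ["Cost: affordability worries", "Safety: junction design"],
        }
    ),
    2: pd.DataFrame({"topic_id": ["A"], "topic": ["Cost: ticket pricing"]}),
}
cross_df, _ = cross_cutting_themes(questions_themes, llm, n_concepts=3, min_themes=2)
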
themefinder/themefinder_logging.py
ADDED
@@ -0,0 +1,12 @@
import logging
import sys


logger = logging.getLogger("theme_finder.tasks")
logger.setLevel(logging.INFO)

handler = logging.StreamHandler(sys.stdout)
formatter = logging.Formatter("%(asctime)s %(levelname)s: %(message)s")
handler.setFormatter(formatter)
handler.setLevel(logging.INFO)
logger.addHandler(handler)
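This module configures a shared stdout logger at import time; tasks.py adjusts its level through find_themes(verbose=...). A sketch of silencing it directly:

import logging

from themefinder.themefinder_logging import logger

logger.setLevel(logging.CRITICAL)  # equivalent to find_themes(verbose=False)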