themefinder 0.6.2__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of themefinder might be problematic. Click here for more details.
- themefinder/__init__.py +8 -2
- themefinder/core.py +217 -39
- themefinder/llm_batch_processor.py +33 -81
- themefinder/models.py +371 -94
- themefinder/prompts/agentic_theme_clustering.txt +31 -0
- themefinder/prompts/detail_detection.txt +19 -0
- themefinder/prompts/sentiment_analysis.txt +0 -14
- themefinder/prompts/theme_condensation.txt +2 -22
- themefinder/prompts/theme_generation.txt +6 -38
- themefinder/prompts/theme_mapping.txt +6 -23
- themefinder/prompts/theme_refinement.txt +7 -16
- themefinder/prompts/theme_target_alignment.txt +2 -10
- themefinder/theme_clustering_agent.py +332 -0
- {themefinder-0.6.2.dist-info → themefinder-0.7.0.dist-info}/METADATA +24 -9
- themefinder-0.7.0.dist-info/RECORD +19 -0
- {themefinder-0.6.2.dist-info → themefinder-0.7.0.dist-info}/WHEEL +1 -1
- themefinder-0.6.2.dist-info/RECORD +0 -16
- {themefinder-0.6.2.dist-info → themefinder-0.7.0.dist-info}/LICENCE +0 -0
themefinder/__init__.py
CHANGED
|
@@ -1,18 +1,24 @@
|
|
|
1
1
|
from .core import (
|
|
2
2
|
find_themes,
|
|
3
3
|
sentiment_analysis,
|
|
4
|
+
theme_clustering,
|
|
4
5
|
theme_condensation,
|
|
5
6
|
theme_generation,
|
|
6
7
|
theme_mapping,
|
|
7
8
|
theme_refinement,
|
|
9
|
+
theme_target_alignment,
|
|
10
|
+
detail_detection,
|
|
8
11
|
)
|
|
9
12
|
|
|
10
13
|
__all__ = [
|
|
11
14
|
"find_themes",
|
|
12
15
|
"sentiment_analysis",
|
|
13
|
-
"
|
|
16
|
+
"theme_clustering",
|
|
14
17
|
"theme_condensation",
|
|
15
|
-
"
|
|
18
|
+
"theme_generation",
|
|
16
19
|
"theme_mapping",
|
|
20
|
+
"theme_refinement",
|
|
21
|
+
"theme_target_alignment",
|
|
22
|
+
"detail_detection",
|
|
17
23
|
]
|
|
18
24
|
__version__ = "0.1.0"
|
themefinder/core.py
CHANGED
|
@@ -3,22 +3,33 @@ from pathlib import Path
|
|
|
3
3
|
|
|
4
4
|
import pandas as pd
|
|
5
5
|
from langchain_core.prompts import PromptTemplate
|
|
6
|
-
from
|
|
7
|
-
|
|
8
|
-
from .llm_batch_processor import batch_and_run, load_prompt_from_file
|
|
9
|
-
from .models import
|
|
10
|
-
|
|
6
|
+
from langchain.schema.runnable import RunnableWithFallbacks
|
|
7
|
+
|
|
8
|
+
from themefinder.llm_batch_processor import batch_and_run, load_prompt_from_file
|
|
9
|
+
from themefinder.models import (
|
|
10
|
+
SentimentAnalysisResponses,
|
|
11
|
+
ThemeGenerationResponses,
|
|
12
|
+
ThemeCondensationResponses,
|
|
13
|
+
ThemeRefinementResponses,
|
|
14
|
+
ThemeMappingResponses,
|
|
15
|
+
DetailDetectionResponses,
|
|
16
|
+
HierarchicalClusteringResponse,
|
|
17
|
+
ThemeNode,
|
|
18
|
+
)
|
|
19
|
+
from themefinder.theme_clustering_agent import ThemeClusteringAgent
|
|
20
|
+
from themefinder.themefinder_logging import logger
|
|
11
21
|
|
|
12
22
|
CONSULTATION_SYSTEM_PROMPT = load_prompt_from_file("consultation_system_prompt")
|
|
13
23
|
|
|
14
24
|
|
|
15
25
|
async def find_themes(
|
|
16
26
|
responses_df: pd.DataFrame,
|
|
17
|
-
llm:
|
|
27
|
+
llm: RunnableWithFallbacks,
|
|
18
28
|
question: str,
|
|
19
29
|
target_n_themes: int | None = None,
|
|
20
30
|
system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
|
|
21
31
|
verbose: bool = True,
|
|
32
|
+
concurrency: int = 10,
|
|
22
33
|
) -> dict[str, str | pd.DataFrame]:
|
|
23
34
|
"""Process survey responses through a multi-stage theme analysis pipeline.
|
|
24
35
|
|
|
@@ -32,7 +43,7 @@ async def find_themes(
|
|
|
32
43
|
|
|
33
44
|
Args:
|
|
34
45
|
responses_df (pd.DataFrame): DataFrame containing survey responses
|
|
35
|
-
llm (
|
|
46
|
+
llm (RunnableWithFallbacks): Language model instance for text analysis
|
|
36
47
|
question (str): The survey question
|
|
37
48
|
target_n_themes (int | None, optional): Target number of themes to consolidate to.
|
|
38
49
|
If None, skip theme target alignment step. Defaults to None.
|
|
@@ -40,6 +51,7 @@ async def find_themes(
|
|
|
40
51
|
Defaults to CONSULTATION_SYSTEM_PROMPT.
|
|
41
52
|
verbose (bool): Whether to show information messages during processing.
|
|
42
53
|
Defaults to True.
|
|
54
|
+
concurrency (int): Number of concurrent API calls to make. Defaults to 10.
|
|
43
55
|
|
|
44
56
|
Returns:
|
|
45
57
|
dict[str, str | pd.DataFrame]: Dictionary containing results from each pipeline stage:
|
|
@@ -56,21 +68,28 @@ async def find_themes(
|
|
|
56
68
|
llm,
|
|
57
69
|
question=question,
|
|
58
70
|
system_prompt=system_prompt,
|
|
71
|
+
concurrency=concurrency,
|
|
59
72
|
)
|
|
60
73
|
theme_df, _ = await theme_generation(
|
|
61
74
|
sentiment_df,
|
|
62
75
|
llm,
|
|
63
76
|
question=question,
|
|
64
77
|
system_prompt=system_prompt,
|
|
78
|
+
concurrency=concurrency,
|
|
65
79
|
)
|
|
66
80
|
condensed_theme_df, _ = await theme_condensation(
|
|
67
|
-
theme_df,
|
|
81
|
+
theme_df,
|
|
82
|
+
llm,
|
|
83
|
+
question=question,
|
|
84
|
+
system_prompt=system_prompt,
|
|
85
|
+
concurrency=concurrency,
|
|
68
86
|
)
|
|
69
87
|
refined_theme_df, _ = await theme_refinement(
|
|
70
88
|
condensed_theme_df,
|
|
71
89
|
llm,
|
|
72
90
|
question=question,
|
|
73
91
|
system_prompt=system_prompt,
|
|
92
|
+
concurrency=concurrency,
|
|
74
93
|
)
|
|
75
94
|
if target_n_themes is not None:
|
|
76
95
|
refined_theme_df, _ = await theme_target_alignment(
|
|
@@ -79,6 +98,7 @@ async def find_themes(
|
|
|
79
98
|
question=question,
|
|
80
99
|
target_n_themes=target_n_themes,
|
|
81
100
|
system_prompt=system_prompt,
|
|
101
|
+
concurrency=concurrency,
|
|
82
102
|
)
|
|
83
103
|
mapping_df, mapping_unprocessables = await theme_mapping(
|
|
84
104
|
sentiment_df[["response_id", "response"]],
|
|
@@ -86,28 +106,36 @@ async def find_themes(
|
|
|
86
106
|
question=question,
|
|
87
107
|
refined_themes_df=refined_theme_df,
|
|
88
108
|
system_prompt=system_prompt,
|
|
109
|
+
concurrency=concurrency,
|
|
110
|
+
)
|
|
111
|
+
detailed_df, _ = await detail_detection(
|
|
112
|
+
responses_df[["response_id", "response"]],
|
|
113
|
+
llm,
|
|
114
|
+
question=question,
|
|
115
|
+
system_prompt=system_prompt,
|
|
116
|
+
concurrency=concurrency,
|
|
89
117
|
)
|
|
90
118
|
|
|
91
119
|
logger.info("Finished finding themes")
|
|
92
|
-
logger.info(
|
|
93
|
-
"Provide feedback or report bugs: https://forms.gle/85xUSMvxGzSSKQ499 or packages@cabinetoffice.gov.uk"
|
|
94
|
-
)
|
|
120
|
+
logger.info("Provide feedback or report bugs: packages@cabinetoffice.gov.uk")
|
|
95
121
|
return {
|
|
96
122
|
"question": question,
|
|
97
123
|
"sentiment": sentiment_df,
|
|
98
124
|
"themes": refined_theme_df,
|
|
99
125
|
"mapping": mapping_df,
|
|
126
|
+
"detailed_responses": detailed_df,
|
|
100
127
|
"unprocessables": pd.concat([sentiment_unprocessables, mapping_unprocessables]),
|
|
101
128
|
}
|
|
102
129
|
|
|
103
130
|
|
|
104
131
|
async def sentiment_analysis(
|
|
105
132
|
responses_df: pd.DataFrame,
|
|
106
|
-
llm:
|
|
133
|
+
llm: RunnableWithFallbacks,
|
|
107
134
|
question: str,
|
|
108
135
|
batch_size: int = 20,
|
|
109
136
|
prompt_template: str | Path | PromptTemplate = "sentiment_analysis",
|
|
110
137
|
system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
|
|
138
|
+
concurrency: int = 10,
|
|
111
139
|
) -> tuple[pd.DataFrame, pd.DataFrame]:
|
|
112
140
|
"""Perform sentiment analysis on survey responses using an LLM.
|
|
113
141
|
|
|
@@ -117,7 +145,7 @@ async def sentiment_analysis(
|
|
|
117
145
|
Args:
|
|
118
146
|
responses_df (pd.DataFrame): DataFrame containing survey responses to analyze.
|
|
119
147
|
Must contain 'response_id' and 'response' columns.
|
|
120
|
-
llm (
|
|
148
|
+
llm (RunnableWithFallbacks): Language model instance to use for sentiment analysis.
|
|
121
149
|
question (str): The survey question.
|
|
122
150
|
batch_size (int, optional): Number of responses to process in each batch.
|
|
123
151
|
Defaults to 20.
|
|
@@ -126,6 +154,7 @@ async def sentiment_analysis(
|
|
|
126
154
|
or PromptTemplate instance. Defaults to "sentiment_analysis".
|
|
127
155
|
system_prompt (str): System prompt to guide the LLM's behavior.
|
|
128
156
|
Defaults to CONSULTATION_SYSTEM_PROMPT.
|
|
157
|
+
concurrency (int): Number of concurrent API calls to make. Defaults to 10.
|
|
129
158
|
|
|
130
159
|
Returns:
|
|
131
160
|
tuple[pd.DataFrame, pd.DataFrame]:
|
|
@@ -134,32 +163,33 @@ async def sentiment_analysis(
|
|
|
134
163
|
- The second DataFrame contains the rows that could not be processed by the LLM
|
|
135
164
|
|
|
136
165
|
Note:
|
|
137
|
-
The function uses
|
|
166
|
+
The function uses integrity_check to ensure responses maintain
|
|
138
167
|
their original order and association after processing.
|
|
139
168
|
"""
|
|
140
169
|
logger.info(f"Running sentiment analysis on {len(responses_df)} responses")
|
|
141
|
-
|
|
170
|
+
sentiment, unprocessable = await batch_and_run(
|
|
142
171
|
responses_df,
|
|
143
172
|
prompt_template,
|
|
144
|
-
llm,
|
|
173
|
+
llm.with_structured_output(SentimentAnalysisResponses),
|
|
145
174
|
batch_size=batch_size,
|
|
146
175
|
question=question,
|
|
147
|
-
|
|
148
|
-
task_validation_model=SentimentAnalysisOutput,
|
|
176
|
+
integrity_check=True,
|
|
149
177
|
system_prompt=system_prompt,
|
|
178
|
+
concurrency=concurrency,
|
|
150
179
|
)
|
|
151
180
|
|
|
152
|
-
return
|
|
181
|
+
return sentiment, unprocessable
|
|
153
182
|
|
|
154
183
|
|
|
155
184
|
async def theme_generation(
|
|
156
185
|
responses_df: pd.DataFrame,
|
|
157
|
-
llm:
|
|
186
|
+
llm: RunnableWithFallbacks,
|
|
158
187
|
question: str,
|
|
159
188
|
batch_size: int = 50,
|
|
160
189
|
partition_key: str | None = "position",
|
|
161
190
|
prompt_template: str | Path | PromptTemplate = "theme_generation",
|
|
162
191
|
system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
|
|
192
|
+
concurrency: int = 10,
|
|
163
193
|
) -> tuple[pd.DataFrame, pd.DataFrame]:
|
|
164
194
|
"""Generate themes from survey responses using an LLM.
|
|
165
195
|
|
|
@@ -168,7 +198,7 @@ async def theme_generation(
|
|
|
168
198
|
Args:
|
|
169
199
|
responses_df (pd.DataFrame): DataFrame containing survey responses.
|
|
170
200
|
Must include 'response_id' and 'response' columns.
|
|
171
|
-
llm (
|
|
201
|
+
llm (RunnableWithFallbacks): Language model instance to use for theme generation.
|
|
172
202
|
question (str): The survey question.
|
|
173
203
|
batch_size (int, optional): Number of responses to process in each batch.
|
|
174
204
|
Defaults to 50.
|
|
@@ -181,6 +211,7 @@ async def theme_generation(
|
|
|
181
211
|
or PromptTemplate instance. Defaults to "theme_generation".
|
|
182
212
|
system_prompt (str): System prompt to guide the LLM's behavior.
|
|
183
213
|
Defaults to CONSULTATION_SYSTEM_PROMPT.
|
|
214
|
+
concurrency (int): Number of concurrent API calls to make. Defaults to 10.
|
|
184
215
|
|
|
185
216
|
Returns:
|
|
186
217
|
tuple[pd.DataFrame, pd.DataFrame]:
|
|
@@ -193,22 +224,24 @@ async def theme_generation(
|
|
|
193
224
|
generated_themes, _ = await batch_and_run(
|
|
194
225
|
responses_df,
|
|
195
226
|
prompt_template,
|
|
196
|
-
llm,
|
|
227
|
+
llm.with_structured_output(ThemeGenerationResponses),
|
|
197
228
|
batch_size=batch_size,
|
|
198
229
|
partition_key=partition_key,
|
|
199
230
|
question=question,
|
|
200
231
|
system_prompt=system_prompt,
|
|
232
|
+
concurrency=concurrency,
|
|
201
233
|
)
|
|
202
234
|
return generated_themes, _
|
|
203
235
|
|
|
204
236
|
|
|
205
237
|
async def theme_condensation(
|
|
206
238
|
themes_df: pd.DataFrame,
|
|
207
|
-
llm:
|
|
239
|
+
llm: RunnableWithFallbacks,
|
|
208
240
|
question: str,
|
|
209
241
|
batch_size: int = 75,
|
|
210
242
|
prompt_template: str | Path | PromptTemplate = "theme_condensation",
|
|
211
243
|
system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
|
|
244
|
+
concurrency: int = 10,
|
|
212
245
|
**kwargs,
|
|
213
246
|
) -> tuple[pd.DataFrame, pd.DataFrame]:
|
|
214
247
|
"""Condense and combine similar themes identified from survey responses.
|
|
@@ -219,7 +252,7 @@ async def theme_condensation(
|
|
|
219
252
|
Args:
|
|
220
253
|
themes_df (pd.DataFrame): DataFrame containing the initial themes identified
|
|
221
254
|
from survey responses.
|
|
222
|
-
llm (
|
|
255
|
+
llm (RunnableWithFallbacks): Language model instance to use for theme condensation.
|
|
223
256
|
question (str): The survey question.
|
|
224
257
|
batch_size (int, optional): Number of themes to process in each batch.
|
|
225
258
|
Defaults to 75.
|
|
@@ -228,6 +261,7 @@ async def theme_condensation(
|
|
|
228
261
|
or PromptTemplate instance. Defaults to "theme_condensation".
|
|
229
262
|
system_prompt (str): System prompt to guide the LLM's behavior.
|
|
230
263
|
Defaults to CONSULTATION_SYSTEM_PROMPT.
|
|
264
|
+
concurrency (int): Number of concurrent API calls to make. Defaults to 10.
|
|
231
265
|
|
|
232
266
|
Returns:
|
|
233
267
|
tuple[pd.DataFrame, pd.DataFrame]:
|
|
@@ -247,10 +281,11 @@ async def theme_condensation(
|
|
|
247
281
|
themes_df, _ = await batch_and_run(
|
|
248
282
|
themes_df,
|
|
249
283
|
prompt_template,
|
|
250
|
-
llm,
|
|
284
|
+
llm.with_structured_output(ThemeCondensationResponses),
|
|
251
285
|
batch_size=batch_size,
|
|
252
286
|
question=question,
|
|
253
287
|
system_prompt=system_prompt,
|
|
288
|
+
concurrency=concurrency,
|
|
254
289
|
**kwargs,
|
|
255
290
|
)
|
|
256
291
|
themes_df = themes_df.sample(frac=1).reset_index(drop=True)
|
|
@@ -263,10 +298,11 @@ async def theme_condensation(
|
|
|
263
298
|
themes_df, _ = await batch_and_run(
|
|
264
299
|
themes_df,
|
|
265
300
|
prompt_template,
|
|
266
|
-
llm,
|
|
301
|
+
llm.with_structured_output(ThemeCondensationResponses),
|
|
267
302
|
batch_size=batch_size,
|
|
268
303
|
question=question,
|
|
269
304
|
system_prompt=system_prompt,
|
|
305
|
+
concurrency=concurrency,
|
|
270
306
|
**kwargs,
|
|
271
307
|
)
|
|
272
308
|
|
|
@@ -274,13 +310,95 @@ async def theme_condensation(
|
|
|
274
310
|
return themes_df, _
|
|
275
311
|
|
|
276
312
|
|
|
313
|
+
def theme_clustering(
    themes_df: pd.DataFrame,
    llm: RunnableWithFallbacks,
    max_iterations: int = 5,
    target_themes: int = 10,
    significance_percentage: float = 10.0,
    return_all_themes: bool = False,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Cluster themes hierarchically by delegating to a ThemeClusteringAgent.

    Each row of ``themes_df`` is turned into a ``ThemeNode``; the nodes are
    handed to a ``ThemeClusteringAgent`` together with ``llm`` wrapped for
    ``HierarchicalClusteringResponse`` structured output. The agent is asked
    to cluster the themes and, unless ``return_all_themes`` is set, to pick
    out the significant ones afterwards.

    Args:
        themes_df (pd.DataFrame): One row per theme. The columns read here are
            ``topic_id``, ``topic_label``, ``topic_description`` and
            ``source_topic_count``.
        llm (RunnableWithFallbacks): Language model instance. This function
            calls ``with_structured_output(HierarchicalClusteringResponse)``
            on it, so the caller passes the plain model.
        max_iterations (int, optional): Forwarded to
            ``agent.cluster_themes``. Defaults to 5.
        target_themes (int, optional): Forwarded to ``agent.cluster_themes``.
            Defaults to 10.
        significance_percentage (float, optional): Forwarded to
            ``agent.select_themes`` when ``return_all_themes`` is False.
            Defaults to 10.0.
        return_all_themes (bool, optional): When True, return whatever
            ``agent.cluster_themes`` produced and skip the selection step.
            Defaults to False.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]:
            - The clustered themes (all of them, or only the selected ones).
            - An empty DataFrame, so the return shape matches the other
              pipeline stages.
    """
    logger.info(f"Starting hierarchical clustering of {len(themes_df)} themes")

    # Build one ThemeNode per input row from the four expected columns.
    node_fields = (
        "topic_id",
        "topic_label",
        "topic_description",
        "source_topic_count",
    )
    initial_themes = []
    for _, theme_row in themes_df.iterrows():
        node_kwargs = {field: theme_row[field] for field in node_fields}
        initial_themes.append(ThemeNode(**node_kwargs))

    # The agent expects an LLM that already emits the clustering schema.
    structured_llm = llm.with_structured_output(HierarchicalClusteringResponse)
    agent = ThemeClusteringAgent(structured_llm, initial_themes)

    logger.info(
        f"Clustering themes with max_iterations={max_iterations}, target_themes={target_themes}"
    )
    all_themes_df = agent.cluster_themes(
        max_iterations=max_iterations,
        target_themes=target_themes,
    )

    # Second tuple element is always empty: this stage reports no unprocessables.
    if return_all_themes:
        logger.info(
            f"Clustering complete: returning all {len(all_themes_df)} clustered themes"
        )
        return all_themes_df, pd.DataFrame()

    logger.info(
        f"Selecting themes with significance_percentage={significance_percentage}%"
    )
    selected_themes_df = agent.select_themes(significance_percentage)
    logger.info(
        f"Clustering complete: returning {len(selected_themes_df)} significant themes"
    )
    return selected_themes_df, pd.DataFrame()
|
|
392
|
+
|
|
393
|
+
|
|
277
394
|
async def theme_refinement(
|
|
278
395
|
condensed_themes_df: pd.DataFrame,
|
|
279
|
-
llm:
|
|
396
|
+
llm: RunnableWithFallbacks,
|
|
280
397
|
question: str,
|
|
281
398
|
batch_size: int = 10000,
|
|
282
399
|
prompt_template: str | Path | PromptTemplate = "theme_refinement",
|
|
283
400
|
system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
|
|
401
|
+
concurrency: int = 10,
|
|
284
402
|
) -> tuple[pd.DataFrame, pd.DataFrame]:
|
|
285
403
|
"""Refine and standardize condensed themes using an LLM.
|
|
286
404
|
|
|
@@ -292,7 +410,7 @@ async def theme_refinement(
|
|
|
292
410
|
Args:
|
|
293
411
|
condensed_themes (pd.DataFrame): DataFrame containing the condensed themes
|
|
294
412
|
from the previous pipeline stage.
|
|
295
|
-
llm (
|
|
413
|
+
llm (RunnableWithFallbacks): Language model instance to use for theme refinement.
|
|
296
414
|
question (str): The survey question.
|
|
297
415
|
batch_size (int, optional): Number of themes to process in each batch.
|
|
298
416
|
Defaults to 10000.
|
|
@@ -301,6 +419,7 @@ async def theme_refinement(
|
|
|
301
419
|
or PromptTemplate instance. Defaults to "theme_refinement".
|
|
302
420
|
system_prompt (str): System prompt to guide the LLM's behavior.
|
|
303
421
|
Defaults to CONSULTATION_SYSTEM_PROMPT.
|
|
422
|
+
concurrency (int): Number of concurrent API calls to make. Defaults to 10.
|
|
304
423
|
|
|
305
424
|
Returns:
|
|
306
425
|
tuple[pd.DataFrame, pd.DataFrame]:
|
|
@@ -319,22 +438,24 @@ async def theme_refinement(
|
|
|
319
438
|
refined_themes, _ = await batch_and_run(
|
|
320
439
|
condensed_themes_df,
|
|
321
440
|
prompt_template,
|
|
322
|
-
llm,
|
|
441
|
+
llm.with_structured_output(ThemeRefinementResponses),
|
|
323
442
|
batch_size=batch_size,
|
|
324
443
|
question=question,
|
|
325
444
|
system_prompt=system_prompt,
|
|
445
|
+
concurrency=concurrency,
|
|
326
446
|
)
|
|
327
447
|
return refined_themes, _
|
|
328
448
|
|
|
329
449
|
|
|
330
450
|
async def theme_target_alignment(
|
|
331
451
|
refined_themes_df: pd.DataFrame,
|
|
332
|
-
llm:
|
|
452
|
+
llm: RunnableWithFallbacks,
|
|
333
453
|
question: str,
|
|
334
454
|
target_n_themes: int = 10,
|
|
335
455
|
batch_size: int = 10000,
|
|
336
456
|
prompt_template: str | Path | PromptTemplate = "theme_target_alignment",
|
|
337
457
|
system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
|
|
458
|
+
concurrency: int = 10,
|
|
338
459
|
) -> tuple[pd.DataFrame, pd.DataFrame]:
|
|
339
460
|
"""Align themes to target number using an LLM.
|
|
340
461
|
|
|
@@ -346,7 +467,7 @@ async def theme_target_alignment(
|
|
|
346
467
|
Args:
|
|
347
468
|
refined_themes_df (pd.DataFrame): DataFrame containing the refined themes
|
|
348
469
|
from the previous pipeline stage.
|
|
349
|
-
llm (
|
|
470
|
+
llm (RunnableWithFallbacks): Language model instance to use for theme alignment.
|
|
350
471
|
question (str): The survey question.
|
|
351
472
|
target_n_themes (int, optional): Target number of themes to consolidate to.
|
|
352
473
|
Defaults to 10.
|
|
@@ -357,6 +478,7 @@ async def theme_target_alignment(
|
|
|
357
478
|
or PromptTemplate instance. Defaults to "theme_target_alignment".
|
|
358
479
|
system_prompt (str): System prompt to guide the LLM's behavior.
|
|
359
480
|
Defaults to CONSULTATION_SYSTEM_PROMPT.
|
|
481
|
+
concurrency (int): Number of concurrent API calls to make. Defaults to 10.
|
|
360
482
|
|
|
361
483
|
Returns:
|
|
362
484
|
tuple[pd.DataFrame, pd.DataFrame]:
|
|
@@ -376,23 +498,25 @@ async def theme_target_alignment(
|
|
|
376
498
|
aligned_themes, _ = await batch_and_run(
|
|
377
499
|
refined_themes_df,
|
|
378
500
|
prompt_template,
|
|
379
|
-
llm,
|
|
501
|
+
llm.with_structured_output(ThemeRefinementResponses),
|
|
380
502
|
batch_size=batch_size,
|
|
381
503
|
question=question,
|
|
382
504
|
system_prompt=system_prompt,
|
|
383
505
|
target_n_themes=target_n_themes,
|
|
506
|
+
concurrency=concurrency,
|
|
384
507
|
)
|
|
385
508
|
return aligned_themes, _
|
|
386
509
|
|
|
387
510
|
|
|
388
511
|
async def theme_mapping(
|
|
389
512
|
responses_df: pd.DataFrame,
|
|
390
|
-
llm:
|
|
513
|
+
llm: RunnableWithFallbacks,
|
|
391
514
|
question: str,
|
|
392
515
|
refined_themes_df: pd.DataFrame,
|
|
393
516
|
batch_size: int = 20,
|
|
394
517
|
prompt_template: str | Path | PromptTemplate = "theme_mapping",
|
|
395
518
|
system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
|
|
519
|
+
concurrency: int = 10,
|
|
396
520
|
) -> tuple[pd.DataFrame, pd.DataFrame]:
|
|
397
521
|
"""Map survey responses to refined themes using an LLM.
|
|
398
522
|
|
|
@@ -402,7 +526,7 @@ async def theme_mapping(
|
|
|
402
526
|
Args:
|
|
403
527
|
responses_df (pd.DataFrame): DataFrame containing survey responses.
|
|
404
528
|
Must include 'response_id' and 'response' columns.
|
|
405
|
-
llm (
|
|
529
|
+
llm (RunnableWithFallbacks): Language model instance to use for theme mapping.
|
|
406
530
|
question (str): The survey question.
|
|
407
531
|
refined_themes_df (pd.DataFrame): Single-row DataFrame where each column
|
|
408
532
|
represents a theme (from theme_refinement stage).
|
|
@@ -413,6 +537,7 @@ async def theme_mapping(
|
|
|
413
537
|
or PromptTemplate instance. Defaults to "theme_mapping".
|
|
414
538
|
system_prompt (str): System prompt to guide the LLM's behavior.
|
|
415
539
|
Defaults to CONSULTATION_SYSTEM_PROMPT.
|
|
540
|
+
concurrency (int): Number of concurrent API calls to make. Defaults to 10.
|
|
416
541
|
|
|
417
542
|
Returns:
|
|
418
543
|
tuple[pd.DataFrame, pd.DataFrame]:
|
|
@@ -432,17 +557,70 @@ async def theme_mapping(
|
|
|
432
557
|
)
|
|
433
558
|
return transposed_df
|
|
434
559
|
|
|
435
|
-
mapping,
|
|
560
|
+
mapping, unprocessable = await batch_and_run(
|
|
436
561
|
responses_df,
|
|
437
562
|
prompt_template,
|
|
438
|
-
llm,
|
|
563
|
+
llm.with_structured_output(ThemeMappingResponses),
|
|
439
564
|
batch_size=batch_size,
|
|
440
565
|
question=question,
|
|
441
566
|
refined_themes=transpose_refined_themes(refined_themes_df).to_dict(
|
|
442
567
|
orient="records"
|
|
443
568
|
),
|
|
444
|
-
|
|
445
|
-
|
|
569
|
+
integrity_check=True,
|
|
570
|
+
system_prompt=system_prompt,
|
|
571
|
+
concurrency=concurrency,
|
|
572
|
+
)
|
|
573
|
+
return mapping, unprocessable
|
|
574
|
+
|
|
575
|
+
|
|
576
|
+
async def detail_detection(
    responses_df: pd.DataFrame,
    llm: RunnableWithFallbacks,
    question: str,
    batch_size: int = 20,
    prompt_template: str | Path | PromptTemplate = "detail_detection",
    system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
    concurrency: int = 10,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Identify responses that provide high-value detailed evidence.

    This function processes survey responses in batches to analyze their level of detail
    and evidence using a language model. It identifies responses that contain specific
    examples, data, or detailed reasoning that provide strong supporting evidence.

    Args:
        responses_df (pd.DataFrame): DataFrame containing survey responses to analyze.
            Must contain 'response_id' and 'response' columns.
        llm (RunnableWithFallbacks): Language model instance to use for detail detection.
            It is wrapped with ``with_structured_output(DetailDetectionResponses)``
            before being passed to ``batch_and_run``.
        question (str): The survey question.
        batch_size (int, optional): Number of responses to process in each batch.
            Defaults to 20.
        prompt_template (str | Path | PromptTemplate, optional): Template for structuring
            the prompt to the LLM. Can be a string identifier, path to template file,
            or PromptTemplate instance. Defaults to "detail_detection".
        system_prompt (str): System prompt to guide the LLM's behavior.
            Defaults to CONSULTATION_SYSTEM_PROMPT.
        concurrency (int): Number of concurrent API calls to make. Defaults to 10.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]:
            A tuple containing two DataFrames:
                - The first DataFrame contains the rows that were successfully processed by the LLM
                - The second DataFrame contains the rows that could not be processed by the LLM

    Note:
        ``batch_and_run`` is called with ``integrity_check=True``, the same flag
        used by ``sentiment_analysis`` and ``theme_mapping``. Presumably this
        verifies that the returned rows still line up with the submitted
        responses - confirm against ``llm_batch_processor``.
    """
    logger.info(f"Running detail detection on {len(responses_df)} responses")
    # Name the second result explicitly: it is returned to the caller, so it
    # should not be bound to the throwaway name `_`.
    detailed, unprocessable = await batch_and_run(
        responses_df,
        prompt_template,
        llm.with_structured_output(DetailDetectionResponses),
        batch_size=batch_size,
        question=question,
        integrity_check=True,
        system_prompt=system_prompt,
        concurrency=concurrency,
    )
    return detailed, unprocessable
|