themefinder 0.5.4__py3-none-any.whl → 0.6.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of themefinder has been flagged as potentially problematic.
- themefinder/__init__.py +6 -2
- themefinder/core.py +204 -97
- themefinder/llm_batch_processor.py +277 -145
- themefinder/models.py +351 -0
- themefinder/prompts/detail_detection.txt +19 -0
- themefinder/prompts/sentiment_analysis.txt +8 -19
- themefinder/prompts/theme_condensation.txt +2 -22
- themefinder/prompts/theme_generation.txt +6 -38
- themefinder/prompts/theme_mapping.txt +6 -23
- themefinder/prompts/theme_refinement.txt +14 -40
- themefinder/prompts/theme_target_alignment.txt +2 -10
- {themefinder-0.5.4.dist-info → themefinder-0.6.3.dist-info}/METADATA +25 -9
- themefinder-0.6.3.dist-info/RECORD +17 -0
- {themefinder-0.5.4.dist-info → themefinder-0.6.3.dist-info}/WHEEL +1 -1
- themefinder-0.5.4.dist-info/RECORD +0 -15
- {themefinder-0.5.4.dist-info → themefinder-0.6.3.dist-info}/LICENCE +0 -0
themefinder/__init__.py
CHANGED
@@ -1,10 +1,12 @@
 from .core import (
     find_themes,
     sentiment_analysis,
-    theme_generation,
     theme_condensation,
-    theme_refinement,
+    theme_generation,
     theme_mapping,
+    theme_refinement,
+    theme_target_alignment,
+    detail_detection,
 )
 
 __all__ = [
@@ -13,6 +15,8 @@ __all__ = [
     "theme_generation",
     "theme_condensation",
     "theme_refinement",
+    "theme_target_alignment",
     "theme_mapping",
+    "detail_detection",
 ]
 __version__ = "0.1.0"
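The export list now covers every pipeline stage, including the two new ones. A minimal sketch of the expanded public surface, assuming the 0.6.3 wheel is installed:

# theme_target_alignment and detail_detection are newly importable in 0.6.x
from themefinder import (
    find_themes,
    theme_target_alignment,
    detail_detection,
)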
themefinder/core.py
CHANGED
@@ -3,9 +3,17 @@ from pathlib import Path
 
 import pandas as pd
 from langchain_core.prompts import PromptTemplate
-from …
+from langchain.schema.runnable import RunnableWithFallbacks
 
 from .llm_batch_processor import batch_and_run, load_prompt_from_file
+from .models import (
+    SentimentAnalysisResponses,
+    ThemeGenerationResponses,
+    ThemeCondensationResponses,
+    ThemeRefinementResponses,
+    ThemeMappingResponses,
+    DetailDetectionResponses,
+)
 from .themefinder_logging import logger
 
 CONSULTATION_SYSTEM_PROMPT = load_prompt_from_file("consultation_system_prompt")
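Every stage's llm parameter is now typed as RunnableWithFallbacks rather than a bare model. A minimal sketch of constructing one, assuming langchain-openai is installed and with hypothetical model choices; LangChain's .with_fallbacks() wraps a primary runnable with one or more backups and returns a RunnableWithFallbacks:

from langchain_openai import ChatOpenAI

primary = ChatOpenAI(model="gpt-4o")        # hypothetical primary model
fallback = ChatOpenAI(model="gpt-4o-mini")  # hypothetical backup model
llm = primary.with_fallbacks([fallback])    # a RunnableWithFallbacks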
@@ -13,12 +21,13 @@ CONSULTATION_SYSTEM_PROMPT = load_prompt_from_file("consultation_system_prompt")
 
 async def find_themes(
     responses_df: pd.DataFrame,
-    llm: …
+    llm: RunnableWithFallbacks,
     question: str,
     target_n_themes: int | None = None,
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
     verbose: bool = True,
-) -> dict[str, pd.DataFrame]:
+    concurrency: int = 10,
+) -> dict[str, str | pd.DataFrame]:
     """Process survey responses through a multi-stage theme analysis pipeline.
 
     This pipeline performs sequential analysis steps:
@@ -31,7 +40,7 @@ async def find_themes(
 
     Args:
         responses_df (pd.DataFrame): DataFrame containing survey responses
-        llm (…
+        llm (RunnableWithFallbacks): Language model instance for text analysis
         question (str): The survey question
         target_n_themes (int | None, optional): Target number of themes to consolidate to.
             If None, skip theme target alignment step. Defaults to None.
@@ -39,53 +48,69 @@ async def find_themes(
             Defaults to CONSULTATION_SYSTEM_PROMPT.
         verbose (bool): Whether to show information messages during processing.
             Defaults to True.
+        concurrency (int): Number of concurrent API calls to make. Defaults to 10.
 
     Returns:
-        dict[str, pd.DataFrame]: Dictionary containing results from each pipeline stage:
-            - question: The survey question
+        dict[str, str | pd.DataFrame]: Dictionary containing results from each pipeline stage:
+            - question: The survey question string
             - sentiment: DataFrame with sentiment analysis results
-            - …
-            - condensed_topics: DataFrame with combined similar themes
-            - refined_topics: DataFrame with refined theme definitions
+            - themes: DataFrame with the final themes output
             - mapping: DataFrame mapping responses to final themes
+            - unprocessables: DataFrame containing the inputs that could not be processed by the LLM
     """
     logger.setLevel(logging.INFO if verbose else logging.CRITICAL)
 
-    sentiment_df = await sentiment_analysis(
+    sentiment_df, sentiment_unprocessables = await sentiment_analysis(
         responses_df,
         llm,
         question=question,
         system_prompt=system_prompt,
+        concurrency=concurrency,
     )
-    theme_df = await theme_generation(
+    theme_df, _ = await theme_generation(
         sentiment_df,
         llm,
         question=question,
         system_prompt=system_prompt,
+        concurrency=concurrency,
     )
-    condensed_theme_df = await theme_condensation(
-        theme_df,
+    condensed_theme_df, _ = await theme_condensation(
+        theme_df,
+        llm,
+        question=question,
+        system_prompt=system_prompt,
+        concurrency=concurrency,
     )
-    refined_theme_df = await theme_refinement(
+    refined_theme_df, _ = await theme_refinement(
         condensed_theme_df,
         llm,
         question=question,
         system_prompt=system_prompt,
+        concurrency=concurrency,
     )
     if target_n_themes is not None:
-        refined_theme_df = await theme_target_alignment(
+        refined_theme_df, _ = await theme_target_alignment(
             refined_theme_df,
             llm,
             question=question,
             target_n_themes=target_n_themes,
             system_prompt=system_prompt,
+            concurrency=concurrency,
         )
-    mapping_df = await theme_mapping(
-        sentiment_df,
+    mapping_df, mapping_unprocessables = await theme_mapping(
+        sentiment_df[["response_id", "response"]],
         llm,
         question=question,
         refined_themes_df=refined_theme_df,
         system_prompt=system_prompt,
+        concurrency=concurrency,
+    )
+    detailed_df, _ = await detail_detection(
+        responses_df[["response_id", "response"]],
+        llm,
+        question=question,
+        system_prompt=system_prompt,
+        concurrency=concurrency,
     )
 
     logger.info("Finished finding themes")
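Every stage now returns a (processed, unprocessable) pair, and find_themes threads the concurrency setting through each call. A usage sketch, assuming an llm built as in the earlier snippet:

import asyncio

import pandas as pd
from themefinder import find_themes

responses = pd.DataFrame(
    {"response_id": [1, 2], "response": ["Too costly.", "Helpful staff."]}
)
result = asyncio.run(
    find_themes(responses, llm, question="What did you think of the service?")
)
themes_df = result["themes"]         # final refined themes
mapping_df = result["mapping"]       # responses mapped to themes
failures = result["unprocessables"]  # inputs the LLM could not process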
@@ -95,21 +120,22 @@ async def find_themes(
     return {
         "question": question,
         "sentiment": sentiment_df,
-        "themes": …
-        "condensed_themes": condensed_theme_df,
-        "refined_themes": refined_theme_df,
+        "themes": refined_theme_df,
         "mapping": mapping_df,
+        "detailed_responses": detailed_df,
+        "unprocessables": pd.concat([sentiment_unprocessables, mapping_unprocessables]),
     }
 
 
 async def sentiment_analysis(
     responses_df: pd.DataFrame,
-    llm: …
+    llm: RunnableWithFallbacks,
     question: str,
     batch_size: int = 20,
     prompt_template: str | Path | PromptTemplate = "sentiment_analysis",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
-) -> pd.DataFrame:
+    concurrency: int = 10,
+) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Perform sentiment analysis on survey responses using an LLM.
 
     This function processes survey responses in batches to analyze their sentiment
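The unprocessables entry is simply the concatenation of the per-stage failure frames. The split itself happens inside batch_and_run, whose body this diff does not expand; one plausible reading of the integrity check, with the function name and logic both assumptions, is:

import pandas as pd

def split_by_response_id(input_df: pd.DataFrame, output_df: pd.DataFrame):
    # Hypothetical: rows whose response_id survives the LLM round trip count
    # as processed; inputs with no matching output are "unprocessable".
    ok = output_df["response_id"].isin(input_df["response_id"])
    failed = ~input_df["response_id"].isin(output_df["response_id"])
    return output_df[ok], input_df[failed]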
@@ -118,7 +144,7 @@ async def sentiment_analysis(
     Args:
         responses_df (pd.DataFrame): DataFrame containing survey responses to analyze.
             Must contain 'response_id' and 'response' columns.
-        llm (…
+        llm (RunnableWithFallbacks): Language model instance to use for sentiment analysis.
         question (str): The survey question.
         batch_size (int, optional): Number of responses to process in each batch.
             Defaults to 20.
@@ -127,36 +153,43 @@ async def sentiment_analysis(
             or PromptTemplate instance. Defaults to "sentiment_analysis".
         system_prompt (str): System prompt to guide the LLM's behavior.
             Defaults to CONSULTATION_SYSTEM_PROMPT.
+        concurrency (int): Number of concurrent API calls to make. Defaults to 10.
 
     Returns:
-        pd.DataFrame: …
-        …
+        tuple[pd.DataFrame, pd.DataFrame]:
+            A tuple containing two DataFrames:
+            - The first DataFrame contains the rows that were successfully processed by the LLM
+            - The second DataFrame contains the rows that could not be processed by the LLM
 
     Note:
-        The function uses …
+        The function uses integrity_check to ensure responses maintain
         their original order and association after processing.
     """
     logger.info(f"Running sentiment analysis on {len(responses_df)} responses")
-    …
+    sentiment, unprocessable = await batch_and_run(
         responses_df,
         prompt_template,
-        llm,
+        llm.with_structured_output(SentimentAnalysisResponses),
         batch_size=batch_size,
         question=question,
-        …
+        integrity_check=True,
         system_prompt=system_prompt,
+        concurrency=concurrency,
     )
 
+    return sentiment, unprocessable
+
 
 async def theme_generation(
     responses_df: pd.DataFrame,
-    llm: …
+    llm: RunnableWithFallbacks,
     question: str,
     batch_size: int = 50,
     partition_key: str | None = "position",
     prompt_template: str | Path | PromptTemplate = "theme_generation",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
-) -> pd.DataFrame:
+    concurrency: int = 10,
+) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Generate themes from survey responses using an LLM.
 
     This function processes batches of survey responses to identify common themes or topics.
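Each stage now wraps the model with llm.with_structured_output(<schema>) before handing it to batch_and_run, so outputs are parsed into typed objects rather than free-form JSON. The schemas come from the new models.py (+351 lines, not expanded in this diff); only the class names are visible, so the field layout below is a guess:

from pydantic import BaseModel

class SentimentAnalysisResponse(BaseModel):
    # Field names are assumptions; the diff shows only the class names.
    response_id: int
    position: str

class SentimentAnalysisResponses(BaseModel):
    responses: list[SentimentAnalysisResponse]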
@@ -164,7 +197,7 @@ async def theme_generation(
     Args:
         responses_df (pd.DataFrame): DataFrame containing survey responses.
             Must include 'response_id' and 'response' columns.
-        llm (…
+        llm (RunnableWithFallbacks): Language model instance to use for theme generation.
         question (str): The survey question.
         batch_size (int, optional): Number of responses to process in each batch.
             Defaults to 50.
@@ -177,31 +210,39 @@ async def theme_generation(
             or PromptTemplate instance. Defaults to "theme_generation".
         system_prompt (str): System prompt to guide the LLM's behavior.
             Defaults to CONSULTATION_SYSTEM_PROMPT.
+        concurrency (int): Number of concurrent API calls to make. Defaults to 10.
 
     Returns:
-        pd.DataFrame: …
+        tuple[pd.DataFrame, pd.DataFrame]:
+            A tuple containing two DataFrames:
+            - The first DataFrame contains the rows that were successfully processed by the LLM
+            - The second DataFrame contains the rows that could not be processed by the LLM
+
     """
     logger.info(f"Running theme generation on {len(responses_df)} responses")
-    …
+    generated_themes, _ = await batch_and_run(
         responses_df,
         prompt_template,
-        llm,
+        llm.with_structured_output(ThemeGenerationResponses),
         batch_size=batch_size,
         partition_key=partition_key,
         question=question,
         system_prompt=system_prompt,
+        concurrency=concurrency,
     )
+    return generated_themes, _
 
 
 async def theme_condensation(
     themes_df: pd.DataFrame,
-    llm: …
+    llm: RunnableWithFallbacks,
     question: str,
     batch_size: int = 75,
     prompt_template: str | Path | PromptTemplate = "theme_condensation",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
+    concurrency: int = 10,
     **kwargs,
-) -> pd.DataFrame:
+) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Condense and combine similar themes identified from survey responses.
 
     This function processes the initially identified themes to combine similar or
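batch_and_run drives everything from a batch_size and an optional partition_key. llm_batch_processor.py changed by +277/-145 lines but is not expanded here, so the splitter below is only illustrative of the chunking it must perform:

import pandas as pd

def split_into_batches(df: pd.DataFrame, batch_size: int) -> list[pd.DataFrame]:
    # Illustrative sketch: chunk rows in order; a partition_key would group
    # rows (e.g. by sentiment "position") before chunking.
    return [df.iloc[i : i + batch_size] for i in range(0, len(df), batch_size)]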
@@ -210,7 +251,7 @@ async def theme_condensation(
     Args:
         themes_df (pd.DataFrame): DataFrame containing the initial themes identified
             from survey responses.
-        llm (…
+        llm (RunnableWithFallbacks): Language model instance to use for theme condensation.
         question (str): The survey question.
         batch_size (int, optional): Number of themes to process in each batch.
             Defaults to 100.
@@ -219,57 +260,64 @@ async def theme_condensation(
             or PromptTemplate instance. Defaults to "theme_condensation".
         system_prompt (str): System prompt to guide the LLM's behavior.
             Defaults to CONSULTATION_SYSTEM_PROMPT.
+        concurrency (int): Number of concurrent API calls to make. Defaults to 10.
 
     Returns:
-        pd.DataFrame: …
-        …
+        tuple[pd.DataFrame, pd.DataFrame]:
+            A tuple containing two DataFrames:
+            - The first DataFrame contains the rows that were successfully processed by the LLM
+            - The second DataFrame contains the rows that could not be processed by the LLM
+
     """
     logger.info(f"Running theme condensation on {len(themes_df)} themes")
-    themes_df["response_id"] = …
+    themes_df["response_id"] = themes_df.index + 1
 
     n_themes = themes_df.shape[0]
     while n_themes > batch_size:
         logger.info(
             f"{n_themes} larger than batch size, using recursive theme condensation"
         )
-        themes_df = await batch_and_run(
+        themes_df, _ = await batch_and_run(
             themes_df,
             prompt_template,
-            llm,
+            llm.with_structured_output(ThemeCondensationResponses),
             batch_size=batch_size,
             question=question,
             system_prompt=system_prompt,
+            concurrency=concurrency,
             **kwargs,
         )
         themes_df = themes_df.sample(frac=1).reset_index(drop=True)
-        themes_df["response_id"] = …
+        themes_df["response_id"] = themes_df.index + 1
         if len(themes_df) == n_themes:
             logger.info("Themes no longer being condensed")
             break
         n_themes = themes_df.shape[0]
 
-    themes_df = await batch_and_run(
+    themes_df, _ = await batch_and_run(
         themes_df,
         prompt_template,
-        llm,
+        llm.with_structured_output(ThemeCondensationResponses),
         batch_size=batch_size,
         question=question,
         system_prompt=system_prompt,
+        concurrency=concurrency,
         **kwargs,
     )
 
     logger.info(f"Final number of condensed themes: {themes_df.shape[0]}")
-    return themes_df
+    return themes_df, _
 
 
 async def theme_refinement(
     condensed_themes_df: pd.DataFrame,
-    llm: …
+    llm: RunnableWithFallbacks,
     question: str,
     batch_size: int = 10000,
     prompt_template: str | Path | PromptTemplate = "theme_refinement",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
-) -> pd.DataFrame:
+    concurrency: int = 10,
+) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Refine and standardize condensed themes using an LLM.
 
     This function processes previously condensed themes to create clear, standardized
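The new concurrency parameter caps the number of simultaneous API calls. How batch_and_run enforces the cap is not visible in this diff; the conventional asyncio pattern is a semaphore around each batch call:

import asyncio

async def run_batches(batches, call_llm, concurrency: int = 10):
    # Sketch only: the real mechanism inside batch_and_run is an assumption.
    sem = asyncio.Semaphore(concurrency)

    async def run_one(batch):
        async with sem:  # at most `concurrency` calls in flight
            return await call_llm(batch)

    return await asyncio.gather(*(run_one(b) for b in batches))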
@@ -280,21 +328,22 @@ async def theme_refinement(
     Args:
         condensed_themes (pd.DataFrame): DataFrame containing the condensed themes
             from the previous pipeline stage.
-        llm (…
+        llm (RunnableWithFallbacks): Language model instance to use for theme refinement.
         question (str): The survey question.
         batch_size (int, optional): Number of themes to process in each batch.
             Defaults to 10000.
         prompt_template (str | Path | PromptTemplate, optional): Template for structuring
             the prompt to the LLM. Can be a string identifier, path to template file,
-            or PromptTemplate instance. Defaults to "…
+            or PromptTemplate instance. Defaults to "theme_refinement".
         system_prompt (str): System prompt to guide the LLM's behavior.
             Defaults to CONSULTATION_SYSTEM_PROMPT.
+        concurrency (int): Number of concurrent API calls to make. Defaults to 10.
 
     Returns:
-        pd.DataFrame: …
-        …
-        …
-        …
+        tuple[pd.DataFrame, pd.DataFrame]:
+            A tuple containing two DataFrames:
+            - The first DataFrame contains the rows that were successfully processed by the LLM
+            - The second DataFrame contains the rows that could not be processed by the LLM
 
     Note:
         The function adds sequential response_ids to the input DataFrame and
@@ -302,35 +351,30 @@ async def theme_refinement(
         processing.
     """
     logger.info(f"Running theme refinement on {len(condensed_themes_df)} responses")
-    condensed_themes_df["response_id"] = …
-
-    def transpose_refined_themes(refined_themes: pd.DataFrame):
-        """Transpose topics for increased legibility."""
-        transposed_df = pd.DataFrame(
-            [refined_themes["topic"].to_numpy()], columns=refined_themes["topic_id"]
-        )
-        return transposed_df
+    condensed_themes_df["response_id"] = condensed_themes_df.index + 1
 
-    refined_themes = await batch_and_run(
+    refined_themes, _ = await batch_and_run(
         condensed_themes_df,
         prompt_template,
-        llm,
+        llm.with_structured_output(ThemeRefinementResponses),
         batch_size=batch_size,
         question=question,
         system_prompt=system_prompt,
+        concurrency=concurrency,
     )
-    return …
+    return refined_themes, _
 
 
 async def theme_target_alignment(
     refined_themes_df: pd.DataFrame,
-    llm: …
+    llm: RunnableWithFallbacks,
     question: str,
     target_n_themes: int = 10,
     batch_size: int = 10000,
     prompt_template: str | Path | PromptTemplate = "theme_target_alignment",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
-) -> pd.DataFrame:
+    concurrency: int = 10,
+) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Align themes to target number using an LLM.
 
     This function processes refined themes to consolidate them into a target number of
@@ -341,7 +385,7 @@ async def theme_target_alignment(
     Args:
         refined_themes_df (pd.DataFrame): DataFrame containing the refined themes
             from the previous pipeline stage.
-        llm (…
+        llm (RunnableWithFallbacks): Language model instance to use for theme alignment.
         question (str): The survey question.
         target_n_themes (int, optional): Target number of themes to consolidate to.
             Defaults to 10.
@@ -352,12 +396,13 @@ async def theme_target_alignment(
             or PromptTemplate instance. Defaults to "theme_target_alignment".
         system_prompt (str): System prompt to guide the LLM's behavior.
             Defaults to CONSULTATION_SYSTEM_PROMPT.
+        concurrency (int): Number of concurrent API calls to make. Defaults to 10.
 
     Returns:
-        pd.DataFrame: …
-        …
-        …
-        …
+        tuple[pd.DataFrame, pd.DataFrame]:
+            A tuple containing two DataFrames:
+            - The first DataFrame contains the rows that were successfully processed by the LLM
+            - The second DataFrame contains the rows that could not be processed by the LLM
 
     Note:
         The function adds sequential response_ids to the input DataFrame and
@@ -365,39 +410,32 @@ async def theme_target_alignment(
         processing.
     """
     logger.info(
-        f"Running theme target alignment on {len(refined_themes_df…
+        f"Running theme target alignment on {len(refined_themes_df)} themes compressing to {target_n_themes} themes"
     )
-    refined_themes_df = refined_themes_df.…
-
-
-    def transpose_aligned_themes(aligned_themes: pd.DataFrame):
-        """Transpose topics for increased legibility."""
-        transposed_df = pd.DataFrame(
-            [aligned_themes["topic"].to_numpy()], columns=aligned_themes["topic_id"]
-        )
-        return transposed_df
-
-    aligned_themes = await batch_and_run(
+    refined_themes_df["response_id"] = refined_themes_df.index + 1
+    aligned_themes, _ = await batch_and_run(
         refined_themes_df,
         prompt_template,
-        llm,
+        llm.with_structured_output(ThemeRefinementResponses),
         batch_size=batch_size,
         question=question,
         system_prompt=system_prompt,
         target_n_themes=target_n_themes,
+        concurrency=concurrency,
     )
-    return …
+    return aligned_themes, _
 
 
 async def theme_mapping(
     responses_df: pd.DataFrame,
-    llm: …
+    llm: RunnableWithFallbacks,
     question: str,
     refined_themes_df: pd.DataFrame,
     batch_size: int = 20,
     prompt_template: str | Path | PromptTemplate = "theme_mapping",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
-) -> pd.DataFrame:
+    concurrency: int = 10,
+) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Map survey responses to refined themes using an LLM.
 
     This function analyzes each survey response and determines which of the refined
@@ -406,7 +444,7 @@ async def theme_mapping(
     Args:
         responses_df (pd.DataFrame): DataFrame containing survey responses.
             Must include 'response_id' and 'response' columns.
-        llm (…
+        llm (RunnableWithFallbacks): Language model instance to use for theme mapping.
         question (str): The survey question.
         refined_themes_df (pd.DataFrame): Single-row DataFrame where each column
             represents a theme (from theme_refinement stage).
@@ -417,21 +455,90 @@ async def theme_mapping(
             or PromptTemplate instance. Defaults to "theme_mapping".
         system_prompt (str): System prompt to guide the LLM's behavior.
             Defaults to CONSULTATION_SYSTEM_PROMPT.
+        concurrency (int): Number of concurrent API calls to make. Defaults to 10.
 
     Returns:
-        pd.DataFrame: …
-        …
+        tuple[pd.DataFrame, pd.DataFrame]:
+            A tuple containing two DataFrames:
+            - The first DataFrame contains the rows that were successfully processed by the LLM
+            - The second DataFrame contains the rows that could not be processed by the LLM
+
     """
     logger.info(
-        f"Running theme mapping on {len(responses_df)} responses using {len(refined_themes_df…
+        f"Running theme mapping on {len(responses_df)} responses using {len(refined_themes_df)} themes"
+    )
+
+    def transpose_refined_themes(refined_themes: pd.DataFrame):
+        """Transpose topics for increased legibility."""
+        transposed_df = pd.DataFrame(
+            [refined_themes["topic"].to_numpy()], columns=refined_themes["topic_id"]
+        )
+        return transposed_df
+
+    mapping, unprocessable = await batch_and_run(
+        responses_df,
+        prompt_template,
+        llm.with_structured_output(ThemeMappingResponses),
+        batch_size=batch_size,
+        question=question,
+        refined_themes=transpose_refined_themes(refined_themes_df).to_dict(
+            orient="records"
+        ),
+        integrity_check=True,
+        system_prompt=system_prompt,
+        concurrency=concurrency,
     )
-    return …
+    return mapping, unprocessable
+
+
+async def detail_detection(
+    responses_df: pd.DataFrame,
+    llm: RunnableWithFallbacks,
+    question: str,
+    batch_size: int = 20,
+    prompt_template: str | Path | PromptTemplate = "detail_detection",
+    system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
+    concurrency: int = 10,
+) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """Identify responses that provide high-value detailed evidence.
+
+    This function processes survey responses in batches to analyze their level of detail
+    and evidence using a language model. It identifies responses that contain specific
+    examples, data, or detailed reasoning that provide strong supporting evidence.
+
+    Args:
+        responses_df (pd.DataFrame): DataFrame containing survey responses to analyze.
+            Must contain 'response_id' and 'response' columns.
+        llm (RunnableWithFallbacks): Language model instance to use for detail detection.
+        question (str): The survey question.
+        batch_size (int, optional): Number of responses to process in each batch.
+            Defaults to 20.
+        prompt_template (str | Path | PromptTemplate, optional): Template for structuring
+            the prompt to the LLM. Can be a string identifier, path to template file,
+            or PromptTemplate instance. Defaults to "detail_detection".
+        system_prompt (str): System prompt to guide the LLM's behavior.
+            Defaults to CONSULTATION_SYSTEM_PROMPT.
+        concurrency (int): Number of concurrent API calls to make. Defaults to 10.
+
+    Returns:
+        tuple[pd.DataFrame, pd.DataFrame]:
+            A tuple containing two DataFrames:
+            - The first DataFrame contains the rows that were successfully processed by the LLM
+            - The second DataFrame contains the rows that could not be processed by the LLM
+
+    Note:
+        The function uses response_id_integrity_check to ensure responses maintain
+        their original order and association after processing.
+    """
+    logger.info(f"Running detail detection on {len(responses_df)} responses")
+    detailed, _ = await batch_and_run(
         responses_df,
         prompt_template,
-        llm,
+        llm.with_structured_output(DetailDetectionResponses),
         batch_size=batch_size,
         question=question,
-        …
-        response_id_integrity_check=True,
+        integrity_check=True,
         system_prompt=system_prompt,
+        concurrency=concurrency,
     )
+    return detailed, _