themefinder 0.5.3__tar.gz → 0.6.2__tar.gz
- {themefinder-0.5.3 → themefinder-0.6.2}/PKG-INFO +3 -2
- {themefinder-0.5.3 → themefinder-0.6.2}/README.md +2 -2
- {themefinder-0.5.3 → themefinder-0.6.2}/pyproject.toml +1 -1
- {themefinder-0.5.3 → themefinder-0.6.2}/src/themefinder/core.py +86 -75
- themefinder-0.6.2/src/themefinder/llm_batch_processor.py +490 -0
- themefinder-0.6.2/src/themefinder/models.py +138 -0
- {themefinder-0.5.3 → themefinder-0.6.2}/src/themefinder/prompts/sentiment_analysis.txt +8 -5
- themefinder-0.6.2/src/themefinder/prompts/theme_condensation.txt +50 -0
- {themefinder-0.5.3 → themefinder-0.6.2}/src/themefinder/prompts/theme_mapping.txt +3 -3
- {themefinder-0.5.3 → themefinder-0.6.2}/src/themefinder/prompts/theme_refinement.txt +15 -28
- themefinder-0.5.3/src/themefinder/llm_batch_processor.py +0 -310
- themefinder-0.5.3/src/themefinder/prompts/theme_condensation.txt +0 -37
- {themefinder-0.5.3 → themefinder-0.6.2}/LICENCE +0 -0
- {themefinder-0.5.3 → themefinder-0.6.2}/src/themefinder/__init__.py +2 -2
- {themefinder-0.5.3 → themefinder-0.6.2}/src/themefinder/prompts/consultation_system_prompt.txt +0 -0
- {themefinder-0.5.3 → themefinder-0.6.2}/src/themefinder/prompts/theme_generation.txt +0 -0
- {themefinder-0.5.3 → themefinder-0.6.2}/src/themefinder/prompts/theme_target_alignment.txt +0 -0
- {themefinder-0.5.3 → themefinder-0.6.2}/src/themefinder/themefinder_logging.py +0 -0
{themefinder-0.5.3 → themefinder-0.6.2}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: themefinder
-Version: 0.5.3
+Version: 0.6.2
 Summary: A topic modelling Python package designed for analysing one-to-many question-answer data eg free-text survey responses.
 License: MIT
 Author: i.AI
@@ -100,7 +100,7 @@ system_prompt = "You are an AI evaluation tool analyzing survey responses about
 # Run the function to find themes
 # We use asyncio to query LLM endpoints asynchronously, so we need to await our function
 async def main():
-    result = await find_themes(responses_df, llm, question, system_prompt)
+    result = await find_themes(responses_df, llm, question, system_prompt=system_prompt)
     print(result)

 if __name__ == "__main__":
@@ -155,3 +155,4 @@ The documentation is [© Crown copyright](https://www.nationalarchives.gov.uk/in
 ## Feedback

 If you have feedback on this package, please fill in our [feedback form](https://forms.gle/85xUSMvxGzSSKQ499) or contact us with questions or feedback at packages@cabinetoffice.gov.uk.
+
{themefinder-0.5.3 → themefinder-0.6.2}/README.md

@@ -69,7 +69,7 @@ system_prompt = "You are an AI evaluation tool analyzing survey responses about
 # Run the function to find themes
 # We use asyncio to query LLM endpoints asynchronously, so we need to await our function
 async def main():
-    result = await find_themes(responses_df, llm, question, system_prompt)
+    result = await find_themes(responses_df, llm, question, system_prompt=system_prompt)
     print(result)

 if __name__ == "__main__":
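Note on the change above: in 0.6.2 `find_themes` takes `target_n_themes` ahead of `system_prompt` (see the core.py hunks below), so passing the system prompt positionally would bind it to the wrong parameter; the README now passes it by keyword. A minimal end-to-end sketch of the new call shape, with an illustrative model choice and made-up data (neither is taken from this diff):

    import asyncio

    import pandas as pd
    from langchain_openai import ChatOpenAI  # any LangChain Runnable should work as `llm`

    from themefinder import find_themes

    # Hypothetical survey data; the pipeline expects response_id and response columns
    responses_df = pd.DataFrame(
        {"response_id": [1, 2], "response": ["More cycle lanes", "Cheaper bus fares"]}
    )
    question = "How should we improve local transport?"
    system_prompt = "You are an AI evaluation tool analyzing survey responses about transport."
    llm = ChatOpenAI(model="gpt-4o")

    async def main():
        # system_prompt is keyword-passed so it is not mistaken for target_n_themes
        result = await find_themes(responses_df, llm, question, system_prompt=system_prompt)
        print(result["themes"])

    asyncio.run(main())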
@@ -123,4 +123,4 @@ The documentation is [© Crown copyright](https://www.nationalarchives.gov.uk/in

 ## Feedback

-If you have feedback on this package, please fill in our [feedback form](https://forms.gle/85xUSMvxGzSSKQ499) or contact us with questions or feedback at packages@cabinetoffice.gov.uk.
+If you have feedback on this package, please fill in our [feedback form](https://forms.gle/85xUSMvxGzSSKQ499) or contact us with questions or feedback at packages@cabinetoffice.gov.uk.
{themefinder-0.5.3 → themefinder-0.6.2}/pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "themefinder"
-version = "0.5.3"
+version = "0.6.2"
 description = "A topic modelling Python package designed for analysing one-to-many question-answer data eg free-text survey responses."
 authors = ["i.AI <packages@cabinetoffice.gov.uk>"]
 packages = [{include = "themefinder", from = "src"}]
{themefinder-0.5.3 → themefinder-0.6.2}/src/themefinder/core.py

@@ -6,6 +6,7 @@ from langchain_core.prompts import PromptTemplate
 from langchain_core.runnables import Runnable

 from .llm_batch_processor import batch_and_run, load_prompt_from_file
+from .models import SentimentAnalysisOutput, ThemeMappingOutput
 from .themefinder_logging import logger

 CONSULTATION_SYSTEM_PROMPT = load_prompt_from_file("consultation_system_prompt")
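The `models` module imported here is new in 0.6.2 (listed above with 138 added lines) but its body is not part of this diff. Since `SentimentAnalysisOutput` and `ThemeMappingOutput` are passed below as `task_validation_model` alongside `validation_check=True`, they are presumably per-row output schemas. A purely hypothetical sketch of the kind of model this implies; the field names are guesses, not the package's actual definitions:

    from pydantic import BaseModel

    class SentimentAnalysisOutput(BaseModel):
        # Hypothetical fields: response_id is implied by the pipeline's row tracking,
        # and a "position" column by theme_generation's partition_key="position" default.
        response_id: int
        position: str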
@@ -18,7 +19,7 @@ async def find_themes(
     target_n_themes: int | None = None,
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
     verbose: bool = True,
-) -> dict[str, pd.DataFrame]:
+) -> dict[str, str | pd.DataFrame]:
     """Process survey responses through a multi-stage theme analysis pipeline.

     This pipeline performs sequential analysis steps:
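Because the return type widens to `dict[str, str | pd.DataFrame]` and the per-stage keys are consolidated (see the hunks below), callers need a small migration: `condensed_themes` and `refined_themes` are gone, the final themes live under `themes`, and a new `unprocessables` key collects the rows the LLM failed on. A hedged sketch of updated caller code, to be run inside an async function:

    result = await find_themes(responses_df, llm, question, system_prompt=system_prompt)

    themes_df = result["themes"]          # previously result["refined_themes"]
    mapping_df = result["mapping"]
    failed_df = result["unprocessables"]  # new: concatenated unprocessable rows

    if not failed_df.empty:
        print(f"{len(failed_df)} responses could not be processed by the LLM")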
@@ -41,47 +42,46 @@ async def find_themes(
             Defaults to True.

     Returns:
-        dict[str, pd.DataFrame]: Dictionary containing results from each pipeline stage:
-            - question: The survey question
+        dict[str, str | pd.DataFrame]: Dictionary containing results from each pipeline stage:
+            - question: The survey question string
             - sentiment: DataFrame with sentiment analysis results
-            -
-            - condensed_topics: DataFrame with combined similar themes
-            - refined_topics: DataFrame with refined theme definitions
+            - themes: DataFrame with the final themes output
             - mapping: DataFrame mapping responses to final themes
+            - unprocessables: Dataframe containing the inputs that could not be processed by the LLM
     """
     logger.setLevel(logging.INFO if verbose else logging.CRITICAL)

-    sentiment_df = await sentiment_analysis(
+    sentiment_df, sentiment_unprocessables = await sentiment_analysis(
         responses_df,
         llm,
         question=question,
         system_prompt=system_prompt,
     )
-    theme_df = await theme_generation(
+    theme_df, _ = await theme_generation(
         sentiment_df,
         llm,
         question=question,
         system_prompt=system_prompt,
     )
-    condensed_theme_df = await theme_condensation(
+    condensed_theme_df, _ = await theme_condensation(
         theme_df, llm, question=question, system_prompt=system_prompt
     )
-    refined_theme_df = await theme_refinement(
+    refined_theme_df, _ = await theme_refinement(
         condensed_theme_df,
         llm,
         question=question,
         system_prompt=system_prompt,
     )
     if target_n_themes is not None:
-        refined_theme_df = await theme_target_alignment(
+        refined_theme_df, _ = await theme_target_alignment(
             refined_theme_df,
             llm,
             question=question,
             target_n_themes=target_n_themes,
             system_prompt=system_prompt,
         )
-    mapping_df = await theme_mapping(
-        sentiment_df,
+    mapping_df, mapping_unprocessables = await theme_mapping(
+        sentiment_df[["response_id", "response"]],
         llm,
         question=question,
         refined_themes_df=refined_theme_df,
@@ -95,10 +95,9 @@ async def find_themes(
     return {
         "question": question,
         "sentiment": sentiment_df,
-        "themes":
-        "condensed_themes": condensed_theme_df,
-        "refined_themes": refined_theme_df,
+        "themes": refined_theme_df,
         "mapping": mapping_df,
+        "unprocessables": pd.concat([sentiment_unprocessables, mapping_unprocessables]),
     }

@@ -109,7 +108,7 @@ async def sentiment_analysis(
     batch_size: int = 20,
     prompt_template: str | Path | PromptTemplate = "sentiment_analysis",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
-) -> pd.DataFrame:
+) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Perform sentiment analysis on survey responses using an LLM.

     This function processes survey responses in batches to analyze their sentiment
@@ -129,24 +128,29 @@ async def sentiment_analysis(
             Defaults to CONSULTATION_SYSTEM_PROMPT.

     Returns:
-        pd.DataFrame
-
+        tuple[pd.DataFrame, pd.DataFrame]:
+            A tuple containing two DataFrames:
+            - The first DataFrame contains the rows that were successfully processed by the LLM
+            - The second DataFrame contains the rows that could not be processed by the LLM

     Note:
-        The function uses
+        The function uses validation_check to ensure responses maintain
         their original order and association after processing.
     """
     logger.info(f"Running sentiment analysis on {len(responses_df)} responses")
-
+    processed_rows, unprocessable_rows = await batch_and_run(
         responses_df,
         prompt_template,
         llm,
         batch_size=batch_size,
         question=question,
-
+        validation_check=True,
+        task_validation_model=SentimentAnalysisOutput,
         system_prompt=system_prompt,
     )

+    return processed_rows, unprocessable_rows
+

 async def theme_generation(
     responses_df: pd.DataFrame,
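Calling a stage directly now means unpacking a `(processed, unprocessable)` pair of DataFrames rather than receiving a single one. For example, with the call shape inferred from the new signature (inside an async context):

    sentiment_df, failed_df = await sentiment_analysis(
        responses_df, llm, question=question, system_prompt=system_prompt
    )
    print(f"{len(sentiment_df)} rows processed, {len(failed_df)} unprocessable")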
@@ -156,7 +160,7 @@ async def theme_generation(
     partition_key: str | None = "position",
     prompt_template: str | Path | PromptTemplate = "theme_generation",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
-) -> pd.DataFrame:
+) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Generate themes from survey responses using an LLM.

     This function processes batches of survey responses to identify common themes or topics.
@@ -179,10 +183,14 @@ async def theme_generation(
             Defaults to CONSULTATION_SYSTEM_PROMPT.

     Returns:
-        pd.DataFrame
+        tuple[pd.DataFrame, pd.DataFrame]:
+            A tuple containing two DataFrames:
+            - The first DataFrame contains the rows that were successfully processed by the LLM
+            - The second DataFrame contains the rows that could not be processed by the LLM
+
     """
     logger.info(f"Running theme generation on {len(responses_df)} responses")
-
+    generated_themes, _ = await batch_and_run(
         responses_df,
         prompt_template,
         llm,
@@ -191,17 +199,18 @@ async def theme_generation(
         question=question,
         system_prompt=system_prompt,
     )
+    return generated_themes, _


 async def theme_condensation(
     themes_df: pd.DataFrame,
     llm: Runnable,
     question: str,
-    batch_size: int =
+    batch_size: int = 75,
     prompt_template: str | Path | PromptTemplate = "theme_condensation",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
     **kwargs,
-) -> pd.DataFrame:
+) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Condense and combine similar themes identified from survey responses.

     This function processes the initially identified themes to combine similar or
@@ -221,18 +230,21 @@ async def theme_condensation(
             Defaults to CONSULTATION_SYSTEM_PROMPT.

     Returns:
-        pd.DataFrame
-
+        tuple[pd.DataFrame, pd.DataFrame]:
+            A tuple containing two DataFrames:
+            - The first DataFrame contains the rows that were successfully processed by the LLM
+            - The second DataFrame contains the rows that could not be processed by the LLM
+
     """
     logger.info(f"Running theme condensation on {len(themes_df)} themes")
-    themes_df["response_id"] =
+    themes_df["response_id"] = themes_df.index + 1

     n_themes = themes_df.shape[0]
     while n_themes > batch_size:
         logger.info(
             f"{n_themes} larger than batch size, using recursive theme condensation"
         )
-        themes_df = await batch_and_run(
+        themes_df, _ = await batch_and_run(
             themes_df,
             prompt_template,
             llm,
@@ -242,13 +254,13 @@ async def theme_condensation(
             **kwargs,
         )
         themes_df = themes_df.sample(frac=1).reset_index(drop=True)
-        themes_df["response_id"] =
+        themes_df["response_id"] = themes_df.index + 1
         if len(themes_df) == n_themes:
             logger.info("Themes no longer being condensed")
             break
         n_themes = themes_df.shape[0]

-    themes_df = await batch_and_run(
+    themes_df, _ = await batch_and_run(
         themes_df,
         prompt_template,
         llm,
@@ -259,7 +271,7 @@ async def theme_condensation(
     )

     logger.info(f"Final number of condensed themes: {themes_df.shape[0]}")
-    return themes_df
+    return themes_df, _


 async def theme_refinement(
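The condensation hunks above are easiest to read as an algorithm: with the default `batch_size` now 75, a theme set larger than one batch is condensed, reshuffled, renumbered, and condensed again until either the count stops shrinking or it fits in a single batch, after which one final condensation pass runs. A simplified restatement of that control flow, where `condense_once` is a hypothetical stand-in for the awaited `batch_and_run` call:

    n_themes = len(themes_df)
    while n_themes > batch_size:  # default batch_size is now 75
        themes_df = condense_once(themes_df)  # stand-in for batch_and_run(...)
        themes_df = themes_df.sample(frac=1).reset_index(drop=True)  # reshuffle between passes
        themes_df["response_id"] = themes_df.index + 1  # renumber rows 1..n
        if len(themes_df) == n_themes:  # nothing merged this pass; stop recursing
            break
        n_themes = len(themes_df)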
@@ -269,7 +281,7 @@ async def theme_refinement(
     batch_size: int = 10000,
     prompt_template: str | Path | PromptTemplate = "theme_refinement",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
-) -> pd.DataFrame:
+) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Refine and standardize condensed themes using an LLM.

     This function processes previously condensed themes to create clear, standardized
@@ -286,15 +298,15 @@ async def theme_refinement(
             Defaults to 10000.
         prompt_template (str | Path | PromptTemplate, optional): Template for structuring
             the prompt to the LLM. Can be a string identifier, path to template file,
-            or PromptTemplate instance. Defaults to "
+            or PromptTemplate instance. Defaults to "theme_refinement".
         system_prompt (str): System prompt to guide the LLM's behavior.
             Defaults to CONSULTATION_SYSTEM_PROMPT.

     Returns:
-        pd.DataFrame
-
-
-
+        tuple[pd.DataFrame, pd.DataFrame]:
+            A tuple containing two DataFrames:
+            - The first DataFrame contains the rows that were successfully processed by the LLM
+            - The second DataFrame contains the rows that could not be processed by the LLM

     Note:
         The function adds sequential response_ids to the input DataFrame and
@@ -302,16 +314,9 @@ async def theme_refinement(
         processing.
     """
    logger.info(f"Running theme refinement on {len(condensed_themes_df)} responses")
-    condensed_themes_df["response_id"] =
+    condensed_themes_df["response_id"] = condensed_themes_df.index + 1

-
-        """Transpose topics for increased legibility."""
-        transposed_df = pd.DataFrame(
-            [refined_themes["topic"].to_numpy()], columns=refined_themes["topic_id"]
-        )
-        return transposed_df
-
-    refined_themes = await batch_and_run(
+    refined_themes, _ = await batch_and_run(
         condensed_themes_df,
         prompt_template,
         llm,
@@ -319,7 +324,7 @@ async def theme_refinement(
         question=question,
         system_prompt=system_prompt,
     )
-    return
+    return refined_themes, _


 async def theme_target_alignment(
@@ -330,7 +335,7 @@ async def theme_target_alignment(
     batch_size: int = 10000,
     prompt_template: str | Path | PromptTemplate = "theme_target_alignment",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
-) -> pd.DataFrame:
+) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Align themes to target number using an LLM.

     This function processes refined themes to consolidate them into a target number of
@@ -354,10 +359,10 @@ async def theme_target_alignment(
             Defaults to CONSULTATION_SYSTEM_PROMPT.

     Returns:
-        pd.DataFrame
-
-
-
+        tuple[pd.DataFrame, pd.DataFrame]:
+            A tuple containing two DataFrames:
+            - The first DataFrame contains the rows that were successfully processed by the LLM
+            - The second DataFrame contains the rows that could not be processed by the LLM

     Note:
         The function adds sequential response_ids to the input DataFrame and
@@ -365,19 +370,10 @@ async def theme_target_alignment(
         processing.
     """
     logger.info(
-        f"Running theme target alignment on {len(refined_themes_df
+        f"Running theme target alignment on {len(refined_themes_df)} themes compressing to {target_n_themes} themes"
     )
-    refined_themes_df = refined_themes_df.
-
-
-    def transpose_aligned_themes(aligned_themes: pd.DataFrame):
-        """Transpose topics for increased legibility."""
-        transposed_df = pd.DataFrame(
-            [aligned_themes["topic"].to_numpy()], columns=aligned_themes["topic_id"]
-        )
-        return transposed_df
-
-    aligned_themes = await batch_and_run(
+    refined_themes_df["response_id"] = refined_themes_df.index + 1
+    aligned_themes, _ = await batch_and_run(
         refined_themes_df,
         prompt_template,
         llm,
@@ -386,7 +382,7 @@ async def theme_target_alignment(
         system_prompt=system_prompt,
         target_n_themes=target_n_themes,
     )
-    return
+    return aligned_themes, _


 async def theme_mapping(
@@ -397,7 +393,7 @@ async def theme_mapping(
     batch_size: int = 20,
     prompt_template: str | Path | PromptTemplate = "theme_mapping",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
-) -> pd.DataFrame:
+) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Map survey responses to refined themes using an LLM.

     This function analyzes each survey response and determines which of the refined
@@ -419,19 +415,34 @@ async def theme_mapping(
             Defaults to CONSULTATION_SYSTEM_PROMPT.

     Returns:
-        pd.DataFrame
-
+        tuple[pd.DataFrame, pd.DataFrame]:
+            A tuple containing two DataFrames:
+            - The first DataFrame contains the rows that were successfully processed by the LLM
+            - The second DataFrame contains the rows that could not be processed by the LLM
+
     """
     logger.info(
-        f"Running theme mapping on {len(responses_df)} responses using {len(refined_themes_df
+        f"Running theme mapping on {len(responses_df)} responses using {len(refined_themes_df)} themes"
     )
-
+
+    def transpose_refined_themes(refined_themes: pd.DataFrame):
+        """Transpose topics for increased legibility."""
+        transposed_df = pd.DataFrame(
+            [refined_themes["topic"].to_numpy()], columns=refined_themes["topic_id"]
+        )
+        return transposed_df
+
+    mapping, _ = await batch_and_run(
         responses_df,
         prompt_template,
         llm,
         batch_size=batch_size,
         question=question,
-        refined_themes=refined_themes_df.to_dict(
-
+        refined_themes=transpose_refined_themes(refined_themes_df).to_dict(
+            orient="records"
+        ),
+        validation_check=True,
+        task_validation_model=ThemeMappingOutput,
         system_prompt=system_prompt,
     )
+    return mapping, _
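The `transpose_refined_themes` helper now lives in `theme_mapping` (0.5.3 defined equivalent helpers inside `theme_refinement` and `theme_target_alignment`, removed above). It pivots the themes into a single row keyed by `topic_id`, which is the shape the mapping prompt receives via `to_dict(orient="records")`. A toy illustration with made-up data:

    import pandas as pd

    refined = pd.DataFrame(
        {"topic_id": ["A", "B"], "topic": ["Cycling infrastructure", "Bus affordability"]}
    )
    transposed = pd.DataFrame([refined["topic"].to_numpy()], columns=refined["topic_id"])
    # One row, one column per topic_id:
    # to_dict(orient="records") -> [{"A": "Cycling infrastructure", "B": "Bus affordability"}]
    print(transposed.to_dict(orient="records"))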