themefinder 0.5.4__py3-none-any.whl → 0.6.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of themefinder might be problematic. Click here for more details.

themefinder/__init__.py CHANGED
@@ -1,10 +1,12 @@
1
1
  from .core import (
2
2
  find_themes,
3
3
  sentiment_analysis,
4
- theme_generation,
5
4
  theme_condensation,
6
- theme_refinement,
5
+ theme_generation,
7
6
  theme_mapping,
7
+ theme_refinement,
8
+ theme_target_alignment,
9
+ detail_detection,
8
10
  )
9
11
 
10
12
  __all__ = [
@@ -13,6 +15,8 @@ __all__ = [
13
15
  "theme_generation",
14
16
  "theme_condensation",
15
17
  "theme_refinement",
18
+ "theme_target_alignment",
16
19
  "theme_mapping",
20
+ "detail_detection",
17
21
  ]
18
22
  __version__ = "0.1.0"
themefinder/core.py CHANGED
@@ -3,9 +3,17 @@ from pathlib import Path
3
3
 
4
4
  import pandas as pd
5
5
  from langchain_core.prompts import PromptTemplate
6
- from langchain_core.runnables import Runnable
6
+ from langchain.schema.runnable import RunnableWithFallbacks
7
7
 
8
8
  from .llm_batch_processor import batch_and_run, load_prompt_from_file
9
+ from .models import (
10
+ SentimentAnalysisResponses,
11
+ ThemeGenerationResponses,
12
+ ThemeCondensationResponses,
13
+ ThemeRefinementResponses,
14
+ ThemeMappingResponses,
15
+ DetailDetectionResponses,
16
+ )
9
17
  from .themefinder_logging import logger
10
18
 
11
19
  CONSULTATION_SYSTEM_PROMPT = load_prompt_from_file("consultation_system_prompt")
@@ -13,12 +21,13 @@ CONSULTATION_SYSTEM_PROMPT = load_prompt_from_file("consultation_system_prompt")
13
21
 
14
22
  async def find_themes(
15
23
  responses_df: pd.DataFrame,
16
- llm: Runnable,
24
+ llm: RunnableWithFallbacks,
17
25
  question: str,
18
26
  target_n_themes: int | None = None,
19
27
  system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
20
28
  verbose: bool = True,
21
- ) -> dict[str, pd.DataFrame]:
29
+ concurrency: int = 10,
30
+ ) -> dict[str, str | pd.DataFrame]:
22
31
  """Process survey responses through a multi-stage theme analysis pipeline.
23
32
 
24
33
  This pipeline performs sequential analysis steps:
@@ -31,7 +40,7 @@ async def find_themes(
31
40
 
32
41
  Args:
33
42
  responses_df (pd.DataFrame): DataFrame containing survey responses
34
- llm (Runnable): Language model instance for text analysis
43
+ llm (RunnableWithFallbacks): Language model instance for text analysis
35
44
  question (str): The survey question
36
45
  target_n_themes (int | None, optional): Target number of themes to consolidate to.
37
46
  If None, skip theme target alignment step. Defaults to None.
@@ -39,53 +48,69 @@ async def find_themes(
39
48
  Defaults to CONSULTATION_SYSTEM_PROMPT.
40
49
  verbose (bool): Whether to show information messages during processing.
41
50
  Defaults to True.
51
+ concurrency (int): Number of concurrent API calls to make. Defaults to 10.
42
52
 
43
53
  Returns:
44
- dict[str, pd.DataFrame]: Dictionary containing results from each pipeline stage:
45
- - question: The survey question
54
+ dict[str, str | pd.DataFrame]: Dictionary containing results from each pipeline stage:
55
+ - question: The survey question string
46
56
  - sentiment: DataFrame with sentiment analysis results
47
- - topics: DataFrame with initial generated themes
48
- - condensed_topics: DataFrame with combined similar themes
49
- - refined_topics: DataFrame with refined theme definitions
57
+ - themes: DataFrame with the final themes output
50
58
  - mapping: DataFrame mapping responses to final themes
59
+ - unprocessables: DataFrame containing the inputs that could not be processed by the LLM
51
60
  """
52
61
  logger.setLevel(logging.INFO if verbose else logging.CRITICAL)
53
62
 
54
- sentiment_df = await sentiment_analysis(
63
+ sentiment_df, sentiment_unprocessables = await sentiment_analysis(
55
64
  responses_df,
56
65
  llm,
57
66
  question=question,
58
67
  system_prompt=system_prompt,
68
+ concurrency=concurrency,
59
69
  )
60
- theme_df = await theme_generation(
70
+ theme_df, _ = await theme_generation(
61
71
  sentiment_df,
62
72
  llm,
63
73
  question=question,
64
74
  system_prompt=system_prompt,
75
+ concurrency=concurrency,
65
76
  )
66
- condensed_theme_df = await theme_condensation(
67
- theme_df, llm, question=question, system_prompt=system_prompt
77
+ condensed_theme_df, _ = await theme_condensation(
78
+ theme_df,
79
+ llm,
80
+ question=question,
81
+ system_prompt=system_prompt,
82
+ concurrency=concurrency,
68
83
  )
69
- refined_theme_df = await theme_refinement(
84
+ refined_theme_df, _ = await theme_refinement(
70
85
  condensed_theme_df,
71
86
  llm,
72
87
  question=question,
73
88
  system_prompt=system_prompt,
89
+ concurrency=concurrency,
74
90
  )
75
91
  if target_n_themes is not None:
76
- refined_theme_df = await theme_target_alignment(
92
+ refined_theme_df, _ = await theme_target_alignment(
77
93
  refined_theme_df,
78
94
  llm,
79
95
  question=question,
80
96
  target_n_themes=target_n_themes,
81
97
  system_prompt=system_prompt,
98
+ concurrency=concurrency,
82
99
  )
83
- mapping_df = await theme_mapping(
84
- sentiment_df,
100
+ mapping_df, mapping_unprocessables = await theme_mapping(
101
+ sentiment_df[["response_id", "response"]],
85
102
  llm,
86
103
  question=question,
87
104
  refined_themes_df=refined_theme_df,
88
105
  system_prompt=system_prompt,
106
+ concurrency=concurrency,
107
+ )
108
+ detailed_df, _ = await detail_detection(
109
+ responses_df[["response_id", "response"]],
110
+ llm,
111
+ question=question,
112
+ system_prompt=system_prompt,
113
+ concurrency=concurrency,
89
114
  )
90
115
 
91
116
  logger.info("Finished finding themes")
@@ -95,21 +120,22 @@ async def find_themes(
95
120
  return {
96
121
  "question": question,
97
122
  "sentiment": sentiment_df,
98
- "themes": theme_df,
99
- "condensed_themes": condensed_theme_df,
100
- "refined_themes": refined_theme_df,
123
+ "themes": refined_theme_df,
101
124
  "mapping": mapping_df,
125
+ "detailed_responses": detailed_df,
126
+ "unprocessables": pd.concat([sentiment_unprocessables, mapping_unprocessables]),
102
127
  }
103
128
 
104
129
 
105
130
  async def sentiment_analysis(
106
131
  responses_df: pd.DataFrame,
107
- llm: Runnable,
132
+ llm: RunnableWithFallbacks,
108
133
  question: str,
109
134
  batch_size: int = 20,
110
135
  prompt_template: str | Path | PromptTemplate = "sentiment_analysis",
111
136
  system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
112
- ) -> pd.DataFrame:
137
+ concurrency: int = 10,
138
+ ) -> tuple[pd.DataFrame, pd.DataFrame]:
113
139
  """Perform sentiment analysis on survey responses using an LLM.
114
140
 
115
141
  This function processes survey responses in batches to analyze their sentiment
@@ -118,7 +144,7 @@ async def sentiment_analysis(
118
144
  Args:
119
145
  responses_df (pd.DataFrame): DataFrame containing survey responses to analyze.
120
146
  Must contain 'response_id' and 'response' columns.
121
- llm (Runnable): Language model instance to use for sentiment analysis.
147
+ llm (RunnableWithFallbacks): Language model instance to use for sentiment analysis.
122
148
  question (str): The survey question.
123
149
  batch_size (int, optional): Number of responses to process in each batch.
124
150
  Defaults to 20.
@@ -127,36 +153,43 @@ async def sentiment_analysis(
127
153
  or PromptTemplate instance. Defaults to "sentiment_analysis".
128
154
  system_prompt (str): System prompt to guide the LLM's behavior.
129
155
  Defaults to CONSULTATION_SYSTEM_PROMPT.
156
+ concurrency (int): Number of concurrent API calls to make. Defaults to 10.
130
157
 
131
158
  Returns:
132
- pd.DataFrame: DataFrame containing the original responses enriched with
133
- sentiment analysis results.
159
+ tuple[pd.DataFrame, pd.DataFrame]:
160
+ A tuple containing two DataFrames:
161
+ - The first DataFrame contains the rows that were successfully processed by the LLM
162
+ - The second DataFrame contains the rows that could not be processed by the LLM
134
163
 
135
164
  Note:
136
- The function uses response_id_integrity_check to ensure responses maintain
165
+ The function uses integrity_check to ensure responses maintain
137
166
  their original order and association after processing.
138
167
  """
139
168
  logger.info(f"Running sentiment analysis on {len(responses_df)} responses")
140
- return await batch_and_run(
169
+ sentiment, unprocessable = await batch_and_run(
141
170
  responses_df,
142
171
  prompt_template,
143
- llm,
172
+ llm.with_structured_output(SentimentAnalysisResponses),
144
173
  batch_size=batch_size,
145
174
  question=question,
146
- response_id_integrity_check=True,
175
+ integrity_check=True,
147
176
  system_prompt=system_prompt,
177
+ concurrency=concurrency,
148
178
  )
149
179
 
180
+ return sentiment, unprocessable
181
+
150
182
 
151
183
  async def theme_generation(
152
184
  responses_df: pd.DataFrame,
153
- llm: Runnable,
185
+ llm: RunnableWithFallbacks,
154
186
  question: str,
155
187
  batch_size: int = 50,
156
188
  partition_key: str | None = "position",
157
189
  prompt_template: str | Path | PromptTemplate = "theme_generation",
158
190
  system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
159
- ) -> pd.DataFrame:
191
+ concurrency: int = 10,
192
+ ) -> tuple[pd.DataFrame, pd.DataFrame]:
160
193
  """Generate themes from survey responses using an LLM.
161
194
 
162
195
  This function processes batches of survey responses to identify common themes or topics.
@@ -164,7 +197,7 @@ async def theme_generation(
164
197
  Args:
165
198
  responses_df (pd.DataFrame): DataFrame containing survey responses.
166
199
  Must include 'response_id' and 'response' columns.
167
- llm (Runnable): Language model instance to use for theme generation.
200
+ llm (RunnableWithFallbacks): Language model instance to use for theme generation.
168
201
  question (str): The survey question.
169
202
  batch_size (int, optional): Number of responses to process in each batch.
170
203
  Defaults to 50.
@@ -177,31 +210,39 @@ async def theme_generation(
177
210
  or PromptTemplate instance. Defaults to "theme_generation".
178
211
  system_prompt (str): System prompt to guide the LLM's behavior.
179
212
  Defaults to CONSULTATION_SYSTEM_PROMPT.
213
+ concurrency (int): Number of concurrent API calls to make. Defaults to 10.
180
214
 
181
215
  Returns:
182
- pd.DataFrame: DataFrame containing identified themes and their associated metadata.
216
+ tuple[pd.DataFrame, pd.DataFrame]:
217
+ A tuple containing two DataFrames:
218
+ - The first DataFrame contains the rows that were successfully processed by the LLM
219
+ - The second DataFrame contains the rows that could not be processed by the LLM
220
+
183
221
  """
184
222
  logger.info(f"Running theme generation on {len(responses_df)} responses")
185
- return await batch_and_run(
223
+ generated_themes, _ = await batch_and_run(
186
224
  responses_df,
187
225
  prompt_template,
188
- llm,
226
+ llm.with_structured_output(ThemeGenerationResponses),
189
227
  batch_size=batch_size,
190
228
  partition_key=partition_key,
191
229
  question=question,
192
230
  system_prompt=system_prompt,
231
+ concurrency=concurrency,
193
232
  )
233
+ return generated_themes, _
194
234
 
195
235
 
196
236
  async def theme_condensation(
197
237
  themes_df: pd.DataFrame,
198
- llm: Runnable,
238
+ llm: RunnableWithFallbacks,
199
239
  question: str,
200
240
  batch_size: int = 75,
201
241
  prompt_template: str | Path | PromptTemplate = "theme_condensation",
202
242
  system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
243
+ concurrency: int = 10,
203
244
  **kwargs,
204
- ) -> pd.DataFrame:
245
+ ) -> tuple[pd.DataFrame, pd.DataFrame]:
205
246
  """Condense and combine similar themes identified from survey responses.
206
247
 
207
248
  This function processes the initially identified themes to combine similar or
@@ -210,7 +251,7 @@ async def theme_condensation(
210
251
  Args:
211
252
  themes_df (pd.DataFrame): DataFrame containing the initial themes identified
212
253
  from survey responses.
213
- llm (Runnable): Language model instance to use for theme condensation.
254
+ llm (RunnableWithFallbacks): Language model instance to use for theme condensation.
214
255
  question (str): The survey question.
215
256
  batch_size (int, optional): Number of themes to process in each batch.
216
257
  Defaults to 75.
@@ -219,57 +260,64 @@ async def theme_condensation(
219
260
  or PromptTemplate instance. Defaults to "theme_condensation".
220
261
  system_prompt (str): System prompt to guide the LLM's behavior.
221
262
  Defaults to CONSULTATION_SYSTEM_PROMPT.
263
+ concurrency (int): Number of concurrent API calls to make. Defaults to 10.
222
264
 
223
265
  Returns:
224
- pd.DataFrame: DataFrame containing the condensed themes, where similar topics
225
- have been combined into broader categories.
266
+ tuple[pd.DataFrame, pd.DataFrame]:
267
+ A tuple containing two DataFrames:
268
+ - The first DataFrame contains the rows that were successfully processed by the LLM
269
+ - The second DataFrame contains the rows that could not be processed by the LLM
270
+
226
271
  """
227
272
  logger.info(f"Running theme condensation on {len(themes_df)} themes")
228
- themes_df["response_id"] = range(len(themes_df))
273
+ themes_df["response_id"] = themes_df.index + 1
229
274
 
230
275
  n_themes = themes_df.shape[0]
231
276
  while n_themes > batch_size:
232
277
  logger.info(
233
278
  f"{n_themes} larger than batch size, using recursive theme condensation"
234
279
  )
235
- themes_df = await batch_and_run(
280
+ themes_df, _ = await batch_and_run(
236
281
  themes_df,
237
282
  prompt_template,
238
- llm,
283
+ llm.with_structured_output(ThemeCondensationResponses),
239
284
  batch_size=batch_size,
240
285
  question=question,
241
286
  system_prompt=system_prompt,
287
+ concurrency=concurrency,
242
288
  **kwargs,
243
289
  )
244
290
  themes_df = themes_df.sample(frac=1).reset_index(drop=True)
245
- themes_df["response_id"] = range(len(themes_df))
291
+ themes_df["response_id"] = themes_df.index + 1
246
292
  if len(themes_df) == n_themes:
247
293
  logger.info("Themes no longer being condensed")
248
294
  break
249
295
  n_themes = themes_df.shape[0]
250
296
 
251
- themes_df = await batch_and_run(
297
+ themes_df, _ = await batch_and_run(
252
298
  themes_df,
253
299
  prompt_template,
254
- llm,
300
+ llm.with_structured_output(ThemeCondensationResponses),
255
301
  batch_size=batch_size,
256
302
  question=question,
257
303
  system_prompt=system_prompt,
304
+ concurrency=concurrency,
258
305
  **kwargs,
259
306
  )
260
307
 
261
308
  logger.info(f"Final number of condensed themes: {themes_df.shape[0]}")
262
- return themes_df
309
+ return themes_df, _
263
310
 
264
311
 
265
312
  async def theme_refinement(
266
313
  condensed_themes_df: pd.DataFrame,
267
- llm: Runnable,
314
+ llm: RunnableWithFallbacks,
268
315
  question: str,
269
316
  batch_size: int = 10000,
270
317
  prompt_template: str | Path | PromptTemplate = "theme_refinement",
271
318
  system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
272
- ) -> pd.DataFrame:
319
+ concurrency: int = 10,
320
+ ) -> tuple[pd.DataFrame, pd.DataFrame]:
273
321
  """Refine and standardize condensed themes using an LLM.
274
322
 
275
323
  This function processes previously condensed themes to create clear, standardized
@@ -280,21 +328,22 @@ async def theme_refinement(
280
328
  Args:
281
329
  condensed_themes_df (pd.DataFrame): DataFrame containing the condensed themes
282
330
  from the previous pipeline stage.
283
- llm (Runnable): Language model instance to use for theme refinement.
331
+ llm (RunnableWithFallbacks): Language model instance to use for theme refinement.
284
332
  question (str): The survey question.
285
333
  batch_size (int, optional): Number of themes to process in each batch.
286
334
  Defaults to 10000.
287
335
  prompt_template (str | Path | PromptTemplate, optional): Template for structuring
288
336
  the prompt to the LLM. Can be a string identifier, path to template file,
289
- or PromptTemplate instance. Defaults to "topic_refinement".
337
+ or PromptTemplate instance. Defaults to "theme_refinement".
290
338
  system_prompt (str): System prompt to guide the LLM's behavior.
291
339
  Defaults to CONSULTATION_SYSTEM_PROMPT.
340
+ concurrency (int): Number of concurrent API calls to make. Defaults to 10.
292
341
 
293
342
  Returns:
294
- pd.DataFrame: A single-row DataFrame where:
295
- - Each column represents a unique theme (identified by topic_id)
296
- - The values contain the refined theme descriptions
297
- - The format is optimized for subsequent theme mapping operations
343
+ tuple[pd.DataFrame, pd.DataFrame]:
344
+ A tuple containing two DataFrames:
345
+ - The first DataFrame contains the rows that were successfully processed by the LLM
346
+ - The second DataFrame contains the rows that could not be processed by the LLM
298
347
 
299
348
  Note:
300
349
  The function adds sequential response_ids to the input DataFrame and
@@ -302,35 +351,30 @@ async def theme_refinement(
302
351
  processing.
303
352
  """
304
353
  logger.info(f"Running theme refinement on {len(condensed_themes_df)} responses")
305
- condensed_themes_df["response_id"] = range(len(condensed_themes_df))
306
-
307
- def transpose_refined_themes(refined_themes: pd.DataFrame):
308
- """Transpose topics for increased legibility."""
309
- transposed_df = pd.DataFrame(
310
- [refined_themes["topic"].to_numpy()], columns=refined_themes["topic_id"]
311
- )
312
- return transposed_df
354
+ condensed_themes_df["response_id"] = condensed_themes_df.index + 1
313
355
 
314
- refined_themes = await batch_and_run(
356
+ refined_themes, _ = await batch_and_run(
315
357
  condensed_themes_df,
316
358
  prompt_template,
317
- llm,
359
+ llm.with_structured_output(ThemeRefinementResponses),
318
360
  batch_size=batch_size,
319
361
  question=question,
320
362
  system_prompt=system_prompt,
363
+ concurrency=concurrency,
321
364
  )
322
- return transpose_refined_themes(refined_themes)
365
+ return refined_themes, _
323
366
 
324
367
 
325
368
  async def theme_target_alignment(
326
369
  refined_themes_df: pd.DataFrame,
327
- llm: Runnable,
370
+ llm: RunnableWithFallbacks,
328
371
  question: str,
329
372
  target_n_themes: int = 10,
330
373
  batch_size: int = 10000,
331
374
  prompt_template: str | Path | PromptTemplate = "theme_target_alignment",
332
375
  system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
333
- ) -> pd.DataFrame:
376
+ concurrency: int = 10,
377
+ ) -> tuple[pd.DataFrame, pd.DataFrame]:
334
378
  """Align themes to target number using an LLM.
335
379
 
336
380
  This function processes refined themes to consolidate them into a target number of
@@ -341,7 +385,7 @@ async def theme_target_alignment(
341
385
  Args:
342
386
  refined_themes_df (pd.DataFrame): DataFrame containing the refined themes
343
387
  from the previous pipeline stage.
344
- llm (Runnable): Language model instance to use for theme alignment.
388
+ llm (RunnableWithFallbacks): Language model instance to use for theme alignment.
345
389
  question (str): The survey question.
346
390
  target_n_themes (int, optional): Target number of themes to consolidate to.
347
391
  Defaults to 10.
@@ -352,12 +396,13 @@ async def theme_target_alignment(
352
396
  or PromptTemplate instance. Defaults to "theme_target_alignment".
353
397
  system_prompt (str): System prompt to guide the LLM's behavior.
354
398
  Defaults to CONSULTATION_SYSTEM_PROMPT.
399
+ concurrency (int): Number of concurrent API calls to make. Defaults to 10.
355
400
 
356
401
  Returns:
357
- pd.DataFrame: A single-row DataFrame where:
358
- - Each column represents a unique theme (identified by topic_id)
359
- - The values contain the aligned theme descriptions
360
- - The format is optimized for subsequent theme mapping operations
402
+ tuple[pd.DataFrame, pd.DataFrame]:
403
+ A tuple containing two DataFrames:
404
+ - The first DataFrame contains the rows that were successfully processed by the LLM
405
+ - The second DataFrame contains the rows that could not be processed by the LLM
361
406
 
362
407
  Note:
363
408
  The function adds sequential response_ids to the input DataFrame and
@@ -365,39 +410,32 @@ async def theme_target_alignment(
365
410
  processing.
366
411
  """
367
412
  logger.info(
368
- f"Running theme target alignment on {len(refined_themes_df.columns)} themes compressing to {target_n_themes} themes"
413
+ f"Running theme target alignment on {len(refined_themes_df)} themes compressing to {target_n_themes} themes"
369
414
  )
370
- refined_themes_df = refined_themes_df.T.rename(columns={0: "topic"})
371
- refined_themes_df["response_id"] = range(len(refined_themes_df))
372
-
373
- def transpose_aligned_themes(aligned_themes: pd.DataFrame):
374
- """Transpose topics for increased legibility."""
375
- transposed_df = pd.DataFrame(
376
- [aligned_themes["topic"].to_numpy()], columns=aligned_themes["topic_id"]
377
- )
378
- return transposed_df
379
-
380
- aligned_themes = await batch_and_run(
415
+ refined_themes_df["response_id"] = refined_themes_df.index + 1
416
+ aligned_themes, _ = await batch_and_run(
381
417
  refined_themes_df,
382
418
  prompt_template,
383
- llm,
419
+ llm.with_structured_output(ThemeRefinementResponses),
384
420
  batch_size=batch_size,
385
421
  question=question,
386
422
  system_prompt=system_prompt,
387
423
  target_n_themes=target_n_themes,
424
+ concurrency=concurrency,
388
425
  )
389
- return transpose_aligned_themes(aligned_themes)
426
+ return aligned_themes, _
390
427
 
391
428
 
392
429
  async def theme_mapping(
393
430
  responses_df: pd.DataFrame,
394
- llm: Runnable,
431
+ llm: RunnableWithFallbacks,
395
432
  question: str,
396
433
  refined_themes_df: pd.DataFrame,
397
434
  batch_size: int = 20,
398
435
  prompt_template: str | Path | PromptTemplate = "theme_mapping",
399
436
  system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
400
- ) -> pd.DataFrame:
437
+ concurrency: int = 10,
438
+ ) -> tuple[pd.DataFrame, pd.DataFrame]:
401
439
  """Map survey responses to refined themes using an LLM.
402
440
 
403
441
  This function analyzes each survey response and determines which of the refined
@@ -406,7 +444,7 @@ async def theme_mapping(
406
444
  Args:
407
445
  responses_df (pd.DataFrame): DataFrame containing survey responses.
408
446
  Must include 'response_id' and 'response' columns.
409
- llm (Runnable): Language model instance to use for theme mapping.
447
+ llm (RunnableWithFallbacks): Language model instance to use for theme mapping.
410
448
  question (str): The survey question.
411
449
  refined_themes_df (pd.DataFrame): DataFrame of refined themes with 'topic_id'
412
450
  and 'topic' columns, one row per theme (from theme_refinement stage).
@@ -417,21 +455,90 @@ async def theme_mapping(
417
455
  or PromptTemplate instance. Defaults to "theme_mapping".
418
456
  system_prompt (str): System prompt to guide the LLM's behavior.
419
457
  Defaults to CONSULTATION_SYSTEM_PROMPT.
458
+ concurrency (int): Number of concurrent API calls to make. Defaults to 10.
420
459
 
421
460
  Returns:
422
- pd.DataFrame: DataFrame containing the original responses enriched with
423
- theme mapping results, ensuring all responses are mapped through ID integrity checks.
461
+ tuple[pd.DataFrame, pd.DataFrame]:
462
+ A tuple containing two DataFrames:
463
+ - The first DataFrame contains the rows that were successfully processed by the LLM
464
+ - The second DataFrame contains the rows that could not be processed by the LLM
465
+
424
466
  """
425
467
  logger.info(
426
- f"Running theme mapping on {len(responses_df)} responses using {len(refined_themes_df.columns)} themes"
468
+ f"Running theme mapping on {len(responses_df)} responses using {len(refined_themes_df)} themes"
469
+ )
470
+
471
+ def transpose_refined_themes(refined_themes: pd.DataFrame):
472
+ """Transpose topics for increased legibility."""
473
+ transposed_df = pd.DataFrame(
474
+ [refined_themes["topic"].to_numpy()], columns=refined_themes["topic_id"]
475
+ )
476
+ return transposed_df
477
+
478
+ mapping, unprocessable = await batch_and_run(
479
+ responses_df,
480
+ prompt_template,
481
+ llm.with_structured_output(ThemeMappingResponses),
482
+ batch_size=batch_size,
483
+ question=question,
484
+ refined_themes=transpose_refined_themes(refined_themes_df).to_dict(
485
+ orient="records"
486
+ ),
487
+ integrity_check=True,
488
+ system_prompt=system_prompt,
489
+ concurrency=concurrency,
427
490
  )
428
- return await batch_and_run(
491
+ return mapping, unprocessable
492
+
493
+
494
+ async def detail_detection(
495
+ responses_df: pd.DataFrame,
496
+ llm: RunnableWithFallbacks,
497
+ question: str,
498
+ batch_size: int = 20,
499
+ prompt_template: str | Path | PromptTemplate = "detail_detection",
500
+ system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
501
+ concurrency: int = 10,
502
+ ) -> tuple[pd.DataFrame, pd.DataFrame]:
503
+ """Identify responses that provide high-value detailed evidence.
504
+
505
+ This function processes survey responses in batches to analyze their level of detail
506
+ and evidence using a language model. It identifies responses that contain specific
507
+ examples, data, or detailed reasoning that provide strong supporting evidence.
508
+
509
+ Args:
510
+ responses_df (pd.DataFrame): DataFrame containing survey responses to analyze.
511
+ Must contain 'response_id' and 'response' columns.
512
+ llm (RunnableWithFallbacks): Language model instance to use for detail detection.
513
+ question (str): The survey question.
514
+ batch_size (int, optional): Number of responses to process in each batch.
515
+ Defaults to 20.
516
+ prompt_template (str | Path | PromptTemplate, optional): Template for structuring
517
+ the prompt to the LLM. Can be a string identifier, path to template file,
518
+ or PromptTemplate instance. Defaults to "detail_detection".
519
+ system_prompt (str): System prompt to guide the LLM's behavior.
520
+ Defaults to CONSULTATION_SYSTEM_PROMPT.
521
+ concurrency (int): Number of concurrent API calls to make. Defaults to 10.
522
+
523
+ Returns:
524
+ tuple[pd.DataFrame, pd.DataFrame]:
525
+ A tuple containing two DataFrames:
526
+ - The first DataFrame contains the rows that were successfully processed by the LLM
527
+ - The second DataFrame contains the rows that could not be processed by the LLM
528
+
529
+ Note:
530
+ The function uses integrity_check to ensure responses maintain
531
+ their original order and association after processing.
532
+ """
533
+ logger.info(f"Running detail detection on {len(responses_df)} responses")
534
+ detailed, _ = await batch_and_run(
429
535
  responses_df,
430
536
  prompt_template,
431
- llm,
537
+ llm.with_structured_output(DetailDetectionResponses),
432
538
  batch_size=batch_size,
433
539
  question=question,
434
- refined_themes=refined_themes_df.to_dict(orient="records"),
435
- response_id_integrity_check=True,
540
+ integrity_check=True,
436
541
  system_prompt=system_prompt,
542
+ concurrency=concurrency,
437
543
  )
544
+ return detailed, _