themefinder 0.6.2__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

themefinder/__init__.py CHANGED
@@ -1,18 +1,24 @@
 from .core import (
     find_themes,
     sentiment_analysis,
+    theme_clustering,
     theme_condensation,
     theme_generation,
     theme_mapping,
     theme_refinement,
+    theme_target_alignment,
+    detail_detection,
 )
 
 __all__ = [
     "find_themes",
     "sentiment_analysis",
-    "theme_generation",
+    "theme_clustering",
     "theme_condensation",
-    "theme_refinement",
+    "theme_generation",
     "theme_mapping",
+    "theme_refinement",
+    "theme_target_alignment",
+    "detail_detection",
 ]
 __version__ = "0.1.0"
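The public surface grows accordingly. A minimal sketch of importing the 0.7.0 exports (names taken from the new `__all__`; the install itself is assumed):

    # Assumes `pip install themefinder==0.7.0`; names mirror the new __all__.
    from themefinder import (
        find_themes,
        sentiment_analysis,
        theme_clustering,        # new in 0.7.0
        theme_condensation,
        theme_generation,
        theme_mapping,
        theme_refinement,
        theme_target_alignment,  # newly exported
        detail_detection,        # new in 0.7.0
    )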
themefinder/core.py CHANGED
@@ -3,22 +3,33 @@ from pathlib import Path
 
 import pandas as pd
 from langchain_core.prompts import PromptTemplate
-from langchain_core.runnables import Runnable
-
-from .llm_batch_processor import batch_and_run, load_prompt_from_file
-from .models import SentimentAnalysisOutput, ThemeMappingOutput
-from .themefinder_logging import logger
+from langchain.schema.runnable import RunnableWithFallbacks
+
+from themefinder.llm_batch_processor import batch_and_run, load_prompt_from_file
+from themefinder.models import (
+    SentimentAnalysisResponses,
+    ThemeGenerationResponses,
+    ThemeCondensationResponses,
+    ThemeRefinementResponses,
+    ThemeMappingResponses,
+    DetailDetectionResponses,
+    HierarchicalClusteringResponse,
+    ThemeNode,
+)
+from themefinder.theme_clustering_agent import ThemeClusteringAgent
+from themefinder.themefinder_logging import logger
 
 CONSULTATION_SYSTEM_PROMPT = load_prompt_from_file("consultation_system_prompt")
 
 
 async def find_themes(
     responses_df: pd.DataFrame,
-    llm: Runnable,
+    llm: RunnableWithFallbacks,
     question: str,
     target_n_themes: int | None = None,
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
     verbose: bool = True,
+    concurrency: int = 10,
 ) -> dict[str, str | pd.DataFrame]:
     """Process survey responses through a multi-stage theme analysis pipeline.
 
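Note the type change running through this file: every stage now takes a RunnableWithFallbacks instead of a bare Runnable. A minimal sketch of building one with LangChain's standard with_fallbacks helper; the chat models and model names here are illustrative assumptions, not part of themefinder:

    from langchain_openai import ChatOpenAI

    # .with_fallbacks() wraps any Runnable in a RunnableWithFallbacks that
    # retries the listed runnables in order if the primary call fails.
    primary = ChatOpenAI(model="gpt-4o", temperature=0)        # placeholder model
    fallback = ChatOpenAI(model="gpt-4o-mini", temperature=0)  # placeholder model
    llm = primary.with_fallbacks([fallback])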
@@ -32,7 +43,7 @@ async def find_themes(
 
     Args:
         responses_df (pd.DataFrame): DataFrame containing survey responses
-        llm (Runnable): Language model instance for text analysis
+        llm (RunnableWithFallbacks): Language model instance for text analysis
         question (str): The survey question
         target_n_themes (int | None, optional): Target number of themes to consolidate to.
             If None, skip theme target alignment step. Defaults to None.
@@ -40,6 +51,7 @@ async def find_themes(
             Defaults to CONSULTATION_SYSTEM_PROMPT.
         verbose (bool): Whether to show information messages during processing.
             Defaults to True.
+        concurrency (int): Number of concurrent API calls to make. Defaults to 10.
 
     Returns:
         dict[str, str | pd.DataFrame]: Dictionary containing results from each pipeline stage:
@@ -56,21 +68,28 @@ async def find_themes(
         llm,
         question=question,
         system_prompt=system_prompt,
+        concurrency=concurrency,
     )
     theme_df, _ = await theme_generation(
         sentiment_df,
         llm,
         question=question,
         system_prompt=system_prompt,
+        concurrency=concurrency,
     )
     condensed_theme_df, _ = await theme_condensation(
-        theme_df, llm, question=question, system_prompt=system_prompt
+        theme_df,
+        llm,
+        question=question,
+        system_prompt=system_prompt,
+        concurrency=concurrency,
     )
     refined_theme_df, _ = await theme_refinement(
         condensed_theme_df,
         llm,
         question=question,
         system_prompt=system_prompt,
+        concurrency=concurrency,
     )
     if target_n_themes is not None:
         refined_theme_df, _ = await theme_target_alignment(
@@ -79,6 +98,7 @@ async def find_themes(
             question=question,
             target_n_themes=target_n_themes,
             system_prompt=system_prompt,
+            concurrency=concurrency,
         )
     mapping_df, mapping_unprocessables = await theme_mapping(
         sentiment_df[["response_id", "response"]],
@@ -86,28 +106,36 @@ async def find_themes(
         question=question,
         refined_themes_df=refined_theme_df,
         system_prompt=system_prompt,
+        concurrency=concurrency,
+    )
+    detailed_df, _ = await detail_detection(
+        responses_df[["response_id", "response"]],
+        llm,
+        question=question,
+        system_prompt=system_prompt,
+        concurrency=concurrency,
     )
 
     logger.info("Finished finding themes")
-    logger.info(
-        "Provide feedback or report bugs: https://forms.gle/85xUSMvxGzSSKQ499 or packages@cabinetoffice.gov.uk"
-    )
+    logger.info("Provide feedback or report bugs: packages@cabinetoffice.gov.uk")
     return {
         "question": question,
         "sentiment": sentiment_df,
         "themes": refined_theme_df,
         "mapping": mapping_df,
+        "detailed_responses": detailed_df,
         "unprocessables": pd.concat([sentiment_unprocessables, mapping_unprocessables]),
     }
 
 
 async def sentiment_analysis(
     responses_df: pd.DataFrame,
-    llm: Runnable,
+    llm: RunnableWithFallbacks,
     question: str,
     batch_size: int = 20,
     prompt_template: str | Path | PromptTemplate = "sentiment_analysis",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
+    concurrency: int = 10,
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Perform sentiment analysis on survey responses using an LLM.
 
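With the wiring above, find_themes fans each stage out over `concurrency` simultaneous calls and adds a "detailed_responses" DataFrame to its result. A hedged sketch of running the pipeline end to end, reusing the `llm` sketched earlier; the survey data is invented:

    import asyncio

    import pandas as pd
    from themefinder import find_themes

    responses = pd.DataFrame(
        {
            "response_id": [1, 2],
            "response": ["More segregated cycle lanes.", "Cheaper off-peak fares."],
        }
    )
    results = asyncio.run(
        find_themes(responses, llm, question="How should we improve transport?",
                    concurrency=5)
    )
    # Keys per the return dict above, including the new one:
    print(results["themes"])
    print(results["detailed_responses"])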
@@ -117,7 +145,7 @@ async def sentiment_analysis(
     Args:
         responses_df (pd.DataFrame): DataFrame containing survey responses to analyze.
             Must contain 'response_id' and 'response' columns.
-        llm (Runnable): Language model instance to use for sentiment analysis.
+        llm (RunnableWithFallbacks): Language model instance to use for sentiment analysis.
         question (str): The survey question.
         batch_size (int, optional): Number of responses to process in each batch.
             Defaults to 20.
@@ -126,6 +154,7 @@ async def sentiment_analysis(
             or PromptTemplate instance. Defaults to "sentiment_analysis".
         system_prompt (str): System prompt to guide the LLM's behavior.
             Defaults to CONSULTATION_SYSTEM_PROMPT.
+        concurrency (int): Number of concurrent API calls to make. Defaults to 10.
 
     Returns:
         tuple[pd.DataFrame, pd.DataFrame]:
@@ -134,32 +163,33 @@ async def sentiment_analysis(
             - The second DataFrame contains the rows that could not be processed by the LLM
 
     Note:
-        The function uses validation_check to ensure responses maintain
+        The function uses integrity_check to ensure responses maintain
         their original order and association after processing.
     """
     logger.info(f"Running sentiment analysis on {len(responses_df)} responses")
-    processed_rows, unprocessable_rows = await batch_and_run(
+    sentiment, unprocessable = await batch_and_run(
         responses_df,
         prompt_template,
-        llm,
+        llm.with_structured_output(SentimentAnalysisResponses),
         batch_size=batch_size,
         question=question,
-        validation_check=True,
-        task_validation_model=SentimentAnalysisOutput,
+        integrity_check=True,
        system_prompt=system_prompt,
+        concurrency=concurrency,
     )
 
-    return processed_rows, unprocessable_rows
+    return sentiment, unprocessable
 
 
 async def theme_generation(
     responses_df: pd.DataFrame,
-    llm: Runnable,
+    llm: RunnableWithFallbacks,
     question: str,
     batch_size: int = 50,
     partition_key: str | None = "position",
     prompt_template: str | Path | PromptTemplate = "theme_generation",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
+    concurrency: int = 10,
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Generate themes from survey responses using an LLM.
 
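sentiment_analysis now binds its response schema via with_structured_output internally, so callers still pass the plain `llm`. A sketch of a standalone call, under the same assumed `responses` and `llm` as above:

    import asyncio
    from themefinder import sentiment_analysis

    sentiment_df, unprocessable_df = asyncio.run(
        sentiment_analysis(responses, llm,
                           question="How should we improve transport?",
                           batch_size=20, concurrency=10)
    )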
@@ -168,7 +198,7 @@ async def theme_generation(
     Args:
         responses_df (pd.DataFrame): DataFrame containing survey responses.
             Must include 'response_id' and 'response' columns.
-        llm (Runnable): Language model instance to use for theme generation.
+        llm (RunnableWithFallbacks): Language model instance to use for theme generation.
         question (str): The survey question.
         batch_size (int, optional): Number of responses to process in each batch.
             Defaults to 50.
@@ -181,6 +211,7 @@ async def theme_generation(
             or PromptTemplate instance. Defaults to "theme_generation".
         system_prompt (str): System prompt to guide the LLM's behavior.
             Defaults to CONSULTATION_SYSTEM_PROMPT.
+        concurrency (int): Number of concurrent API calls to make. Defaults to 10.
 
     Returns:
         tuple[pd.DataFrame, pd.DataFrame]:
@@ -193,22 +224,24 @@ async def theme_generation(
     generated_themes, _ = await batch_and_run(
         responses_df,
         prompt_template,
-        llm,
+        llm.with_structured_output(ThemeGenerationResponses),
         batch_size=batch_size,
         partition_key=partition_key,
         question=question,
         system_prompt=system_prompt,
+        concurrency=concurrency,
     )
     return generated_themes, _
 
 
 async def theme_condensation(
     themes_df: pd.DataFrame,
-    llm: Runnable,
+    llm: RunnableWithFallbacks,
     question: str,
     batch_size: int = 75,
     prompt_template: str | Path | PromptTemplate = "theme_condensation",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
+    concurrency: int = 10,
     **kwargs,
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Condense and combine similar themes identified from survey responses.
@@ -219,7 +252,7 @@ async def theme_condensation(
     Args:
         themes_df (pd.DataFrame): DataFrame containing the initial themes identified
             from survey responses.
-        llm (Runnable): Language model instance to use for theme condensation.
+        llm (RunnableWithFallbacks): Language model instance to use for theme condensation.
         question (str): The survey question.
         batch_size (int, optional): Number of themes to process in each batch.
             Defaults to 100.
@@ -228,6 +261,7 @@ async def theme_condensation(
             or PromptTemplate instance. Defaults to "theme_condensation".
         system_prompt (str): System prompt to guide the LLM's behavior.
             Defaults to CONSULTATION_SYSTEM_PROMPT.
+        concurrency (int): Number of concurrent API calls to make. Defaults to 10.
 
     Returns:
         tuple[pd.DataFrame, pd.DataFrame]:
@@ -247,10 +281,11 @@ async def theme_condensation(
         themes_df, _ = await batch_and_run(
             themes_df,
             prompt_template,
-            llm,
+            llm.with_structured_output(ThemeCondensationResponses),
             batch_size=batch_size,
             question=question,
             system_prompt=system_prompt,
+            concurrency=concurrency,
             **kwargs,
         )
         themes_df = themes_df.sample(frac=1).reset_index(drop=True)
@@ -263,10 +298,11 @@ async def theme_condensation(
     themes_df, _ = await batch_and_run(
         themes_df,
         prompt_template,
-        llm,
+        llm.with_structured_output(ThemeCondensationResponses),
         batch_size=batch_size,
         question=question,
         system_prompt=system_prompt,
+        concurrency=concurrency,
         **kwargs,
     )
 
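Both condensation passes now bind ThemeCondensationResponses and forward `concurrency`; the `sample(frac=1)` shuffle between passes reorders themes so that repeated batches mix different neighbours. A sketch of invoking the stage directly, with the objects sketched earlier:

    import asyncio
    from themefinder import theme_condensation

    condensed_df, _ = asyncio.run(
        theme_condensation(theme_df, llm,
                           question="How should we improve transport?",
                           batch_size=75, concurrency=10)
    )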
@@ -274,13 +310,95 @@ async def theme_condensation(
     return themes_df, _
 
 
+def theme_clustering(
+    themes_df: pd.DataFrame,
+    llm: RunnableWithFallbacks,
+    max_iterations: int = 5,
+    target_themes: int = 10,
+    significance_percentage: float = 10.0,
+    return_all_themes: bool = False,
+) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """Perform hierarchical clustering of themes using an agentic approach.
+
+    This function takes a DataFrame of themes and uses the ThemeClusteringAgent
+    to iteratively merge similar themes into a hierarchical structure, then
+    selects the most significant themes based on a threshold.
+
+    Args:
+        themes_df (pd.DataFrame): DataFrame containing themes with columns:
+            - topic_id: Unique identifier for each theme
+            - topic_label: Short descriptive label for the theme
+            - topic_description: Detailed description of the theme
+            - source_topic_count: Number of source responses for this theme
+        llm (RunnableWithFallbacks): Language model instance configured with
+            structured output for HierarchicalClusteringResponse
+        max_iterations (int, optional): Maximum number of clustering iterations.
+            Defaults to 5.
+        target_themes (int, optional): Target number of themes to cluster down to.
+            Defaults to 10.
+        significance_percentage (float, optional): Percentage threshold for
+            selecting significant themes. Defaults to 10.0.
+        return_all_themes (bool, optional): If True, returns all clustered themes.
+            If False, returns only significant themes. Defaults to False.
+
+    Returns:
+        tuple[pd.DataFrame, pd.DataFrame]:
+            A tuple containing:
+            - DataFrame of clustered themes (all or significant based on return_all_themes)
+            - Empty DataFrame (for consistency with other functions)
+    """
+    logger.info(f"Starting hierarchical clustering of {len(themes_df)} themes")
+
+    # Convert DataFrame to ThemeNode objects
+    initial_themes = [
+        ThemeNode(
+            topic_id=row["topic_id"],
+            topic_label=row["topic_label"],
+            topic_description=row["topic_description"],
+            source_topic_count=row["source_topic_count"],
+        )
+        for _, row in themes_df.iterrows()
+    ]
+
+    # Initialize clustering agent with structured output LLM
+    agent = ThemeClusteringAgent(
+        llm.with_structured_output(HierarchicalClusteringResponse), initial_themes
+    )
+
+    # Perform clustering
+    logger.info(
+        f"Clustering themes with max_iterations={max_iterations}, target_themes={target_themes}"
+    )
+    all_themes_df = agent.cluster_themes(
+        max_iterations=max_iterations, target_themes=target_themes
+    )
+
+    # Return appropriate themes based on parameter
+    if return_all_themes:
+        logger.info(
+            f"Clustering complete: returning all {len(all_themes_df)} clustered themes"
+        )
+        return all_themes_df, pd.DataFrame()
+    else:
+        # Select significant themes
+        logger.info(
+            f"Selecting themes with significance_percentage={significance_percentage}%"
+        )
+        selected_themes_df = agent.select_themes(significance_percentage)
+        logger.info(
+            f"Clustering complete: returning {len(selected_themes_df)} significant themes"
+        )
+        return selected_themes_df, pd.DataFrame()
+
+
 async def theme_refinement(
     condensed_themes_df: pd.DataFrame,
-    llm: Runnable,
+    llm: RunnableWithFallbacks,
     question: str,
     batch_size: int = 10000,
     prompt_template: str | Path | PromptTemplate = "theme_refinement",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
+    concurrency: int = 10,
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Refine and standardize condensed themes using an LLM.
 
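Unlike the other stages, theme_clustering is synchronous and skips batch_and_run entirely, driving the new ThemeClusteringAgent instead. A sketch with fabricated themes, reusing the `llm` from earlier; the four columns are the ones its docstring requires:

    import pandas as pd
    from themefinder import theme_clustering

    themes = pd.DataFrame(
        {
            "topic_id": ["A", "B", "C"],
            "topic_label": ["Cycle safety", "Fares", "Bike parking"],
            "topic_description": [
                "Calls for protected cycle lanes",
                "Complaints about ticket prices",
                "Requests for secure bike storage",
            ],
            "source_topic_count": [12, 8, 3],
        }
    )
    selected_df, _ = theme_clustering(
        themes, llm, max_iterations=5, target_themes=2,
        significance_percentage=10.0,
    )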
292
410
  Args:
293
411
  condensed_themes (pd.DataFrame): DataFrame containing the condensed themes
294
412
  from the previous pipeline stage.
295
- llm (Runnable): Language model instance to use for theme refinement.
413
+ llm (RunnableWithFallbacks): Language model instance to use for theme refinement.
296
414
  question (str): The survey question.
297
415
  batch_size (int, optional): Number of themes to process in each batch.
298
416
  Defaults to 10000.
@@ -301,6 +419,7 @@ async def theme_refinement(
             or PromptTemplate instance. Defaults to "theme_refinement".
         system_prompt (str): System prompt to guide the LLM's behavior.
             Defaults to CONSULTATION_SYSTEM_PROMPT.
+        concurrency (int): Number of concurrent API calls to make. Defaults to 10.
 
     Returns:
         tuple[pd.DataFrame, pd.DataFrame]:
@@ -319,22 +438,24 @@ async def theme_refinement(
     refined_themes, _ = await batch_and_run(
         condensed_themes_df,
         prompt_template,
-        llm,
+        llm.with_structured_output(ThemeRefinementResponses),
         batch_size=batch_size,
         question=question,
         system_prompt=system_prompt,
+        concurrency=concurrency,
     )
     return refined_themes, _
 
 
 async def theme_target_alignment(
     refined_themes_df: pd.DataFrame,
-    llm: Runnable,
+    llm: RunnableWithFallbacks,
     question: str,
     target_n_themes: int = 10,
     batch_size: int = 10000,
     prompt_template: str | Path | PromptTemplate = "theme_target_alignment",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
+    concurrency: int = 10,
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Align themes to target number using an LLM.
 
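With its 10000-row default batch_size, theme_refinement presumably makes a single structured call over all condensed themes for typical inputs, so `concurrency` only matters for very large theme sets. A sketch:

    import asyncio
    from themefinder import theme_refinement

    refined_df, _ = asyncio.run(
        theme_refinement(condensed_df, llm,
                         question="How should we improve transport?")
    )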
@@ -346,7 +467,7 @@ async def theme_target_alignment(
     Args:
         refined_themes_df (pd.DataFrame): DataFrame containing the refined themes
             from the previous pipeline stage.
-        llm (Runnable): Language model instance to use for theme alignment.
+        llm (RunnableWithFallbacks): Language model instance to use for theme alignment.
         question (str): The survey question.
         target_n_themes (int, optional): Target number of themes to consolidate to.
             Defaults to 10.
@@ -357,6 +478,7 @@ async def theme_target_alignment(
             or PromptTemplate instance. Defaults to "theme_target_alignment".
         system_prompt (str): System prompt to guide the LLM's behavior.
             Defaults to CONSULTATION_SYSTEM_PROMPT.
+        concurrency (int): Number of concurrent API calls to make. Defaults to 10.
 
     Returns:
         tuple[pd.DataFrame, pd.DataFrame]:
@@ -376,23 +498,25 @@ async def theme_target_alignment(
     aligned_themes, _ = await batch_and_run(
         refined_themes_df,
         prompt_template,
-        llm,
+        llm.with_structured_output(ThemeRefinementResponses),
         batch_size=batch_size,
         question=question,
         system_prompt=system_prompt,
         target_n_themes=target_n_themes,
+        concurrency=concurrency,
     )
     return aligned_themes, _
 
 
 async def theme_mapping(
     responses_df: pd.DataFrame,
-    llm: Runnable,
+    llm: RunnableWithFallbacks,
     question: str,
     refined_themes_df: pd.DataFrame,
     batch_size: int = 20,
     prompt_template: str | Path | PromptTemplate = "theme_mapping",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
+    concurrency: int = 10,
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Map survey responses to refined themes using an LLM.
 
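theme_target_alignment is only invoked inside find_themes when target_n_themes is set, but it can be driven directly; note it reuses ThemeRefinementResponses as its output schema. A sketch:

    import asyncio
    from themefinder import theme_target_alignment

    aligned_df, _ = asyncio.run(
        theme_target_alignment(refined_df, llm,
                               question="How should we improve transport?",
                               target_n_themes=10)
    )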
@@ -402,7 +526,7 @@ async def theme_mapping(
     Args:
         responses_df (pd.DataFrame): DataFrame containing survey responses.
             Must include 'response_id' and 'response' columns.
-        llm (Runnable): Language model instance to use for theme mapping.
+        llm (RunnableWithFallbacks): Language model instance to use for theme mapping.
         question (str): The survey question.
         refined_themes_df (pd.DataFrame): Single-row DataFrame where each column
             represents a theme (from theme_refinement stage).
@@ -413,6 +537,7 @@ async def theme_mapping(
             or PromptTemplate instance. Defaults to "theme_mapping".
         system_prompt (str): System prompt to guide the LLM's behavior.
             Defaults to CONSULTATION_SYSTEM_PROMPT.
+        concurrency (int): Number of concurrent API calls to make. Defaults to 10.
 
     Returns:
         tuple[pd.DataFrame, pd.DataFrame]:
@@ -432,17 +557,70 @@ async def theme_mapping(
         )
         return transposed_df
 
-    mapping, _ = await batch_and_run(
+    mapping, unprocessable = await batch_and_run(
         responses_df,
         prompt_template,
-        llm,
+        llm.with_structured_output(ThemeMappingResponses),
         batch_size=batch_size,
         question=question,
         refined_themes=transpose_refined_themes(refined_themes_df).to_dict(
             orient="records"
         ),
-        validation_check=True,
-        task_validation_model=ThemeMappingOutput,
+        integrity_check=True,
+        system_prompt=system_prompt,
+        concurrency=concurrency,
+    )
+    return mapping, unprocessable
+
+
+async def detail_detection(
+    responses_df: pd.DataFrame,
+    llm: RunnableWithFallbacks,
+    question: str,
+    batch_size: int = 20,
+    prompt_template: str | Path | PromptTemplate = "detail_detection",
+    system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
+    concurrency: int = 10,
+) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """Identify responses that provide high-value detailed evidence.
+
+    This function processes survey responses in batches to analyze their level of detail
+    and evidence using a language model. It identifies responses that contain specific
+    examples, data, or detailed reasoning that provide strong supporting evidence.
+
+    Args:
+        responses_df (pd.DataFrame): DataFrame containing survey responses to analyze.
+            Must contain 'response_id' and 'response' columns.
+        llm (RunnableWithFallbacks): Language model instance to use for detail detection.
+        question (str): The survey question.
+        batch_size (int, optional): Number of responses to process in each batch.
+            Defaults to 20.
+        prompt_template (str | Path | PromptTemplate, optional): Template for structuring
+            the prompt to the LLM. Can be a string identifier, path to template file,
+            or PromptTemplate instance. Defaults to "detail_detection".
+        system_prompt (str): System prompt to guide the LLM's behavior.
+            Defaults to CONSULTATION_SYSTEM_PROMPT.
+        concurrency (int): Number of concurrent API calls to make. Defaults to 10.
+
+    Returns:
+        tuple[pd.DataFrame, pd.DataFrame]:
+            A tuple containing two DataFrames:
+            - The first DataFrame contains the rows that were successfully processed by the LLM
+            - The second DataFrame contains the rows that could not be processed by the LLM
+
+    Note:
+        The function uses response_id_integrity_check to ensure responses maintain
+        their original order and association after processing.
+    """
+    logger.info(f"Running detail detection on {len(responses_df)} responses")
+    detailed, _ = await batch_and_run(
+        responses_df,
+        prompt_template,
+        llm.with_structured_output(DetailDetectionResponses),
+        batch_size=batch_size,
+        question=question,
+        integrity_check=True,
         system_prompt=system_prompt,
+        concurrency=concurrency,
     )
-    return mapping, _
+    return detailed, _
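detail_detection rounds out the release: the same integrity-checked batching as sentiment_analysis, returning processed and unprocessable rows. A sketch of a standalone call, again with the invented `responses` frame and the `llm` from earlier:

    import asyncio
    from themefinder import detail_detection

    detailed_df, _ = asyncio.run(
        detail_detection(responses[["response_id", "response"]], llm,
                         question="How should we improve transport?",
                         batch_size=20, concurrency=10)
    )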