themefinder 0.5.3__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of themefinder might be problematic.

themefinder/__init__.py CHANGED
@@ -1,10 +1,10 @@
 from .core import (
     find_themes,
     sentiment_analysis,
-    theme_generation,
     theme_condensation,
-    theme_refinement,
+    theme_generation,
     theme_mapping,
+    theme_refinement,
 )

 __all__ = [
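For orientation, a minimal sketch of the public API this reordering touches; the six names come from the import block above, and the trailing comments are our own shorthand for each stage, not text from the package:

# Public entry points re-exported by themefinder/__init__.py.
# The 0.6.2 change only reorders these imports alphabetically.
from themefinder import (
    find_themes,          # full multi-stage pipeline
    sentiment_analysis,   # per-response sentiment analysis
    theme_condensation,   # merge similar themes
    theme_generation,     # initial theme discovery
    theme_mapping,        # map responses to final themes
    theme_refinement,     # standardize theme descriptions
)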
themefinder/core.py CHANGED
@@ -6,6 +6,7 @@ from langchain_core.prompts import PromptTemplate
 from langchain_core.runnables import Runnable

 from .llm_batch_processor import batch_and_run, load_prompt_from_file
+from .models import SentimentAnalysisOutput, ThemeMappingOutput
 from .themefinder_logging import logger

 CONSULTATION_SYSTEM_PROMPT = load_prompt_from_file("consultation_system_prompt")
@@ -18,7 +19,7 @@ async def find_themes(
     target_n_themes: int | None = None,
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
     verbose: bool = True,
-) -> dict[str, pd.DataFrame]:
+) -> dict[str, str | pd.DataFrame]:
     """Process survey responses through a multi-stage theme analysis pipeline.

     This pipeline performs sequential analysis steps:
@@ -41,47 +42,46 @@ async def find_themes(
             Defaults to True.

     Returns:
-        dict[str, pd.DataFrame]: Dictionary containing results from each pipeline stage:
-            - question: The survey question
+        dict[str, str | pd.DataFrame]: Dictionary containing results from each pipeline stage:
+            - question: The survey question string
             - sentiment: DataFrame with sentiment analysis results
-            - topics: DataFrame with initial generated themes
-            - condensed_topics: DataFrame with combined similar themes
-            - refined_topics: DataFrame with refined theme definitions
+            - themes: DataFrame with the final themes output
             - mapping: DataFrame mapping responses to final themes
+            - unprocessables: Dataframe containing the inputs that could not be processed by the LLM
     """
     logger.setLevel(logging.INFO if verbose else logging.CRITICAL)

-    sentiment_df = await sentiment_analysis(
+    sentiment_df, sentiment_unprocessables = await sentiment_analysis(
         responses_df,
         llm,
         question=question,
         system_prompt=system_prompt,
     )
-    theme_df = await theme_generation(
+    theme_df, _ = await theme_generation(
         sentiment_df,
         llm,
         question=question,
         system_prompt=system_prompt,
     )
-    condensed_theme_df = await theme_condensation(
+    condensed_theme_df, _ = await theme_condensation(
         theme_df, llm, question=question, system_prompt=system_prompt
     )
-    refined_theme_df = await theme_refinement(
+    refined_theme_df, _ = await theme_refinement(
         condensed_theme_df,
         llm,
         question=question,
         system_prompt=system_prompt,
     )
     if target_n_themes is not None:
-        refined_theme_df = await theme_target_alignment(
+        refined_theme_df, _ = await theme_target_alignment(
             refined_theme_df,
             llm,
             question=question,
             target_n_themes=target_n_themes,
             system_prompt=system_prompt,
         )
-    mapping_df = await theme_mapping(
-        sentiment_df,
+    mapping_df, mapping_unprocessables = await theme_mapping(
+        sentiment_df[["response_id", "response"]],
         llm,
         question=question,
         refined_themes_df=refined_theme_df,
@@ -95,10 +95,9 @@ async def find_themes(
     return {
         "question": question,
         "sentiment": sentiment_df,
-        "themes": theme_df,
-        "condensed_themes": condensed_theme_df,
-        "refined_themes": refined_theme_df,
+        "themes": refined_theme_df,
         "mapping": mapping_df,
+        "unprocessables": pd.concat([sentiment_unprocessables, mapping_unprocessables]),
     }


@@ -109,7 +108,7 @@ async def sentiment_analysis(
     batch_size: int = 20,
     prompt_template: str | Path | PromptTemplate = "sentiment_analysis",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
-) -> pd.DataFrame:
+) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Perform sentiment analysis on survey responses using an LLM.

     This function processes survey responses in batches to analyze their sentiment
@@ -129,24 +128,29 @@ async def sentiment_analysis(
             Defaults to CONSULTATION_SYSTEM_PROMPT.

     Returns:
-        pd.DataFrame: DataFrame containing the original responses enriched with
-            sentiment analysis results.
+        tuple[pd.DataFrame, pd.DataFrame]:
+            A tuple containing two DataFrames:
+            - The first DataFrame contains the rows that were successfully processed by the LLM
+            - The second DataFrame contains the rows that could not be processed by the LLM

     Note:
-        The function uses response_id_integrity_check to ensure responses maintain
+        The function uses validation_check to ensure responses maintain
         their original order and association after processing.
     """
     logger.info(f"Running sentiment analysis on {len(responses_df)} responses")
-    return await batch_and_run(
+    processed_rows, unprocessable_rows = await batch_and_run(
         responses_df,
         prompt_template,
         llm,
         batch_size=batch_size,
         question=question,
-        response_id_integrity_check=True,
+        validation_check=True,
+        task_validation_model=SentimentAnalysisOutput,
         system_prompt=system_prompt,
     )

+    return processed_rows, unprocessable_rows
+

 async def theme_generation(
     responses_df: pd.DataFrame,
@@ -156,7 +160,7 @@ async def theme_generation(
     partition_key: str | None = "position",
     prompt_template: str | Path | PromptTemplate = "theme_generation",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
-) -> pd.DataFrame:
+) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Generate themes from survey responses using an LLM.

     This function processes batches of survey responses to identify common themes or topics.
@@ -179,10 +183,14 @@ async def theme_generation(
             Defaults to CONSULTATION_SYSTEM_PROMPT.

     Returns:
-        pd.DataFrame: DataFrame containing identified themes and their associated metadata.
+        tuple[pd.DataFrame, pd.DataFrame]:
+            A tuple containing two DataFrames:
+            - The first DataFrame contains the rows that were successfully processed by the LLM
+            - The second DataFrame contains the rows that could not be processed by the LLM
+
     """
     logger.info(f"Running theme generation on {len(responses_df)} responses")
-    return await batch_and_run(
+    generated_themes, _ = await batch_and_run(
         responses_df,
         prompt_template,
         llm,
@@ -191,17 +199,18 @@ async def theme_generation(
         question=question,
         system_prompt=system_prompt,
     )
+    return generated_themes, _


 async def theme_condensation(
     themes_df: pd.DataFrame,
     llm: Runnable,
     question: str,
-    batch_size: int = 100,
+    batch_size: int = 75,
     prompt_template: str | Path | PromptTemplate = "theme_condensation",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
     **kwargs,
-) -> pd.DataFrame:
+) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Condense and combine similar themes identified from survey responses.

     This function processes the initially identified themes to combine similar or
@@ -221,18 +230,21 @@ async def theme_condensation(
             Defaults to CONSULTATION_SYSTEM_PROMPT.

     Returns:
-        pd.DataFrame: DataFrame containing the condensed themes, where similar topics
-            have been combined into broader categories.
+        tuple[pd.DataFrame, pd.DataFrame]:
+            A tuple containing two DataFrames:
+            - The first DataFrame contains the rows that were successfully processed by the LLM
+            - The second DataFrame contains the rows that could not be processed by the LLM
+
     """
     logger.info(f"Running theme condensation on {len(themes_df)} themes")
-    themes_df["response_id"] = range(len(themes_df))
+    themes_df["response_id"] = themes_df.index + 1

     n_themes = themes_df.shape[0]
     while n_themes > batch_size:
         logger.info(
             f"{n_themes} larger than batch size, using recursive theme condensation"
         )
-        themes_df = await batch_and_run(
+        themes_df, _ = await batch_and_run(
             themes_df,
             prompt_template,
             llm,
@@ -242,13 +254,13 @@ async def theme_condensation(
             **kwargs,
         )
         themes_df = themes_df.sample(frac=1).reset_index(drop=True)
-        themes_df["response_id"] = range(len(themes_df))
+        themes_df["response_id"] = themes_df.index + 1
         if len(themes_df) == n_themes:
             logger.info("Themes no longer being condensed")
             break
         n_themes = themes_df.shape[0]

-    themes_df = await batch_and_run(
+    themes_df, _ = await batch_and_run(
         themes_df,
         prompt_template,
         llm,
@@ -259,7 +271,7 @@ async def theme_condensation(
     )

     logger.info(f"Final number of condensed themes: {themes_df.shape[0]}")
-    return themes_df
+    return themes_df, _


 async def theme_refinement(
@@ -269,7 +281,7 @@ async def theme_refinement(
     batch_size: int = 10000,
     prompt_template: str | Path | PromptTemplate = "theme_refinement",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
-) -> pd.DataFrame:
+) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Refine and standardize condensed themes using an LLM.

     This function processes previously condensed themes to create clear, standardized
@@ -286,15 +298,15 @@ async def theme_refinement(
             Defaults to 10000.
         prompt_template (str | Path | PromptTemplate, optional): Template for structuring
             the prompt to the LLM. Can be a string identifier, path to template file,
-            or PromptTemplate instance. Defaults to "topic_refinement".
+            or PromptTemplate instance. Defaults to "theme_refinement".
         system_prompt (str): System prompt to guide the LLM's behavior.
             Defaults to CONSULTATION_SYSTEM_PROMPT.

     Returns:
-        pd.DataFrame: A single-row DataFrame where:
-            - Each column represents a unique theme (identified by topic_id)
-            - The values contain the refined theme descriptions
-            - The format is optimized for subsequent theme mapping operations
+        tuple[pd.DataFrame, pd.DataFrame]:
+            A tuple containing two DataFrames:
+            - The first DataFrame contains the rows that were successfully processed by the LLM
+            - The second DataFrame contains the rows that could not be processed by the LLM

     Note:
         The function adds sequential response_ids to the input DataFrame and
@@ -302,16 +314,9 @@ async def theme_refinement(
         processing.
     """
     logger.info(f"Running theme refinement on {len(condensed_themes_df)} responses")
-    condensed_themes_df["response_id"] = range(len(condensed_themes_df))
+    condensed_themes_df["response_id"] = condensed_themes_df.index + 1

-    def transpose_refined_themes(refined_themes: pd.DataFrame):
-        """Transpose topics for increased legibility."""
-        transposed_df = pd.DataFrame(
-            [refined_themes["topic"].to_numpy()], columns=refined_themes["topic_id"]
-        )
-        return transposed_df
-
-    refined_themes = await batch_and_run(
+    refined_themes, _ = await batch_and_run(
         condensed_themes_df,
         prompt_template,
         llm,
@@ -319,7 +324,7 @@ async def theme_refinement(
         question=question,
         system_prompt=system_prompt,
     )
-    return transpose_refined_themes(refined_themes)
+    return refined_themes, _


 async def theme_target_alignment(
@@ -330,7 +335,7 @@ async def theme_target_alignment(
     batch_size: int = 10000,
     prompt_template: str | Path | PromptTemplate = "theme_target_alignment",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
-) -> pd.DataFrame:
+) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Align themes to target number using an LLM.

     This function processes refined themes to consolidate them into a target number of
@@ -354,10 +359,10 @@ async def theme_target_alignment(
             Defaults to CONSULTATION_SYSTEM_PROMPT.

     Returns:
-        pd.DataFrame: A single-row DataFrame where:
-            - Each column represents a unique theme (identified by topic_id)
-            - The values contain the aligned theme descriptions
-            - The format is optimized for subsequent theme mapping operations
+        tuple[pd.DataFrame, pd.DataFrame]:
+            A tuple containing two DataFrames:
+            - The first DataFrame contains the rows that were successfully processed by the LLM
+            - The second DataFrame contains the rows that could not be processed by the LLM

     Note:
         The function adds sequential response_ids to the input DataFrame and
@@ -365,19 +370,10 @@ async def theme_target_alignment(
         processing.
     """
     logger.info(
-        f"Running theme target alignment on {len(refined_themes_df.columns)} themes compressing to {target_n_themes} themes"
+        f"Running theme target alignment on {len(refined_themes_df)} themes compressing to {target_n_themes} themes"
     )
-    refined_themes_df = refined_themes_df.T.rename(columns={0: "topic"})
-    refined_themes_df["response_id"] = range(len(refined_themes_df))
-
-    def transpose_aligned_themes(aligned_themes: pd.DataFrame):
-        """Transpose topics for increased legibility."""
-        transposed_df = pd.DataFrame(
-            [aligned_themes["topic"].to_numpy()], columns=aligned_themes["topic_id"]
-        )
-        return transposed_df
-
-    aligned_themes = await batch_and_run(
+    refined_themes_df["response_id"] = refined_themes_df.index + 1
+    aligned_themes, _ = await batch_and_run(
         refined_themes_df,
         prompt_template,
         llm,
@@ -386,7 +382,7 @@ async def theme_target_alignment(
         system_prompt=system_prompt,
         target_n_themes=target_n_themes,
     )
-    return transpose_aligned_themes(aligned_themes)
+    return aligned_themes, _


 async def theme_mapping(
@@ -397,7 +393,7 @@ async def theme_mapping(
     batch_size: int = 20,
     prompt_template: str | Path | PromptTemplate = "theme_mapping",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
-) -> pd.DataFrame:
+) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Map survey responses to refined themes using an LLM.

     This function analyzes each survey response and determines which of the refined
@@ -419,19 +415,34 @@ async def theme_mapping(
             Defaults to CONSULTATION_SYSTEM_PROMPT.

     Returns:
-        pd.DataFrame: DataFrame containing the original responses enriched with
-            theme mapping results, ensuring all responses are mapped through ID integrity checks.
+        tuple[pd.DataFrame, pd.DataFrame]:
+            A tuple containing two DataFrames:
+            - The first DataFrame contains the rows that were successfully processed by the LLM
+            - The second DataFrame contains the rows that could not be processed by the LLM
+
     """
     logger.info(
-        f"Running theme mapping on {len(responses_df)} responses using {len(refined_themes_df.columns)} themes"
+        f"Running theme mapping on {len(responses_df)} responses using {len(refined_themes_df)} themes"
     )
-    return await batch_and_run(
+
+    def transpose_refined_themes(refined_themes: pd.DataFrame):
+        """Transpose topics for increased legibility."""
+        transposed_df = pd.DataFrame(
+            [refined_themes["topic"].to_numpy()], columns=refined_themes["topic_id"]
+        )
+        return transposed_df
+
+    mapping, _ = await batch_and_run(
         responses_df,
         prompt_template,
         llm,
         batch_size=batch_size,
         question=question,
-        refined_themes=refined_themes_df.to_dict(orient="records"),
-        response_id_integrity_check=True,
+        refined_themes=transpose_refined_themes(refined_themes_df).to_dict(
+            orient="records"
+        ),
+        validation_check=True,
+        task_validation_model=ThemeMappingOutput,
         system_prompt=system_prompt,
     )
+    return mapping, _
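Taken together, these changes mean every pipeline stage now returns a pair of DataFrames (successfully processed rows, unprocessable rows), and the find_themes result dict drops the condensed_themes/refined_themes keys in favour of a single "themes" entry plus an "unprocessables" DataFrame. Below is a rough, caller-side sketch of the new contract: the response_id/response column names and the result keys come from the diff above, while the positional (responses_df, llm) argument order for find_themes, the sample data, and the choice of LLM are assumptions.

import asyncio

import pandas as pd

from themefinder import find_themes, sentiment_analysis


async def run_pipeline(llm) -> None:
    # `llm` is assumed to be a configured LangChain Runnable (e.g. a chat model).
    responses_df = pd.DataFrame(
        {
            "response_id": [1, 2],
            "response": ["Too expensive for what it offers.", "Friendly, quick service."],
        }
    )
    question = "What did you think of the service?"

    # Individual stages now return two DataFrames instead of one:
    # rows the LLM processed successfully, and rows it could not process.
    sentiment_df, sentiment_failures = await sentiment_analysis(
        responses_df, llm, question=question
    )
    print(f"{len(sentiment_failures)} responses could not be sentiment-scored")

    # The full pipeline exposes the final themes under "themes" and the
    # combined failures under "unprocessables".
    results = await find_themes(responses_df, llm, question=question)
    themes_df = results["themes"]
    mapping_df = results["mapping"]
    failures_df = results["unprocessables"]
    print(len(themes_df), "themes,", len(mapping_df), "mapped rows,", len(failures_df), "failures")


# asyncio.run(run_pipeline(llm)) once an LLM Runnable is available.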