themefinder 0.5.4__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

themefinder/__init__.py CHANGED
@@ -1,10 +1,10 @@
  from .core import (
  find_themes,
  sentiment_analysis,
- theme_generation,
  theme_condensation,
- theme_refinement,
+ theme_generation,
  theme_mapping,
+ theme_refinement,
  )

  __all__ = [
themefinder/core.py CHANGED
@@ -6,6 +6,7 @@ from langchain_core.prompts import PromptTemplate
  from langchain_core.runnables import Runnable

  from .llm_batch_processor import batch_and_run, load_prompt_from_file
+ from .models import SentimentAnalysisOutput, ThemeMappingOutput
  from .themefinder_logging import logger

  CONSULTATION_SYSTEM_PROMPT = load_prompt_from_file("consultation_system_prompt")
@@ -18,7 +19,7 @@ async def find_themes(
  target_n_themes: int | None = None,
  system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
  verbose: bool = True,
- ) -> dict[str, pd.DataFrame]:
+ ) -> dict[str, str | pd.DataFrame]:
  """Process survey responses through a multi-stage theme analysis pipeline.

  This pipeline performs sequential analysis steps:
@@ -41,47 +42,46 @@ async def find_themes(
  Defaults to True.

  Returns:
- dict[str, pd.DataFrame]: Dictionary containing results from each pipeline stage:
- - question: The survey question
+ dict[str, str | pd.DataFrame]: Dictionary containing results from each pipeline stage:
+ - question: The survey question string
  - sentiment: DataFrame with sentiment analysis results
- - topics: DataFrame with initial generated themes
- - condensed_topics: DataFrame with combined similar themes
- - refined_topics: DataFrame with refined theme definitions
+ - themes: DataFrame with the final themes output
  - mapping: DataFrame mapping responses to final themes
+ - unprocessables: Dataframe containing the inputs that could not be processed by the LLM
  """
  logger.setLevel(logging.INFO if verbose else logging.CRITICAL)

- sentiment_df = await sentiment_analysis(
+ sentiment_df, sentiment_unprocessables = await sentiment_analysis(
  responses_df,
  llm,
  question=question,
  system_prompt=system_prompt,
  )
- theme_df = await theme_generation(
+ theme_df, _ = await theme_generation(
  sentiment_df,
  llm,
  question=question,
  system_prompt=system_prompt,
  )
- condensed_theme_df = await theme_condensation(
+ condensed_theme_df, _ = await theme_condensation(
  theme_df, llm, question=question, system_prompt=system_prompt
  )
- refined_theme_df = await theme_refinement(
+ refined_theme_df, _ = await theme_refinement(
  condensed_theme_df,
  llm,
  question=question,
  system_prompt=system_prompt,
  )
  if target_n_themes is not None:
- refined_theme_df = await theme_target_alignment(
+ refined_theme_df, _ = await theme_target_alignment(
  refined_theme_df,
  llm,
  question=question,
  target_n_themes=target_n_themes,
  system_prompt=system_prompt,
  )
- mapping_df = await theme_mapping(
- sentiment_df,
+ mapping_df, mapping_unprocessables = await theme_mapping(
+ sentiment_df[["response_id", "response"]],
  llm,
  question=question,
  refined_themes_df=refined_theme_df,
@@ -95,10 +95,9 @@ async def find_themes(
  return {
  "question": question,
  "sentiment": sentiment_df,
- "themes": theme_df,
- "condensed_themes": condensed_theme_df,
- "refined_themes": refined_theme_df,
+ "themes": refined_theme_df,
  "mapping": mapping_df,
+ "unprocessables": pd.concat([sentiment_unprocessables, mapping_unprocessables]),
  }

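The keys returned by find_themes change in this release: the intermediate "themes"/"condensed_themes"/"refined_themes" frames collapse into a single "themes" entry, and a new "unprocessables" frame collects rows the LLM could not handle. A minimal sketch of consuming the 0.6.x result, assuming `llm` is a configured LangChain Runnable and the two-column input frame shown in the package README (the question text and response values are invented):

```python
import pandas as pd
from themefinder import find_themes

async def summarise(llm):
    # Illustrative input: the pipeline only requires "response_id" and "response" columns.
    responses_df = pd.DataFrame(
        {"response_id": [1, 2], "response": ["Strongly agree.", "Too costly for families."]}
    )
    result = await find_themes(responses_df, llm, "Should school holidays be reduced?")
    themes_df = result["themes"]          # final refined themes (replaces the old "refined_themes" key)
    mapping_df = result["mapping"]        # responses mapped onto those themes
    failed_df = result["unprocessables"]  # new in 0.6.x: rows the LLM could not process
    return themes_df, mapping_df, failed_df
```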
 
@@ -109,7 +108,7 @@ async def sentiment_analysis(
  batch_size: int = 20,
  prompt_template: str | Path | PromptTemplate = "sentiment_analysis",
  system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
- ) -> pd.DataFrame:
+ ) -> tuple[pd.DataFrame, pd.DataFrame]:
  """Perform sentiment analysis on survey responses using an LLM.

  This function processes survey responses in batches to analyze their sentiment
@@ -129,24 +128,29 @@ async def sentiment_analysis(
  Defaults to CONSULTATION_SYSTEM_PROMPT.

  Returns:
- pd.DataFrame: DataFrame containing the original responses enriched with
- sentiment analysis results.
+ tuple[pd.DataFrame, pd.DataFrame]:
+ A tuple containing two DataFrames:
+ - The first DataFrame contains the rows that were successfully processed by the LLM
+ - The second DataFrame contains the rows that could not be processed by the LLM

  Note:
- The function uses response_id_integrity_check to ensure responses maintain
+ The function uses validation_check to ensure responses maintain
  their original order and association after processing.
  """
  logger.info(f"Running sentiment analysis on {len(responses_df)} responses")
- return await batch_and_run(
+ processed_rows, unprocessable_rows = await batch_and_run(
  responses_df,
  prompt_template,
  llm,
  batch_size=batch_size,
  question=question,
- response_id_integrity_check=True,
+ validation_check=True,
+ task_validation_model=SentimentAnalysisOutput,
  system_prompt=system_prompt,
  )

+ return processed_rows, unprocessable_rows
+

  async def theme_generation(
  responses_df: pd.DataFrame,
@@ -156,7 +160,7 @@ async def theme_generation(
  partition_key: str | None = "position",
  prompt_template: str | Path | PromptTemplate = "theme_generation",
  system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
- ) -> pd.DataFrame:
+ ) -> tuple[pd.DataFrame, pd.DataFrame]:
  """Generate themes from survey responses using an LLM.

  This function processes batches of survey responses to identify common themes or topics.
@@ -179,10 +183,14 @@ async def theme_generation(
  Defaults to CONSULTATION_SYSTEM_PROMPT.

  Returns:
- pd.DataFrame: DataFrame containing identified themes and their associated metadata.
+ tuple[pd.DataFrame, pd.DataFrame]:
+ A tuple containing two DataFrames:
+ - The first DataFrame contains the rows that were successfully processed by the LLM
+ - The second DataFrame contains the rows that could not be processed by the LLM
+
  """
  logger.info(f"Running theme generation on {len(responses_df)} responses")
- return await batch_and_run(
+ generated_themes, _ = await batch_and_run(
  responses_df,
  prompt_template,
  llm,
@@ -191,6 +199,7 @@ async def theme_generation(
  question=question,
  system_prompt=system_prompt,
  )
+ return generated_themes, _


  async def theme_condensation(
@@ -201,7 +210,7 @@ async def theme_condensation(
  prompt_template: str | Path | PromptTemplate = "theme_condensation",
  system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
  **kwargs,
- ) -> pd.DataFrame:
+ ) -> tuple[pd.DataFrame, pd.DataFrame]:
  """Condense and combine similar themes identified from survey responses.

  This function processes the initially identified themes to combine similar or
@@ -221,18 +230,21 @@ async def theme_condensation(
  Defaults to CONSULTATION_SYSTEM_PROMPT.

  Returns:
- pd.DataFrame: DataFrame containing the condensed themes, where similar topics
- have been combined into broader categories.
+ tuple[pd.DataFrame, pd.DataFrame]:
+ A tuple containing two DataFrames:
+ - The first DataFrame contains the rows that were successfully processed by the LLM
+ - The second DataFrame contains the rows that could not be processed by the LLM
+
  """
  logger.info(f"Running theme condensation on {len(themes_df)} themes")
- themes_df["response_id"] = range(len(themes_df))
+ themes_df["response_id"] = themes_df.index + 1

  n_themes = themes_df.shape[0]
  while n_themes > batch_size:
  logger.info(
  f"{n_themes} larger than batch size, using recursive theme condensation"
  )
- themes_df = await batch_and_run(
+ themes_df, _ = await batch_and_run(
  themes_df,
  prompt_template,
  llm,
@@ -242,13 +254,13 @@ async def theme_condensation(
  **kwargs,
  )
  themes_df = themes_df.sample(frac=1).reset_index(drop=True)
- themes_df["response_id"] = range(len(themes_df))
+ themes_df["response_id"] = themes_df.index + 1
  if len(themes_df) == n_themes:
  logger.info("Themes no longer being condensed")
  break
  n_themes = themes_df.shape[0]

- themes_df = await batch_and_run(
+ themes_df, _ = await batch_and_run(
  themes_df,
  prompt_template,
  llm,
@@ -259,7 +271,7 @@ async def theme_condensation(
  )

  logger.info(f"Final number of condensed themes: {themes_df.shape[0]}")
- return themes_df
+ return themes_df, _


  async def theme_refinement(
@@ -269,7 +281,7 @@ async def theme_refinement(
  batch_size: int = 10000,
  prompt_template: str | Path | PromptTemplate = "theme_refinement",
  system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
- ) -> pd.DataFrame:
+ ) -> tuple[pd.DataFrame, pd.DataFrame]:
  """Refine and standardize condensed themes using an LLM.

  This function processes previously condensed themes to create clear, standardized
@@ -286,15 +298,15 @@ async def theme_refinement(
  Defaults to 10000.
  prompt_template (str | Path | PromptTemplate, optional): Template for structuring
  the prompt to the LLM. Can be a string identifier, path to template file,
- or PromptTemplate instance. Defaults to "topic_refinement".
+ or PromptTemplate instance. Defaults to "theme_refinement".
  system_prompt (str): System prompt to guide the LLM's behavior.
  Defaults to CONSULTATION_SYSTEM_PROMPT.

  Returns:
- pd.DataFrame: A single-row DataFrame where:
- - Each column represents a unique theme (identified by topic_id)
- - The values contain the refined theme descriptions
- - The format is optimized for subsequent theme mapping operations
+ tuple[pd.DataFrame, pd.DataFrame]:
+ A tuple containing two DataFrames:
+ - The first DataFrame contains the rows that were successfully processed by the LLM
+ - The second DataFrame contains the rows that could not be processed by the LLM

  Note:
  The function adds sequential response_ids to the input DataFrame and
@@ -302,16 +314,9 @@ async def theme_refinement(
  processing.
  """
  logger.info(f"Running theme refinement on {len(condensed_themes_df)} responses")
- condensed_themes_df["response_id"] = range(len(condensed_themes_df))
+ condensed_themes_df["response_id"] = condensed_themes_df.index + 1

- def transpose_refined_themes(refined_themes: pd.DataFrame):
- """Transpose topics for increased legibility."""
- transposed_df = pd.DataFrame(
- [refined_themes["topic"].to_numpy()], columns=refined_themes["topic_id"]
- )
- return transposed_df
-
- refined_themes = await batch_and_run(
+ refined_themes, _ = await batch_and_run(
  condensed_themes_df,
  prompt_template,
  llm,
@@ -319,7 +324,7 @@ async def theme_refinement(
  question=question,
  system_prompt=system_prompt,
  )
- return transpose_refined_themes(refined_themes)
+ return refined_themes, _


  async def theme_target_alignment(
@@ -330,7 +335,7 @@ async def theme_target_alignment(
  batch_size: int = 10000,
  prompt_template: str | Path | PromptTemplate = "theme_target_alignment",
  system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
- ) -> pd.DataFrame:
+ ) -> tuple[pd.DataFrame, pd.DataFrame]:
  """Align themes to target number using an LLM.

  This function processes refined themes to consolidate them into a target number of
@@ -354,10 +359,10 @@ async def theme_target_alignment(
  Defaults to CONSULTATION_SYSTEM_PROMPT.

  Returns:
- pd.DataFrame: A single-row DataFrame where:
- - Each column represents a unique theme (identified by topic_id)
- - The values contain the aligned theme descriptions
- - The format is optimized for subsequent theme mapping operations
+ tuple[pd.DataFrame, pd.DataFrame]:
+ A tuple containing two DataFrames:
+ - The first DataFrame contains the rows that were successfully processed by the LLM
+ - The second DataFrame contains the rows that could not be processed by the LLM

  Note:
  The function adds sequential response_ids to the input DataFrame and
@@ -365,19 +370,10 @@ async def theme_target_alignment(
  processing.
  """
  logger.info(
- f"Running theme target alignment on {len(refined_themes_df.columns)} themes compressing to {target_n_themes} themes"
+ f"Running theme target alignment on {len(refined_themes_df)} themes compressing to {target_n_themes} themes"
  )
- refined_themes_df = refined_themes_df.T.rename(columns={0: "topic"})
- refined_themes_df["response_id"] = range(len(refined_themes_df))
-
- def transpose_aligned_themes(aligned_themes: pd.DataFrame):
- """Transpose topics for increased legibility."""
- transposed_df = pd.DataFrame(
- [aligned_themes["topic"].to_numpy()], columns=aligned_themes["topic_id"]
- )
- return transposed_df
-
- aligned_themes = await batch_and_run(
+ refined_themes_df["response_id"] = refined_themes_df.index + 1
+ aligned_themes, _ = await batch_and_run(
  refined_themes_df,
  prompt_template,
  llm,
@@ -386,7 +382,7 @@ async def theme_target_alignment(
  system_prompt=system_prompt,
  target_n_themes=target_n_themes,
  )
- return transpose_aligned_themes(aligned_themes)
+ return aligned_themes, _


  async def theme_mapping(
@@ -397,7 +393,7 @@ async def theme_mapping(
  batch_size: int = 20,
  prompt_template: str | Path | PromptTemplate = "theme_mapping",
  system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
- ) -> pd.DataFrame:
+ ) -> tuple[pd.DataFrame, pd.DataFrame]:
  """Map survey responses to refined themes using an LLM.

  This function analyzes each survey response and determines which of the refined
@@ -419,19 +415,34 @@ async def theme_mapping(
  Defaults to CONSULTATION_SYSTEM_PROMPT.

  Returns:
- pd.DataFrame: DataFrame containing the original responses enriched with
- theme mapping results, ensuring all responses are mapped through ID integrity checks.
+ tuple[pd.DataFrame, pd.DataFrame]:
+ A tuple containing two DataFrames:
+ - The first DataFrame contains the rows that were successfully processed by the LLM
+ - The second DataFrame contains the rows that could not be processed by the LLM
+
  """
  logger.info(
- f"Running theme mapping on {len(responses_df)} responses using {len(refined_themes_df.columns)} themes"
+ f"Running theme mapping on {len(responses_df)} responses using {len(refined_themes_df)} themes"
  )
- return await batch_and_run(
+
+ def transpose_refined_themes(refined_themes: pd.DataFrame):
+ """Transpose topics for increased legibility."""
+ transposed_df = pd.DataFrame(
+ [refined_themes["topic"].to_numpy()], columns=refined_themes["topic_id"]
+ )
+ return transposed_df
+
+ mapping, _ = await batch_and_run(
  responses_df,
  prompt_template,
  llm,
  batch_size=batch_size,
  question=question,
- refined_themes=refined_themes_df.to_dict(orient="records"),
- response_id_integrity_check=True,
+ refined_themes=transpose_refined_themes(refined_themes_df).to_dict(
+ orient="records"
+ ),
+ validation_check=True,
+ task_validation_model=ThemeMappingOutput,
  system_prompt=system_prompt,
  )
+ return mapping, _
themefinder/llm_batch_processor.py CHANGED
@@ -1,14 +1,24 @@
  import asyncio
  import json
  import logging
+ import os
  from dataclasses import dataclass
  from pathlib import Path
- from typing import Any
+ from typing import Any, Optional, Type

+ import openai
  import pandas as pd
+ import tiktoken
  from langchain_core.prompts import PromptTemplate
  from langchain_core.runnables import Runnable
- from tenacity import before, retry, stop_after_attempt, wait_random_exponential
+ from pydantic import BaseModel, ValidationError
+ from tenacity import (
+ before,
+ before_sleep_log,
+ retry,
+ stop_after_attempt,
+ wait_random_exponential,
+ )

  from .themefinder_logging import logger

@@ -16,63 +26,82 @@ from .themefinder_logging import logger
  @dataclass
  class BatchPrompt:
  prompt_string: str
- response_ids: list[str]
+ response_ids: list[int]


  async def batch_and_run(
- responses_df: pd.DataFrame,
+ input_df: pd.DataFrame,
  prompt_template: str | Path | PromptTemplate,
  llm: Runnable,
  batch_size: int = 10,
  partition_key: str | None = None,
- response_id_integrity_check: bool = False,
+ validation_check: bool = False,
+ task_validation_model: Type[BaseModel] = None,
  **kwargs: Any,
- ) -> pd.DataFrame:
+ ) -> tuple[pd.DataFrame, pd.DataFrame]:
  """Process a DataFrame of responses in batches using an LLM.

  Args:
- responses_df (pd.DataFrame): DataFrame containing responses to be processed.
+ input_df (pd.DataFrame): DataFrame containing input to be processed.
  Must include a 'response_id' column.
  prompt_template (Union[str, Path, PromptTemplate]): Template for LLM prompts.
  Can be a string (file path), Path object, or PromptTemplate.
  llm (Runnable): LangChain Runnable instance that will process the prompts.
- batch_size (int, optional): Number of responses to process in each batch.
+ batch_size (int, optional): Number of input rows to process in each batch.
  Defaults to 10.
- partition_key (str | None, optional): Optional column name to group responses
+ partition_key (str | None, optional): Optional column name to group input rows
  before batching. Defaults to None.
- response_id_integrity_check (bool, optional): If True, verifies that all input
- response IDs are present in LLM output and retries failed responses individually.
+ validation_check (bool, optional): If True, verifies that all input
+ response IDs are present in LLM output and validates the rows against the validation model,
+ failed rows are retried individually.
  If False, no integrity checking or retrying occurs. Defaults to False.
+ task_validation_model (Type[BaseModel]): the pydanctic model to validate each row against
  **kwargs (Any): Additional keyword arguments to pass to the prompt template.

  Returns:
  pd.DataFrame: DataFrame containing the original responses merged with the
  LLM-processed results.
+ Returns:
+ tuple[pd.DataFrame, pd.DataFrame]:
+ A tuple containing two DataFrames:
+ - The first DataFrame contains the rows that were successfully processes by the LLM
+ - The second DataFrame contains the rows that could not be processed by the LLM
  """
+
  logger.info(f"Running batch and run with batch size {batch_size}")
  prompt_template = convert_to_prompt_template(prompt_template)
- batched_response_dfs = batch_responses(
- responses_df, batch_size=batch_size, partition_key=partition_key
+ batch_prompts = generate_prompts(
+ prompt_template,
+ input_df,
+ batch_size=batch_size,
+ partition_key=partition_key,
+ **kwargs,
  )
- batch_prompts = generate_prompts(batched_response_dfs, prompt_template, **kwargs)
- llm_responses, failed_ids = await call_llm(
+ processed_rows, failed_ids = await call_llm(
  batch_prompts=batch_prompts,
  llm=llm,
- response_id_integrity_check=response_id_integrity_check,
+ validation_check=validation_check,
+ task_validation_model=task_validation_model,
  )
- processed_responses = process_llm_responses(llm_responses, responses_df)
+ processed_results = process_llm_responses(processed_rows, input_df)
+
  if failed_ids:
- new_df = responses_df[responses_df["response_id"].astype(str).isin(failed_ids)]
- processed_failed_responses = await batch_and_run(
- responses_df=new_df,
- prompt_template=prompt_template,
+ retry_df = input_df[input_df["response_id"].isin(failed_ids)]
+ retry_prompts = generate_prompts(
+ prompt_template, retry_df, batch_size=1, **kwargs
+ )
+ retry_results, unprocessable_ids = await call_llm(
+ batch_prompts=retry_prompts,
  llm=llm,
- batch_size=1,
- partition_key=partition_key,
- **kwargs,
+ validation_check=validation_check,
+ task_validation_model=task_validation_model,
  )
- return pd.concat(objs=[processed_failed_responses, processed_responses])
- return processed_responses
+ retry_processed_results = process_llm_responses(retry_results, retry_df)
+ unprocessable_df = retry_df.loc[retry_df["response_id"].isin(unprocessable_ids)]
+ processed_results = pd.concat([processed_results, retry_processed_results])
+ else:
+ unprocessable_df = pd.DataFrame()
+ return processed_results, unprocessable_df


  def load_prompt_from_file(file_path: str | Path) -> str:
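batch_and_run now returns a (processed, unprocessable) pair and takes validation_check / task_validation_model instead of response_id_integrity_check. A hedged sketch of the new calling convention, mirroring how core.py's sentiment step invokes it (the question and system prompt strings are invented; `llm` is any LangChain Runnable):

```python
from themefinder.llm_batch_processor import batch_and_run
from themefinder.models import SentimentAnalysisOutput

async def run_sentiment(responses_df, llm):
    # responses_df must contain a "response_id" column.
    processed_df, unprocessable_df = await batch_and_run(
        responses_df,
        "sentiment_analysis",                       # prompt template name bundled with the package
        llm,
        batch_size=20,
        question="Should school holidays be reduced?",
        validation_check=True,                      # replaces response_id_integrity_check
        task_validation_model=SentimentAnalysisOutput,
        system_prompt="You are analysing survey responses.",
    )
    # Rows that fail the ID or Pydantic checks are retried one-by-one; anything
    # still failing comes back in unprocessable_df rather than raising.
    return processed_df, unprocessable_df
```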
@@ -117,81 +146,150 @@ def convert_to_prompt_template(prompt_template: str | Path | PromptTemplate):
  return template


- def batch_responses(
- responses_df: pd.DataFrame, batch_size: int = 10, partition_key: str | None = None
+ def partition_dataframe(
+ df: pd.DataFrame, partition_key: Optional[str]
+ ) -> list[pd.DataFrame]:
+ """Splits the DataFrame into partitions based on the partition_key if provided."""
+ if partition_key:
+ return [group.reset_index(drop=True) for _, group in df.groupby(partition_key)]
+ return [df]
+
+
+ def split_overflowing_batch(
+ batch: pd.DataFrame, allowed_tokens: int
  ) -> list[pd.DataFrame]:
- """Split a DataFrame into batches, optionally partitioned by a key column.
+ """
+ Splits a DataFrame batch into smaller sub-batches such that each sub-batch's total token count
+ does not exceed the allowed token limit.

  Args:
- responses_df (pd.DataFrame): Input DataFrame to be split into batches.
- batch_size (int, optional): Maximum number of rows in each batch. Defaults to 10.
- partition_key (str | None, optional): Column name to group by before batching.
- If provided, ensures rows with the same partition key value stay together
- and each group is batched separately. Defaults to None.
+ batch (pd.DataFrame): The input DataFrame to split.
+ allowed_tokens (int): The maximum allowed number of tokens per sub-batch.

  Returns:
- list[pd.DataFrame]: List of DataFrame batches, where each batch contains
- at most batch_size rows. If partition_key is used, rows within each
- partition are kept together and batched separately.
+ list[pd.DataFrame]: A list of sub-batches, each within the token limit.
  """
- if partition_key:
- grouped = responses_df.groupby(partition_key)
- batches = []
- for _, group in grouped:
- group_batches = [
- group.iloc[i : i + batch_size].reset_index(drop=True)
- for i in range(0, len(group), batch_size)
- ]
- batches.extend(group_batches)
- return batches
-
- return [
- responses_df.iloc[i : i + batch_size].reset_index(drop=True)
- for i in range(0, len(responses_df), batch_size)
- ]
+ sub_batches = []
+ current_indices = []
+ current_token_sum = 0
+ token_counts = batch.apply(
+ lambda row: calculate_string_token_length(row.to_json()), axis=1
+ ).tolist()
+
+ for i, token_count in enumerate(token_counts):
+ if token_count > allowed_tokens:
+ logging.warning(
+ f"Row at index {batch.index[i]} exceeds allowed token limit ({token_count} > {allowed_tokens}). Skipping row."
+ )
+ continue
+
+ if current_token_sum + token_count > allowed_tokens:
+ if current_indices:
+ sub_batch = batch.iloc[current_indices].reset_index(drop=True)
+ if not sub_batch.empty:
+ sub_batches.append(sub_batch)
+ current_indices = [i]
+ current_token_sum = token_count
+ else:
+ current_indices.append(i)
+ current_token_sum += token_count
+
+ if current_indices:
+ sub_batch = batch.iloc[current_indices].reset_index(drop=True)
+ if not sub_batch.empty:
+ sub_batches.append(sub_batch)
+ return sub_batches
+
+
+ def batch_task_input_df(
+ df: pd.DataFrame,
+ allowed_tokens: int,
+ batch_size: int,
+ partition_key: Optional[str] = None,
+ ) -> list[pd.DataFrame]:
+ """
+ Partitions and batches a DataFrame according to a token limit and batch size, optionally using a partition key. Batches that exceed the token limit are further split.
+
+ Args:
+ df (pd.DataFrame): The input DataFrame to batch.
+ allowed_tokens (int): Maximum allowed tokens per batch.
+ batch_size (int): Maximum number of rows per batch before token filtering.
+ partition_key (Optional[str], optional): Column name to partition the DataFrame by.
+ Defaults to None.
+
+ Returns:
+ list[pd.DataFrame]: A list of batches, each within the specified token and size limits.
+ """
+ batches = []
+ partitions = partition_dataframe(df, partition_key)
+
+ for partition in partitions:
+ partition_batches = [
+ partition.iloc[i : i + batch_size].reset_index(drop=True)
+ for i in range(0, len(partition), batch_size)
+ ]
+ for batch in partition_batches:
+ batch_length = calculate_string_token_length(batch.to_json())
+ if batch_length <= allowed_tokens:
+ batches.append(batch)
+ else:
+ sub_batches = split_overflowing_batch(batch, allowed_tokens)
+ batches.extend(sub_batches)
+ return batches


  def generate_prompts(
- response_dfs: list[pd.DataFrame], prompt_template: PromptTemplate, **kwargs: Any
+ prompt_template: PromptTemplate,
+ input_data: pd.DataFrame,
+ batch_size: int = 50,
+ max_prompt_length: int = 50_000,
+ partition_key: str | None = None,
+ **kwargs,
  ) -> list[BatchPrompt]:
- """Generate a list of BatchPrompts from DataFrames using a prompt template.
+ """
+ Generate a list of BatchPrompt objects by splitting the input DataFrame into batches
+ and formatting each batch using a prompt template.
+
+ The function first calculates the token length of the prompt template to determine
+ the allowed tokens available for the input data. It then splits the input data into batches,
+ optionally partitioning by a specified key. Each batch is then formatted into a prompt string
+ using the provided prompt template, and a BatchPrompt is created containing the prompt string
+ and a list of response IDs from the batch.

  Args:
- response_dfs (list[pd.DataFrame]): List of DataFrames, each containing a batch
- of responses to be processed. Each DataFrame must include a 'response_id' column.
- prompt_template (PromptTemplate): LangChain PromptTemplate object used to format
- the prompts for each batch.
- **kwargs (Any): Additional keyword arguments to pass to the prompt template's
- format method.
+ prompt_template (PromptTemplate): An object with a 'template' attribute and a 'format' method
+ used to create a prompt string from a list of response dictionaries.
+ input_data (pd.DataFrame): A DataFrame containing the input responses, with at least a
+ 'response_id' column.
+ batch_size (int, optional): Maximum number of rows to include in each batch. Defaults to 50.
+ max_prompt_length (int, optional): The maximum total token length allowed for the prompt,
+ including both the prompt template and the input data. Defaults to 50,000.
+ partition_key (str | None, optional): Column name used to partition the DataFrame before batching.
+ If provided, the DataFrame will be grouped by this key so that rows with the same value
+ remain in the same batch. Defaults to None.
+ **kwargs: Additional keyword arguments to pass to the prompt template's format method.

  Returns:
- list[BatchPrompt]: List of BatchPrompt objects, each containing:
- - prompt_string: Formatted prompt text for the batch
- - response_ids: List of response IDs included in the batch
-
- Note:
- The function converts each DataFrame to a list of dictionaries and passes it
- to the prompt template as the 'responses' variable.
+ list[BatchPrompt]: A list of BatchPrompt objects where each object contains:
+ - prompt_string: The formatted prompt string for a batch.
+ - response_ids: A list of response IDs corresponding to the rows in that batch.
  """
- batched_prompts = []
- for df in response_dfs:
- prompt = prompt_template.format(
- responses=df.to_dict(orient="records"), **kwargs
- )
- response_ids = df["response_id"].astype(str).to_list()
- batched_prompts.append(
- BatchPrompt(prompt_string=prompt, response_ids=response_ids)
- )
-
- return batched_prompts
+ prompt_token_length = calculate_string_token_length(prompt_template.template)
+ allowed_tokens_for_data = max_prompt_length - prompt_token_length
+ batches = batch_task_input_df(
+ input_data, allowed_tokens_for_data, batch_size, partition_key
+ )
+ prompts = [build_prompt(prompt_template, batch, **kwargs) for batch in batches]
+ return prompts


  async def call_llm(
  batch_prompts: list[BatchPrompt],
  llm: Runnable,
  concurrency: int = 10,
- response_id_integrity_check: bool = False,
- ):
+ validation_check: bool = False,
+ task_validation_model: Optional[Type[BaseModel]] = None,
+ ) -> tuple[list[dict], list[int]]:
  """Process multiple batches of prompts concurrently through an LLM with retry logic.

  Args:
@@ -200,9 +298,10 @@ async def call_llm(
  llm (Runnable): LangChain Runnable instance that will process the prompts.
  concurrency (int, optional): Maximum number of simultaneous LLM calls allowed.
  Defaults to 10.
- response_id_integrity_check (bool, optional): If True, verifies that all input
+ validation_check (bool, optional): If True, verifies that all input
  response IDs are present in the LLM output. Failed batches are discarded and
  their IDs are returned for retry. Defaults to False.
+ task_validation_model (Type[BaseModel]): The Pydantic model to check the LLM outputs against

  Returns:
  tuple[list[dict[str, Any]], set[str]]: A tuple containing:
@@ -215,69 +314,76 @@ async def call_llm(
  - Concurrency is managed via asyncio.Semaphore to prevent overwhelming the LLM
  """
  semaphore = asyncio.Semaphore(concurrency)
- failed_ids: set = set()

  @retry(
  wait=wait_random_exponential(min=1, max=20),
  stop=stop_after_attempt(6),
  before=before.before_log(logger=logger, log_level=logging.DEBUG),
+ before_sleep=before_sleep_log(logger, logging.ERROR),
  reraise=True,
  )
- async def async_llm_call(batch_prompt):
+ async def async_llm_call(batch_prompt) -> tuple[list[dict], list[int]]:
  async with semaphore:
- response = await llm.ainvoke(batch_prompt.prompt_string)
- parsed_response = json.loads(response.content)
-
- if response_id_integrity_check and not check_response_integrity(
- batch_prompt.response_ids, parsed_response
- ):
- # discard this response but keep track of failed response ids
- failed_ids.update(batch_prompt.response_ids)
- return None
-
- return parsed_response
+ try:
+ llm_response = await llm.ainvoke(batch_prompt.prompt_string)
+ all_results = json.loads(llm_response.content)
+ except (openai.BadRequestError, json.JSONDecodeError) as e:
+ failed_ids = batch_prompt.response_ids
+ logger.warning(e)
+ return [], failed_ids
+
+ if validation_check:
+ failed_ids = get_missing_response_ids(
+ batch_prompt.response_ids, all_results
+ )
+ validated_results, invalid_rows = validate_task_data(
+ all_results["responses"], task_validation_model
+ )
+ failed_ids.extend([r["response_id"] for r in invalid_rows])
+ return validated_results, failed_ids
+ else:
+ # Flatten the list to align with valid output format
+ return [r for r in all_results["responses"]], []

  results = await asyncio.gather(
  *[async_llm_call(batch_prompt) for batch_prompt in batch_prompts]
  )
- successful_responses = [
- r for r in results if r is not None
- ] # ignore discarded responses
- return (successful_responses, failed_ids)
+ valid_inputs = [row for result, _ in results for row in result]
+ failed_response_ids = [
+ failed_response_id
+ for _, batch_failures in results
+ for failed_response_id in batch_failures
+ ]

+ return valid_inputs, failed_response_ids

- def check_response_integrity(
- input_response_ids: set[str], parsed_response: dict
- ) -> bool:
- """Verify that all input response IDs are present in the LLM's parsed response.
+
+ def get_missing_response_ids(
+ input_response_ids: list[int], parsed_response: dict
+ ) -> list[int]:
+ """Identify which response IDs are missing from the LLM's parsed response.

  Args:
  input_response_ids (set[str]): Set of response IDs that were included in the
- original prompt sent to the LLM.
+ original prompt.
  parsed_response (dict): Parsed response from the LLM containing a 'responses' key
  with a list of dictionaries, each containing a 'response_id' field.

  Returns:
- bool: True if all input response IDs are present in the parsed response and
- no additional IDs are present, False otherwise.
+ set[str]: Set of response IDs that are missing from the parsed response.
  """
- response_ids_set = set(input_response_ids)

+ response_ids_set = {int(response_id) for response_id in input_response_ids}
  returned_ids_set = {
- str(
- element["response_id"]
- ) # treat ids as strings to match response_ids_in_each_prompt
+ int(element["response_id"])
  for element in parsed_response["responses"]
  if element.get("response_id", False)
  }
- # assumes: all input ids ought to be present in output
- if returned_ids_set != response_ids_set:
- logger.info("Failed integrity check")
- logger.info(
- f"Present in original but not returned from LLM: {response_ids_set - returned_ids_set}. Returned in LLM but not present in original: {returned_ids_set - response_ids_set}"
- )
- return False
- return True
+
+ missing_ids = list(response_ids_set - returned_ids_set)
+ if missing_ids:
+ logger.info(f"Missing response IDs from LLM output: {missing_ids}")
+ return missing_ids


  def process_llm_responses(
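The old boolean integrity check becomes a set difference: get_missing_response_ids reports which IDs never came back instead of failing the whole batch. An illustration of that behaviour with made-up values:

```python
# Illustration only: ids sent in a batch prompt vs. ids present in the parsed LLM output.
sent_ids = [1, 2, 3, 4]
parsed = {"responses": [{"response_id": 1}, {"response_id": 3}]}

returned = {int(r["response_id"]) for r in parsed["responses"] if r.get("response_id", False)}
missing = sorted(set(sent_ids) - returned)  # [2, 4] -> these rows are queued for per-row retry
print(missing)
```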
@@ -298,13 +404,87 @@ def process_llm_responses(
  - If no response_id in LLM output: DataFrame containing only the LLM results
  """
  responses.loc[:, "response_id"] = responses["response_id"].astype(int)
- unpacked_responses = [
- response
- for batch_response in llm_responses
- for response in batch_response.get("responses", [])
- ]
- task_responses = pd.DataFrame(unpacked_responses)
+ task_responses = pd.DataFrame(llm_responses)
  if "response_id" in task_responses.columns:
  task_responses["response_id"] = task_responses["response_id"].astype(int)
  return responses.merge(task_responses, how="inner", on="response_id")
  return task_responses
+
+
+ def calculate_string_token_length(input_text: str, model: str = None) -> int:
+ """
+ Calculates the number of tokens in a given string using the specified model's tokenizer.
+
+ Args:
+ input_text (str): The input string to tokenize.
+ model (str, optional): The model name used for tokenization. If not provided,
+ uses the MODEL_NAME environment variable or defaults to "gpt-4o".
+
+ Returns:
+ int: The number of tokens in the input string.
+ """
+ # Use the MODEL_NAME env var if no model is provided; otherwise default to "gpt-4o"
+ model = model or os.environ.get("MODEL_NAME", "gpt-4o")
+ tokenizer_encoding = tiktoken.encoding_for_model(model)
+ number_of_tokens = len(tokenizer_encoding.encode(input_text))
+ return number_of_tokens
+
+
+ def build_prompt(
+ prompt_template: PromptTemplate, input_batch: pd.DataFrame, **kwargs
+ ) -> BatchPrompt:
+ """
+ Constructs a BatchPrompt by formatting a prompt template with a batch of responses.
+
+ The function converts the input DataFrame batch into a list of dictionaries (one per row) and passes
+ this list to the prompt template's format method under the key 'responses', along with any additional
+ keyword arguments. It also extracts the 'response_id' column from the batch,
+ and uses these to create the BatchPrompt.
+
+ Args:
+ prompt_template (PromptTemplate): An object with a 'template' attribute and a 'format' method that is used
+ to generate the prompt string.
+ input_batch (pd.DataFrame): A DataFrame containing the batch of responses, which must include a 'response_id'
+ column.
+ **kwargs: Additional keyword arguments to pass to the prompt template's format method.
+
+ Returns:
+ BatchPrompt: An object containing:
+ - prompt_string: The formatted prompt string for the batch.
+ - response_ids: A list of response IDs (as strings) corresponding to the responses in the batch.
+ """
+ prompt = prompt_template.format(
+ responses=input_batch.to_dict(orient="records"), **kwargs
+ )
+ response_ids = input_batch["response_id"].astype(int).to_list()
+ return BatchPrompt(prompt_string=prompt, response_ids=response_ids)
+
+
+ def validate_task_data(
+ task_data: pd.DataFrame | list[dict], task_validation_model: Type[BaseModel] = None
+ ) -> tuple[list[dict], list[dict]]:
+ """
+ Validate each row in task_output against the provided Pydantic model.
+
+ Returns:
+ valid: a list of validated records (dicts).
+ invalid: a list of records (dicts) that failed validation.
+ """
+
+ records = (
+ task_data.to_dict(orient="records")
+ if isinstance(task_data, pd.DataFrame)
+ else task_data
+ )
+
+ if task_validation_model:
+ valid_records, invalid_records = [], []
+ for record in records:
+ try:
+ task_validation_model(**record)
+ valid_records.append(record)
+ except ValidationError as e:
+ invalid_records.append(record)
+ logger.info(f"Failed Validation: {e}")
+ return valid_records, invalid_records
+ return records, []
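Batching is now token-aware: generate_prompts subtracts the template's token length from a 50,000-token budget, and batch_task_input_df / split_overflowing_batch keep each batch under what remains. A small sketch of the counting step, assuming tiktoken is installed and MODEL_NAME is unset so the gpt-4o encoding is used (the row and template strings are invented):

```python
import os
import tiktoken

def count_tokens(text: str, model: str | None = None) -> int:
    # Same approach as calculate_string_token_length: MODEL_NAME env var, then gpt-4o fallback.
    model = model or os.environ.get("MODEL_NAME", "gpt-4o")
    return len(tiktoken.encoding_for_model(model).encode(text))

row_json = '{"response_id": 1, "response": "Strongly agree."}'
budget = 50_000 - count_tokens("...prompt template text...")  # tokens left for batched rows
print(count_tokens(row_json), budget)
```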
themefinder/models.py ADDED
@@ -0,0 +1,138 @@
+ from pydantic import BaseModel, Field, model_validator
+
+
+ def validate_non_empty_fields(model: BaseModel) -> BaseModel:
+ """
+ Validate that all string fields in the model are non-empty (after stripping)
+ and that list fields are not empty.
+
+ Args:
+ model (BaseModel): A Pydantic model instance.
+
+ Returns:
+ BaseModel: The same model if validation passes.
+
+ Raises:
+ ValueError: If any string field is empty or any list field is empty.
+ """
+ for field_name, value in model.__dict__.items():
+ if isinstance(value, str) and not value.strip():
+ raise ValueError(f"{field_name} cannot be empty or only whitespace")
+ if isinstance(value, list) and not value:
+ raise ValueError(f"{field_name} cannot be an empty list")
+ return model
+
+
+ def validate_position(model: BaseModel) -> BaseModel:
+ """
+ Validate that the model's 'position' field is one of the allowed values.
+
+ Args:
+ model (BaseModel): A Pydantic model instance with a 'position' attribute.
+
+ Returns:
+ BaseModel: The same model if validation passes.
+
+ Raises:
+ ValueError: If the 'position' field is not one of the allowed values.
+ """
+ allowed_positions = {"AGREEMENT", "DISAGREEMENT", "UNCLEAR"}
+ if model.position not in allowed_positions:
+ raise ValueError(f"position must be one of {allowed_positions}")
+ return model
+
+
+ def validate_stances(model: BaseModel) -> BaseModel:
+ """
+ Validate that every stance in the model's 'stances' field is allowed.
+
+ Args:
+ model (BaseModel): A Pydantic model instance with a 'stances' attribute.
+
+ Returns:
+ BaseModel: The same model if validation passes.
+
+ Raises:
+ ValueError: If any stance is not among the allowed stances.
+ """
+ allowed_stances = {"POSITIVE", "NEGATIVE"}
+ for stance in model.stances:
+ if stance not in allowed_stances:
+ raise ValueError(f"stances must be one of {allowed_stances}")
+ return model
+
+
+ def validate_mapping_stance_lengths(model: BaseModel) -> BaseModel:
+ """
+ Validate that the lengths of the model's 'stances' and 'labels' fields match.
+
+ Args:
+ model (BaseModel): A Pydantic model instance with 'stances' and 'labels' attributes.
+
+ Returns:
+ BaseModel: The same model if validation passes.
+
+ Raises:
+ ValueError: If the lengths of 'stances' and 'labels' do not match.
+ """
+ if len(model.stances) != len(model.labels):
+ raise ValueError("'stances' must have the same length as 'labels'")
+ return model
+
+
+ def validate_mapping_unique_labels(model: BaseModel) -> BaseModel:
+ """
+ Validate that the model's 'labels' field contains unique values.
+
+ Args:
+ model (BaseModel): A Pydantic model instance with a 'labels' attribute.
+
+ Returns:
+ BaseModel: The same model if validation passes.
+
+ Raises:
+ ValueError: If 'labels' contains duplicate values.
+ """
+ if len(model.labels) != len(set(model.labels)):
+ raise ValueError("'labels' must be unique")
+ return model
+
+
+ class SentimentAnalysisOutput(BaseModel):
+ response_id: int = Field(gt=0)
+ position: str
+
+ @model_validator(mode="after")
+ def run_validations(self) -> "SentimentAnalysisOutput":
+ """
+ Run all validations for SentimentAnalysisOutput.
+
+ Validates that:
+ - 'position' is one of the allowed values.
+ - No fields are empty or only whitespace (for strings) and no lists are empty.
+ """
+ validate_position(self)
+ validate_non_empty_fields(self)
+ return self
+
+
+ class ThemeMappingOutput(BaseModel):
+ response_id: int = Field(gt=0)
+ labels: list[str]
+ reasons: list[str]
+ stances: list[str]
+
+ @model_validator(mode="after")
+ def run_validations(self) -> "ThemeMappingOutput":
+ """
+ Run all validations for ThemeMappingOutput.
+
+ Validates that:
+ - 'stances' are only 'POSITIVE' or 'NEGATIVE'.
+ - The 'stances' and 'labels' have matching lengths.
+ - 'labels' are unique.
+ """
+ validate_stances(self)
+ validate_mapping_stance_lengths(self)
+ validate_mapping_unique_labels(self)
+ return self
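To make the validation step concrete, a short sketch of how these models accept or reject an LLM output row (the record values are invented); rows that keep failing after the per-row retry are what end up in the "unprocessables" DataFrame:

```python
from pydantic import ValidationError
from themefinder.models import SentimentAnalysisOutput, ThemeMappingOutput

SentimentAnalysisOutput(response_id=1, position="AGREEMENT")  # passes all validators

try:
    # labels and stances have different lengths, so validate_mapping_stance_lengths raises
    ThemeMappingOutput(
        response_id=2,
        labels=["cost", "safety"],
        reasons=["mentions cost", "mentions safety"],
        stances=["NEGATIVE"],
    )
except ValidationError as err:
    print(err)
```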
themefinder/prompts/sentiment_analysis.txt CHANGED
@@ -3,8 +3,8 @@
  You will receive a list of RESPONSES, each containing a response_id and a response.
  Your job is to analyze each response to the QUESTION below and decide:

- POSITION - is the response agreeing or disagreeing or is it unclear about the change being proposed in the question.
- Choose one from [agreement, disagreement, unclear]
+ POSITION - is the response AGREEING or DISAGREEING or is it UNCLEAR about the change being proposed in the question.
+ Choose one from [AGREEMENT, DISAGREEMENT, UNCLEAR]

  The final output should be in the following JSON format:

@@ -24,20 +24,23 @@ You MUST include every response ID in the output.
  If the response can not be labelled return empty sections where appropriate but you MUST return an entry
  with the correct response ID for each input object

+ You MUST pick one of the given POSITION values.
+ You MUST not return an empty value for the POSITION of a response.
+
  ## EXAMPLE
  Example 1:
  Question: \n What are your thoughts on the proposed government changes to the policy about reducing school holidays?
  Response: \n as a parent I have no idea why you would make this change. I guess you were thinking about increasing productivity but any productivity gains would be totally offset by the decrease in family time. \n

  Output:
- POSITION: disagreement
+ POSITION: DISAGREEMENT

  Example 2:
  Question: \n What are your thoughts on the proposed government changes to the policy about reducing school holidays?
  Response: \n I think this is a great idea, our children will learn more if they are in school more \n

  Output:
- POSITION: agreement
+ POSITION: AGREEMENT

  Example 3:
  Question: \n What are your thoughts on the proposed government changes to the policy about reducing school holidays?
@@ -45,7 +48,7 @@ Response: \n it will be good for our children to be around their friends more bu
  less time with their children \n

  Output:
- POSITION: unclear
+ POSITION: UNCLEAR


  QUESTION: \n {question}
themefinder/prompts/theme_mapping.txt CHANGED
@@ -17,7 +17,7 @@ Your task is to analyze each response and decide which topics are present. Guide
  - There is no limit on how many topics can be assigned to a response.
  - For each assignment provide a single rationale for why you have chosen the label.
  - For each topic identified in a response, indicate whether the response expresses a positive or negative stance toward that topic (options: 'POSITIVE' or 'NEGATIVE')
- - If a response contains both positive and negative statements about a topic within the same response, choose the stance that receives more emphasis or appears more central to the argument
+ - You MUST use either 'POSTIVE' or 'NEGATIVE'
  - The order of reasons and stances must align with the order of labels (e.g., stance_a applies to topic_a)

  You MUST include every response ID in the output.
@@ -30,13 +30,13 @@ The final output should be in the following JSON format:
  {{
  "responses": [
  {{
- "response_id": "response_id_1",
+ "response_id": response_id_1,
  "reasons": ["reason_a", "reason_b"],
  "labels": ["topic_a", "topic_b"],
  "stances": ["stance_a", "stance_b"],
  }},
  {{
- "response_id": "response_id_2",
+ "response_id": response_id_2,
  "reasons": ["reason_c"],
  "labels": ["topic_c"],
  "stances": ["stance_c"],
themefinder/prompts/theme_refinement.txt CHANGED
@@ -1,13 +1,12 @@
  {system_prompt}

- You are tasked with refining and neutralizing a list of topics generated from responses to a question.
- Your goal is to transform opinionated topics into neutral, well-structured, and distinct topics while preserving the essential information.
+ You are tasked with refining a list of topics generated from responses to a question.

  ## Input
- You will receive a list of OPINIONATED TOPICS. These topics explicitly tie opinions to whether a person agrees or disagrees with the question.
+ You will receive a list of TOPICS. These topics explicitly tie opinions to whether a person agrees or disagrees with the question.

  ## Output
- You will produce a list of NEUTRAL TOPICS based on the input. Each neutral topic should have two parts:
+ You will produce a list of CLEAR STANCE TOPICS based on the input. Each topic should have two parts:
  1. A brief, clear topic label (3-7 words)
  2. A more detailed topic description (1-2 sentences)

@@ -17,10 +16,11 @@ You will produce a list of NEUTRAL TOPICS based on the input. Each neutral topic
  - Preserve all key information, details and concepts from the original topics.
  - Ensure no significant details are lost in the refinement process.

- 2. Neutrality:
- - Remove all language indicating agreement or disagreement.
- - Present topics objectively without favoring any particular stance.
- - Avoid phrases like "supporters believe" or "critics argue".
+ 2. Clear Stance Formulation:
+ - Reformulate topics to express a clear stance that can be agreed or disagreed with.
+ - Use direct language like "Increased risk of X" rather than "X"
+ - Avoid double negatives and ambiguous phrasing.
+ - Phrase topics as definitive statements.

  3. Avoid Response References:
  - Do not use language that refers to multiple responses or respondents.
@@ -39,16 +39,15 @@ You will produce a list of NEUTRAL TOPICS based on the input. Each neutral topic

  ## Process

- 1. Analyze the OPINIONATED TOPICS to identify key themes and information.
+ 1. Analyze the TOPICS to identify key themes and information.
  2. Group closely related topics together.
  3. For each group or individual topic:
  a. Distill the core concept, removing any bias or opinion.
  b. Create a neutral, concise topic label.
  c. Write a more detailed description that provides context without taking sides.
  4. Review the entire list to ensure distinctiveness and adjust as needed.
- 5. Double-check that all topics are truly neutral and free of response references.
- 6. Assign each output topic a topic_id a single uppercase letters (starting from 'A')
- 7. Combine the topic label and description with a colon separator
+ 5. Assign each output topic a topic_id a single uppercase letters (starting from 'A', for the 27th element use AA)
+ 6. Combine the topic label and description with a colon separator

  Return your output in the following JSON format:
  {{
@@ -61,21 +60,6 @@ Return your output in the following JSON format:
  }}


- ## EXAMPLE

- OPINIONATED TOPIC:
- "Economic impact: Many respondents who support the policy believe it will create jobs and boost the economy, it could raise GDP by 2%. [source_topic_count: 15]"
-
- NEUTRAL TOPIC:
- {{
- "topic_id": "A",
- "topic": "Economic Impact on Employment: The policy's potential effects on job creation and overall economic growth, including potential for a 2% increase in GDP.",
- "source_topic_count": 15
- }}
-
- Remember, your goal is to create a list of neutral, informative, and distinct topics that accurately represent the content of the original opinionated topics without any bias or references to responses.
-
-
-
- OPINIONATED TOPIC:
+ TOPICS:
  {responses}
themefinder-0.5.4.dist-info/METADATA → themefinder-0.6.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: themefinder
- Version: 0.5.4
+ Version: 0.6.2
  Summary: A topic modelling Python package designed for analysing one-to-many question-answer data eg free-text survey responses.
  License: MIT
  Author: i.AI
@@ -100,7 +100,7 @@ system_prompt = "You are an AI evaluation tool analyzing survey responses about
  # Run the function to find themes
  # We use asyncio to query LLM endpoints asynchronously, so we need to await our function
  async def main():
- result = await find_themes(responses_df, llm, question, system_prompt)
+ result = await find_themes(responses_df, llm, question, system_prompt=system_prompt)
  print(result)

  if __name__ == "__main__":
@@ -155,3 +155,4 @@ The documentation is [© Crown copyright](https://www.nationalarchives.gov.uk/in
  ## Feedback

  If you have feedback on this package, please fill in our [feedback form](https://forms.gle/85xUSMvxGzSSKQ499) or contact us with questions or feedback at packages@cabinetoffice.gov.uk.
+
themefinder-0.6.2.dist-info/RECORD ADDED
@@ -0,0 +1,16 @@
+ themefinder/__init__.py,sha256=wSpW2fEnC4gTzbeNC78nSD3DpJq43-h_H-LK_cqt1cw,327
+ themefinder/core.py,sha256=u1DY9gbzn-tFhQS3hrXQ8_1mIbR-iBWYVAdKeAX1BdE,18304
+ themefinder/llm_batch_processor.py,sha256=OrFEl1nSi5ninbSZSiE1HFMcYZiQ-NzuYPj_iDcPPoE,19988
+ themefinder/models.py,sha256=Y5-okndYwtBO09n_qUlYNVmHRVNEnJviArQZukm8Ox8,4251
+ themefinder/prompts/consultation_system_prompt.txt,sha256=_A07oY_an4hnRx-9pQ0y-TLXJz0dd8vDI-MZne7Mdb4,89
+ themefinder/prompts/sentiment_analysis.txt,sha256=9-LkdR95JTHXRKUXknAgNf86uVdv6jSaXMf-OtFL9_0,1948
+ themefinder/prompts/theme_condensation.txt,sha256=DB4pqUmMpo0OG4AZWGTj0FfLFfjbX6wOMUr44HBxZ1o,2433
+ themefinder/prompts/theme_generation.txt,sha256=JMXuNojxdSAcxPRU1Jg12Xunv_dX4hNvXYU2pXMWTAw,2500
+ themefinder/prompts/theme_mapping.txt,sha256=YcRGMkuTyTPzPQPtsDY31DUwX60c8AdmdHKw0XeUejQ,2258
+ themefinder/prompts/theme_refinement.txt,sha256=hBXwZnNZmhmoEFXpY5OJinp-7xxdoDRf_5LmgrilYgc,2713
+ themefinder/prompts/theme_target_alignment.txt,sha256=-_ghr4--KAN6Tz8ExO9s2IXvI6pjWaEA_nG5L83GV5I,1035
+ themefinder/themefinder_logging.py,sha256=n5SUQovEZLC4skEbxicjz_fOGF9mOk3S-Wpj5uXsaL8,314
+ themefinder-0.6.2.dist-info/LICENCE,sha256=C9ULIN0ctF60ZxUWH_hw1H434bDLg49Z-Qzn6BUHgqs,1060
+ themefinder-0.6.2.dist-info/METADATA,sha256=gI9Hp754EjopJQWw0QZIPb9dex8TalPMGnorUEOJlp0,6498
+ themefinder-0.6.2.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
+ themefinder-0.6.2.dist-info/RECORD,,
themefinder-0.5.4.dist-info/WHEEL → themefinder-0.6.2.dist-info/WHEEL CHANGED
@@ -1,4 +1,4 @@
  Wheel-Version: 1.0
- Generator: poetry-core 2.1.1
+ Generator: poetry-core 2.1.2
  Root-Is-Purelib: true
  Tag: py3-none-any
themefinder-0.5.4.dist-info/RECORD DELETED
@@ -1,15 +0,0 @@
- themefinder/__init__.py,sha256=p6QoCgA-BYWljk8yPOeTgkNcN5m_gA_o3Q86Eh0QjSM,327
- themefinder/core.py,sha256=yH68-DtpIv0jX__LnjuBaKJn01hj-VurW3WnFxk0wMQ,17537
- themefinder/llm_batch_processor.py,sha256=SDDeMJeX1J3u7FGFddRhVSxty6U8lFVXwG4eNI_0C5o,12573
- themefinder/prompts/consultation_system_prompt.txt,sha256=_A07oY_an4hnRx-9pQ0y-TLXJz0dd8vDI-MZne7Mdb4,89
- themefinder/prompts/sentiment_analysis.txt,sha256=e3DcUKga6pSFcfeo2TAq8x9LXk0YDV-D7P2gtymcyuc,1832
- themefinder/prompts/theme_condensation.txt,sha256=DB4pqUmMpo0OG4AZWGTj0FfLFfjbX6wOMUr44HBxZ1o,2433
- themefinder/prompts/theme_generation.txt,sha256=JMXuNojxdSAcxPRU1Jg12Xunv_dX4hNvXYU2pXMWTAw,2500
- themefinder/prompts/theme_mapping.txt,sha256=nb_D7gwKGd8BzrAlzSZC3mQIPYaCRXdE6XmoJaJEKZQ,2405
- themefinder/prompts/theme_refinement.txt,sha256=_NVHdXBfqCFX2u0R5oZEqWQo70MAjJ5nXQfZ7p_HRAM,3528
- themefinder/prompts/theme_target_alignment.txt,sha256=-_ghr4--KAN6Tz8ExO9s2IXvI6pjWaEA_nG5L83GV5I,1035
- themefinder/themefinder_logging.py,sha256=n5SUQovEZLC4skEbxicjz_fOGF9mOk3S-Wpj5uXsaL8,314
- themefinder-0.5.4.dist-info/LICENCE,sha256=C9ULIN0ctF60ZxUWH_hw1H434bDLg49Z-Qzn6BUHgqs,1060
- themefinder-0.5.4.dist-info/METADATA,sha256=JKSxdzARGcJ-OJwrd5ScuPzm4Uln2cBQ_SnrxFAhQLQ,6483
- themefinder-0.5.4.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
- themefinder-0.5.4.dist-info/RECORD,,