themefinder 0.5.4__py3-none-any.whl → 0.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- themefinder/__init__.py +2 -2
- themefinder/core.py +85 -74
- themefinder/llm_batch_processor.py +303 -123
- themefinder/models.py +138 -0
- themefinder/prompts/sentiment_analysis.txt +8 -5
- themefinder/prompts/theme_mapping.txt +3 -3
- themefinder/prompts/theme_refinement.txt +12 -28
- {themefinder-0.5.4.dist-info → themefinder-0.6.2.dist-info}/METADATA +3 -2
- themefinder-0.6.2.dist-info/RECORD +16 -0
- {themefinder-0.5.4.dist-info → themefinder-0.6.2.dist-info}/WHEEL +1 -1
- themefinder-0.5.4.dist-info/RECORD +0 -15
- {themefinder-0.5.4.dist-info → themefinder-0.6.2.dist-info}/LICENCE +0 -0
themefinder/__init__.py
CHANGED
themefinder/core.py
CHANGED
@@ -6,6 +6,7 @@ from langchain_core.prompts import PromptTemplate
 from langchain_core.runnables import Runnable
 
 from .llm_batch_processor import batch_and_run, load_prompt_from_file
+from .models import SentimentAnalysisOutput, ThemeMappingOutput
 from .themefinder_logging import logger
 
 CONSULTATION_SYSTEM_PROMPT = load_prompt_from_file("consultation_system_prompt")
@@ -18,7 +19,7 @@ async def find_themes(
     target_n_themes: int | None = None,
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
     verbose: bool = True,
-) -> dict[str, pd.DataFrame]:
+) -> dict[str, str | pd.DataFrame]:
     """Process survey responses through a multi-stage theme analysis pipeline.
 
     This pipeline performs sequential analysis steps:
@@ -41,47 +42,46 @@ async def find_themes(
             Defaults to True.
 
     Returns:
-        dict[str, pd.DataFrame]: Dictionary containing results from each pipeline stage:
-            - question: The survey question
+        dict[str, str | pd.DataFrame]: Dictionary containing results from each pipeline stage:
+            - question: The survey question string
             - sentiment: DataFrame with sentiment analysis results
-            -
-            - condensed_topics: DataFrame with combined similar themes
-            - refined_topics: DataFrame with refined theme definitions
+            - themes: DataFrame with the final themes output
             - mapping: DataFrame mapping responses to final themes
+            - unprocessables: Dataframe containing the inputs that could not be processed by the LLM
     """
     logger.setLevel(logging.INFO if verbose else logging.CRITICAL)
 
-    sentiment_df = await sentiment_analysis(
+    sentiment_df, sentiment_unprocessables = await sentiment_analysis(
         responses_df,
         llm,
         question=question,
         system_prompt=system_prompt,
     )
-    theme_df = await theme_generation(
+    theme_df, _ = await theme_generation(
         sentiment_df,
         llm,
         question=question,
         system_prompt=system_prompt,
     )
-    condensed_theme_df = await theme_condensation(
+    condensed_theme_df, _ = await theme_condensation(
         theme_df, llm, question=question, system_prompt=system_prompt
     )
-    refined_theme_df = await theme_refinement(
+    refined_theme_df, _ = await theme_refinement(
         condensed_theme_df,
         llm,
         question=question,
         system_prompt=system_prompt,
     )
     if target_n_themes is not None:
-        refined_theme_df = await theme_target_alignment(
+        refined_theme_df, _ = await theme_target_alignment(
             refined_theme_df,
             llm,
             question=question,
             target_n_themes=target_n_themes,
             system_prompt=system_prompt,
         )
-    mapping_df = await theme_mapping(
-        sentiment_df,
+    mapping_df, mapping_unprocessables = await theme_mapping(
+        sentiment_df[["response_id", "response"]],
         llm,
         question=question,
         refined_themes_df=refined_theme_df,
@@ -95,10 +95,9 @@ async def find_themes(
     return {
         "question": question,
         "sentiment": sentiment_df,
-        "themes":
-        "condensed_themes": condensed_theme_df,
-        "refined_themes": refined_theme_df,
+        "themes": refined_theme_df,
         "mapping": mapping_df,
+        "unprocessables": pd.concat([sentiment_unprocessables, mapping_unprocessables]),
     }
 
 
@@ -109,7 +108,7 @@ async def sentiment_analysis(
     batch_size: int = 20,
     prompt_template: str | Path | PromptTemplate = "sentiment_analysis",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
-) -> pd.DataFrame:
+) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Perform sentiment analysis on survey responses using an LLM.
 
     This function processes survey responses in batches to analyze their sentiment
@@ -129,24 +128,29 @@ async def sentiment_analysis(
             Defaults to CONSULTATION_SYSTEM_PROMPT.
 
     Returns:
-        pd.DataFrame
-
+        tuple[pd.DataFrame, pd.DataFrame]:
+            A tuple containing two DataFrames:
+            - The first DataFrame contains the rows that were successfully processed by the LLM
+            - The second DataFrame contains the rows that could not be processed by the LLM
 
     Note:
-        The function uses
+        The function uses validation_check to ensure responses maintain
         their original order and association after processing.
     """
     logger.info(f"Running sentiment analysis on {len(responses_df)} responses")
-
+    processed_rows, unprocessable_rows = await batch_and_run(
         responses_df,
         prompt_template,
         llm,
         batch_size=batch_size,
         question=question,
-
+        validation_check=True,
+        task_validation_model=SentimentAnalysisOutput,
         system_prompt=system_prompt,
     )
 
+    return processed_rows, unprocessable_rows
+
 
 async def theme_generation(
     responses_df: pd.DataFrame,
@@ -156,7 +160,7 @@ async def theme_generation(
     partition_key: str | None = "position",
     prompt_template: str | Path | PromptTemplate = "theme_generation",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
-) -> pd.DataFrame:
+) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Generate themes from survey responses using an LLM.
 
     This function processes batches of survey responses to identify common themes or topics.
@@ -179,10 +183,14 @@ async def theme_generation(
             Defaults to CONSULTATION_SYSTEM_PROMPT.
 
     Returns:
-        pd.DataFrame
+        tuple[pd.DataFrame, pd.DataFrame]:
+            A tuple containing two DataFrames:
+            - The first DataFrame contains the rows that were successfully processed by the LLM
+            - The second DataFrame contains the rows that could not be processed by the LLM
+
     """
     logger.info(f"Running theme generation on {len(responses_df)} responses")
-
+    generated_themes, _ = await batch_and_run(
         responses_df,
         prompt_template,
         llm,
@@ -191,6 +199,7 @@ async def theme_generation(
         question=question,
         system_prompt=system_prompt,
     )
+    return generated_themes, _
 
 
 async def theme_condensation(
@@ -201,7 +210,7 @@ async def theme_condensation(
     prompt_template: str | Path | PromptTemplate = "theme_condensation",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
     **kwargs,
-) -> pd.DataFrame:
+) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Condense and combine similar themes identified from survey responses.
 
     This function processes the initially identified themes to combine similar or
@@ -221,18 +230,21 @@ async def theme_condensation(
             Defaults to CONSULTATION_SYSTEM_PROMPT.
 
     Returns:
-        pd.DataFrame
-
+        tuple[pd.DataFrame, pd.DataFrame]:
+            A tuple containing two DataFrames:
+            - The first DataFrame contains the rows that were successfully processed by the LLM
+            - The second DataFrame contains the rows that could not be processed by the LLM
+
     """
     logger.info(f"Running theme condensation on {len(themes_df)} themes")
-    themes_df["response_id"] =
+    themes_df["response_id"] = themes_df.index + 1
 
     n_themes = themes_df.shape[0]
     while n_themes > batch_size:
         logger.info(
             f"{n_themes} larger than batch size, using recursive theme condensation"
         )
-        themes_df = await batch_and_run(
+        themes_df, _ = await batch_and_run(
             themes_df,
             prompt_template,
             llm,
@@ -242,13 +254,13 @@ async def theme_condensation(
             **kwargs,
         )
         themes_df = themes_df.sample(frac=1).reset_index(drop=True)
-        themes_df["response_id"] =
+        themes_df["response_id"] = themes_df.index + 1
         if len(themes_df) == n_themes:
             logger.info("Themes no longer being condensed")
             break
         n_themes = themes_df.shape[0]
 
-    themes_df = await batch_and_run(
+    themes_df, _ = await batch_and_run(
         themes_df,
         prompt_template,
         llm,
@@ -259,7 +271,7 @@ async def theme_condensation(
     )
 
     logger.info(f"Final number of condensed themes: {themes_df.shape[0]}")
-    return themes_df
+    return themes_df, _
 
 
 async def theme_refinement(
@@ -269,7 +281,7 @@ async def theme_refinement(
     batch_size: int = 10000,
     prompt_template: str | Path | PromptTemplate = "theme_refinement",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
-) -> pd.DataFrame:
+) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Refine and standardize condensed themes using an LLM.
 
     This function processes previously condensed themes to create clear, standardized
@@ -286,15 +298,15 @@ async def theme_refinement(
             Defaults to 10000.
         prompt_template (str | Path | PromptTemplate, optional): Template for structuring
             the prompt to the LLM. Can be a string identifier, path to template file,
-            or PromptTemplate instance. Defaults to "
+            or PromptTemplate instance. Defaults to "theme_refinement".
         system_prompt (str): System prompt to guide the LLM's behavior.
             Defaults to CONSULTATION_SYSTEM_PROMPT.
 
     Returns:
-        pd.DataFrame
-
-
-
+        tuple[pd.DataFrame, pd.DataFrame]:
+            A tuple containing two DataFrames:
+            - The first DataFrame contains the rows that were successfully processed by the LLM
+            - The second DataFrame contains the rows that could not be processed by the LLM
 
     Note:
         The function adds sequential response_ids to the input DataFrame and
@@ -302,16 +314,9 @@ async def theme_refinement(
         processing.
     """
     logger.info(f"Running theme refinement on {len(condensed_themes_df)} responses")
-    condensed_themes_df["response_id"] =
+    condensed_themes_df["response_id"] = condensed_themes_df.index + 1
 
-
-        """Transpose topics for increased legibility."""
-        transposed_df = pd.DataFrame(
-            [refined_themes["topic"].to_numpy()], columns=refined_themes["topic_id"]
-        )
-        return transposed_df
-
-    refined_themes = await batch_and_run(
+    refined_themes, _ = await batch_and_run(
         condensed_themes_df,
         prompt_template,
         llm,
@@ -319,7 +324,7 @@ async def theme_refinement(
         question=question,
         system_prompt=system_prompt,
     )
-    return
+    return refined_themes, _
 
 
 async def theme_target_alignment(
@@ -330,7 +335,7 @@ async def theme_target_alignment(
     batch_size: int = 10000,
     prompt_template: str | Path | PromptTemplate = "theme_target_alignment",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
-) -> pd.DataFrame:
+) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Align themes to target number using an LLM.
 
     This function processes refined themes to consolidate them into a target number of
@@ -354,10 +359,10 @@ async def theme_target_alignment(
             Defaults to CONSULTATION_SYSTEM_PROMPT.
 
     Returns:
-        pd.DataFrame
-
-
-
+        tuple[pd.DataFrame, pd.DataFrame]:
+            A tuple containing two DataFrames:
+            - The first DataFrame contains the rows that were successfully processed by the LLM
+            - The second DataFrame contains the rows that could not be processed by the LLM
 
     Note:
         The function adds sequential response_ids to the input DataFrame and
@@ -365,19 +370,10 @@ async def theme_target_alignment(
         processing.
     """
     logger.info(
-        f"Running theme target alignment on {len(refined_themes_df
+        f"Running theme target alignment on {len(refined_themes_df)} themes compressing to {target_n_themes} themes"
     )
-    refined_themes_df = refined_themes_df.
-
-
-    def transpose_aligned_themes(aligned_themes: pd.DataFrame):
-        """Transpose topics for increased legibility."""
-        transposed_df = pd.DataFrame(
-            [aligned_themes["topic"].to_numpy()], columns=aligned_themes["topic_id"]
-        )
-        return transposed_df
-
-    aligned_themes = await batch_and_run(
+    refined_themes_df["response_id"] = refined_themes_df.index + 1
+    aligned_themes, _ = await batch_and_run(
         refined_themes_df,
         prompt_template,
         llm,
@@ -386,7 +382,7 @@ async def theme_target_alignment(
         system_prompt=system_prompt,
         target_n_themes=target_n_themes,
     )
-    return
+    return aligned_themes, _
 
 
 async def theme_mapping(
@@ -397,7 +393,7 @@ async def theme_mapping(
     batch_size: int = 20,
     prompt_template: str | Path | PromptTemplate = "theme_mapping",
     system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
-) -> pd.DataFrame:
+) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Map survey responses to refined themes using an LLM.
 
     This function analyzes each survey response and determines which of the refined
@@ -419,19 +415,34 @@ async def theme_mapping(
             Defaults to CONSULTATION_SYSTEM_PROMPT.
 
     Returns:
-        pd.DataFrame
-
+        tuple[pd.DataFrame, pd.DataFrame]:
+            A tuple containing two DataFrames:
+            - The first DataFrame contains the rows that were successfully processed by the LLM
+            - The second DataFrame contains the rows that could not be processed by the LLM
+
     """
     logger.info(
-        f"Running theme mapping on {len(responses_df)} responses using {len(refined_themes_df
+        f"Running theme mapping on {len(responses_df)} responses using {len(refined_themes_df)} themes"
     )
-
+
+    def transpose_refined_themes(refined_themes: pd.DataFrame):
+        """Transpose topics for increased legibility."""
+        transposed_df = pd.DataFrame(
+            [refined_themes["topic"].to_numpy()], columns=refined_themes["topic_id"]
+        )
+        return transposed_df
+
+    mapping, _ = await batch_and_run(
         responses_df,
         prompt_template,
         llm,
         batch_size=batch_size,
         question=question,
-        refined_themes=refined_themes_df.to_dict(
-
+        refined_themes=transpose_refined_themes(refined_themes_df).to_dict(
+            orient="records"
+        ),
+        validation_check=True,
+        task_validation_model=ThemeMappingOutput,
         system_prompt=system_prompt,
     )
+    return mapping, _
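The practical effect of the core.py changes above is that every pipeline stage now returns a (processed, unprocessable) pair of DataFrames, and find_themes surfaces failed rows under an "unprocessables" key. A minimal usage sketch follows; the ChatOpenAI setup and the sample data are illustrative assumptions, not part of this diff.

import asyncio

import pandas as pd
from langchain_openai import ChatOpenAI  # any LangChain Runnable chat model should work here

from themefinder.core import find_themes, sentiment_analysis


async def main():
    llm = ChatOpenAI(model="gpt-4o", temperature=0)  # assumes OPENAI_API_KEY is set
    responses_df = pd.DataFrame(
        {
            "response_id": [1, 2],
            "response": ["I fully support the change", "Not sure this helps anyone"],
        }
    )
    question = "What are your thoughts on reducing school holidays?"

    # Individual stages now return (processed, unprocessable) DataFrame pairs
    sentiment_df, sentiment_failures = await sentiment_analysis(
        responses_df, llm, question=question
    )

    # The end-to-end pipeline collects everything it could not process under "unprocessables"
    result = await find_themes(responses_df, llm, question)
    print(result["themes"])
    print(result["mapping"])
    print(result["unprocessables"])


asyncio.run(main())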
themefinder/llm_batch_processor.py
CHANGED
@@ -1,14 +1,24 @@
 import asyncio
 import json
 import logging
+import os
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any
+from typing import Any, Optional, Type
 
+import openai
 import pandas as pd
+import tiktoken
 from langchain_core.prompts import PromptTemplate
 from langchain_core.runnables import Runnable
-from
+from pydantic import BaseModel, ValidationError
+from tenacity import (
+    before,
+    before_sleep_log,
+    retry,
+    stop_after_attempt,
+    wait_random_exponential,
+)
 
 from .themefinder_logging import logger
 
@@ -16,63 +26,82 @@ from .themefinder_logging import logger
 @dataclass
 class BatchPrompt:
     prompt_string: str
-    response_ids: list[
+    response_ids: list[int]
 
 
 async def batch_and_run(
-
+    input_df: pd.DataFrame,
     prompt_template: str | Path | PromptTemplate,
     llm: Runnable,
     batch_size: int = 10,
     partition_key: str | None = None,
-
+    validation_check: bool = False,
+    task_validation_model: Type[BaseModel] = None,
     **kwargs: Any,
-) -> pd.DataFrame:
+) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Process a DataFrame of responses in batches using an LLM.
 
     Args:
-
+        input_df (pd.DataFrame): DataFrame containing input to be processed.
             Must include a 'response_id' column.
         prompt_template (Union[str, Path, PromptTemplate]): Template for LLM prompts.
             Can be a string (file path), Path object, or PromptTemplate.
         llm (Runnable): LangChain Runnable instance that will process the prompts.
-        batch_size (int, optional): Number of
+        batch_size (int, optional): Number of input rows to process in each batch.
             Defaults to 10.
-        partition_key (str | None, optional): Optional column name to group
+        partition_key (str | None, optional): Optional column name to group input rows
             before batching. Defaults to None.
-
-            response IDs are present in LLM output and
+        validation_check (bool, optional): If True, verifies that all input
+            response IDs are present in LLM output and validates the rows against the validation model,
+            failed rows are retried individually.
            If False, no integrity checking or retrying occurs. Defaults to False.
+        task_validation_model (Type[BaseModel]): the pydanctic model to validate each row against
        **kwargs (Any): Additional keyword arguments to pass to the prompt template.
 
     Returns:
         pd.DataFrame: DataFrame containing the original responses merged with the
             LLM-processed results.
+    Returns:
+        tuple[pd.DataFrame, pd.DataFrame]:
+            A tuple containing two DataFrames:
+            - The first DataFrame contains the rows that were successfully processes by the LLM
+            - The second DataFrame contains the rows that could not be processed by the LLM
     """
+
     logger.info(f"Running batch and run with batch size {batch_size}")
     prompt_template = convert_to_prompt_template(prompt_template)
-
-
+    batch_prompts = generate_prompts(
+        prompt_template,
+        input_df,
+        batch_size=batch_size,
+        partition_key=partition_key,
+        **kwargs,
     )
-
-    llm_responses, failed_ids = await call_llm(
+    processed_rows, failed_ids = await call_llm(
        batch_prompts=batch_prompts,
        llm=llm,
-
+        validation_check=validation_check,
+        task_validation_model=task_validation_model,
    )
-
+    processed_results = process_llm_responses(processed_rows, input_df)
+
    if failed_ids:
-
-
-
-
+        retry_df = input_df[input_df["response_id"].isin(failed_ids)]
+        retry_prompts = generate_prompts(
+            prompt_template, retry_df, batch_size=1, **kwargs
+        )
+        retry_results, unprocessable_ids = await call_llm(
+            batch_prompts=retry_prompts,
            llm=llm,
-
-
-            **kwargs,
+            validation_check=validation_check,
+            task_validation_model=task_validation_model,
        )
-
-
+        retry_processed_results = process_llm_responses(retry_results, retry_df)
+        unprocessable_df = retry_df.loc[retry_df["response_id"].isin(unprocessable_ids)]
+        processed_results = pd.concat([processed_results, retry_processed_results])
+    else:
+        unprocessable_df = pd.DataFrame()
+    return processed_results, unprocessable_df
 
 
 def load_prompt_from_file(file_path: str | Path) -> str:
@@ -117,81 +146,150 @@ def convert_to_prompt_template(prompt_template: str | Path | PromptTemplate):
     return template
 
 
-def
-
+def partition_dataframe(
+    df: pd.DataFrame, partition_key: Optional[str]
+) -> list[pd.DataFrame]:
+    """Splits the DataFrame into partitions based on the partition_key if provided."""
+    if partition_key:
+        return [group.reset_index(drop=True) for _, group in df.groupby(partition_key)]
+    return [df]
+
+
+def split_overflowing_batch(
+    batch: pd.DataFrame, allowed_tokens: int
 ) -> list[pd.DataFrame]:
-    """
+    """
+    Splits a DataFrame batch into smaller sub-batches such that each sub-batch's total token count
+    does not exceed the allowed token limit.
 
     Args:
-
-
-        partition_key (str | None, optional): Column name to group by before batching.
-            If provided, ensures rows with the same partition key value stay together
-            and each group is batched separately. Defaults to None.
+        batch (pd.DataFrame): The input DataFrame to split.
+        allowed_tokens (int): The maximum allowed number of tokens per sub-batch.
 
     Returns:
-        list[pd.DataFrame]:
-            at most batch_size rows. If partition_key is used, rows within each
-            partition are kept together and batched separately.
+        list[pd.DataFrame]: A list of sub-batches, each within the token limit.
     """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    sub_batches = []
+    current_indices = []
+    current_token_sum = 0
+    token_counts = batch.apply(
+        lambda row: calculate_string_token_length(row.to_json()), axis=1
+    ).tolist()
+
+    for i, token_count in enumerate(token_counts):
+        if token_count > allowed_tokens:
+            logging.warning(
+                f"Row at index {batch.index[i]} exceeds allowed token limit ({token_count} > {allowed_tokens}). Skipping row."
+            )
+            continue
+
+        if current_token_sum + token_count > allowed_tokens:
+            if current_indices:
+                sub_batch = batch.iloc[current_indices].reset_index(drop=True)
+                if not sub_batch.empty:
+                    sub_batches.append(sub_batch)
+            current_indices = [i]
+            current_token_sum = token_count
+        else:
+            current_indices.append(i)
+            current_token_sum += token_count
+
+    if current_indices:
+        sub_batch = batch.iloc[current_indices].reset_index(drop=True)
+        if not sub_batch.empty:
+            sub_batches.append(sub_batch)
+    return sub_batches
+
+
+def batch_task_input_df(
+    df: pd.DataFrame,
+    allowed_tokens: int,
+    batch_size: int,
+    partition_key: Optional[str] = None,
+) -> list[pd.DataFrame]:
+    """
+    Partitions and batches a DataFrame according to a token limit and batch size, optionally using a partition key. Batches that exceed the token limit are further split.
+
+    Args:
+        df (pd.DataFrame): The input DataFrame to batch.
+        allowed_tokens (int): Maximum allowed tokens per batch.
+        batch_size (int): Maximum number of rows per batch before token filtering.
+        partition_key (Optional[str], optional): Column name to partition the DataFrame by.
+            Defaults to None.
+
+    Returns:
+        list[pd.DataFrame]: A list of batches, each within the specified token and size limits.
+    """
+    batches = []
+    partitions = partition_dataframe(df, partition_key)
+
+    for partition in partitions:
+        partition_batches = [
+            partition.iloc[i : i + batch_size].reset_index(drop=True)
+            for i in range(0, len(partition), batch_size)
+        ]
+        for batch in partition_batches:
+            batch_length = calculate_string_token_length(batch.to_json())
+            if batch_length <= allowed_tokens:
+                batches.append(batch)
+            else:
+                sub_batches = split_overflowing_batch(batch, allowed_tokens)
+                batches.extend(sub_batches)
+    return batches
 
 
 def generate_prompts(
-
+    prompt_template: PromptTemplate,
+    input_data: pd.DataFrame,
+    batch_size: int = 50,
+    max_prompt_length: int = 50_000,
+    partition_key: str | None = None,
+    **kwargs,
 ) -> list[BatchPrompt]:
-    """
+    """
+    Generate a list of BatchPrompt objects by splitting the input DataFrame into batches
+    and formatting each batch using a prompt template.
+
+    The function first calculates the token length of the prompt template to determine
+    the allowed tokens available for the input data. It then splits the input data into batches,
+    optionally partitioning by a specified key. Each batch is then formatted into a prompt string
+    using the provided prompt template, and a BatchPrompt is created containing the prompt string
+    and a list of response IDs from the batch.
 
     Args:
-
-
-
-
-
-
+        prompt_template (PromptTemplate): An object with a 'template' attribute and a 'format' method
+            used to create a prompt string from a list of response dictionaries.
+        input_data (pd.DataFrame): A DataFrame containing the input responses, with at least a
+            'response_id' column.
+        batch_size (int, optional): Maximum number of rows to include in each batch. Defaults to 50.
+        max_prompt_length (int, optional): The maximum total token length allowed for the prompt,
+            including both the prompt template and the input data. Defaults to 50,000.
+        partition_key (str | None, optional): Column name used to partition the DataFrame before batching.
+            If provided, the DataFrame will be grouped by this key so that rows with the same value
+            remain in the same batch. Defaults to None.
+        **kwargs: Additional keyword arguments to pass to the prompt template's format method.
 
     Returns:
-        list[BatchPrompt]:
-            - prompt_string:
-            - response_ids:
-
-    Note:
-        The function converts each DataFrame to a list of dictionaries and passes it
-        to the prompt template as the 'responses' variable.
+        list[BatchPrompt]: A list of BatchPrompt objects where each object contains:
+            - prompt_string: The formatted prompt string for a batch.
+            - response_ids: A list of response IDs corresponding to the rows in that batch.
     """
-
-
-
-
-
-
-
-            BatchPrompt(prompt_string=prompt, response_ids=response_ids)
-        )
-
-    return batched_prompts
+    prompt_token_length = calculate_string_token_length(prompt_template.template)
+    allowed_tokens_for_data = max_prompt_length - prompt_token_length
+    batches = batch_task_input_df(
+        input_data, allowed_tokens_for_data, batch_size, partition_key
+    )
+    prompts = [build_prompt(prompt_template, batch, **kwargs) for batch in batches]
+    return prompts
 
 
 async def call_llm(
     batch_prompts: list[BatchPrompt],
     llm: Runnable,
     concurrency: int = 10,
-
-
+    validation_check: bool = False,
+    task_validation_model: Optional[Type[BaseModel]] = None,
+) -> tuple[list[dict], list[int]]:
     """Process multiple batches of prompts concurrently through an LLM with retry logic.
 
     Args:
@@ -200,9 +298,10 @@ async def call_llm(
         llm (Runnable): LangChain Runnable instance that will process the prompts.
         concurrency (int, optional): Maximum number of simultaneous LLM calls allowed.
             Defaults to 10.
-
+        validation_check (bool, optional): If True, verifies that all input
            response IDs are present in the LLM output. Failed batches are discarded and
            their IDs are returned for retry. Defaults to False.
+        task_validation_model (Type[BaseModel]): The Pydantic model to check the LLM outputs against
 
     Returns:
         tuple[list[dict[str, Any]], set[str]]: A tuple containing:
@@ -215,69 +314,76 @@ async def call_llm(
         - Concurrency is managed via asyncio.Semaphore to prevent overwhelming the LLM
     """
     semaphore = asyncio.Semaphore(concurrency)
-    failed_ids: set = set()
 
     @retry(
         wait=wait_random_exponential(min=1, max=20),
         stop=stop_after_attempt(6),
         before=before.before_log(logger=logger, log_level=logging.DEBUG),
+        before_sleep=before_sleep_log(logger, logging.ERROR),
         reraise=True,
     )
-    async def async_llm_call(batch_prompt):
+    async def async_llm_call(batch_prompt) -> tuple[list[dict], list[int]]:
         async with semaphore:
-
-
-
-
-            batch_prompt.response_ids
-
-
-
-
-
-
+            try:
+                llm_response = await llm.ainvoke(batch_prompt.prompt_string)
+                all_results = json.loads(llm_response.content)
+            except (openai.BadRequestError, json.JSONDecodeError) as e:
+                failed_ids = batch_prompt.response_ids
+                logger.warning(e)
+                return [], failed_ids
+
+            if validation_check:
+                failed_ids = get_missing_response_ids(
+                    batch_prompt.response_ids, all_results
+                )
+                validated_results, invalid_rows = validate_task_data(
+                    all_results["responses"], task_validation_model
+                )
+                failed_ids.extend([r["response_id"] for r in invalid_rows])
+                return validated_results, failed_ids
+            else:
+                # Flatten the list to align with valid output format
+                return [r for r in all_results["responses"]], []
 
     results = await asyncio.gather(
         *[async_llm_call(batch_prompt) for batch_prompt in batch_prompts]
     )
-
-
-
-
+    valid_inputs = [row for result, _ in results for row in result]
+    failed_response_ids = [
+        failed_response_id
+        for _, batch_failures in results
+        for failed_response_id in batch_failures
+    ]
 
+    return valid_inputs, failed_response_ids
 
-
-
-
-
+
+def get_missing_response_ids(
+    input_response_ids: list[int], parsed_response: dict
+) -> list[int]:
+    """Identify which response IDs are missing from the LLM's parsed response.
 
     Args:
         input_response_ids (set[str]): Set of response IDs that were included in the
-            original prompt
+            original prompt.
         parsed_response (dict): Parsed response from the LLM containing a 'responses' key
             with a list of dictionaries, each containing a 'response_id' field.
 
     Returns:
-
-        no additional IDs are present, False otherwise.
+        set[str]: Set of response IDs that are missing from the parsed response.
     """
-    response_ids_set = set(input_response_ids)
 
+    response_ids_set = {int(response_id) for response_id in input_response_ids}
     returned_ids_set = {
-
-        element["response_id"]
-        )  # treat ids as strings to match response_ids_in_each_prompt
+        int(element["response_id"])
         for element in parsed_response["responses"]
         if element.get("response_id", False)
     }
-
-
-
-        logger.info(
-
-        )
-        return False
-    return True
+
+    missing_ids = list(response_ids_set - returned_ids_set)
+    if missing_ids:
+        logger.info(f"Missing response IDs from LLM output: {missing_ids}")
+    return missing_ids
 
 
 def process_llm_responses(
@@ -298,13 +404,87 @@ def process_llm_responses(
         - If no response_id in LLM output: DataFrame containing only the LLM results
     """
     responses.loc[:, "response_id"] = responses["response_id"].astype(int)
-
-        response
-        for batch_response in llm_responses
-        for response in batch_response.get("responses", [])
-    ]
-    task_responses = pd.DataFrame(unpacked_responses)
+    task_responses = pd.DataFrame(llm_responses)
     if "response_id" in task_responses.columns:
         task_responses["response_id"] = task_responses["response_id"].astype(int)
         return responses.merge(task_responses, how="inner", on="response_id")
     return task_responses
+
+
+def calculate_string_token_length(input_text: str, model: str = None) -> int:
+    """
+    Calculates the number of tokens in a given string using the specified model's tokenizer.
+
+    Args:
+        input_text (str): The input string to tokenize.
+        model (str, optional): The model name used for tokenization. If not provided,
+            uses the MODEL_NAME environment variable or defaults to "gpt-4o".
+
+    Returns:
+        int: The number of tokens in the input string.
+    """
+    # Use the MODEL_NAME env var if no model is provided; otherwise default to "gpt-4o"
+    model = model or os.environ.get("MODEL_NAME", "gpt-4o")
+    tokenizer_encoding = tiktoken.encoding_for_model(model)
+    number_of_tokens = len(tokenizer_encoding.encode(input_text))
+    return number_of_tokens
+
+
+def build_prompt(
+    prompt_template: PromptTemplate, input_batch: pd.DataFrame, **kwargs
+) -> BatchPrompt:
+    """
+    Constructs a BatchPrompt by formatting a prompt template with a batch of responses.
+
+    The function converts the input DataFrame batch into a list of dictionaries (one per row) and passes
+    this list to the prompt template's format method under the key 'responses', along with any additional
+    keyword arguments. It also extracts the 'response_id' column from the batch,
+    and uses these to create the BatchPrompt.
+
+    Args:
+        prompt_template (PromptTemplate): An object with a 'template' attribute and a 'format' method that is used
+            to generate the prompt string.
+        input_batch (pd.DataFrame): A DataFrame containing the batch of responses, which must include a 'response_id'
+            column.
+        **kwargs: Additional keyword arguments to pass to the prompt template's format method.
+
+    Returns:
+        BatchPrompt: An object containing:
+            - prompt_string: The formatted prompt string for the batch.
+            - response_ids: A list of response IDs (as strings) corresponding to the responses in the batch.
+    """
+    prompt = prompt_template.format(
+        responses=input_batch.to_dict(orient="records"), **kwargs
+    )
+    response_ids = input_batch["response_id"].astype(int).to_list()
+    return BatchPrompt(prompt_string=prompt, response_ids=response_ids)
+
+
+def validate_task_data(
+    task_data: pd.DataFrame | list[dict], task_validation_model: Type[BaseModel] = None
+) -> tuple[list[dict], list[dict]]:
+    """
+    Validate each row in task_output against the provided Pydantic model.
+
+    Returns:
+        valid: a list of validated records (dicts).
+        invalid: a list of records (dicts) that failed validation.
+    """
+
+    records = (
+        task_data.to_dict(orient="records")
+        if isinstance(task_data, pd.DataFrame)
+        else task_data
+    )
+
+    if task_validation_model:
+        valid_records, invalid_records = [], []
+        for record in records:
+            try:
+                task_validation_model(**record)
+                valid_records.append(record)
+            except ValidationError as e:
+                invalid_records.append(record)
+                logger.info(f"Failed Validation: {e}")
+        return valid_records, invalid_records
+    return records, []
themefinder/models.py
ADDED
@@ -0,0 +1,138 @@
+from pydantic import BaseModel, Field, model_validator
+
+
+def validate_non_empty_fields(model: BaseModel) -> BaseModel:
+    """
+    Validate that all string fields in the model are non-empty (after stripping)
+    and that list fields are not empty.
+
+    Args:
+        model (BaseModel): A Pydantic model instance.
+
+    Returns:
+        BaseModel: The same model if validation passes.
+
+    Raises:
+        ValueError: If any string field is empty or any list field is empty.
+    """
+    for field_name, value in model.__dict__.items():
+        if isinstance(value, str) and not value.strip():
+            raise ValueError(f"{field_name} cannot be empty or only whitespace")
+        if isinstance(value, list) and not value:
+            raise ValueError(f"{field_name} cannot be an empty list")
+    return model
+
+
+def validate_position(model: BaseModel) -> BaseModel:
+    """
+    Validate that the model's 'position' field is one of the allowed values.
+
+    Args:
+        model (BaseModel): A Pydantic model instance with a 'position' attribute.
+
+    Returns:
+        BaseModel: The same model if validation passes.
+
+    Raises:
+        ValueError: If the 'position' field is not one of the allowed values.
+    """
+    allowed_positions = {"AGREEMENT", "DISAGREEMENT", "UNCLEAR"}
+    if model.position not in allowed_positions:
+        raise ValueError(f"position must be one of {allowed_positions}")
+    return model
+
+
+def validate_stances(model: BaseModel) -> BaseModel:
+    """
+    Validate that every stance in the model's 'stances' field is allowed.
+
+    Args:
+        model (BaseModel): A Pydantic model instance with a 'stances' attribute.
+
+    Returns:
+        BaseModel: The same model if validation passes.
+
+    Raises:
+        ValueError: If any stance is not among the allowed stances.
+    """
+    allowed_stances = {"POSITIVE", "NEGATIVE"}
+    for stance in model.stances:
+        if stance not in allowed_stances:
+            raise ValueError(f"stances must be one of {allowed_stances}")
+    return model
+
+
+def validate_mapping_stance_lengths(model: BaseModel) -> BaseModel:
+    """
+    Validate that the lengths of the model's 'stances' and 'labels' fields match.
+
+    Args:
+        model (BaseModel): A Pydantic model instance with 'stances' and 'labels' attributes.
+
+    Returns:
+        BaseModel: The same model if validation passes.
+
+    Raises:
+        ValueError: If the lengths of 'stances' and 'labels' do not match.
+    """
+    if len(model.stances) != len(model.labels):
+        raise ValueError("'stances' must have the same length as 'labels'")
+    return model
+
+
+def validate_mapping_unique_labels(model: BaseModel) -> BaseModel:
+    """
+    Validate that the model's 'labels' field contains unique values.
+
+    Args:
+        model (BaseModel): A Pydantic model instance with a 'labels' attribute.
+
+    Returns:
+        BaseModel: The same model if validation passes.
+
+    Raises:
+        ValueError: If 'labels' contains duplicate values.
+    """
+    if len(model.labels) != len(set(model.labels)):
+        raise ValueError("'labels' must be unique")
+    return model
+
+
+class SentimentAnalysisOutput(BaseModel):
+    response_id: int = Field(gt=0)
+    position: str
+
+    @model_validator(mode="after")
+    def run_validations(self) -> "SentimentAnalysisOutput":
+        """
+        Run all validations for SentimentAnalysisOutput.
+
+        Validates that:
+        - 'position' is one of the allowed values.
+        - No fields are empty or only whitespace (for strings) and no lists are empty.
+        """
+        validate_position(self)
+        validate_non_empty_fields(self)
+        return self
+
+
+class ThemeMappingOutput(BaseModel):
+    response_id: int = Field(gt=0)
+    labels: list[str]
+    reasons: list[str]
+    stances: list[str]
+
+    @model_validator(mode="after")
+    def run_validations(self) -> "ThemeMappingOutput":
+        """
+        Run all validations for ThemeMappingOutput.
+
+        Validates that:
+        - 'stances' are only 'POSITIVE' or 'NEGATIVE'.
+        - The 'stances' and 'labels' have matching lengths.
+        - 'labels' are unique.
+        """
+        validate_stances(self)
+        validate_mapping_stance_lengths(self)
+        validate_mapping_unique_labels(self)
+        return self
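The two models above are what batch_and_run's validation_check path uses to decide whether an LLM row counts as processed. A quick sketch of how they behave on made-up records; a ValueError raised inside the validators surfaces as a pydantic ValidationError.

from pydantic import ValidationError

from themefinder.models import SentimentAnalysisOutput, ThemeMappingOutput

# Passes: the position is one of AGREEMENT / DISAGREEMENT / UNCLEAR and nothing is empty
SentimentAnalysisOutput(response_id=1, position="AGREEMENT")

try:
    SentimentAnalysisOutput(response_id=2, position="MAYBE")  # not an allowed position
except ValidationError as exc:
    print(exc)

try:
    # stances must line up one-to-one with labels, so this record would be routed to
    # the unprocessable DataFrame by validate_task_data / batch_and_run
    ThemeMappingOutput(
        response_id=3,
        labels=["A", "B"],
        reasons=["mentions cost", "mentions admin burden"],
        stances=["POSITIVE"],
    )
except ValidationError as exc:
    print(exc)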
themefinder/prompts/sentiment_analysis.txt
CHANGED
@@ -3,8 +3,8 @@
 You will receive a list of RESPONSES, each containing a response_id and a response.
 Your job is to analyze each response to the QUESTION below and decide:
 
-POSITION - is the response
-Choose one from [
+POSITION - is the response AGREEING or DISAGREEING or is it UNCLEAR about the change being proposed in the question.
+Choose one from [AGREEMENT, DISAGREEMENT, UNCLEAR]
 
 The final output should be in the following JSON format:
 
@@ -24,20 +24,23 @@ You MUST include every response ID in the output.
 If the response can not be labelled return empty sections where appropriate but you MUST return an entry
 with the correct response ID for each input object
 
+You MUST pick one of the given POSITION values.
+You MUST not return an empty value for the POSITION of a response.
+
 ## EXAMPLE
 Example 1:
 Question: \n What are your thoughts on the proposed government changes to the policy about reducing school holidays?
 Response: \n as a parent I have no idea why you would make this change. I guess you were thinking about increasing productivity but any productivity gains would be totally offset by the decrease in family time. \n
 
 Output:
-POSITION:
+POSITION: DISAGREEMENT
 
 Example 2:
 Question: \n What are your thoughts on the proposed government changes to the policy about reducing school holidays?
 Response: \n I think this is a great idea, our children will learn more if they are in school more \n
 
 Output:
-POSITION:
+POSITION: AGREEMENT
 
 Example 3:
 Question: \n What are your thoughts on the proposed government changes to the policy about reducing school holidays?
@@ -45,7 +48,7 @@ Response: \n it will be good for our children to be around their friends more bu
 less time with their children \n
 
 Output:
-POSITION:
+POSITION: UNCLEAR
 
 
 QUESTION: \n {question}
themefinder/prompts/theme_mapping.txt
CHANGED
@@ -17,7 +17,7 @@ Your task is to analyze each response and decide which topics are present. Guide
 - There is no limit on how many topics can be assigned to a response.
 - For each assignment provide a single rationale for why you have chosen the label.
 - For each topic identified in a response, indicate whether the response expresses a positive or negative stance toward that topic (options: 'POSITIVE' or 'NEGATIVE')
--
+- You MUST use either 'POSTIVE' or 'NEGATIVE'
 - The order of reasons and stances must align with the order of labels (e.g., stance_a applies to topic_a)
 
 You MUST include every response ID in the output.
@@ -30,13 +30,13 @@ The final output should be in the following JSON format:
 {{
     "responses": [
         {{
-            "response_id":
+            "response_id": response_id_1,
             "reasons": ["reason_a", "reason_b"],
             "labels": ["topic_a", "topic_b"],
             "stances": ["stance_a", "stance_b"],
         }},
         {{
-            "response_id":
+            "response_id": response_id_2,
             "reasons": ["reason_c"],
             "labels": ["topic_c"],
             "stances": ["stance_c"],
themefinder/prompts/theme_refinement.txt
CHANGED
@@ -1,13 +1,12 @@
 {system_prompt}
 
-You are tasked with refining
-Your goal is to transform opinionated topics into neutral, well-structured, and distinct topics while preserving the essential information.
+You are tasked with refining a list of topics generated from responses to a question.
 
 ## Input
-You will receive a list of
+You will receive a list of TOPICS. These topics explicitly tie opinions to whether a person agrees or disagrees with the question.
 
 ## Output
-You will produce a list of
+You will produce a list of CLEAR STANCE TOPICS based on the input. Each topic should have two parts:
 1. A brief, clear topic label (3-7 words)
 2. A more detailed topic description (1-2 sentences)
 
@@ -17,10 +16,11 @@ You will produce a list of NEUTRAL TOPICS based on the input. Each neutral topic
 - Preserve all key information, details and concepts from the original topics.
 - Ensure no significant details are lost in the refinement process.
 
-2.
-    -
-    -
-    - Avoid
+2. Clear Stance Formulation:
+    - Reformulate topics to express a clear stance that can be agreed or disagreed with.
+    - Use direct language like "Increased risk of X" rather than "X"
+    - Avoid double negatives and ambiguous phrasing.
+    - Phrase topics as definitive statements.
 
 3. Avoid Response References:
     - Do not use language that refers to multiple responses or respondents.
@@ -39,16 +39,15 @@ You will produce a list of NEUTRAL TOPICS based on the input. Each neutral topic
 
 ## Process
 
-1. Analyze the
+1. Analyze the TOPICS to identify key themes and information.
 2. Group closely related topics together.
 3. For each group or individual topic:
     a. Distill the core concept, removing any bias or opinion.
     b. Create a neutral, concise topic label.
     c. Write a more detailed description that provides context without taking sides.
 4. Review the entire list to ensure distinctiveness and adjust as needed.
-5.
-6.
-7. Combine the topic label and description with a colon separator
+5. Assign each output topic a topic_id a single uppercase letters (starting from 'A', for the 27th element use AA)
+6. Combine the topic label and description with a colon separator
 
 Return your output in the following JSON format:
 {{
@@ -61,21 +60,6 @@ Return your output in the following JSON format:
 }}
 
 
-## EXAMPLE
 
-
-"Economic impact: Many respondents who support the policy believe it will create jobs and boost the economy, it could raise GDP by 2%. [source_topic_count: 15]"
-
-NEUTRAL TOPIC:
-{{
-    "topic_id": "A",
-    "topic": "Economic Impact on Employment: The policy's potential effects on job creation and overall economic growth, including potential for a 2% increase in GDP.",
-    "source_topic_count": 15
-}}
-
-Remember, your goal is to create a list of neutral, informative, and distinct topics that accurately represent the content of the original opinionated topics without any bias or references to responses.
-
-
-
-OPINIONATED TOPIC:
+TOPICS:
 {responses}
{themefinder-0.5.4.dist-info → themefinder-0.6.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: themefinder
-Version: 0.
+Version: 0.6.2
 Summary: A topic modelling Python package designed for analysing one-to-many question-answer data eg free-text survey responses.
 License: MIT
 Author: i.AI
@@ -100,7 +100,7 @@ system_prompt = "You are an AI evaluation tool analyzing survey responses about
 # Run the function to find themes
 # We use asyncio to query LLM endpoints asynchronously, so we need to await our function
 async def main():
-    result = await find_themes(responses_df, llm, question, system_prompt)
+    result = await find_themes(responses_df, llm, question, system_prompt=system_prompt)
     print(result)
 
 if __name__ == "__main__":
@@ -155,3 +155,4 @@ The documentation is [© Crown copyright](https://www.nationalarchives.gov.uk/in
 ## Feedback
 
 If you have feedback on this package, please fill in our [feedback form](https://forms.gle/85xUSMvxGzSSKQ499) or contact us with questions or feedback at packages@cabinetoffice.gov.uk.
+
themefinder-0.6.2.dist-info/RECORD
ADDED
@@ -0,0 +1,16 @@
+themefinder/__init__.py,sha256=wSpW2fEnC4gTzbeNC78nSD3DpJq43-h_H-LK_cqt1cw,327
+themefinder/core.py,sha256=u1DY9gbzn-tFhQS3hrXQ8_1mIbR-iBWYVAdKeAX1BdE,18304
+themefinder/llm_batch_processor.py,sha256=OrFEl1nSi5ninbSZSiE1HFMcYZiQ-NzuYPj_iDcPPoE,19988
+themefinder/models.py,sha256=Y5-okndYwtBO09n_qUlYNVmHRVNEnJviArQZukm8Ox8,4251
+themefinder/prompts/consultation_system_prompt.txt,sha256=_A07oY_an4hnRx-9pQ0y-TLXJz0dd8vDI-MZne7Mdb4,89
+themefinder/prompts/sentiment_analysis.txt,sha256=9-LkdR95JTHXRKUXknAgNf86uVdv6jSaXMf-OtFL9_0,1948
+themefinder/prompts/theme_condensation.txt,sha256=DB4pqUmMpo0OG4AZWGTj0FfLFfjbX6wOMUr44HBxZ1o,2433
+themefinder/prompts/theme_generation.txt,sha256=JMXuNojxdSAcxPRU1Jg12Xunv_dX4hNvXYU2pXMWTAw,2500
+themefinder/prompts/theme_mapping.txt,sha256=YcRGMkuTyTPzPQPtsDY31DUwX60c8AdmdHKw0XeUejQ,2258
+themefinder/prompts/theme_refinement.txt,sha256=hBXwZnNZmhmoEFXpY5OJinp-7xxdoDRf_5LmgrilYgc,2713
+themefinder/prompts/theme_target_alignment.txt,sha256=-_ghr4--KAN6Tz8ExO9s2IXvI6pjWaEA_nG5L83GV5I,1035
+themefinder/themefinder_logging.py,sha256=n5SUQovEZLC4skEbxicjz_fOGF9mOk3S-Wpj5uXsaL8,314
+themefinder-0.6.2.dist-info/LICENCE,sha256=C9ULIN0ctF60ZxUWH_hw1H434bDLg49Z-Qzn6BUHgqs,1060
+themefinder-0.6.2.dist-info/METADATA,sha256=gI9Hp754EjopJQWw0QZIPb9dex8TalPMGnorUEOJlp0,6498
+themefinder-0.6.2.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
+themefinder-0.6.2.dist-info/RECORD,,
themefinder-0.5.4.dist-info/RECORD
DELETED
@@ -1,15 +0,0 @@
-themefinder/__init__.py,sha256=p6QoCgA-BYWljk8yPOeTgkNcN5m_gA_o3Q86Eh0QjSM,327
-themefinder/core.py,sha256=yH68-DtpIv0jX__LnjuBaKJn01hj-VurW3WnFxk0wMQ,17537
-themefinder/llm_batch_processor.py,sha256=SDDeMJeX1J3u7FGFddRhVSxty6U8lFVXwG4eNI_0C5o,12573
-themefinder/prompts/consultation_system_prompt.txt,sha256=_A07oY_an4hnRx-9pQ0y-TLXJz0dd8vDI-MZne7Mdb4,89
-themefinder/prompts/sentiment_analysis.txt,sha256=e3DcUKga6pSFcfeo2TAq8x9LXk0YDV-D7P2gtymcyuc,1832
-themefinder/prompts/theme_condensation.txt,sha256=DB4pqUmMpo0OG4AZWGTj0FfLFfjbX6wOMUr44HBxZ1o,2433
-themefinder/prompts/theme_generation.txt,sha256=JMXuNojxdSAcxPRU1Jg12Xunv_dX4hNvXYU2pXMWTAw,2500
-themefinder/prompts/theme_mapping.txt,sha256=nb_D7gwKGd8BzrAlzSZC3mQIPYaCRXdE6XmoJaJEKZQ,2405
-themefinder/prompts/theme_refinement.txt,sha256=_NVHdXBfqCFX2u0R5oZEqWQo70MAjJ5nXQfZ7p_HRAM,3528
-themefinder/prompts/theme_target_alignment.txt,sha256=-_ghr4--KAN6Tz8ExO9s2IXvI6pjWaEA_nG5L83GV5I,1035
-themefinder/themefinder_logging.py,sha256=n5SUQovEZLC4skEbxicjz_fOGF9mOk3S-Wpj5uXsaL8,314
-themefinder-0.5.4.dist-info/LICENCE,sha256=C9ULIN0ctF60ZxUWH_hw1H434bDLg49Z-Qzn6BUHgqs,1060
-themefinder-0.5.4.dist-info/METADATA,sha256=JKSxdzARGcJ-OJwrd5ScuPzm4Uln2cBQ_SnrxFAhQLQ,6483
-themefinder-0.5.4.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
-themefinder-0.5.4.dist-info/RECORD,,
{themefinder-0.5.4.dist-info → themefinder-0.6.2.dist-info}/LICENCE
File without changes