themefinder 0.3.1__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of themefinder might be problematic. Click here for more details.
- themefinder/core.py +124 -13
- themefinder/llm_batch_processor.py +2 -3
- themefinder/prompts/sentiment_analysis.txt +6 -1
- themefinder/prompts/theme_condensation.txt +10 -15
- themefinder/prompts/theme_generation.txt +40 -40
- themefinder/prompts/theme_mapping.txt +3 -0
- themefinder/prompts/theme_refinement.txt +3 -2
- themefinder/prompts/theme_target_alignment.txt +26 -0
- {themefinder-0.3.1.dist-info → themefinder-0.5.2.dist-info}/METADATA +4 -1
- themefinder-0.5.2.dist-info/RECORD +15 -0
- {themefinder-0.3.1.dist-info → themefinder-0.5.2.dist-info}/WHEEL +1 -1
- themefinder-0.3.1.dist-info/RECORD +0 -14
- {themefinder-0.3.1.dist-info → themefinder-0.5.2.dist-info}/LICENCE +0 -0
themefinder/core.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import logging
|
|
1
2
|
from pathlib import Path
|
|
2
3
|
|
|
3
4
|
import pandas as pd
|
|
@@ -7,7 +8,6 @@ from langchain_core.runnables import Runnable
|
|
|
7
8
|
from .llm_batch_processor import batch_and_run, load_prompt_from_file
|
|
8
9
|
from .themefinder_logging import logger
|
|
9
10
|
|
|
10
|
-
|
|
11
11
|
CONSULTATION_SYSTEM_PROMPT = load_prompt_from_file("consultation_system_prompt")
|
|
12
12
|
|
|
13
13
|
|
|
@@ -15,7 +15,9 @@ async def find_themes(
|
|
|
15
15
|
responses_df: pd.DataFrame,
|
|
16
16
|
llm: Runnable,
|
|
17
17
|
question: str,
|
|
18
|
+
target_n_themes: int | None = None,
|
|
18
19
|
system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
|
|
20
|
+
verbose: bool = True,
|
|
19
21
|
) -> dict[str, pd.DataFrame]:
|
|
20
22
|
"""Process survey responses through a multi-stage theme analysis pipeline.
|
|
21
23
|
|
|
@@ -24,14 +26,19 @@ async def find_themes(
|
|
|
24
26
|
2. Initial theme generation
|
|
25
27
|
3. Theme condensation (combining similar themes)
|
|
26
28
|
4. Theme refinement
|
|
27
|
-
5.
|
|
29
|
+
5. Theme target alignment (optional, if target_n_themes is specified)
|
|
30
|
+
6. Mapping responses to refined themes
|
|
28
31
|
|
|
29
32
|
Args:
|
|
30
33
|
responses_df (pd.DataFrame): DataFrame containing survey responses
|
|
31
34
|
llm (Runnable): Language model instance for text analysis
|
|
32
35
|
question (str): The survey question
|
|
36
|
+
target_n_themes (int | None, optional): Target number of themes to consolidate to.
|
|
37
|
+
If None, skip theme target alignment step. Defaults to None.
|
|
33
38
|
system_prompt (str): System prompt to guide the LLM's behavior.
|
|
34
39
|
Defaults to CONSULTATION_SYSTEM_PROMPT.
|
|
40
|
+
verbose (bool): Whether to show information messages during processing.
|
|
41
|
+
Defaults to True.
|
|
35
42
|
|
|
36
43
|
Returns:
|
|
37
44
|
dict[str, pd.DataFrame]: Dictionary containing results from each pipeline stage:
|
|
@@ -42,6 +49,8 @@ async def find_themes(
|
|
|
42
49
|
- refined_topics: DataFrame with refined theme definitions
|
|
43
50
|
- mapping: DataFrame mapping responses to final themes
|
|
44
51
|
"""
|
|
52
|
+
logger.setLevel(logging.INFO if verbose else logging.CRITICAL)
|
|
53
|
+
|
|
45
54
|
sentiment_df = await sentiment_analysis(
|
|
46
55
|
responses_df,
|
|
47
56
|
llm,
|
|
@@ -63,6 +72,14 @@ async def find_themes(
|
|
|
63
72
|
question=question,
|
|
64
73
|
system_prompt=system_prompt,
|
|
65
74
|
)
|
|
75
|
+
if target_n_themes is not None:
|
|
76
|
+
refined_theme_df = await theme_target_alignment(
|
|
77
|
+
refined_theme_df,
|
|
78
|
+
llm,
|
|
79
|
+
question=question,
|
|
80
|
+
target_n_themes=target_n_themes,
|
|
81
|
+
system_prompt=system_prompt,
|
|
82
|
+
)
|
|
66
83
|
mapping_df = await theme_mapping(
|
|
67
84
|
sentiment_df,
|
|
68
85
|
llm,
|
|
@@ -79,8 +96,8 @@ async def find_themes(
|
|
|
79
96
|
"question": question,
|
|
80
97
|
"sentiment": sentiment_df,
|
|
81
98
|
"topics": theme_df,
|
|
82
|
-
"
|
|
83
|
-
"
|
|
99
|
+
"condensed_themes": condensed_theme_df,
|
|
100
|
+
"refined_themes": refined_theme_df,
|
|
84
101
|
"mapping": mapping_df,
|
|
85
102
|
}
|
|
86
103
|
|
|
@@ -89,7 +106,7 @@ async def sentiment_analysis(
|
|
|
89
106
|
responses_df: pd.DataFrame,
|
|
90
107
|
llm: Runnable,
|
|
91
108
|
question: str,
|
|
92
|
-
batch_size: int =
|
|
109
|
+
batch_size: int = 20,
|
|
93
110
|
prompt_template: str | Path | PromptTemplate = "sentiment_analysis",
|
|
94
111
|
system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
|
|
95
112
|
) -> pd.DataFrame:
|
|
@@ -104,7 +121,7 @@ async def sentiment_analysis(
|
|
|
104
121
|
llm (Runnable): Language model instance to use for sentiment analysis.
|
|
105
122
|
question (str): The survey question.
|
|
106
123
|
batch_size (int, optional): Number of responses to process in each batch.
|
|
107
|
-
Defaults to
|
|
124
|
+
Defaults to 20.
|
|
108
125
|
prompt_template (str | Path | PromptTemplate, optional): Template for structuring
|
|
109
126
|
the prompt to the LLM. Can be a string identifier, path to template file,
|
|
110
127
|
or PromptTemplate instance. Defaults to "sentiment_analysis".
|
|
@@ -180,9 +197,10 @@ async def theme_condensation(
|
|
|
180
197
|
themes_df: pd.DataFrame,
|
|
181
198
|
llm: Runnable,
|
|
182
199
|
question: str,
|
|
183
|
-
batch_size: int =
|
|
200
|
+
batch_size: int = 100,
|
|
184
201
|
prompt_template: str | Path | PromptTemplate = "theme_condensation",
|
|
185
202
|
system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
|
|
203
|
+
**kwargs,
|
|
186
204
|
) -> pd.DataFrame:
|
|
187
205
|
"""Condense and combine similar themes identified from survey responses.
|
|
188
206
|
|
|
@@ -195,7 +213,7 @@ async def theme_condensation(
|
|
|
195
213
|
llm (Runnable): Language model instance to use for theme condensation.
|
|
196
214
|
question (str): The survey question.
|
|
197
215
|
batch_size (int, optional): Number of themes to process in each batch.
|
|
198
|
-
Defaults to
|
|
216
|
+
Defaults to 100.
|
|
199
217
|
prompt_template (str | Path | PromptTemplate, optional): Template for structuring
|
|
200
218
|
the prompt to the LLM. Can be a string identifier, path to template file,
|
|
201
219
|
or PromptTemplate instance. Defaults to "theme_condensation".
|
|
@@ -206,17 +224,43 @@ async def theme_condensation(
|
|
|
206
224
|
pd.DataFrame: DataFrame containing the condensed themes, where similar topics
|
|
207
225
|
have been combined into broader categories.
|
|
208
226
|
"""
|
|
209
|
-
logger.info(f"Running theme condensation on {len(themes_df)}
|
|
227
|
+
logger.info(f"Running theme condensation on {len(themes_df)} responses")
|
|
210
228
|
themes_df["response_id"] = range(len(themes_df))
|
|
211
|
-
|
|
229
|
+
|
|
230
|
+
n_themes = themes_df.shape[0]
|
|
231
|
+
while n_themes > batch_size:
|
|
232
|
+
logger.info(
|
|
233
|
+
f"{n_themes} larger than batch size, using recursive theme condensation"
|
|
234
|
+
)
|
|
235
|
+
themes_df = await batch_and_run(
|
|
236
|
+
themes_df,
|
|
237
|
+
prompt_template,
|
|
238
|
+
llm,
|
|
239
|
+
batch_size=batch_size,
|
|
240
|
+
question=question,
|
|
241
|
+
system_prompt=system_prompt,
|
|
242
|
+
**kwargs,
|
|
243
|
+
)
|
|
244
|
+
themes_df = themes_df.sample(frac=1).reset_index(drop=True)
|
|
245
|
+
themes_df["response_id"] = range(len(themes_df))
|
|
246
|
+
if len(themes_df) == n_themes:
|
|
247
|
+
logger.info("Themes no longer being condensed")
|
|
248
|
+
break
|
|
249
|
+
n_themes = themes_df.shape[0]
|
|
250
|
+
|
|
251
|
+
themes_df = await batch_and_run(
|
|
212
252
|
themes_df,
|
|
213
253
|
prompt_template,
|
|
214
254
|
llm,
|
|
215
255
|
batch_size=batch_size,
|
|
216
256
|
question=question,
|
|
217
257
|
system_prompt=system_prompt,
|
|
258
|
+
**kwargs,
|
|
218
259
|
)
|
|
219
260
|
|
|
261
|
+
logger.info(f"Final number of condensed themes: {themes_df.shape[0]}")
|
|
262
|
+
return themes_df
|
|
263
|
+
|
|
220
264
|
|
|
221
265
|
async def theme_refinement(
|
|
222
266
|
condensed_themes_df: pd.DataFrame,
|
|
@@ -257,10 +301,10 @@ async def theme_refinement(
|
|
|
257
301
|
transposes the output for improved readability and easier downstream
|
|
258
302
|
processing.
|
|
259
303
|
"""
|
|
260
|
-
logger.info(f"Running
|
|
304
|
+
logger.info(f"Running theme refinement on {len(condensed_themes_df)} responses")
|
|
261
305
|
condensed_themes_df["response_id"] = range(len(condensed_themes_df))
|
|
262
306
|
|
|
263
|
-
def
|
|
307
|
+
def transpose_refined_themes(refined_themes: pd.DataFrame):
|
|
264
308
|
"""Transpose topics for increased legibility."""
|
|
265
309
|
transposed_df = pd.DataFrame(
|
|
266
310
|
[refined_themes["topic"].to_numpy()], columns=refined_themes["topic_id"]
|
|
@@ -275,7 +319,74 @@ async def theme_refinement(
|
|
|
275
319
|
question=question,
|
|
276
320
|
system_prompt=system_prompt,
|
|
277
321
|
)
|
|
278
|
-
return
|
|
322
|
+
return transpose_refined_themes(refined_themes)
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
async def theme_target_alignment(
|
|
326
|
+
refined_themes_df: pd.DataFrame,
|
|
327
|
+
llm: Runnable,
|
|
328
|
+
question: str,
|
|
329
|
+
target_n_themes: int = 10,
|
|
330
|
+
batch_size: int = 10000,
|
|
331
|
+
prompt_template: str | Path | PromptTemplate = "theme_target_alignment",
|
|
332
|
+
system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
|
|
333
|
+
) -> pd.DataFrame:
|
|
334
|
+
"""Align themes to target number using an LLM.
|
|
335
|
+
|
|
336
|
+
This function processes refined themes to consolidate them into a target number of
|
|
337
|
+
distinct categories while preserving all significant details and perspectives.
|
|
338
|
+
It transforms the output format for improved readability by transposing the
|
|
339
|
+
results into a single-row DataFrame where columns represent individual themes.
|
|
340
|
+
|
|
341
|
+
Args:
|
|
342
|
+
refined_themes_df (pd.DataFrame): DataFrame containing the refined themes
|
|
343
|
+
from the previous pipeline stage.
|
|
344
|
+
llm (Runnable): Language model instance to use for theme alignment.
|
|
345
|
+
question (str): The survey question.
|
|
346
|
+
target_n_themes (int, optional): Target number of themes to consolidate to.
|
|
347
|
+
Defaults to 10.
|
|
348
|
+
batch_size (int, optional): Number of themes to process in each batch.
|
|
349
|
+
Defaults to 10000.
|
|
350
|
+
prompt_template (str | Path | PromptTemplate, optional): Template for structuring
|
|
351
|
+
the prompt to the LLM. Can be a string identifier, path to template file,
|
|
352
|
+
or PromptTemplate instance. Defaults to "theme_target_alignment".
|
|
353
|
+
system_prompt (str): System prompt to guide the LLM's behavior.
|
|
354
|
+
Defaults to CONSULTATION_SYSTEM_PROMPT.
|
|
355
|
+
|
|
356
|
+
Returns:
|
|
357
|
+
pd.DataFrame: A single-row DataFrame where:
|
|
358
|
+
- Each column represents a unique theme (identified by topic_id)
|
|
359
|
+
- The values contain the aligned theme descriptions
|
|
360
|
+
- The format is optimized for subsequent theme mapping operations
|
|
361
|
+
|
|
362
|
+
Note:
|
|
363
|
+
The function adds sequential response_ids to the input DataFrame and
|
|
364
|
+
transposes the output for improved readability and easier downstream
|
|
365
|
+
processing.
|
|
366
|
+
"""
|
|
367
|
+
logger.info(
|
|
368
|
+
f"Running theme target alignment on {len(refined_themes_df.columns)} themes compressing to {target_n_themes} themes"
|
|
369
|
+
)
|
|
370
|
+
refined_themes_df = refined_themes_df.T.rename(columns={0: "topic"})
|
|
371
|
+
refined_themes_df["response_id"] = range(len(refined_themes_df))
|
|
372
|
+
|
|
373
|
+
def transpose_aligned_themes(aligned_themes: pd.DataFrame):
|
|
374
|
+
"""Transpose topics for increased legibility."""
|
|
375
|
+
transposed_df = pd.DataFrame(
|
|
376
|
+
[aligned_themes["topic"].to_numpy()], columns=aligned_themes["topic_id"]
|
|
377
|
+
)
|
|
378
|
+
return transposed_df
|
|
379
|
+
|
|
380
|
+
aligned_themes = await batch_and_run(
|
|
381
|
+
refined_themes_df,
|
|
382
|
+
prompt_template,
|
|
383
|
+
llm,
|
|
384
|
+
batch_size=batch_size,
|
|
385
|
+
question=question,
|
|
386
|
+
system_prompt=system_prompt,
|
|
387
|
+
target_n_themes=target_n_themes,
|
|
388
|
+
)
|
|
389
|
+
return transpose_aligned_themes(aligned_themes)
|
|
279
390
|
|
|
280
391
|
|
|
281
392
|
async def theme_mapping(
|
|
@@ -174,7 +174,6 @@ def generate_prompts(
|
|
|
174
174
|
to the prompt template as the 'responses' variable.
|
|
175
175
|
"""
|
|
176
176
|
batched_prompts = []
|
|
177
|
-
|
|
178
177
|
for df in response_dfs:
|
|
179
178
|
prompt = prompt_template.format(
|
|
180
179
|
responses=df.to_dict(orient="records"), **kwargs
|
|
@@ -219,7 +218,7 @@ async def call_llm(
|
|
|
219
218
|
failed_ids: set = set()
|
|
220
219
|
|
|
221
220
|
@retry(
|
|
222
|
-
wait=wait_random_exponential(min=1, max=
|
|
221
|
+
wait=wait_random_exponential(min=1, max=20),
|
|
223
222
|
stop=stop_after_attempt(6),
|
|
224
223
|
before=before.before_log(logger=logger, log_level=logging.DEBUG),
|
|
225
224
|
reraise=True,
|
|
@@ -275,7 +274,7 @@ def check_response_integrity(
|
|
|
275
274
|
if returned_ids_set != response_ids_set:
|
|
276
275
|
logger.info("Failed integrity check")
|
|
277
276
|
logger.info(
|
|
278
|
-
f"Present in original but not returned from LLM: {response_ids_set - returned_ids_set}. Returned in LLM but not present in original: {returned_ids_set -response_ids_set}"
|
|
277
|
+
f"Present in original but not returned from LLM: {response_ids_set - returned_ids_set}. Returned in LLM but not present in original: {returned_ids_set - response_ids_set}"
|
|
279
278
|
)
|
|
280
279
|
return False
|
|
281
280
|
return True
|
|
@@ -6,7 +6,7 @@ Your job is to analyze each response to the QUESTION below and decide:
|
|
|
6
6
|
POSITION - is the response agreeing or disagreeing or is it unclear about the change being proposed in the question.
|
|
7
7
|
Choose one from [agreement, disagreement, unclear]
|
|
8
8
|
|
|
9
|
-
|
|
9
|
+
The final output should be in the following JSON format:
|
|
10
10
|
|
|
11
11
|
{{"responses": [
|
|
12
12
|
{{
|
|
@@ -20,6 +20,11 @@ You should only return a response in strict json and nothing else. The final out
|
|
|
20
20
|
...
|
|
21
21
|
]}}
|
|
22
22
|
|
|
23
|
+
You MUST include every response ID in the output.
|
|
24
|
+
If the response can not be labelled return empty sections where appropriate but you MUST return an entry
|
|
25
|
+
with the correct response ID for each input object
|
|
26
|
+
|
|
27
|
+
## EXAMPLE
|
|
23
28
|
Example 1:
|
|
24
29
|
Question: \n What are your thoughts on the proposed government changes to the policy about reducing school holidays?
|
|
25
30
|
Response: \n as a parent I have no idea why you would make this change. I guess you were thinking about increasing productivity but any productivity gains would be totally offset by the decrease in family time. \n
|
|
@@ -4,32 +4,27 @@ Below is a question and a list of topics extracted from answers to that question
|
|
|
4
4
|
|
|
5
5
|
Your task is to analyze these topics and produce a refined list that:
|
|
6
6
|
1. Identifies and preserves core themes that appear frequently
|
|
7
|
-
2.
|
|
8
|
-
3.
|
|
9
|
-
4. Ensures the final list represents the full spectrum of viewpoints present in the original data
|
|
7
|
+
2. Combines redundant topics while maintaining nuanced differences
|
|
8
|
+
3. Ensures the final list represents the full spectrum of viewpoints present in the original data
|
|
10
9
|
|
|
11
10
|
Guidelines for Topic Analysis:
|
|
12
11
|
- Begin by identifying distinct concept clusters in the topics
|
|
13
|
-
- When a topic appears only once, evaluate its unique contribution before deciding to merge or preserve it
|
|
14
12
|
- Consider the context of the question when determining topic relevance
|
|
15
13
|
- Look for complementary perspectives that could enrich understanding of the same core concept
|
|
16
|
-
-
|
|
17
|
-
- Maintain granularity where different aspects of the same broader theme offer distinct insights
|
|
18
|
-
|
|
19
|
-
The topics you are analyzing are all extracted from answers with the same position, where "position" means that the answer agrees ("Y") or disagrees ("N") with the question.
|
|
14
|
+
- Consider the key ideas behind themes when merging, don't simply focus on the words used in the label and description
|
|
20
15
|
|
|
21
16
|
For each topic in your output:
|
|
22
17
|
1. Choose a clear, representative label that captures the essence of the combined or preserved topic
|
|
23
|
-
2. Write a
|
|
24
|
-
|
|
25
|
-
|
|
18
|
+
2. Write a concise description that incorporates key insights from all constituent topics, this should only be a single sentence
|
|
19
|
+
|
|
20
|
+
Return at most 30 topics
|
|
26
21
|
|
|
27
22
|
The final output should be in the following JSON format:
|
|
28
23
|
|
|
29
24
|
{{"responses": [
|
|
30
|
-
{{"topic_label": "{{label for condensed topic 1}}", "topic_description": "{{description for condensed topic 1}}"
|
|
31
|
-
{{"topic_label": "{{label for condensed topic 2}}", "topic_description": "{{description for condensed topic 2}}"
|
|
32
|
-
{{"topic_label": "{{label for condensed topic 3}}", "topic_description": "{{description for condensed topic 3}}"
|
|
25
|
+
{{"topic_label": "{{label for condensed topic 1}}", "topic_description": "{{description for condensed topic 1}}"}},
|
|
26
|
+
{{"topic_label": "{{label for condensed topic 2}}", "topic_description": "{{description for condensed topic 2}}"}},
|
|
27
|
+
{{"topic_label": "{{label for condensed topic 3}}", "topic_description": "{{description for condensed topic 3}}"}},
|
|
33
28
|
// Additional topics as necessary
|
|
34
29
|
]}}
|
|
35
30
|
|
|
@@ -39,4 +34,4 @@ The final output should be in the following JSON format:
|
|
|
39
34
|
|
|
40
35
|
[Themes]
|
|
41
36
|
|
|
42
|
-
{responses}
|
|
37
|
+
{responses}
|
|
@@ -1,53 +1,20 @@
|
|
|
1
1
|
{system_prompt}
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
3
|
+
Below is a question and a list of responses to that question.
|
|
4
|
+
|
|
5
|
+
Your task is to analyze the RESPONSES below and extract TOPICS such that:
|
|
6
|
+
1. Each topic summarizes a point of view expressed in the responses
|
|
5
7
|
2. Every distinct and relevant point of view in the responses should be captured by a topic
|
|
6
|
-
3. Each topic has a topic_label which
|
|
8
|
+
3. Each topic has a topic_label which summarizes the topic in a few words
|
|
7
9
|
4. Each topic has a topic_description which gives more detail about the topic in one or two sentences
|
|
8
|
-
5. The position field should just be the sentiment stated, and is either "agreement" or "disagreement"
|
|
10
|
+
5. The position field should just be the sentiment stated, and is either "agreement" or "disagreement" or "unclear"
|
|
9
11
|
6. There should be no duplicate topics
|
|
10
12
|
|
|
11
13
|
The topics identified will be used by policy makers to understand what the public like and don't like about the proposals.
|
|
12
14
|
|
|
13
15
|
Here is an example of how to extract topics from some responses
|
|
14
16
|
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
POSITION
|
|
18
|
-
disagreement
|
|
19
|
-
|
|
20
|
-
QUESTION
|
|
21
|
-
What are your views on the proposed change by the government to introduce a 2% tax on fast food meat products.
|
|
22
|
-
|
|
23
|
-
RESPONSES
|
|
24
|
-
[
|
|
25
|
-
{{"response": "I wish the government would stop interfering in the lves of its citizens. It only ever makes things worse. This change will just cost us all more money, and especially poorer people", "position": "disagreement"}},
|
|
26
|
-
{{"response": "Even though it will make people eat more healthier, I beleibe the government should interfer less and not more!", "position": "disagreement"}},
|
|
27
|
-
{{"response": "I hate grapes", "position": "disagreement"}},
|
|
28
|
-
]
|
|
29
|
-
|
|
30
|
-
OUTPUTS
|
|
31
|
-
|
|
32
|
-
{{"responses": [
|
|
33
|
-
{{
|
|
34
|
-
"topic_label": "Government overreach",
|
|
35
|
-
"topic_description": "Some people thought the proposals would result in government interfering too much with citizen's lives",
|
|
36
|
-
"position": "disagreement"
|
|
37
|
-
}},
|
|
38
|
-
{{
|
|
39
|
-
"topic_label": "Regressive change",
|
|
40
|
-
"topic_description": "Some people thought the change would have a larger negative impact on poorer people",
|
|
41
|
-
"position": "disagreement"
|
|
42
|
-
}},
|
|
43
|
-
{{
|
|
44
|
-
"topic_label": "Health",
|
|
45
|
-
"topic_description": "Some people thought the change would result in people eating healthier diets",
|
|
46
|
-
"position": "disagreement"
|
|
47
|
-
}},
|
|
48
|
-
]}}
|
|
49
|
-
|
|
50
|
-
You should only return a response in strict json and nothing else. The final output should be in the following JSON format:
|
|
17
|
+
The final output should be in the following JSON format:
|
|
51
18
|
|
|
52
19
|
{{"responses": [
|
|
53
20
|
{{
|
|
@@ -63,6 +30,39 @@ You should only return a response in strict json and nothing else. The final out
|
|
|
63
30
|
// Additional topics as necessary
|
|
64
31
|
]}}
|
|
65
32
|
|
|
33
|
+
## EXAMPLE
|
|
34
|
+
|
|
35
|
+
QUESTION
|
|
36
|
+
What are your views on the proposed change by the government to introduce a 2% tax on fast food meat products.
|
|
37
|
+
|
|
38
|
+
RESPONSES
|
|
39
|
+
[
|
|
40
|
+
{{"response": "I wish the government would stop interfering in the lves of its citizens. It only ever makes things worse. This change will just cost us all more money, and especially poorer people", "position": "disagreement"}},
|
|
41
|
+
{{"response": "Even though it will make people eat more healthier, I beleibe the government should interfer less and not more!", "position": "disagreement"}},
|
|
42
|
+
{{"response": "I hate grapes", "position": "disagreement"}},
|
|
43
|
+
]
|
|
44
|
+
|
|
45
|
+
OUTPUTS
|
|
46
|
+
|
|
47
|
+
{{"responses": [
|
|
48
|
+
{{
|
|
49
|
+
"topic_label": "Government overreach",
|
|
50
|
+
"topic_description": "The proposals would result in government interfering too much with citizen's lives",
|
|
51
|
+
"position": "disagreement"
|
|
52
|
+
}},
|
|
53
|
+
{{
|
|
54
|
+
"topic_label": "Regressive change",
|
|
55
|
+
"topic_description": "The change would have a larger negative impact on poorer people",
|
|
56
|
+
"position": "disagreement"
|
|
57
|
+
}},
|
|
58
|
+
{{
|
|
59
|
+
"topic_label": "Health",
|
|
60
|
+
"topic_description": "The change would result in people eating healthier diets",
|
|
61
|
+
"position": "disagreement"
|
|
62
|
+
}},
|
|
63
|
+
]}}
|
|
64
|
+
|
|
65
|
+
|
|
66
66
|
QUESTION:
|
|
67
67
|
{question}
|
|
68
68
|
|
|
@@ -20,6 +20,9 @@ Your task is to analyze each response and decide which topics are present. Guide
|
|
|
20
20
|
- If a response contains both positive and negative statements about a topic within the same response, choose the stance that receives more emphasis or appears more central to the argument
|
|
21
21
|
- The order of reasons and stances must align with the order of labels (e.g., stance_a applies to topic_a)
|
|
22
22
|
|
|
23
|
+
You MUST include every response ID in the output.
|
|
24
|
+
If the response can not be labelled return empty sections where appropriate but you MUST return an entry
|
|
25
|
+
with the correct response ID for each input object
|
|
23
26
|
|
|
24
27
|
The final output should be in the following JSON format:
|
|
25
28
|
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
{system_prompt}
|
|
2
2
|
|
|
3
|
-
You are tasked with refining and neutralizing a list of topics generated from responses to a question.
|
|
3
|
+
You are tasked with refining and neutralizing a list of topics generated from responses to a question.
|
|
4
|
+
Your goal is to transform opinionated topics into neutral, well-structured, and distinct topics while preserving the essential information.
|
|
4
5
|
|
|
5
6
|
## Input
|
|
6
7
|
You will receive a list of OPINIONATED TOPICS. These topics explicitly tie opinions to whether a person agrees or disagrees with the question.
|
|
@@ -60,7 +61,7 @@ Return your output in the following JSON format:
|
|
|
60
61
|
}}
|
|
61
62
|
|
|
62
63
|
|
|
63
|
-
##
|
|
64
|
+
## EXAMPLE
|
|
64
65
|
|
|
65
66
|
OPINIONATED TOPIC:
|
|
66
67
|
"Economic impact: Many respondents who support the policy believe it will create jobs and boost the economy, it could raise GDP by 2%."
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
{system_prompt}
|
|
2
|
+
Input: You will receive a JSON array of themes, where each theme contains a description of a topic or concept.
|
|
3
|
+
|
|
4
|
+
Goal: Consolidate these themes into approximately {target_n_themes} distinct categories by:
|
|
5
|
+
1. Identifying and combining similar or overlapping themes
|
|
6
|
+
2. Preserving all significant details and perspectives
|
|
7
|
+
3. Creating clear, comprehensive descriptions for each merged theme
|
|
8
|
+
|
|
9
|
+
Requirements:
|
|
10
|
+
- Each consolidated theme should capture all relevant information from its source themes
|
|
11
|
+
- Final descriptions should be concise but thorough
|
|
12
|
+
- The merged themes should be distinct from each other with minimal overlap
|
|
13
|
+
|
|
14
|
+
Return your output in the following JSON format:
|
|
15
|
+
|
|
16
|
+
{{
|
|
17
|
+
"responses": [
|
|
18
|
+
{{"topic_id": "A", "topic": "{{topic label 1}}: {{topic description 1}}"}},
|
|
19
|
+
{{"topic_id": "B", "topic": "{{topic label 2}}: {{topic description 2}}"}},
|
|
20
|
+
{{"topic_id": "C", "topic": "{{topic label 3}}: {{topic description 3}}"}},
|
|
21
|
+
// Additional topics as necessary
|
|
22
|
+
]
|
|
23
|
+
}}
|
|
24
|
+
|
|
25
|
+
Themes to analyze:
|
|
26
|
+
{responses}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: themefinder
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.5.2
|
|
4
4
|
Summary: A topic modelling Python package designed for analysing one-to-many question-answer data eg free-text survey responses.
|
|
5
5
|
License: MIT
|
|
6
6
|
Author: i.AI
|
|
@@ -128,6 +128,9 @@ ThemeFinder's pipeline consists of five distinct stages, each utilizing a specia
|
|
|
128
128
|
- Leverages standardisation prompts to normalise theme descriptions
|
|
129
129
|
- Creates clear, consistent theme definitions through structured refinement
|
|
130
130
|
|
|
131
|
+
### Theme target alignment
|
|
132
|
+
- Optional step to consolidate themes down to a target number
|
|
133
|
+
|
|
131
134
|
### Theme mapping
|
|
132
135
|
- Utilizes classification prompts to map individual responses to refined themes
|
|
133
136
|
- Supports multiple theme assignments per response through detailed analysis
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
themefinder/__init__.py,sha256=p6QoCgA-BYWljk8yPOeTgkNcN5m_gA_o3Q86Eh0QjSM,327
|
|
2
|
+
themefinder/core.py,sha256=pDm6HTAbkk382THIHuFBN2qI5UIMDKJsfpsP2nBzUIg,17541
|
|
3
|
+
themefinder/llm_batch_processor.py,sha256=SDDeMJeX1J3u7FGFddRhVSxty6U8lFVXwG4eNI_0C5o,12573
|
|
4
|
+
themefinder/prompts/consultation_system_prompt.txt,sha256=_A07oY_an4hnRx-9pQ0y-TLXJz0dd8vDI-MZne7Mdb4,89
|
|
5
|
+
themefinder/prompts/sentiment_analysis.txt,sha256=e3DcUKga6pSFcfeo2TAq8x9LXk0YDV-D7P2gtymcyuc,1832
|
|
6
|
+
themefinder/prompts/theme_condensation.txt,sha256=GFwwQO_oZHhqhPnAfTn887fDzAIVxKoCyj0hXagyBIU,1645
|
|
7
|
+
themefinder/prompts/theme_generation.txt,sha256=JMXuNojxdSAcxPRU1Jg12Xunv_dX4hNvXYU2pXMWTAw,2500
|
|
8
|
+
themefinder/prompts/theme_mapping.txt,sha256=_7AUGraX4LrnZywO3RiG58NkGbM9vaPwGI1r0dFNGik,2297
|
|
9
|
+
themefinder/prompts/theme_refinement.txt,sha256=HCgvWAoz-cpFgjX_QS_VVY0X06d4ds0ekBgcoWyFyfg,3360
|
|
10
|
+
themefinder/prompts/theme_target_alignment.txt,sha256=-_ghr4--KAN6Tz8ExO9s2IXvI6pjWaEA_nG5L83GV5I,1035
|
|
11
|
+
themefinder/themefinder_logging.py,sha256=n5SUQovEZLC4skEbxicjz_fOGF9mOk3S-Wpj5uXsaL8,314
|
|
12
|
+
themefinder-0.5.2.dist-info/LICENCE,sha256=C9ULIN0ctF60ZxUWH_hw1H434bDLg49Z-Qzn6BUHgqs,1060
|
|
13
|
+
themefinder-0.5.2.dist-info/METADATA,sha256=RBCyjI9-oU6hoC9MRcf9HEs4Sb_regeK8aoDAy_AQco,6431
|
|
14
|
+
themefinder-0.5.2.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
|
|
15
|
+
themefinder-0.5.2.dist-info/RECORD,,
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
themefinder/__init__.py,sha256=p6QoCgA-BYWljk8yPOeTgkNcN5m_gA_o3Q86Eh0QjSM,327
|
|
2
|
-
themefinder/core.py,sha256=oFwy4ZTDR7H63QHh-0QOv1OCEUL9XMOUSRZZtj7Pl_4,13012
|
|
3
|
-
themefinder/llm_batch_processor.py,sha256=TnpArgqHk4QtBf6FSMLZKGs692Vx6Oy4YA1Ci1POiWQ,12573
|
|
4
|
-
themefinder/prompts/consultation_system_prompt.txt,sha256=_A07oY_an4hnRx-9pQ0y-TLXJz0dd8vDI-MZne7Mdb4,89
|
|
5
|
-
themefinder/prompts/sentiment_analysis.txt,sha256=hd6ZvXuMttOENwxtmiBJ-UHUE99sEBW46rBbXWk990c,1681
|
|
6
|
-
themefinder/prompts/theme_condensation.txt,sha256=3Xz5apHLirBtzayHU-d8H2U2xbs-oDMdSAMOjzN1nFw,2289
|
|
7
|
-
themefinder/prompts/theme_generation.txt,sha256=09m2OKNHbdNAAi5Bifi_jmc53Ktg8JnlBqbMggR1nQw,2679
|
|
8
|
-
themefinder/prompts/theme_mapping.txt,sha256=8qJXDAyXSsM9qt3yTjsoaiN2F3RYBGqp-vSJ9ZLfU1Q,2091
|
|
9
|
-
themefinder/prompts/theme_refinement.txt,sha256=QB0w_veTAJk533rqVyh42OHNfdZ9BmKTxoHO4UMCVwo,3359
|
|
10
|
-
themefinder/themefinder_logging.py,sha256=n5SUQovEZLC4skEbxicjz_fOGF9mOk3S-Wpj5uXsaL8,314
|
|
11
|
-
themefinder-0.3.1.dist-info/LICENCE,sha256=C9ULIN0ctF60ZxUWH_hw1H434bDLg49Z-Qzn6BUHgqs,1060
|
|
12
|
-
themefinder-0.3.1.dist-info/METADATA,sha256=irff0k8bE-BkaP1RTTMs3idFVPr_1tyzvUXxCPQbsIc,6341
|
|
13
|
-
themefinder-0.3.1.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
|
|
14
|
-
themefinder-0.3.1.dist-info/RECORD,,
|
|
File without changes
|