themefinder 0.5.2__tar.gz → 0.5.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of themefinder might be problematic. Click here for more details.
- {themefinder-0.5.2 → themefinder-0.5.4}/PKG-INFO +4 -3
- {themefinder-0.5.2 → themefinder-0.5.4}/pyproject.toml +2 -2
- {themefinder-0.5.2 → themefinder-0.5.4}/src/themefinder/core.py +3 -3
- themefinder-0.5.4/src/themefinder/prompts/theme_condensation.txt +50 -0
- {themefinder-0.5.2 → themefinder-0.5.4}/src/themefinder/prompts/theme_mapping.txt +3 -2
- {themefinder-0.5.2 → themefinder-0.5.4}/src/themefinder/prompts/theme_refinement.txt +9 -6
- themefinder-0.5.2/src/themefinder/prompts/theme_condensation.txt +0 -37
- {themefinder-0.5.2 → themefinder-0.5.4}/LICENCE +0 -0
- {themefinder-0.5.2 → themefinder-0.5.4}/README.md +0 -0
- {themefinder-0.5.2 → themefinder-0.5.4}/src/themefinder/__init__.py +0 -0
- {themefinder-0.5.2 → themefinder-0.5.4}/src/themefinder/llm_batch_processor.py +0 -0
- {themefinder-0.5.2 → themefinder-0.5.4}/src/themefinder/prompts/consultation_system_prompt.txt +0 -0
- {themefinder-0.5.2 → themefinder-0.5.4}/src/themefinder/prompts/sentiment_analysis.txt +0 -0
- {themefinder-0.5.2 → themefinder-0.5.4}/src/themefinder/prompts/theme_generation.txt +0 -0
- {themefinder-0.5.2 → themefinder-0.5.4}/src/themefinder/prompts/theme_target_alignment.txt +0 -0
- {themefinder-0.5.2 → themefinder-0.5.4}/src/themefinder/themefinder_logging.py +0 -0
|
@@ -1,17 +1,18 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: themefinder
|
|
3
|
-
Version: 0.5.2
|
|
3
|
+
Version: 0.5.4
|
|
4
4
|
Summary: A topic modelling Python package designed for analysing one-to-many question-answer data eg free-text survey responses.
|
|
5
5
|
License: MIT
|
|
6
6
|
Author: i.AI
|
|
7
7
|
Author-email: packages@cabinetoffice.gov.uk
|
|
8
|
-
Requires-Python: >=3.
|
|
8
|
+
Requires-Python: >=3.10,<3.13
|
|
9
9
|
Classifier: Intended Audience :: Developers
|
|
10
10
|
Classifier: Intended Audience :: Science/Research
|
|
11
11
|
Classifier: License :: OSI Approved :: MIT License
|
|
12
12
|
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
15
|
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
-
Classifier: Programming Language :: Python :: 3.13
|
|
15
16
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
16
17
|
Classifier: Topic :: Text Processing :: Linguistic
|
|
17
18
|
Requires-Dist: boto3 (>=1.29,<2.0)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "themefinder"
|
|
3
|
-
version = "0.5.2"
|
|
3
|
+
version = "0.5.4"
|
|
4
4
|
description = "A topic modelling Python package designed for analysing one-to-many question-answer data eg free-text survey responses."
|
|
5
5
|
authors = ["i.AI <packages@cabinetoffice.gov.uk>"]
|
|
6
6
|
packages = [{include = "themefinder", from = "src"}]
|
|
@@ -17,7 +17,7 @@ classifiers = [
|
|
|
17
17
|
|
|
18
18
|
|
|
19
19
|
[tool.poetry.dependencies]
|
|
20
|
-
python = ">=3.
|
|
20
|
+
python = ">=3.10,<3.13"
|
|
21
21
|
langchain = "*"
|
|
22
22
|
langchain-openai = "0.1.17"
|
|
23
23
|
pandas = "^2.2.2"
|
|
@@ -95,7 +95,7 @@ async def find_themes(
|
|
|
95
95
|
return {
|
|
96
96
|
"question": question,
|
|
97
97
|
"sentiment": sentiment_df,
|
|
98
|
-
"
|
|
98
|
+
"themes": theme_df,
|
|
99
99
|
"condensed_themes": condensed_theme_df,
|
|
100
100
|
"refined_themes": refined_theme_df,
|
|
101
101
|
"mapping": mapping_df,
|
|
@@ -197,7 +197,7 @@ async def theme_condensation(
|
|
|
197
197
|
themes_df: pd.DataFrame,
|
|
198
198
|
llm: Runnable,
|
|
199
199
|
question: str,
|
|
200
|
-
batch_size: int =
|
|
200
|
+
batch_size: int = 75,
|
|
201
201
|
prompt_template: str | Path | PromptTemplate = "theme_condensation",
|
|
202
202
|
system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
|
|
203
203
|
**kwargs,
|
|
@@ -224,7 +224,7 @@ async def theme_condensation(
|
|
|
224
224
|
pd.DataFrame: DataFrame containing the condensed themes, where similar topics
|
|
225
225
|
have been combined into broader categories.
|
|
226
226
|
"""
|
|
227
|
-
logger.info(f"Running theme condensation on {len(themes_df)}
|
|
227
|
+
logger.info(f"Running theme condensation on {len(themes_df)} themes")
|
|
228
228
|
themes_df["response_id"] = range(len(themes_df))
|
|
229
229
|
|
|
230
230
|
n_themes = themes_df.shape[0]
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
{system_prompt}
|
|
2
|
+
|
|
3
|
+
Below is a question and a list of topics extracted from answers to that question. Each topic has a topic_label, topic_description, and may have a source_topic_count field indicating how many original topics it represents.
|
|
4
|
+
|
|
5
|
+
Your task is to analyze these topics and produce a refined list that:
|
|
6
|
+
1. Identifies and preserves core themes that appear frequently
|
|
7
|
+
2. Combines redundant topics while maintaining nuanced differences
|
|
8
|
+
3. Ensures the final list represents the full spectrum of viewpoints present in the original data
|
|
9
|
+
4. Tracks the total number of original topics combined into each new topic
|
|
10
|
+
|
|
11
|
+
Guidelines for Topic Analysis:
|
|
12
|
+
- Begin by identifying distinct concept clusters in the topics
|
|
13
|
+
- Consider the context of the question when determining topic relevance
|
|
14
|
+
- Look for complementary perspectives that could enrich understanding of the same core concept
|
|
15
|
+
- Consider the key ideas behind themes when merging, don't simply focus on the words used in the label and description
|
|
16
|
+
- When combining topics:
|
|
17
|
+
* For topics without a source_topic_count field, assume count = 1
|
|
18
|
+
* For topics with source_topic_count, use their existing count
|
|
19
|
+
* The new topic's count should be the sum of all combined topics' counts
|
|
20
|
+
|
|
21
|
+
For each topic in your output:
|
|
22
|
+
1. Choose a clear, representative label that captures the essence of the combined or preserved topic
|
|
23
|
+
2. Write a concise description that incorporates key insights from all constituent topics, this should only be a single sentence
|
|
24
|
+
3. Include the total count of original topics combined by summing the source_topic_counts of merged topics (or 1 for topics without a count)
|
|
25
|
+
|
|
26
|
+
The final output should be in the following JSON format:
|
|
27
|
+
|
|
28
|
+
{{"responses": [
|
|
29
|
+
{{"topic_label": "{{label for condensed topic 1}}",
|
|
30
|
+
"topic_description": "{{description for condensed topic 1}}",
|
|
31
|
+
"source_topic_count": {{sum of source_topic_counts from combined topics}}
|
|
32
|
+
}},
|
|
33
|
+
{{"topic_label": "{{label for condensed topic 2}}",
|
|
34
|
+
"topic_description": "{{description for condensed topic 2}}",
|
|
35
|
+
"source_topic_count": {{sum of source_topic_counts from combined topics}}
|
|
36
|
+
}},
|
|
37
|
+
{{"topic_label": "{{label for condensed topic 3}}",
|
|
38
|
+
"topic_description": "{{description for condensed topic 3}}",
|
|
39
|
+
"source_topic_count": {{sum of source_topic_counts from combined topics}}
|
|
40
|
+
}},
|
|
41
|
+
// Additional topics as necessary
|
|
42
|
+
]}}
|
|
43
|
+
|
|
44
|
+
[Question]
|
|
45
|
+
|
|
46
|
+
{question}
|
|
47
|
+
|
|
48
|
+
[Themes]
|
|
49
|
+
|
|
50
|
+
{responses}
|
|
@@ -12,7 +12,7 @@ You will be given:
|
|
|
12
12
|
Your task is to analyze each response and decide which topics are present. Guidelines:
|
|
13
13
|
- You can only assign a response to a topic in the provided TOPIC LIST
|
|
14
14
|
- A response doesn't need to exactly match the language used in the TOPIC LIST, it should be considered a match if it expresses a similar sentiment.
|
|
15
|
-
- You must use the alphabetic 'topic_id' to indicate which topic you have assigned.
|
|
15
|
+
- You must use the alphabetic 'topic_id' to indicate which topic you have assigned. Do not use the full topic description
|
|
16
16
|
- Each response can be assigned to multiple topics if it matches more than one topic from the TOPIC LIST.
|
|
17
17
|
- There is no limit on how many topics can be assigned to a response.
|
|
18
18
|
- For each assignment provide a single rationale for why you have chosen the label.
|
|
@@ -22,7 +22,8 @@ Your task is to analyze each response and decide which topics are present. Guide
|
|
|
22
22
|
|
|
23
23
|
You MUST include every response ID in the output.
|
|
24
24
|
If the response cannot be labelled, return empty sections where appropriate, but you MUST return an entry
|
|
25
|
-
with the correct response ID for each input object
|
|
25
|
+
with the correct response ID for each input object.
|
|
26
|
+
You must only return the alphabetic topic_ids in the labels section.
|
|
26
27
|
|
|
27
28
|
The final output should be in the following JSON format:
|
|
28
29
|
|
|
@@ -53,9 +53,9 @@ You will produce a list of NEUTRAL TOPICS based on the input. Each neutral topic
|
|
|
53
53
|
Return your output in the following JSON format:
|
|
54
54
|
{{
|
|
55
55
|
"responses": [
|
|
56
|
-
{{"topic_id": "A", "topic": "{{topic label 1}}: {{topic description 1}}"}},
|
|
57
|
-
{{"topic_id": "B", "topic": "{{topic label 2}}: {{topic description 2}}"}},
|
|
58
|
-
{{"topic_id": "C", "topic": "{{topic label 3}}: {{topic description 3}}"}},
|
|
56
|
+
{{"topic_id": "A", "topic": "{{topic label 1}}: {{topic description 1}}", "source_topic_count": {{count1}}}},
|
|
57
|
+
{{"topic_id": "B", "topic": "{{topic label 2}}: {{topic description 2}}", "source_topic_count": {{count2}}}},
|
|
58
|
+
{{"topic_id": "C", "topic": "{{topic label 3}}: {{topic description 3}}", "source_topic_count": {{count3}}}},
|
|
59
59
|
// Additional topics as necessary
|
|
60
60
|
]
|
|
61
61
|
}}
|
|
@@ -64,11 +64,14 @@ Return your output in the following JSON format:
|
|
|
64
64
|
## EXAMPLE
|
|
65
65
|
|
|
66
66
|
OPINIONATED TOPIC:
|
|
67
|
-
"Economic impact: Many respondents who support the policy believe it will create jobs and boost the economy, it could raise GDP by 2%."
|
|
67
|
+
"Economic impact: Many respondents who support the policy believe it will create jobs and boost the economy, it could raise GDP by 2%. [source_topic_count: 15]"
|
|
68
68
|
|
|
69
69
|
NEUTRAL TOPIC:
|
|
70
|
-
|
|
71
|
-
|
|
70
|
+
{{
|
|
71
|
+
"topic_id": "A",
|
|
72
|
+
"topic": "Economic Impact on Employment: The policy's potential effects on job creation and overall economic growth, including potential for a 2% increase in GDP.",
|
|
73
|
+
"source_topic_count": 15
|
|
74
|
+
}}
|
|
72
75
|
|
|
73
76
|
Remember, your goal is to create a list of neutral, informative, and distinct topics that accurately represent the content of the original opinionated topics without any bias or references to responses.
|
|
74
77
|
|
|
@@ -1,37 +0,0 @@
|
|
|
1
|
-
{system_prompt}
|
|
2
|
-
|
|
3
|
-
Below is a question and a list of topics extracted from answers to that question. Each topic has a topic_label and a topic_description.
|
|
4
|
-
|
|
5
|
-
Your task is to analyze these topics and produce a refined list that:
|
|
6
|
-
1. Identifies and preserves core themes that appear frequently
|
|
7
|
-
2. Combines redundant topics while maintaining nuanced differences
|
|
8
|
-
3. Ensures the final list represents the full spectrum of viewpoints present in the original data
|
|
9
|
-
|
|
10
|
-
Guidelines for Topic Analysis:
|
|
11
|
-
- Begin by identifying distinct concept clusters in the topics
|
|
12
|
-
- Consider the context of the question when determining topic relevance
|
|
13
|
-
- Look for complementary perspectives that could enrich understanding of the same core concept
|
|
14
|
-
- Consider the key ideas behind themes when merging, don't simply focus on the words used in the label and description
|
|
15
|
-
|
|
16
|
-
For each topic in your output:
|
|
17
|
-
1. Choose a clear, representative label that captures the essence of the combined or preserved topic
|
|
18
|
-
2. Write a concise description that incorporates key insights from all constituent topics, this should only be a single sentence
|
|
19
|
-
|
|
20
|
-
Return at most 30 topics
|
|
21
|
-
|
|
22
|
-
The final output should be in the following JSON format:
|
|
23
|
-
|
|
24
|
-
{{"responses": [
|
|
25
|
-
{{"topic_label": "{{label for condensed topic 1}}", "topic_description": "{{description for condensed topic 1}}"}},
|
|
26
|
-
{{"topic_label": "{{label for condensed topic 2}}", "topic_description": "{{description for condensed topic 2}}"}},
|
|
27
|
-
{{"topic_label": "{{label for condensed topic 3}}", "topic_description": "{{description for condensed topic 3}}"}},
|
|
28
|
-
// Additional topics as necessary
|
|
29
|
-
]}}
|
|
30
|
-
|
|
31
|
-
[Question]
|
|
32
|
-
|
|
33
|
-
{question}
|
|
34
|
-
|
|
35
|
-
[Themes]
|
|
36
|
-
|
|
37
|
-
{responses}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{themefinder-0.5.2 → themefinder-0.5.4}/src/themefinder/prompts/consultation_system_prompt.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|