themefinder 0.5.3__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of themefinder might be problematic. Click here for more details.

themefinder/models.py ADDED
@@ -0,0 +1,138 @@
1
+ from pydantic import BaseModel, Field, model_validator
2
+
3
+
4
+ def validate_non_empty_fields(model: BaseModel) -> BaseModel:
5
+ """
6
+ Validate that all string fields in the model are non-empty (after stripping)
7
+ and that list fields are not empty.
8
+
9
+ Args:
10
+ model (BaseModel): A Pydantic model instance.
11
+
12
+ Returns:
13
+ BaseModel: The same model if validation passes.
14
+
15
+ Raises:
16
+ ValueError: If any string field is empty or any list field is empty.
17
+ """
18
+ for field_name, value in model.__dict__.items():
19
+ if isinstance(value, str) and not value.strip():
20
+ raise ValueError(f"{field_name} cannot be empty or only whitespace")
21
+ if isinstance(value, list) and not value:
22
+ raise ValueError(f"{field_name} cannot be an empty list")
23
+ return model
24
+
25
+
26
+ def validate_position(model: BaseModel) -> BaseModel:
27
+ """
28
+ Validate that the model's 'position' field is one of the allowed values.
29
+
30
+ Args:
31
+ model (BaseModel): A Pydantic model instance with a 'position' attribute.
32
+
33
+ Returns:
34
+ BaseModel: The same model if validation passes.
35
+
36
+ Raises:
37
+ ValueError: If the 'position' field is not one of the allowed values.
38
+ """
39
+ allowed_positions = {"AGREEMENT", "DISAGREEMENT", "UNCLEAR"}
40
+ if model.position not in allowed_positions:
41
+ raise ValueError(f"position must be one of {allowed_positions}")
42
+ return model
43
+
44
+
45
+ def validate_stances(model: BaseModel) -> BaseModel:
46
+ """
47
+ Validate that every stance in the model's 'stances' field is allowed.
48
+
49
+ Args:
50
+ model (BaseModel): A Pydantic model instance with a 'stances' attribute.
51
+
52
+ Returns:
53
+ BaseModel: The same model if validation passes.
54
+
55
+ Raises:
56
+ ValueError: If any stance is not among the allowed stances.
57
+ """
58
+ allowed_stances = {"POSITIVE", "NEGATIVE"}
59
+ for stance in model.stances:
60
+ if stance not in allowed_stances:
61
+ raise ValueError(f"stances must be one of {allowed_stances}")
62
+ return model
63
+
64
+
65
+ def validate_mapping_stance_lengths(model: BaseModel) -> BaseModel:
66
+ """
67
+ Validate that the lengths of the model's 'stances' and 'labels' fields match.
68
+
69
+ Args:
70
+ model (BaseModel): A Pydantic model instance with 'stances' and 'labels' attributes.
71
+
72
+ Returns:
73
+ BaseModel: The same model if validation passes.
74
+
75
+ Raises:
76
+ ValueError: If the lengths of 'stances' and 'labels' do not match.
77
+ """
78
+ if len(model.stances) != len(model.labels):
79
+ raise ValueError("'stances' must have the same length as 'labels'")
80
+ return model
81
+
82
+
83
+ def validate_mapping_unique_labels(model: BaseModel) -> BaseModel:
84
+ """
85
+ Validate that the model's 'labels' field contains unique values.
86
+
87
+ Args:
88
+ model (BaseModel): A Pydantic model instance with a 'labels' attribute.
89
+
90
+ Returns:
91
+ BaseModel: The same model if validation passes.
92
+
93
+ Raises:
94
+ ValueError: If 'labels' contains duplicate values.
95
+ """
96
+ if len(model.labels) != len(set(model.labels)):
97
+ raise ValueError("'labels' must be unique")
98
+ return model
99
+
100
+
101
+ class SentimentAnalysisOutput(BaseModel):
102
+ response_id: int = Field(gt=0)
103
+ position: str
104
+
105
+ @model_validator(mode="after")
106
+ def run_validations(self) -> "SentimentAnalysisOutput":
107
+ """
108
+ Run all validations for SentimentAnalysisOutput.
109
+
110
+ Validates that:
111
+ - 'position' is one of the allowed values.
112
+ - No fields are empty or only whitespace (for strings) and no lists are empty.
113
+ """
114
+ validate_position(self)
115
+ validate_non_empty_fields(self)
116
+ return self
117
+
118
+
119
+ class ThemeMappingOutput(BaseModel):
120
+ response_id: int = Field(gt=0)
121
+ labels: list[str]
122
+ reasons: list[str]
123
+ stances: list[str]
124
+
125
+ @model_validator(mode="after")
126
+ def run_validations(self) -> "ThemeMappingOutput":
127
+ """
128
+ Run all validations for ThemeMappingOutput.
129
+
130
+ Validates that:
131
+ - 'stances' are only 'POSITIVE' or 'NEGATIVE'.
132
+ - The 'stances' and 'labels' have matching lengths.
133
+ - 'labels' are unique.
134
+ """
135
+ validate_stances(self)
136
+ validate_mapping_stance_lengths(self)
137
+ validate_mapping_unique_labels(self)
138
+ return self
@@ -3,8 +3,8 @@
3
3
  You will receive a list of RESPONSES, each containing a response_id and a response.
4
4
  Your job is to analyze each response to the QUESTION below and decide:
5
5
 
6
- POSITION - is the response agreeing or disagreeing or is it unclear about the change being proposed in the question.
7
- Choose one from [agreement, disagreement, unclear]
6
+ POSITION - is the response AGREEING or DISAGREEING or is it UNCLEAR about the change being proposed in the question.
7
+ Choose one from [AGREEMENT, DISAGREEMENT, UNCLEAR]
8
8
 
9
9
  The final output should be in the following JSON format:
10
10
 
@@ -24,20 +24,23 @@ You MUST include every response ID in the output.
24
24
  If the response can not be labelled return empty sections where appropriate but you MUST return an entry
25
25
  with the correct response ID for each input object
26
26
 
27
+ You MUST pick one of the given POSITION values.
28
+ You MUST NOT return an empty value for the POSITION of a response.
29
+
27
30
  ## EXAMPLE
28
31
  Example 1:
29
32
  Question: \n What are your thoughts on the proposed government changes to the policy about reducing school holidays?
30
33
  Response: \n as a parent I have no idea why you would make this change. I guess you were thinking about increasing productivity but any productivity gains would be totally offset by the decrease in family time. \n
31
34
 
32
35
  Output:
33
- POSITION: disagreement
36
+ POSITION: DISAGREEMENT
34
37
 
35
38
  Example 2:
36
39
  Question: \n What are your thoughts on the proposed government changes to the policy about reducing school holidays?
37
40
  Response: \n I think this is a great idea, our children will learn more if they are in school more \n
38
41
 
39
42
  Output:
40
- POSITION: agreement
43
+ POSITION: AGREEMENT
41
44
 
42
45
  Example 3:
43
46
  Question: \n What are your thoughts on the proposed government changes to the policy about reducing school holidays?
@@ -45,7 +48,7 @@ Response: \n it will be good for our children to be around their friends more bu
45
48
  less time with their children \n
46
49
 
47
50
  Output:
48
- POSITION: unclear
51
+ POSITION: UNCLEAR
49
52
 
50
53
 
51
54
  QUESTION: \n {question}
@@ -1,30 +1,43 @@
1
1
  {system_prompt}
2
2
 
3
- Below is a question and a list of topics extracted from answers to that question. Each topic has a topic_label and a topic_description.
3
+ Below is a question and a list of topics extracted from answers to that question. Each topic has a topic_label, topic_description, and may have a source_topic_count field indicating how many original topics it represents.
4
4
 
5
5
  Your task is to analyze these topics and produce a refined list that:
6
6
  1. Identifies and preserves core themes that appear frequently
7
7
  2. Combines redundant topics while maintaining nuanced differences
8
8
  3. Ensures the final list represents the full spectrum of viewpoints present in the original data
9
+ 4. Tracks the total number of original topics combined into each new topic
9
10
 
10
11
  Guidelines for Topic Analysis:
11
12
  - Begin by identifying distinct concept clusters in the topics
12
13
  - Consider the context of the question when determining topic relevance
13
14
  - Look for complementary perspectives that could enrich understanding of the same core concept
14
15
  - Consider the key ideas behind themes when merging, don't simply focus on the words used in the label and description
16
+ - When combining topics:
17
+ * For topics without a source_topic_count field, assume count = 1
18
+ * For topics with source_topic_count, use their existing count
19
+ * The new topic's count should be the sum of all combined topics' counts
15
20
 
16
21
  For each topic in your output:
17
22
  1. Choose a clear, representative label that captures the essence of the combined or preserved topic
18
23
  2. Write a concise description that incorporates key insights from all constituent topics, this should only be a single sentence
19
-
20
- Return at most 30 topics
24
+ 3. Include the total count of original topics combined by summing the source_topic_counts of merged topics (or 1 for topics without a count)
21
25
 
22
26
  The final output should be in the following JSON format:
23
27
 
24
28
  {{"responses": [
25
- {{"topic_label": "{{label for condensed topic 1}}", "topic_description": "{{description for condensed topic 1}}"}},
26
- {{"topic_label": "{{label for condensed topic 2}}", "topic_description": "{{description for condensed topic 2}}"}},
27
- {{"topic_label": "{{label for condensed topic 3}}", "topic_description": "{{description for condensed topic 3}}"}},
29
+ {{"topic_label": "{{label for condensed topic 1}}",
30
+ "topic_description": "{{description for condensed topic 1}}",
31
+ "source_topic_count": {{sum of source_topic_counts from combined topics}}
32
+ }},
33
+ {{"topic_label": "{{label for condensed topic 2}}",
34
+ "topic_description": "{{description for condensed topic 2}}",
35
+ "source_topic_count": {{sum of source_topic_counts from combined topics}}
36
+ }},
37
+ {{"topic_label": "{{label for condensed topic 3}}",
38
+ "topic_description": "{{description for condensed topic 3}}",
39
+ "source_topic_count": {{sum of source_topic_counts from combined topics}}
40
+ }},
28
41
  // Additional topics as necessary
29
42
  ]}}
30
43
 
@@ -17,7 +17,7 @@ Your task is to analyze each response and decide which topics are present. Guide
17
17
  - There is no limit on how many topics can be assigned to a response.
18
18
  - For each assignment provide a single rationale for why you have chosen the label.
19
19
  - For each topic identified in a response, indicate whether the response expresses a positive or negative stance toward that topic (options: 'POSITIVE' or 'NEGATIVE')
20
- - If a response contains both positive and negative statements about a topic within the same response, choose the stance that receives more emphasis or appears more central to the argument
20
+ - You MUST use either 'POSITIVE' or 'NEGATIVE'
21
21
  - The order of reasons and stances must align with the order of labels (e.g., stance_a applies to topic_a)
22
22
 
23
23
  You MUST include every response ID in the output.
@@ -30,13 +30,13 @@ The final output should be in the following JSON format:
30
30
  {{
31
31
  "responses": [
32
32
  {{
33
- "response_id": "response_id_1",
33
+ "response_id": response_id_1,
34
34
  "reasons": ["reason_a", "reason_b"],
35
35
  "labels": ["topic_a", "topic_b"],
36
36
  "stances": ["stance_a", "stance_b"],
37
37
  }},
38
38
  {{
39
- "response_id": "response_id_2",
39
+ "response_id": response_id_2,
40
40
  "reasons": ["reason_c"],
41
41
  "labels": ["topic_c"],
42
42
  "stances": ["stance_c"],
@@ -1,13 +1,12 @@
1
1
  {system_prompt}
2
2
 
3
- You are tasked with refining and neutralizing a list of topics generated from responses to a question.
4
- Your goal is to transform opinionated topics into neutral, well-structured, and distinct topics while preserving the essential information.
3
+ You are tasked with refining a list of topics generated from responses to a question.
5
4
 
6
5
  ## Input
7
- You will receive a list of OPINIONATED TOPICS. These topics explicitly tie opinions to whether a person agrees or disagrees with the question.
6
+ You will receive a list of TOPICS. These topics explicitly tie opinions to whether a person agrees or disagrees with the question.
8
7
 
9
8
  ## Output
10
- You will produce a list of NEUTRAL TOPICS based on the input. Each neutral topic should have two parts:
9
+ You will produce a list of CLEAR STANCE TOPICS based on the input. Each topic should have two parts:
11
10
  1. A brief, clear topic label (3-7 words)
12
11
  2. A more detailed topic description (1-2 sentences)
13
12
 
@@ -17,10 +16,11 @@ You will produce a list of NEUTRAL TOPICS based on the input. Each neutral topic
17
16
  - Preserve all key information, details and concepts from the original topics.
18
17
  - Ensure no significant details are lost in the refinement process.
19
18
 
20
- 2. Neutrality:
21
- - Remove all language indicating agreement or disagreement.
22
- - Present topics objectively without favoring any particular stance.
23
- - Avoid phrases like "supporters believe" or "critics argue".
19
+ 2. Clear Stance Formulation:
20
+ - Reformulate topics to express a clear stance that can be agreed or disagreed with.
21
+ - Use direct language like "Increased risk of X" rather than "X"
22
+ - Avoid double negatives and ambiguous phrasing.
23
+ - Phrase topics as definitive statements.
24
24
 
25
25
  3. Avoid Response References:
26
26
  - Do not use language that refers to multiple responses or respondents.
@@ -39,40 +39,27 @@ You will produce a list of NEUTRAL TOPICS based on the input. Each neutral topic
39
39
 
40
40
  ## Process
41
41
 
42
- 1. Analyze the OPINIONATED TOPICS to identify key themes and information.
42
+ 1. Analyze the TOPICS to identify key themes and information.
43
43
  2. Group closely related topics together.
44
44
  3. For each group or individual topic:
45
45
  a. Distill the core concept, removing any bias or opinion.
46
46
  b. Create a neutral, concise topic label.
47
47
  c. Write a more detailed description that provides context without taking sides.
48
48
  4. Review the entire list to ensure distinctiveness and adjust as needed.
49
- 5. Double-check that all topics are truly neutral and free of response references.
50
- 6. Assign each output topic a topic_id a single uppercase letters (starting from 'A')
51
- 7. Combine the topic label and description with a colon separator
49
+ 5. Assign each output topic a topic_id consisting of a single uppercase letter (starting from 'A'; for the 27th topic use 'AA')
50
+ 6. Combine the topic label and description with a colon separator
52
51
 
53
52
  Return your output in the following JSON format:
54
53
  {{
55
54
  "responses": [
56
- {{"topic_id": "A", "topic": "{{topic label 1}}: {{topic description 1}}"}},
57
- {{"topic_id": "B", "topic": "{{topic label 2}}: {{topic description 2}}"}},
58
- {{"topic_id": "C", "topic": "{{topic label 3}}: {{topic description 3}}"}},
55
+ {{"topic_id": "A", "topic": "{{topic label 1}}: {{topic description 1}}", "source_topic_count": {{count1}}}},
56
+ {{"topic_id": "B", "topic": "{{topic label 2}}: {{topic description 2}}", "source_topic_count": {{count2}}}},
57
+ {{"topic_id": "C", "topic": "{{topic label 3}}: {{topic description 3}}", "source_topic_count": {{count3}}}},
59
58
  // Additional topics as necessary
60
59
  ]
61
60
  }}
62
61
 
63
62
 
64
- ## EXAMPLE
65
63
 
66
- OPINIONATED TOPIC:
67
- "Economic impact: Many respondents who support the policy believe it will create jobs and boost the economy, it could raise GDP by 2%."
68
-
69
- NEUTRAL TOPIC:
70
- Topic Label: Economic Impact on Employment
71
- Description: The policy's potential effects on job creation and overall economic growth, including potential for a 2% increase in GDP.
72
-
73
- Remember, your goal is to create a list of neutral, informative, and distinct topics that accurately represent the content of the original opinionated topics without any bias or references to responses.
74
-
75
-
76
-
77
- OPINIONATED TOPIC:
64
+ TOPICS:
78
65
  {responses}
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: themefinder
3
- Version: 0.5.3
3
+ Version: 0.6.2
4
4
  Summary: A topic modelling Python package designed for analysing one-to-many question-answer data eg free-text survey responses.
5
5
  License: MIT
6
6
  Author: i.AI
@@ -100,7 +100,7 @@ system_prompt = "You are an AI evaluation tool analyzing survey responses about
100
100
  # Run the function to find themes
101
101
  # We use asyncio to query LLM endpoints asynchronously, so we need to await our function
102
102
  async def main():
103
- result = await find_themes(responses_df, llm, question, system_prompt)
103
+ result = await find_themes(responses_df, llm, question, system_prompt=system_prompt)
104
104
  print(result)
105
105
 
106
106
  if __name__ == "__main__":
@@ -155,3 +155,4 @@ The documentation is [© Crown copyright](https://www.nationalarchives.gov.uk/in
155
155
  ## Feedback
156
156
 
157
157
  If you have feedback on this package, please fill in our [feedback form](https://forms.gle/85xUSMvxGzSSKQ499) or contact us with questions or feedback at packages@cabinetoffice.gov.uk.
158
+
@@ -0,0 +1,16 @@
1
+ themefinder/__init__.py,sha256=wSpW2fEnC4gTzbeNC78nSD3DpJq43-h_H-LK_cqt1cw,327
2
+ themefinder/core.py,sha256=u1DY9gbzn-tFhQS3hrXQ8_1mIbR-iBWYVAdKeAX1BdE,18304
3
+ themefinder/llm_batch_processor.py,sha256=OrFEl1nSi5ninbSZSiE1HFMcYZiQ-NzuYPj_iDcPPoE,19988
4
+ themefinder/models.py,sha256=Y5-okndYwtBO09n_qUlYNVmHRVNEnJviArQZukm8Ox8,4251
5
+ themefinder/prompts/consultation_system_prompt.txt,sha256=_A07oY_an4hnRx-9pQ0y-TLXJz0dd8vDI-MZne7Mdb4,89
6
+ themefinder/prompts/sentiment_analysis.txt,sha256=9-LkdR95JTHXRKUXknAgNf86uVdv6jSaXMf-OtFL9_0,1948
7
+ themefinder/prompts/theme_condensation.txt,sha256=DB4pqUmMpo0OG4AZWGTj0FfLFfjbX6wOMUr44HBxZ1o,2433
8
+ themefinder/prompts/theme_generation.txt,sha256=JMXuNojxdSAcxPRU1Jg12Xunv_dX4hNvXYU2pXMWTAw,2500
9
+ themefinder/prompts/theme_mapping.txt,sha256=YcRGMkuTyTPzPQPtsDY31DUwX60c8AdmdHKw0XeUejQ,2258
10
+ themefinder/prompts/theme_refinement.txt,sha256=hBXwZnNZmhmoEFXpY5OJinp-7xxdoDRf_5LmgrilYgc,2713
11
+ themefinder/prompts/theme_target_alignment.txt,sha256=-_ghr4--KAN6Tz8ExO9s2IXvI6pjWaEA_nG5L83GV5I,1035
12
+ themefinder/themefinder_logging.py,sha256=n5SUQovEZLC4skEbxicjz_fOGF9mOk3S-Wpj5uXsaL8,314
13
+ themefinder-0.6.2.dist-info/LICENCE,sha256=C9ULIN0ctF60ZxUWH_hw1H434bDLg49Z-Qzn6BUHgqs,1060
14
+ themefinder-0.6.2.dist-info/METADATA,sha256=gI9Hp754EjopJQWw0QZIPb9dex8TalPMGnorUEOJlp0,6498
15
+ themefinder-0.6.2.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
16
+ themefinder-0.6.2.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: poetry-core 2.1.1
2
+ Generator: poetry-core 2.1.2
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
@@ -1,15 +0,0 @@
1
- themefinder/__init__.py,sha256=p6QoCgA-BYWljk8yPOeTgkNcN5m_gA_o3Q86Eh0QjSM,327
2
- themefinder/core.py,sha256=B6Du59rPsZbBcP8tkKmXQn6h5vvLN_PZIferPnF3LNY,17538
3
- themefinder/llm_batch_processor.py,sha256=SDDeMJeX1J3u7FGFddRhVSxty6U8lFVXwG4eNI_0C5o,12573
4
- themefinder/prompts/consultation_system_prompt.txt,sha256=_A07oY_an4hnRx-9pQ0y-TLXJz0dd8vDI-MZne7Mdb4,89
5
- themefinder/prompts/sentiment_analysis.txt,sha256=e3DcUKga6pSFcfeo2TAq8x9LXk0YDV-D7P2gtymcyuc,1832
6
- themefinder/prompts/theme_condensation.txt,sha256=GFwwQO_oZHhqhPnAfTn887fDzAIVxKoCyj0hXagyBIU,1645
7
- themefinder/prompts/theme_generation.txt,sha256=JMXuNojxdSAcxPRU1Jg12Xunv_dX4hNvXYU2pXMWTAw,2500
8
- themefinder/prompts/theme_mapping.txt,sha256=nb_D7gwKGd8BzrAlzSZC3mQIPYaCRXdE6XmoJaJEKZQ,2405
9
- themefinder/prompts/theme_refinement.txt,sha256=HCgvWAoz-cpFgjX_QS_VVY0X06d4ds0ekBgcoWyFyfg,3360
10
- themefinder/prompts/theme_target_alignment.txt,sha256=-_ghr4--KAN6Tz8ExO9s2IXvI6pjWaEA_nG5L83GV5I,1035
11
- themefinder/themefinder_logging.py,sha256=n5SUQovEZLC4skEbxicjz_fOGF9mOk3S-Wpj5uXsaL8,314
12
- themefinder-0.5.3.dist-info/LICENCE,sha256=C9ULIN0ctF60ZxUWH_hw1H434bDLg49Z-Qzn6BUHgqs,1060
13
- themefinder-0.5.3.dist-info/METADATA,sha256=o9rzrhRK-4PMAv9wS8ZrnmTw1rTSYGU8zfPbB31r1DU,6483
14
- themefinder-0.5.3.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
15
- themefinder-0.5.3.dist-info/RECORD,,