themefinder 0.7.1__tar.gz → 0.7.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of themefinder might be problematic. Click here for more details.
- {themefinder-0.7.1 → themefinder-0.7.2}/PKG-INFO +3 -2
- {themefinder-0.7.1 → themefinder-0.7.2}/pyproject.toml +1 -1
- {themefinder-0.7.1 → themefinder-0.7.2}/src/themefinder/core.py +34 -2
- {themefinder-0.7.1 → themefinder-0.7.2}/src/themefinder/models.py +0 -21
- {themefinder-0.7.1 → themefinder-0.7.2}/src/themefinder/prompts/agentic_theme_clustering.txt +5 -2
- {themefinder-0.7.1 → themefinder-0.7.2}/src/themefinder/prompts/theme_condensation.txt +8 -4
- {themefinder-0.7.1 → themefinder-0.7.2}/src/themefinder/prompts/theme_mapping.txt +0 -4
- {themefinder-0.7.1 → themefinder-0.7.2}/src/themefinder/prompts/theme_refinement.txt +5 -7
- {themefinder-0.7.1 → themefinder-0.7.2}/src/themefinder/theme_clustering_agent.py +33 -9
- {themefinder-0.7.1 → themefinder-0.7.2}/LICENCE +0 -0
- {themefinder-0.7.1 → themefinder-0.7.2}/README.md +0 -0
- {themefinder-0.7.1 → themefinder-0.7.2}/src/themefinder/__init__.py +0 -0
- {themefinder-0.7.1 → themefinder-0.7.2}/src/themefinder/llm_batch_processor.py +0 -0
- {themefinder-0.7.1 → themefinder-0.7.2}/src/themefinder/prompts/consultation_system_prompt.txt +0 -0
- {themefinder-0.7.1 → themefinder-0.7.2}/src/themefinder/prompts/detail_detection.txt +0 -0
- {themefinder-0.7.1 → themefinder-0.7.2}/src/themefinder/prompts/sentiment_analysis.txt +0 -0
- {themefinder-0.7.1 → themefinder-0.7.2}/src/themefinder/prompts/theme_generation.txt +0 -0
- {themefinder-0.7.1 → themefinder-0.7.2}/src/themefinder/prompts/theme_target_alignment.txt +0 -0
- {themefinder-0.7.1 → themefinder-0.7.2}/src/themefinder/themefinder_logging.py +0 -0
|
@@ -1,8 +1,9 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: themefinder
|
|
3
|
-
Version: 0.7.1
|
|
3
|
+
Version: 0.7.2
|
|
4
4
|
Summary: A topic modelling Python package designed for analysing one-to-many question-answer data eg free-text survey responses.
|
|
5
5
|
License: MIT
|
|
6
|
+
License-File: LICENCE
|
|
6
7
|
Author: i.AI
|
|
7
8
|
Author-email: packages@cabinetoffice.gov.uk
|
|
8
9
|
Requires-Python: >=3.10,<3.13
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "themefinder"
|
|
3
|
-
version = "0.7.1"
|
|
3
|
+
version = "0.7.2"
|
|
4
4
|
description = "A topic modelling Python package designed for analysing one-to-many question-answer data eg free-text survey responses."
|
|
5
5
|
authors = ["i.AI <packages@cabinetoffice.gov.uk>"]
|
|
6
6
|
packages = [{include = "themefinder", from = "src"}]
|
|
@@ -186,7 +186,7 @@ async def theme_generation(
|
|
|
186
186
|
llm: RunnableWithFallbacks,
|
|
187
187
|
question: str,
|
|
188
188
|
batch_size: int = 50,
|
|
189
|
-
partition_key: str | None =
|
|
189
|
+
partition_key: str | None = None,
|
|
190
190
|
prompt_template: str | Path | PromptTemplate = "theme_generation",
|
|
191
191
|
system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
|
|
192
192
|
concurrency: int = 10,
|
|
@@ -317,6 +317,7 @@ def theme_clustering(
|
|
|
317
317
|
target_themes: int = 10,
|
|
318
318
|
significance_percentage: float = 10.0,
|
|
319
319
|
return_all_themes: bool = False,
|
|
320
|
+
system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
|
|
320
321
|
) -> tuple[pd.DataFrame, pd.DataFrame]:
|
|
321
322
|
"""Perform hierarchical clustering of themes using an agentic approach.
|
|
322
323
|
|
|
@@ -340,6 +341,8 @@ def theme_clustering(
|
|
|
340
341
|
selecting significant themes. Defaults to 10.0.
|
|
341
342
|
return_all_themes (bool, optional): If True, returns all clustered themes.
|
|
342
343
|
If False, returns only significant themes. Defaults to False.
|
|
344
|
+
system_prompt (str): System prompt to guide the LLM's behavior.
|
|
345
|
+
Defaults to CONSULTATION_SYSTEM_PROMPT.
|
|
343
346
|
|
|
344
347
|
Returns:
|
|
345
348
|
tuple[pd.DataFrame, pd.DataFrame]:
|
|
@@ -362,7 +365,10 @@ def theme_clustering(
|
|
|
362
365
|
|
|
363
366
|
# Initialize clustering agent with structured output LLM
|
|
364
367
|
agent = ThemeClusteringAgent(
|
|
365
|
-
llm.with_structured_output(HierarchicalClusteringResponse),
|
|
368
|
+
llm.with_structured_output(HierarchicalClusteringResponse),
|
|
369
|
+
initial_themes,
|
|
370
|
+
system_prompt,
|
|
371
|
+
target_themes,
|
|
366
372
|
)
|
|
367
373
|
|
|
368
374
|
# Perform clustering
|
|
@@ -444,6 +450,32 @@ async def theme_refinement(
|
|
|
444
450
|
system_prompt=system_prompt,
|
|
445
451
|
concurrency=concurrency,
|
|
446
452
|
)
|
|
453
|
+
|
|
454
|
+
def assign_sequential_topic_ids(df: pd.DataFrame) -> pd.DataFrame:
|
|
455
|
+
"""
|
|
456
|
+
Assigns sequential alphabetic topic_ids (A, B, ..., Z, AA, AB, ...) to the DataFrame.
|
|
457
|
+
"""
|
|
458
|
+
|
|
459
|
+
def alpha_ids(n: int) -> list[str]:
|
|
460
|
+
ids = []
|
|
461
|
+
for i in range(n):
|
|
462
|
+
s = ""
|
|
463
|
+
x = i
|
|
464
|
+
while True:
|
|
465
|
+
x, r = divmod(x, 26)
|
|
466
|
+
s = chr(65 + r) + s
|
|
467
|
+
if x == 0:
|
|
468
|
+
break
|
|
469
|
+
x -= 1
|
|
470
|
+
ids.append(s)
|
|
471
|
+
return ids
|
|
472
|
+
|
|
473
|
+
if not df.empty:
|
|
474
|
+
df["topic_id"] = alpha_ids(len(df))
|
|
475
|
+
return df
|
|
476
|
+
|
|
477
|
+
refined_themes = assign_sequential_topic_ids(refined_themes)
|
|
478
|
+
|
|
447
479
|
return refined_themes, _
|
|
448
480
|
|
|
449
481
|
|
|
@@ -217,9 +217,6 @@ class ThemeCondensationResponses(ValidatedModel):
|
|
|
217
217
|
class RefinedTheme(ValidatedModel):
|
|
218
218
|
"""Model for a single refined theme"""
|
|
219
219
|
|
|
220
|
-
topic_id: str = Field(
|
|
221
|
-
..., description="Single uppercase letter ID (A-Z, then AA, AB, etc.)"
|
|
222
|
-
)
|
|
223
220
|
topic: str = Field(
|
|
224
221
|
..., description="Topic label and description combined with a colon separator"
|
|
225
222
|
)
|
|
@@ -231,19 +228,9 @@ class RefinedTheme(ValidatedModel):
|
|
|
231
228
|
def run_validations(self) -> "RefinedTheme":
|
|
232
229
|
"""Run all validations for RefinedTheme"""
|
|
233
230
|
self.validate_non_empty_fields()
|
|
234
|
-
self.validate_topic_id_format()
|
|
235
231
|
self.validate_topic_format()
|
|
236
232
|
return self
|
|
237
233
|
|
|
238
|
-
def validate_topic_id_format(self) -> "RefinedTheme":
|
|
239
|
-
"""
|
|
240
|
-
Validate that topic_id follows the expected format (A-Z, then AA, AB, etc.).
|
|
241
|
-
"""
|
|
242
|
-
topic_id = self.topic_id.strip()
|
|
243
|
-
if not topic_id.isupper() or not topic_id.isalpha():
|
|
244
|
-
raise ValueError(f"topic_id must be uppercase letters only: {topic_id}")
|
|
245
|
-
return self
|
|
246
|
-
|
|
247
234
|
def validate_topic_format(self) -> "RefinedTheme":
|
|
248
235
|
"""
|
|
249
236
|
Validate that topic contains a label and description separated by a colon.
|
|
@@ -273,9 +260,6 @@ class ThemeRefinementResponses(ValidatedModel):
|
|
|
273
260
|
def run_validations(self) -> "ThemeRefinementResponses":
|
|
274
261
|
"""Ensure there are no duplicate themes"""
|
|
275
262
|
self.validate_non_empty_fields()
|
|
276
|
-
topic_ids = [theme.topic_id for theme in self.responses]
|
|
277
|
-
if len(topic_ids) != len(set(topic_ids)):
|
|
278
|
-
raise ValueError("Duplicate topic_ids detected")
|
|
279
263
|
topics = [theme.topic.lower().strip() for theme in self.responses]
|
|
280
264
|
if len(topics) != len(set(topics)):
|
|
281
265
|
raise ValueError("Duplicate topics detected")
|
|
@@ -288,10 +272,6 @@ class ThemeMappingOutput(ValidatedModel):
|
|
|
288
272
|
|
|
289
273
|
response_id: int = Field(gt=0, description="Response ID, must be greater than 0")
|
|
290
274
|
labels: List[str] = Field(..., description="List of theme labels")
|
|
291
|
-
reasons: List[str] = Field(..., description="List of reasons for mapping")
|
|
292
|
-
stances: List[Stance] = Field(
|
|
293
|
-
..., description="List of stances (POSITIVE or NEGATIVE)"
|
|
294
|
-
)
|
|
295
275
|
|
|
296
276
|
@model_validator(mode="after")
|
|
297
277
|
def run_validations(self) -> "ThemeMappingOutput":
|
|
@@ -299,7 +279,6 @@ class ThemeMappingOutput(ValidatedModel):
|
|
|
299
279
|
Run all validations for ThemeMappingOutput.
|
|
300
280
|
"""
|
|
301
281
|
self.validate_non_empty_fields()
|
|
302
|
-
self.validate_equal_lengths("stances", "labels", "reasons")
|
|
303
282
|
self.validate_unique_items("labels")
|
|
304
283
|
return self
|
|
305
284
|
|
{themefinder-0.7.1 → themefinder-0.7.2}/src/themefinder/prompts/agentic_theme_clustering.txt
RENAMED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
{system_prompt}
|
|
2
|
+
|
|
1
3
|
Analyze these topics and identify which ones should be merged based on semantic similarity.
|
|
2
4
|
Your goal is to significantly reduce the number of topics by creating meaningful parent topics.
|
|
3
5
|
Be aggressive in finding opportunities to merge topics that share any semantic relationship.
|
|
@@ -22,10 +24,11 @@ Guidelines:
|
|
|
22
24
|
- source_topic_count must be the sum of all child topic counts
|
|
23
25
|
- children must be a list of valid topic_ids from the input
|
|
24
26
|
- should_terminate should only be true if ALL of these conditions are met:
|
|
25
|
-
* There are fewer than
|
|
27
|
+
* There are fewer than {target_themes} active topics remaining
|
|
26
28
|
* The remaining topics are fundamentally incompatible semantically
|
|
27
29
|
* Any further merging would create meaninglessly broad categories
|
|
28
30
|
|
|
29
31
|
If no topics should be merged in this iteration but future iterations might still yield meaningful merges, set should_terminate to false with an empty parent_themes list.
|
|
32
|
+
If no topics should be merged and the termination conditions are met, set should_terminate to true with an empty parent_themes list.
|
|
30
33
|
|
|
31
|
-
|
|
34
|
+
N.B. Under no circumstances should you create a parent theme with a single child. You do not need to return all of the original themes, if they don't belong to a newly created parent feel free to omit them.
|
|
@@ -1,11 +1,15 @@
|
|
|
1
1
|
{system_prompt}
|
|
2
2
|
|
|
3
|
-
Below is a question and a list of topics extracted from answers to that question.
|
|
3
|
+
Below is a question and a list of topics extracted from answers to that question.
|
|
4
|
+
|
|
5
|
+
This list contains a large number of duplicate and redundant topics that present the same concept with different phrasing.
|
|
6
|
+
|
|
7
|
+
Each topic has a topic_label, topic_description, and may have a source_topic_count field indicating how many original topics it represents.
|
|
4
8
|
|
|
5
9
|
Your task is to analyze these topics and produce a refined list that:
|
|
6
|
-
1.
|
|
7
|
-
2.
|
|
8
|
-
3.
|
|
10
|
+
1. Significantly reduces the total number of topics
|
|
11
|
+
2. Identifies and preserves core themes that appear frequently
|
|
12
|
+
3. Combines redundant topics
|
|
9
13
|
4. Tracks the total number of original topics combined into each new topic
|
|
10
14
|
|
|
11
15
|
Guidelines for Topic Analysis:
|
|
@@ -16,10 +16,6 @@ Your task is to analyze each response and decide which topics are present. Guide
|
|
|
16
16
|
- Each response can be assigned to multiple topics if it matches more than one topic from the TOPIC LIST.
|
|
17
17
|
- Each topic can only be assigned once per response, if the topic is mentioned more than once use the first mention for reasoning and stance.
|
|
18
18
|
- There is no limit on how many topics can be assigned to a response.
|
|
19
|
-
- For each assignment provide a single rationale for why you have chosen the label.
|
|
20
|
-
- For each topic identified in a response, indicate whether the response expresses a positive or negative stance toward that topic (options: 'POSITIVE' or 'NEGATIVE')
|
|
21
|
-
- You MUST use either 'POSITIVE' or 'NEGATIVE'
|
|
22
|
-
- The order of reasons and stances must align with the order of labels (e.g., stance_a applies to topic_a)
|
|
23
19
|
|
|
24
20
|
You MUST include every response ID in the output.
|
|
25
21
|
If the response can not be labelled return empty sections where appropriate but you MUST return an entry
|
|
@@ -7,10 +7,9 @@ You will receive a list of TOPICS. These topics explicitly tie opinions to wheth
|
|
|
7
7
|
|
|
8
8
|
## Output
|
|
9
9
|
You will produce a list of CLEAR STANCE TOPICS based on the input. Each topic should have four parts:
|
|
10
|
-
1. A
|
|
11
|
-
2. A
|
|
12
|
-
3.
|
|
13
|
-
4. The source_topic_count field should be included for each topic and should reflect the number of original source topics that were merged to create this refined topic. If multiple source topics were combined, sum their individual counts. If only one source topic was used, simply retain its original count value.
|
|
10
|
+
1. A brief, clear topic label (3-7 words)
|
|
11
|
+
2. A more detailed topic description (1-2 sentences)
|
|
12
|
+
3. The source_topic_count field should be included for each topic and should reflect the number of original source topics that were merged to create this refined topic. If multiple source topics were combined, sum their individual counts. If only one source topic was used, simply retain its original count value.
|
|
14
13
|
|
|
15
14
|
|
|
16
15
|
## Guidelines
|
|
@@ -46,11 +45,10 @@ You will produce a list of CLEAR STANCE TOPICS based on the input. Each topic sh
|
|
|
46
45
|
2. Group closely related topics together.
|
|
47
46
|
3. For each group or individual topic:
|
|
48
47
|
a. Distill the core concept, removing any bias or opinion.
|
|
49
|
-
b. Create a
|
|
48
|
+
b. Create a concise topic label.
|
|
50
49
|
c. Write a more detailed description that provides context without taking sides.
|
|
51
50
|
4. Review the entire list to ensure distinctiveness and adjust as needed.
|
|
52
|
-
5.
|
|
53
|
-
6. Combine the topic label and description with a colon separator
|
|
51
|
+
5. Combine the topic label and description with a colon separator
|
|
54
52
|
|
|
55
53
|
TOPICS:
|
|
56
54
|
{responses}
|
|
@@ -22,6 +22,8 @@ from .models import ThemeNode
|
|
|
22
22
|
from .llm_batch_processor import load_prompt_from_file
|
|
23
23
|
from .themefinder_logging import logger
|
|
24
24
|
|
|
25
|
+
CONSULTATION_SYSTEM_PROMPT = load_prompt_from_file("consultation_system_prompt")
|
|
26
|
+
|
|
25
27
|
|
|
26
28
|
class ThemeClusteringAgent:
|
|
27
29
|
"""Agent for performing hierarchical clustering of topics using language models.
|
|
@@ -37,13 +39,21 @@ class ThemeClusteringAgent:
|
|
|
37
39
|
current_iteration: Current iteration number in the clustering process
|
|
38
40
|
"""
|
|
39
41
|
|
|
40
|
-
def __init__(
|
|
42
|
+
def __init__(
|
|
43
|
+
self,
|
|
44
|
+
llm: Runnable,
|
|
45
|
+
themes: List[ThemeNode],
|
|
46
|
+
system_prompt: str = CONSULTATION_SYSTEM_PROMPT,
|
|
47
|
+
target_themes: int = 10,
|
|
48
|
+
) -> None:
|
|
41
49
|
"""Initialize the clustering agent with an LLM and initial themes.
|
|
42
50
|
|
|
43
51
|
Args:
|
|
44
52
|
llm: Language model instance configured with structured output
|
|
45
53
|
for HierarchicalClusteringResponse
|
|
46
54
|
themes: List of ThemeNode objects to be clustered
|
|
55
|
+
system_prompt: System prompt to guide the LLM's behavior
|
|
56
|
+
target_themes: Target number of themes to cluster down to (default 10)
|
|
47
57
|
"""
|
|
48
58
|
self.llm = llm
|
|
49
59
|
self.themes: Dict[str, ThemeNode] = {}
|
|
@@ -51,6 +61,8 @@ class ThemeClusteringAgent:
|
|
|
51
61
|
self.themes[theme.topic_id] = theme
|
|
52
62
|
self.active_themes = set(self.themes.keys())
|
|
53
63
|
self.current_iteration = 0
|
|
64
|
+
self.system_prompt = system_prompt
|
|
65
|
+
self.target_themes = target_themes
|
|
54
66
|
|
|
55
67
|
def _format_prompt(self) -> str:
|
|
56
68
|
"""Format the clustering prompt with current active themes.
|
|
@@ -74,7 +86,10 @@ class ThemeClusteringAgent:
|
|
|
74
86
|
# Load the clustering prompt template
|
|
75
87
|
prompt_template = load_prompt_from_file("agentic_theme_clustering")
|
|
76
88
|
return prompt_template.format(
|
|
77
|
-
themes_json=themes_json,
|
|
89
|
+
themes_json=themes_json,
|
|
90
|
+
iteration=self.current_iteration,
|
|
91
|
+
system_prompt=self.system_prompt,
|
|
92
|
+
target_themes=self.target_themes,
|
|
78
93
|
)
|
|
79
94
|
|
|
80
95
|
@retry(
|
|
@@ -102,11 +117,20 @@ class ThemeClusteringAgent:
|
|
|
102
117
|
"""
|
|
103
118
|
prompt = self._format_prompt()
|
|
104
119
|
response = self.llm.invoke(prompt)
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
120
|
+
for i, parent in enumerate(response.parent_themes):
|
|
121
|
+
|
|
122
|
+
def to_alpha(idx: int) -> str:
|
|
123
|
+
"""Convert 0-based integer to Excel-style column name (A, B, ..., Z, AA, AB, ...) without divmod."""
|
|
124
|
+
idx += 1 # 1-based for Excel logic
|
|
125
|
+
result = []
|
|
126
|
+
while idx > 0:
|
|
127
|
+
rem = (idx - 1) % 26
|
|
128
|
+
result.append(chr(65 + rem))
|
|
129
|
+
idx = (idx - 1) // 26
|
|
130
|
+
return "".join(reversed(result))
|
|
131
|
+
|
|
132
|
+
new_theme_id = f"{to_alpha(i)}_{self.current_iteration}"
|
|
133
|
+
children = [c for c in parent.children if c in self.active_themes]
|
|
110
134
|
for child in children:
|
|
111
135
|
self.themes[child].parent_id = new_theme_id
|
|
112
136
|
total_source_count = sum(
|
|
@@ -114,8 +138,8 @@ class ThemeClusteringAgent:
|
|
|
114
138
|
)
|
|
115
139
|
new_theme = ThemeNode(
|
|
116
140
|
topic_id=new_theme_id,
|
|
117
|
-
topic_label=parent
|
|
118
|
-
topic_description=parent
|
|
141
|
+
topic_label=parent.topic_label,
|
|
142
|
+
topic_description=parent.topic_description,
|
|
119
143
|
source_topic_count=total_source_count,
|
|
120
144
|
children=children,
|
|
121
145
|
)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{themefinder-0.7.1 → themefinder-0.7.2}/src/themefinder/prompts/consultation_system_prompt.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|