themefinder 0.6.3__tar.gz → 0.7.1__tar.gz
- {themefinder-0.6.3 → themefinder-0.7.1}/PKG-INFO +3 -3
- {themefinder-0.6.3 → themefinder-0.7.1}/README.md +1 -1
- {themefinder-0.6.3 → themefinder-0.7.1}/pyproject.toml +2 -2
- {themefinder-0.6.3 → themefinder-0.7.1}/src/themefinder/__init__.py +4 -2
- {themefinder-0.6.3 → themefinder-0.7.1}/src/themefinder/core.py +88 -6
- {themefinder-0.6.3 → themefinder-0.7.1}/src/themefinder/llm_batch_processor.py +1 -1
- {themefinder-0.6.3 → themefinder-0.7.1}/src/themefinder/models.py +64 -0
- themefinder-0.7.1/src/themefinder/prompts/agentic_theme_clustering.txt +31 -0
- themefinder-0.7.1/src/themefinder/prompts/detail_detection.txt +31 -0
- {themefinder-0.6.3 → themefinder-0.7.1}/src/themefinder/prompts/theme_refinement.txt +6 -5
- themefinder-0.7.1/src/themefinder/theme_clustering_agent.py +332 -0
- themefinder-0.6.3/src/themefinder/prompts/detail_detection.txt +0 -19
- {themefinder-0.6.3 → themefinder-0.7.1}/LICENCE +0 -0
- {themefinder-0.6.3 → themefinder-0.7.1}/src/themefinder/prompts/consultation_system_prompt.txt +0 -0
- {themefinder-0.6.3 → themefinder-0.7.1}/src/themefinder/prompts/sentiment_analysis.txt +0 -0
- {themefinder-0.6.3 → themefinder-0.7.1}/src/themefinder/prompts/theme_condensation.txt +0 -0
- {themefinder-0.6.3 → themefinder-0.7.1}/src/themefinder/prompts/theme_generation.txt +0 -0
- {themefinder-0.6.3 → themefinder-0.7.1}/src/themefinder/prompts/theme_mapping.txt +0 -0
- {themefinder-0.6.3 → themefinder-0.7.1}/src/themefinder/prompts/theme_target_alignment.txt +0 -0
- {themefinder-0.6.3 → themefinder-0.7.1}/src/themefinder/themefinder_logging.py +0 -0
{themefinder-0.6.3 → themefinder-0.7.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: themefinder
-Version: 0.6.3
+Version: 0.7.1
 Summary: A topic modelling Python package designed for analysing one-to-many question-answer data eg free-text survey responses.
 License: MIT
 Author: i.AI
@@ -17,7 +17,7 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Classifier: Topic :: Text Processing :: Linguistic
 Requires-Dist: boto3 (>=1.29,<2.0)
 Requires-Dist: langchain
-Requires-Dist: langchain-openai
+Requires-Dist: langchain-openai
 Requires-Dist: langfuse (==2.29.1)
 Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
 Requires-Dist: pandas (>=2.2.2,<3.0.0)
@@ -169,5 +169,5 @@ The documentation is [© Crown copyright](https://www.nationalarchives.gov.uk/in
 
 ## Feedback
 
-
+Contact us with questions or feedback at packages@cabinetoffice.gov.uk.
 
{themefinder-0.6.3 → themefinder-0.7.1}/README.md

@@ -138,4 +138,4 @@ The documentation is [© Crown copyright](https://www.nationalarchives.gov.uk/in
 
 ## Feedback
 
-
+Contact us with questions or feedback at packages@cabinetoffice.gov.uk.
{themefinder-0.6.3 → themefinder-0.7.1}/pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "themefinder"
-version = "0.6.3"
+version = "0.7.1"
 description = "A topic modelling Python package designed for analysing one-to-many question-answer data eg free-text survey responses."
 authors = ["i.AI <packages@cabinetoffice.gov.uk>"]
 packages = [{include = "themefinder", from = "src"}]
@@ -19,7 +19,7 @@ classifiers = [
 [tool.poetry.dependencies]
 python = ">=3.10,<3.13"
 langchain = "*"
-langchain-openai = "
+langchain-openai = "*"
 pandas = "^2.2.2"
 python-dotenv = "^1.0.1"
 langfuse = "2.29.1"
{themefinder-0.6.3 → themefinder-0.7.1}/src/themefinder/__init__.py

@@ -1,6 +1,7 @@
 from .core import (
     find_themes,
     sentiment_analysis,
+    theme_clustering,
     theme_condensation,
     theme_generation,
     theme_mapping,
@@ -12,11 +13,12 @@ from .core import (
 __all__ = [
     "find_themes",
     "sentiment_analysis",
-    "
+    "theme_clustering",
     "theme_condensation",
+    "theme_generation",
+    "theme_mapping",
     "theme_refinement",
     "theme_target_alignment",
-    "theme_mapping",
     "detail_detection",
 ]
 __version__ = "0.1.0"
{themefinder-0.6.3 → themefinder-0.7.1}/src/themefinder/core.py

@@ -5,16 +5,19 @@ import pandas as pd
 from langchain_core.prompts import PromptTemplate
 from langchain.schema.runnable import RunnableWithFallbacks
 
-from .llm_batch_processor import batch_and_run, load_prompt_from_file
-from .models import (
+from themefinder.llm_batch_processor import batch_and_run, load_prompt_from_file
+from themefinder.models import (
     SentimentAnalysisResponses,
     ThemeGenerationResponses,
     ThemeCondensationResponses,
     ThemeRefinementResponses,
     ThemeMappingResponses,
     DetailDetectionResponses,
+    HierarchicalClusteringResponse,
+    ThemeNode,
 )
-from .
+from themefinder.theme_clustering_agent import ThemeClusteringAgent
+from themefinder.themefinder_logging import logger
 
 CONSULTATION_SYSTEM_PROMPT = load_prompt_from_file("consultation_system_prompt")
 
@@ -114,9 +117,7 @@ async def find_themes(
     )
 
     logger.info("Finished finding themes")
-    logger.info(
-        "Provide feedback or report bugs: https://forms.gle/85xUSMvxGzSSKQ499 or packages@cabinetoffice.gov.uk"
-    )
+    logger.info("Provide feedback or report bugs: packages@cabinetoffice.gov.uk")
     return {
         "question": question,
         "sentiment": sentiment_df,
@@ -309,6 +310,87 @@ async def theme_condensation(
     return themes_df, _
 
 
+def theme_clustering(
+    themes_df: pd.DataFrame,
+    llm: RunnableWithFallbacks,
+    max_iterations: int = 5,
+    target_themes: int = 10,
+    significance_percentage: float = 10.0,
+    return_all_themes: bool = False,
+) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """Perform hierarchical clustering of themes using an agentic approach.
+
+    This function takes a DataFrame of themes and uses the ThemeClusteringAgent
+    to iteratively merge similar themes into a hierarchical structure, then
+    selects the most significant themes based on a threshold.
+
+    Args:
+        themes_df (pd.DataFrame): DataFrame containing themes with columns:
+            - topic_id: Unique identifier for each theme
+            - topic_label: Short descriptive label for the theme
+            - topic_description: Detailed description of the theme
+            - source_topic_count: Number of source responses for this theme
+        llm (RunnableWithFallbacks): Language model instance configured with
+            structured output for HierarchicalClusteringResponse
+        max_iterations (int, optional): Maximum number of clustering iterations.
+            Defaults to 5.
+        target_themes (int, optional): Target number of themes to cluster down to.
+            Defaults to 10.
+        significance_percentage (float, optional): Percentage threshold for
+            selecting significant themes. Defaults to 10.0.
+        return_all_themes (bool, optional): If True, returns all clustered themes.
+            If False, returns only significant themes. Defaults to False.
+
+    Returns:
+        tuple[pd.DataFrame, pd.DataFrame]:
+            A tuple containing:
+            - DataFrame of clustered themes (all or significant based on return_all_themes)
+            - Empty DataFrame (for consistency with other functions)
+    """
+    logger.info(f"Starting hierarchical clustering of {len(themes_df)} themes")
+
+    # Convert DataFrame to ThemeNode objects
+    initial_themes = [
+        ThemeNode(
+            topic_id=row["topic_id"],
+            topic_label=row["topic_label"],
+            topic_description=row["topic_description"],
+            source_topic_count=row["source_topic_count"],
+        )
+        for _, row in themes_df.iterrows()
+    ]
+
+    # Initialize clustering agent with structured output LLM
+    agent = ThemeClusteringAgent(
+        llm.with_structured_output(HierarchicalClusteringResponse), initial_themes
+    )
+
+    # Perform clustering
+    logger.info(
+        f"Clustering themes with max_iterations={max_iterations}, target_themes={target_themes}"
+    )
+    all_themes_df = agent.cluster_themes(
+        max_iterations=max_iterations, target_themes=target_themes
+    )
+
+    # Return appropriate themes based on parameter
+    if return_all_themes:
+        logger.info(
+            f"Clustering complete: returning all {len(all_themes_df)} clustered themes"
+        )
+        return all_themes_df, pd.DataFrame()
+    else:
+        # Select significant themes
+        logger.info(
+            f"Selecting themes with significance_percentage={significance_percentage}%"
+        )
+        selected_themes_df = agent.select_themes(significance_percentage)
+        logger.info(
+            f"Clustering complete: returning {len(selected_themes_df)} significant themes"
+        )
+        return selected_themes_df, pd.DataFrame()
+
+
 async def theme_refinement(
     condensed_themes_df: pd.DataFrame,
     llm: RunnableWithFallbacks,
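For orientation, a minimal sketch of calling the new theme_clustering step from the public API. The model name and the input rows are placeholders; any langchain chat model (or a RunnableWithFallbacks wrapping one) that supports with_structured_output should work, since theme_clustering only calls with_structured_output and invoke on it.

import pandas as pd
from langchain_openai import ChatOpenAI  # assumed model wrapper, not prescribed by the package

from themefinder import theme_clustering

# Toy refined-themes table with the four columns theme_clustering expects.
themes_df = pd.DataFrame(
    [
        {"topic_id": "A", "topic_label": "Housing costs", "topic_description": "Concerns about rising rents.", "source_topic_count": 40},
        {"topic_id": "B", "topic_label": "Rental affordability", "topic_description": "Rents outpacing wages.", "source_topic_count": 25},
        {"topic_id": "C", "topic_label": "Public transport", "topic_description": "Requests for better bus routes.", "source_topic_count": 5},
    ]
)

llm = ChatOpenAI(model="gpt-4o")  # placeholder model name
clustered_df, _ = theme_clustering(themes_df, llm, target_themes=2, return_all_themes=True)
print(clustered_df[["topic_id", "topic_label", "parent_id", "source_topic_count"]])

Note that theme_clustering is synchronous, unlike the surrounding async pipeline steps, so no event loop is needed.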
{themefinder-0.6.3 → themefinder-0.7.1}/src/themefinder/models.py

@@ -349,3 +349,67 @@ class DetailDetectionResponses(ValidatedModel):
         if len(response_ids) != len(set(response_ids)):
             raise ValueError("Response IDs must be unique")
         return self
+
+
+class ThemeNode(ValidatedModel):
+    """Model for topic nodes created during hierarchical clustering"""
+
+    topic_id: str = Field(
+        ...,
+        description="Short alphabetic ID (e.g. 'A', 'B', 'C') - iteration prefix will be added automatically",
+    )
+    topic_label: str = Field(
+        ..., description="4-5 word label encompassing merged child topics"
+    )
+    topic_description: str = Field(
+        ..., description="1-2 sentences combining key aspects of child topics"
+    )
+    source_topic_count: int = Field(gt=0, description="Sum of all child topic counts")
+    parent_id: Optional[str] = Field(
+        default=None,
+        description="Internal field: ID of parent topic node, managed by clustering agent, not set by LLM",
+    )
+    children: List[str] = Field(
+        default_factory=list, description="List of topic_ids of merged child topics"
+    )
+
+    @model_validator(mode="after")
+    def run_validations(self) -> "ThemeNode":
+        """Validate topic node constraints"""
+        if self.children:
+            # Each parent must have at least 2 children
+            if len(self.children) < 2:
+                raise ValueError("Each topic node must have at least 2 children")
+            # Validate children are unique
+            if len(self.children) != len(set(self.children)):
+                raise ValueError("Child topic IDs must be unique")
+
+        return self
+
+
+class HierarchicalClusteringResponse(ValidatedModel):
+    """Model for hierarchical clustering agent response"""
+
+    parent_themes: List[ThemeNode] = Field(
+        default=[],
+        description="List of parent themes created by merging similar themes",
+    )
+    should_terminate: bool = Field(
+        ...,
+        description="True if no more meaningful clustering is possible, false otherwise",
+    )
+
+    @model_validator(mode="after")
+    def run_validations(self) -> "HierarchicalClusteringResponse":
+        """Validate clustering response constraints"""
+        self.validate_non_empty_fields()
+
+        # Validate that no child appears in multiple parents
+        all_children = []
+        for parent in self.parent_themes:
+            all_children.extend(parent.children)
+
+        if len(all_children) != len(set(all_children)):
+            raise ValueError("Each child theme can have at most one parent")
+
+        return self
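A quick illustration of what the ThemeNode validator enforces. This is a sketch only: it uses just the fields defined above and assumes the usual pydantic v2 behaviour of surfacing validator errors as ValidationError.

from pydantic import ValidationError

from themefinder.models import ThemeNode

# A valid parent node: at least two unique children and a positive source_topic_count.
parent = ThemeNode(
    topic_id="A",
    topic_label="Housing affordability pressures",
    topic_description="Rising rents and house prices relative to wages.",
    source_topic_count=65,
    children=["A_0", "B_0"],
)

# A parent with a single child violates the "at least 2 children" rule.
try:
    ThemeNode(
        topic_id="B",
        topic_label="Single child parent",
        topic_description="Invalid example.",
        source_topic_count=10,
        children=["C_0"],
    )
except ValidationError as err:
    print(err)  # message includes "Each topic node must have at least 2 children"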
themefinder-0.7.1/src/themefinder/prompts/agentic_theme_clustering.txt

@@ -0,0 +1,31 @@
+Analyze these topics and identify which ones should be merged based on semantic similarity.
+Your goal is to significantly reduce the number of topics by creating meaningful parent topics.
+Be aggressive in finding opportunities to merge topics that share any semantic relationship.
+
+TOPICS:
+{themes_json}
+
+For each group of similar topics that should be merged, create a new parent topic.
+
+Guidelines:
+- Each parent topic must have at least 2 children, it can have more than 2 if appropriate
+- Each child topic can have at most 1 parent
+- topic_id should be a simple alphabetic ID (e.g. 'A', 'B', 'C') - the iteration prefix will be added automatically
+- Be creative and look for higher-level abstractions that can combine seemingly different topics
+- When creating parent topics, follow these naming rules:
+  * The label should read naturally as a single coherent topic
+  * Choose labels that can encompass broader categories of topics
+  * If merging different topics, the topic with the higher source_topic_count should dominate the label
+  * Never combine different topics with "and" or "/" in the label
+- topic_description must be 1 or 2 sentences that:
+  * preserves key information from the child topics
+- source_topic_count must be the sum of all child topic counts
+- children must be a list of valid topic_ids from the input
+- should_terminate should only be true if ALL of these conditions are met:
+  * There are fewer than 10 active topics remaining
+  * The remaining topics are fundamentally incompatible semantically
+  * Any further merging would create meaninglessly broad categories
+
+If no topics should be merged in this iteration but future iterations might still yield meaningful merges, set should_terminate to false with an empty parent_themes list.
+
+If no topics should be merged and the termination conditions are met, set should_terminate to true with an empty parent_themes list.
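To make the contract concrete, here is a hypothetical example of the structured output this prompt is meant to elicit, shown as the HierarchicalClusteringResponse shape: one parent per merge group, children drawn from the input topic_ids, and source_topic_count equal to the sum of the children's counts. All values are illustrative.

# Hypothetical output for an input containing topics A (count 40), B (25) and C (5):
example_clustering_response = {
    "parent_themes": [
        {
            "topic_id": "A",  # simple letter; the agent renames it to e.g. "A_0" on iteration 0
            "topic_label": "Housing affordability pressures",
            "topic_description": "Rising rents and house prices relative to wages.",
            "source_topic_count": 65,  # 40 + 25, the sum of the merged children's counts
            "children": ["A", "B"],
        }
    ],
    "should_terminate": False,  # further merging may still be possible
}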
themefinder-0.7.1/src/themefinder/prompts/detail_detection.txt

@@ -0,0 +1,31 @@
+{system_prompt}
+
+You will receive a list of RESPONSES, each containing a response_id and a response.
+Your job is to analyze each response to the QUESTION below and decide if a response contains rich evidence.
+You MUST include every response ID in the output.
+
+A response is evidence-rich only if it satisfies both of the following:
+
+Relevance and depth:
+- It clearly answers the question
+- AND provides insights that go beyond generic opinion, such as nuanced reasoning, contextual explanation, or argumentation that could inform decision-making
+
+Substantive evidence, including at least one of:
+- Specific, verifiable facts or data (e.g., statistics, dates, named reports or studies)
+- Concrete, illustrative examples that clearly support a broader claim
+- Detailed personal or professional experiences that include contextual information (e.g., roles, locations, timelines)
+
+Do NOT classify a response as evidence-rich if it:
+- Uses vague or general language with no supporting detail
+- Restates commonly known points without adding new information
+- Shares personal anecdotes without sufficient context or a clear takeaway
+
+Before answering, ask: Would this response provide useful input to someone drafting policy, beyond what is already commonly known or expected?
+
+For each response, determine:
+EVIDENCE_RICH - does the response contain significant evidence as defined above?
+Choose one from ['YES', 'NO']
+
+
+QUESTION: \n {question}
+RESPONSES: \n {responses}
{themefinder-0.6.3 → themefinder-0.7.1}/src/themefinder/prompts/theme_refinement.txt

@@ -6,10 +6,11 @@ You are tasked with refining a list of topics generated from responses to a ques
 You will receive a list of TOPICS. These topics explicitly tie opinions to whether a person agrees or disagrees with the question.
 
 ## Output
-You will produce a list of CLEAR STANCE TOPICS based on the input. Each topic should have
-1. A
-2. A
-3.
+You will produce a list of CLEAR STANCE TOPICS based on the input. Each topic should have four parts:
+1. A topic_id that is an uppercase letter (starting from 'A', for the 27th element use AA)
+2. A brief, clear topic label (3-7 words)
+3. A more detailed topic description (1-2 sentences)
+4. The source_topic_count field should be included for each topic and should reflect the number of original source topics that were merged to create this refined topic. If multiple source topics were combined, sum their individual counts. If only one source topic was used, simply retain its original count value.
 
 
 ## Guidelines
@@ -48,7 +49,7 @@ You will produce a list of CLEAR STANCE TOPICS based on the input. Each topic sh
    b. Create a neutral, concise topic label.
    c. Write a more detailed description that provides context without taking sides.
 4. Review the entire list to ensure distinctiveness and adjust as needed.
-5. Assign each output topic a topic_id
+5. Assign each output topic a topic_id that is an uppercase letter (starting from 'A', for the 27th element use AA)
 6. Combine the topic label and description with a colon separator
 
 TOPICS:
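The topic_id scheme added to this prompt follows spreadsheet-style column labels (A to Z, then AA, AB, and so on). A small, hypothetical helper, not part of the package, showing how such IDs enumerate:

def spreadsheet_id(n: int) -> str:
    """Return the 1-based spreadsheet-style label: 1 -> 'A', 26 -> 'Z', 27 -> 'AA'."""
    label = ""
    while n > 0:
        n, rem = divmod(n - 1, 26)
        label = chr(ord("A") + rem) + label
    return label

print([spreadsheet_id(i) for i in (1, 2, 26, 27, 28)])  # ['A', 'B', 'Z', 'AA', 'AB']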
themefinder-0.7.1/src/themefinder/theme_clustering_agent.py

@@ -0,0 +1,332 @@
+"""Theme clustering agent for hierarchical topic organization.
+
+This module provides the ThemeClusteringAgent class for performing iterative
+hierarchical clustering of topics using a language model.
+"""
+
+import json
+import logging
+from typing import Dict, List, Any
+
+import pandas as pd
+from langchain.schema.runnable import Runnable
+from tenacity import (
+    before,
+    before_sleep_log,
+    retry,
+    stop_after_attempt,
+    wait_random_exponential,
+)
+
+from .models import ThemeNode
+from .llm_batch_processor import load_prompt_from_file
+from .themefinder_logging import logger
+
+
+class ThemeClusteringAgent:
+    """Agent for performing hierarchical clustering of topics using language models.
+
+    This class manages the iterative process of merging similar topics into a
+    hierarchical structure using an LLM to identify semantic relationships and
+    create meaningful parent-child topic relationships.
+
+    Attributes:
+        llm: Language model configured with structured output for clustering
+        themes: Dictionary mapping topic IDs to ThemeNode objects
+        active_themes: Set of topic IDs that are currently active for clustering
+        current_iteration: Current iteration number in the clustering process
+    """
+
+    def __init__(self, llm: Runnable, themes: List[ThemeNode]) -> None:
+        """Initialize the clustering agent with an LLM and initial themes.
+
+        Args:
+            llm: Language model instance configured with structured output
+                for HierarchicalClusteringResponse
+            themes: List of ThemeNode objects to be clustered
+        """
+        self.llm = llm
+        self.themes: Dict[str, ThemeNode] = {}
+        for theme in themes:
+            self.themes[theme.topic_id] = theme
+        self.active_themes = set(self.themes.keys())
+        self.current_iteration = 0
+
+    def _format_prompt(self) -> str:
+        """Format the clustering prompt with current active themes.
+
+        Creates a JSON representation of all currently active themes and
+        formats them into the clustering prompt template.
+
+        Returns:
+            str: Formatted prompt string ready for LLM processing
+        """
+        themes_for_prompt = []
+        for active_id in self.active_themes:
+            theme_dict = {
+                "topic_id": self.themes[active_id].topic_id,
+                "topic_label": self.themes[active_id].topic_label,
+                "topic_description": self.themes[active_id].topic_description,
+            }
+            themes_for_prompt.append(theme_dict)
+        themes_json = json.dumps(themes_for_prompt, indent=2)
+
+        # Load the clustering prompt template
+        prompt_template = load_prompt_from_file("agentic_theme_clustering")
+        return prompt_template.format(
+            themes_json=themes_json, iteration=self.current_iteration
+        )
+
+    @retry(
+        wait=wait_random_exponential(min=1, max=2),
+        stop=stop_after_attempt(3),
+        before=before.before_log(logger=logger, log_level=logging.DEBUG),
+        before_sleep=before_sleep_log(logger, logging.ERROR),
+        reraise=True,
+    )
+    def cluster_iteration(self) -> None:
+        """Perform one iteration of hierarchical theme clustering.
+
+        Uses the configured LLM to identify semantically similar themes
+        and merge them into parent themes. Updates the theme hierarchy
+        and active theme set based on the clustering results.
+
+        The method includes retry logic to handle transient API failures
+        and will automatically retry up to 3 times with exponential backoff.
+
+        Side Effects:
+            - Creates new parent ThemeNode objects in self.themes
+            - Updates parent_id relationships for child themes
+            - Modifies self.active_themes set
+            - Increments self.current_iteration
+        """
+        prompt = self._format_prompt()
+        response = self.llm.invoke(prompt)
+        # The response is already a parsed dictionary when using with_structured_output
+        result = response
+        for i, parent in enumerate(result["parent_themes"]):
+            new_theme_id = f"{chr(65 + i)}_{self.current_iteration}"
+            children = [c for c in parent["children"] if c in self.active_themes]
+            for child in children:
+                self.themes[child].parent_id = new_theme_id
+            total_source_count = sum(
+                self.themes[child_id].source_topic_count for child_id in children
+            )
+            new_theme = ThemeNode(
+                topic_id=new_theme_id,
+                topic_label=parent["topic_label"],
+                topic_description=parent["topic_description"],
+                source_topic_count=total_source_count,
+                children=children,
+            )
+            self.themes[new_theme_id] = new_theme
+            self.active_themes.add(new_theme_id)
+            for child in children:
+                self.active_themes.remove(child)
+        self.current_iteration += 1
+
+    def cluster_themes(
+        self, max_iterations: int = 5, target_themes: int = 5
+    ) -> pd.DataFrame:
+        """Perform hierarchical clustering to reduce themes to target number.
+
+        Iteratively merges similar themes using the clustering agent until
+        either the maximum iterations is reached or the target number of
+        themes is achieved. Creates a root node to represent the complete
+        hierarchy.
+
+        Args:
+            max_iterations: Maximum number of clustering iterations to perform
+            target_themes: Target number of themes to cluster down to
+
+        Returns:
+            pd.DataFrame: DataFrame containing all theme nodes (excluding root)
+                with their hierarchical relationships and metadata
+        """
+        logger.info(f"Starting clustering with {len(self.active_themes)} active themes")
+        while (
+            self.current_iteration <= max_iterations
+            and len(self.active_themes) > target_themes
+        ):
+            self.cluster_iteration()
+            logger.info(
+                f"After {self.current_iteration} iterations {len(self.active_themes)} active themes remaining"
+            )
+        root_node = ThemeNode(
+            topic_id="0",
+            topic_label="All Topics",
+            topic_description="",
+            source_topic_count=sum(
+                self.themes[theme_id].source_topic_count
+                for theme_id in self.active_themes
+            ),
+            children=list(self.active_themes),
+        )
+        self.themes["0"] = root_node
+        for theme in self.active_themes:
+            self.themes[theme].parent_id = "0"
+
+        # Convert all themes (except root) to DataFrame
+        theme_nodes_dicts = [
+            node.model_dump() for node in self.themes.values() if node.topic_id != "0"
+        ]
+        return pd.DataFrame(theme_nodes_dicts)
+
+    def convert_themes_to_tree_json(self) -> str:
+        """Convert themes into a hierarchical JSON structure for visualization.
+
+        Creates a nested JSON structure starting from the root node (ID '0')
+        that represents the complete theme hierarchy. Each node includes
+        metadata and references to its children.
+
+        Returns:
+            str: JSON string representing the hierarchical tree structure
+                suitable for JavaScript tree visualization libraries
+        """
+
+        def build_tree(node: ThemeNode) -> Dict[str, Any]:
+            return {
+                "id": node.topic_id,
+                "name": node.topic_label,
+                "description": node.topic_description,
+                "value": node.source_topic_count,
+                "children": [
+                    build_tree(self.themes[child_id])
+                    for child_id in node.children
+                    if child_id in self.themes
+                ],
+            }
+
+        tree_data = build_tree(self.themes["0"])
+        return json.dumps(tree_data, indent=2)
+
+    def select_significant_themes(
+        self, significance_threshold: int, total_responses: int
+    ) -> Dict[str, Any]:
+        """Select significant themes using depth-first traversal.
+
+        Performs a depth-first search on the theme hierarchy to identify
+        themes that meet the significance threshold. Prioritizes leaf nodes
+        when possible, but selects parent nodes when children don't meet
+        the threshold.
+
+        Args:
+            significance_threshold: Minimum source_topic_count for significance
+            total_responses: Total number of responses across all themes
+
+        Returns:
+            Dict containing selected theme nodes and metadata
+        """
+        # Track selected nodes
+        selected_nodes: List[Dict[str, Any]] = []
+
+        # Perform the DFS selection
+        self._traverse_tree(self.themes["0"], selected_nodes, significance_threshold)
+
+        # Format the final result
+        result = {"selected_nodes": selected_nodes, "total_responses": total_responses}
+
+        return result
+
+    def _traverse_tree(
+        self,
+        node: ThemeNode,
+        selected_nodes: List[Dict[str, Any]],
+        significance_threshold: int,
+    ) -> bool:
+        """Recursively traverse theme tree to select significant nodes.
+
+        Implements depth-first traversal logic for theme selection:
+        1. For leaf nodes: always select
+        2. For parent nodes: select if no significant children exist
+        3. For significant children: recursively process them
+
+        Args:
+            node: Current ThemeNode being processed
+            selected_nodes: List to accumulate selected theme dictionaries
+            significance_threshold: Minimum source_topic_count for significance
+
+        Returns:
+            bool: True if this node or descendants were selected, False otherwise
+        """
+        # Base case: if node has no children (leaf node)
+        if not node.children:
+            selected_nodes.append(
+                {
+                    "id": node.topic_id,
+                    "name": node.topic_label,
+                    "value": node.source_topic_count,
+                }
+            )
+            return True
+
+        # Check if any children are significant
+        has_significant_children = any(
+            self.themes[child_id].source_topic_count >= significance_threshold
+            for child_id in node.children
+            if child_id in self.themes
+        )
+
+        # If no significant children, select this node
+        if not has_significant_children:
+            selected_nodes.append(
+                {
+                    "id": node.topic_id,
+                    "name": node.topic_label,
+                    "value": node.source_topic_count,
+                }
+            )
+            return True
+
+        # If significant children exist, recursively process them
+        any_selected = False
+        for child_id in node.children:
+            if child_id in self.themes:
+                if self._traverse_tree(
+                    self.themes[child_id], selected_nodes, significance_threshold
+                ):
+                    any_selected = True
+
+        # If none of the children were selected, select this node
+        if not any_selected:
+            selected_nodes.append(
+                {
+                    "id": node.topic_id,
+                    "name": node.topic_label,
+                    "value": node.source_topic_count,
+                }
+            )
+            return True
+
+        return any_selected
+
+    def select_themes(self, significance_percentage: float) -> pd.DataFrame:
+        """Select themes that meet the significance threshold.
+
+        Calculates the significance threshold based on the percentage of total
+        responses and returns only themes that meet or exceed this threshold.
+        Excludes the root node from results.
+
+        Args:
+            significance_percentage: Percentage (0-100) of total responses
+                required for a theme to be considered significant
+
+        Returns:
+            pd.DataFrame: DataFrame containing significant theme data,
+                excluding the root node (topic_id='0')
+        """
+        total_responses = self.themes["0"].source_topic_count
+        # Convert percentage to absolute threshold
+        significance_threshold = int(total_responses * (significance_percentage / 100))
+
+        # Filter themes that meet the significance threshold
+        significant_themes = [
+            theme_node
+            for theme_node in self.themes.values()
+            if theme_node.source_topic_count >= significance_threshold
+        ]
+        # Convert significant themes to DataFrame, excluding root node
+        theme_nodes_dicts = [
+            node.model_dump() for node in significant_themes if node.topic_id != "0"
+        ]
+        return pd.DataFrame(theme_nodes_dicts)
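A sketch of driving the agent directly rather than through core.theme_clustering, for example to obtain the tree JSON for visualisation. The LLM setup is an assumption (any langchain chat model supporting with_structured_output), wired up the same way core.theme_clustering does it.

from langchain_openai import ChatOpenAI  # assumed; not prescribed by the package

from themefinder.models import HierarchicalClusteringResponse, ThemeNode
from themefinder.theme_clustering_agent import ThemeClusteringAgent

initial_themes = [
    ThemeNode(topic_id="A", topic_label="Housing costs", topic_description="Rising rents.", source_topic_count=40),
    ThemeNode(topic_id="B", topic_label="Rental affordability", topic_description="Rents outpacing wages.", source_topic_count=25),
    ThemeNode(topic_id="C", topic_label="Public transport", topic_description="Better bus routes.", source_topic_count=5),
]

llm = ChatOpenAI(model="gpt-4o")  # placeholder model name
agent = ThemeClusteringAgent(llm.with_structured_output(HierarchicalClusteringResponse), initial_themes)

all_themes_df = agent.cluster_themes(max_iterations=3, target_themes=2)  # full hierarchy as a DataFrame
significant_df = agent.select_themes(significance_percentage=10.0)       # themes covering at least 10% of responses
tree_json = agent.convert_themes_to_tree_json()                          # nested JSON rooted at the "All Topics" node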
themefinder-0.6.3/src/themefinder/prompts/detail_detection.txt

@@ -1,19 +0,0 @@
-{system_prompt}
-
-You will receive a list of RESPONSES, each containing a response_id and a response.
-Your job is to analyze each response to the QUESTION below and decide if a response contains rich evidence.
-You MUST include every response ID in the output.
-
-Evidence-rich responses contain one or more of the following:
-- Specific facts or figures that shed new light on the issue (e.g., statistics, percentages, measurements, dates)
-- Concrete examples and specific insights that could inform decision-making
-- Detailed personal or professional experiences with clear contextual information or specific incidents
-In addition to the above an evidence rich response should answer the question and provide deeper insights than an average response.
-
-For each response, determine:
-EVIDENCE_RICH - does the response contain significant evidence as defined above?
-Choose one from ['YES', 'NO']
-
-
-QUESTION: \n {question}
-RESPONSES: \n {responses}
The remaining files are renamed only by the version bump in the path and are otherwise unchanged: LICENCE, src/themefinder/prompts/consultation_system_prompt.txt, src/themefinder/prompts/sentiment_analysis.txt, src/themefinder/prompts/theme_condensation.txt, src/themefinder/prompts/theme_generation.txt, src/themefinder/prompts/theme_mapping.txt, src/themefinder/prompts/theme_target_alignment.txt, and src/themefinder/themefinder_logging.py.