themefinder 0.7.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- themefinder/__init__.py +24 -0
- themefinder/advanced_tasks/__init__.py +0 -0
- themefinder/advanced_tasks/cross_cutting_themes_agent.py +404 -0
- themefinder/advanced_tasks/theme_clustering_agent.py +356 -0
- themefinder/llm_batch_processor.py +442 -0
- themefinder/models.py +438 -0
- themefinder/prompts/agentic_theme_clustering.txt +34 -0
- themefinder/prompts/consultation_system_prompt.txt +1 -0
- themefinder/prompts/cross_cutting_identification.txt +16 -0
- themefinder/prompts/cross_cutting_mapping.txt +19 -0
- themefinder/prompts/cross_cutting_refinement.txt +15 -0
- themefinder/prompts/detail_detection.txt +31 -0
- themefinder/prompts/sentiment_analysis.txt +41 -0
- themefinder/prompts/theme_condensation.txt +34 -0
- themefinder/prompts/theme_generation.txt +38 -0
- themefinder/prompts/theme_mapping.txt +36 -0
- themefinder/prompts/theme_refinement.txt +54 -0
- themefinder/prompts/theme_target_alignment.txt +18 -0
- themefinder/tasks.py +656 -0
- themefinder/themefinder_logging.py +12 -0
- themefinder-0.7.4.dist-info/METADATA +174 -0
- themefinder-0.7.4.dist-info/RECORD +24 -0
- themefinder-0.7.4.dist-info/WHEEL +4 -0
- themefinder-0.7.4.dist-info/licenses/LICENCE +21 -0
themefinder/models.py
ADDED
|
@@ -0,0 +1,438 @@
|
|
|
1
|
+
from enum import Enum
from typing import Callable, List, Optional

from pydantic import BaseModel, Field, model_validator
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class Position(str, Enum):
    """Enum for valid position values"""

    # String-valued members: each member compares equal to its own name.
    AGREEMENT = "AGREEMENT"
    DISAGREEMENT = "DISAGREEMENT"
    UNCLEAR = "UNCLEAR"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class Stance(str, Enum):
    """Enum for valid stance values"""

    # String-valued members: each member compares equal to its own name.
    POSITIVE = "POSITIVE"
    NEGATIVE = "NEGATIVE"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class EvidenceRich(str, Enum):
    """Enum for valid evidence_rich values"""

    # String-valued members: each member compares equal to its own name.
    YES = "YES"
    NO = "NO"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class ValidatedModel(BaseModel):
    """Base model with common validation methods"""

    # Every validate_* method returns ``self`` on success and raises
    # ``ValueError`` on failure, so calls can be chained inside a
    # ``model_validator(mode="after")`` hook.

    def _require_list_field(self, field_name: str) -> list:
        """
        Return the value of ``field_name`` after checking it exists and is a list.

        Args:
            field_name: Name of the attribute to fetch from this instance

        Raises:
            ValueError: If the attribute is missing or is not a ``list``
        """
        if not hasattr(self, field_name):
            raise ValueError(f"Field '{field_name}' does not exist")
        items = getattr(self, field_name)
        if not isinstance(items, list):
            raise ValueError(f"Field '{field_name}' is not a list")
        return items

    def validate_non_empty_fields(self) -> "ValidatedModel":
        """
        Validate that all string fields are non-empty and all list fields are not empty.

        Every value in the instance ``__dict__`` is inspected: strings must
        contain non-whitespace characters, lists must have at least one item,
        and string items inside a list must themselves be non-blank. Values of
        any other type are ignored.

        Raises:
            ValueError: On the first blank string, empty list or blank list item
        """
        for field_name, value in self.__dict__.items():
            if isinstance(value, str):
                if not value.strip():
                    raise ValueError(f"{field_name} cannot be empty or only whitespace")
                continue
            if not isinstance(value, list):
                continue
            if not value:
                raise ValueError(f"{field_name} cannot be an empty list")
            for i, item in enumerate(value):
                if isinstance(item, str) and not item.strip():
                    raise ValueError(
                        f"Item {i} in {field_name} cannot be empty or only whitespace"
                    )
        return self

    def validate_unique_items(
        self, field_name: str, transform_func: Optional[Callable] = None
    ) -> "ValidatedModel":
        """
        Validate that a field contains unique values.

        Args:
            field_name: The name of the field to check for uniqueness
            transform_func: Optional function to transform items before checking uniqueness
                (e.g., lowercasing strings)

        Raises:
            ValueError: If the field is missing, is not a list, or holds duplicates
        """
        items = self._require_list_field(field_name)
        if transform_func:
            transformed_items = [transform_func(item) for item in items]
        else:
            transformed_items = items
        # A set drops duplicates, so a size mismatch means at least one repeat.
        if len(transformed_items) != len(set(transformed_items)):
            raise ValueError(f"'{field_name}' must contain unique values")
        return self

    def validate_unique_attribute_in_list(
        self, list_field: str, attr_name: str
    ) -> "ValidatedModel":
        """
        Validate that an attribute across all objects in a list field is unique.

        Args:
            list_field: The name of the list field containing objects
            attr_name: The attribute within each object to check for uniqueness

        Raises:
            ValueError: If the field is missing, is not a list, an item lacks
                the attribute, or two items share the same attribute value
        """
        items = self._require_list_field(list_field)

        attr_values = []
        for item in items:
            if not hasattr(item, attr_name):
                raise ValueError(
                    f"Item in '{list_field}' does not have attribute '{attr_name}'"
                )
            attr_values.append(getattr(item, attr_name))
        if len(attr_values) != len(set(attr_values)):
            raise ValueError(
                f"'{attr_name}' must be unique across all items in '{list_field}'"
            )
        return self

    def validate_equal_lengths(self, *field_names) -> "ValidatedModel":
        """
        Validate that multiple list fields have the same length.

        Args:
            *field_names: Variable number of field names to check for equal lengths

        Raises:
            ValueError: If any named field is missing, is not a list, or the
                lists differ in length
        """
        # With fewer than two fields there is nothing to compare.
        if len(field_names) < 2:
            return self
        lengths = {len(self._require_list_field(name)) for name in field_names}
        if len(lengths) > 1:
            raise ValueError(
                f"Fields {', '.join(field_names)} must all have the same length"
            )
        return self

    @model_validator(mode="after")
    def run_validations(self) -> "ValidatedModel":
        """
        Run common validations. Override in subclasses to add specific validations.
        """
        return self.validate_non_empty_fields()
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
class SentimentAnalysisOutput(ValidatedModel):
    """Model for sentiment analysis output"""

    # Identifier of the analysed response; constrained to be greater than 0.
    response_id: int = Field(gt=0)
    # One of the Position enum members.
    position: Position
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
class SentimentAnalysisResponses(ValidatedModel):
    """Container for all sentiment analysis responses"""

    responses: List[SentimentAnalysisOutput]

    @model_validator(mode="after")
    def run_validations(self) -> "SentimentAnalysisResponses":
        """Run the shared non-empty checks, then reject repeated response_ids."""
        self.validate_non_empty_fields()
        distinct_ids = {entry.response_id for entry in self.responses}
        # Fewer distinct ids than entries means at least one id repeats.
        if len(distinct_ids) != len(self.responses):
            raise ValueError("Response IDs must be unique")
        return self
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
class Theme(ValidatedModel):
    """Model for a single extracted theme"""

    # All three fields are required (``...`` marks them as having no default).
    topic_label: str = Field(
        ...,
        description="Short label summarizing the topic in a few words",
    )
    topic_description: str = Field(
        ...,
        description="More detailed description of the topic in 1-2 sentences",
    )
    position: Position = Field(
        ...,
        description="SENTIMENT ABOUT THIS TOPIC (AGREEMENT, DISAGREEMENT, OR UNCLEAR)",
    )
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
class ThemeGenerationResponses(ValidatedModel):
    """Container for all extracted themes"""

    responses: List[Theme] = Field(..., description="List of extracted themes")

    @model_validator(mode="after")
    def run_validations(self) -> "ThemeGenerationResponses":
        """Run the shared non-empty checks, then reject repeated topic labels."""
        self.validate_non_empty_fields()
        # Labels are compared case-insensitively with outer whitespace removed.
        normalised = {item.topic_label.lower().strip() for item in self.responses}
        if len(normalised) != len(self.responses):
            raise ValueError("Duplicate topic labels detected")
        return self
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
class CondensedTheme(ValidatedModel):
    """Model for a single condensed theme"""

    # All three fields are required; the count must be greater than 0.
    topic_label: str = Field(
        ...,
        description="Representative label for the condensed topic",
    )
    topic_description: str = Field(
        ...,
        description="Concise description incorporating key insights from constituent topics",
    )
    source_topic_count: int = Field(
        ...,
        gt=0,
        description="Sum of source_topic_counts from combined topics",
    )
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
class ThemeCondensationResponses(ValidatedModel):
    """Container for all condensed themes"""

    responses: List[CondensedTheme] = Field(..., description="List of condensed themes")

    @model_validator(mode="after")
    def run_validations(self) -> "ThemeCondensationResponses":
        """Run the shared non-empty checks, then reject repeated topic labels."""
        self.validate_non_empty_fields()
        # Labels are compared case-insensitively with outer whitespace removed.
        normalised = {item.topic_label.lower().strip() for item in self.responses}
        if len(normalised) != len(self.responses):
            raise ValueError("Duplicate topic labels detected")
        return self
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
class RefinedTheme(ValidatedModel):
    """Model for a single refined theme"""

    topic: str = Field(
        ...,
        description="Topic label and description combined with a colon separator",
    )
    source_topic_count: int = Field(
        ...,
        gt=0,
        description="Count of source topics combined",
    )

    @model_validator(mode="after")
    def run_validations(self) -> "RefinedTheme":
        """Apply the shared non-empty checks followed by the topic format check."""
        self.validate_non_empty_fields()
        self.validate_topic_format()
        return self

    def validate_topic_format(self) -> "RefinedTheme":
        """
        Check that ``topic`` has the form ``<label>:<description>``.

        The text is split at the first colon. Both halves must contain
        non-whitespace characters, and the label may hold at most 10
        whitespace-separated words.

        Raises:
            ValueError: If the colon is missing, either half is blank, or the
                label has more than 10 words
        """
        label, separator, description = self.topic.partition(":")
        if not separator:
            raise ValueError(
                "Topic must contain a label and description separated by a colon"
            )

        if not (label.strip() and description.strip()):
            raise ValueError("Both label and description must be non-empty")

        # NOTE(review): the message says "under 10" but exactly 10 words pass.
        word_count = len(label.split())
        if word_count > 10:
            raise ValueError(f"Topic label must be under 10 words (found {word_count})")

        return self
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
class ThemeRefinementResponses(ValidatedModel):
    """Container for all refined themes"""

    responses: List[RefinedTheme] = Field(..., description="List of refined themes")

    @model_validator(mode="after")
    def run_validations(self) -> "ThemeRefinementResponses":
        """Run the shared non-empty checks, then reject repeated topics."""
        self.validate_non_empty_fields()
        # Topics are compared case-insensitively with outer whitespace removed.
        normalised = {item.topic.lower().strip() for item in self.responses}
        if len(normalised) != len(self.responses):
            raise ValueError("Duplicate topics detected")

        return self
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
class ThemeMappingOutput(ValidatedModel):
    """Model for theme mapping output"""

    response_id: int = Field(description="Response ID, must be greater than 0", gt=0)
    labels: List[str] = Field(..., description="List of theme labels")

    @model_validator(mode="after")
    def run_validations(self) -> "ThemeMappingOutput":
        """
        Apply the shared non-empty checks, then require ``labels`` to be unique.
        """
        # Both helpers return ``self`` on success, so the calls can be chained.
        self.validate_non_empty_fields().validate_unique_items("labels")
        return self
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
class ThemeMappingResponses(ValidatedModel):
    """Container for all theme mapping responses"""

    responses: List[ThemeMappingOutput] = Field(
        ...,
        description="List of theme mapping outputs",
    )

    @model_validator(mode="after")
    def run_validations(self) -> "ThemeMappingResponses":
        """
        Run the shared non-empty checks, then reject repeated response_ids.
        """
        self.validate_non_empty_fields()
        distinct_ids = {entry.response_id for entry in self.responses}
        # Fewer distinct ids than entries means at least one id repeats.
        if len(distinct_ids) != len(self.responses):
            raise ValueError("Response IDs must be unique")
        return self
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
class DetailDetectionOutput(ValidatedModel):
    """Model for detail detection output"""

    response_id: int = Field(description="Response ID, must be greater than 0", gt=0)
    # Required; one of the EvidenceRich enum members.
    evidence_rich: EvidenceRich = Field(
        ...,
        description="Whether the response is evidence-rich (YES or NO)",
    )
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
class DetailDetectionResponses(ValidatedModel):
    """Container for all detail detection responses"""

    responses: List[DetailDetectionOutput] = Field(
        ...,
        description="List of detail detection outputs",
    )

    @model_validator(mode="after")
    def run_validations(self) -> "DetailDetectionResponses":
        """
        Run the shared non-empty checks, then reject repeated response_ids.
        """
        self.validate_non_empty_fields()
        distinct_ids = {entry.response_id for entry in self.responses}
        # Fewer distinct ids than entries means at least one id repeats.
        if len(distinct_ids) != len(self.responses):
            raise ValueError("Response IDs must be unique")
        return self
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
class ThemeNode(ValidatedModel):
    """Model for topic nodes created during hierarchical clustering"""

    topic_id: str = Field(
        ...,
        description="Short alphabetic ID (e.g. 'A', 'B', 'C') - iteration prefix will be added automatically",
    )
    topic_label: str = Field(
        ...,
        description="4-5 word label encompassing merged child topics",
    )
    topic_description: str = Field(
        ...,
        description="1-2 sentences combining key aspects of child topics",
    )
    source_topic_count: int = Field(description="Sum of all child topic counts", gt=0)
    parent_id: Optional[str] = Field(
        default=None,
        description="Internal field: ID of parent topic node, managed by clustering agent, not set by LLM",
    )
    children: List[str] = Field(
        default_factory=list,
        description="List of topic_ids of merged child topics",
    )

    @model_validator(mode="after")
    def run_validations(self) -> "ThemeNode":
        """
        Check the ``children`` list of this node.

        A node with no children is accepted as-is. A node that does list
        children must list at least two, with no id repeated. The shared
        non-empty field check is not invoked by this override.

        Raises:
            ValueError: If exactly one child is listed or a child id repeats
        """
        child_ids = self.children
        if not child_ids:
            return self

        # Each parent must have at least 2 children
        if len(child_ids) < 2:
            raise ValueError("Each topic node must have at least 2 children")

        # Validate children are unique
        if len(set(child_ids)) != len(child_ids):
            raise ValueError("Child topic IDs must be unique")

        return self
|
|
367
|
+
|
|
368
|
+
|
|
369
|
+
class HierarchicalClusteringResponse(ValidatedModel):
    """Model for hierarchical clustering agent response"""

    parent_themes: List[ThemeNode] = Field(
        default=[],
        description="List of parent themes created by merging similar themes",
    )
    should_terminate: bool = Field(
        ...,
        description="True if no more meaningful clustering is possible, false otherwise",
    )

    @model_validator(mode="after")
    def run_validations(self) -> "HierarchicalClusteringResponse":
        """
        Validate clustering response constraints.

        An empty ``parent_themes`` list is accepted: it is the declared default
        and the clustering prompt asks for it when no topics should be merged.

        Raises:
            ValueError: If the same child id is listed under more than one
                parent theme
        """
        # validate_non_empty_fields() is deliberately not called here. It
        # rejects empty lists, and on this model (one list of ThemeNode, one
        # bool) its only effect would be to forbid the valid "nothing merged"
        # response.

        # Validate that no child appears in multiple parents
        all_children = [
            child_id for parent in self.parent_themes for child_id in parent.children
        ]
        if len(all_children) != len(set(all_children)):
            raise ValueError("Each child theme can have at most one parent")

        return self
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
# Cross-Cutting Theme Identification Models
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
class CrossCuttingThemeDefinition(BaseModel):
    """Model for a high-level cross-cutting theme."""

    # Both fields are required; this model derives from BaseModel directly,
    # so no custom validators from this module run on it.
    name: str = Field(
        ...,
        description="Short, descriptive name for the cross-cutting theme (3-7 words)",
    )
    description: str = Field(
        ...,
        description="2-sentence description of what this cross-cutting theme represents",
    )
|
|
411
|
+
|
|
412
|
+
|
|
413
|
+
class CrossCuttingThemeIdentificationResponse(BaseModel):
    """Response model for identifying cross-cutting themes."""

    # Optional: defaults to an empty list when omitted.
    themes: List[CrossCuttingThemeDefinition] = Field(
        default=[],
        description="List of identified cross-cutting themes",
    )
|
|
419
|
+
|
|
420
|
+
|
|
421
|
+
class CrossCuttingThemeMapping(BaseModel):
    """Model for mapping individual themes to a cross-cutting theme."""

    # Both fields are required.
    theme_name: str = Field(
        ...,
        description="Name of the cross-cutting theme this theme belongs to",
    )
    theme_ids: List[str] = Field(
        ...,
        description="List of theme IDs that belong to this cross-cutting theme (e.g., ['A', 'B', 'C'])",
    )
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
class CrossCuttingThemeMappingResponse(BaseModel):
    """Response model for mapping question themes to cross-cutting themes."""

    # Optional: defaults to an empty list when omitted.
    mappings: List[CrossCuttingThemeMapping] = Field(
        default=[],
        description="List of cross-cutting theme mappings for this question",
    )
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
{system_prompt}
|
|
2
|
+
|
|
3
|
+
Analyze these topics and identify which ones should be merged based on semantic similarity.
|
|
4
|
+
Your goal is to significantly reduce the number of topics by creating meaningful parent topics.
|
|
5
|
+
Be aggressive in finding opportunities to merge topics that share any semantic relationship.
|
|
6
|
+
|
|
7
|
+
TOPICS:
|
|
8
|
+
{themes_json}
|
|
9
|
+
|
|
10
|
+
For each group of similar topics that should be merged, create a new parent topic.
|
|
11
|
+
|
|
12
|
+
Guidelines:
|
|
13
|
+
- Each parent topic must have at least 2 children, it can have more than 2 if appropriate
|
|
14
|
+
- Each child topic can have at most 1 parent
|
|
15
|
+
- topic_id should be a simple alphabetic ID (e.g. 'A', 'B', 'C') - the iteration prefix will be added automatically
|
|
16
|
+
- Be creative and look for higher-level abstractions that can combine seemingly different topics
|
|
17
|
+
- When creating parent topics, follow these naming rules:
|
|
18
|
+
* The label should read naturally as a single coherent topic
|
|
19
|
+
* Choose labels that can encompass broader categories of topics
|
|
20
|
+
* If merging different topics, the topic with the higher source_topic_count should dominate the label
|
|
21
|
+
* Never combine different topics with "and" or "/" in the label
|
|
22
|
+
- topic_description must be 1 or 2 sentences that:
|
|
23
|
+
* preserves key information from the child topics
|
|
24
|
+
- source_topic_count must be the sum of all child topic counts
|
|
25
|
+
- children must be a list of valid topic_ids from the input
|
|
26
|
+
- should_terminate should only be true if ALL of these conditions are met:
|
|
27
|
+
* There are fewer than {target_themes} active topics remaining
|
|
28
|
+
* The remaining topics are fundamentally incompatible semantically
|
|
29
|
+
* Any further merging would create meaninglessly broad categories
|
|
30
|
+
|
|
31
|
+
If no topics should be merged in this iteration but future iterations might still yield meaningful merges, set should_terminate to false with an empty parent_themes list.
|
|
32
|
+
If no topics should be merged and the termination conditions are met, set should_terminate to true with an empty parent_themes list.
|
|
33
|
+
|
|
34
|
+
N.B. Under no circumstances should you create a parent theme with a single child. You do not need to return all of the original themes, if they don't belong to a newly created parent feel free to omit them.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
You are an AI evaluation tool analyzing responses to a UK Government public consultation.
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
You are helping in the analysis of a consultation (a survey on a proposed policy).
|
|
2
|
+
|
|
3
|
+
We have already run a topic modelling algorithm on individual questions in the consultation.
|
|
4
|
+
You are going to be shown questions in the consultation and the themes that were extracted from them by the model.
|
|
5
|
+
Your job is to come up with {n_concepts} high level concepts that unify a large number of the themes across the consultation.
|
|
6
|
+
Not every question theme has to belong to one of these concepts but the concepts you identify should capture multiple themes across multiple questions.
|
|
7
|
+
|
|
8
|
+
These high-level concepts should be the kind of takeaways that policy makers should consider when implementing a policy related to this consultation.
|
|
9
|
+
|
|
10
|
+
IMPORTANT: Each cross-cutting concept should be distinct and non-overlapping. When designing these concepts, ensure that themes would naturally belong to only one concept, not multiple. Avoid creating concepts that could capture the same themes.
|
|
11
|
+
|
|
12
|
+
You should output a short name of each concept and a 2 sentence description of the concept.
|
|
13
|
+
|
|
14
|
+
Questions and topics:
|
|
15
|
+
|
|
16
|
+
{questions_and_themes}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
You are helping to analyse themes extracted in responses to a government consultation.
|
|
2
|
+
You have identified cross-cutting themes that capture topics raised across different questions.
|
|
3
|
+
You are now looking at themes from a specific question and need to determine which themes
|
|
4
|
+
belong to which cross-cutting themes.
|
|
5
|
+
|
|
6
|
+
IMPORTANT CONSTRAINTS:
|
|
7
|
+
1. Only assign themes to cross-cutting themes if there is a strong semantic match.
|
|
8
|
+
2. Not every theme needs to be assigned to a cross-cutting theme.
|
|
9
|
+
3. Each theme can be assigned to AT MOST ONE cross-cutting theme - never assign the same theme to multiple cross-cutting themes.
|
|
10
|
+
|
|
11
|
+
For each cross-cutting theme that has matching themes, provide:
|
|
12
|
+
- theme_name: The exact name of the cross-cutting theme (must match one from the list)
|
|
13
|
+
- theme_ids: List of theme IDs that belong to this cross-cutting theme (e.g., ["A", "B"])
|
|
14
|
+
|
|
15
|
+
Question themes:
|
|
16
|
+
{question_input}
|
|
17
|
+
|
|
18
|
+
Cross-cutting themes:
|
|
19
|
+
{concepts_text}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
You are going to be given a number of themes extracted from survey responses to a range of questions.
|
|
2
|
+
Each theme will be presented alongside the question it was raised in.
|
|
3
|
+
All of the themes were grouped together under the following parent theme:
|
|
4
|
+
{concept_name}
|
|
5
|
+
|
|
6
|
+
Your task is to read each of the themes and generate a brief 2-3 sentence synthesis of what these themes represent collectively.
|
|
7
|
+
Capture interesting nuances and details from each of the themes in the list.
|
|
8
|
+
Produce a concise, insightful summary rather than a lengthy description that simply combines all original theme descriptions.
|
|
9
|
+
Focus on insights that would be valuable for policy makers reviewing this survey and implementing changes.
|
|
10
|
+
|
|
11
|
+
Return only the description text without any preamble.
|
|
12
|
+
Do not begin the description with the name of the parent theme.
|
|
13
|
+
|
|
14
|
+
QUESTIONS AND THEMES:
|
|
15
|
+
{theme_lines}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
{system_prompt}
|
|
2
|
+
|
|
3
|
+
You will receive a list of RESPONSES, each containing a response_id and a response.
|
|
4
|
+
Your job is to analyze each response to the QUESTION below and decide if a response contains rich evidence.
|
|
5
|
+
You MUST include every response ID in the output.
|
|
6
|
+
|
|
7
|
+
A response is evidence-rich only if it satisfies both of the following:
|
|
8
|
+
|
|
9
|
+
Relevance and depth:
|
|
10
|
+
- It clearly answers the question
|
|
11
|
+
- AND provides insights that go beyond generic opinion, such as nuanced reasoning, contextual explanation, or argumentation that could inform decision-making
|
|
12
|
+
|
|
13
|
+
Substantive evidence, including at least one of:
|
|
14
|
+
- Specific, verifiable facts or data (e.g., statistics, dates, named reports or studies)
|
|
15
|
+
- Concrete, illustrative examples that clearly support a broader claim
|
|
16
|
+
- Detailed personal or professional experiences that include contextual information (e.g., roles, locations, timelines)
|
|
17
|
+
|
|
18
|
+
Do NOT classify a response as evidence-rich if it:
|
|
19
|
+
- Uses vague or general language with no supporting detail
|
|
20
|
+
- Restates commonly known points without adding new information
|
|
21
|
+
- Shares personal anecdotes without sufficient context or a clear takeaway
|
|
22
|
+
|
|
23
|
+
Before answering, ask: Would this response provide useful input to someone drafting policy, beyond what is already commonly known or expected?
|
|
24
|
+
|
|
25
|
+
For each response, determine:
|
|
26
|
+
EVIDENCE_RICH - does the response contain significant evidence as defined above?
|
|
27
|
+
Choose one from ['YES', 'NO']
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
QUESTION: \n {question}
|
|
31
|
+
RESPONSES: \n {responses}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
{system_prompt}
|
|
2
|
+
|
|
3
|
+
You will receive a list of RESPONSES, each containing a response_id and a response.
|
|
4
|
+
Your job is to analyze each response to the QUESTION below and decide:
|
|
5
|
+
|
|
6
|
+
POSITION - is the response AGREEING or DISAGREEING or is it UNCLEAR about the change being proposed in the question.
|
|
7
|
+
Choose one from [AGREEMENT, DISAGREEMENT, UNCLEAR]
|
|
8
|
+
|
|
9
|
+
You MUST include every response ID in the output.
|
|
10
|
+
If the response cannot be labelled, return empty sections where appropriate, but you MUST return an entry
|
|
11
|
+
with the correct response ID for each input object
|
|
12
|
+
|
|
13
|
+
You MUST pick one of the given POSITION values.
|
|
14
|
+
You MUST NOT return an empty value for the POSITION of a response.
|
|
15
|
+
|
|
16
|
+
## EXAMPLE
|
|
17
|
+
Example 1:
|
|
18
|
+
Question: \n What are your thoughts on the proposed government changes to the policy about reducing school holidays?
|
|
19
|
+
Response: \n as a parent I have no idea why you would make this change. I guess you were thinking about increasing productivity but any productivity gains would be totally offset by the decrease in family time. \n
|
|
20
|
+
|
|
21
|
+
Output:
|
|
22
|
+
POSITION: DISAGREEMENT
|
|
23
|
+
|
|
24
|
+
Example 2:
|
|
25
|
+
Question: \n What are your thoughts on the proposed government changes to the policy about reducing school holidays?
|
|
26
|
+
Response: \n I think this is a great idea, our children will learn more if they are in school more \n
|
|
27
|
+
|
|
28
|
+
Output:
|
|
29
|
+
POSITION: AGREEMENT
|
|
30
|
+
|
|
31
|
+
Example 3:
|
|
32
|
+
Question: \n What are your thoughts on the proposed government changes to the policy about reducing school holidays?
|
|
33
|
+
Response: \n it will be good for our children to be around their friends more but it will be hard for some parents spend
|
|
34
|
+
less time with their children \n
|
|
35
|
+
|
|
36
|
+
Output:
|
|
37
|
+
POSITION: UNCLEAR
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
QUESTION: \n {question}
|
|
41
|
+
RESPONSES: \n {responses}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
{system_prompt}
|
|
2
|
+
|
|
3
|
+
Below is a question and a list of topics extracted from answers to that question.
|
|
4
|
+
|
|
5
|
+
This list contains a large number of duplicate and redundant topics that present the same concept with different phrasing.
|
|
6
|
+
|
|
7
|
+
Each topic has a topic_label, topic_description, and may have a source_topic_count field indicating how many original topics it represents.
|
|
8
|
+
|
|
9
|
+
Your task is to analyze these topics and produce a refined list that:
|
|
10
|
+
1. Significantly reduces the total number of topics
|
|
11
|
+
2. Identifies and preserves core themes that appear frequently
|
|
12
|
+
3. Combines redundant topics
|
|
13
|
+
4. Tracks the total number of original topics combined into each new topic
|
|
14
|
+
|
|
15
|
+
Guidelines for Topic Analysis:
|
|
16
|
+
- Begin by identifying distinct concept clusters in the topics
|
|
17
|
+
- Consider the context of the question when determining topic relevance
|
|
18
|
+
- Look for complementary perspectives that could enrich understanding of the same core concept
|
|
19
|
+
- Consider the key ideas behind themes when merging, don't simply focus on the words used in the label and description
|
|
20
|
+
- When combining topics:
|
|
21
|
+
* For topics without a source_topic_count field, assume count = 1
|
|
22
|
+
* For topics with source_topic_count, use their existing count
|
|
23
|
+
* The new topic's count should be the sum of all combined topics' counts
|
|
24
|
+
|
|
25
|
+
For each topic in your output:
|
|
26
|
+
1. Choose a clear, representative label that captures the essence of the combined or preserved topic
|
|
27
|
+
2. Write a concise description that incorporates key insights from all constituent topics, this should only be a single sentence
|
|
28
|
+
3. Include the total count of original topics combined by summing the source_topic_counts of merged topics (or 1 for topics without a count)
|
|
29
|
+
|
|
30
|
+
QUESTION:
|
|
31
|
+
{question}
|
|
32
|
+
|
|
33
|
+
TOPICS:
|
|
34
|
+
{responses}
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
{system_prompt}
|
|
2
|
+
|
|
3
|
+
Below is a question and a list of responses to that question.
|
|
4
|
+
|
|
5
|
+
Your task is to analyze the RESPONSES below and extract TOPICS such that:
|
|
6
|
+
1. Each topic summarizes a point of view expressed in the responses
|
|
7
|
+
2. Every distinct and relevant point of view in the responses should be captured by a topic
|
|
8
|
+
3. Each topic has a topic_label which summarizes the topic in a few words
|
|
9
|
+
4. Each topic has a topic_description which gives more detail about the topic in one or two sentences
|
|
10
|
+
5. The position field should just be the sentiment stated, and is either "AGREEMENT" or "DISAGREEMENT" or "UNCLEAR"
|
|
11
|
+
6. There should be no duplicate topics
|
|
12
|
+
|
|
13
|
+
The topics identified will be used by policy makers to understand what the public like and don't like about the proposals.
|
|
14
|
+
|
|
15
|
+
Here is an example of how to extract topics from some responses:
|
|
16
|
+
|
|
17
|
+
## EXAMPLE
|
|
18
|
+
|
|
19
|
+
QUESTION
|
|
20
|
+
What are your views on the proposed change by the government to introduce a 2% tax on fast food meat products.
|
|
21
|
+
|
|
22
|
+
RESPONSES
|
|
23
|
+
[
|
|
24
|
+
{{"response": "I wish the government would stop interfering in the lves of its citizens. It only ever makes things worse. This change will just cost us all more money, and especially poorer people", "position": "disagreement"}},
|
|
25
|
+
{{"response": "Even though it will make people eat more healthier, I beleibe the government should interfer less and not more!", "position": "disagreement"}},
|
|
26
|
+
{{"response": "I hate grapes", "position": "disagreement"}},
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
EXAMPLE OUTPUT (showing the structure)
|
|
30
|
+
- Topic 1: Government overreach (The proposals would result in government interfering too much with citizen's lives) - DISAGREEMENT
|
|
31
|
+
- Topic 2: Regressive change (The change would have a larger negative impact on poorer people) - DISAGREEMENT
|
|
32
|
+
- Topic 3: Health (The change would result in people eating healthier diets) - DISAGREEMENT
|
|
33
|
+
|
|
34
|
+
QUESTION:
|
|
35
|
+
{question}
|
|
36
|
+
|
|
37
|
+
RESPONSES:
|
|
38
|
+
{responses}
|