tritopic-0.1.0-py3-none-any.whl → tritopic-1.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tritopic might be problematic.
- tritopic/__init__.py +22 -32
- tritopic/config.py +289 -0
- tritopic/core/__init__.py +0 -17
- tritopic/core/clustering.py +229 -243
- tritopic/core/embeddings.py +151 -157
- tritopic/core/graph.py +435 -0
- tritopic/core/keywords.py +213 -249
- tritopic/core/refinement.py +231 -0
- tritopic/core/representatives.py +560 -0
- tritopic/labeling.py +313 -0
- tritopic/model.py +718 -0
- tritopic/multilingual/__init__.py +38 -0
- tritopic/multilingual/detection.py +208 -0
- tritopic/multilingual/stopwords.py +467 -0
- tritopic/multilingual/tokenizers.py +275 -0
- tritopic/visualization.py +371 -0
- {tritopic-0.1.0.dist-info → tritopic-1.1.0.dist-info}/METADATA +91 -51
- tritopic-1.1.0.dist-info/RECORD +20 -0
- tritopic/core/graph_builder.py +0 -493
- tritopic/core/model.py +0 -810
- tritopic/labeling/__init__.py +0 -5
- tritopic/labeling/llm_labeler.py +0 -279
- tritopic/utils/__init__.py +0 -13
- tritopic/utils/metrics.py +0 -254
- tritopic/visualization/__init__.py +0 -5
- tritopic/visualization/plotter.py +0 -523
- tritopic-0.1.0.dist-info/RECORD +0 -18
- tritopic-0.1.0.dist-info/licenses/LICENSE +0 -21
- {tritopic-0.1.0.dist-info → tritopic-1.1.0.dist-info}/WHEEL +0 -0
- {tritopic-0.1.0.dist-info → tritopic-1.1.0.dist-info}/top_level.txt +0 -0
tritopic/labeling.py
ADDED
@@ -0,0 +1,313 @@
"""
LLM-powered topic labeling for TriTopic.

Supports multiple LLM providers for generating human-readable topic labels.
"""

from __future__ import annotations

import json
import re
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional


class BaseLLMLabeler(ABC):
    """Abstract base class for LLM-based topic labelers."""

    def __init__(
        self,
        language: str = "english",
        max_label_words: int = 5,
        include_context: bool = True
    ):
        self.language = language
        self.max_label_words = max_label_words
        self.include_context = include_context

    @abstractmethod
    def _call_llm(self, prompt: str) -> str:
        """Call the LLM with the given prompt."""
        pass

    def generate_label(
        self,
        keywords: List[str],
        representative_docs: List[str],
        context: Optional[str] = None
    ) -> str:
        """
        Generate a human-readable label for a topic.

        Parameters
        ----------
        keywords : List[str]
            Top keywords for the topic.
        representative_docs : List[str]
            Representative document excerpts.
        context : str, optional
            Additional context about the corpus.

        Returns
        -------
        str
            Generated topic label.
        """
        prompt = self._build_prompt(keywords, representative_docs, context)
        response = self._call_llm(prompt)
        return self._parse_response(response)

    def _build_prompt(
        self,
        keywords: List[str],
        representative_docs: List[str],
        context: Optional[str] = None
    ) -> str:
        """Build the prompt for label generation."""
        keyword_str = ", ".join(keywords[:10])

        docs_str = ""
        for i, doc in enumerate(representative_docs[:3], 1):
            # Truncate long documents
            doc_preview = doc[:300] + "..." if len(doc) > 300 else doc
            docs_str += f"\n{i}. \"{doc_preview}\""

        language_instruction = ""
        if self.language.lower() != "english":
            language_instruction = f"\nIMPORTANT: Generate the label in {self.language}."

        context_str = ""
        if context and self.include_context:
            context_str = f"\nCorpus context: {context}"

        prompt = f"""Generate a concise, descriptive label for this topic.

Keywords: {keyword_str}

Representative documents:{docs_str}
{context_str}
{language_instruction}
Requirements:
- Maximum {self.max_label_words} words
- Be specific and descriptive
- Capture the main theme
- Use title case

Respond with ONLY the label, nothing else."""

        return prompt

    def _parse_response(self, response: str) -> str:
        """Parse and clean the LLM response."""
        # Clean up the response
        label = response.strip()

        # Remove quotes if present
        label = label.strip('"\'')

        # Remove any prefix like "Label:" or "Topic:"
        for prefix in ["Label:", "Topic:", "Title:"]:
            if label.lower().startswith(prefix.lower()):
                label = label[len(prefix):].strip()

        # Truncate if too long
        words = label.split()
        if len(words) > self.max_label_words:
            label = " ".join(words[:self.max_label_words])

        return label


class AnthropicLabeler(BaseLLMLabeler):
    """Topic labeler using Anthropic's Claude models."""

    def __init__(
        self,
        api_key: str,
        model: str = "claude-sonnet-4-20250514",
        language: str = "english",
        max_label_words: int = 5,
        **kwargs
    ):
        super().__init__(language, max_label_words, **kwargs)
        self.api_key = api_key
        self.model = model
        self._client = None

    @property
    def client(self):
        if self._client is None:
            try:
                from anthropic import Anthropic
                self._client = Anthropic(api_key=self.api_key)
            except ImportError:
                raise ImportError(
                    "Anthropic package required. Install with: pip install anthropic"
                )
        return self._client

    def _call_llm(self, prompt: str) -> str:
        message = self.client.messages.create(
            model=self.model,
            max_tokens=100,
            messages=[
                {"role": "user", "content": prompt}
            ]
        )
        return message.content[0].text


class OpenAILabeler(BaseLLMLabeler):
    """Topic labeler using OpenAI's GPT models."""

    def __init__(
        self,
        api_key: str,
        model: str = "gpt-4o",
        language: str = "english",
        max_label_words: int = 5,
        **kwargs
    ):
        super().__init__(language, max_label_words, **kwargs)
        self.api_key = api_key
        self.model = model
        self._client = None

    @property
    def client(self):
        if self._client is None:
            try:
                from openai import OpenAI
                self._client = OpenAI(api_key=self.api_key)
            except ImportError:
                raise ImportError(
                    "OpenAI package required. Install with: pip install openai"
                )
        return self._client

    def _call_llm(self, prompt: str) -> str:
        response = self.client.chat.completions.create(
            model=self.model,
            max_tokens=100,
            messages=[
                {"role": "user", "content": prompt}
            ]
        )
        return response.choices[0].message.content


class LLMLabeler:
    """
    Unified interface for LLM-based topic labeling.

    Supports multiple providers: Anthropic (Claude), OpenAI (GPT).

    Parameters
    ----------
    provider : str
        LLM provider: "anthropic" or "openai".
    api_key : str
        API key for the provider.
    model : str, optional
        Specific model to use. Defaults vary by provider.
    language : str
        Output language for labels.
    max_label_words : int
        Maximum words in generated labels.

    Examples
    --------
    >>> labeler = LLMLabeler(
    ...     provider="anthropic",
    ...     api_key="your-api-key",
    ...     language="german"
    ... )
    >>> model.generate_labels(labeler)
    """

    def __init__(
        self,
        provider: str = "anthropic",
        api_key: Optional[str] = None,
        model: Optional[str] = None,
        language: str = "english",
        max_label_words: int = 5,
        **kwargs
    ):
        provider = provider.lower()

        if provider == "anthropic":
            self._labeler = AnthropicLabeler(
                api_key=api_key,
                model=model or "claude-sonnet-4-20250514",
                language=language,
                max_label_words=max_label_words,
                **kwargs
            )
        elif provider == "openai":
            self._labeler = OpenAILabeler(
                api_key=api_key,
                model=model or "gpt-4o",
                language=language,
                max_label_words=max_label_words,
                **kwargs
            )
        else:
            raise ValueError(f"Unknown provider: {provider}. Use 'anthropic' or 'openai'.")

    def generate_label(
        self,
        keywords: List[str],
        representative_docs: List[str],
        context: Optional[str] = None
    ) -> str:
        """
        Generate a label for a topic.

        Parameters
        ----------
        keywords : List[str]
            Topic keywords.
        representative_docs : List[str]
            Representative documents.
        context : str, optional
            Corpus context.

        Returns
        -------
        str
            Generated label.
        """
        return self._labeler.generate_label(keywords, representative_docs, context)


class KeywordLabeler:
    """
    Simple labeler that combines top keywords into a label.

    No LLM required - uses keyword-based heuristics.
    """

    def __init__(
        self,
        n_keywords: int = 3,
        separator: str = " & ",
        capitalize: bool = True
    ):
        self.n_keywords = n_keywords
        self.separator = separator
        self.capitalize = capitalize

    def generate_label(
        self,
        keywords: List[str],
        representative_docs: Optional[List[str]] = None,
        context: Optional[str] = None
    ) -> str:
        """Generate label from keywords."""
        top_keywords = keywords[:self.n_keywords]

        if self.capitalize:
            top_keywords = [kw.title() for kw in top_keywords]

        return self.separator.join(top_keywords)