tritopic 0.1.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.

Potentially problematic release.

This version of tritopic might be problematic.

tritopic/labeling.py ADDED
@@ -0,0 +1,313 @@
+ """
+ LLM-powered topic labeling for TriTopic.
+
+ Supports multiple LLM providers for generating human-readable topic labels.
+ """
+
+ from __future__ import annotations
+
+ import json
+ import re
+ from abc import ABC, abstractmethod
+ from typing import Any, Dict, List, Optional
+
+
+ class BaseLLMLabeler(ABC):
+     """Abstract base class for LLM-based topic labelers."""
+
+     def __init__(
+         self,
+         language: str = "english",
+         max_label_words: int = 5,
+         include_context: bool = True
+     ):
+         self.language = language
+         self.max_label_words = max_label_words
+         self.include_context = include_context
+
+     @abstractmethod
+     def _call_llm(self, prompt: str) -> str:
+         """Call the LLM with the given prompt."""
+         pass
+
+     def generate_label(
+         self,
+         keywords: List[str],
+         representative_docs: List[str],
+         context: Optional[str] = None
+     ) -> str:
+         """
+         Generate a human-readable label for a topic.
+
+         Parameters
+         ----------
+         keywords : List[str]
+             Top keywords for the topic.
+         representative_docs : List[str]
+             Representative document excerpts.
+         context : str, optional
+             Additional context about the corpus.
+
+         Returns
+         -------
+         str
+             Generated topic label.
+         """
+         prompt = self._build_prompt(keywords, representative_docs, context)
+         response = self._call_llm(prompt)
+         return self._parse_response(response)
+
+     def _build_prompt(
+         self,
+         keywords: List[str],
+         representative_docs: List[str],
+         context: Optional[str] = None
+     ) -> str:
+         """Build the prompt for label generation."""
+         keyword_str = ", ".join(keywords[:10])
+
+         docs_str = ""
+         for i, doc in enumerate(representative_docs[:3], 1):
+             # Truncate long documents
+             doc_preview = doc[:300] + "..." if len(doc) > 300 else doc
+             docs_str += f"\n{i}. \"{doc_preview}\""
+
+         language_instruction = ""
+         if self.language.lower() != "english":
+             language_instruction = f"\nIMPORTANT: Generate the label in {self.language}."
+
+         context_str = ""
+         if context and self.include_context:
+             context_str = f"\nCorpus context: {context}"
+
+         prompt = f"""Generate a concise, descriptive label for this topic.
+
+ Keywords: {keyword_str}
+
+ Representative documents:{docs_str}
+ {context_str}
+ {language_instruction}
+ Requirements:
+ - Maximum {self.max_label_words} words
+ - Be specific and descriptive
+ - Capture the main theme
+ - Use title case
+
+ Respond with ONLY the label, nothing else."""
+
+         return prompt
+
+     def _parse_response(self, response: str) -> str:
+         """Parse and clean the LLM response."""
+         # Clean up the response
+         label = response.strip()
+
+         # Remove quotes if present
+         label = label.strip('"\'')
+
+         # Remove any prefix like "Label:" or "Topic:"
+         for prefix in ["Label:", "Topic:", "Title:"]:
+             if label.lower().startswith(prefix.lower()):
+                 label = label[len(prefix):].strip()
+
+         # Truncate if too long
+         words = label.split()
+         if len(words) > self.max_label_words:
+             label = " ".join(words[:self.max_label_words])
+
+         return label
+
+
+ class AnthropicLabeler(BaseLLMLabeler):
+     """Topic labeler using Anthropic's Claude models."""
+
+     def __init__(
+         self,
+         api_key: str,
+         model: str = "claude-sonnet-4-20250514",
+         language: str = "english",
+         max_label_words: int = 5,
+         **kwargs
+     ):
+         super().__init__(language, max_label_words, **kwargs)
+         self.api_key = api_key
+         self.model = model
+         self._client = None
+
+     @property
+     def client(self):
+         if self._client is None:
+             try:
+                 from anthropic import Anthropic
+                 self._client = Anthropic(api_key=self.api_key)
+             except ImportError:
+                 raise ImportError(
+                     "Anthropic package required. Install with: pip install anthropic"
+                 )
+         return self._client
+
+     def _call_llm(self, prompt: str) -> str:
+         message = self.client.messages.create(
+             model=self.model,
+             max_tokens=100,
+             messages=[
+                 {"role": "user", "content": prompt}
+             ]
+         )
+         return message.content[0].text
+
+
+ class OpenAILabeler(BaseLLMLabeler):
+     """Topic labeler using OpenAI's GPT models."""
+
+     def __init__(
+         self,
+         api_key: str,
+         model: str = "gpt-4o",
+         language: str = "english",
+         max_label_words: int = 5,
+         **kwargs
+     ):
+         super().__init__(language, max_label_words, **kwargs)
+         self.api_key = api_key
+         self.model = model
+         self._client = None
+
+     @property
+     def client(self):
+         if self._client is None:
+             try:
+                 from openai import OpenAI
+                 self._client = OpenAI(api_key=self.api_key)
+             except ImportError:
+                 raise ImportError(
+                     "OpenAI package required. Install with: pip install openai"
+                 )
+         return self._client
+
+     def _call_llm(self, prompt: str) -> str:
+         response = self.client.chat.completions.create(
+             model=self.model,
+             max_tokens=100,
+             messages=[
+                 {"role": "user", "content": prompt}
+             ]
+         )
+         return response.choices[0].message.content
+
+
+ class LLMLabeler:
+     """
+     Unified interface for LLM-based topic labeling.
+
+     Supports multiple providers: Anthropic (Claude), OpenAI (GPT).
+
+     Parameters
+     ----------
+     provider : str
+         LLM provider: "anthropic" or "openai".
+     api_key : str
+         API key for the provider.
+     model : str, optional
+         Specific model to use. Defaults vary by provider.
+     language : str
+         Output language for labels.
+     max_label_words : int
+         Maximum words in generated labels.
+
+     Examples
+     --------
+     >>> labeler = LLMLabeler(
+     ...     provider="anthropic",
+     ...     api_key="your-api-key",
+     ...     language="german"
+     ... )
+     >>> model.generate_labels(labeler)
+     """
+
+     def __init__(
+         self,
+         provider: str = "anthropic",
+         api_key: Optional[str] = None,
+         model: Optional[str] = None,
+         language: str = "english",
+         max_label_words: int = 5,
+         **kwargs
+     ):
+         provider = provider.lower()
+
+         if provider == "anthropic":
+             self._labeler = AnthropicLabeler(
+                 api_key=api_key,
+                 model=model or "claude-sonnet-4-20250514",
+                 language=language,
+                 max_label_words=max_label_words,
+                 **kwargs
+             )
+         elif provider == "openai":
+             self._labeler = OpenAILabeler(
+                 api_key=api_key,
+                 model=model or "gpt-4o",
+                 language=language,
+                 max_label_words=max_label_words,
+                 **kwargs
+             )
+         else:
+             raise ValueError(f"Unknown provider: {provider}. Use 'anthropic' or 'openai'.")
+
+     def generate_label(
+         self,
+         keywords: List[str],
+         representative_docs: List[str],
+         context: Optional[str] = None
+     ) -> str:
+         """
+         Generate a label for a topic.
+
+         Parameters
+         ----------
+         keywords : List[str]
+             Topic keywords.
+         representative_docs : List[str]
+             Representative documents.
+         context : str, optional
+             Corpus context.
+
+         Returns
+         -------
+         str
+             Generated label.
+         """
+         return self._labeler.generate_label(keywords, representative_docs, context)
+
+
+ class KeywordLabeler:
+     """
+     Simple labeler that combines top keywords into a label.
+
+     No LLM required - uses keyword-based heuristics.
+     """
+
+     def __init__(
+         self,
+         n_keywords: int = 3,
+         separator: str = " & ",
+         capitalize: bool = True
+     ):
+         self.n_keywords = n_keywords
+         self.separator = separator
+         self.capitalize = capitalize
+
+     def generate_label(
+         self,
+         keywords: List[str],
+         representative_docs: Optional[List[str]] = None,
+         context: Optional[str] = None
+     ) -> str:
+         """Generate label from keywords."""
+         top_keywords = keywords[:self.n_keywords]
+
+         if self.capitalize:
+             top_keywords = [kw.title() for kw in top_keywords]
+
+         return self.separator.join(top_keywords)
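
For reference, a minimal usage sketch of the new module, based only on the classes defined in the added file. The keywords, documents, and API key below are made-up placeholders; real keywords and representative documents would come from a fitted TriTopic model, and the LLM call requires the optional anthropic dependency plus a valid key.

from tritopic.labeling import KeywordLabeler, LLMLabeler

# Hypothetical topic output (placeholder data, not from a real model).
keywords = ["battery", "charging", "range", "electric", "vehicle"]
docs = [
    "The battery drains quickly when fast charging is used on long trips.",
    "Range anxiety remains the main concern for new electric vehicle owners.",
]

# Offline fallback: joins the top keywords, no API call needed.
keyword_labeler = KeywordLabeler(n_keywords=3, separator=" & ")
print(keyword_labeler.generate_label(keywords))  # -> "Battery & Charging & Range"

# LLM-backed labeling via the unified interface ("sk-..." is a dummy key).
llm_labeler = LLMLabeler(provider="anthropic", api_key="sk-...", language="english")
label = llm_labeler.generate_label(keywords, representative_docs=docs)
print(label)

KeywordLabeler works without network access, so it is the safer default when evaluating this release; LLMLabeler only instantiates the provider client lazily, on the first generate_label call.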