sdg-hub 0.3.1__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff compares publicly released versions of the package as published to a supported registry, and is provided for informational purposes only.
Files changed (40)
  1. sdg_hub/__init__.py +0 -2
  2. sdg_hub/_version.py +2 -2
  3. sdg_hub/core/__init__.py +1 -2
  4. sdg_hub/core/blocks/__init__.py +2 -4
  5. sdg_hub/core/blocks/base.py +61 -6
  6. sdg_hub/core/blocks/filtering/column_value_filter.py +3 -2
  7. sdg_hub/core/blocks/llm/__init__.py +2 -4
  8. sdg_hub/core/blocks/llm/llm_chat_block.py +251 -265
  9. sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +216 -98
  10. sdg_hub/core/blocks/llm/llm_parser_block.py +320 -0
  11. sdg_hub/core/blocks/llm/text_parser_block.py +53 -152
  12. sdg_hub/core/flow/__init__.py +3 -4
  13. sdg_hub/core/flow/base.py +11 -73
  14. sdg_hub/core/flow/metadata.py +1 -68
  15. sdg_hub/core/flow/registry.py +0 -1
  16. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml +51 -12
  17. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/__init__.py +0 -0
  18. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/flow.yaml +158 -0
  19. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml +51 -12
  20. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml +14 -3
  21. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +147 -28
  22. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/README.md +0 -0
  23. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/__init__.py +0 -0
  24. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/atomic_facts_ja.yaml +41 -0
  25. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/detailed_summary_ja.yaml +14 -0
  26. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/extractive_summary_ja.yaml +14 -0
  27. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml +303 -0
  28. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/generate_questions_responses_ja.yaml +55 -0
  29. sdg_hub/flows/text_analysis/structured_insights/flow.yaml +28 -5
  30. {sdg_hub-0.3.1.dist-info → sdg_hub-0.4.1.dist-info}/METADATA +2 -1
  31. {sdg_hub-0.3.1.dist-info → sdg_hub-0.4.1.dist-info}/RECORD +34 -30
  32. sdg_hub/core/blocks/evaluation/__init__.py +0 -9
  33. sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +0 -323
  34. sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +0 -323
  35. sdg_hub/core/blocks/evaluation/verify_question_block.py +0 -329
  36. sdg_hub/core/blocks/llm/client_manager.py +0 -472
  37. sdg_hub/core/blocks/llm/config.py +0 -337
  38. {sdg_hub-0.3.1.dist-info → sdg_hub-0.4.1.dist-info}/WHEEL +0 -0
  39. {sdg_hub-0.3.1.dist-info → sdg_hub-0.4.1.dist-info}/licenses/LICENSE +0 -0
  40. {sdg_hub-0.3.1.dist-info → sdg_hub-0.4.1.dist-info}/top_level.txt +0 -0
sdg_hub/core/blocks/llm/config.py (deleted)
@@ -1,337 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-"""Configuration system for LLM blocks supporting all providers via LiteLLM."""
-
-# Standard
-from dataclasses import dataclass
-from typing import Any, Optional, Union
-import os
-
-
-@dataclass
-class LLMConfig:
-    """Configuration for LLM blocks supporting all providers via LiteLLM.
-
-    This configuration supports 100+ LLM providers including OpenAI, Anthropic,
-    Google, local models (vLLM, Ollama), and more through LiteLLM.
-
-    Parameters
-    ----------
-    model : Optional[str], optional
-        Model identifier in LiteLLM format. Can be None initially and set later via set_model_config(). Examples:
-        - "openai/gpt-4"
-        - "anthropic/claude-3-sonnet-20240229"
-        - "hosted_vllm/meta-llama/Llama-2-7b-chat-hf"
-        - "ollama/llama2"
-
-    api_key : Optional[str], optional
-        API key for the provider. Falls back to environment variables:
-        - OPENAI_API_KEY for OpenAI models
-        - ANTHROPIC_API_KEY for Anthropic models
-        - GOOGLE_API_KEY for Google models
-        - etc.
-
-    api_base : Optional[str], optional
-        Base URL for the API. Required for local models.
-
-        Examples
-        --------
-        - "http://localhost:8000/v1" for local vLLM
-        - "http://localhost:11434" for Ollama
-
-    timeout : float, optional
-        Request timeout in seconds, by default 120.0
-
-    max_retries : int, optional
-        Maximum number of retry attempts, by default 6
-
-    ### Generation Parameters ###
-
-    temperature : Optional[float], optional
-        Sampling temperature (0.0 to 2.0), by default None
-
-    max_tokens : Optional[int], optional
-        Maximum tokens to generate, by default None
-
-    top_p : Optional[float], optional
-        Nucleus sampling parameter (0.0 to 1.0), by default None
-
-    frequency_penalty : Optional[float], optional
-        Frequency penalty (-2.0 to 2.0), by default None
-
-    presence_penalty : Optional[float], optional
-        Presence penalty (-2.0 to 2.0), by default None
-
-    stop : Optional[Union[str, list[str]]], optional
-        Stop sequences, by default None
-
-    seed : Optional[int], optional
-        Random seed for reproducible outputs, by default None
-
-    response_format : Optional[dict[str, Any]], optional
-        Response format specification (e.g., JSON mode), by default None
-
-    stream : Optional[bool], optional
-        Whether to stream responses, by default None
-
-    n : Optional[int], optional
-        Number of completions to generate, by default None
-
-    logprobs : Optional[bool], optional
-        Whether to return log probabilities, by default None
-
-    top_logprobs : Optional[int], optional
-        Number of top log probabilities to return, by default None
-
-    user : Optional[str], optional
-        End-user identifier, by default None
-
-    extra_headers : Optional[dict[str, str]], optional
-        Additional headers to send with requests, by default None
-
-    extra_body : Optional[dict[str, Any]], optional
-        Additional parameters for the request body, by default None
-
-    provider_specific : Optional[dict[str, Any]], optional
-        Provider-specific parameters that don't map to standard OpenAI params, by default None
-    """
-
-    model: Optional[str] = None
-    api_key: Optional[str] = None
-    api_base: Optional[str] = None
-    timeout: float = 120.0
-    max_retries: int = 6
-
-    # Generation parameters (OpenAI-compatible)
-    temperature: Optional[float] = None
-    max_tokens: Optional[int] = None
-    top_p: Optional[float] = None
-    frequency_penalty: Optional[float] = None
-    presence_penalty: Optional[float] = None
-    stop: Optional[Union[str, list[str]]] = None
-    seed: Optional[int] = None
-    response_format: Optional[dict[str, Any]] = None
-    stream: Optional[bool] = None
-    n: Optional[int] = None
-    logprobs: Optional[bool] = None
-    top_logprobs: Optional[int] = None
-    user: Optional[str] = None
-
-    # Additional parameters
-    extra_headers: Optional[dict[str, str]] = None
-    extra_body: Optional[dict[str, Any]] = None
-    provider_specific: Optional[dict[str, Any]] = None
-
-    def __post_init__(self) -> None:
-        """Validate configuration after initialization."""
-        self._validate_model()
-        self._validate_parameters()
-        self._resolve_api_key()
-
-    def _validate_model(self) -> None:
-        """Validate model identifier format."""
-        # Model is optional - will be set later via set_model_config()
-        if self.model is None:
-            return
-
-        # Check if it's a valid LiteLLM model format
-        if "/" not in self.model:
-            raise ValueError(
-                f"Model '{self.model}' should be in format 'provider/model-name'. "
-                f"Examples: 'openai/gpt-4', 'anthropic/claude-3-sonnet-20240229', "
-                f"'hosted_vllm/meta-llama/Llama-2-7b-chat-hf'"
-            )
-
-    def _validate_parameters(self) -> None:
-        """Validate generation parameters."""
-        if self.temperature is not None and not (0.0 <= self.temperature <= 2.0):
-            raise ValueError(
-                f"Temperature must be between 0.0 and 2.0, got {self.temperature}"
-            )
-
-        if self.max_tokens is not None and self.max_tokens <= 0:
-            raise ValueError(f"max_tokens must be positive, got {self.max_tokens}")
-
-        if self.top_p is not None and not (0.0 <= self.top_p <= 1.0):
-            raise ValueError(f"top_p must be between 0.0 and 1.0, got {self.top_p}")
-
-        if self.frequency_penalty is not None and not (
-            -2.0 <= self.frequency_penalty <= 2.0
-        ):
-            raise ValueError(
-                f"frequency_penalty must be between -2.0 and 2.0, got {self.frequency_penalty}"
-            )
-
-        if self.presence_penalty is not None and not (
-            -2.0 <= self.presence_penalty <= 2.0
-        ):
-            raise ValueError(
-                f"presence_penalty must be between -2.0 and 2.0, got {self.presence_penalty}"
-            )
-
-        if self.n is not None and self.n <= 0:
-            raise ValueError(f"n must be positive, got {self.n}")
-
-        if self.max_retries < 0:
-            raise ValueError(
-                f"max_retries must be non-negative, got {self.max_retries}"
-            )
-
-        if self.timeout <= 0:
-            raise ValueError(f"timeout must be positive, got {self.timeout}")
-
-    def _resolve_api_key(self) -> None:
-        """Resolve API key from environment variables if not provided.
-
-        This method only reads from environment variables and does not modify them,
-        ensuring thread-safety when multiple instances are used concurrently.
-        """
-        if self.api_key is not None:
-            return
-
-        # Skip API key resolution if model is not set yet
-        if self.model is None:
-            return
-
-        # Extract provider from model
-        provider = self.model.split("/")[0].lower()
-
-        # Map provider to environment variable
-        provider_env_map = {
-            "openai": "OPENAI_API_KEY",
-            "anthropic": "ANTHROPIC_API_KEY",
-            "google": "GOOGLE_API_KEY",
-            "azure": "AZURE_API_KEY",
-            "huggingface": "HUGGINGFACE_API_KEY",
-            "cohere": "COHERE_API_KEY",
-            "replicate": "REPLICATE_API_KEY",
-            "together": "TOGETHER_API_KEY",
-            "anyscale": "ANYSCALE_API_KEY",
-            "perplexity": "PERPLEXITY_API_KEY",
-            "groq": "GROQ_API_KEY",
-            "mistral": "MISTRAL_API_KEY",
-            "deepinfra": "DEEPINFRA_API_KEY",
-            "ai21": "AI21_API_KEY",
-            "nlp_cloud": "NLP_CLOUD_API_KEY",
-            "aleph_alpha": "ALEPH_ALPHA_API_KEY",
-            "bedrock": "AWS_ACCESS_KEY_ID",
-            "vertex_ai": "GOOGLE_APPLICATION_CREDENTIALS",
-        }
-
-        env_var = provider_env_map.get(provider)
-        if env_var:
-            self.api_key = os.getenv(env_var)
-
-    def get_generation_kwargs(self) -> dict[str, Any]:
-        """Get generation parameters as kwargs for LiteLLM completion."""
-        kwargs = {}
-
-        # Standard parameters
-        for param in [
-            "temperature",
-            "max_tokens",
-            "top_p",
-            "frequency_penalty",
-            "presence_penalty",
-            "stop",
-            "seed",
-            "response_format",
-            "stream",
-            "n",
-            "logprobs",
-            "top_logprobs",
-            "user",
-            "timeout",
-        ]:
-            value = getattr(self, param)
-            if value is not None:
-                kwargs[param] = value
-
-        # Additional parameters
-        if self.extra_headers:
-            kwargs["extra_headers"] = self.extra_headers
-
-        if self.extra_body:
-            kwargs["extra_body"] = self.extra_body
-
-        if self.provider_specific:
-            kwargs.update(self.provider_specific)
-
-        return kwargs
-
-    def merge_overrides(self, **overrides: Any) -> "LLMConfig":
-        """Create a new config with runtime overrides.
-
-        Parameters
-        ----------
-        **overrides : Any
-            Runtime parameter overrides.
-
-        Returns
-        -------
-        LLMConfig
-            New configuration with overrides applied.
-        """
-        # Get current values as dict
-        # Standard
-        from dataclasses import fields
-
-        current_values = {
-            field.name: getattr(self, field.name) for field in fields(self)
-        }
-
-        # Apply overrides
-        current_values.update(overrides)
-
-        # Create new config
-        return LLMConfig(**current_values)
-
-    def get_provider(self) -> Optional[str]:
-        """Get the provider name from the model identifier.
-
-        Returns
-        -------
-        Optional[str]
-            Provider name (e.g., "openai", "anthropic", "hosted_vllm"), or None if model is not set.
-        """
-        if self.model is None:
-            return None
-        return self.model.split("/")[0]
-
-    def get_model_name(self) -> Optional[str]:
-        """Get the model name without provider prefix.
-
-        Returns
-        -------
-        Optional[str]
-            Model name (e.g., "gpt-4", "claude-3-sonnet-20240229"), or None if model is not set.
-        """
-        if self.model is None:
-            return None
-        parts = self.model.split("/", 1)
-        return parts[1] if len(parts) > 1 else parts[0]
-
-    def is_local_model(self) -> bool:
-        """Check if this is a local model deployment.
-
-        Returns
-        -------
-        bool
-            True if the model is hosted locally (vLLM, Ollama, etc.).
-        """
-        provider = self.get_provider()
-        if provider is None:
-            return False
-        local_providers = {"hosted_vllm", "ollama", "local", "vllm"}
-        return provider.lower() in local_providers
-
-    def __str__(self) -> str:
-        """String representation of the configuration."""
-        return f"LLMConfig(model='{self.model}', provider='{self.get_provider()}')"
-
-    def __repr__(self) -> str:
-        """Detailed representation of the configuration."""
-        return (
-            f"LLMConfig(model='{self.model}', provider='{self.get_provider()}', "
-            f"api_base='{self.api_base}', timeout={self.timeout}, "
-            f"max_retries={self.max_retries})"
-        )
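
Note on the two deleted modules: 0.4.1 removes the standalone LLM configuration layer (`config.py`, shown above, and `client_manager.py`), with LLM invocation presumably folded into the reworked `llm_chat_block.py` and the new `llm_parser_block.py`. For anyone still pinned to 0.3.1, the sketch below reconstructs how the removed `LLMConfig` behaved, using only what appears in the deleted source above; the model names and `api_base` values are illustrative.

# Reconstructed usage of the LLMConfig removed in 0.4.1 (a sketch; runs
# only against sdg-hub 0.3.1). Model names and api_base are illustrative.
from sdg_hub.core.blocks.llm.config import LLMConfig

# Provider is taken from the "provider/model-name" prefix; since no api_key
# is passed, __post_init__ falls back to the OPENAI_API_KEY env var.
config = LLMConfig(model="openai/gpt-4", temperature=0.7, max_tokens=512)
assert config.get_provider() == "openai"
assert config.get_model_name() == "gpt-4"
assert not config.is_local_model()

# Only non-None generation parameters are forwarded to LiteLLM; timeout is
# always present because it defaults to 120.0 rather than None.
assert config.get_generation_kwargs() == {
    "temperature": 0.7,
    "max_tokens": 512,
    "timeout": 120.0,
}

# merge_overrides() builds a brand-new config from dataclasses.fields(),
# so overrides are re-validated and the original instance stays untouched.
greedy = config.merge_overrides(temperature=0.0, seed=42)
assert config.temperature == 0.7 and greedy.seed == 42

# Local deployments still need the provider/model format, plus an api_base.
local = LLMConfig(
    model="hosted_vllm/meta-llama/Llama-2-7b-chat-hf",
    api_base="http://localhost:8000/v1",
)
assert local.is_local_model()
assert local.get_model_name() == "meta-llama/Llama-2-7b-chat-hf"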