sdg-hub 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. sdg_hub/_version.py +16 -3
  2. sdg_hub/core/blocks/deprecated_blocks/selector.py +1 -1
  3. sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +175 -416
  4. sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +174 -415
  5. sdg_hub/core/blocks/evaluation/verify_question_block.py +180 -415
  6. sdg_hub/core/blocks/llm/client_manager.py +92 -43
  7. sdg_hub/core/blocks/llm/config.py +1 -0
  8. sdg_hub/core/blocks/llm/llm_chat_block.py +74 -16
  9. sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +277 -115
  10. sdg_hub/core/blocks/llm/text_parser_block.py +88 -23
  11. sdg_hub/core/blocks/registry.py +48 -34
  12. sdg_hub/core/blocks/transform/__init__.py +2 -0
  13. sdg_hub/core/blocks/transform/index_based_mapper.py +1 -1
  14. sdg_hub/core/blocks/transform/json_structure_block.py +142 -0
  15. sdg_hub/core/flow/base.py +326 -62
  16. sdg_hub/core/utils/datautils.py +54 -0
  17. sdg_hub/core/utils/flow_metrics.py +261 -0
  18. sdg_hub/core/utils/logger_config.py +50 -9
  19. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/__init__.py +0 -0
  20. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/__init__.py +0 -0
  21. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/detailed_summary.yaml +11 -0
  22. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml +159 -0
  23. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/__init__.py +0 -0
  24. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/extractive_summary.yaml +65 -0
  25. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml +161 -0
  26. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_answers.yaml +15 -0
  27. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_multiple_qa.yaml +21 -0
  28. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_question_list.yaml +44 -0
  29. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/__init__.py +0 -0
  30. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml +104 -0
  31. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/key_facts_summary.yaml +61 -0
  32. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +0 -7
  33. sdg_hub/flows/text_analysis/__init__.py +2 -0
  34. sdg_hub/flows/text_analysis/structured_insights/__init__.py +6 -0
  35. sdg_hub/flows/text_analysis/structured_insights/analyze_sentiment.yaml +27 -0
  36. sdg_hub/flows/text_analysis/structured_insights/extract_entities.yaml +38 -0
  37. sdg_hub/flows/text_analysis/structured_insights/extract_keywords.yaml +21 -0
  38. sdg_hub/flows/text_analysis/structured_insights/flow.yaml +153 -0
  39. sdg_hub/flows/text_analysis/structured_insights/summarize.yaml +21 -0
  40. {sdg_hub-0.2.1.dist-info → sdg_hub-0.3.0.dist-info}/METADATA +42 -15
  41. {sdg_hub-0.2.1.dist-info → sdg_hub-0.3.0.dist-info}/RECORD +44 -22
  42. {sdg_hub-0.2.1.dist-info → sdg_hub-0.3.0.dist-info}/WHEEL +0 -0
  43. {sdg_hub-0.2.1.dist-info → sdg_hub-0.3.0.dist-info}/licenses/LICENSE +0 -0
  44. {sdg_hub-0.2.1.dist-info → sdg_hub-0.3.0.dist-info}/top_level.txt +0 -0
sdg_hub/core/blocks/llm/client_manager.py
@@ -107,9 +107,18 @@ class LLMClientManager:
                 f"Could not validate setup for model '{self.config.model}': {e}"
             )
 
+    def _message_to_dict(self, message: Any) -> dict[str, Any]:
+        """Convert a message to a dict."""
+        if hasattr(message, "to_dict"):
+            return message.to_dict()
+        elif hasattr(message, "__dict__"):
+            return message.__dict__
+        else:
+            return dict(message)
+
     def create_completion(
         self, messages: list[dict[str, Any]], **overrides: Any
-    ) -> Union[str, list[str]]:
+    ) -> Union[dict, list[dict]]:
         """Create a completion using LiteLLM.
 
         Parameters
@@ -121,9 +130,9 @@ class LLMClientManager:
 
         Returns
         -------
-        Union[str, List[str]]
-            The completion text(s). Returns a single string when n=1 or n is None,
-            returns a list of strings when n>1.
+        Union[dict, List[dict]]
+            The completion response(s). Returns a single response when n=1 or n is None,
+            returns a list of responses when n>1. Response dicts contain 'content' and may contain 'reasoning_content'.
 
         Raises
         ------
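
Callers that treated the 0.2.x return value as str (or list[str] when n>1) need a small adaptation. A minimal consumption sketch, assuming an already-configured `manager: LLMClientManager` (names are illustrative, not part of this diff):

    messages = [{"role": "user", "content": "Summarize the document."}]

    resp = manager.create_completion(messages)
    text = resp["content"]                     # was the whole return value in 0.2.x
    reasoning = resp.get("reasoning_content")  # optional; reasoning models only

    resps = manager.create_completion(messages, n=3)  # now list[dict], not list[str]
    texts = [r["content"] for r in resps]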
@@ -151,18 +160,80 @@ class LLMClientManager:
         # Make the completion call
         response = completion_func(kwargs)
 
-        # Extract content from response
+        # Extract message objects from response
         # Check if n > 1 to determine return type
         n_value = final_config.n or 1
         if n_value > 1:
-            return [choice.message.content for choice in response.choices]
+            return [
+                self._message_to_dict(choice.message) for choice in response.choices
+            ]
         else:
-            return response.choices[0].message.content
+            return self._message_to_dict(response.choices[0].message)
 
     async def acreate_completion(
+        self,
+        messages: Union[list[dict[str, Any]], list[list[dict[str, Any]]]],
+        max_concurrency: Optional[int] = None,
+        **overrides: Any,
+    ) -> Union[dict, list[dict]] | list[Union[dict, list[dict]]]:
+        """Create async completion(s) using LiteLLM with optional concurrency control.
+
+        Parameters
+        ----------
+        messages : Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]
+            Single message list or list of message lists.
+            - For single: List[Dict[str, Any]] - returns Union[Any, List[Any]]
+            - For multiple: List[List[Dict[str, Any]]] - returns List[Union[Any, List[Any]]]
+        max_concurrency : Optional[int], optional
+            Maximum number of concurrent requests when processing multiple messages.
+            If None, all requests run concurrently.
+        **overrides : Any
+            Runtime parameter overrides.
+
+        Returns
+        -------
+        Union[dict, List[dict], List[Union[dict, List[dict]]]]
+            For single message: completion response (dict when n=1, List[dict] when n>1)
+            For multiple messages: list of completion responses (each element can be dict or List[dict])
+
+        Raises
+        ------
+        Exception
+            If the completion fails after all retries.
+        """
+        # Detect if we have single message or multiple messages
+        if not messages:
+            raise ValueError("messages cannot be empty")
+
+        # Check if first element is a dict (single message) or list (multiple messages)
+        if isinstance(messages[0], dict):
+            # Single message case
+            return await self._acreate_single(messages, **overrides)
+        else:
+            # Multiple messages case
+            messages_list = messages
+
+            if max_concurrency is not None:
+                # Use semaphore for concurrency control
+                semaphore = asyncio.Semaphore(max_concurrency)
+
+                async def _create_with_semaphore(msgs):
+                    async with semaphore:
+                        return await self._acreate_single(msgs, **overrides)
+
+                tasks = [_create_with_semaphore(msgs) for msgs in messages_list]
+                return await asyncio.gather(*tasks)
+            else:
+                # No concurrency limit - process all at once
+                tasks = [
+                    self._acreate_single(msgs, **overrides) for msgs in messages_list
+                ]
+                return await asyncio.gather(*tasks)
+
+    async def _acreate_single(
         self, messages: list[dict[str, Any]], **overrides: Any
-    ) -> Union[str, list[str]]:
-        """Create an async completion using LiteLLM.
+    ) -> Union[dict, list[dict]]:
+        """Create a single async completion using LiteLLM.
 
         Parameters
         ----------
@@ -173,10 +244,9 @@ class LLMClientManager:
 
         Returns
         -------
-        Union[str, List[str]]
-            The completion text(s). Returns a single string when n=1 or n is None,
-            returns a list of strings when n>1.
-
+        Union[dict, List[dict]]
+            List of completion message objects. Each element is a dict when n=1 or n is None,
+            or a list of dicts when n>1. Message dicts contain 'content' and may contain 'reasoning_content'.
         Raises
         ------
         Exception
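
The async entry point now accepts either shape and gains a concurrency cap. A hedged usage sketch (`manager` is an assumed, already-configured LLMClientManager; values are illustrative):

    import asyncio

    single = [{"role": "user", "content": "Hello"}]
    batch = [single, [{"role": "user", "content": "Goodbye"}]]

    async def main():
        one = await manager.acreate_completion(single)  # one message dict (n=1)
        # list of message lists -> list of responses, at most 8 requests in flight
        many = await manager.acreate_completion(batch, max_concurrency=8)
        return one, [m["content"] for m in many]

    asyncio.run(main())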
@@ -203,17 +273,19 @@ class LLMClientManager:
         # Make the async completion call
         response = await completion_func(kwargs)
 
-        # Extract content from response
+        # Extract message objects from response
         # Check if n > 1 to determine return type
         n_value = final_config.n or 1
         if n_value > 1:
-            return [choice.message.content for choice in response.choices]
+            return [
+                self._message_to_dict(choice.message) for choice in response.choices
+            ]
         else:
-            return response.choices[0].message.content
+            return self._message_to_dict(response.choices[0].message)
 
     def create_completions_batch(
         self, messages_list: list[list[dict[str, Any]]], **overrides: Any
-    ) -> list[Union[str, list[str]]]:
+    ) -> list[Union[dict, list[dict]]]:
         """Create multiple completions in batch.
 
         Parameters
@@ -225,9 +297,9 @@ class LLMClientManager:
 
         Returns
         -------
-        List[Union[str, List[str]]]
-            List of completion texts. Each element is a single string when n=1 or n is None,
-            or a list of strings when n>1.
+        List[dict] | List[List[dict]]
+            List of completion responses. Each element is a dict when n=1 or n is None,
+            or a list of dicts when n>1. Response dicts contain 'content' and may contain 'reasoning_content'.
         """
         results = []
         for messages in messages_list:
@@ -235,29 +307,6 @@ class LLMClientManager:
             results.append(result)
         return results
 
-    async def acreate_completions_batch(
-        self, messages_list: list[list[dict[str, Any]]], **overrides: Any
-    ) -> list[Union[str, list[str]]]:
-        """Create multiple completions in batch asynchronously.
-
-        Parameters
-        ----------
-        messages_list : List[List[Dict[str, Any]]]
-            List of message lists to process.
-        **overrides : Any
-            Runtime parameter overrides.
-
-        Returns
-        -------
-        List[Union[str, List[str]]]
-            List of completion texts. Each element is a single string when n=1 or n is None,
-            or a list of strings when n>1.
-        """
-        tasks = [
-            self.acreate_completion(messages, **overrides) for messages in messages_list
-        ]
-        return await asyncio.gather(*tasks)
-
     def _build_completion_kwargs(
         self, messages: list[dict[str, Any]], config: LLMConfig
     ) -> dict[str, Any]:
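
acreate_completions_batch() is removed in 0.3.0; its role folds into acreate_completion(), which detects a list of message lists and fans out via asyncio.gather. A hedged migration sketch (to run inside an async function; `manager` and `messages_list` assumed):

    # 0.2.1
    results = await manager.acreate_completions_batch(messages_list, temperature=0.2)

    # 0.3.0 equivalent - same result shape, with an optional concurrency bound
    results = await manager.acreate_completion(
        messages_list, max_concurrency=None, temperature=0.2
    )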
sdg_hub/core/blocks/llm/config.py
@@ -240,6 +240,7 @@ class LLMConfig:
             "logprobs",
             "top_logprobs",
             "user",
+            "timeout",
         ]:
             value = getattr(self, param)
             if value is not None:
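
With "timeout" added to the forwarded-parameter list, a timeout set on the config now reaches the underlying LiteLLM call. A self-contained sketch of the forwarding loop above (`_Cfg` is a stand-in for LLMConfig, whose real fields may differ):

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class _Cfg:  # stand-in for LLMConfig; illustrative only
        logprobs: Optional[bool] = None
        top_logprobs: Optional[int] = None
        user: Optional[str] = None
        timeout: Optional[float] = None

    cfg = _Cfg(timeout=120)
    kwargs = {}
    # Mirror of the loop in the hunk: copy only parameters that are set.
    for param in ["logprobs", "top_logprobs", "user", "timeout"]:
        value = getattr(cfg, param)
        if value is not None:
            kwargs[param] = value

    print(kwargs)  # {'timeout': 120} -> request-level timeout now reaches LiteLLM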
sdg_hub/core/blocks/llm/llm_chat_block.py
@@ -42,9 +42,10 @@ class LLMChatBlock(BaseBlock):
         Name of the block.
     input_cols : Union[str, List[str]]
         Input column name(s). Should contain the messages list.
-    output_cols : Union[str, List[str]]
+    output_cols : Union[dict, List[dict]]
         Output column name(s) for the response. When n > 1, the column will contain
-        a list of responses instead of a single string.
+        a list of responses instead of a single response. Responses contain 'content',
+        may contain 'reasoning_content' and other fields if any.
     model : str
         Model identifier in LiteLLM format. Examples:
         - "openai/gpt-4"
@@ -131,7 +132,7 @@ class LLMChatBlock(BaseBlock):
     >>> block = LLMChatBlock(
     ...     block_name="gpt4_multiple",
     ...     input_cols="messages",
-    ...     output_cols="responses",  # Will contain lists of strings
+    ...     output_cols="responses",  # Will contain lists of responses
     ...     model="openai/gpt-4",
     ...     n=3,  # Generate 3 responses per input
     ...     temperature=0.8
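
Downstream consumers of the output column therefore see message dicts rather than plain strings in 0.3.0. A minimal sketch of reading the n=3 example above (`block` and `dataset` are assumed to exist):

    result = block.generate(dataset)
    for row in result:
        # "responses" holds a list of three message dicts (n=3)
        for resp in row["responses"]:
            print(resp["content"])
            print(resp.get("reasoning_content"))  # reasoning models only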
@@ -297,6 +298,10 @@ class LLMChatBlock(BaseBlock):
             temperature, max_tokens, top_p, frequency_penalty, presence_penalty,
             stop, seed, response_format, stream, n, and provider-specific params.
 
+        Special flow-level parameters:
+            _flow_max_concurrency : int, optional
+                Maximum concurrency for async requests (passed by Flow).
+
         Returns
         -------
         Dataset
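
As the next hunk shows, the reserved "_flow_max_concurrency" override is popped out of override_kwargs before the remaining overrides reach the provider. A hedged sketch of supplying it directly when calling the block outside a Flow (normally Flow injects it; direct use being supported is an assumption based on this diff):

    # Consumed by generate() itself; never forwarded to the model provider.
    result = block.generate(dataset, _flow_max_concurrency=10)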
@@ -314,27 +319,73 @@ class LLMChatBlock(BaseBlock):
                 f"Call flow.set_model_config() before generating."
             )
 
+        # Extract max_concurrency if provided by flow
+        flow_max_concurrency = override_kwargs.pop("_flow_max_concurrency", None)
+
         # Extract messages
         messages_list = samples[self.input_cols[0]]
 
         # Log generation start
         logger.info(
-            f"Starting {'async' if self.async_mode else 'sync'} generation for {len(messages_list)} samples",
+            f"Starting {'async' if self.async_mode else 'sync'} generation for {len(messages_list)} samples"
+            + (
+                f" (max_concurrency={flow_max_concurrency})"
+                if flow_max_concurrency
+                else ""
+            ),
             extra={
                 "block_name": self.block_name,
                 "model": self.model,
                 "provider": self.client_manager.config.get_provider(),
                 "batch_size": len(messages_list),
                 "async_mode": self.async_mode,
-                "override_params": override_kwargs,
+                "flow_max_concurrency": flow_max_concurrency,
+                "override_params": {
+                    k: (
+                        "***"
+                        if any(
+                            s in k.lower()
+                            for s in ["key", "token", "secret", "authorization"]
+                        )
+                        else v
+                    )
+                    for k, v in override_kwargs.items()
+                },
             },
         )
 
         # Generate responses
         if self.async_mode:
-            responses = asyncio.run(
-                self._generate_async(messages_list, **override_kwargs)
-            )
+            try:
+                # Check if there's already a running event loop
+                loop = asyncio.get_running_loop()
+                # Check if nest_asyncio is applied (allows nested asyncio.run)
+                # Use multiple detection methods for robustness
+                nest_asyncio_applied = (
+                    hasattr(loop, "_nest_patched")
+                    or getattr(asyncio.run, "__module__", "") == "nest_asyncio"
+                )
+
+                if nest_asyncio_applied:
+                    # nest_asyncio is applied, safe to use asyncio.run
+                    responses = asyncio.run(
+                        self._generate_async(
+                            messages_list, flow_max_concurrency, **override_kwargs
+                        )
+                    )
+                else:
+                    # Running inside an event loop without nest_asyncio
+                    raise BlockValidationError(
+                        f"async_mode=True cannot be used from within a running event loop for '{self.block_name}'. "
+                        "Use an async entrypoint, set async_mode=False, or apply nest_asyncio.apply() in notebook environments."
+                    )
+            except RuntimeError:
+                # No running loop; safe to create one
+                responses = asyncio.run(
+                    self._generate_async(
+                        messages_list, flow_max_concurrency, **override_kwargs
+                    )
+                )
         else:
            responses = self._generate_sync(messages_list, **override_kwargs)
 
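In environments with a running event loop (e.g. Jupyter), async_mode=True now either rides on nest_asyncio or fails fast with BlockValidationError instead of crashing inside asyncio.run(). A minimal notebook sketch (`block` and `dataset` assumed; nest_asyncio.apply() is the call named in the error message above):

    import nest_asyncio

    nest_asyncio.apply()  # patches the running loop; detected via _nest_patched
    result = block.generate(dataset)  # async path may now call asyncio.run() safely
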
@@ -356,7 +407,7 @@ class LLMChatBlock(BaseBlock):
         self,
         messages_list: list[list[dict[str, Any]]],
         **override_kwargs: dict[str, Any],
-    ) -> list[Union[str, list[str]]]:
+    ) -> list[Union[dict, list[dict]]]:
         """Generate responses synchronously.
 
         Parameters
@@ -368,8 +419,9 @@ class LLMChatBlock(BaseBlock):
 
         Returns
         -------
-        List[Union[str, List[str]]]
-            List of response strings or lists of response strings (when n > 1).
+        List[Union[dict, List[dict]]]
+            List of responses. Each element is a dict when n=1 or n is None,
+            or a list of dicts when n>1. Response dicts contain 'content', may contain 'reasoning_content' and other fields if any.
         """
         responses = []
 
@@ -409,26 +461,32 @@ class LLMChatBlock(BaseBlock):
     async def _generate_async(
         self,
         messages_list: list[list[dict[str, Any]]],
+        flow_max_concurrency: Optional[int] = None,
         **override_kwargs: dict[str, Any],
-    ) -> list[Union[str, list[str]]]:
+    ) -> list[Union[dict, list[dict]]]:
         """Generate responses asynchronously.
 
         Parameters
         ----------
         messages_list : List[List[Dict[str, Any]]]
             List of message lists to process.
+        flow_max_concurrency : Optional[int], optional
+            Maximum concurrency for async requests.
         **override_kwargs : Dict[str, Any]
             Runtime parameter overrides.
 
         Returns
         -------
-        List[Union[str, List[str]]]
-            List of response strings or lists of response strings (when n > 1).
+        List[Union[dict, List[dict]]]
+            List of responses. Each element is a dict when n=1 or n is None,
+            or a list of dicts when n>1. Response dicts contain 'content', may contain 'reasoning_content' and other fields if any.
         """
         try:
-            responses = await self.client_manager.acreate_completions_batch(
-                messages_list, **override_kwargs
+            # Use unified client manager method with optional concurrency control
+            responses = await self.client_manager.acreate_completion(
+                messages_list, max_concurrency=flow_max_concurrency, **override_kwargs
             )
+
             return responses
 
         except Exception as e:
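
Taken together, a hedged end-to-end sketch of the 0.3.0 behavior: an async LLMChatBlock whose fan-out is capped by the flow-provided limit and whose outputs are message dicts. Constructor arguments follow the class docstring above; async_mode as a constructor argument and the dataset contents are assumptions:

    block = LLMChatBlock(
        block_name="chat",
        input_cols="messages",
        output_cols="responses",
        model="openai/gpt-4",
        async_mode=True,  # assumed constructor flag backing self.async_mode
    )
    # "_flow_max_concurrency" bounds concurrent requests inside the client manager.
    result = block.generate(dataset, _flow_max_concurrency=4)
    first = result[0]["responses"]  # a message dict: {"content": ...} plus optional fields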