sdg-hub 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sdg_hub/__init__.py CHANGED
@@ -8,7 +8,6 @@ from .core import (
  BlockRegistry,
  Flow,
  FlowMetadata,
- FlowParameter,
  FlowRegistry,
  FlowValidator,
  GenerateError,
@@ -23,7 +22,6 @@ __all__ = [
  "FlowRegistry",
  # Metadata and utilities
  "FlowMetadata",
- "FlowParameter",
  "FlowValidator",
  "GenerateError",
  "resolve_path",
sdg_hub/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
  commit_id: COMMIT_ID
  __commit_id__: COMMIT_ID

- __version__ = version = '0.4.0'
- __version_tuple__ = version_tuple = (0, 4, 0)
+ __version__ = version = '0.4.2'
+ __version_tuple__ = version_tuple = (0, 4, 2)

  __commit_id__ = commit_id = None
sdg_hub/core/__init__.py CHANGED
@@ -3,7 +3,7 @@

  # Local
  from .blocks import BaseBlock, BlockRegistry
- from .flow import Flow, FlowMetadata, FlowParameter, FlowRegistry, FlowValidator
+ from .flow import Flow, FlowMetadata, FlowRegistry, FlowValidator
  from .utils import GenerateError, resolve_path

  __all__ = [
@@ -14,7 +14,6 @@ __all__ = [
  "Flow",
  "FlowRegistry",
  "FlowMetadata",
- "FlowParameter",
  "FlowValidator",
  # Utils
  "GenerateError",
sdg_hub/core/flow/__init__.py CHANGED
@@ -1,20 +1,19 @@
  # SPDX-License-Identifier: Apache-2.0
  """New flow implementation for SDG Hub.

- This module provides a redesigned Flow class with metadata support,
- dual initialization modes, and runtime parameter overrides.
+ This module provides a redesigned Flow class with metadata support
+ and dual initialization modes.
  """

  # Local
  from .base import Flow
- from .metadata import FlowMetadata, FlowParameter
+ from .metadata import FlowMetadata
  from .registry import FlowRegistry
  from .validation import FlowValidator

  __all__ = [
  "Flow",
  "FlowMetadata",
- "FlowParameter",
  "FlowRegistry",
  "FlowValidator",
  ]
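
To make the API impact concrete (an illustrative sketch, not part of the diff): FlowParameter is gone from the public exports in 0.4.2, so downstream imports keep only the remaining names.

# Illustrative sketch: imports that remain valid against sdg_hub 0.4.2.
from sdg_hub.core.flow import Flow, FlowMetadata, FlowRegistry, FlowValidator
# FlowParameter is no longer importable from this package.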
sdg_hub/core/flow/base.py CHANGED
@@ -30,12 +30,17 @@ from ..blocks.base import BaseBlock
  from ..blocks.registry import BlockRegistry
  from ..utils.datautils import safe_concatenate_with_validation, validate_no_duplicates
  from ..utils.error_handling import EmptyDatasetError, FlowValidationError
- from ..utils.flow_metrics import display_metrics_summary, save_metrics_to_json
+ from ..utils.flow_metrics import (
+ display_metrics_summary,
+ display_time_estimation_summary,
+ save_metrics_to_json,
+ )
  from ..utils.logger_config import setup_logger
  from ..utils.path_resolution import resolve_path
+ from ..utils.time_estimator import estimate_execution_time
  from ..utils.yaml_utils import save_flow_yaml
  from .checkpointer import FlowCheckpointer
- from .metadata import DatasetRequirements, FlowMetadata, FlowParameter
+ from .metadata import DatasetRequirements, FlowMetadata
  from .migration import FlowMigration
  from .validation import FlowValidator

@@ -55,8 +60,6 @@ class Flow(BaseModel):
  Ordered list of blocks to execute in the flow.
  metadata : FlowMetadata
  Flow metadata including name, version, author, etc.
- parameters : Dict[str, FlowParameter]
- Runtime parameters that can be overridden during execution.
  """

  blocks: list[BaseBlock] = Field(
@@ -66,10 +69,6 @@ class Flow(BaseModel):
  metadata: FlowMetadata = Field(
  description="Flow metadata including name, version, author, etc."
  )
- parameters: dict[str, FlowParameter] = Field(
- default_factory=dict,
- description="Runtime parameters that can be overridden during execution",
- )

  model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True)

@@ -96,32 +95,6 @@ class Flow(BaseModel):

  return v

- @field_validator("parameters")
- @classmethod
- def validate_parameters(
- cls, v: dict[str, FlowParameter]
- ) -> dict[str, FlowParameter]:
- """Validate parameter names and ensure they are FlowParameter instances."""
- if not v:
- return v
-
- validated = {}
- for param_name, param_value in v.items():
- if not isinstance(param_name, str) or not param_name.strip():
- raise ValueError(
- f"Parameter name must be a non-empty string: {param_name}"
- )
-
- if not isinstance(param_value, FlowParameter):
- raise ValueError(
- f"Parameter '{param_name}' must be a FlowParameter instance, "
- f"got: {type(param_value)}"
- )
-
- validated[param_name.strip()] = param_value
-
- return validated
-
  @model_validator(mode="after")
  def validate_block_names_unique(self) -> "Flow":
  """Ensure all block names are unique within the flow."""
@@ -215,17 +188,6 @@ class Flow(BaseModel):
  except Exception as exc:
  raise FlowValidationError(f"Invalid metadata configuration: {exc}") from exc

- # Extract and validate parameters
- parameters = {}
- params_dict = flow_config.get("parameters", {})
- for param_name, param_config in params_dict.items():
- try:
- parameters[param_name] = FlowParameter(**param_config)
- except Exception as exc:
- raise FlowValidationError(
- f"Invalid parameter '{param_name}': {exc}"
- ) from exc
-
  # Create blocks with validation
  blocks = []
  block_configs = flow_config.get("blocks", [])
@@ -254,7 +216,7 @@ class Flow(BaseModel):

  # Create and validate the flow
  try:
- flow = cls(blocks=blocks, metadata=metadata, parameters=parameters)
+ flow = cls(blocks=blocks, metadata=metadata)
  # Persist generated id back to the YAML file (only on initial load)
  # If the file had no metadata.id originally, update and rewrite
  if not flow_config.get("metadata", {}).get("id"):
@@ -1049,6 +1011,8 @@ class Flow(BaseModel):
  dataset: Dataset,
  sample_size: int = 2,
  runtime_params: Optional[dict[str, dict[str, Any]]] = None,
+ max_concurrency: Optional[int] = None,
+ enable_time_estimation: bool = False,
  ) -> dict[str, Any]:
  """Perform a dry run of the flow with a subset of data.

@@ -1060,11 +1024,18 @@ class Flow(BaseModel):
  Number of samples to use for dry run testing.
  runtime_params : Optional[Dict[str, Dict[str, Any]]], optional
  Runtime parameters organized by block name.
+ max_concurrency : Optional[int], optional
+ Maximum concurrent requests for LLM blocks. If None, no limit is applied.
+ enable_time_estimation : bool, default=False
+ If True, estimates execution time for the full dataset and displays it
+ in a Rich table. Automatically runs a second dry run if needed for
+ accurate scaling analysis.

  Returns
  -------
  Dict[str, Any]
  Dry run results with execution info and sample outputs.
+ Time estimation is displayed in a table but not included in return value.

  Raises
  ------
@@ -1082,6 +1053,19 @@ class Flow(BaseModel):

  validate_no_duplicates(dataset)

+ # Validate max_concurrency parameter
+ if max_concurrency is not None:
+ if isinstance(max_concurrency, bool) or not isinstance(
+ max_concurrency, int
+ ):
+ raise FlowValidationError(
+ f"max_concurrency must be an int, got {type(max_concurrency).__name__}"
+ )
+ if max_concurrency <= 0:
+ raise FlowValidationError(
+ f"max_concurrency must be greater than 0, got {max_concurrency}"
+ )
+
  # Use smaller sample size if dataset is smaller
  actual_sample_size = min(sample_size, len(dataset))

@@ -1099,6 +1083,7 @@ class Flow(BaseModel):
  "flow_version": self.metadata.version,
  "sample_size": actual_sample_size,
  "original_dataset_size": len(dataset),
+ "max_concurrency": max_concurrency,
  "input_columns": dataset.column_names,
  "blocks_executed": [],
  "final_dataset": None,
@@ -1125,6 +1110,10 @@ class Flow(BaseModel):
  # Prepare block execution parameters
  block_kwargs = self._prepare_block_kwargs(block, runtime_params)

+ # Add max_concurrency to block kwargs if provided
+ if max_concurrency is not None:
+ block_kwargs["_flow_max_concurrency"] = max_concurrency
+
  # Check if this is a deprecated block and skip validations
  is_deprecated_block = (
  hasattr(block, "__class__")
@@ -1142,7 +1131,9 @@ class Flow(BaseModel):
  # Execute block with validation and logging
  current_dataset = block(current_dataset, **block_kwargs)

- block_execution_time = time.time() - block_start_time
+ block_execution_time = (
+ time.perf_counter() - block_start_time
+ ) # Fixed: use perf_counter consistently

  # Record block execution info
  block_info = {
@@ -1181,6 +1172,12 @@ class Flow(BaseModel):
  f"in {execution_time:.2f}s"
  )

+ # Perform time estimation if requested (displays table but doesn't store in results)
+ if enable_time_estimation:
+ self._estimate_total_time(
+ dry_run_results, dataset, runtime_params, max_concurrency
+ )
+
  return dry_run_results

  except Exception as exc:
@@ -1193,6 +1190,103 @@ class Flow(BaseModel):

  raise FlowValidationError(f"Dry run failed: {exc}") from exc

+ def _estimate_total_time(
+ self,
+ first_run_results: dict[str, Any],
+ dataset: Dataset,
+ runtime_params: Optional[dict[str, dict[str, Any]]],
+ max_concurrency: Optional[int],
+ ) -> dict[str, Any]:
+ """Estimate execution time using 2 dry runs (private method).
+
+ This method contains all the estimation logic. It determines if a second
+ dry run is needed, executes it, and calls estimate_execution_time.
+
+ Parameters
+ ----------
+ first_run_results : dict
+ Results from the first dry run.
+ dataset : Dataset
+ Full dataset for estimation.
+ runtime_params : Optional[dict]
+ Runtime parameters.
+ max_concurrency : Optional[int]
+ Maximum concurrency.
+
+ Returns
+ -------
+ dict
+ Estimation results with estimated_time_seconds, total_estimated_requests, etc.
+ """
+ first_sample_size = first_run_results["sample_size"]
+
+ # Check if we need a second dry run
+ has_async_blocks = any(
+ getattr(block, "async_mode", False) for block in self.blocks
+ )
+
+ # For sequential or no async blocks, single run is sufficient
+ if max_concurrency == 1 or not has_async_blocks:
+ estimation = estimate_execution_time(
+ dry_run_1=first_run_results,
+ dry_run_2=None,
+ total_dataset_size=len(dataset),
+ max_concurrency=max_concurrency,
+ )
+ else:
+ # Need second measurement - always use canonical (1, 5) pair
+ if first_sample_size == 1:
+ # Already have 1, need 5
+ logger.info("Running second dry run with 5 samples for time estimation")
+ second_run = self.dry_run(
+ dataset,
+ 5,
+ runtime_params,
+ max_concurrency,
+ enable_time_estimation=False,
+ )
+ dry_run_1, dry_run_2 = first_run_results, second_run
+ elif first_sample_size == 5:
+ # Already have 5, need 1
+ logger.info("Running second dry run with 1 sample for time estimation")
+ second_run = self.dry_run(
+ dataset,
+ 1,
+ runtime_params,
+ max_concurrency,
+ enable_time_estimation=False,
+ )
+ dry_run_1, dry_run_2 = second_run, first_run_results
+ else:
+ # For other sizes: run both 1 and 5 for canonical pair
+ logger.info("Running dry runs with 1 and 5 samples for time estimation")
+ dry_run_1 = self.dry_run(
+ dataset,
+ 1,
+ runtime_params,
+ max_concurrency,
+ enable_time_estimation=False,
+ )
+ dry_run_2 = self.dry_run(
+ dataset,
+ 5,
+ runtime_params,
+ max_concurrency,
+ enable_time_estimation=False,
+ )
+
+ estimation = estimate_execution_time(
+ dry_run_1=dry_run_1,
+ dry_run_2=dry_run_2,
+ total_dataset_size=len(dataset),
+ max_concurrency=max_concurrency,
+ )
+
+ # Display estimation summary
+ display_time_estimation_summary(estimation, len(dataset), max_concurrency)
+
+ return estimation
+
  def add_block(self, block: BaseBlock) -> "Flow":
  """Add a block to the flow, returning a new Flow instance.

@@ -1225,17 +1319,12 @@ class Flow(BaseModel):
  # Create new flow with added block
  new_blocks = self.blocks + [block]

- return Flow(
- blocks=new_blocks, metadata=self.metadata, parameters=self.parameters
- )
+ return Flow(blocks=new_blocks, metadata=self.metadata)

  def get_info(self) -> dict[str, Any]:
  """Get information about the flow."""
  return {
  "metadata": self.metadata.model_dump(),
- "parameters": {
- name: param.model_dump() for name, param in self.parameters.items()
- },
  "blocks": [
  {
  "block_type": block.__class__.__name__,
@@ -1339,8 +1428,7 @@ class Flow(BaseModel):

  The summary contains:
  1. Flow metadata (name, version, author, description)
- 2. Defined runtime parameters with type hints and defaults
- 3. A table of all blocks with their input and output columns
+ 2. A table of all blocks with their input and output columns

  Notes
  -----
@@ -1374,17 +1462,6 @@ class Flow(BaseModel):
  f"Description: [white]{self.metadata.description}[/white]"
  )

- # Parameters section
- if self.parameters:
- params_branch = flow_tree.add(
- "[bold bright_yellow]Parameters[/bold bright_yellow]"
- )
- for name, param in self.parameters.items():
- param_info = f"[bright_cyan]{name}[/bright_cyan]: [white]{param.type_hint}[/white]"
- if param.default is not None:
- param_info += f" = [bright_white]{param.default}[/bright_white]"
- params_branch.add(param_info)
-
  # Blocks overview
  flow_tree.add(
  f"[bold bright_magenta]Blocks[/bold bright_magenta] ({len(self.blocks)} total)"
@@ -1446,11 +1523,6 @@ class Flow(BaseModel):
  ],
  }

- if self.parameters:
- config["parameters"] = {
- name: param.model_dump() for name, param in self.parameters.items()
- }
-
  save_flow_yaml(output_path, config)

  def __len__(self) -> int:
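
To show how the new dry_run options fit together, here is an illustrative sketch (not part of the diff); `flow` and `dataset` are assumed to be an existing Flow and a datasets.Dataset:

# Minimal sketch of the 0.4.2 dry_run surface added above.
results = flow.dry_run(
    dataset,
    sample_size=1,
    max_concurrency=10,           # must be a positive int, else FlowValidationError
    enable_time_estimation=True,  # prints a Rich summary table; not stored in results
)
print(results["sample_size"], results["max_concurrency"])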
sdg_hub/core/flow/metadata.py CHANGED
@@ -2,9 +2,8 @@
  """Flow metadata and parameter definitions."""

  # Standard
- from datetime import datetime
  from enum import Enum
- from typing import Any, Optional
+ from typing import Optional

  # Third Party
  from pydantic import BaseModel, Field, field_validator, model_validator
@@ -118,39 +117,6 @@ class RecommendedModels(BaseModel):
  return None


- class FlowParameter(BaseModel):
- """Represents a runtime parameter for a flow.
-
- Attributes
- ----------
- default : Any
- Default value for the parameter.
- description : str
- Human-readable description of the parameter.
- type_hint : str
- Type hint as string (e.g., "float", "str").
- required : bool
- Whether this parameter is required at runtime.
- constraints : Dict[str, Any]
- Additional constraints for the parameter.
- """
-
- default: Any = Field(..., description="Default value for the parameter")
- description: str = Field(default="", description="Human-readable description")
- type_hint: str = Field(default="Any", description="Type hint as string")
- required: bool = Field(default=False, description="Whether parameter is required")
- constraints: dict[str, Any] = Field(
- default_factory=dict, description="Additional constraints for the parameter"
- )
-
- @model_validator(mode="after")
- def validate_required_default(self) -> "FlowParameter":
- """Validate that required parameters have appropriate defaults."""
- if self.required and self.default is None:
- raise ValueError("Required parameters cannot have None as default")
- return self
-
-
  class DatasetRequirements(BaseModel):
  """Dataset requirements for flow execution.

@@ -255,20 +221,10 @@ class FlowMetadata(BaseModel):
  Simplified recommended models structure with default, compatible, and experimental lists.
  tags : List[str]
  Tags for categorization and search.
- created_at : str
- Creation timestamp.
- updated_at : str
- Last update timestamp.
  license : str
  License identifier.
- min_sdg_hub_version : str
- Minimum required SDG Hub version.
  dataset_requirements : Optional[DatasetRequirements]
  Requirements for input datasets.
- estimated_cost : str
- Estimated cost tier for running the flow.
- estimated_duration : str
- Estimated duration for flow execution.
  """

  name: str = Field(..., min_length=1, description="Human-readable name")
@@ -288,29 +244,10 @@ class FlowMetadata(BaseModel):
  tags: list[str] = Field(
  default_factory=list, description="Tags for categorization and search"
  )
- created_at: str = Field(
- default_factory=lambda: datetime.now().isoformat(),
- description="Creation timestamp",
- )
- updated_at: str = Field(
- default_factory=lambda: datetime.now().isoformat(),
- description="Last update timestamp",
- )
  license: str = Field(default="Apache-2.0", description="License identifier")
- min_sdg_hub_version: str = Field(
- default="", description="Minimum required SDG Hub version"
- )
  dataset_requirements: Optional[DatasetRequirements] = Field(
  default=None, description="Requirements for input datasets"
  )
- estimated_cost: str = Field(
- default="medium",
- pattern="^(low|medium|high)$",
- description="Estimated cost tier for running the flow",
- )
- estimated_duration: str = Field(
- default="", description="Estimated duration for flow execution"
- )

  @field_validator("id")
  @classmethod
@@ -352,10 +289,6 @@ class FlowMetadata(BaseModel):
  # Validation is handled within RecommendedModels class
  return v

- def update_timestamp(self) -> None:
- """Update the updated_at timestamp."""
- self.updated_at = datetime.now().isoformat()
-
  @model_validator(mode="after")
  def ensure_id(self) -> "FlowMetadata":
  """Ensure id is set.
sdg_hub/core/flow/registry.py CHANGED
@@ -360,7 +360,6 @@ class FlowRegistry:
  "tags": ", ".join(metadata.tags) if metadata.tags else "-",
  "description": metadata.description or "No description",
  "version": metadata.version,
- "cost": metadata.estimated_cost,
  }
  )

sdg_hub/core/utils/__init__.py CHANGED
@@ -1,8 +1,10 @@
  # SPDX-License-Identifier: Apache-2.0

  # Local
- from .flow_identifier import get_flow_identifier
- from .path_resolution import resolve_path
+ from .flow_identifier import get_flow_identifier as get_flow_identifier
+ from .path_resolution import resolve_path as resolve_path
+ from .time_estimator import estimate_execution_time as estimate_execution_time
+ from .time_estimator import is_llm_using_block as is_llm_using_block


  # This is part of the public API, and used by instructlab
@@ -10,4 +12,10 @@ class GenerateError(Exception):
  """An exception raised during generate step."""


- __all__ = ["GenerateError", "resolve_path", "get_flow_identifier"]
+ __all__ = [
+ "GenerateError",
+ "resolve_path",
+ "get_flow_identifier",
+ "estimate_execution_time",
+ "is_llm_using_block",
+ ]
sdg_hub/core/utils/flow_metrics.py CHANGED
@@ -188,6 +188,122 @@ def display_metrics_summary(
  console.print()


+ def display_time_estimation_summary(
+ time_estimation: dict[str, Any],
+ dataset_size: int,
+ max_concurrency: Optional[int] = None,
+ ) -> None:
+ """Display a rich table summarizing time estimation results.
+
+ Parameters
+ ----------
+ time_estimation : dict[str, Any]
+ Time estimation results from estimate_total_time().
+ dataset_size : int
+ Total number of samples in the dataset.
+ max_concurrency : Optional[int], optional
+ Maximum concurrency used for estimation.
+ """
+ console = Console()
+
+ # Create main summary table
+ summary_table = Table(
+ show_header=False,
+ box=None,
+ padding=(0, 1),
+ )
+ summary_table.add_column("Metric", style="bright_cyan")
+ summary_table.add_column("Value", style="bright_white")
+
+ # Format time
+ est_seconds = time_estimation["estimated_time_seconds"]
+ if est_seconds < 60:
+ time_str = f"{est_seconds:.1f} seconds"
+ elif est_seconds < 3600:
+ time_str = f"{est_seconds / 60:.1f} minutes ({est_seconds / 3600:.2f} hours)"
+ else:
+ time_str = f"{est_seconds / 3600:.2f} hours ({est_seconds / 60:.0f} minutes)"
+
+ summary_table.add_row("Estimated Time:", time_str)
+ summary_table.add_row(
+ "Total LLM Requests:", f"{time_estimation.get('total_estimated_requests', 0):,}"
+ )
+
+ if time_estimation.get("total_estimated_requests", 0) > 0:
+ requests_per_sample = time_estimation["total_estimated_requests"] / dataset_size
+ summary_table.add_row("Requests per Sample:", f"{requests_per_sample:.1f}")
+
+ if max_concurrency is not None:
+ summary_table.add_row("Max Concurrency:", str(max_concurrency))
+
+ # Display summary panel
+ console.print()
+ console.print(
+ Panel(
+ summary_table,
+ title=f"[bold bright_white]Time Estimation for {dataset_size:,} Samples[/bold bright_white]",
+ border_style="bright_blue",
+ )
+ )
+
+ # Display per-block breakdown if available
+ block_estimates = time_estimation.get("block_estimates", [])
+ if block_estimates:
+ console.print()
+
+ # Create per-block table
+ block_table = Table(
+ show_header=True,
+ header_style="bold bright_white",
+ )
+ block_table.add_column("Block Name", style="bright_cyan", width=20)
+ block_table.add_column("Time", justify="right", style="bright_yellow", width=10)
+ block_table.add_column(
+ "Requests", justify="right", style="bright_green", width=10
+ )
+ block_table.add_column(
+ "Throughput", justify="right", style="bright_blue", width=12
+ )
+ block_table.add_column(
+ "Amplif.", justify="right", style="bright_magenta", width=10
+ )
+
+ for block in block_estimates:
+ # Format time
+ block_seconds = block["estimated_time"]
+ if block_seconds < 60:
+ time_str = f"{block_seconds:.1f}s"
+ else:
+ time_str = f"{block_seconds / 60:.1f}min"
+
+ # Format requests
+ requests_str = f"{block['estimated_requests']:,.0f}"
+
+ # Format throughput
+ throughput_str = f"{block['throughput']:.2f}/s"
+
+ # Format amplification
+ amplif_str = f"{block['amplification']:.1f}x"
+
+ block_table.add_row(
+ block["block"],
+ time_str,
+ requests_str,
+ throughput_str,
+ amplif_str,
+ )
+
+ console.print(
+ Panel(
+ block_table,
+ title="[bold bright_white]Per-Block Breakdown[/bold bright_white]",
+ border_style="bright_blue",
+ )
+ )
+
+ console.print()
+
+
  def save_metrics_to_json(
  block_metrics: list[dict[str, Any]],
  flow_name: str,
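
For reference, a hedged sketch of calling display_time_estimation_summary directly; the dictionary keys mirror those read in the function body above, and the numbers are invented:

from sdg_hub.core.utils.flow_metrics import display_time_estimation_summary

estimation = {
    "estimated_time_seconds": 540.0,        # roughly nine minutes
    "total_estimated_requests": 1200,
    "block_estimates": [
        {"block": "gen_detailed_summary", "estimated_time": 300.0,
         "estimated_requests": 600, "throughput": 2.4, "amplification": 1.0},
    ],
}
display_time_estimation_summary(estimation, dataset_size=600, max_concurrency=50)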
sdg_hub/core/utils/time_estimator.py ADDED
@@ -0,0 +1,344 @@
+ # SPDX-License-Identifier: Apache-2.0
+ """Time estimation utility for predicting full dataset execution time from dry_run results."""
+
+ # Standard
+ from typing import Dict, Optional
+ import math
+
+ # Default max concurrent requests used during dry runs
+ DRY_RUN_MAX_CONCURRENT = 100
+
+ # Conservative estimation factor (20% buffer for API variability, network latency, etc.)
+ ESTIMATION_BUFFER_FACTOR = 1.2
+
+
+ def is_llm_using_block(block_info: Dict) -> bool:
+ """Detect if a block uses LLMs.
+
+ Identifies blocks that make LLM API calls based on their type or parameters.
+ This is used to calculate request amplification for LLM blocks.
+
+ Parameters
+ ----------
+ block_info : Dict
+ Block information from dry_run results containing block_type and parameters_used.
+
+ Returns
+ -------
+ bool
+ True if the block uses LLMs, False otherwise.
+
+ Examples
+ --------
+ >>> block = {"block_type": "LLMChatBlock", "parameters_used": {"model": "gpt-4"}}
+ >>> is_llm_using_block(block)
+ True
+ """
+ block_type = block_info.get("block_type", "")
+
+ # Direct LLM blocks or evaluation/verification blocks
+ if any(kw in block_type for kw in ["LLMChatBlock", "Evaluate", "Verify"]):
+ return True
+
+ # Check for model parameters
+ params = block_info.get("parameters_used", {})
+ if any(key in params for key in ["model", "api_base", "api_key"]):
+ return True
+
+ return False
+
+
+ def calculate_block_throughput(
+ block_1: Dict, block_2: Dict, samples_1: int, samples_2: int
+ ) -> Dict:
+ """Calculate throughput and amplification from two dry runs.
+
+ Analyzes performance metrics from two dry runs with different sample sizes
+ to estimate throughput (requests/second), amplification factor, and startup overhead.
+
+ Parameters
+ ----------
+ block_1 : Dict
+ Block execution info from first dry run.
+ block_2 : Dict
+ Block execution info from second dry run.
+ samples_1 : int
+ Number of samples in first dry run.
+ samples_2 : int
+ Number of samples in second dry run.
+
+ Returns
+ -------
+ Dict
+ Dictionary containing:
+ - throughput: float, requests per second
+ - amplification: float, average requests per input sample
+ - startup_overhead: float, fixed startup time in seconds
+
+ Raises
+ ------
+ ValueError
+ If throughput cannot be calculated due to invalid measurements.
+
+ Examples
+ --------
+ >>> block1 = {"execution_time_seconds": 1.0, "input_rows": 1, "block_name": "test"}
+ >>> block2 = {"execution_time_seconds": 2.0, "input_rows": 5, "block_name": "test"}
+ >>> result = calculate_block_throughput(block1, block2, 1, 5)
+ >>> assert result["throughput"] > 0
+ """
+ time_1 = block_1.get("execution_time_seconds", 0)
+ time_2 = block_2.get("execution_time_seconds", 0)
+ requests_1 = block_1.get("input_rows", 0)
+ requests_2 = block_2.get("input_rows", 0)
+
+ # Calculate amplification (requests per sample)
+ amp_1 = requests_1 / samples_1 if samples_1 > 0 else 1
+ amp_2 = requests_2 / samples_2 if samples_2 > 0 else 1
+ avg_amplification = (amp_1 + amp_2) / 2
+
+ # Use linear scaling to extract throughput and overhead from two data points
+ # Model: time = startup_overhead + (requests / throughput)
+
+ if requests_2 > requests_1 and time_2 > time_1:
+ # Calculate marginal time per request (slope of the line)
+ marginal_time = (time_2 - time_1) / (requests_2 - requests_1)
+
+ # Throughput is the inverse of marginal time
+ measured_throughput = 1.0 / marginal_time if marginal_time > 0 else 0
+
+ # Y-intercept is the startup overhead
+ startup_overhead = max(0, time_1 - (requests_1 * marginal_time))
+ else:
+ # Fallback to simple calculation if we don't have good data for scaling
+ throughput_1 = requests_1 / time_1 if time_1 > 0 else 0
+ throughput_2 = requests_2 / time_2 if time_2 > 0 else 0
+ measured_throughput = max(throughput_1, throughput_2)
+
+ # Estimate overhead as a small fraction of time
+ startup_overhead = min(2.0, time_1 * 0.1) # Assume 10% overhead, max 2 seconds
+
+ # If we have no valid measurements, raise an error
+ if measured_throughput == 0:
+ raise ValueError(
+ f"Cannot calculate throughput for block '{block_1.get('block_name', 'unknown')}': "
+ f"No valid measurements from dry runs (time_1={time_1}, time_2={time_2}, "
+ f"requests_1={requests_1}, requests_2={requests_2})"
+ )
+
+ return {
+ "throughput": measured_throughput,
+ "amplification": avg_amplification,
+ "startup_overhead": startup_overhead,
+ }
+
+
+ def calculate_time_with_pipeline(
+ num_requests: float,
+ throughput: float,
+ startup_overhead: float,
+ max_concurrent: int = DRY_RUN_MAX_CONCURRENT,
+ ) -> float:
+ """Calculate time considering pipeline behavior and max concurrent limit.
+
+ Models the execution time for a given number of requests based on throughput,
+ startup overhead, and concurrency constraints. Applies non-linear scaling
+ for diminishing returns at high concurrency levels.
+
+ Parameters
+ ----------
+ num_requests : float
+ Total number of requests to process.
+ throughput : float
+ Base throughput in requests per second.
+ startup_overhead : float
+ Fixed startup time overhead in seconds.
+ max_concurrent : int, optional
+ Maximum number of concurrent requests, by default 100.
+
+ Returns
+ -------
+ float
+ Estimated total execution time in seconds.
+
+ Examples
+ --------
+ >>> time = calculate_time_with_pipeline(1000, 10.0, 0.5, 50)
+ >>> assert time > 0
+ """
+ if num_requests <= 0:
+ return 0
+
+ # Validate and clamp max_concurrent to avoid division by zero
+ if max_concurrent is None or max_concurrent <= 0:
+ max_concurrent = 1
+
+ # The throughput is what we measured - it represents the server's processing capability
+ if max_concurrent == 1:
+ # Sequential execution - no pipelining benefit
+ effective_throughput = throughput
+ else:
+ # Concurrent execution - small pipelining benefit
+ # At most 10% improvement from perfect pipelining (conservative estimate)
+ # Logarithmic growth to model diminishing returns
+ pipelining_factor = 1.0 + (0.1 * math.log(max_concurrent) / math.log(100))
+ pipelining_factor = min(pipelining_factor, 1.1) # Cap at 10% improvement
+ effective_throughput = throughput * pipelining_factor
+
+ # Calculate total time
+ base_time = startup_overhead + (num_requests / effective_throughput)
+
+ return base_time
+
+
+ def estimate_execution_time(
+ dry_run_1: Dict,
+ dry_run_2: Optional[Dict] = None,
+ total_dataset_size: Optional[int] = None,
+ max_concurrency: Optional[int] = None,
+ ) -> Dict:
+ """Estimate execution time based on dry run results.
+
+ Estimates the total execution time for a full dataset based on one or two
+ dry runs with smaller sample sizes. For async blocks (with two dry runs),
+ calculates throughput and concurrency benefits. For sync blocks (single dry run),
+ performs simple linear scaling.
+
+ The estimates include a conservative buffer (20%) to account for API variability,
+ network latency, and other real-world factors.
+
+ Parameters
+ ----------
+ dry_run_1 : Dict
+ Results from first dry run, must contain 'sample_size' and 'execution_time_seconds'.
+ dry_run_2 : Optional[Dict], optional
+ Results from second dry run for async estimation, by default None.
+ total_dataset_size : Optional[int], optional
+ Size of full dataset to estimate for. If None, uses original_dataset_size from dry_run_1.
+ max_concurrency : Optional[int], optional
+ Maximum concurrent requests allowed, by default 100.
+
+ Returns
+ -------
+ Dict
+ Estimation results containing:
+ - estimated_time_seconds: float, estimated time with current configuration (includes buffer)
+ - total_estimated_requests: int, total LLM requests (0 for sync blocks)
+ - block_estimates: list, per-block estimates (for async blocks)
+ - note: str, additional information about the estimation
+
+ Examples
+ --------
+ >>> dry_run = {"sample_size": 2, "execution_time_seconds": 10.0}
+ >>> result = estimate_execution_time(dry_run, total_dataset_size=100)
+ >>> assert result["estimated_time_seconds"] > 0
+ >>>
+ >>> # With two dry runs for async estimation
+ >>> dry_run_1 = {"sample_size": 1, "execution_time_seconds": 5.0, "blocks_executed": [...]}
+ >>> dry_run_2 = {"sample_size": 5, "execution_time_seconds": 20.0, "blocks_executed": [...]}
+ >>> result = estimate_execution_time(dry_run_1, dry_run_2, total_dataset_size=1000)
+ >>> assert result["estimated_time_seconds"] > 0
+ """
+ # Set defaults
+ if max_concurrency is None:
+ max_concurrency = DRY_RUN_MAX_CONCURRENT
+
+ if total_dataset_size is None:
+ total_dataset_size = dry_run_1.get(
+ "original_dataset_size", dry_run_1["sample_size"]
+ )
+
+ # Get sample sizes
+ samples_1 = dry_run_1["sample_size"]
+ samples_2 = (
+ dry_run_2["sample_size"] if dry_run_2 else 5
+ ) # Default to 5 if not provided
+
+ # If only one dry run, do simple scaling
+ if dry_run_2 is None:
+ # Process each block individually for synchronous execution
+ blocks_executed = dry_run_1.get("blocks_executed", [])
+ if not blocks_executed:
+ # Fallback to simple scaling if no block details available
+ total_time = dry_run_1["execution_time_seconds"]
+ simple_estimate = (total_time / samples_1) * total_dataset_size
+ # Apply conservative buffer
+ simple_estimate = simple_estimate * ESTIMATION_BUFFER_FACTOR
+ return {
+ "estimated_time_seconds": simple_estimate,
+ "total_estimated_requests": 0,
+ "note": "Synchronous execution - linear scaling from dry run",
+ }
+
+ # Calculate time for each block and sum them
+ total_estimated_time = 0
+ for block in blocks_executed:
+ block_time = block.get("execution_time_seconds", 0)
+ input_rows = block.get("input_rows", samples_1)
+
+ # Calculate time per row for this block
+ if input_rows > 0:
+ time_per_row = block_time / input_rows
+ block_total_time = time_per_row * total_dataset_size
+ total_estimated_time += block_total_time
+
+ # Apply conservative buffer
+ total_estimated_time = total_estimated_time * ESTIMATION_BUFFER_FACTOR
+ return {
+ "estimated_time_seconds": total_estimated_time,
+ "total_estimated_requests": 0,
+ "note": "Synchronous execution - no concurrency",
+ }
+
+ # Analyze each block with async execution
+ block_estimates = []
+ total_time = 0
+ total_requests = 0
+
+ # Process each block
+ for i, block_1 in enumerate(dry_run_1.get("blocks_executed", [])):
+ if i >= len(dry_run_2.get("blocks_executed", [])):
+ break
+
+ block_2 = dry_run_2["blocks_executed"][i]
+
+ # Only process LLM blocks
+ if not is_llm_using_block(block_1):
+ continue
+
+ # Calculate throughput and amplification
+ analysis = calculate_block_throughput(block_1, block_2, samples_1, samples_2)
+
+ # Estimate requests for full dataset
+ estimated_requests = total_dataset_size * analysis["amplification"]
+
+ # Calculate time with pipeline model
+ block_time = calculate_time_with_pipeline(
+ estimated_requests,
+ analysis["throughput"],
+ analysis["startup_overhead"],
+ max_concurrency,
+ )
+
+ total_time += block_time
+ total_requests += estimated_requests
+
+ block_estimates.append(
+ {
+ "block": block_1["block_name"],
+ "estimated_requests": estimated_requests,
+ "throughput": analysis["throughput"],
+ "estimated_time": block_time,
+ "amplification": analysis["amplification"],
+ "startup_overhead": analysis["startup_overhead"],
+ }
+ )
+
+ # Apply conservative buffer to account for API variability, network issues, etc.
+ total_time = total_time * ESTIMATION_BUFFER_FACTOR
+
+ return {
+ "estimated_time_seconds": total_time,
+ "total_estimated_requests": int(total_requests),
+ "block_estimates": block_estimates,
+ }
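
A small usage sketch for the new estimator (assumed inputs; in practice the two dictionaries come from Flow.dry_run with sample sizes 1 and 5):

from sdg_hub.core.utils import estimate_execution_time

# Invented timings standing in for real dry-run output.
dry_run_1 = {"sample_size": 1, "execution_time_seconds": 5.0, "blocks_executed": [
    {"block_name": "gen", "block_type": "LLMChatBlock",
     "execution_time_seconds": 5.0, "input_rows": 1}]}
dry_run_2 = {"sample_size": 5, "execution_time_seconds": 12.0, "blocks_executed": [
    {"block_name": "gen", "block_type": "LLMChatBlock",
     "execution_time_seconds": 12.0, "input_rows": 5}]}

est = estimate_execution_time(dry_run_1, dry_run_2, total_dataset_size=1000, max_concurrency=50)
print(est["estimated_time_seconds"], est["total_estimated_requests"])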
@@ -17,7 +17,6 @@ metadata:
  - qa-pairs
  - detailed-summaries
  license: Apache-2.0
- min_sdg_hub_version: 0.2.0
  dataset_requirements:
  required_columns:
  - document
@@ -17,7 +17,6 @@ metadata:
  - qa-pairs
  - detailed-summaries
  license: Apache-2.0
- min_sdg_hub_version: 0.2.0
  dataset_requirements:
  required_columns:
  - document
@@ -19,7 +19,6 @@ metadata:
  - qa-pairs
  - extractive-summaries
  license: Apache-2.0
- min_sdg_hub_version: 0.2.0
  dataset_requirements:
  required_columns:
  - document
@@ -17,7 +17,6 @@ metadata:
  - qa-pairs
  - key-facts
  license: Apache-2.0
- min_sdg_hub_version: 0.2.0
  dataset_requirements:
  required_columns:
  - document
@@ -18,8 +18,7 @@ metadata:
  - "educational"

  license: "Apache-2.0"
- min_sdg_hub_version: "0.2.0"
-
+
  dataset_requirements:
  required_columns:
  - "document"
@@ -19,8 +19,7 @@ metadata:
  - "japanese"

  license: "Apache-2.0"
- min_sdg_hub_version: "0.2.0"
-
+
  dataset_requirements:
  required_columns:
  - "document"
@@ -55,17 +54,19 @@ blocks:
  output_cols: raw_summary_detailed
  max_tokens: 2048
  async_mode: true
+ # n: 2

  - block_type: LLMParserBlock
  block_config:
- block_name: extract_detailed_summary
+ block_name: detailed_summary
  input_cols: raw_summary_detailed
  extract_content: true
+ # extract_reasoning_content: true

  - block_type: TextParserBlock
  block_config:
  block_name: parse_detailed_summary
- input_cols: extract_detailed_summary_content
+ input_cols: detailed_summary_content
  output_cols: summary_detailed
  start_tags: [""]
  end_tags: [""]
@@ -87,14 +88,14 @@ blocks:

  - block_type: LLMParserBlock
  block_config:
- block_name: extract_atomic_facts
+ block_name: atomic_facts
  input_cols: raw_atomic_facts
  extract_content: true

  - block_type: TextParserBlock
  block_config:
  block_name: parse_atomic_facts
- input_cols: extract_atomic_facts_content
+ input_cols: atomic_facts_content
  output_cols: summary_atomic_facts
  start_tags: [""]
  end_tags: [""]
@@ -116,14 +117,14 @@ blocks:

  - block_type: LLMParserBlock
  block_config:
- block_name: extract_extractive_summary
+ block_name: extractive_summary
  input_cols: raw_summary_extractive
  extract_content: true

  - block_type: TextParserBlock
  block_config:
  block_name: parse_extractive_summary
- input_cols: extract_extractive_summary_content
+ input_cols: extractive_summary_content
  output_cols: summary_extractive
  start_tags: [""]
  end_tags: [""]
@@ -157,14 +158,14 @@ blocks:

  - block_type: LLMParserBlock
  block_config:
- block_name: extract_knowledge_generation
+ block_name: get_knowledge_generation
  input_cols: raw_knowledge_generation
  extract_content: true

  - block_type: TextParserBlock
  block_config:
  block_name: parse_knowledge_generation
- input_cols: extract_knowledge_generation_content
+ input_cols: get_knowledge_generation_content
  output_cols: [question, response]
  parsing_pattern: "\\[(?:Question|QUESTION)\\]\\s*(.*?)\\s*\\[(?:Answer|ANSWER)\\]\\s*(.*?)\\s*(?=\\[(?:Question|QUESTION)\\]|$)"
  parser_cleanup_tags: ["[END]"]
@@ -24,7 +24,6 @@ metadata:
  - "entity-extraction"
  - "keyword-extraction"
  license: "Apache-2.0"
- min_sdg_hub_version: "0.2.0"
  dataset_requirements:
  required_columns:
  - "text"
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: sdg_hub
- Version: 0.4.0
+ Version: 0.4.2
  Summary: Synthetic Data Generation
  Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
  License: Apache-2.0
@@ -65,6 +65,7 @@ Requires-Dist: pytest-html; extra == "dev"
  Requires-Dist: tox<5,>=4.4.2; extra == "dev"
  Requires-Dist: ruff; extra == "dev"
  Requires-Dist: pytest-env; extra == "dev"
+ Requires-Dist: nbconvert>=7.0.0; extra == "dev"
  Dynamic: license-file

  # `sdg_hub`: Synthetic Data Generation Toolkit
@@ -1,7 +1,7 @@
1
- sdg_hub/__init__.py,sha256=Tw-6R5a8_W1kJcTAsW3R9ltBDP1dy5-fe7Tvt3cSyCQ,550
2
- sdg_hub/_version.py,sha256=2_0GUP7yBCXRus-qiJKxQD62z172WSs1sQ6DVpPsbmM,704
1
+ sdg_hub/__init__.py,sha256=TlkZT40-70urdcWLqv3kupaJj8s-SVgd2QyvlSFwb4A,510
2
+ sdg_hub/_version.py,sha256=A45grTqzrHuDn1CT9K5GVUbY4_Q3OSTcXAl3zdHzcEI,704
3
3
  sdg_hub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- sdg_hub/core/__init__.py,sha256=NwqB4fwhC29W50VW7QXZssLxx122YvgO9LHDLdgAnrI,496
4
+ sdg_hub/core/__init__.py,sha256=e3BoejbqjYhasf9t__L4qE52lkD9EBjx4o--2kqKdro,460
5
5
  sdg_hub/core/blocks/__init__.py,sha256=5FsbkcO-dmBv6MqO96TPn9FKKPTQZQCv20j4wR7UvQw,1502
6
6
  sdg_hub/core/blocks/base.py,sha256=-SOdBpJwtRTMsrmCEuLjUBQMRCo_PLYlHEBRrz8sF9g,13031
7
7
  sdg_hub/core/blocks/registry.py,sha256=FuEN_pnq-nSH1LguY3_oCubT6Kz3SuJjk3TcUpLT-lw,10695
@@ -32,21 +32,22 @@ sdg_hub/core/blocks/transform/melt_columns.py,sha256=vaYa5Taq6GhNZYWFL4uPK3-SfN2
32
32
  sdg_hub/core/blocks/transform/rename_columns.py,sha256=qeB5L2utqDQnutUetH1VKZSqDiJSH_yUp5EFCV-XCVI,1998
33
33
  sdg_hub/core/blocks/transform/text_concat.py,sha256=_-B__Hob1WwgwkILPIZvTnsDzuwtoX1hKviyzHlnnes,3149
34
34
  sdg_hub/core/blocks/transform/uniform_col_val_setter.py,sha256=XnjiT29z3PzIPy8M-mmE2w-Miab6Ed5ahy32SaxTCTE,3263
35
- sdg_hub/core/flow/__init__.py,sha256=N2NZGngvd7qpT5FI_knKukUFM0IkD9K5jdTi-gDeUI4,475
36
- sdg_hub/core/flow/base.py,sha256=6UlQ7ymVNs03UQ4NNgD15Y6eFyKPcl5JpuWOZuY70Mo,56654
35
+ sdg_hub/core/flow/__init__.py,sha256=0_m_htuZfPxk8xQ9IKfp0Pz-JRE4O7lYMUFrKyLNoLA,409
36
+ sdg_hub/core/flow/base.py,sha256=4kR-dKXAlLFSwm3YWdT8EoedCIGJT56agcot3tQb6VY,59508
37
37
  sdg_hub/core/flow/checkpointer.py,sha256=stm5ZtjjEiLk9ZkAAnoQQn5Y8Yl_d7qCsQLZTrCXR48,11867
38
- sdg_hub/core/flow/metadata.py,sha256=h9jpvAzWsF5n4ztZMzwa9ZNgnzKTHmFWdn7YbyJLHCw,12977
38
+ sdg_hub/core/flow/metadata.py,sha256=cFrpJjWOaK87aCuRFyC3Pdf83oYU93mrmZEMdUnhsN8,10540
39
39
  sdg_hub/core/flow/migration.py,sha256=6and-RBqV0t2gRipr1GiOOVnyBJdtyyjw1kO08Z--d4,7558
40
- sdg_hub/core/flow/registry.py,sha256=DzCqEEgwhvwnCBAGLogoMVdwXh4pCHrxOWqoxam7O8I,12162
40
+ sdg_hub/core/flow/registry.py,sha256=N6KfX-L7QRkooznIFxDuhRZYuDA5g3N5zC-KRm2jVhk,12109
41
41
  sdg_hub/core/flow/validation.py,sha256=pUJvgaUjLpKNwvW6djcqVOF-HShOjegEmGOnUnoX4BA,9722
42
- sdg_hub/core/utils/__init__.py,sha256=C2FzLn3dHprwGJDEgI4fyFS3aoCJR-9PhHsunxropJ8,351
42
+ sdg_hub/core/utils/__init__.py,sha256=KcT56JhobC5sBg0MKEMn5hc4OyKa9_Vnn45Mt_kS4jQ,610
43
43
  sdg_hub/core/utils/datautils.py,sha256=__HkUe1DxcJVHKrFX68z_hDXwxJygBlJDfjJLnj7rHc,4230
44
44
  sdg_hub/core/utils/error_handling.py,sha256=yku8cGj_nKCyXDsnb-mHCpgukkkAMucJ4iAUrIzqysc,5510
45
45
  sdg_hub/core/utils/flow_id_words.yaml,sha256=5QHpQdP7zwahRuooyAlJIwBY7WcDR7vtbJXxVJqujbg,2317
46
46
  sdg_hub/core/utils/flow_identifier.py,sha256=aAHfK_G9AwEtMglLRMdMpi_AI1dciub5UqBGm4yb2HE,2841
47
- sdg_hub/core/utils/flow_metrics.py,sha256=VOdreUzP0kPgnkPjuQk87tZsK5f1u6XGEPM8ugCt0CY,8824
47
+ sdg_hub/core/utils/flow_metrics.py,sha256=3G-xbfr-rFA578wV4KUbQePTMVGZHr9-rXvyYL4Kt2Q,12604
48
48
  sdg_hub/core/utils/logger_config.py,sha256=6_cnsIHtSAdq1iTTZ7Q7nAJ1dmldlxSZ0AB49yLiQ20,2034
49
49
  sdg_hub/core/utils/path_resolution.py,sha256=yWof4kGNpQ5dKcrVHg0h9KfOKLZ6ROjdfsLAZsQT5rM,2000
50
+ sdg_hub/core/utils/time_estimator.py,sha256=rM3_R-Ka5DEtvOtlJoA_5pXSyQ6tT6t4h6qh3_5BCZo,12639
50
51
  sdg_hub/core/utils/yaml_utils.py,sha256=tShCd-FFkp0xlKnLe7dXsMOR4AvT9d2qRUmu4ZnPSEY,1458
51
52
  sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
52
53
  sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_answers.yaml,sha256=THRT3cY44KGI_69B2wqt2Q89EknnOSE7B4A_jdnxlIU,330
@@ -54,14 +55,14 @@ sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/gener
54
55
  sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_question_list.yaml,sha256=qHOgUNrQz2vjUjJiEHNGWxDDXwjJlP1kofTxeGgLyPI,1461
55
56
  sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
56
57
  sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/detailed_summary.yaml,sha256=Ik6gAml0O-jPq8jpXBAkURzYkQuFOnDZb4LDwjmfAiE,381
57
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml,sha256=_h_EFdxen842BeJd20soaCeR4eccccxAerUV6myUefE,5567
58
+ sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml,sha256=fUdzY9dtU69o99Uq8FIPycgVWdLD-1kbY97Bh-Vo2A0,5538
58
59
  sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
59
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/flow.yaml,sha256=OJDlm8uGNqGPertACSG5pKKVGOKdfsQ6RMeh4UHZMJs,4442
60
+ sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/flow.yaml,sha256=smPWVUZRCt58EagWDmJVmTBQj8qMcjpzh-Q3GSuFrz0,4413
60
61
  sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
61
62
  sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/extractive_summary.yaml,sha256=SeapWoOx3fhN5SvWYuHss_9prLE8xSkOic7JkbDHSR0,4081
62
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml,sha256=Yy6-2Vytdr4FPxC5wTQkcv7Amy-DBMA3H8vOx9tBB9U,5735
63
+ sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml,sha256=iNNIfofFE7awK7iivtIFWxjfjy8QviMugOPPnOTySKA,5706
63
64
  sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
64
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml,sha256=QYN-zNl0YtqKXCTpMJBD9vbYsTf-30cap9ziiDwxKk0,3248
65
+ sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml,sha256=CIUZNYhvszT-jpz1Hvh6nS2y5W34P529ZOMp8thEQ9k,3219
65
66
  sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/key_facts_summary.yaml,sha256=YKMX_CuvcThG_bdNCAIXdVBkMvB72I89RGq2ltSSgc8,3298
66
67
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
67
68
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -71,24 +72,24 @@ sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/ev
71
72
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml,sha256=zwzklXup6khRkR88avgrJTcjaMcV1wnbeYaML5oPuNs,1767
72
73
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml,sha256=cA8igo7jMrRXaWW6k0of6KOp7YnxLtPj0fP4DbrmZNQ,3647
73
74
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml,sha256=fcMV7LaCFZo4D29nwhGJXqFFuZMYVLo9XYjv8zcU6zs,364
74
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml,sha256=QOhucXsokNEXGdXtk38qxQnSDwiCngUciXRjBqDcnDU,9088
75
+ sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml,sha256=HR8sf7RUZKr8UqKztBj-nlvyrve1UMUu8x8qgYM6O14,9055
75
76
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml,sha256=yX8aLY8dJSDML9ZJhnj9RzPbN8tH2xfcM4Gc6xZuwqQ,2596
76
77
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
77
78
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
78
79
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/atomic_facts_ja.yaml,sha256=OjPZaSCOSLxEWgW3pmNwF7mmLhGhFGTmKL_3rKdqeW4,2488
79
80
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/detailed_summary_ja.yaml,sha256=nEy_RcotHGiiENrmUANpKkbIFsrARAeSwECrBeHi2so,391
80
81
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/extractive_summary_ja.yaml,sha256=V90W0IeJQZTFThA8v0UOs3DtZbtU3BI9jkpChw1BULo,402
81
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml,sha256=ittFo_tyvG_1eqooO_9NK4jqepafgpHFGy2fuVfjFto,9207
82
+ sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml,sha256=jumjKmKshSd8hoTYpyBJ0nMOADeQmxBmNPY7yfa_xQ8,9171
82
83
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/generate_questions_responses_ja.yaml,sha256=96SQqXG7fmb-50SdX85sgVtrFcQ-oNKe_0BoQdZmY5g,2638
83
84
  sdg_hub/flows/text_analysis/__init__.py,sha256=WStks4eM_KHNTVsHglcj8vFghmI0PH9P1hUrijBLbwc,125
84
85
  sdg_hub/flows/text_analysis/structured_insights/__init__.py,sha256=_DT4NR05JD9CZoSWROPr2lC6se0VjSqQPZJJlEV79mk,274
85
86
  sdg_hub/flows/text_analysis/structured_insights/analyze_sentiment.yaml,sha256=1YGPypFJYS8qfYFj2J6ERTgodKJvMF4YHNGt_vOF5qc,1000
86
87
  sdg_hub/flows/text_analysis/structured_insights/extract_entities.yaml,sha256=Q_SDy14Zu-qS2sbKfUBmGlYj3k7CUg6HzzXlFCXRKuU,1169
87
88
  sdg_hub/flows/text_analysis/structured_insights/extract_keywords.yaml,sha256=_nPPMdHnxag_lYbhYUjGJGo-CvRwWvwdGX7cQhdZ1S0,847
88
- sdg_hub/flows/text_analysis/structured_insights/flow.yaml,sha256=Qpo9WPtl0PWhBF1stIM8OjaTvhtw3dn4eDADt-xj5cA,4965
89
+ sdg_hub/flows/text_analysis/structured_insights/flow.yaml,sha256=BBV18SdvuVTAESjwkJ7V1jbb-cSTBvNl3SCycd0oEQ4,4934
89
90
  sdg_hub/flows/text_analysis/structured_insights/summarize.yaml,sha256=WXwQak1pF8e1OwnOoI1EHu8QB6iUNW89rfkTdi1Oq54,687
90
- sdg_hub-0.4.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
91
- sdg_hub-0.4.0.dist-info/METADATA,sha256=SPjLdht-43yAyDwZzdk91SYoQn8jRbsCTr4qBkXVVlw,9735
92
- sdg_hub-0.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
93
- sdg_hub-0.4.0.dist-info/top_level.txt,sha256=TqI7d-HE1n6zkXFkU0nF3A1Ct0P0pBaqI675uFokhx4,8
94
- sdg_hub-0.4.0.dist-info/RECORD,,
91
+ sdg_hub-0.4.2.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
92
+ sdg_hub-0.4.2.dist-info/METADATA,sha256=5qbw9_DoVmfntmQlvz4VPdQXdUXoLO8Zhrxbc1uY7b0,9783
93
+ sdg_hub-0.4.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
94
+ sdg_hub-0.4.2.dist-info/top_level.txt,sha256=TqI7d-HE1n6zkXFkU0nF3A1Ct0P0pBaqI675uFokhx4,8
95
+ sdg_hub-0.4.2.dist-info/RECORD,,