sdg-hub 0.4.1__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sdg_hub/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.4.1'
32
- __version_tuple__ = version_tuple = (0, 4, 1)
31
+ __version__ = version = '0.4.2'
32
+ __version_tuple__ = version_tuple = (0, 4, 2)
33
33
 
34
34
  __commit_id__ = commit_id = None
sdg_hub/core/flow/base.py CHANGED
@@ -30,9 +30,14 @@ from ..blocks.base import BaseBlock
30
30
  from ..blocks.registry import BlockRegistry
31
31
  from ..utils.datautils import safe_concatenate_with_validation, validate_no_duplicates
32
32
  from ..utils.error_handling import EmptyDatasetError, FlowValidationError
33
- from ..utils.flow_metrics import display_metrics_summary, save_metrics_to_json
33
+ from ..utils.flow_metrics import (
34
+ display_metrics_summary,
35
+ display_time_estimation_summary,
36
+ save_metrics_to_json,
37
+ )
34
38
  from ..utils.logger_config import setup_logger
35
39
  from ..utils.path_resolution import resolve_path
40
+ from ..utils.time_estimator import estimate_execution_time
36
41
  from ..utils.yaml_utils import save_flow_yaml
37
42
  from .checkpointer import FlowCheckpointer
38
43
  from .metadata import DatasetRequirements, FlowMetadata
@@ -1006,6 +1011,8 @@ class Flow(BaseModel):
1006
1011
  dataset: Dataset,
1007
1012
  sample_size: int = 2,
1008
1013
  runtime_params: Optional[dict[str, dict[str, Any]]] = None,
1014
+ max_concurrency: Optional[int] = None,
1015
+ enable_time_estimation: bool = False,
1009
1016
  ) -> dict[str, Any]:
1010
1017
  """Perform a dry run of the flow with a subset of data.
1011
1018
 
@@ -1017,11 +1024,18 @@ class Flow(BaseModel):
1017
1024
  Number of samples to use for dry run testing.
1018
1025
  runtime_params : Optional[Dict[str, Dict[str, Any]]], optional
1019
1026
  Runtime parameters organized by block name.
1027
+ max_concurrency : Optional[int], optional
1028
+ Maximum concurrent requests for LLM blocks. If None, no limit is applied.
1029
+ enable_time_estimation : bool, default=False
1030
+ If True, estimates execution time for the full dataset and displays it
1031
+ in a Rich table. Automatically runs a second dry run if needed for
1032
+ accurate scaling analysis.
1020
1033
 
1021
1034
  Returns
1022
1035
  -------
1023
1036
  Dict[str, Any]
1024
1037
  Dry run results with execution info and sample outputs.
1038
+ Time estimation is displayed in a table but not included in return value.
1025
1039
 
1026
1040
  Raises
1027
1041
  ------
@@ -1039,6 +1053,19 @@ class Flow(BaseModel):
1039
1053
 
1040
1054
  validate_no_duplicates(dataset)
1041
1055
 
1056
+ # Validate max_concurrency parameter
1057
+ if max_concurrency is not None:
1058
+ if isinstance(max_concurrency, bool) or not isinstance(
1059
+ max_concurrency, int
1060
+ ):
1061
+ raise FlowValidationError(
1062
+ f"max_concurrency must be an int, got {type(max_concurrency).__name__}"
1063
+ )
1064
+ if max_concurrency <= 0:
1065
+ raise FlowValidationError(
1066
+ f"max_concurrency must be greater than 0, got {max_concurrency}"
1067
+ )
1068
+
1042
1069
  # Use smaller sample size if dataset is smaller
1043
1070
  actual_sample_size = min(sample_size, len(dataset))
1044
1071
 
@@ -1056,6 +1083,7 @@ class Flow(BaseModel):
1056
1083
  "flow_version": self.metadata.version,
1057
1084
  "sample_size": actual_sample_size,
1058
1085
  "original_dataset_size": len(dataset),
1086
+ "max_concurrency": max_concurrency,
1059
1087
  "input_columns": dataset.column_names,
1060
1088
  "blocks_executed": [],
1061
1089
  "final_dataset": None,
@@ -1082,6 +1110,10 @@ class Flow(BaseModel):
1082
1110
  # Prepare block execution parameters
1083
1111
  block_kwargs = self._prepare_block_kwargs(block, runtime_params)
1084
1112
 
1113
+ # Add max_concurrency to block kwargs if provided
1114
+ if max_concurrency is not None:
1115
+ block_kwargs["_flow_max_concurrency"] = max_concurrency
1116
+
1085
1117
  # Check if this is a deprecated block and skip validations
1086
1118
  is_deprecated_block = (
1087
1119
  hasattr(block, "__class__")
@@ -1099,7 +1131,9 @@ class Flow(BaseModel):
1099
1131
  # Execute block with validation and logging
1100
1132
  current_dataset = block(current_dataset, **block_kwargs)
1101
1133
 
1102
- block_execution_time = time.time() - block_start_time
1134
+ block_execution_time = (
1135
+ time.perf_counter() - block_start_time
1136
+ ) # Fixed: use perf_counter consistently
1103
1137
 
1104
1138
  # Record block execution info
1105
1139
  block_info = {
@@ -1138,6 +1172,12 @@ class Flow(BaseModel):
1138
1172
  f"in {execution_time:.2f}s"
1139
1173
  )
1140
1174
 
1175
+ # Perform time estimation if requested (displays table but doesn't store in results)
1176
+ if enable_time_estimation:
1177
+ self._estimate_total_time(
1178
+ dry_run_results, dataset, runtime_params, max_concurrency
1179
+ )
1180
+
1141
1181
  return dry_run_results
1142
1182
 
1143
1183
  except Exception as exc:
@@ -1150,6 +1190,103 @@ class Flow(BaseModel):
1150
1190
 
1151
1191
  raise FlowValidationError(f"Dry run failed: {exc}") from exc
1152
1192
 
1193
def _estimate_total_time(
    self,
    first_run_results: dict[str, Any],
    dataset: Dataset,
    runtime_params: Optional[dict[str, dict[str, Any]]],
    max_concurrency: Optional[int],
) -> dict[str, Any]:
    """Estimate full-dataset execution time from dry-run measurements (private).

    Decides whether the already completed dry run is enough on its own or
    whether extra dry runs are required, performs them, passes the resulting
    measurements to ``estimate_execution_time`` and prints the outcome with
    ``display_time_estimation_summary``.

    Parameters
    ----------
    first_run_results : dict
        Results of the dry run that has already been executed; its
        ``"sample_size"`` entry selects which additional runs are needed.
    dataset : Dataset
        Full dataset the estimate is produced for.
    runtime_params : Optional[dict]
        Runtime parameters forwarded unchanged to any additional dry run.
    max_concurrency : Optional[int]
        Concurrency limit forwarded to the additional dry runs and to the
        estimator.

    Returns
    -------
    dict
        The dictionary returned by ``estimate_execution_time``.
    """
    initial_samples = first_run_results["sample_size"]
    uses_async = any(getattr(blk, "async_mode", False) for blk in self.blocks)

    def _probe(sample_count: int) -> dict[str, Any]:
        # Extra measurement run; estimation is switched off so the nested
        # dry run does not trigger another estimation pass.
        return self.dry_run(
            dataset,
            sample_count,
            runtime_params,
            max_concurrency,
            enable_time_estimation=False,
        )

    if max_concurrency == 1 or not uses_async:
        # Sequential execution or no async blocks: one measurement suffices.
        small_run, large_run = first_run_results, None
    elif initial_samples == 1:
        # The 1-sample half of the canonical (1, 5) pair already exists.
        logger.info("Running second dry run with 5 samples for time estimation")
        small_run, large_run = first_run_results, _probe(5)
    elif initial_samples == 5:
        # The 5-sample half of the canonical (1, 5) pair already exists.
        logger.info("Running second dry run with 1 sample for time estimation")
        small_run, large_run = _probe(1), first_run_results
    else:
        # Any other size: measure both halves of the canonical pair afresh.
        logger.info("Running dry runs with 1 and 5 samples for time estimation")
        small_run = _probe(1)
        large_run = _probe(5)

    estimation = estimate_execution_time(
        dry_run_1=small_run,
        dry_run_2=large_run,
        total_dataset_size=len(dataset),
        max_concurrency=max_concurrency,
    )

    # Render the estimate for the user before handing it back.
    display_time_estimation_summary(estimation, len(dataset), max_concurrency)

    return estimation
1289
+
1153
1290
  def add_block(self, block: BaseBlock) -> "Flow":
1154
1291
  """Add a block to the flow, returning a new Flow instance.
1155
1292
 
@@ -1,8 +1,10 @@
1
1
  # SPDX-License-Identifier: Apache-2.0
2
2
 
3
3
  # Local
4
- from .flow_identifier import get_flow_identifier
5
- from .path_resolution import resolve_path
4
+ from .flow_identifier import get_flow_identifier as get_flow_identifier
5
+ from .path_resolution import resolve_path as resolve_path
6
+ from .time_estimator import estimate_execution_time as estimate_execution_time
7
+ from .time_estimator import is_llm_using_block as is_llm_using_block
6
8
 
7
9
 
8
10
  # This is part of the public API, and used by instructlab
@@ -10,4 +12,10 @@ class GenerateError(Exception):
10
12
  """An exception raised during generate step."""
11
13
 
12
14
 
13
- __all__ = ["GenerateError", "resolve_path", "get_flow_identifier"]
15
+ __all__ = [
16
+ "GenerateError",
17
+ "resolve_path",
18
+ "get_flow_identifier",
19
+ "estimate_execution_time",
20
+ "is_llm_using_block",
21
+ ]
@@ -188,6 +188,122 @@ def display_metrics_summary(
188
188
  console.print()
189
189
 
190
190
 
191
def display_time_estimation_summary(
    time_estimation: dict[str, Any],
    dataset_size: int,
    max_concurrency: Optional[int] = None,
) -> None:
    """Display a rich table summarizing time estimation results.

    Prints a summary panel (estimated time, total LLM requests, requests per
    sample, max concurrency) and, when ``time_estimation`` carries a non-empty
    ``"block_estimates"`` list, a second panel with one row per block.

    Parameters
    ----------
    time_estimation : dict[str, Any]
        Time estimation results from ``estimate_execution_time()``. Must
        contain ``"estimated_time_seconds"``; ``"total_estimated_requests"``
        and ``"block_estimates"`` are optional.
    dataset_size : int
        Total number of samples in the dataset.
    max_concurrency : Optional[int], optional
        Maximum concurrency used for estimation. The row is omitted when None.
    """
    console = Console()

    # Create main summary table
    summary_table = Table(
        show_header=False,
        box=None,
        padding=(0, 1),
    )
    summary_table.add_column("Metric", style="bright_cyan")
    summary_table.add_column("Value", style="bright_white")

    # Format time: pick the leading unit by magnitude, show the other in brackets
    est_seconds = time_estimation["estimated_time_seconds"]
    if est_seconds < 60:
        time_str = f"{est_seconds:.1f} seconds"
    elif est_seconds < 3600:
        time_str = f"{est_seconds / 60:.1f} minutes ({est_seconds / 3600:.2f} hours)"
    else:
        time_str = f"{est_seconds / 3600:.2f} hours ({est_seconds / 60:.0f} minutes)"

    total_requests = time_estimation.get("total_estimated_requests", 0)

    summary_table.add_row("Estimated Time:", time_str)
    summary_table.add_row("Total LLM Requests:", f"{total_requests:,}")

    # The per-sample ratio is undefined for an empty dataset, so the row is
    # skipped instead of dividing by zero.
    if total_requests > 0 and dataset_size > 0:
        requests_per_sample = total_requests / dataset_size
        summary_table.add_row("Requests per Sample:", f"{requests_per_sample:.1f}")

    if max_concurrency is not None:
        summary_table.add_row("Max Concurrency:", str(max_concurrency))

    # Display summary panel
    console.print()
    console.print(
        Panel(
            summary_table,
            title=f"[bold bright_white]Time Estimation for {dataset_size:,} Samples[/bold bright_white]",
            border_style="bright_blue",
        )
    )

    # Display per-block breakdown if available
    block_estimates = time_estimation.get("block_estimates", [])
    if block_estimates:
        console.print()

        # Create per-block table
        block_table = Table(
            show_header=True,
            header_style="bold bright_white",
        )
        block_table.add_column("Block Name", style="bright_cyan", width=20)
        block_table.add_column("Time", justify="right", style="bright_yellow", width=10)
        block_table.add_column(
            "Requests", justify="right", style="bright_green", width=10
        )
        block_table.add_column(
            "Throughput", justify="right", style="bright_blue", width=12
        )
        block_table.add_column(
            "Amplif.", justify="right", style="bright_magenta", width=10
        )

        for block in block_estimates:
            # Seconds below one minute, minutes above
            block_seconds = block["estimated_time"]
            if block_seconds < 60:
                block_time_str = f"{block_seconds:.1f}s"
            else:
                block_time_str = f"{block_seconds / 60:.1f}min"

            block_table.add_row(
                block["block"],
                block_time_str,
                f"{block['estimated_requests']:,.0f}",
                f"{block['throughput']:.2f}/s",
                f"{block['amplification']:.1f}x",
            )

        console.print(
            Panel(
                block_table,
                title="[bold bright_white]Per-Block Breakdown[/bold bright_white]",
                border_style="bright_blue",
            )
        )

    console.print()
305
+
306
+
191
307
  def save_metrics_to_json(
192
308
  block_metrics: list[dict[str, Any]],
193
309
  flow_name: str,
@@ -0,0 +1,344 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """Time estimation utility for predicting full dataset execution time from dry_run results."""
3
+
4
+ # Standard
5
+ from typing import Dict, Optional
6
+ import math
7
+
8
+ # Default max concurrent requests used during dry runs
9
+ DRY_RUN_MAX_CONCURRENT = 100
10
+
11
+ # Conservative estimation factor (20% buffer for API variability, network latency, etc.)
12
+ ESTIMATION_BUFFER_FACTOR = 1.2
13
+
14
+
15
def is_llm_using_block(block_info: Dict) -> bool:
    """Detect if a block uses LLMs.

    Identifies blocks that make LLM API calls based on their type or parameters.
    This is used to calculate request amplification for LLM blocks.

    Parameters
    ----------
    block_info : Dict
        Block information from dry_run results containing block_type and
        parameters_used. Either key may be missing or explicitly ``None``.

    Returns
    -------
    bool
        True if the block type contains ``LLMChatBlock``, ``Evaluate`` or
        ``Verify``, or if its parameters include ``model``, ``api_base`` or
        ``api_key``; False otherwise.

    Examples
    --------
    >>> block = {"block_type": "LLMChatBlock", "parameters_used": {"model": "gpt-4"}}
    >>> is_llm_using_block(block)
    True
    >>> is_llm_using_block({"block_type": "TextParserBlock", "parameters_used": None})
    False
    """
    # ``or`` (rather than a .get default) also covers keys that are present
    # but set to None, which would otherwise break the ``in`` tests below.
    block_type = block_info.get("block_type") or ""

    # Direct LLM blocks or evaluation/verification blocks
    if any(kw in block_type for kw in ("LLMChatBlock", "Evaluate", "Verify")):
        return True

    # Any model/endpoint parameter marks the block as LLM-backed
    params = block_info.get("parameters_used") or {}
    return any(key in params for key in ("model", "api_base", "api_key"))
49
+
50
+
51
def calculate_block_throughput(
    block_1: Dict, block_2: Dict, samples_1: int, samples_2: int
) -> Dict:
    """Derive throughput, amplification and startup overhead from two dry runs.

    Fits the model ``time = startup_overhead + requests / throughput`` through
    the two measurements when the second run has both more input rows and a
    longer execution time. Otherwise falls back to the better of the two
    per-run rates and a heuristic overhead.

    Parameters
    ----------
    block_1 : Dict
        Block execution info from the first dry run (reads
        ``execution_time_seconds``, ``input_rows`` and ``block_name``).
    block_2 : Dict
        Block execution info from the second dry run.
    samples_1 : int
        Number of samples in the first dry run.
    samples_2 : int
        Number of samples in the second dry run.

    Returns
    -------
    Dict
        Dictionary containing:
        - throughput: float, requests per second
        - amplification: float, input rows per sample averaged over both runs
        - startup_overhead: float, fixed startup time in seconds

    Raises
    ------
    ValueError
        If the resulting throughput is zero, i.e. neither run produced a
        usable measurement.

    Examples
    --------
    >>> block1 = {"execution_time_seconds": 1.0, "input_rows": 1, "block_name": "test"}
    >>> block2 = {"execution_time_seconds": 2.0, "input_rows": 5, "block_name": "test"}
    >>> result = calculate_block_throughput(block1, block2, 1, 5)
    >>> assert result["throughput"] > 0
    """
    elapsed_a = block_1.get("execution_time_seconds", 0)
    elapsed_b = block_2.get("execution_time_seconds", 0)
    rows_a = block_1.get("input_rows", 0)
    rows_b = block_2.get("input_rows", 0)

    # Rows reaching the block per original sample; 1 when the sample count
    # is not positive.
    ratio_a = rows_a / samples_1 if samples_1 > 0 else 1
    ratio_b = rows_b / samples_2 if samples_2 > 0 else 1

    can_fit_line = rows_b > rows_a and elapsed_b > elapsed_a
    if can_fit_line:
        # Slope of the line through both points = marginal seconds per request
        slope = (elapsed_b - elapsed_a) / (rows_b - rows_a)
        rate = 1.0 / slope if slope > 0 else 0
        # Intercept of that line, clamped so it never goes negative
        overhead = max(0, elapsed_a - (rows_a * slope))
    else:
        # Not enough spread between the runs: take the better observed rate
        rate_a = rows_a / elapsed_a if elapsed_a > 0 else 0
        rate_b = rows_b / elapsed_b if elapsed_b > 0 else 0
        rate = max(rate_a, rate_b)
        # Heuristic overhead: 10% of the first run's time, at most 2 seconds
        overhead = min(2.0, elapsed_a * 0.1)

    if rate == 0:
        raise ValueError(
            f"Cannot calculate throughput for block '{block_1.get('block_name', 'unknown')}': "
            f"No valid measurements from dry runs (time_1={elapsed_a}, time_2={elapsed_b}, "
            f"requests_1={rows_a}, requests_2={rows_b})"
        )

    return {
        "throughput": rate,
        "amplification": (ratio_a + ratio_b) / 2,
        "startup_overhead": overhead,
    }
134
+
135
+
136
def calculate_time_with_pipeline(
    num_requests: float,
    throughput: float,
    startup_overhead: float,
    max_concurrent: int = DRY_RUN_MAX_CONCURRENT,
) -> float:
    """Calculate time considering pipeline behavior and max concurrent limit.

    Models the execution time for a given number of requests as
    ``startup_overhead + num_requests / effective_throughput``. For
    ``max_concurrent > 1`` the measured throughput is increased by a
    logarithmically growing factor that is capped at 10%.

    Parameters
    ----------
    num_requests : float
        Total number of requests to process.
    throughput : float
        Base throughput in requests per second. Must be greater than 0.
    startup_overhead : float
        Fixed startup time overhead in seconds.
    max_concurrent : int, optional
        Maximum number of concurrent requests, by default
        ``DRY_RUN_MAX_CONCURRENT``. ``None`` and non-positive values are
        treated as 1.

    Returns
    -------
    float
        Estimated total execution time in seconds; ``0.0`` when there is
        nothing to process.

    Raises
    ------
    ValueError
        If ``throughput`` is not greater than 0 while there are requests to
        process.

    Examples
    --------
    >>> time = calculate_time_with_pipeline(1000, 10.0, 0.5, 50)
    >>> assert time > 0
    """
    if num_requests <= 0:
        return 0.0

    # A non-positive rate would otherwise surface as a bare ZeroDivisionError
    # (or a negative duration); report it the same way the throughput
    # calculation reports unusable measurements.
    if throughput <= 0:
        raise ValueError(f"throughput must be greater than 0, got {throughput}")

    # Clamp so that math.log below always receives a positive argument
    if max_concurrent is None or max_concurrent <= 0:
        max_concurrent = 1

    if max_concurrent == 1:
        # Sequential execution - no pipelining benefit
        effective_throughput = throughput
    else:
        # Concurrent execution - small, conservative pipelining benefit that
        # grows logarithmically and reaches its 10% cap at 100 concurrent
        # requests.
        pipelining_factor = 1.0 + (0.1 * math.log(max_concurrent) / math.log(100))
        pipelining_factor = min(pipelining_factor, 1.1)
        effective_throughput = throughput * pipelining_factor

    return startup_overhead + (num_requests / effective_throughput)
192
+
193
+
194
def estimate_execution_time(
    dry_run_1: Dict,
    dry_run_2: Optional[Dict] = None,
    total_dataset_size: Optional[int] = None,
    max_concurrency: Optional[int] = None,
) -> Dict:
    """Estimate execution time based on dry run results.

    Estimates the total execution time for a full dataset based on one or two
    dry runs with smaller sample sizes. With two dry runs, throughput and
    amplification are derived per LLM block and only LLM blocks contribute to
    the estimate. With a single dry run, every block's measured time is scaled
    linearly to the full dataset.

    The estimates include a conservative buffer (``ESTIMATION_BUFFER_FACTOR``)
    to account for API variability, network latency, and other real-world
    factors.

    Parameters
    ----------
    dry_run_1 : Dict
        Results from first dry run, must contain 'sample_size' and, when no
        'blocks_executed' details are present, 'execution_time_seconds'.
    dry_run_2 : Optional[Dict], optional
        Results from second dry run for async estimation, by default None.
    total_dataset_size : Optional[int], optional
        Size of full dataset to estimate for. If None, uses original_dataset_size
        from dry_run_1 (falling back to its sample_size).
    max_concurrency : Optional[int], optional
        Maximum concurrent requests allowed. If None, ``DRY_RUN_MAX_CONCURRENT``
        is used. Only the two-run path takes it into account.

    Returns
    -------
    Dict
        Estimation results containing:
        - estimated_time_seconds: float, estimated time (includes buffer)
        - total_estimated_requests: int, total LLM requests (0 for the single-run path)
        - block_estimates: list, per-block estimates (two-run path only)
        - note: str, description of the scaling used (single-run path only)

    Raises
    ------
    ValueError
        Propagated from ``calculate_block_throughput`` when an LLM block has
        no usable measurements in either dry run.

    Examples
    --------
    >>> dry_run = {"sample_size": 2, "execution_time_seconds": 10.0}
    >>> result = estimate_execution_time(dry_run, total_dataset_size=100)
    >>> assert result["estimated_time_seconds"] > 0
    >>>
    >>> # With two dry runs for async estimation
    >>> dry_run_1 = {"sample_size": 1, "execution_time_seconds": 5.0, "blocks_executed": [...]}
    >>> dry_run_2 = {"sample_size": 5, "execution_time_seconds": 20.0, "blocks_executed": [...]}
    >>> result = estimate_execution_time(dry_run_1, dry_run_2, total_dataset_size=1000)
    >>> assert result["estimated_time_seconds"] > 0
    """
    # Set defaults
    if max_concurrency is None:
        max_concurrency = DRY_RUN_MAX_CONCURRENT

    samples_1 = dry_run_1["sample_size"]
    if total_dataset_size is None:
        total_dataset_size = dry_run_1.get("original_dataset_size", samples_1)

    # If only one dry run, do simple linear scaling
    if dry_run_2 is None:
        blocks_executed = dry_run_1.get("blocks_executed", [])
        if not blocks_executed:
            # No block details available: scale the overall run time
            total_time = dry_run_1["execution_time_seconds"]
            simple_estimate = (total_time / samples_1) * total_dataset_size
            simple_estimate = simple_estimate * ESTIMATION_BUFFER_FACTOR
            return {
                "estimated_time_seconds": simple_estimate,
                "total_estimated_requests": 0,
                "note": "Synchronous execution - linear scaling from dry run",
            }

        # Scale each block separately and sum the results
        total_estimated_time = 0
        for block in blocks_executed:
            block_time = block.get("execution_time_seconds", 0)
            input_rows = block.get("input_rows", samples_1)

            # Blocks that received no rows give no per-row measurement
            if input_rows > 0:
                time_per_row = block_time / input_rows
                # A block may see more (or fewer) rows than there are samples
                # when upstream blocks expand or filter the data; project the
                # same ratio onto the full dataset instead of assuming one row
                # per sample.
                rows_per_sample = input_rows / samples_1 if samples_1 > 0 else 1
                total_estimated_time += (
                    time_per_row * rows_per_sample * total_dataset_size
                )

        total_estimated_time = total_estimated_time * ESTIMATION_BUFFER_FACTOR
        return {
            "estimated_time_seconds": total_estimated_time,
            "total_estimated_requests": 0,
            "note": "Synchronous execution - no concurrency",
        }

    # Two dry runs available: analyze each LLM block with the pipeline model
    samples_2 = dry_run_2["sample_size"]
    block_estimates = []
    total_time = 0
    total_requests = 0

    # Blocks are paired by position; zip stops at the shorter of the two runs
    paired_blocks = zip(
        dry_run_1.get("blocks_executed", []),
        dry_run_2.get("blocks_executed", []),
    )
    for block_1, block_2 in paired_blocks:
        # Only LLM blocks are modelled in this path
        if not is_llm_using_block(block_1):
            continue

        # Calculate throughput and amplification
        analysis = calculate_block_throughput(block_1, block_2, samples_1, samples_2)

        # Estimate requests for full dataset
        estimated_requests = total_dataset_size * analysis["amplification"]

        # Calculate time with pipeline model
        block_time = calculate_time_with_pipeline(
            estimated_requests,
            analysis["throughput"],
            analysis["startup_overhead"],
            max_concurrency,
        )

        total_time += block_time
        total_requests += estimated_requests

        block_estimates.append(
            {
                "block": block_1["block_name"],
                "estimated_requests": estimated_requests,
                "throughput": analysis["throughput"],
                "estimated_time": block_time,
                "amplification": analysis["amplification"],
                "startup_overhead": analysis["startup_overhead"],
            }
        )

    # Apply conservative buffer to account for API variability, network issues, etc.
    total_time = total_time * ESTIMATION_BUFFER_FACTOR

    return {
        "estimated_time_seconds": total_time,
        "total_estimated_requests": int(total_requests),
        "block_estimates": block_estimates,
    }
@@ -19,7 +19,7 @@ metadata:
19
19
  - "japanese"
20
20
 
21
21
  license: "Apache-2.0"
22
-
22
+
23
23
  dataset_requirements:
24
24
  required_columns:
25
25
  - "document"
@@ -54,17 +54,19 @@ blocks:
54
54
  output_cols: raw_summary_detailed
55
55
  max_tokens: 2048
56
56
  async_mode: true
57
+ # n: 2
57
58
 
58
59
  - block_type: LLMParserBlock
59
60
  block_config:
60
- block_name: extract_detailed_summary
61
+ block_name: detailed_summary
61
62
  input_cols: raw_summary_detailed
62
63
  extract_content: true
64
+ # extract_reasoning_content: true
63
65
 
64
66
  - block_type: TextParserBlock
65
67
  block_config:
66
68
  block_name: parse_detailed_summary
67
- input_cols: extract_detailed_summary_content
69
+ input_cols: detailed_summary_content
68
70
  output_cols: summary_detailed
69
71
  start_tags: [""]
70
72
  end_tags: [""]
@@ -86,14 +88,14 @@ blocks:
86
88
 
87
89
  - block_type: LLMParserBlock
88
90
  block_config:
89
- block_name: extract_atomic_facts
91
+ block_name: atomic_facts
90
92
  input_cols: raw_atomic_facts
91
93
  extract_content: true
92
94
 
93
95
  - block_type: TextParserBlock
94
96
  block_config:
95
97
  block_name: parse_atomic_facts
96
- input_cols: extract_atomic_facts_content
98
+ input_cols: atomic_facts_content
97
99
  output_cols: summary_atomic_facts
98
100
  start_tags: [""]
99
101
  end_tags: [""]
@@ -115,14 +117,14 @@ blocks:
115
117
 
116
118
  - block_type: LLMParserBlock
117
119
  block_config:
118
- block_name: extract_extractive_summary
120
+ block_name: extractive_summary
119
121
  input_cols: raw_summary_extractive
120
122
  extract_content: true
121
123
 
122
124
  - block_type: TextParserBlock
123
125
  block_config:
124
126
  block_name: parse_extractive_summary
125
- input_cols: extract_extractive_summary_content
127
+ input_cols: extractive_summary_content
126
128
  output_cols: summary_extractive
127
129
  start_tags: [""]
128
130
  end_tags: [""]
@@ -156,14 +158,14 @@ blocks:
156
158
 
157
159
  - block_type: LLMParserBlock
158
160
  block_config:
159
- block_name: extract_knowledge_generation
161
+ block_name: get_knowledge_generation
160
162
  input_cols: raw_knowledge_generation
161
163
  extract_content: true
162
164
 
163
165
  - block_type: TextParserBlock
164
166
  block_config:
165
167
  block_name: parse_knowledge_generation
166
- input_cols: extract_knowledge_generation_content
168
+ input_cols: get_knowledge_generation_content
167
169
  output_cols: [question, response]
168
170
  parsing_pattern: "\\[(?:Question|QUESTION)\\]\\s*(.*?)\\s*\\[(?:Answer|ANSWER)\\]\\s*(.*?)\\s*(?=\\[(?:Question|QUESTION)\\]|$)"
169
171
  parser_cleanup_tags: ["[END]"]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sdg_hub
3
- Version: 0.4.1
3
+ Version: 0.4.2
4
4
  Summary: Synthetic Data Generation
5
5
  Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
6
6
  License: Apache-2.0
@@ -1,5 +1,5 @@
1
1
  sdg_hub/__init__.py,sha256=TlkZT40-70urdcWLqv3kupaJj8s-SVgd2QyvlSFwb4A,510
2
- sdg_hub/_version.py,sha256=k7cu0JKra64gmMNU_UfA5sw2eNc_GRvf3QmesiYAy8g,704
2
+ sdg_hub/_version.py,sha256=A45grTqzrHuDn1CT9K5GVUbY4_Q3OSTcXAl3zdHzcEI,704
3
3
  sdg_hub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  sdg_hub/core/__init__.py,sha256=e3BoejbqjYhasf9t__L4qE52lkD9EBjx4o--2kqKdro,460
5
5
  sdg_hub/core/blocks/__init__.py,sha256=5FsbkcO-dmBv6MqO96TPn9FKKPTQZQCv20j4wR7UvQw,1502
@@ -33,20 +33,21 @@ sdg_hub/core/blocks/transform/rename_columns.py,sha256=qeB5L2utqDQnutUetH1VKZSqD
33
33
  sdg_hub/core/blocks/transform/text_concat.py,sha256=_-B__Hob1WwgwkILPIZvTnsDzuwtoX1hKviyzHlnnes,3149
34
34
  sdg_hub/core/blocks/transform/uniform_col_val_setter.py,sha256=XnjiT29z3PzIPy8M-mmE2w-Miab6Ed5ahy32SaxTCTE,3263
35
35
  sdg_hub/core/flow/__init__.py,sha256=0_m_htuZfPxk8xQ9IKfp0Pz-JRE4O7lYMUFrKyLNoLA,409
36
- sdg_hub/core/flow/base.py,sha256=IRnNEZ3laDmR4sW_MTseL4syhLuUylyHY_0tS5QaS-A,54084
36
+ sdg_hub/core/flow/base.py,sha256=4kR-dKXAlLFSwm3YWdT8EoedCIGJT56agcot3tQb6VY,59508
37
37
  sdg_hub/core/flow/checkpointer.py,sha256=stm5ZtjjEiLk9ZkAAnoQQn5Y8Yl_d7qCsQLZTrCXR48,11867
38
38
  sdg_hub/core/flow/metadata.py,sha256=cFrpJjWOaK87aCuRFyC3Pdf83oYU93mrmZEMdUnhsN8,10540
39
39
  sdg_hub/core/flow/migration.py,sha256=6and-RBqV0t2gRipr1GiOOVnyBJdtyyjw1kO08Z--d4,7558
40
40
  sdg_hub/core/flow/registry.py,sha256=N6KfX-L7QRkooznIFxDuhRZYuDA5g3N5zC-KRm2jVhk,12109
41
41
  sdg_hub/core/flow/validation.py,sha256=pUJvgaUjLpKNwvW6djcqVOF-HShOjegEmGOnUnoX4BA,9722
42
- sdg_hub/core/utils/__init__.py,sha256=C2FzLn3dHprwGJDEgI4fyFS3aoCJR-9PhHsunxropJ8,351
42
+ sdg_hub/core/utils/__init__.py,sha256=KcT56JhobC5sBg0MKEMn5hc4OyKa9_Vnn45Mt_kS4jQ,610
43
43
  sdg_hub/core/utils/datautils.py,sha256=__HkUe1DxcJVHKrFX68z_hDXwxJygBlJDfjJLnj7rHc,4230
44
44
  sdg_hub/core/utils/error_handling.py,sha256=yku8cGj_nKCyXDsnb-mHCpgukkkAMucJ4iAUrIzqysc,5510
45
45
  sdg_hub/core/utils/flow_id_words.yaml,sha256=5QHpQdP7zwahRuooyAlJIwBY7WcDR7vtbJXxVJqujbg,2317
46
46
  sdg_hub/core/utils/flow_identifier.py,sha256=aAHfK_G9AwEtMglLRMdMpi_AI1dciub5UqBGm4yb2HE,2841
47
- sdg_hub/core/utils/flow_metrics.py,sha256=VOdreUzP0kPgnkPjuQk87tZsK5f1u6XGEPM8ugCt0CY,8824
47
+ sdg_hub/core/utils/flow_metrics.py,sha256=3G-xbfr-rFA578wV4KUbQePTMVGZHr9-rXvyYL4Kt2Q,12604
48
48
  sdg_hub/core/utils/logger_config.py,sha256=6_cnsIHtSAdq1iTTZ7Q7nAJ1dmldlxSZ0AB49yLiQ20,2034
49
49
  sdg_hub/core/utils/path_resolution.py,sha256=yWof4kGNpQ5dKcrVHg0h9KfOKLZ6ROjdfsLAZsQT5rM,2000
50
+ sdg_hub/core/utils/time_estimator.py,sha256=rM3_R-Ka5DEtvOtlJoA_5pXSyQ6tT6t4h6qh3_5BCZo,12639
50
51
  sdg_hub/core/utils/yaml_utils.py,sha256=tShCd-FFkp0xlKnLe7dXsMOR4AvT9d2qRUmu4ZnPSEY,1458
51
52
  sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
52
53
  sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_answers.yaml,sha256=THRT3cY44KGI_69B2wqt2Q89EknnOSE7B4A_jdnxlIU,330
@@ -78,7 +79,7 @@ sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/j
78
79
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/atomic_facts_ja.yaml,sha256=OjPZaSCOSLxEWgW3pmNwF7mmLhGhFGTmKL_3rKdqeW4,2488
79
80
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/detailed_summary_ja.yaml,sha256=nEy_RcotHGiiENrmUANpKkbIFsrARAeSwECrBeHi2so,391
80
81
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/extractive_summary_ja.yaml,sha256=V90W0IeJQZTFThA8v0UOs3DtZbtU3BI9jkpChw1BULo,402
81
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml,sha256=iY1N6CY97fEkqI5oqaamSfqmiXpHPhWH_aOppsMxVjY,9176
82
+ sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml,sha256=jumjKmKshSd8hoTYpyBJ0nMOADeQmxBmNPY7yfa_xQ8,9171
82
83
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/generate_questions_responses_ja.yaml,sha256=96SQqXG7fmb-50SdX85sgVtrFcQ-oNKe_0BoQdZmY5g,2638
83
84
  sdg_hub/flows/text_analysis/__init__.py,sha256=WStks4eM_KHNTVsHglcj8vFghmI0PH9P1hUrijBLbwc,125
84
85
  sdg_hub/flows/text_analysis/structured_insights/__init__.py,sha256=_DT4NR05JD9CZoSWROPr2lC6se0VjSqQPZJJlEV79mk,274
@@ -87,8 +88,8 @@ sdg_hub/flows/text_analysis/structured_insights/extract_entities.yaml,sha256=Q_S
87
88
  sdg_hub/flows/text_analysis/structured_insights/extract_keywords.yaml,sha256=_nPPMdHnxag_lYbhYUjGJGo-CvRwWvwdGX7cQhdZ1S0,847
88
89
  sdg_hub/flows/text_analysis/structured_insights/flow.yaml,sha256=BBV18SdvuVTAESjwkJ7V1jbb-cSTBvNl3SCycd0oEQ4,4934
89
90
  sdg_hub/flows/text_analysis/structured_insights/summarize.yaml,sha256=WXwQak1pF8e1OwnOoI1EHu8QB6iUNW89rfkTdi1Oq54,687
90
- sdg_hub-0.4.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
91
- sdg_hub-0.4.1.dist-info/METADATA,sha256=pLRs5oOsVI9515UEZxcUEZFZhCoZ0kli0KLpBPPPB7w,9783
92
- sdg_hub-0.4.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
93
- sdg_hub-0.4.1.dist-info/top_level.txt,sha256=TqI7d-HE1n6zkXFkU0nF3A1Ct0P0pBaqI675uFokhx4,8
94
- sdg_hub-0.4.1.dist-info/RECORD,,
91
+ sdg_hub-0.4.2.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
92
+ sdg_hub-0.4.2.dist-info/METADATA,sha256=5qbw9_DoVmfntmQlvz4VPdQXdUXoLO8Zhrxbc1uY7b0,9783
93
+ sdg_hub-0.4.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
94
+ sdg_hub-0.4.2.dist-info/top_level.txt,sha256=TqI7d-HE1n6zkXFkU0nF3A1Ct0P0pBaqI675uFokhx4,8
95
+ sdg_hub-0.4.2.dist-info/RECORD,,