sdg-hub 0.4.1__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. sdg_hub/_version.py +2 -2
  2. sdg_hub/core/blocks/__init__.py +0 -22
  3. sdg_hub/core/blocks/transform/rename_columns.py +19 -0
  4. sdg_hub/core/flow/base.py +146 -81
  5. sdg_hub/core/utils/__init__.py +11 -3
  6. sdg_hub/core/utils/flow_metrics.py +116 -0
  7. sdg_hub/core/utils/time_estimator.py +344 -0
  8. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml +5 -1
  9. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml +5 -1
  10. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml +5 -1
  11. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +6 -1
  12. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml +16 -10
  13. {sdg_hub-0.4.1.dist-info → sdg_hub-0.5.0.dist-info}/METADATA +2 -2
  14. {sdg_hub-0.4.1.dist-info → sdg_hub-0.5.0.dist-info}/RECORD +17 -27
  15. sdg_hub/core/blocks/deprecated_blocks/__init__.py +0 -29
  16. sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +0 -93
  17. sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +0 -88
  18. sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +0 -103
  19. sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +0 -94
  20. sdg_hub/core/blocks/deprecated_blocks/llmblock.py +0 -479
  21. sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +0 -88
  22. sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +0 -58
  23. sdg_hub/core/blocks/deprecated_blocks/selector.py +0 -97
  24. sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +0 -88
  25. sdg_hub/core/flow/migration.py +0 -198
  26. {sdg_hub-0.4.1.dist-info → sdg_hub-0.5.0.dist-info}/WHEEL +0 -0
  27. {sdg_hub-0.4.1.dist-info → sdg_hub-0.5.0.dist-info}/licenses/LICENSE +0 -0
  28. {sdg_hub-0.4.1.dist-info → sdg_hub-0.5.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,344 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """Time estimation utility for predicting full dataset execution time from dry_run results."""
3
+
4
+ # Standard
5
+ from typing import Dict, Optional
6
+ import math
7
+
8
+ # Default max concurrent requests used during dry runs
9
+ DRY_RUN_MAX_CONCURRENT = 100
10
+
11
+ # Conservative estimation factor (20% buffer for API variability, network latency, etc.)
12
+ ESTIMATION_BUFFER_FACTOR = 1.2
13
+
14
+
15
def is_llm_using_block(block_info: Dict) -> bool:
    """Return True when a block appears to make LLM API calls.

    A block counts as LLM-using either because its type name marks it as a
    chat/evaluation/verification block, or because its recorded parameters
    include model-connection settings. Used to compute request amplification
    for LLM blocks.

    Parameters
    ----------
    block_info : Dict
        Block information from dry_run results containing block_type and
        parameters_used.

    Returns
    -------
    bool
        True if the block uses LLMs, False otherwise.

    Examples
    --------
    >>> block = {"block_type": "LLMChatBlock", "parameters_used": {"model": "gpt-4"}}
    >>> is_llm_using_block(block)
    True
    """
    type_name = block_info.get("block_type", "")

    # Type-name heuristic: direct chat blocks plus evaluation/verification blocks.
    for marker in ("LLMChatBlock", "Evaluate", "Verify"):
        if marker in type_name:
            return True

    # Parameter heuristic: presence of model-connection settings.
    used_params = block_info.get("parameters_used", {})
    return any(key in used_params for key in ("model", "api_base", "api_key"))
49
+
50
+
51
def calculate_block_throughput(
    block_1: Dict, block_2: Dict, samples_1: int, samples_2: int
) -> Dict:
    """Derive throughput, amplification, and startup overhead from two dry runs.

    Fits the linear model ``time = startup_overhead + requests / throughput``
    to the two observed (requests, time) points when the second run is strictly
    larger and slower; otherwise falls back to the better of the two naive
    per-run rates with a small assumed overhead.

    Parameters
    ----------
    block_1 : Dict
        Block execution info from first dry run.
    block_2 : Dict
        Block execution info from second dry run.
    samples_1 : int
        Number of samples in first dry run.
    samples_2 : int
        Number of samples in second dry run.

    Returns
    -------
    Dict
        Dictionary containing:
        - throughput: float, requests per second
        - amplification: float, average requests per input sample
        - startup_overhead: float, fixed startup time in seconds

    Raises
    ------
    ValueError
        If throughput cannot be calculated due to invalid measurements.

    Examples
    --------
    >>> block1 = {"execution_time_seconds": 1.0, "input_rows": 1, "block_name": "test"}
    >>> block2 = {"execution_time_seconds": 2.0, "input_rows": 5, "block_name": "test"}
    >>> result = calculate_block_throughput(block1, block2, 1, 5)
    >>> assert result["throughput"] > 0
    """
    t1 = block_1.get("execution_time_seconds", 0)
    t2 = block_2.get("execution_time_seconds", 0)
    r1 = block_1.get("input_rows", 0)
    r2 = block_2.get("input_rows", 0)

    # Requests issued per input sample, averaged across both runs.
    per_sample_1 = r1 / samples_1 if samples_1 > 0 else 1
    per_sample_2 = r2 / samples_2 if samples_2 > 0 else 1
    amplification = (per_sample_1 + per_sample_2) / 2

    if r2 > r1 and t2 > t1:
        # Slope of the line through the two points = marginal seconds per request.
        seconds_per_request = (t2 - t1) / (r2 - r1)

        # Throughput is the inverse of the marginal cost.
        throughput = 1.0 / seconds_per_request if seconds_per_request > 0 else 0

        # Y-intercept of the fitted line = fixed startup cost (never negative).
        overhead = max(0, t1 - (r1 * seconds_per_request))
    else:
        # Degenerate scaling data: take the better of the two naive rates and
        # assume 10% of the first run as overhead, capped at 2 seconds.
        rate_1 = r1 / t1 if t1 > 0 else 0
        rate_2 = r2 / t2 if t2 > 0 else 0
        throughput = max(rate_1, rate_2)
        overhead = min(2.0, t1 * 0.1)

    # Without a single valid rate there is nothing to extrapolate from.
    if throughput == 0:
        raise ValueError(
            f"Cannot calculate throughput for block '{block_1.get('block_name', 'unknown')}': "
            f"No valid measurements from dry runs (time_1={t1}, time_2={t2}, "
            f"requests_1={r1}, requests_2={r2})"
        )

    return {
        "throughput": throughput,
        "amplification": amplification,
        "startup_overhead": overhead,
    }
134
+
135
+
136
def calculate_time_with_pipeline(
    num_requests: float,
    throughput: float,
    startup_overhead: float,
    max_concurrent: Optional[int] = None,
) -> float:
    """Calculate time considering pipeline behavior and max concurrent limit.

    Models the execution time for a given number of requests based on throughput,
    startup overhead, and concurrency constraints. Applies non-linear scaling
    for diminishing returns at high concurrency levels.

    Parameters
    ----------
    num_requests : float
        Total number of requests to process.
    throughput : float
        Base throughput in requests per second.
    startup_overhead : float
        Fixed startup time overhead in seconds.
    max_concurrent : Optional[int], optional
        Maximum number of concurrent requests. When None, defaults to
        DRY_RUN_MAX_CONCURRENT (100).

    Returns
    -------
    float
        Estimated total execution time in seconds.

    Examples
    --------
    >>> time = calculate_time_with_pipeline(1000, 10.0, 0.5, 50)
    >>> assert time > 0
    """
    if num_requests <= 0:
        return 0

    # Resolve the default lazily (None means "use the standard dry-run limit",
    # consistent with estimate_execution_time's max_concurrency handling).
    if max_concurrent is None:
        max_concurrent = DRY_RUN_MAX_CONCURRENT

    # Clamp invalid values to sequential execution to avoid division by zero.
    if max_concurrent <= 0:
        max_concurrent = 1

    # The throughput is what we measured - it represents the server's processing capability
    if max_concurrent == 1:
        # Sequential execution - no pipelining benefit
        effective_throughput = throughput
    else:
        # Concurrent execution - small pipelining benefit.
        # At most 10% improvement from perfect pipelining (conservative estimate);
        # logarithmic growth models diminishing returns at high concurrency.
        pipelining_factor = 1.0 + (0.1 * math.log(max_concurrent) / math.log(100))
        pipelining_factor = min(pipelining_factor, 1.1)  # Cap at 10% improvement
        effective_throughput = throughput * pipelining_factor

    # Fixed startup cost plus the pipelined processing time.
    return startup_overhead + (num_requests / effective_throughput)
192
+
193
+
194
def _estimate_sync(dry_run_1: Dict, samples_1: int, total_dataset_size: int) -> Dict:
    """Linear-scaling estimate for synchronous execution (single dry run).

    Scales each block's observed per-row time to the full dataset when
    per-block details are available; otherwise scales the total wall time.
    A conservative buffer (ESTIMATION_BUFFER_FACTOR) is applied either way.
    """
    blocks_executed = dry_run_1.get("blocks_executed", [])

    if not blocks_executed:
        # No per-block details: scale the overall wall time by dataset size.
        total_time = dry_run_1["execution_time_seconds"]
        simple_estimate = (total_time / samples_1) * total_dataset_size
        return {
            "estimated_time_seconds": simple_estimate * ESTIMATION_BUFFER_FACTOR,
            "total_estimated_requests": 0,
            "note": "Synchronous execution - linear scaling from dry run",
        }

    # Scale each block's per-row time to the full dataset and sum.
    total_estimated_time = 0
    for block in blocks_executed:
        block_time = block.get("execution_time_seconds", 0)
        input_rows = block.get("input_rows", samples_1)
        if input_rows > 0:
            total_estimated_time += (block_time / input_rows) * total_dataset_size

    return {
        "estimated_time_seconds": total_estimated_time * ESTIMATION_BUFFER_FACTOR,
        "total_estimated_requests": 0,
        "note": "Synchronous execution - no concurrency",
    }


def estimate_execution_time(
    dry_run_1: Dict,
    dry_run_2: Optional[Dict] = None,
    total_dataset_size: Optional[int] = None,
    max_concurrency: Optional[int] = None,
) -> Dict:
    """Estimate execution time based on dry run results.

    Estimates the total execution time for a full dataset based on one or two
    dry runs with smaller sample sizes. For async blocks (with two dry runs),
    calculates throughput and concurrency benefits. For sync blocks (single dry run),
    performs simple linear scaling.

    The estimates include a conservative buffer (20%) to account for API variability,
    network latency, and other real-world factors.

    Parameters
    ----------
    dry_run_1 : Dict
        Results from first dry run, must contain 'sample_size' and 'execution_time_seconds'.
    dry_run_2 : Optional[Dict], optional
        Results from second dry run for async estimation, by default None.
    total_dataset_size : Optional[int], optional
        Size of full dataset to estimate for. If None, uses original_dataset_size from dry_run_1.
    max_concurrency : Optional[int], optional
        Maximum concurrent requests allowed, by default 100.

    Returns
    -------
    Dict
        Estimation results containing:
        - estimated_time_seconds: float, estimated time with current configuration (includes buffer)
        - total_estimated_requests: int, total LLM requests (0 for sync blocks)
        - block_estimates: list, per-block estimates (for async blocks only)
        - note: str, additional information about the estimation

    Examples
    --------
    >>> dry_run = {"sample_size": 2, "execution_time_seconds": 10.0}
    >>> result = estimate_execution_time(dry_run, total_dataset_size=100)
    >>> assert result["estimated_time_seconds"] > 0
    >>>
    >>> # With two dry runs for async estimation
    >>> dry_run_1 = {"sample_size": 1, "execution_time_seconds": 5.0, "blocks_executed": [...]}
    >>> dry_run_2 = {"sample_size": 5, "execution_time_seconds": 20.0, "blocks_executed": [...]}
    >>> result = estimate_execution_time(dry_run_1, dry_run_2, total_dataset_size=1000)
    >>> assert result["estimated_time_seconds"] > 0
    """
    # Set defaults
    if max_concurrency is None:
        max_concurrency = DRY_RUN_MAX_CONCURRENT

    if total_dataset_size is None:
        total_dataset_size = dry_run_1.get(
            "original_dataset_size", dry_run_1["sample_size"]
        )

    samples_1 = dry_run_1["sample_size"]

    # Single dry run: synchronous linear scaling.
    if dry_run_2 is None:
        return _estimate_sync(dry_run_1, samples_1, total_dataset_size)

    samples_2 = dry_run_2["sample_size"]

    # Two dry runs: per-block throughput model for async execution.
    block_estimates = []
    total_time = 0
    total_requests = 0

    # Pair up blocks by position; ignore any trailing blocks present in only one run.
    for i, block_1 in enumerate(dry_run_1.get("blocks_executed", [])):
        if i >= len(dry_run_2.get("blocks_executed", [])):
            break

        block_2 = dry_run_2["blocks_executed"][i]

        # Only LLM blocks contribute requests; non-LLM blocks are negligible here.
        if not is_llm_using_block(block_1):
            continue

        # Calculate throughput and amplification from the two measurements.
        analysis = calculate_block_throughput(block_1, block_2, samples_1, samples_2)

        # Estimate requests for full dataset
        estimated_requests = total_dataset_size * analysis["amplification"]

        # Calculate time with pipeline model
        block_time = calculate_time_with_pipeline(
            estimated_requests,
            analysis["throughput"],
            analysis["startup_overhead"],
            max_concurrency,
        )

        total_time += block_time
        total_requests += estimated_requests

        block_estimates.append(
            {
                "block": block_1["block_name"],
                "estimated_requests": estimated_requests,
                "throughput": analysis["throughput"],
                "estimated_time": block_time,
                "amplification": analysis["amplification"],
                "startup_overhead": analysis["startup_overhead"],
            }
        )

    # Apply conservative buffer to account for API variability, network issues, etc.
    total_time = total_time * ESTIMATION_BUFFER_FACTOR

    return {
        "estimated_time_seconds": total_time,
        "total_estimated_requests": int(total_requests),
        "block_estimates": block_estimates,
        "note": "Async execution - throughput model from two dry runs",
    }
@@ -77,9 +77,13 @@ blocks:
77
77
  - ''
78
78
  - block_type: RenameColumnsBlock
79
79
  block_config:
80
- block_name: rename_to_document_column
80
+ block_name: rename_to_raw_document_column
81
81
  input_cols:
82
82
  document: raw_document
83
+ - block_type: RenameColumnsBlock
84
+ block_config:
85
+ block_name: rename_to_document_column
86
+ input_cols:
83
87
  summary: document
84
88
  - block_type: PromptBuilderBlock
85
89
  block_config:
@@ -79,9 +79,13 @@ blocks:
79
79
  - ''
80
80
  - block_type: RenameColumnsBlock
81
81
  block_config:
82
- block_name: rename_to_document_column
82
+ block_name: rename_to_raw_document_column
83
83
  input_cols:
84
84
  document: raw_document
85
+ - block_type: RenameColumnsBlock
86
+ block_config:
87
+ block_name: rename_to_document_column
88
+ input_cols:
85
89
  summary: document
86
90
  - block_type: PromptBuilderBlock
87
91
  block_config:
@@ -72,9 +72,13 @@ blocks:
72
72
  parsing_pattern: '(?:^|\n)\s*\d+\.\s+(.*?)(?=\n\s*\d+\.\s+|\Z)'
73
73
  - block_type: RenameColumnsBlock
74
74
  block_config:
75
- block_name: rename_to_document_column
75
+ block_name: rename_to_raw_document_column
76
76
  input_cols:
77
77
  document: raw_document
78
+ - block_type: RenameColumnsBlock
79
+ block_config:
80
+ block_name: rename_to_document_column
81
+ input_cols:
78
82
  atomic_facts: document
79
83
  - block_type: PromptBuilderBlock
80
84
  block_config:
@@ -134,10 +134,15 @@ blocks:
134
134
  input_cols: [summary_detailed, summary_extractive, summary_atomic_facts, base_document]
135
135
  output_cols: [summary, dataset_type]
136
136
 
137
+ - block_type: RenameColumnsBlock
138
+ block_config:
139
+ block_name: rename_to_raw_document_column
140
+ input_cols: {document: raw_document}
141
+
137
142
  - block_type: RenameColumnsBlock
138
143
  block_config:
139
144
  block_name: rename_to_document_column
140
- input_cols: {document: raw_document, summary: document}
145
+ input_cols: {summary: document}
141
146
 
142
147
  - block_type: PromptBuilderBlock
143
148
  block_config:
@@ -19,7 +19,7 @@ metadata:
19
19
  - "japanese"
20
20
 
21
21
  license: "Apache-2.0"
22
-
22
+
23
23
  dataset_requirements:
24
24
  required_columns:
25
25
  - "document"
@@ -54,17 +54,19 @@ blocks:
54
54
  output_cols: raw_summary_detailed
55
55
  max_tokens: 2048
56
56
  async_mode: true
57
+ # n: 2
57
58
 
58
59
  - block_type: LLMParserBlock
59
60
  block_config:
60
- block_name: extract_detailed_summary
61
+ block_name: detailed_summary
61
62
  input_cols: raw_summary_detailed
62
63
  extract_content: true
64
+ # extract_reasoning_content: true
63
65
 
64
66
  - block_type: TextParserBlock
65
67
  block_config:
66
68
  block_name: parse_detailed_summary
67
- input_cols: extract_detailed_summary_content
69
+ input_cols: detailed_summary_content
68
70
  output_cols: summary_detailed
69
71
  start_tags: [""]
70
72
  end_tags: [""]
@@ -86,14 +88,14 @@ blocks:
86
88
 
87
89
  - block_type: LLMParserBlock
88
90
  block_config:
89
- block_name: extract_atomic_facts
91
+ block_name: atomic_facts
90
92
  input_cols: raw_atomic_facts
91
93
  extract_content: true
92
94
 
93
95
  - block_type: TextParserBlock
94
96
  block_config:
95
97
  block_name: parse_atomic_facts
96
- input_cols: extract_atomic_facts_content
98
+ input_cols: atomic_facts_content
97
99
  output_cols: summary_atomic_facts
98
100
  start_tags: [""]
99
101
  end_tags: [""]
@@ -115,14 +117,14 @@ blocks:
115
117
 
116
118
  - block_type: LLMParserBlock
117
119
  block_config:
118
- block_name: extract_extractive_summary
120
+ block_name: extractive_summary
119
121
  input_cols: raw_summary_extractive
120
122
  extract_content: true
121
123
 
122
124
  - block_type: TextParserBlock
123
125
  block_config:
124
126
  block_name: parse_extractive_summary
125
- input_cols: extract_extractive_summary_content
127
+ input_cols: extractive_summary_content
126
128
  output_cols: summary_extractive
127
129
  start_tags: [""]
128
130
  end_tags: [""]
@@ -133,10 +135,14 @@ blocks:
133
135
  input_cols: [summary_detailed, summary_extractive, summary_atomic_facts, base_document]
134
136
  output_cols: [summary, dataset_type]
135
137
 
138
+ - block_type: RenameColumnsBlock
139
+ block_config:
140
+ block_name: rename_to_raw_document_column
141
+ input_cols: {document: raw_document}
136
142
  - block_type: RenameColumnsBlock
137
143
  block_config:
138
144
  block_name: rename_to_document_column
139
- input_cols: {document: raw_document, summary: document}
145
+ input_cols: {summary: document}
140
146
 
141
147
  - block_type: PromptBuilderBlock
142
148
  block_config:
@@ -156,14 +162,14 @@ blocks:
156
162
 
157
163
  - block_type: LLMParserBlock
158
164
  block_config:
159
- block_name: extract_knowledge_generation
165
+ block_name: get_knowledge_generation
160
166
  input_cols: raw_knowledge_generation
161
167
  extract_content: true
162
168
 
163
169
  - block_type: TextParserBlock
164
170
  block_config:
165
171
  block_name: parse_knowledge_generation
166
- input_cols: extract_knowledge_generation_content
172
+ input_cols: get_knowledge_generation_content
167
173
  output_cols: [question, response]
168
174
  parsing_pattern: "\\[(?:Question|QUESTION)\\]\\s*(.*?)\\s*\\[(?:Answer|ANSWER)\\]\\s*(.*?)\\s*(?=\\[(?:Question|QUESTION)\\]|$)"
169
175
  parser_cleanup_tags: ["[END]"]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sdg_hub
3
- Version: 0.4.1
3
+ Version: 0.5.0
4
4
  Summary: Synthetic Data Generation
5
5
  Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
6
6
  License: Apache-2.0
@@ -23,7 +23,7 @@ Requires-Python: >=3.10
23
23
  Description-Content-Type: text/markdown
24
24
  License-File: LICENSE
25
25
  Requires-Dist: click<9.0.0,>=8.1.7
26
- Requires-Dist: datasets<4.0.0,>=2.18.0
26
+ Requires-Dist: datasets>=4.0.0
27
27
  Requires-Dist: httpx<1.0.0,>=0.25.0
28
28
  Requires-Dist: jinja2
29
29
  Requires-Dist: litellm<1.75.0,>=1.73.0
@@ -1,20 +1,10 @@
1
1
  sdg_hub/__init__.py,sha256=TlkZT40-70urdcWLqv3kupaJj8s-SVgd2QyvlSFwb4A,510
2
- sdg_hub/_version.py,sha256=k7cu0JKra64gmMNU_UfA5sw2eNc_GRvf3QmesiYAy8g,704
2
+ sdg_hub/_version.py,sha256=fvHpBU3KZKRinkriKdtAt3crenOyysELF-M9y3ozg3U,704
3
3
  sdg_hub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  sdg_hub/core/__init__.py,sha256=e3BoejbqjYhasf9t__L4qE52lkD9EBjx4o--2kqKdro,460
5
- sdg_hub/core/blocks/__init__.py,sha256=5FsbkcO-dmBv6MqO96TPn9FKKPTQZQCv20j4wR7UvQw,1502
5
+ sdg_hub/core/blocks/__init__.py,sha256=8Rn1SglH8V3jGmTD_cG-h7qk9ktAab2eaBdyk7RN_hY,865
6
6
  sdg_hub/core/blocks/base.py,sha256=-SOdBpJwtRTMsrmCEuLjUBQMRCo_PLYlHEBRrz8sF9g,13031
7
7
  sdg_hub/core/blocks/registry.py,sha256=FuEN_pnq-nSH1LguY3_oCubT6Kz3SuJjk3TcUpLT-lw,10695
8
- sdg_hub/core/blocks/deprecated_blocks/__init__.py,sha256=RDu3MWFStDQko-TKkx8tGoB1UTatP_RSldZK43zHDvY,889
9
- sdg_hub/core/blocks/deprecated_blocks/combine_columns.py,sha256=HCvpaYsAwgx1Dm0vIshcWsKoVsRT0KrmKp9j4oqtByc,2757
10
- sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py,sha256=maCaaEs0EMMzt7L1xm7fAH3ylaFMHEkeC_dtOw3FrjU,2694
11
- sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py,sha256=-fuuMKj2g2MrijMBTd0PWtYBbf9anQ2UkYXHigCxxJI,3328
12
- sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py,sha256=IenCskrPEv09h2uT6aZKCQzaxgA_3kAzOeJSd-R_-EA,2839
13
- sdg_hub/core/blocks/deprecated_blocks/llmblock.py,sha256=34lzC43BODpMk5AwlWA1ctdYPmN7cA6WL5vMXaI0P0Y,20385
14
- sdg_hub/core/blocks/deprecated_blocks/rename_columns.py,sha256=thp-mHtkRmUw_nYKpldy_mLWR2AvC5YUhbqDETM6-T0,2620
15
- sdg_hub/core/blocks/deprecated_blocks/sample_populator.py,sha256=UdueMApxOmPWaxxMrw7b1v74fKJBfqqRATEBqgmVtNw,1737
16
- sdg_hub/core/blocks/deprecated_blocks/selector.py,sha256=nWecsVsW8DvBcqAF_LOqXmW-5MQ28uN3d1y6wkSy38c,2960
17
- sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py,sha256=44TQu-rK5isia-otMVB1zHd8D-wWmu3C8CI1NLtfY5s,2729
18
8
  sdg_hub/core/blocks/filtering/__init__.py,sha256=isxSVSvDqkMjG8dQSl3Q2M4g5c1t9fTjBSA21icf-yA,275
19
9
  sdg_hub/core/blocks/filtering/column_value_filter.py,sha256=2Z9j_CiiTn5mHZ9gfXU-itLXDmeXSh0UI0x1x7j-LQ0,6001
20
10
  sdg_hub/core/blocks/llm/__init__.py,sha256=AyS0dd3pkPPXH5a9aj4mT5HsKjX2vjXfkmQc6rkFV4A,795
@@ -29,24 +19,24 @@ sdg_hub/core/blocks/transform/duplicate_columns.py,sha256=SaP7rIF4ZFEFFa50aU2xGN
29
19
  sdg_hub/core/blocks/transform/index_based_mapper.py,sha256=XC_a7Skbd3mu7f4ra8fGWPxMwqUMSjJkQ7Ag7vflwJA,8235
30
20
  sdg_hub/core/blocks/transform/json_structure_block.py,sha256=hm-0M0NAyUREgJRPyV1u-laorgX6MZ1o17E9rNBhN78,5010
31
21
  sdg_hub/core/blocks/transform/melt_columns.py,sha256=vaYa5Taq6GhNZYWFL4uPK3-SfN2BsKEm-wvjd2EYYoI,4382
32
- sdg_hub/core/blocks/transform/rename_columns.py,sha256=qeB5L2utqDQnutUetH1VKZSqDiJSH_yUp5EFCV-XCVI,1998
22
+ sdg_hub/core/blocks/transform/rename_columns.py,sha256=W2hcDSJY6L73ZpElUhOML2sGLM9Y-v0gSo3xEF1LXDc,2749
33
23
  sdg_hub/core/blocks/transform/text_concat.py,sha256=_-B__Hob1WwgwkILPIZvTnsDzuwtoX1hKviyzHlnnes,3149
34
24
  sdg_hub/core/blocks/transform/uniform_col_val_setter.py,sha256=XnjiT29z3PzIPy8M-mmE2w-Miab6Ed5ahy32SaxTCTE,3263
35
25
  sdg_hub/core/flow/__init__.py,sha256=0_m_htuZfPxk8xQ9IKfp0Pz-JRE4O7lYMUFrKyLNoLA,409
36
- sdg_hub/core/flow/base.py,sha256=IRnNEZ3laDmR4sW_MTseL4syhLuUylyHY_0tS5QaS-A,54084
26
+ sdg_hub/core/flow/base.py,sha256=64YJJujNRaSIbT1YKn9nAxij_hdJ9xRVH_uiUY1IUcI,55788
37
27
  sdg_hub/core/flow/checkpointer.py,sha256=stm5ZtjjEiLk9ZkAAnoQQn5Y8Yl_d7qCsQLZTrCXR48,11867
38
28
  sdg_hub/core/flow/metadata.py,sha256=cFrpJjWOaK87aCuRFyC3Pdf83oYU93mrmZEMdUnhsN8,10540
39
- sdg_hub/core/flow/migration.py,sha256=6and-RBqV0t2gRipr1GiOOVnyBJdtyyjw1kO08Z--d4,7558
40
29
  sdg_hub/core/flow/registry.py,sha256=N6KfX-L7QRkooznIFxDuhRZYuDA5g3N5zC-KRm2jVhk,12109
41
30
  sdg_hub/core/flow/validation.py,sha256=pUJvgaUjLpKNwvW6djcqVOF-HShOjegEmGOnUnoX4BA,9722
42
- sdg_hub/core/utils/__init__.py,sha256=C2FzLn3dHprwGJDEgI4fyFS3aoCJR-9PhHsunxropJ8,351
31
+ sdg_hub/core/utils/__init__.py,sha256=KcT56JhobC5sBg0MKEMn5hc4OyKa9_Vnn45Mt_kS4jQ,610
43
32
  sdg_hub/core/utils/datautils.py,sha256=__HkUe1DxcJVHKrFX68z_hDXwxJygBlJDfjJLnj7rHc,4230
44
33
  sdg_hub/core/utils/error_handling.py,sha256=yku8cGj_nKCyXDsnb-mHCpgukkkAMucJ4iAUrIzqysc,5510
45
34
  sdg_hub/core/utils/flow_id_words.yaml,sha256=5QHpQdP7zwahRuooyAlJIwBY7WcDR7vtbJXxVJqujbg,2317
46
35
  sdg_hub/core/utils/flow_identifier.py,sha256=aAHfK_G9AwEtMglLRMdMpi_AI1dciub5UqBGm4yb2HE,2841
47
- sdg_hub/core/utils/flow_metrics.py,sha256=VOdreUzP0kPgnkPjuQk87tZsK5f1u6XGEPM8ugCt0CY,8824
36
+ sdg_hub/core/utils/flow_metrics.py,sha256=3G-xbfr-rFA578wV4KUbQePTMVGZHr9-rXvyYL4Kt2Q,12604
48
37
  sdg_hub/core/utils/logger_config.py,sha256=6_cnsIHtSAdq1iTTZ7Q7nAJ1dmldlxSZ0AB49yLiQ20,2034
49
38
  sdg_hub/core/utils/path_resolution.py,sha256=yWof4kGNpQ5dKcrVHg0h9KfOKLZ6ROjdfsLAZsQT5rM,2000
39
+ sdg_hub/core/utils/time_estimator.py,sha256=rM3_R-Ka5DEtvOtlJoA_5pXSyQ6tT6t4h6qh3_5BCZo,12639
50
40
  sdg_hub/core/utils/yaml_utils.py,sha256=tShCd-FFkp0xlKnLe7dXsMOR4AvT9d2qRUmu4ZnPSEY,1458
51
41
  sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
52
42
  sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_answers.yaml,sha256=THRT3cY44KGI_69B2wqt2Q89EknnOSE7B4A_jdnxlIU,330
@@ -54,14 +44,14 @@ sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/gener
54
44
  sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_question_list.yaml,sha256=qHOgUNrQz2vjUjJiEHNGWxDDXwjJlP1kofTxeGgLyPI,1461
55
45
  sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
56
46
  sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/detailed_summary.yaml,sha256=Ik6gAml0O-jPq8jpXBAkURzYkQuFOnDZb4LDwjmfAiE,381
57
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml,sha256=fUdzY9dtU69o99Uq8FIPycgVWdLD-1kbY97Bh-Vo2A0,5538
47
+ sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml,sha256=cxNpPh60mcvzxfczMH8hw66Ql3S8O-cWCCDeauO736c,5649
58
48
  sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
59
49
  sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/flow.yaml,sha256=smPWVUZRCt58EagWDmJVmTBQj8qMcjpzh-Q3GSuFrz0,4413
60
50
  sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
61
51
  sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/extractive_summary.yaml,sha256=SeapWoOx3fhN5SvWYuHss_9prLE8xSkOic7JkbDHSR0,4081
62
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml,sha256=iNNIfofFE7awK7iivtIFWxjfjy8QviMugOPPnOTySKA,5706
52
+ sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml,sha256=7dVc0_g7Ex5SfdX57pqtk9gmH_lC6Cdm3HC-lg8OiXQ,5817
63
53
  sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
64
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml,sha256=CIUZNYhvszT-jpz1Hvh6nS2y5W34P529ZOMp8thEQ9k,3219
54
+ sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml,sha256=7X4N19TcyHUo7pNo3C6Zv3w6br7hjzEfgv06XUVDaQo,3330
65
55
  sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/key_facts_summary.yaml,sha256=YKMX_CuvcThG_bdNCAIXdVBkMvB72I89RGq2ltSSgc8,3298
66
56
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
67
57
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -71,14 +61,14 @@ sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/ev
71
61
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml,sha256=zwzklXup6khRkR88avgrJTcjaMcV1wnbeYaML5oPuNs,1767
72
62
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml,sha256=cA8igo7jMrRXaWW6k0of6KOp7YnxLtPj0fP4DbrmZNQ,3647
73
63
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml,sha256=fcMV7LaCFZo4D29nwhGJXqFFuZMYVLo9XYjv8zcU6zs,364
74
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml,sha256=HR8sf7RUZKr8UqKztBj-nlvyrve1UMUu8x8qgYM6O14,9055
64
+ sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml,sha256=km0ggcmFsZJGc2TfyYLkzPTrHGmcOB-jBAHInqySisk,9176
75
65
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml,sha256=yX8aLY8dJSDML9ZJhnj9RzPbN8tH2xfcM4Gc6xZuwqQ,2596
76
66
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
77
67
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
78
68
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/atomic_facts_ja.yaml,sha256=OjPZaSCOSLxEWgW3pmNwF7mmLhGhFGTmKL_3rKdqeW4,2488
79
69
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/detailed_summary_ja.yaml,sha256=nEy_RcotHGiiENrmUANpKkbIFsrARAeSwECrBeHi2so,391
80
70
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/extractive_summary_ja.yaml,sha256=V90W0IeJQZTFThA8v0UOs3DtZbtU3BI9jkpChw1BULo,402
81
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml,sha256=iY1N6CY97fEkqI5oqaamSfqmiXpHPhWH_aOppsMxVjY,9176
71
+ sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml,sha256=Q6RusV-_HHMr5jlFNOP6UVuEf8d6btHENMOP3MnB3u0,9291
82
72
  sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/generate_questions_responses_ja.yaml,sha256=96SQqXG7fmb-50SdX85sgVtrFcQ-oNKe_0BoQdZmY5g,2638
83
73
  sdg_hub/flows/text_analysis/__init__.py,sha256=WStks4eM_KHNTVsHglcj8vFghmI0PH9P1hUrijBLbwc,125
84
74
  sdg_hub/flows/text_analysis/structured_insights/__init__.py,sha256=_DT4NR05JD9CZoSWROPr2lC6se0VjSqQPZJJlEV79mk,274
@@ -87,8 +77,8 @@ sdg_hub/flows/text_analysis/structured_insights/extract_entities.yaml,sha256=Q_S
87
77
  sdg_hub/flows/text_analysis/structured_insights/extract_keywords.yaml,sha256=_nPPMdHnxag_lYbhYUjGJGo-CvRwWvwdGX7cQhdZ1S0,847
88
78
  sdg_hub/flows/text_analysis/structured_insights/flow.yaml,sha256=BBV18SdvuVTAESjwkJ7V1jbb-cSTBvNl3SCycd0oEQ4,4934
89
79
  sdg_hub/flows/text_analysis/structured_insights/summarize.yaml,sha256=WXwQak1pF8e1OwnOoI1EHu8QB6iUNW89rfkTdi1Oq54,687
90
- sdg_hub-0.4.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
91
- sdg_hub-0.4.1.dist-info/METADATA,sha256=pLRs5oOsVI9515UEZxcUEZFZhCoZ0kli0KLpBPPPB7w,9783
92
- sdg_hub-0.4.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
93
- sdg_hub-0.4.1.dist-info/top_level.txt,sha256=TqI7d-HE1n6zkXFkU0nF3A1Ct0P0pBaqI675uFokhx4,8
94
- sdg_hub-0.4.1.dist-info/RECORD,,
80
+ sdg_hub-0.5.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
81
+ sdg_hub-0.5.0.dist-info/METADATA,sha256=z4tCCtWlTBzu5DF1K44RtWjIs7ZNL6__2Aae7I0EfxQ,9775
82
+ sdg_hub-0.5.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
83
+ sdg_hub-0.5.0.dist-info/top_level.txt,sha256=TqI7d-HE1n6zkXFkU0nF3A1Ct0P0pBaqI675uFokhx4,8
84
+ sdg_hub-0.5.0.dist-info/RECORD,,
@@ -1,29 +0,0 @@
1
- # SPDX-License-Identifier: Apache-2.0
2
- """Deprecated blocks for backwards compatibility.
3
-
4
- This module contains deprecated block implementations that are maintained
5
- for backwards compatibility. These blocks should not be used in new code.
6
- """
7
-
8
- # Local
9
- from .combine_columns import CombineColumnsBlock
10
- from .duplicate_columns import DuplicateColumns
11
- from .filter_by_value import FilterByValueBlock
12
- from .flatten_columns import FlattenColumnsBlock
13
- from .llmblock import LLMBlock
14
- from .rename_columns import RenameColumns
15
- from .sample_populator import SamplePopulatorBlock
16
- from .selector import SelectorBlock
17
- from .set_to_majority_value import SetToMajorityValue
18
-
19
- __all__ = [
20
- "CombineColumnsBlock",
21
- "DuplicateColumns",
22
- "FilterByValueBlock",
23
- "FlattenColumnsBlock",
24
- "LLMBlock",
25
- "RenameColumns",
26
- "SamplePopulatorBlock",
27
- "SelectorBlock",
28
- "SetToMajorityValue",
29
- ]