sdg-hub 0.5.1__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sdg_hub/_version.py +2 -2
- sdg_hub/core/blocks/base.py +60 -58
- sdg_hub/core/blocks/filtering/column_value_filter.py +29 -16
- sdg_hub/core/blocks/llm/__init__.py +0 -2
- sdg_hub/core/blocks/llm/llm_chat_block.py +42 -36
- sdg_hub/core/blocks/llm/llm_parser_block.py +13 -59
- sdg_hub/core/blocks/llm/prompt_builder_block.py +15 -10
- sdg_hub/core/blocks/llm/text_parser_block.py +14 -61
- sdg_hub/core/blocks/transform/duplicate_columns.py +9 -8
- sdg_hub/core/blocks/transform/index_based_mapper.py +29 -15
- sdg_hub/core/blocks/transform/json_structure_block.py +16 -13
- sdg_hub/core/blocks/transform/melt_columns.py +13 -12
- sdg_hub/core/blocks/transform/rename_columns.py +20 -9
- sdg_hub/core/blocks/transform/text_concat.py +20 -21
- sdg_hub/core/blocks/transform/uniform_col_val_setter.py +6 -5
- sdg_hub/core/flow/base.py +139 -106
- sdg_hub/core/flow/checkpointer.py +34 -36
- sdg_hub/core/flow/validation.py +4 -4
- sdg_hub/core/utils/datautils.py +52 -54
- sdg_hub/core/utils/flow_metrics.py +9 -6
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml +1 -0
- {sdg_hub-0.5.1.dist-info → sdg_hub-0.6.1.dist-info}/METADATA +5 -9
- {sdg_hub-0.5.1.dist-info → sdg_hub-0.6.1.dist-info}/RECORD +26 -28
- sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +0 -771
- sdg_hub/core/utils/temp_manager.py +0 -57
- {sdg_hub-0.5.1.dist-info → sdg_hub-0.6.1.dist-info}/WHEEL +0 -0
- {sdg_hub-0.5.1.dist-info → sdg_hub-0.6.1.dist-info}/licenses/LICENSE +0 -0
- {sdg_hub-0.5.1.dist-info → sdg_hub-0.6.1.dist-info}/top_level.txt +0 -0
sdg_hub/core/flow/checkpointer.py CHANGED

@@ -9,7 +9,7 @@ import os
 import uuid

 # Third Party
-
+import pandas as pd

 # Local
 from ..utils.datautils import safe_concatenate_with_validation

@@ -67,18 +67,18 @@ class FlowCheckpointer:
         return os.path.join(self.checkpoint_dir, "flow_metadata.json")

     def load_existing_progress(
-        self, input_dataset:
-    ) -> Tuple[
+        self, input_dataset: pd.DataFrame
+    ) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
         """Load existing checkpoint data and determine remaining work.

         Parameters
         ----------
-        input_dataset :
+        input_dataset : pd.DataFrame
            Original input dataset for the flow.

         Returns
         -------
-        Tuple[
+        Tuple[pd.DataFrame, Optional[pd.DataFrame]]
            (remaining_samples_to_process, completed_samples_dataset)
            If no checkpoints exist, returns (input_dataset, None)
         """

@@ -127,20 +127,20 @@ class FlowCheckpointer:
             logger.warning(f"Failed to load checkpoints: {exc}. Starting from scratch.")
             return input_dataset, None

-    def add_completed_samples(self, samples:
+    def add_completed_samples(self, samples: pd.DataFrame) -> None:
         """Add samples that have completed the entire flow.

         Parameters
         ----------
-        samples :
+        samples : pd.DataFrame
            Samples that have completed processing through all blocks.
         """
         if not self.is_enabled:
            return

         # Add to pending samples
-        for sample in samples:
-            self._pending_samples.append(sample)
+        for _, sample in samples.iterrows():
+            self._pending_samples.append(sample.to_dict())
            self._samples_processed += 1

         # Check if we should save a checkpoint

@@ -167,9 +167,9 @@ class FlowCheckpointer:
             self.checkpoint_dir, f"checkpoint_{self._checkpoint_counter:04d}.jsonl"
         )

-        # Convert pending samples to
-
-
+        # Convert pending samples to dataframe and save
+        checkpoint_df = pd.DataFrame(self._pending_samples)
+        checkpoint_df.to_json(checkpoint_file, orient="records", lines=True)

         # Update metadata
         self._save_metadata()

@@ -207,7 +207,7 @@ class FlowCheckpointer:
             logger.warning(f"Failed to load metadata: {exc}")
             return None

-    def _load_completed_samples(self) -> Optional[
+    def _load_completed_samples(self) -> Optional[pd.DataFrame]:
         """Load all completed samples from checkpoint files."""
         checkpoint_files = []
         checkpoint_dir = Path(self.checkpoint_dir)

@@ -222,27 +222,25 @@ class FlowCheckpointer:
         # Sort checkpoint files by number
         checkpoint_files.sort()

-        # Load and concatenate all checkpoint
-
+        # Load and concatenate all checkpoint dataframes
+        dataframes = []
         for file_path in checkpoint_files:
             try:
-
-                if len(
-
-                    logger.debug(
-                        f"Loaded checkpoint: {file_path} ({len(dataset)} samples)"
-                    )
+                df = pd.read_json(file_path, lines=True)
+                if len(df) > 0:
+                    dataframes.append(df)
+                    logger.debug(f"Loaded checkpoint: {file_path} ({len(df)} samples)")
             except Exception as exc:
                 logger.warning(f"Failed to load checkpoint {file_path}: {exc}")

-        if not
+        if not dataframes:
             return None

-        return safe_concatenate_with_validation(
+        return safe_concatenate_with_validation(dataframes, "checkpoint files")

     def _find_remaining_samples(
-        self, input_dataset:
-    ) ->
+        self, input_dataset: pd.DataFrame, completed_dataset: pd.DataFrame
+    ) -> pd.DataFrame:
         """Find samples from input_dataset that are not in completed_dataset.

         Note: Assumes input_dataset contains unique samples. For datasets with

@@ -250,19 +248,19 @@ class FlowCheckpointer:

         Parameters
         ----------
-        input_dataset :
+        input_dataset : pd.DataFrame
            Original input dataset (assumed to contain unique samples).
-        completed_dataset :
+        completed_dataset : pd.DataFrame
            Dataset of completed samples.

         Returns
         -------
-
+        pd.DataFrame
            Samples that still need processing.
         """
         # Get common columns for comparison
-        input_columns = set(input_dataset.
-        completed_columns = set(completed_dataset.
+        input_columns = set(input_dataset.columns.tolist())
+        completed_columns = set(completed_dataset.columns.tolist())
         common_columns = list(input_columns & completed_columns)

         if not common_columns:

@@ -272,9 +270,9 @@ class FlowCheckpointer:
             )
             return input_dataset

-        #
-        input_df = input_dataset
-        completed_df = completed_dataset
+        # Select only common columns for comparison
+        input_df = input_dataset[common_columns]
+        completed_df = completed_dataset[common_columns]

         # Find rows that haven't been completed
         # Use tuple representation for comparison

@@ -287,10 +285,10 @@ class FlowCheckpointer:
         remaining_indices = input_df[remaining_mask].index.tolist()

         if not remaining_indices:
-            # Return empty
-            return input_dataset.
+            # Return empty dataframe with same structure
+            return input_dataset.iloc[0:0]

-        return input_dataset.
+        return input_dataset.iloc[remaining_indices]

     def get_progress_info(self) -> Dict[str, Any]:
         """Get information about current progress.

sdg_hub/core/flow/validation.py CHANGED

@@ -5,7 +5,7 @@
 from typing import TYPE_CHECKING, Any

 # Third Party
-
+import pandas as pd

 if TYPE_CHECKING:
     # Local

@@ -180,14 +180,14 @@ class FlowValidator:

         return errors

-    def validate_flow_execution(self, flow: "Flow", dataset:
+    def validate_flow_execution(self, flow: "Flow", dataset: pd.DataFrame) -> list[str]:
         """Validate that a flow can be executed with the given dataset.

         Parameters
         ----------
         flow : Flow
            The flow to validate.
-        dataset :
+        dataset : pd.DataFrame
            Dataset to validate against.

         Returns

@@ -206,7 +206,7 @@ class FlowValidator:
            return errors

        # Track available columns as we progress through blocks
-        current_columns = set(dataset.
+        current_columns = set(dataset.columns.tolist())

        for _i, block in enumerate(flow.blocks):
            block_name = block.block_name

sdg_hub/core/utils/datautils.py CHANGED

@@ -1,33 +1,67 @@
 # Third Party
-from datasets import Dataset, concatenate_datasets
 import numpy as np
+import pandas as pd

 # Local
 from .error_handling import FlowValidationError


+def _is_hashable(x):
+    """Check if a value is hashable."""
+    try:
+        hash(x)
+        return True
+    except TypeError:
+        return False
+
+
+def _make_hashable(x):
+    """Convert any value to a hashable representation for duplicate detection.
+
+    Handles numpy arrays, dicts, sets, lists, and other complex types by
+    converting them to hashable equivalents (tuples, frozensets, etc.).
+    """
+    if _is_hashable(x):
+        return x
+    if isinstance(x, np.ndarray):
+        if x.ndim == 0:
+            return _make_hashable(x.item())
+        return tuple(_make_hashable(i) for i in x)
+    if isinstance(x, dict):
+        return tuple(
+            sorted(
+                ((k, _make_hashable(v)) for k, v in x.items()),
+                key=lambda kv: repr(kv[0]),
+            )
+        )
+    if isinstance(x, (set, frozenset)):
+        return frozenset(_make_hashable(i) for i in x)
+    if hasattr(x, "__iter__"):
+        return tuple(_make_hashable(i) for i in x)
+    return repr(x)
+
+
 def safe_concatenate_datasets(datasets: list):
     """Concatenate datasets safely, ignoring any datasets that are None or empty."""
-    filtered_datasets = [ds for ds in datasets if ds is not None and ds
+    filtered_datasets = [ds for ds in datasets if ds is not None and len(ds) > 0]

     if not filtered_datasets:
         return None

-    return
+    return pd.concat(filtered_datasets, ignore_index=True)


-def validate_no_duplicates(dataset:
+def validate_no_duplicates(dataset: pd.DataFrame) -> None:
     """
     Validate that the input dataset contains only unique rows.

     Uses pandas `.duplicated()` for efficient duplicate detection, with preprocessing
-    to handle numpy arrays that cause TypeError in pandas
-
-    of the duplicate rows detected.
+    to handle numpy arrays and other unhashable types that cause TypeError in pandas
+    duplicate detection.

     Parameters
     ----------
-    dataset :
+    dataset : pd.DataFrame
        Input dataset to validate.

     Raises

@@ -38,47 +72,11 @@ def validate_no_duplicates(dataset: Dataset) -> None:
     if len(dataset) == 0:
         return

-
-
-
-
-
-            return True
-        except TypeError:
-            return False
-
-    def make_hashable(x):
-        if is_hashable(x):
-            # int, float, str, bytes, None etc. are already hashable
-            return x
-        if isinstance(x, np.ndarray):
-            if x.ndim == 0:
-                return make_hashable(x.item())
-            return tuple(make_hashable(i) for i in x)
-        if isinstance(x, dict):
-            # sort robustly even with heterogeneous key types
-            return tuple(
-                sorted(
-                    ((k, make_hashable(v)) for k, v in x.items()),
-                    key=lambda kv: repr(kv[0]),
-                )
-            )
-        if isinstance(x, (set, frozenset)):
-            # order‑insensitive
-            return frozenset(make_hashable(i) for i in x)
-        if hasattr(x, "__iter__"):
-            # lists, tuples, custom iterables
-            return tuple(make_hashable(i) for i in x)
-        # last‑resort fallback to a stable representation
-        return repr(x)
-
-    # Apply to the whole dataframe to ensure every cell is hashable
-    if hasattr(df, "map"):
-        df = df.map(make_hashable)
-    else:
-        df = df.applymap(make_hashable)
-
-    duplicate_count = int(df.duplicated(keep="first").sum())
+    # Transform all cells to hashable representations for duplicate detection
+    # This creates a temporary copy but is necessary for reliable duplicate detection
+    hashable_df = dataset.map(_make_hashable)
+
+    duplicate_count = int(hashable_df.duplicated(keep="first").sum())
     if duplicate_count > 0:
         raise FlowValidationError(
             f"Input dataset contains {duplicate_count} duplicate rows. "

@@ -89,19 +87,19 @@ def validate_no_duplicates(dataset: Dataset) -> None:

 def safe_concatenate_with_validation(
     datasets: list, context: str = "datasets"
-) ->
+) -> pd.DataFrame:
     """Safely concatenate datasets with schema validation and clear error messages.

     Parameters
     ----------
-    datasets : list[
+    datasets : list[pd.DataFrame]
        List of datasets to concatenate
     context : str
        Description of what's being concatenated for error messages

     Returns
     -------
-
+    pd.DataFrame
        Concatenated dataset

     Raises

@@ -119,12 +117,12 @@ def safe_concatenate_with_validation(
         return valid_datasets[0]

     try:
-        return
+        return pd.concat(valid_datasets, ignore_index=True)
     except Exception as e:
         # Schema mismatch or other concatenation error
         schema_info = []
         for i, ds in enumerate(valid_datasets):
-            schema_info.append(f"Dataset {i}: columns={ds.
+            schema_info.append(f"Dataset {i}: columns={ds.columns.tolist()}")

         schema_details = "\n".join(schema_info)
         raise FlowValidationError(
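
The rewritten `validate_no_duplicates` maps every cell to a hashable value before calling `DataFrame.duplicated()`, because rows containing lists or numpy arrays would otherwise raise `TypeError: unhashable type`. A small illustration of that failure mode and the workaround, with made-up data and a simplified stand-in for the module's `_make_hashable` helper:

    import numpy as np
    import pandas as pd

    def to_hashable(cell):
        # simplified stand-in for _make_hashable from the diff above
        try:
            hash(cell)
            return cell
        except TypeError:
            if isinstance(cell, (list, tuple, np.ndarray)):
                return tuple(to_hashable(item) for item in cell)
            return repr(cell)

    df = pd.DataFrame({"tags": [[1, 2], [1, 2], [3]]})
    # df.duplicated() would raise TypeError here, since list cells are unhashable
    duplicates = int(df.map(to_hashable).duplicated(keep="first").sum())
    print(duplicates)  # 1 -- the second [1, 2] row is a duplicate

(`DataFrame.map` requires pandas 2.1 or newer; older versions spell it `applymap`, which is why the pre-0.6.1 code carried the `hasattr(df, "map")` fallback that this release drops.)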
sdg_hub/core/utils/flow_metrics.py CHANGED

@@ -8,12 +8,13 @@ from typing import Any, Optional
 import json
 import time

-# Third Party
-from datasets import Dataset
 from rich.console import Console
 from rich.panel import Panel
 from rich.table import Table

+# Third Party
+import pandas as pd
+

 def aggregate_block_metrics(entries: list[dict[str, Any]]) -> list[dict[str, Any]]:
     """Aggregate per-block metrics, coalescing chunked runs.

@@ -71,7 +72,7 @@ def aggregate_block_metrics(entries: list[dict[str, Any]]) -> list[dict[str, Any
 def display_metrics_summary(
     block_metrics: list[dict[str, Any]],
     flow_name: str,
-    final_dataset: Optional[
+    final_dataset: Optional[pd.DataFrame] = None,
 ) -> None:
     """Display a rich table summarizing block execution metrics.

@@ -81,7 +82,7 @@ def display_metrics_summary(
        Raw block metrics from flow execution.
     flow_name : str
        Name of the flow for display title.
-    final_dataset : Optional[
+    final_dataset : Optional[pd.DataFrame], optional
        Final dataset from flow execution. None if flow failed.
     """
     if not block_metrics:

@@ -146,8 +147,10 @@ def display_metrics_summary(

     # Add summary row
     table.add_section()
-    final_row_count = len(final_dataset) if final_dataset else 0
-    final_col_count =
+    final_row_count = len(final_dataset) if final_dataset is not None else 0
+    final_col_count = (
+        len(final_dataset.columns.tolist()) if final_dataset is not None else 0
+    )

     table.add_row(
         "[bold]TOTAL[/bold]",
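
The summary-row change replaces a bare truthiness check on the final dataset: with a pandas `DataFrame`, `if final_dataset` raises `ValueError` because the truth value of a DataFrame is ambiguous, so the code now tests `is not None` explicitly. A short sketch of the difference, with made-up data:

    import pandas as pd

    final_dataset = pd.DataFrame({"question": ["q1"], "answer": ["a1"]})

    try:
        rows = len(final_dataset) if final_dataset else 0  # old pattern
    except ValueError:
        rows = None  # bool(DataFrame) raises, even for a non-empty frame

    # new pattern: explicit None check works for both a DataFrame and None
    rows = len(final_dataset) if final_dataset is not None else 0
    cols = len(final_dataset.columns.tolist()) if final_dataset is not None else 0
    print(rows, cols)  # 1 2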
{sdg_hub-0.5.1.dist-info → sdg_hub-0.6.1.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sdg_hub
-Version: 0.5.1
+Version: 0.6.1
 Summary: Synthetic Data Generation
 Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
 License: Apache-2.0

@@ -28,23 +28,17 @@ Requires-Dist: httpx<1.0.0,>=0.25.0
 Requires-Dist: jinja2
 Requires-Dist: litellm<1.75.0,>=1.73.0
 Requires-Dist: rich
+Requires-Dist: pandas
 Requires-Dist: pydantic<3.0.0,>=2.0.0
 Requires-Dist: python-dotenv<2.0.0,>=1.0.0
 Requires-Dist: tenacity!=8.4.0,>=8.3.0
 Requires-Dist: tqdm<5.0.0,>=4.66.2
-Provides-Extra: vllm
-Requires-Dist: vllm>=0.9.1; extra == "vllm"
-Requires-Dist: torch>=2.0.0; extra == "vllm"
-Requires-Dist: transformers>=4.37.0; extra == "vllm"
-Requires-Dist: accelerate>=0.21.0; extra == "vllm"
-Requires-Dist: xformers>=0.0.22.post7; extra == "vllm"
 Provides-Extra: examples
 Requires-Dist: tabulate>=0.9.0; extra == "examples"
 Requires-Dist: transformers>=4.37.0; extra == "examples"
 Requires-Dist: langchain-text-splitters; extra == "examples"
 Requires-Dist: docling>=2.3.0; extra == "examples"
 Requires-Dist: scikit-learn; extra == "examples"
-Requires-Dist: pandas; extra == "examples"
 Requires-Dist: polars; extra == "examples"
 Requires-Dist: matplotlib; extra == "examples"
 Requires-Dist: spacy; extra == "examples"

@@ -76,7 +70,9 @@ Dynamic: license-file
 [](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/test.yml)
 [](https://codecov.io/gh/Red-Hat-AI-Innovation-Team/sdg_hub)

-
+<p align="center">
+    <img src="docs/assets/sdg-hub-cover.png" alt="SDG Hub Cover" width="400">
+</p>

 A modular Python framework for building synthetic data generation pipelines using composable blocks and flows. Transform datasets through **building-block composition** - mix and match LLM-powered and traditional processing blocks to create sophisticated data generation workflows.

{sdg_hub-0.5.1.dist-info → sdg_hub-0.6.1.dist-info}/RECORD CHANGED

@@ -1,42 +1,40 @@
 sdg_hub/__init__.py,sha256=TlkZT40-70urdcWLqv3kupaJj8s-SVgd2QyvlSFwb4A,510
-sdg_hub/_version.py,sha256=
+sdg_hub/_version.py,sha256=7vNQiXfKffK0nbqts6Xy6-E1b1YOm4EGigvgaHr83o4,704
 sdg_hub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sdg_hub/core/__init__.py,sha256=e3BoejbqjYhasf9t__L4qE52lkD9EBjx4o--2kqKdro,460
 sdg_hub/core/blocks/__init__.py,sha256=8Rn1SglH8V3jGmTD_cG-h7qk9ktAab2eaBdyk7RN_hY,865
-sdg_hub/core/blocks/base.py,sha256
+sdg_hub/core/blocks/base.py,sha256=EpHvqXySIdx0f672c-csGKKs7N57ablC8pad_SiB1s8,13066
 sdg_hub/core/blocks/registry.py,sha256=FuEN_pnq-nSH1LguY3_oCubT6Kz3SuJjk3TcUpLT-lw,10695
 sdg_hub/core/blocks/filtering/__init__.py,sha256=isxSVSvDqkMjG8dQSl3Q2M4g5c1t9fTjBSA21icf-yA,275
-sdg_hub/core/blocks/filtering/column_value_filter.py,sha256=
-sdg_hub/core/blocks/llm/__init__.py,sha256=
+sdg_hub/core/blocks/filtering/column_value_filter.py,sha256=tHNykB-Q_ItbjDzvlpnjt0Z46mR67O6ZY29ed2ecOwo,6493
+sdg_hub/core/blocks/llm/__init__.py,sha256=1Oo2nv2uXJ2AzRlrQcqDi7gW1FNh9Fid84L89dvy4qM,683
 sdg_hub/core/blocks/llm/error_handler.py,sha256=7T-019ZFB9qgZoX1ybIiXyaLjPzrF96qcKmUu6vmO6g,12178
-sdg_hub/core/blocks/llm/llm_chat_block.py,sha256=
-sdg_hub/core/blocks/llm/
-sdg_hub/core/blocks/llm/
-sdg_hub/core/blocks/llm/
-sdg_hub/core/blocks/llm/text_parser_block.py,sha256=NGwBdFmfbY3rbm_T7bqTJmaREo2MpSpQwgLrnHHZHqU,14255
+sdg_hub/core/blocks/llm/llm_chat_block.py,sha256=ckkjF_r9CxoX2sJiikFWFxNrAS4w_gMnedo70TrQo3Y,22730
+sdg_hub/core/blocks/llm/llm_parser_block.py,sha256=NFk8xXceK_F1Pzn9dFNX65ynavuoQiH2ltDLLY_6SXQ,12136
+sdg_hub/core/blocks/llm/prompt_builder_block.py,sha256=zI8DFz34abGnH2Mk0KQe4Mkkb5ophwV7brn4axNsZ2I,14146
+sdg_hub/core/blocks/llm/text_parser_block.py,sha256=CoyfgKcJL9JpokzMcKk4bYeEBr6xnN0XYk45hJANnBQ,12763
 sdg_hub/core/blocks/transform/__init__.py,sha256=lF9InjOzA6p_mjiwV-a2Kwstq9kqRiQ-dEwbsmR9yQs,825
-sdg_hub/core/blocks/transform/duplicate_columns.py,sha256=
-sdg_hub/core/blocks/transform/index_based_mapper.py,sha256=
-sdg_hub/core/blocks/transform/json_structure_block.py,sha256=
-sdg_hub/core/blocks/transform/melt_columns.py,sha256=
-sdg_hub/core/blocks/transform/rename_columns.py,sha256=
-sdg_hub/core/blocks/transform/text_concat.py,sha256=
-sdg_hub/core/blocks/transform/uniform_col_val_setter.py,sha256=
+sdg_hub/core/blocks/transform/duplicate_columns.py,sha256=dYTxgkWq6X2B37pemJdmAVi56A29NF25YTwUUyN9xHs,2837
+sdg_hub/core/blocks/transform/index_based_mapper.py,sha256=W9ezZNgLUGbLk2U1UJCi2KFbSRPM0Q4vHnP5HGlhsoQ,8908
+sdg_hub/core/blocks/transform/json_structure_block.py,sha256=w7Ex2F3gvpG7uUnM2JM1a7D5xUKGE6HRKwyJpnfLPzc,5069
+sdg_hub/core/blocks/transform/melt_columns.py,sha256=zH3d3C0EO2DVRZqmhyr_g51xz1ZmuBRinrngUCiZkrM,4383
+sdg_hub/core/blocks/transform/rename_columns.py,sha256=EafchUDXvfXxqwRvNIcy92I1Zy6U8lsibtSqWaYdMPU,3150
+sdg_hub/core/blocks/transform/text_concat.py,sha256=Oo6VKGdmeiUmH3B0PDL1y_ot-bYmkT2jbGj7g7C84gg,3089
+sdg_hub/core/blocks/transform/uniform_col_val_setter.py,sha256=Osbz-jciBx5jFfzUbtbCBh_ET4CySG2h0IGWChESHi4,3239
 sdg_hub/core/flow/__init__.py,sha256=0_m_htuZfPxk8xQ9IKfp0Pz-JRE4O7lYMUFrKyLNoLA,409
-sdg_hub/core/flow/base.py,sha256=
-sdg_hub/core/flow/checkpointer.py,sha256=
+sdg_hub/core/flow/base.py,sha256=9nCXrCdKMzMAoIpiv2Zo7RzZhiLluXJ9XQAtg3wh_40,59104
+sdg_hub/core/flow/checkpointer.py,sha256=MJay3Q5cfRgJDetk82DaMKJ3ZZUYRHxQabEQTxhGukk,11850
 sdg_hub/core/flow/metadata.py,sha256=cFrpJjWOaK87aCuRFyC3Pdf83oYU93mrmZEMdUnhsN8,10540
 sdg_hub/core/flow/registry.py,sha256=N6KfX-L7QRkooznIFxDuhRZYuDA5g3N5zC-KRm2jVhk,12109
-sdg_hub/core/flow/validation.py,sha256=
+sdg_hub/core/flow/validation.py,sha256=6hs16DnusUYPo6vD_7DcgzRP5JOHDf2wPvgqvBn6hB0,9727
 sdg_hub/core/utils/__init__.py,sha256=KcT56JhobC5sBg0MKEMn5hc4OyKa9_Vnn45Mt_kS4jQ,610
-sdg_hub/core/utils/datautils.py,sha256=
+sdg_hub/core/utils/datautils.py,sha256=7YzG_IpMHj04zHl-r7mswOd3IzTQKJJdfmMBgm7VXWM,4082
 sdg_hub/core/utils/error_handling.py,sha256=yku8cGj_nKCyXDsnb-mHCpgukkkAMucJ4iAUrIzqysc,5510
 sdg_hub/core/utils/flow_id_words.yaml,sha256=5QHpQdP7zwahRuooyAlJIwBY7WcDR7vtbJXxVJqujbg,2317
 sdg_hub/core/utils/flow_identifier.py,sha256=aAHfK_G9AwEtMglLRMdMpi_AI1dciub5UqBGm4yb2HE,2841
-sdg_hub/core/utils/flow_metrics.py,sha256=
+sdg_hub/core/utils/flow_metrics.py,sha256=84ihZHOwbxhqPTdnUXclytf5Tva-IoA1oKIruIXv0Eo,12650
 sdg_hub/core/utils/logger_config.py,sha256=6_cnsIHtSAdq1iTTZ7Q7nAJ1dmldlxSZ0AB49yLiQ20,2034
 sdg_hub/core/utils/path_resolution.py,sha256=yWof4kGNpQ5dKcrVHg0h9KfOKLZ6ROjdfsLAZsQT5rM,2000
-sdg_hub/core/utils/temp_manager.py,sha256=moSPWMxoDEw5FmeuwKTC8f3tYcarQDN0ozv0796CeGg,1484
 sdg_hub/core/utils/time_estimator.py,sha256=rM3_R-Ka5DEtvOtlJoA_5pXSyQ6tT6t4h6qh3_5BCZo,12639
 sdg_hub/core/utils/yaml_utils.py,sha256=tShCd-FFkp0xlKnLe7dXsMOR4AvT9d2qRUmu4ZnPSEY,1458
 sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0

@@ -69,7 +67,7 @@ sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/j
 sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/atomic_facts_ja.yaml,sha256=OjPZaSCOSLxEWgW3pmNwF7mmLhGhFGTmKL_3rKdqeW4,2488
 sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/detailed_summary_ja.yaml,sha256=nEy_RcotHGiiENrmUANpKkbIFsrARAeSwECrBeHi2so,391
 sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/extractive_summary_ja.yaml,sha256=V90W0IeJQZTFThA8v0UOs3DtZbtU3BI9jkpChw1BULo,402
-sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml,sha256=
+sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml,sha256=U9DBWSKkYGGtwWQ39o8l7g-mLb93505APTEFePyzqIc,9312
 sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/generate_questions_responses_ja.yaml,sha256=96SQqXG7fmb-50SdX85sgVtrFcQ-oNKe_0BoQdZmY5g,2638
 sdg_hub/flows/text_analysis/__init__.py,sha256=WStks4eM_KHNTVsHglcj8vFghmI0PH9P1hUrijBLbwc,125
 sdg_hub/flows/text_analysis/structured_insights/__init__.py,sha256=_DT4NR05JD9CZoSWROPr2lC6se0VjSqQPZJJlEV79mk,274

@@ -78,8 +76,8 @@ sdg_hub/flows/text_analysis/structured_insights/extract_entities.yaml,sha256=Q_S
 sdg_hub/flows/text_analysis/structured_insights/extract_keywords.yaml,sha256=_nPPMdHnxag_lYbhYUjGJGo-CvRwWvwdGX7cQhdZ1S0,847
 sdg_hub/flows/text_analysis/structured_insights/flow.yaml,sha256=BBV18SdvuVTAESjwkJ7V1jbb-cSTBvNl3SCycd0oEQ4,4934
 sdg_hub/flows/text_analysis/structured_insights/summarize.yaml,sha256=WXwQak1pF8e1OwnOoI1EHu8QB6iUNW89rfkTdi1Oq54,687
-sdg_hub-0.
-sdg_hub-0.
-sdg_hub-0.
-sdg_hub-0.
-sdg_hub-0.
+sdg_hub-0.6.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sdg_hub-0.6.1.dist-info/METADATA,sha256=JQxLH1YwDrV5D1cAaaRziFFiF17buxN-fnyse5lQVV8,9584
+sdg_hub-0.6.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+sdg_hub-0.6.1.dist-info/top_level.txt,sha256=TqI7d-HE1n6zkXFkU0nF3A1Ct0P0pBaqI675uFokhx4,8
+sdg_hub-0.6.1.dist-info/RECORD,,