sdg-hub 0.5.1__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. sdg_hub/_version.py +2 -2
  2. sdg_hub/core/blocks/base.py +60 -58
  3. sdg_hub/core/blocks/filtering/column_value_filter.py +29 -16
  4. sdg_hub/core/blocks/llm/__init__.py +0 -2
  5. sdg_hub/core/blocks/llm/llm_chat_block.py +42 -36
  6. sdg_hub/core/blocks/llm/llm_parser_block.py +13 -59
  7. sdg_hub/core/blocks/llm/prompt_builder_block.py +15 -10
  8. sdg_hub/core/blocks/llm/text_parser_block.py +14 -61
  9. sdg_hub/core/blocks/transform/duplicate_columns.py +9 -8
  10. sdg_hub/core/blocks/transform/index_based_mapper.py +29 -15
  11. sdg_hub/core/blocks/transform/json_structure_block.py +16 -13
  12. sdg_hub/core/blocks/transform/melt_columns.py +13 -12
  13. sdg_hub/core/blocks/transform/rename_columns.py +20 -9
  14. sdg_hub/core/blocks/transform/text_concat.py +20 -21
  15. sdg_hub/core/blocks/transform/uniform_col_val_setter.py +6 -5
  16. sdg_hub/core/flow/base.py +139 -106
  17. sdg_hub/core/flow/checkpointer.py +34 -36
  18. sdg_hub/core/flow/validation.py +4 -4
  19. sdg_hub/core/utils/datautils.py +52 -54
  20. sdg_hub/core/utils/flow_metrics.py +9 -6
  21. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml +1 -0
  22. {sdg_hub-0.5.1.dist-info → sdg_hub-0.6.1.dist-info}/METADATA +5 -9
  23. {sdg_hub-0.5.1.dist-info → sdg_hub-0.6.1.dist-info}/RECORD +26 -28
  24. sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +0 -771
  25. sdg_hub/core/utils/temp_manager.py +0 -57
  26. {sdg_hub-0.5.1.dist-info → sdg_hub-0.6.1.dist-info}/WHEEL +0 -0
  27. {sdg_hub-0.5.1.dist-info → sdg_hub-0.6.1.dist-info}/licenses/LICENSE +0 -0
  28. {sdg_hub-0.5.1.dist-info → sdg_hub-0.6.1.dist-info}/top_level.txt +0 -0
sdg_hub/core/flow/checkpointer.py
@@ -9,7 +9,7 @@ import os
 import uuid
 
 # Third Party
-from datasets import Dataset
+import pandas as pd
 
 # Local
 from ..utils.datautils import safe_concatenate_with_validation
@@ -67,18 +67,18 @@ class FlowCheckpointer:
         return os.path.join(self.checkpoint_dir, "flow_metadata.json")
 
     def load_existing_progress(
-        self, input_dataset: Dataset
-    ) -> Tuple[Dataset, Optional[Dataset]]:
+        self, input_dataset: pd.DataFrame
+    ) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
         """Load existing checkpoint data and determine remaining work.
 
         Parameters
         ----------
-        input_dataset : Dataset
+        input_dataset : pd.DataFrame
             Original input dataset for the flow.
 
         Returns
         -------
-        Tuple[Dataset, Optional[Dataset]]
+        Tuple[pd.DataFrame, Optional[pd.DataFrame]]
             (remaining_samples_to_process, completed_samples_dataset)
             If no checkpoints exist, returns (input_dataset, None)
         """
@@ -127,20 +127,20 @@ class FlowCheckpointer:
             logger.warning(f"Failed to load checkpoints: {exc}. Starting from scratch.")
             return input_dataset, None
 
-    def add_completed_samples(self, samples: Dataset) -> None:
+    def add_completed_samples(self, samples: pd.DataFrame) -> None:
         """Add samples that have completed the entire flow.
 
         Parameters
         ----------
-        samples : Dataset
+        samples : pd.DataFrame
             Samples that have completed processing through all blocks.
         """
         if not self.is_enabled:
             return
 
         # Add to pending samples
-        for sample in samples:
-            self._pending_samples.append(sample)
+        for _, sample in samples.iterrows():
+            self._pending_samples.append(sample.to_dict())
             self._samples_processed += 1
 
         # Check if we should save a checkpoint
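For reference, a minimal sketch of the row-to-dict accumulation the new `add_completed_samples` body performs (the sample frame below is illustrative):

```python
import pandas as pd

samples = pd.DataFrame({"id": [1, 2], "answer": ["a", "b"]})

pending = []
for _, sample in samples.iterrows():
    pending.append(sample.to_dict())  # each row becomes a plain dict

print(pending)  # [{'id': 1, 'answer': 'a'}, {'id': 2, 'answer': 'b'}]
```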
@@ -167,9 +167,9 @@ class FlowCheckpointer:
             self.checkpoint_dir, f"checkpoint_{self._checkpoint_counter:04d}.jsonl"
         )
 
-        # Convert pending samples to dataset and save
-        checkpoint_dataset = Dataset.from_list(self._pending_samples)
-        checkpoint_dataset.to_json(checkpoint_file, orient="records", lines=True)
+        # Convert pending samples to dataframe and save
+        checkpoint_df = pd.DataFrame(self._pending_samples)
+        checkpoint_df.to_json(checkpoint_file, orient="records", lines=True)
 
         # Update metadata
         self._save_metadata()
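The checkpoint files keep the same JSONL layout as before, now written and read with pandas. A small round-trip sketch (file name and rows are illustrative):

```python
import pandas as pd

# illustrative pending samples; real ones come from completed flow rows
pending_samples = [{"id": 1, "question": "q1"}, {"id": 2, "question": "q2"}]

# save side: one JSON object per line, as in the hunk above
checkpoint_df = pd.DataFrame(pending_samples)
checkpoint_df.to_json("checkpoint_0001.jsonl", orient="records", lines=True)

# load side: the counterpart used by _load_completed_samples below
restored = pd.read_json("checkpoint_0001.jsonl", lines=True)
print(restored.equals(checkpoint_df))  # True for simple scalar columns
```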
@@ -207,7 +207,7 @@ class FlowCheckpointer:
             logger.warning(f"Failed to load metadata: {exc}")
             return None
 
-    def _load_completed_samples(self) -> Optional[Dataset]:
+    def _load_completed_samples(self) -> Optional[pd.DataFrame]:
         """Load all completed samples from checkpoint files."""
         checkpoint_files = []
         checkpoint_dir = Path(self.checkpoint_dir)
@@ -222,27 +222,25 @@
         # Sort checkpoint files by number
         checkpoint_files.sort()
 
-        # Load and concatenate all checkpoint datasets
-        datasets = []
+        # Load and concatenate all checkpoint dataframes
+        dataframes = []
         for file_path in checkpoint_files:
             try:
-                dataset = Dataset.from_json(file_path)
-                if len(dataset) > 0:
-                    datasets.append(dataset)
-                    logger.debug(
-                        f"Loaded checkpoint: {file_path} ({len(dataset)} samples)"
-                    )
+                df = pd.read_json(file_path, lines=True)
+                if len(df) > 0:
+                    dataframes.append(df)
+                    logger.debug(f"Loaded checkpoint: {file_path} ({len(df)} samples)")
             except Exception as exc:
                 logger.warning(f"Failed to load checkpoint {file_path}: {exc}")
 
-        if not datasets:
+        if not dataframes:
             return None
 
-        return safe_concatenate_with_validation(datasets, "checkpoint files")
+        return safe_concatenate_with_validation(dataframes, "checkpoint files")
 
     def _find_remaining_samples(
-        self, input_dataset: Dataset, completed_dataset: Dataset
-    ) -> Dataset:
+        self, input_dataset: pd.DataFrame, completed_dataset: pd.DataFrame
+    ) -> pd.DataFrame:
         """Find samples from input_dataset that are not in completed_dataset.
 
         Note: Assumes input_dataset contains unique samples. For datasets with
@@ -250,19 +248,19 @@
 
         Parameters
         ----------
-        input_dataset : Dataset
+        input_dataset : pd.DataFrame
             Original input dataset (assumed to contain unique samples).
-        completed_dataset : Dataset
+        completed_dataset : pd.DataFrame
             Dataset of completed samples.
 
         Returns
         -------
-        Dataset
+        pd.DataFrame
             Samples that still need processing.
         """
         # Get common columns for comparison
-        input_columns = set(input_dataset.column_names)
-        completed_columns = set(completed_dataset.column_names)
+        input_columns = set(input_dataset.columns.tolist())
+        completed_columns = set(completed_dataset.columns.tolist())
         common_columns = list(input_columns & completed_columns)
 
         if not common_columns:
@@ -272,9 +270,9 @@
             )
             return input_dataset
 
-        # Convert to pandas for easier comparison
-        input_df = input_dataset.select_columns(common_columns).to_pandas()
-        completed_df = completed_dataset.select_columns(common_columns).to_pandas()
+        # Select only common columns for comparison
+        input_df = input_dataset[common_columns]
+        completed_df = completed_dataset[common_columns]
 
         # Find rows that haven't been completed
         # Use tuple representation for comparison
@@ -287,10 +285,10 @@
         remaining_indices = input_df[remaining_mask].index.tolist()
 
         if not remaining_indices:
-            # Return empty dataset with same structure
-            return input_dataset.select([])
+            # Return empty dataframe with same structure
+            return input_dataset.iloc[0:0]
 
-        return input_dataset.select(remaining_indices)
+        return input_dataset.iloc[remaining_indices]
 
     def get_progress_info(self) -> Dict[str, Any]:
         """Get information about current progress.
sdg_hub/core/flow/validation.py
@@ -5,7 +5,7 @@
 from typing import TYPE_CHECKING, Any
 
 # Third Party
-from datasets import Dataset
+import pandas as pd
 
 if TYPE_CHECKING:
     # Local
@@ -180,14 +180,14 @@ class FlowValidator:
 
         return errors
 
-    def validate_flow_execution(self, flow: "Flow", dataset: Dataset) -> list[str]:
+    def validate_flow_execution(self, flow: "Flow", dataset: pd.DataFrame) -> list[str]:
         """Validate that a flow can be executed with the given dataset.
 
         Parameters
         ----------
         flow : Flow
             The flow to validate.
-        dataset : Dataset
+        dataset : pd.DataFrame
             Dataset to validate against.
 
         Returns
@@ -206,7 +206,7 @@ class FlowValidator:
             return errors
 
         # Track available columns as we progress through blocks
-        current_columns = set(dataset.column_names)
+        current_columns = set(dataset.columns.tolist())
 
         for _i, block in enumerate(flow.blocks):
             block_name = block.block_name
sdg_hub/core/utils/datautils.py
@@ -1,33 +1,67 @@
 # Third Party
-from datasets import Dataset, concatenate_datasets
 import numpy as np
+import pandas as pd
 
 # Local
 from .error_handling import FlowValidationError
 
 
+def _is_hashable(x):
+    """Check if a value is hashable."""
+    try:
+        hash(x)
+        return True
+    except TypeError:
+        return False
+
+
+def _make_hashable(x):
+    """Convert any value to a hashable representation for duplicate detection.
+
+    Handles numpy arrays, dicts, sets, lists, and other complex types by
+    converting them to hashable equivalents (tuples, frozensets, etc.).
+    """
+    if _is_hashable(x):
+        return x
+    if isinstance(x, np.ndarray):
+        if x.ndim == 0:
+            return _make_hashable(x.item())
+        return tuple(_make_hashable(i) for i in x)
+    if isinstance(x, dict):
+        return tuple(
+            sorted(
+                ((k, _make_hashable(v)) for k, v in x.items()),
+                key=lambda kv: repr(kv[0]),
+            )
+        )
+    if isinstance(x, (set, frozenset)):
+        return frozenset(_make_hashable(i) for i in x)
+    if hasattr(x, "__iter__"):
+        return tuple(_make_hashable(i) for i in x)
+    return repr(x)
+
+
 def safe_concatenate_datasets(datasets: list):
     """Concatenate datasets safely, ignoring any datasets that are None or empty."""
-    filtered_datasets = [ds for ds in datasets if ds is not None and ds.num_rows > 0]
+    filtered_datasets = [ds for ds in datasets if ds is not None and len(ds) > 0]
 
     if not filtered_datasets:
         return None
 
-    return concatenate_datasets(filtered_datasets)
+    return pd.concat(filtered_datasets, ignore_index=True)
 
 
-def validate_no_duplicates(dataset: Dataset) -> None:
+def validate_no_duplicates(dataset: pd.DataFrame) -> None:
     """
     Validate that the input dataset contains only unique rows.
 
     Uses pandas `.duplicated()` for efficient duplicate detection, with preprocessing
-    to handle numpy arrays that cause TypeError in pandas duplicate detection.
-    Raises FlowValidationError if duplicates are found, including a count
-    of the duplicate rows detected.
+    to handle numpy arrays and other unhashable types that cause TypeError in pandas
+    duplicate detection.
 
     Parameters
    ----------
-    dataset : Dataset
+    dataset : pd.DataFrame
        Input dataset to validate.
 
    Raises
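A quick illustration of what the new module-level `_make_hashable` helper returns for a few cell types (import path assumed from the package layout; it is a private helper, shown here only for clarity):

```python
import numpy as np

# assumed import path; _make_hashable is an internal helper of datautils
from sdg_hub.core.utils.datautils import _make_hashable

print(_make_hashable(np.array([1, 2, 3])))    # tuple of the array's elements
print(_make_hashable({"b": [1, 2], "a": 3}))  # (('a', 3), ('b', (1, 2)))
print(_make_hashable("already hashable"))     # returned unchanged
```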
@@ -38,47 +72,11 @@ def validate_no_duplicates(dataset: Dataset) -> None:
     if len(dataset) == 0:
         return
 
-    df = dataset.to_pandas()
-
-    def is_hashable(x):
-        try:
-            hash(x)
-            return True
-        except TypeError:
-            return False
-
-    def make_hashable(x):
-        if is_hashable(x):
-            # int, float, str, bytes, None etc. are already hashable
-            return x
-        if isinstance(x, np.ndarray):
-            if x.ndim == 0:
-                return make_hashable(x.item())
-            return tuple(make_hashable(i) for i in x)
-        if isinstance(x, dict):
-            # sort robustly even with heterogeneous key types
-            return tuple(
-                sorted(
-                    ((k, make_hashable(v)) for k, v in x.items()),
-                    key=lambda kv: repr(kv[0]),
-                )
-            )
-        if isinstance(x, (set, frozenset)):
-            # order‑insensitive
-            return frozenset(make_hashable(i) for i in x)
-        if hasattr(x, "__iter__"):
-            # lists, tuples, custom iterables
-            return tuple(make_hashable(i) for i in x)
-        # last‑resort fallback to a stable representation
-        return repr(x)
-
-    # Apply to the whole dataframe to ensure every cell is hashable
-    if hasattr(df, "map"):
-        df = df.map(make_hashable)
-    else:
-        df = df.applymap(make_hashable)
-
-    duplicate_count = int(df.duplicated(keep="first").sum())
+    # Transform all cells to hashable representations for duplicate detection
+    # This creates a temporary copy but is necessary for reliable duplicate detection
+    hashable_df = dataset.map(_make_hashable)
+
+    duplicate_count = int(hashable_df.duplicated(keep="first").sum())
     if duplicate_count > 0:
         raise FlowValidationError(
             f"Input dataset contains {duplicate_count} duplicate rows. "
@@ -89,19 +87,19 @@
 
 def safe_concatenate_with_validation(
     datasets: list, context: str = "datasets"
-) -> Dataset:
+) -> pd.DataFrame:
     """Safely concatenate datasets with schema validation and clear error messages.
 
     Parameters
     ----------
-    datasets : list[Dataset]
+    datasets : list[pd.DataFrame]
         List of datasets to concatenate
     context : str
         Description of what's being concatenated for error messages
 
     Returns
     -------
-    Dataset
+    pd.DataFrame
         Concatenated dataset
 
     Raises
@@ -119,12 +117,12 @@ def safe_concatenate_with_validation(
         return valid_datasets[0]
 
     try:
-        return concatenate_datasets(valid_datasets)
+        return pd.concat(valid_datasets, ignore_index=True)
     except Exception as e:
         # Schema mismatch or other concatenation error
         schema_info = []
         for i, ds in enumerate(valid_datasets):
-            schema_info.append(f"Dataset {i}: columns={ds.column_names}")
+            schema_info.append(f"Dataset {i}: columns={ds.columns.tolist()}")
 
         schema_details = "\n".join(schema_info)
         raise FlowValidationError(
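A short sketch of the two pandas-based concatenation helpers after this change (the frames and context string are illustrative):

```python
import pandas as pd

from sdg_hub.core.utils.datautils import (
    safe_concatenate_datasets,
    safe_concatenate_with_validation,
)

a = pd.DataFrame({"id": [1], "text": ["x"]})
b = pd.DataFrame({"id": [2], "text": ["y"]})

# None and empty frames are dropped before pd.concat(..., ignore_index=True)
merged = safe_concatenate_datasets([a, None, b])
print(merged)

# same concatenation, with column listings surfaced in the error message on failure
merged_checked = safe_concatenate_with_validation([a, b], "demo frames")
print(merged_checked)
```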
sdg_hub/core/utils/flow_metrics.py
@@ -8,12 +8,13 @@ from typing import Any, Optional
 import json
 import time
 
-# Third Party
-from datasets import Dataset
 from rich.console import Console
 from rich.panel import Panel
 from rich.table import Table
 
+# Third Party
+import pandas as pd
+
 
 def aggregate_block_metrics(entries: list[dict[str, Any]]) -> list[dict[str, Any]]:
     """Aggregate per-block metrics, coalescing chunked runs.
@@ -71,7 +72,7 @@ def aggregate_block_metrics(entries: list[dict[str, Any]]) -> list[dict[str, Any
 def display_metrics_summary(
     block_metrics: list[dict[str, Any]],
     flow_name: str,
-    final_dataset: Optional[Dataset] = None,
+    final_dataset: Optional[pd.DataFrame] = None,
 ) -> None:
     """Display a rich table summarizing block execution metrics.
 
@@ -81,7 +82,7 @@
         Raw block metrics from flow execution.
     flow_name : str
         Name of the flow for display title.
-    final_dataset : Optional[Dataset], optional
+    final_dataset : Optional[pd.DataFrame], optional
         Final dataset from flow execution. None if flow failed.
     """
     if not block_metrics:
@@ -146,8 +147,10 @@
 
     # Add summary row
     table.add_section()
-    final_row_count = len(final_dataset) if final_dataset else 0
-    final_col_count = len(final_dataset.column_names) if final_dataset else 0
+    final_row_count = len(final_dataset) if final_dataset is not None else 0
+    final_col_count = (
+        len(final_dataset.columns.tolist()) if final_dataset is not None else 0
+    )
 
     table.add_row(
         "[bold]TOTAL[/bold]",
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml
@@ -16,6 +16,7 @@ metadata:
     - "qa-pairs"
     - "document-processing"
     - "educational"
+    - "multilingual"
     - "japanese"
 
   license: "Apache-2.0"
{sdg_hub-0.5.1.dist-info → sdg_hub-0.6.1.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sdg_hub
-Version: 0.5.1
+Version: 0.6.1
 Summary: Synthetic Data Generation
 Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
 License: Apache-2.0
@@ -28,23 +28,17 @@ Requires-Dist: httpx<1.0.0,>=0.25.0
 Requires-Dist: jinja2
 Requires-Dist: litellm<1.75.0,>=1.73.0
 Requires-Dist: rich
+Requires-Dist: pandas
 Requires-Dist: pydantic<3.0.0,>=2.0.0
 Requires-Dist: python-dotenv<2.0.0,>=1.0.0
 Requires-Dist: tenacity!=8.4.0,>=8.3.0
 Requires-Dist: tqdm<5.0.0,>=4.66.2
-Provides-Extra: vllm
-Requires-Dist: vllm>=0.9.1; extra == "vllm"
-Requires-Dist: torch>=2.0.0; extra == "vllm"
-Requires-Dist: transformers>=4.37.0; extra == "vllm"
-Requires-Dist: accelerate>=0.21.0; extra == "vllm"
-Requires-Dist: xformers>=0.0.22.post7; extra == "vllm"
 Provides-Extra: examples
 Requires-Dist: tabulate>=0.9.0; extra == "examples"
 Requires-Dist: transformers>=4.37.0; extra == "examples"
 Requires-Dist: langchain-text-splitters; extra == "examples"
 Requires-Dist: docling>=2.3.0; extra == "examples"
 Requires-Dist: scikit-learn; extra == "examples"
-Requires-Dist: pandas; extra == "examples"
 Requires-Dist: polars; extra == "examples"
 Requires-Dist: matplotlib; extra == "examples"
 Requires-Dist: spacy; extra == "examples"
@@ -76,7 +70,9 @@ Dynamic: license-file
 [![Tests](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/test.yml/badge.svg)](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/test.yml)
 [![codecov](https://codecov.io/gh/Red-Hat-AI-Innovation-Team/sdg_hub/graph/badge.svg?token=SP75BCXWO2)](https://codecov.io/gh/Red-Hat-AI-Innovation-Team/sdg_hub)
 
-
+<p align="center">
+  <img src="docs/assets/sdg-hub-cover.png" alt="SDG Hub Cover" width="400">
+</p>
 
 A modular Python framework for building synthetic data generation pipelines using composable blocks and flows. Transform datasets through **building-block composition** - mix and match LLM-powered and traditional processing blocks to create sophisticated data generation workflows.
 
{sdg_hub-0.5.1.dist-info → sdg_hub-0.6.1.dist-info}/RECORD
@@ -1,42 +1,40 @@
 sdg_hub/__init__.py,sha256=TlkZT40-70urdcWLqv3kupaJj8s-SVgd2QyvlSFwb4A,510
-sdg_hub/_version.py,sha256=cYMOhuaBHd0MIZmumuccsEQ-AxM8LIJy9dsBAWgOpqE,704
+sdg_hub/_version.py,sha256=7vNQiXfKffK0nbqts6Xy6-E1b1YOm4EGigvgaHr83o4,704
 sdg_hub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sdg_hub/core/__init__.py,sha256=e3BoejbqjYhasf9t__L4qE52lkD9EBjx4o--2kqKdro,460
 sdg_hub/core/blocks/__init__.py,sha256=8Rn1SglH8V3jGmTD_cG-h7qk9ktAab2eaBdyk7RN_hY,865
-sdg_hub/core/blocks/base.py,sha256=-SOdBpJwtRTMsrmCEuLjUBQMRCo_PLYlHEBRrz8sF9g,13031
+sdg_hub/core/blocks/base.py,sha256=EpHvqXySIdx0f672c-csGKKs7N57ablC8pad_SiB1s8,13066
 sdg_hub/core/blocks/registry.py,sha256=FuEN_pnq-nSH1LguY3_oCubT6Kz3SuJjk3TcUpLT-lw,10695
 sdg_hub/core/blocks/filtering/__init__.py,sha256=isxSVSvDqkMjG8dQSl3Q2M4g5c1t9fTjBSA21icf-yA,275
-sdg_hub/core/blocks/filtering/column_value_filter.py,sha256=2Z9j_CiiTn5mHZ9gfXU-itLXDmeXSh0UI0x1x7j-LQ0,6001
-sdg_hub/core/blocks/llm/__init__.py,sha256=AyS0dd3pkPPXH5a9aj4mT5HsKjX2vjXfkmQc6rkFV4A,795
+sdg_hub/core/blocks/filtering/column_value_filter.py,sha256=tHNykB-Q_ItbjDzvlpnjt0Z46mR67O6ZY29ed2ecOwo,6493
+sdg_hub/core/blocks/llm/__init__.py,sha256=1Oo2nv2uXJ2AzRlrQcqDi7gW1FNh9Fid84L89dvy4qM,683
 sdg_hub/core/blocks/llm/error_handler.py,sha256=7T-019ZFB9qgZoX1ybIiXyaLjPzrF96qcKmUu6vmO6g,12178
-sdg_hub/core/blocks/llm/llm_chat_block.py,sha256=MHhI2x9i6LrfDXgvAy2_6YxgyoD7j6BpCgNGsM69xDg,22194
-sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py,sha256=DW4b09IqXmcshvXawFheDyaLp3rz7vpO5VBrKdUQYW8,31703
-sdg_hub/core/blocks/llm/llm_parser_block.py,sha256=pCTaxAML5uFERZx0KTunvgVPHm1H2154VTvF79bGrB8,13699
-sdg_hub/core/blocks/llm/prompt_builder_block.py,sha256=fkJd718X1oYlMY1cjo_8WCO16Gl8Tm0bUPWR78E_uws,13935
-sdg_hub/core/blocks/llm/text_parser_block.py,sha256=NGwBdFmfbY3rbm_T7bqTJmaREo2MpSpQwgLrnHHZHqU,14255
+sdg_hub/core/blocks/llm/llm_chat_block.py,sha256=ckkjF_r9CxoX2sJiikFWFxNrAS4w_gMnedo70TrQo3Y,22730
+sdg_hub/core/blocks/llm/llm_parser_block.py,sha256=NFk8xXceK_F1Pzn9dFNX65ynavuoQiH2ltDLLY_6SXQ,12136
+sdg_hub/core/blocks/llm/prompt_builder_block.py,sha256=zI8DFz34abGnH2Mk0KQe4Mkkb5ophwV7brn4axNsZ2I,14146
+sdg_hub/core/blocks/llm/text_parser_block.py,sha256=CoyfgKcJL9JpokzMcKk4bYeEBr6xnN0XYk45hJANnBQ,12763
 sdg_hub/core/blocks/transform/__init__.py,sha256=lF9InjOzA6p_mjiwV-a2Kwstq9kqRiQ-dEwbsmR9yQs,825
-sdg_hub/core/blocks/transform/duplicate_columns.py,sha256=SaP7rIF4ZFEFFa50aU2xGNIuddXaEZrKxdWfHjzFpVI,2833
-sdg_hub/core/blocks/transform/index_based_mapper.py,sha256=XC_a7Skbd3mu7f4ra8fGWPxMwqUMSjJkQ7Ag7vflwJA,8235
-sdg_hub/core/blocks/transform/json_structure_block.py,sha256=hm-0M0NAyUREgJRPyV1u-laorgX6MZ1o17E9rNBhN78,5010
-sdg_hub/core/blocks/transform/melt_columns.py,sha256=vaYa5Taq6GhNZYWFL4uPK3-SfN2BsKEm-wvjd2EYYoI,4382
-sdg_hub/core/blocks/transform/rename_columns.py,sha256=W2hcDSJY6L73ZpElUhOML2sGLM9Y-v0gSo3xEF1LXDc,2749
-sdg_hub/core/blocks/transform/text_concat.py,sha256=_-B__Hob1WwgwkILPIZvTnsDzuwtoX1hKviyzHlnnes,3149
-sdg_hub/core/blocks/transform/uniform_col_val_setter.py,sha256=XnjiT29z3PzIPy8M-mmE2w-Miab6Ed5ahy32SaxTCTE,3263
+sdg_hub/core/blocks/transform/duplicate_columns.py,sha256=dYTxgkWq6X2B37pemJdmAVi56A29NF25YTwUUyN9xHs,2837
+sdg_hub/core/blocks/transform/index_based_mapper.py,sha256=W9ezZNgLUGbLk2U1UJCi2KFbSRPM0Q4vHnP5HGlhsoQ,8908
+sdg_hub/core/blocks/transform/json_structure_block.py,sha256=w7Ex2F3gvpG7uUnM2JM1a7D5xUKGE6HRKwyJpnfLPzc,5069
+sdg_hub/core/blocks/transform/melt_columns.py,sha256=zH3d3C0EO2DVRZqmhyr_g51xz1ZmuBRinrngUCiZkrM,4383
+sdg_hub/core/blocks/transform/rename_columns.py,sha256=EafchUDXvfXxqwRvNIcy92I1Zy6U8lsibtSqWaYdMPU,3150
+sdg_hub/core/blocks/transform/text_concat.py,sha256=Oo6VKGdmeiUmH3B0PDL1y_ot-bYmkT2jbGj7g7C84gg,3089
+sdg_hub/core/blocks/transform/uniform_col_val_setter.py,sha256=Osbz-jciBx5jFfzUbtbCBh_ET4CySG2h0IGWChESHi4,3239
 sdg_hub/core/flow/__init__.py,sha256=0_m_htuZfPxk8xQ9IKfp0Pz-JRE4O7lYMUFrKyLNoLA,409
-sdg_hub/core/flow/base.py,sha256=Z2P8QBLl7HWVISdI585hxnIiTu9FhnjlTXn-ngr36Jk,58189
-sdg_hub/core/flow/checkpointer.py,sha256=stm5ZtjjEiLk9ZkAAnoQQn5Y8Yl_d7qCsQLZTrCXR48,11867
+sdg_hub/core/flow/base.py,sha256=9nCXrCdKMzMAoIpiv2Zo7RzZhiLluXJ9XQAtg3wh_40,59104
+sdg_hub/core/flow/checkpointer.py,sha256=MJay3Q5cfRgJDetk82DaMKJ3ZZUYRHxQabEQTxhGukk,11850
 sdg_hub/core/flow/metadata.py,sha256=cFrpJjWOaK87aCuRFyC3Pdf83oYU93mrmZEMdUnhsN8,10540
 sdg_hub/core/flow/registry.py,sha256=N6KfX-L7QRkooznIFxDuhRZYuDA5g3N5zC-KRm2jVhk,12109
-sdg_hub/core/flow/validation.py,sha256=pUJvgaUjLpKNwvW6djcqVOF-HShOjegEmGOnUnoX4BA,9722
+sdg_hub/core/flow/validation.py,sha256=6hs16DnusUYPo6vD_7DcgzRP5JOHDf2wPvgqvBn6hB0,9727
 sdg_hub/core/utils/__init__.py,sha256=KcT56JhobC5sBg0MKEMn5hc4OyKa9_Vnn45Mt_kS4jQ,610
-sdg_hub/core/utils/datautils.py,sha256=__HkUe1DxcJVHKrFX68z_hDXwxJygBlJDfjJLnj7rHc,4230
+sdg_hub/core/utils/datautils.py,sha256=7YzG_IpMHj04zHl-r7mswOd3IzTQKJJdfmMBgm7VXWM,4082
 sdg_hub/core/utils/error_handling.py,sha256=yku8cGj_nKCyXDsnb-mHCpgukkkAMucJ4iAUrIzqysc,5510
 sdg_hub/core/utils/flow_id_words.yaml,sha256=5QHpQdP7zwahRuooyAlJIwBY7WcDR7vtbJXxVJqujbg,2317
 sdg_hub/core/utils/flow_identifier.py,sha256=aAHfK_G9AwEtMglLRMdMpi_AI1dciub5UqBGm4yb2HE,2841
-sdg_hub/core/utils/flow_metrics.py,sha256=3G-xbfr-rFA578wV4KUbQePTMVGZHr9-rXvyYL4Kt2Q,12604
+sdg_hub/core/utils/flow_metrics.py,sha256=84ihZHOwbxhqPTdnUXclytf5Tva-IoA1oKIruIXv0Eo,12650
 sdg_hub/core/utils/logger_config.py,sha256=6_cnsIHtSAdq1iTTZ7Q7nAJ1dmldlxSZ0AB49yLiQ20,2034
 sdg_hub/core/utils/path_resolution.py,sha256=yWof4kGNpQ5dKcrVHg0h9KfOKLZ6ROjdfsLAZsQT5rM,2000
-sdg_hub/core/utils/temp_manager.py,sha256=moSPWMxoDEw5FmeuwKTC8f3tYcarQDN0ozv0796CeGg,1484
 sdg_hub/core/utils/time_estimator.py,sha256=rM3_R-Ka5DEtvOtlJoA_5pXSyQ6tT6t4h6qh3_5BCZo,12639
 sdg_hub/core/utils/yaml_utils.py,sha256=tShCd-FFkp0xlKnLe7dXsMOR4AvT9d2qRUmu4ZnPSEY,1458
 sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -69,7 +67,7 @@ sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/j
 sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/atomic_facts_ja.yaml,sha256=OjPZaSCOSLxEWgW3pmNwF7mmLhGhFGTmKL_3rKdqeW4,2488
 sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/detailed_summary_ja.yaml,sha256=nEy_RcotHGiiENrmUANpKkbIFsrARAeSwECrBeHi2so,391
 sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/extractive_summary_ja.yaml,sha256=V90W0IeJQZTFThA8v0UOs3DtZbtU3BI9jkpChw1BULo,402
-sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml,sha256=Q6RusV-_HHMr5jlFNOP6UVuEf8d6btHENMOP3MnB3u0,9291
+sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml,sha256=U9DBWSKkYGGtwWQ39o8l7g-mLb93505APTEFePyzqIc,9312
 sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/generate_questions_responses_ja.yaml,sha256=96SQqXG7fmb-50SdX85sgVtrFcQ-oNKe_0BoQdZmY5g,2638
 sdg_hub/flows/text_analysis/__init__.py,sha256=WStks4eM_KHNTVsHglcj8vFghmI0PH9P1hUrijBLbwc,125
 sdg_hub/flows/text_analysis/structured_insights/__init__.py,sha256=_DT4NR05JD9CZoSWROPr2lC6se0VjSqQPZJJlEV79mk,274
@@ -78,8 +76,8 @@ sdg_hub/flows/text_analysis/structured_insights/extract_entities.yaml,sha256=Q_S
 sdg_hub/flows/text_analysis/structured_insights/extract_keywords.yaml,sha256=_nPPMdHnxag_lYbhYUjGJGo-CvRwWvwdGX7cQhdZ1S0,847
 sdg_hub/flows/text_analysis/structured_insights/flow.yaml,sha256=BBV18SdvuVTAESjwkJ7V1jbb-cSTBvNl3SCycd0oEQ4,4934
 sdg_hub/flows/text_analysis/structured_insights/summarize.yaml,sha256=WXwQak1pF8e1OwnOoI1EHu8QB6iUNW89rfkTdi1Oq54,687
-sdg_hub-0.5.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-sdg_hub-0.5.1.dist-info/METADATA,sha256=f5pTZHWrt0JQPHysvca3M7U7HU0Yus5jnGK8KrT2U-g,9775
-sdg_hub-0.5.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-sdg_hub-0.5.1.dist-info/top_level.txt,sha256=TqI7d-HE1n6zkXFkU0nF3A1Ct0P0pBaqI675uFokhx4,8
-sdg_hub-0.5.1.dist-info/RECORD,,
+sdg_hub-0.6.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sdg_hub-0.6.1.dist-info/METADATA,sha256=JQxLH1YwDrV5D1cAaaRziFFiF17buxN-fnyse5lQVV8,9584
+sdg_hub-0.6.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+sdg_hub-0.6.1.dist-info/top_level.txt,sha256=TqI7d-HE1n6zkXFkU0nF3A1Ct0P0pBaqI675uFokhx4,8
+sdg_hub-0.6.1.dist-info/RECORD,,