sdg-hub 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sdg_hub/_version.py +2 -2
- sdg_hub/core/blocks/base.py +60 -58
- sdg_hub/core/blocks/filtering/column_value_filter.py +29 -16
- sdg_hub/core/blocks/llm/__init__.py +0 -2
- sdg_hub/core/blocks/llm/llm_chat_block.py +42 -36
- sdg_hub/core/blocks/llm/llm_parser_block.py +13 -7
- sdg_hub/core/blocks/llm/prompt_builder_block.py +15 -10
- sdg_hub/core/blocks/llm/text_parser_block.py +14 -9
- sdg_hub/core/blocks/transform/duplicate_columns.py +9 -8
- sdg_hub/core/blocks/transform/index_based_mapper.py +29 -15
- sdg_hub/core/blocks/transform/json_structure_block.py +16 -13
- sdg_hub/core/blocks/transform/melt_columns.py +13 -12
- sdg_hub/core/blocks/transform/rename_columns.py +20 -9
- sdg_hub/core/blocks/transform/text_concat.py +20 -21
- sdg_hub/core/blocks/transform/uniform_col_val_setter.py +6 -5
- sdg_hub/core/flow/base.py +139 -57
- sdg_hub/core/flow/checkpointer.py +34 -36
- sdg_hub/core/flow/validation.py +4 -4
- sdg_hub/core/utils/datautils.py +52 -54
- sdg_hub/core/utils/flow_metrics.py +9 -6
- {sdg_hub-0.5.0.dist-info → sdg_hub-0.6.0.dist-info}/METADATA +2 -8
- {sdg_hub-0.5.0.dist-info → sdg_hub-0.6.0.dist-info}/RECORD +25 -26
- sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +0 -771
- {sdg_hub-0.5.0.dist-info → sdg_hub-0.6.0.dist-info}/WHEEL +0 -0
- {sdg_hub-0.5.0.dist-info → sdg_hub-0.6.0.dist-info}/licenses/LICENSE +0 -0
- {sdg_hub-0.5.0.dist-info → sdg_hub-0.6.0.dist-info}/top_level.txt +0 -0
sdg_hub/core/blocks/llm/text_parser_block.py:

```diff
@@ -9,10 +9,11 @@ start/end tags, custom regex patterns, and cleanup operations.
 from typing import Any, Optional
 import re
 
-# Third Party
-from datasets import Dataset
 from pydantic import Field, field_validator, model_validator
 
+# Third Party
+import pandas as pd
+
 # Local
 from ...utils.logger_config import setup_logger
 from ..base import BaseBlock
@@ -27,6 +28,8 @@ logger = setup_logger(__name__)
     "Parses and post-processes text content using tags or regex patterns",
 )
 class TextParserBlock(BaseBlock):
+    _flow_requires_jsonl_tmp: bool = True
+
     """Block for parsing and post-processing text content.
 
     This block handles text parsing using start/end tags, custom regex patterns,
@@ -117,12 +120,12 @@ class TextParserBlock(BaseBlock):
 
         return self
 
-    def _validate_custom(self, dataset: Dataset) -> None:
+    def _validate_custom(self, dataset: pd.DataFrame) -> None:
         """Validate TextParserBlock specific requirements.
 
         Parameters
         ----------
-        dataset : Dataset
+        dataset : pd.DataFrame
             The dataset to validate.
 
         Raises
@@ -311,13 +314,15 @@ class TextParserBlock(BaseBlock):
             )
             return []
 
-    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
+    def generate(self, samples: pd.DataFrame, **kwargs: Any) -> pd.DataFrame:
         logger.debug(f"Parsing outputs for {len(samples)} samples")
         if len(samples) == 0:
             logger.warning("No samples to parse, returning empty dataset")
-            return Dataset.from_list([])
+            return pd.DataFrame()
 
-        new_data = []
-        for sample in samples:
+        # Convert DataFrame to list of dicts to avoid iterrows and improve performance
+        samples_list = samples.to_dict("records")
+        new_data: list[dict] = []
+        for sample in samples_list:
             new_data.extend(self._generate(sample))
-        return Dataset.from_list(new_data)
+        return pd.DataFrame(new_data)
```
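The change that recurs across this release is visible here in full: block `generate()` methods now take and return `pd.DataFrame` instead of `datasets.Dataset`. For the parser, row fan-out (one raw model output can yield several parsed rows) goes through `to_dict("records")` plus a single `pd.DataFrame(new_data)` construction, mirroring the diff's own comment about avoiding `iterrows()`. A minimal sketch of that pattern, with a simple regex standing in for the block's actual `_generate` parsing logic (the `raw_output`/`answer` column names are hypothetical):

```python
import re

import pandas as pd


def parse_rows(samples: pd.DataFrame) -> pd.DataFrame:
    """Fan each input row out into zero or more parsed rows."""
    if len(samples) == 0:
        return pd.DataFrame()

    new_data: list[dict] = []
    # to_dict("records") yields plain dicts -- cheaper than the per-row
    # Series objects that iterrows() would allocate
    for sample in samples.to_dict("records"):
        # Stand-in for TextParserBlock._generate: pull every <answer> tag
        for match in re.findall(r"<answer>(.*?)</answer>", sample["raw_output"], re.DOTALL):
            new_data.append({**sample, "answer": match.strip()})
    return pd.DataFrame(new_data)


df = pd.DataFrame({"raw_output": ["<answer>4</answer><answer>four</answer>", "no tags"]})
print(parse_rows(df))  # two parsed rows from the first sample, none from the second
```

Building a list of dicts and constructing the frame once keeps the one-to-many expansion cheap even for large batches.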
sdg_hub/core/blocks/transform/duplicate_columns.py:

```diff
@@ -8,10 +8,11 @@ according to a mapping specification.
 # Standard
 from typing import Any
 
-# Third Party
-from datasets import Dataset
 from pydantic import field_validator
 
+# Third Party
+import pandas as pd
+
 # Local
 from ...utils.logger_config import setup_logger
 from ..base import BaseBlock
@@ -62,27 +63,27 @@ class DuplicateColumnsBlock(BaseBlock):
         if self.output_cols is None:
             self.output_cols = list(self.input_cols.values())
 
-    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
+    def generate(self, samples: pd.DataFrame, **kwargs: Any) -> pd.DataFrame:
         """Generate a dataset with duplicated columns.
 
         Parameters
         ----------
-        samples : Dataset
+        samples : pd.DataFrame
             Input dataset to duplicate columns from.
 
         Returns
         -------
-        Dataset
+        pd.DataFrame
             Dataset with additional duplicated columns.
         """
         # Create a copy to avoid modifying the original
-        result = samples
+        result = samples.copy()
 
         # Duplicate each column as specified in the mapping
         for source_col, target_col in self.input_cols.items():
-            if source_col not in result.column_names:
+            if source_col not in result.columns.tolist():
                 raise ValueError(f"Source column '{source_col}' not found in dataset")
 
-            result = result.add_column(target_col, result[source_col])
+            result[target_col] = result[source_col]
 
         return result
```
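The `Dataset.add_column` call becomes a plain column assignment on a defensive copy. A small sketch of the same behavior outside the block (the helper below is illustrative, not the block's API):

```python
import pandas as pd


def duplicate_columns(samples: pd.DataFrame, mapping: dict[str, str]) -> pd.DataFrame:
    """Copy each source column under a new name without touching the input."""
    result = samples.copy()  # skipping copy() would mutate the caller's frame in place
    for source_col, target_col in mapping.items():
        if source_col not in result.columns:
            raise ValueError(f"Source column '{source_col}' not found in dataset")
        result[target_col] = result[source_col]
    return result


df = pd.DataFrame({"question": ["q1", "q2"]})
out = duplicate_columns(df, {"question": "question_copy"})
print(out.columns.tolist())  # ['question', 'question_copy']
print(df.columns.tolist())   # ['question'] -- original unchanged
```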
sdg_hub/core/blocks/transform/index_based_mapper.py:

```diff
@@ -8,10 +8,11 @@ to another based on a choice column's value.
 # Standard
 from typing import Any
 
-# Third Party
-from datasets import Dataset
 from pydantic import Field, field_validator, model_validator
 
+# Third Party
+import pandas as pd
+
 # Local
 from ...utils.error_handling import MissingColumnError
 from ...utils.logger_config import setup_logger
@@ -103,12 +104,12 @@ class IndexBasedMapperBlock(BaseBlock):
         # Create mapping from choice_col to output_col for easy access
         self.choice_to_output_map = dict(zip(self.choice_cols, self.output_cols))
 
-    def _validate_custom(self, samples: Dataset) -> None:
+    def _validate_custom(self, samples: pd.DataFrame) -> None:
         """Validate that required columns exist in the dataset.
 
         Parameters
         ----------
-        samples : Dataset
+        samples : pd.DataFrame
             Input dataset to validate.
 
         Raises
@@ -120,29 +121,29 @@ class IndexBasedMapperBlock(BaseBlock):
         """
         # Check that all choice_cols exist
         missing_choice_cols = [
-            col for col in self.choice_cols if col not in samples.column_names
+            col for col in self.choice_cols if col not in samples.columns.tolist()
         ]
         if missing_choice_cols:
             raise MissingColumnError(
                 block_name=self.block_name,
                 missing_columns=missing_choice_cols,
-                available_columns=samples.column_names,
+                available_columns=samples.columns.tolist(),
             )
 
         # Check that all mapped columns exist
         mapped_cols = list(self.choice_map.values())
-        missing_cols = list(set(mapped_cols) - set(samples.column_names))
+        missing_cols = list(set(mapped_cols) - set(samples.columns.tolist()))
         if missing_cols:
             raise MissingColumnError(
                 block_name=self.block_name,
                 missing_columns=missing_cols,
-                available_columns=samples.column_names,
+                available_columns=samples.columns.tolist(),
             )
 
         # Check that all choice values in all choice columns have corresponding mappings
         all_unique_choices = set()
         for choice_col in self.choice_cols:
-            all_unique_choices.update(samples[choice_col])
+            all_unique_choices.update(samples[choice_col].unique())
 
         mapped_choices = set(self.choice_map.keys())
         unmapped_choices = all_unique_choices - mapped_choices
@@ -174,23 +175,23 @@ class IndexBasedMapperBlock(BaseBlock):
             sample[output_col] = sample[source_col]
         return sample
 
-    def generate(self, samples: Dataset, **kwargs) -> Dataset:
+    def generate(self, samples: pd.DataFrame, **kwargs) -> pd.DataFrame:
         """Generate a new dataset with selected values.
 
         Parameters
         ----------
-        samples : Dataset
+        samples : pd.DataFrame
             Input dataset to process.
 
         Returns
         -------
-        Dataset
+        pd.DataFrame
             Dataset with selected values stored in output column.
         """
         # Log the operation
         all_unique_choices = set()
         for choice_col in self.choice_cols:
-            all_unique_choices.update(samples[choice_col])
+            all_unique_choices.update(samples[choice_col].unique())
         mapped_choices = set(self.choice_map.keys())
 
         logger.info(
@@ -205,8 +206,21 @@ class IndexBasedMapperBlock(BaseBlock):
             },
         )
 
-        #
-        result = samples.
+        # Create a copy to avoid modifying the input
+        result = samples.copy()
+
+        # Handle empty DataFrame case
+        if len(result) == 0:
+            # Add empty output columns
+            for output_col in self.output_cols:
+                result[output_col] = []
+        else:
+            # Apply the mapping for each choice column and output column pair
+            for choice_col, output_col in self.choice_to_output_map.items():
+                # Map the choice values to source columns, then get values from those columns
+                result[output_col] = result.apply(
+                    lambda row: row[self.choice_map[row[choice_col]]], axis=1
+                )
 
         # Log completion
         logger.info(
```
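The interesting part is the double indirection in the new `apply`: each row's choice column holds a key into `choice_map`, which names the source column whose value lands in the output column. A toy example, with hypothetical column names:

```python
import pandas as pd

# choice_map: value found in the choice column -> source column to read from
choice_map = {"a": "response_a", "b": "response_b"}

df = pd.DataFrame(
    {
        "response_a": ["apple", "ant"],
        "response_b": ["banana", "bee"],
        "best": ["a", "b"],  # row 0 picks response_a, row 1 picks response_b
    }
)

# Same lookup as the block: choice value -> column name -> that row's value
df["chosen"] = df.apply(lambda row: row[choice_map[row["best"]]], axis=1)
print(df["chosen"].tolist())  # ['apple', 'bee']
```

The explicit empty-frame branch matters because `apply(..., axis=1)` on a zero-row frame would never run the lambda, so the output columns have to be created by hand.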
sdg_hub/core/blocks/transform/json_structure_block.py:

```diff
@@ -9,10 +9,11 @@ containing a structured JSON object with specified field names.
 from typing import Any, Dict
 import json
 
-# Third Party
-from datasets import Dataset
 from pydantic import Field, field_validator
 
+# Third Party
+import pandas as pd
+
 # Local
 from ...utils.logger_config import setup_logger
 from ..base import BaseBlock
@@ -90,17 +91,17 @@ class JSONStructureBlock(BaseBlock):
 
             raise ValueError("input_cols must be a list of column names")
 
-    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
+    def generate(self, samples: pd.DataFrame, **kwargs: Any) -> pd.DataFrame:
         """Generate a dataset with JSON structured output.
 
         Parameters
         ----------
-        samples : Dataset
+        samples : pd.DataFrame
             Input dataset to process.
 
         Returns
         -------
-        Dataset
+        pd.DataFrame
             Dataset with JSON structured output in the specified column.
         """
         if not self.output_cols:
@@ -109,17 +110,17 @@ class JSONStructureBlock(BaseBlock):
         output_col = self.output_cols[0]
         field_mapping = self._get_field_mapping()
 
-        def _create_json_structure(sample):
+        def _create_json_structure(row):
             """Create JSON structure from input columns."""
             json_obj = {}
 
             # Build the JSON object using the field mapping
             for json_field, col_name in field_mapping.items():
-                if col_name not in sample:
-                    logger.warning(f"Input column '{col_name}' not found in sample")
+                if col_name not in row.index:
+                    logger.warning(f"Input column '{col_name}' not found in row")
                     json_obj[json_field] = None
                 else:
-                    value = sample[col_name]
+                    value = row[col_name]
                     if self.ensure_json_serializable:
                         value = self._make_json_serializable(value)
                     json_obj[json_field] = value
@@ -130,13 +131,15 @@ class JSONStructureBlock(BaseBlock):
                     json_string = json.dumps(json_obj, indent=2, ensure_ascii=False)
                 else:
                     json_string = json.dumps(json_obj, ensure_ascii=False)
-                sample[output_col] = json_string
+                return json_string
             except (TypeError, ValueError) as e:
                 logger.error(f"Failed to serialize JSON object: {e}")
-                sample[output_col] = "{}"
+                return "{}"
 
-            return sample
+        # Create a copy to avoid modifying the input
+        result = samples.copy()
 
         # Apply the JSON structuring to all samples
-        result = samples.map(_create_json_structure)
+        result[output_col] = result.apply(_create_json_structure, axis=1)
+
         return result
```
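Because `apply(..., axis=1)` passes each row as a `pd.Series`, membership is tested against `row.index` and the serialization fallback returns a literal `"{}"` string instead of mutating a sample dict. A compact sketch of that per-row JSON assembly (function and column names are illustrative):

```python
import json

import pandas as pd


def json_column(samples: pd.DataFrame, field_mapping: dict[str, str]) -> pd.Series:
    """Serialize selected columns of each row into one JSON string per row."""

    def build(row: pd.Series) -> str:
        # A row's "columns" live in row.index, hence the membership test
        obj = {
            field: (row[col] if col in row.index else None)
            for field, col in field_mapping.items()
        }
        try:
            return json.dumps(obj, ensure_ascii=False)
        except (TypeError, ValueError):
            return "{}"  # same fallback string the block returns on failure

    return samples.apply(build, axis=1)


df = pd.DataFrame({"q": ["What is 2+2?"], "a": ["4"]})
print(json_column(df, {"question": "q", "answer": "a"}).iloc[0])
# {"question": "What is 2+2?", "answer": "4"}
```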
sdg_hub/core/blocks/transform/melt_columns.py:

```diff
@@ -8,10 +8,11 @@ by melting specified columns into rows.
 # Standard
 from typing import Any
 
-# Third Party
-from datasets import Dataset
 from pydantic import field_validator
 
+# Third Party
+import pandas as pd
+
 # Local
 from ...utils.error_handling import MissingColumnError
 from ...utils.logger_config import setup_logger
@@ -79,12 +80,12 @@ class MeltColumnsBlock(BaseBlock):
             self.input_cols if isinstance(self.input_cols, list) else [self.input_cols]
         )
 
-    def _validate_custom(self, samples: Dataset) -> None:
+    def _validate_custom(self, samples: pd.DataFrame) -> None:
         """Validate that required columns exist in the dataset.
 
         Parameters
         ----------
-        samples : Dataset
+        samples : pd.DataFrame
             Input dataset to validate.
 
         Raises
@@ -93,34 +94,34 @@ class MeltColumnsBlock(BaseBlock):
             If required columns are missing from the dataset.
         """
         # Check that all var_cols exist in the dataset
-        missing_cols = list(set(self.var_cols) - set(samples.column_names))
+        missing_cols = list(set(self.var_cols) - set(samples.columns.tolist()))
         if missing_cols:
             raise MissingColumnError(
                 block_name=self.block_name,
                 missing_columns=missing_cols,
-                available_columns=samples.column_names,
+                available_columns=samples.columns.tolist(),
             )
 
-    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
+    def generate(self, samples: pd.DataFrame, **kwargs: Any) -> pd.DataFrame:
         """Generate a flattened dataset in long format.
 
         Parameters
         ----------
-        samples : Dataset
+        samples : pd.DataFrame
             Input dataset to flatten.
 
         Returns
         -------
-        Dataset
+        pd.DataFrame
             Flattened dataset in long format with new variable and value columns.
         """
         # Use the original simple logic - just adapted to use derived attributes
-        df = samples.to_pandas()
-        id_cols = [col for col in samples.column_names if col not in self.var_cols]
+        df = samples
+        id_cols = [col for col in samples.columns.tolist() if col not in self.var_cols]
         flatten_df = df.melt(
             id_vars=id_cols,
             value_vars=self.var_cols,
             value_name=self.value_name,
             var_name=self.var_name,
         )
-        return Dataset.from_pandas(flatten_df)
+        return flatten_df
```
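`melt` treats every non-melted column as an identifier, so the block derives `id_cols` by set difference and can now skip the `to_pandas()`/`from_pandas()` round trip entirely. A worked example of the wide-to-long reshape, with hypothetical columns:

```python
import pandas as pd

df = pd.DataFrame(
    {
        "id": [1, 2],
        "summary_short": ["s1", "s2"],
        "summary_long": ["l1", "l2"],
    }
)

var_cols = ["summary_short", "summary_long"]
# Everything not being melted is kept as an identifier, as in generate()
id_cols = [c for c in df.columns if c not in var_cols]

long_df = df.melt(
    id_vars=id_cols,
    value_vars=var_cols,
    var_name="summary_type",
    value_name="summary",
)
print(long_df)
# 4 rows: each id appears once per melted column, with the old column
# name in 'summary_type' and its value in 'summary'
```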
sdg_hub/core/blocks/transform/rename_columns.py:

```diff
@@ -8,10 +8,11 @@ to a mapping specification.
 # Standard
 from typing import Any
 
-# Third Party
-from datasets import Dataset
 from pydantic import field_validator
 
+# Third Party
+import pandas as pd
+
 # Local
 from ...utils.logger_config import setup_logger
 from ..base import BaseBlock
@@ -52,28 +53,38 @@ class RenameColumnsBlock(BaseBlock):
         )
         return v
 
-    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
+    def generate(self, samples: pd.DataFrame, **kwargs: Any) -> pd.DataFrame:
         """Generate a dataset with renamed columns.
 
         Parameters
         ----------
-        samples : Dataset
+        samples : pd.DataFrame
             Input dataset to rename columns in.
 
         Returns
         -------
-        Dataset
+        pd.DataFrame
             Dataset with renamed columns.
 
         Raises
         ------
         ValueError
-            If attempting to rename to a column name that already exists
+            If attempting to rename to a column name that already exists,
+            or if the original column names don't exist in the dataset.
         """
+        # Check that all original column names exist in the dataset
+        existing_cols = set(samples.columns.tolist())
+        original_cols = set(self.input_cols.keys())
+
+        missing_cols = original_cols - existing_cols
+        if missing_cols:
+            raise ValueError(
+                f"Original column names {sorted(missing_cols)} not in the dataset"
+            )
+
         # Check for column name collisions
         # Strict validation: no target column name can be an existing column name
         # This prevents chained/circular renames which can be confusing
-        existing_cols = set(samples.column_names)
         target_cols = set(self.input_cols.values())
 
         collision = target_cols & existing_cols
@@ -84,5 +95,5 @@ class RenameColumnsBlock(BaseBlock):
             "Chained renames are not supported."
         )
 
-        # Rename columns using datasets method
-        return samples.rename_columns(self.input_cols)
+        # Rename columns using pandas method
+        return samples.rename(columns=self.input_cols)
```
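Note that `DataFrame.rename` silently ignores mapping keys that don't exist (unless `errors="raise"` is passed), which is presumably why 0.6.0 adds its own missing-column check with a clearer message. A sketch of the combined validation (standalone helper, not the block itself):

```python
import pandas as pd


def rename_strict(samples: pd.DataFrame, mapping: dict[str, str]) -> pd.DataFrame:
    """Rename columns, rejecting missing sources and target collisions."""
    existing = set(samples.columns)

    missing = set(mapping) - existing
    if missing:
        raise ValueError(f"Original column names {sorted(missing)} not in the dataset")

    collision = set(mapping.values()) & existing
    if collision:  # blocks chained/circular renames such as {'a': 'b', 'b': 'c'}
        raise ValueError(f"Target column names {sorted(collision)} already exist")

    return samples.rename(columns=mapping)


df = pd.DataFrame({"old_name": [1, 2]})
print(rename_strict(df, {"old_name": "new_name"}).columns.tolist())  # ['new_name']
```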
sdg_hub/core/blocks/transform/text_concat.py:

```diff
@@ -8,10 +8,11 @@ using a specified separator.
 # Standard
 from typing import Any
 
-# Third Party
-from datasets import Dataset
 from pydantic import Field, field_validator
 
+# Third Party
+import pandas as pd
+
 # Local
 from ...utils.logger_config import setup_logger
 from ..base import BaseBlock
@@ -65,17 +66,17 @@ class TextConcatBlock(BaseBlock):
             raise ValueError("TextConcatBlock requires exactly one output column")
         return v
 
-    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
+    def generate(self, samples: pd.DataFrame, **kwargs: Any) -> pd.DataFrame:
         """Generate a dataset with combined columns.
 
         Parameters
         ----------
-        samples : Dataset
+        samples : pd.DataFrame
             Input dataset to process.
 
         Returns
         -------
-        Dataset
+        pd.DataFrame
             Dataset with combined values stored in output column.
         """
         if not self.output_cols:
@@ -83,20 +84,18 @@ class TextConcatBlock(BaseBlock):
 
         output_col = self.output_cols[0]
 
-        def _combine_columns(sample):
-            """Combine the input columns into the output column."""
-            # Check that all input columns exist in the sample
-            for col in self.input_cols:
-                if col not in sample:
-                    raise ValueError(f"Input column '{col}' not found in sample")
-
-            # Convert values to strings and join with the separator
-            values = []
-            for col in self.input_cols:
-                values.append(str(sample[col]))
-            sample[output_col] = self.separator.join(values)
-            return sample
-
-        # Apply the combination to all samples
-        result = samples.map(_combine_columns)
+        # Validate that all input columns exist in the dataset
+        for col in self.input_cols:
+            if col not in samples.columns:
+                raise ValueError(f"Input column '{col}' not found in sample")
+
+        # Create a copy to avoid modifying the input
+        result = samples.copy()
+
+        # Combine columns using vectorized string operations
+        # Convert all input columns to strings and concatenate with separator
+        result[output_col] = (
+            result[self.input_cols].astype(str).agg(self.separator.join, axis=1)
+        )
+
 
         return result
```
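Row-wise concatenation is vectorized here: select the input columns, cast to `str`, then `agg` the separator's `join` across `axis=1`. For example:

```python
import pandas as pd

df = pd.DataFrame({"title": ["A", "B"], "body": ["first", "second"], "n": [1, 2]})

sep = "\n"
# astype(str) makes every column joinable; agg(sep.join, axis=1) receives
# each row as a Series of strings and joins it in one pass
df["document"] = df[["title", "body", "n"]].astype(str).agg(sep.join, axis=1)
print(df["document"].tolist())  # ['A\nfirst\n1', 'B\nsecond\n2']
```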
sdg_hub/core/blocks/transform/uniform_col_val_setter.py:

```diff
@@ -8,11 +8,12 @@ mode, min, max, mean, or median.
 # Standard
 from typing import Any, Literal
 
-# Third Party
-from datasets import Dataset
 from pydantic import field_validator
 import numpy as np
 
+# Third Party
+import pandas as pd
+
 # Local
 from ...utils.logger_config import setup_logger
 from ..base import BaseBlock
@@ -66,8 +67,8 @@ class UniformColumnValueSetter(BaseBlock):
         self.output_cols = []
         self.col_name = self.input_cols[0]
 
-    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
-        df = samples.to_pandas()
+    def generate(self, samples: pd.DataFrame, **kwargs: Any) -> pd.DataFrame:
+        df = samples.copy()
 
         if df.empty:
             raise ValueError("Cannot compute reduction for empty dataset")
@@ -98,4 +99,4 @@ class UniformColumnValueSetter(BaseBlock):
         )
 
         df[self.col_name] = value
-        return Dataset.from_pandas(df)
+        return df
```
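This block already worked on a pandas frame internally; 0.6.0 simply drops the conversion at the boundary and copies the incoming frame instead. The reduction is computed once and the scalar assignment broadcasts it to every row. A minimal sketch of that broadcast (the helper and reducer table are illustrative, not the block's API):

```python
import pandas as pd


def set_uniform(df: pd.DataFrame, col: str, reduction: str) -> pd.DataFrame:
    """Overwrite a column with one reduced value computed from it."""
    out = df.copy()
    if out.empty:
        raise ValueError("Cannot compute reduction for empty dataset")

    reducers = {
        "mode": lambda s: s.mode().iloc[0],  # mode() may return ties; take the first
        "min": lambda s: s.min(),
        "max": lambda s: s.max(),
        "mean": lambda s: s.mean(),
        "median": lambda s: s.median(),
    }
    out[col] = reducers[reduction](out[col])  # scalar broadcasts to every row
    return out


df = pd.DataFrame({"score": [1, 1, 3]})
print(set_uniform(df, "score", "mode")["score"].tolist())  # [1, 1, 1]
```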