data-designer 0.1.4__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
- data_designer/_version.py +2 -2
- data_designer/cli/README.md +15 -1
- data_designer/cli/commands/download.py +56 -0
- data_designer/cli/commands/list.py +4 -18
- data_designer/cli/controllers/__init__.py +2 -1
- data_designer/cli/controllers/download_controller.py +217 -0
- data_designer/cli/controllers/model_controller.py +4 -3
- data_designer/cli/forms/field.py +65 -19
- data_designer/cli/forms/model_builder.py +251 -44
- data_designer/cli/main.py +11 -1
- data_designer/cli/repositories/persona_repository.py +88 -0
- data_designer/cli/services/__init__.py +2 -1
- data_designer/cli/services/download_service.py +97 -0
- data_designer/cli/ui.py +131 -0
- data_designer/cli/utils.py +34 -0
- data_designer/config/analysis/__init__.py +2 -0
- data_designer/config/analysis/column_profilers.py +75 -7
- data_designer/config/analysis/column_statistics.py +192 -48
- data_designer/config/analysis/dataset_profiler.py +23 -5
- data_designer/config/analysis/utils/reporting.py +3 -3
- data_designer/config/base.py +3 -3
- data_designer/config/column_configs.py +27 -6
- data_designer/config/column_types.py +24 -17
- data_designer/config/config_builder.py +34 -26
- data_designer/config/data_designer_config.py +7 -7
- data_designer/config/datastore.py +6 -6
- data_designer/config/default_model_settings.py +27 -34
- data_designer/config/exports.py +14 -1
- data_designer/config/models.py +155 -29
- data_designer/config/preview_results.py +5 -4
- data_designer/config/processors.py +109 -4
- data_designer/config/sampler_constraints.py +1 -2
- data_designer/config/sampler_params.py +31 -31
- data_designer/config/seed.py +1 -2
- data_designer/config/utils/code_lang.py +4 -5
- data_designer/config/utils/constants.py +31 -8
- data_designer/config/utils/io_helpers.py +5 -5
- data_designer/config/utils/misc.py +1 -4
- data_designer/config/utils/numerical_helpers.py +2 -2
- data_designer/config/utils/type_helpers.py +3 -3
- data_designer/config/utils/validation.py +39 -9
- data_designer/config/utils/visualization.py +62 -15
- data_designer/config/validator_params.py +4 -8
- data_designer/engine/analysis/column_profilers/base.py +0 -7
- data_designer/engine/analysis/column_profilers/judge_score_profiler.py +2 -3
- data_designer/engine/analysis/column_statistics.py +16 -16
- data_designer/engine/analysis/dataset_profiler.py +25 -4
- data_designer/engine/analysis/utils/column_statistics_calculations.py +71 -49
- data_designer/engine/analysis/utils/judge_score_processing.py +5 -5
- data_designer/engine/column_generators/generators/base.py +34 -0
- data_designer/engine/column_generators/generators/embedding.py +45 -0
- data_designer/engine/column_generators/generators/{llm_generators.py → llm_completion.py} +17 -49
- data_designer/engine/column_generators/registry.py +4 -2
- data_designer/engine/column_generators/utils/judge_score_factory.py +5 -6
- data_designer/engine/configurable_task.py +2 -2
- data_designer/engine/dataset_builders/artifact_storage.py +14 -5
- data_designer/engine/dataset_builders/column_wise_builder.py +12 -8
- data_designer/engine/dataset_builders/utils/concurrency.py +6 -6
- data_designer/engine/models/facade.py +66 -9
- data_designer/engine/models/litellm_overrides.py +5 -6
- data_designer/engine/models/parsers/errors.py +2 -4
- data_designer/engine/models/parsers/parser.py +2 -3
- data_designer/engine/models/parsers/postprocessors.py +3 -4
- data_designer/engine/models/parsers/types.py +4 -4
- data_designer/engine/models/registry.py +20 -11
- data_designer/engine/models/usage.py +7 -9
- data_designer/engine/processing/ginja/ast.py +1 -2
- data_designer/engine/processing/processors/drop_columns.py +1 -1
- data_designer/engine/processing/processors/registry.py +3 -0
- data_designer/engine/processing/processors/schema_transform.py +53 -0
- data_designer/engine/processing/utils.py +40 -2
- data_designer/engine/registry/base.py +12 -12
- data_designer/engine/sampling_gen/constraints.py +1 -2
- data_designer/engine/sampling_gen/data_sources/base.py +14 -14
- data_designer/engine/sampling_gen/entities/phone_number.py +1 -2
- data_designer/engine/sampling_gen/people_gen.py +3 -7
- data_designer/engine/validators/base.py +2 -2
- data_designer/interface/data_designer.py +12 -0
- data_designer/interface/results.py +36 -0
- data_designer/logging.py +2 -2
- data_designer/plugin_manager.py +3 -3
- data_designer/plugins/plugin.py +3 -3
- data_designer/plugins/registry.py +2 -2
- {data_designer-0.1.4.dist-info → data_designer-0.2.0.dist-info}/METADATA +9 -9
- {data_designer-0.1.4.dist-info → data_designer-0.2.0.dist-info}/RECORD +88 -81
- {data_designer-0.1.4.dist-info → data_designer-0.2.0.dist-info}/WHEEL +0 -0
- {data_designer-0.1.4.dist-info → data_designer-0.2.0.dist-info}/entry_points.txt +0 -0
- {data_designer-0.1.4.dist-info → data_designer-0.2.0.dist-info}/licenses/LICENSE +0 -0
data_designer/config/validator_params.py

@@ -2,7 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from enum import Enum
-from typing import Any
+from typing import Any
 
 from pydantic import Field, field_serializer, model_validator
 from typing_extensions import Self, TypeAlias

@@ -51,7 +51,7 @@ class LocalCallableValidatorParams(ConfigBase):
     validation_function: Any = Field(
         description="Function (Callable[[pd.DataFrame], pd.DataFrame]) to validate the data"
     )
-    output_schema:
+    output_schema: dict[str, Any] | None = Field(
         default=None, description="Expected schema for local callable validator's output"
     )
 

@@ -80,7 +80,7 @@ class RemoteValidatorParams(ConfigBase):
     """
 
     endpoint_url: str = Field(description="URL of the remote endpoint")
-    output_schema:
+    output_schema: dict[str, Any] | None = Field(
         default=None, description="Expected schema for remote validator's output"
     )
     timeout: float = Field(default=30.0, gt=0, description="The timeout for the HTTP request")

@@ -89,8 +89,4 @@ class RemoteValidatorParams(ConfigBase):
     max_parallel_requests: int = Field(default=4, ge=1, description="The maximum number of parallel requests to make")
 
 
-ValidatorParamsT: TypeAlias =
-    CodeValidatorParams,
-    LocalCallableValidatorParams,
-    RemoteValidatorParams,
-]
+ValidatorParamsT: TypeAlias = CodeValidatorParams | LocalCallableValidatorParams | RemoteValidatorParams
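The `ValidatorParamsT` hunk above is representative of a release-wide cleanup: `typing.Union[...]` blocks are rewritten as PEP 604 `|` unions. A minimal standalone sketch of the pattern (illustrative class names and fields, not the package's full definitions; requires Python 3.10+):

```python
from typing import TypeAlias

from pydantic import BaseModel, Field


class CodeValidatorParams(BaseModel):
    code_lang: str = "python"


class RemoteValidatorParams(BaseModel):
    endpoint_url: str
    timeout: float = Field(default=30.0, gt=0)


# PEP 604 syntax; equivalent to Union[CodeValidatorParams, RemoteValidatorParams]
ValidatorParamsT: TypeAlias = CodeValidatorParams | RemoteValidatorParams


def describe(params: ValidatorParamsT) -> str:
    # isinstance narrowing works with | unions exactly as with typing.Union
    if isinstance(params, RemoteValidatorParams):
        return f"remote validator at {params.endpoint_url}"
    return "local code validator"


print(describe(RemoteValidatorParams(endpoint_url="https://example.com/validate")))
```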
data_designer/engine/analysis/column_profilers/base.py

@@ -7,7 +7,6 @@ import logging
 from abc import ABC, abstractmethod
 
 import pandas as pd
-import pyarrow as pa
 from pydantic import BaseModel, model_validator
 from typing_extensions import Self
 

@@ -29,12 +28,6 @@ class ColumnConfigWithDataFrame(ConfigBase):
             raise ValueError(f"Column {self.column_config.name!r} not found in DataFrame")
         return self
 
-    @model_validator(mode="after")
-    def ensure_pyarrow_backend(self) -> Self:
-        if not all(isinstance(dtype, pd.ArrowDtype) for dtype in self.df.dtypes):
-            self.df = pa.Table.from_pandas(self.df).to_pandas(types_mapper=pd.ArrowDtype)
-        return self
-
     def as_tuple(self) -> tuple[SingleColumnConfig, pd.DataFrame]:
         return (self.column_config, self.df)
 
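The deleted `ensure_pyarrow_backend` hook was a pydantic `model_validator(mode="after")` that mutated the model after construction; that conversion now happens once in the dataset profiler (see the `_convert_to_pyarrow_backend_if_needed` hunk further down). For reference, a self-contained sketch of the after-validator pattern itself, with hypothetical names:

```python
from pydantic import BaseModel, model_validator
from typing_extensions import Self


class Interval(BaseModel):
    low: float
    high: float

    @model_validator(mode="after")
    def ensure_ordered(self) -> Self:
        # Runs after field validation; may normalize state before returning self.
        if self.low > self.high:
            self.low, self.high = self.high, self.low
        return self


print(Interval(low=5.0, high=1.0))  # low=1.0 high=5.0
```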
data_designer/engine/analysis/column_profilers/judge_score_profiler.py

@@ -5,7 +5,6 @@ from __future__ import annotations
 
 import logging
 import random
-from typing import Union
 
 from data_designer.config.analysis.column_profilers import (
     JudgeScoreProfilerConfig,

@@ -69,7 +68,7 @@ class JudgeScoreProfiler(ColumnProfiler[JudgeScoreProfilerConfig]):
         )
 
         for score in column_config.scores:
-            score_name = score.name
+            score_name = score.name
             logger.info(f"{random.choice(['👩⚖️', '👨⚖️'])} Summarizing LLM-as-judge score: '{score_name}'")
             score_sample = sample_scores_and_reasoning(
                 scores=score_distributions.scores[score_name],

@@ -96,7 +95,7 @@
         name: str,
         sample: list[JudgeScoreSample],
         histogram: CategoricalHistogramData,
-        distribution:
+        distribution: CategoricalDistribution | NumericalDistribution | MissingValue,
         distribution_type: ColumnDistributionType,
     ) -> JudgeScoreSummary:
         if isinstance(distribution, MissingValue) or not sample:
data_designer/engine/analysis/column_statistics.py

@@ -4,7 +4,7 @@
 from __future__ import annotations
 
 import logging
-from typing import Any,
+from typing import Any, TypeAlias
 
 import pandas as pd
 from pydantic import BaseModel

@@ -41,7 +41,7 @@ class GeneralColumnStatisticsCalculator(BaseModel):
         return self.column_config_with_df.df
 
     @property
-    def column_statistics_type(self) ->
+    def column_statistics_type(self) -> type[ColumnStatisticsT]:
         return DEFAULT_COLUMN_STATISTICS_MAP.get(self.column_config.column_type, GeneralColumnStatistics)
 
     def calculate(self) -> Self:

@@ -59,7 +59,7 @@ class GeneralColumnStatisticsCalculator(BaseModel):
         )
 
     def calculate_general_column_info(self) -> dict[str, Any]:
-        return calculate_general_column_info(self.column_config, self.df)
+        return calculate_general_column_info(self.column_config.name, self.df)
 
     def __repr__(self) -> str:
         params = []

@@ -93,7 +93,7 @@ class SamplerColumnStatisticsCalculator(GeneralColumnStatisticsCalculator):
         return (
             {
                 "sampler_type": SamplerType(self.column_config.sampler_type),
-                **calculate_column_distribution(self.column_config, self.df, dist_type),
+                **calculate_column_distribution(self.column_config.name, self.df, dist_type),
             }
             if make_dist
             else {

@@ -109,23 +109,23 @@ class SeedDatasetColumnStatisticsCalculator(GeneralColumnStatisticsCalculator):
 
 class ValidationColumnStatisticsCalculator(GeneralColumnStatisticsCalculator):
     def calculate_validation_column_info(self) -> dict[str, Any]:
-        return calculate_validation_column_info(self.column_config, self.df)
+        return calculate_validation_column_info(self.column_config.name, self.df)
 
 
 class ExpressionColumnStatisticsCalculator(GeneralColumnStatisticsCalculator): ...
 
 
-ColumnStatisticsCalculatorT: TypeAlias =
-    ExpressionColumnStatisticsCalculator
-    ValidationColumnStatisticsCalculator
-    GeneralColumnStatisticsCalculator
-    LLMCodeColumnStatisticsCalculator
-    LLMJudgedColumnStatisticsCalculator
-    LLMStructuredColumnStatisticsCalculator
-    LLMTextColumnStatisticsCalculator
-    SamplerColumnStatisticsCalculator
-    SeedDatasetColumnStatisticsCalculator
-
+ColumnStatisticsCalculatorT: TypeAlias = (
+    ExpressionColumnStatisticsCalculator
+    | ValidationColumnStatisticsCalculator
+    | GeneralColumnStatisticsCalculator
+    | LLMCodeColumnStatisticsCalculator
+    | LLMJudgedColumnStatisticsCalculator
+    | LLMStructuredColumnStatisticsCalculator
+    | LLMTextColumnStatisticsCalculator
+    | SamplerColumnStatisticsCalculator
+    | SeedDatasetColumnStatisticsCalculator
+)
 DEFAULT_COLUMN_STATISTICS_CALCULATOR_MAP = {
     DataDesignerColumnType.EXPRESSION: ExpressionColumnStatisticsCalculator,
     DataDesignerColumnType.VALIDATION: ValidationColumnStatisticsCalculator,
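`DEFAULT_COLUMN_STATISTICS_MAP` and `DEFAULT_COLUMN_STATISTICS_CALCULATOR_MAP` both use the enum-keyed dispatch-with-fallback idiom that `column_statistics_type` relies on: look the column type up in a dict and fall back to a general implementation. A minimal sketch with hypothetical names:

```python
from enum import Enum


class ColumnType(str, Enum):
    EXPRESSION = "expression"
    LLM_TEXT = "llm-text"


class GeneralCalculator:
    def describe(self) -> str:
        return "general statistics"


class ExpressionCalculator(GeneralCalculator):
    def describe(self) -> str:
        return "expression statistics"


# Enum-keyed dispatch table; .get() with a default supplies the fallback.
CALCULATOR_MAP: dict[ColumnType, type[GeneralCalculator]] = {
    ColumnType.EXPRESSION: ExpressionCalculator,
}


def get_calculator(column_type: ColumnType) -> GeneralCalculator:
    return CALCULATOR_MAP.get(column_type, GeneralCalculator)()


print(get_calculator(ColumnType.EXPRESSION).describe())  # expression statistics
print(get_calculator(ColumnType.LLM_TEXT).describe())    # general statistics
```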
data_designer/engine/analysis/dataset_profiler.py

@@ -6,6 +6,7 @@ from collections.abc import Sequence
 from functools import cached_property
 
 import pandas as pd
+import pyarrow as pa
 from pydantic import Field, field_validator
 
 from data_designer.config.analysis.column_profilers import ColumnProfilerConfigT

@@ -19,10 +20,8 @@ from data_designer.config.column_types import (
 from data_designer.engine.analysis.column_profilers.base import ColumnConfigWithDataFrame, ColumnProfiler
 from data_designer.engine.analysis.column_statistics import get_column_statistics_calculator
 from data_designer.engine.analysis.errors import DatasetProfilerConfigurationError
-from data_designer.engine.
-
-    MultiColumnConfig,
-)
+from data_designer.engine.analysis.utils.column_statistics_calculations import has_pyarrow_backend
+from data_designer.engine.dataset_builders.multi_column_configs import DatasetBuilderColumnConfigT, MultiColumnConfig
 from data_designer.engine.registry.data_designer_registry import DataDesignerRegistry
 from data_designer.engine.resources.resource_provider import ResourceProvider
 

@@ -68,6 +67,7 @@ class DataDesignerDatasetProfiler:
         logger.info("📐 Measuring dataset column statistics:")
 
         self._validate_schema_consistency(list(dataset.columns))
+        dataset = self._convert_to_pyarrow_backend_if_needed(dataset)
 
         column_statistics = []
         for c in self.config.column_configs:

@@ -100,6 +100,27 @@ class DataDesignerDatasetProfiler:
             column_profiles=column_profiles if column_profiles else None,
         )
 
+    def _convert_to_pyarrow_backend_if_needed(self, dataset: pd.DataFrame) -> pd.DataFrame:
+        if not has_pyarrow_backend(dataset):
+            try:
+                dataset = pa.Table.from_pandas(dataset).to_pandas(types_mapper=pd.ArrowDtype)
+            except Exception as e:
+                # For ArrowTypeError, the second arg contains the more informative message
+                if isinstance(e, pa.lib.ArrowTypeError) and len(e.args) > 1:
+                    error_msg = str(e.args[1])
+                else:
+                    error_msg = str(e)
+                for col in dataset.columns:
+                    # Make sure column names are clear in the error message
+                    error_msg = error_msg.replace(col, f"'{col}'")
+                logger.warning("⚠️ Unable to convert the dataset to a PyArrow backend")
+                logger.warning(f" |-- Conversion Error Message: {error_msg}")
+                logger.warning(" |-- This is often due to at least one column having mixed data types")
+                logger.warning(
+                    " |-- Note: Reported data types will be inferred from the first non-null value of each column"
+                )
+        return dataset
+
     def _create_column_profiler(self, profiler_config: ColumnProfilerConfigT) -> ColumnProfiler:
         return self.registry.column_profilers.get_for_config_type(type(profiler_config))(
             config=profiler_config, resource_provider=self.resource_provider
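The conversion attempted by the new `_convert_to_pyarrow_backend_if_needed` is the standard pandas/PyArrow round trip. A self-contained sketch of both the success path and the mixed-type failure mode the warnings describe:

```python
import pandas as pd
import pyarrow as pa

# Uniformly typed columns convert without issue.
df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
converted = pa.Table.from_pandas(df).to_pandas(types_mapper=pd.ArrowDtype)
assert all(isinstance(dtype, pd.ArrowDtype) for dtype in converted.dtypes)
print(converted.dtypes)  # Arrow-backed dtypes, e.g. int64[pyarrow]

# A mixed-type object column is the failure mode the warnings guard against.
mixed = pd.DataFrame({"c": [1, "two", 3.0]})
try:
    pa.Table.from_pandas(mixed)
except (pa.ArrowInvalid, pa.lib.ArrowTypeError) as e:
    print(f"conversion failed: {e}")
```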
data_designer/engine/analysis/utils/column_statistics_calculations.py

@@ -20,10 +20,8 @@ from data_designer.config.analysis.column_statistics import (
 )
 from data_designer.config.column_configs import (
     LLMTextColumnConfig,
-    SingleColumnConfig,
-    ValidationColumnConfig,
 )
-from data_designer.engine.column_generators.
+from data_designer.engine.column_generators.utils.prompt_renderer import (
     PromptType,
     RecordBasedPromptRenderer,
     create_response_recipe,

@@ -39,41 +37,54 @@ logger = logging.getLogger(__name__)
 
 
 def calculate_column_distribution(
-
+    column_name: str, df: pd.DataFrame, distribution_type: ColumnDistributionType
 ) -> dict[str, CategoricalDistribution | NumericalDistribution | MissingValue | None]:
     distribution_type = ColumnDistributionType(distribution_type)
     try:
         if distribution_type == ColumnDistributionType.CATEGORICAL:
             return {
                 "distribution_type": ColumnDistributionType.CATEGORICAL,
-                "distribution": CategoricalDistribution.from_series(df[
+                "distribution": CategoricalDistribution.from_series(df[column_name]),
             }
 
         if distribution_type == ColumnDistributionType.NUMERICAL:
             return {
                 "distribution_type": ColumnDistributionType.NUMERICAL,
-                "distribution": NumericalDistribution.from_series(df[
+                "distribution": NumericalDistribution.from_series(df[column_name]),
             }
     except Exception as e:
-        logger.warning(f"{WARNING_PREFIX} failed to calculate column distribution for '{
+        logger.warning(f"{WARNING_PREFIX} failed to calculate column distribution for '{column_name}' {e}")
     return {
         "distribution_type": ColumnDistributionType.UNKNOWN,
         "distribution": MissingValue.CALCULATION_FAILED,
     }
 
 
-def calculate_general_column_info(
+def calculate_general_column_info(column_name: str, df: pd.DataFrame) -> dict[str, Any]:
     try:
-        _df = pd.DataFrame(df[
+        _df = pd.DataFrame(df[column_name].apply(ensure_hashable))
+
+        if has_pyarrow_backend(df):
+            pyarrow_dtype = str(df[column_name].dtype.pyarrow_dtype)
+            simple_dtype = convert_pyarrow_dtype_to_simple_dtype(df[column_name].dtype.pyarrow_dtype)
+        else:
+            # We do not log a warning at the column-level because it would be too noisy.
+            # However, there is a logged warning at the dataset-profiler level.
+            try:
+                simple_dtype = get_column_data_type_from_first_non_null_value(column_name, df)
+            except Exception:
+                simple_dtype = MissingValue.CALCULATION_FAILED
+            pyarrow_dtype = "n/a"
+
         return {
-            "pyarrow_dtype":
-            "simple_dtype":
-            "num_records": len(_df[
-            "num_null": _df[
-            "num_unique": _df[
+            "pyarrow_dtype": pyarrow_dtype,
+            "simple_dtype": simple_dtype,
+            "num_records": len(_df[column_name]),
+            "num_null": _df[column_name].isnull().sum(),
+            "num_unique": _df[column_name].nunique(),
         }
     except Exception as e:
-        logger.warning(f"{WARNING_PREFIX} failed to calculate general column info for '{
+        logger.warning(f"{WARNING_PREFIX} failed to calculate general column info for '{column_name}': {e}")
         return {
             "pyarrow_dtype": MissingValue.CALCULATION_FAILED,
             "simple_dtype": MissingValue.CALCULATION_FAILED,

@@ -83,7 +94,7 @@ def calculate_general_column_info(column_config: SingleColumnConfig, df: pd.Data
     }
 
 
-def
+def calculate_input_token_stats(
     column_config: LLMTextColumnConfig, df: pd.DataFrame
 ) -> dict[str, float | MissingValue]:
     try:
@@ -100,22 +111,20 @@ def calculate_prompt_token_stats(
             concatenated_prompt = str(system_prompt + "\n\n" + prompt)
             num_tokens.append(len(TOKENIZER.encode(concatenated_prompt, disallowed_special=())))
     except Exception as e:
-        logger.warning(
-            f"{WARNING_PREFIX} failed to calculate prompt token stats for column {column_config.name!r}: {e}"
-        )
+        logger.warning(f"{WARNING_PREFIX} failed to calculate input token stats for column {column_config.name!r}: {e}")
         return {
-            "
-            "
-            "
+            "input_tokens_mean": MissingValue.CALCULATION_FAILED,
+            "input_tokens_median": MissingValue.CALCULATION_FAILED,
+            "input_tokens_stddev": MissingValue.CALCULATION_FAILED,
         }
     return {
-        "
-        "
-        "
+        "input_tokens_mean": np.mean(num_tokens),
+        "input_tokens_median": np.median(num_tokens),
+        "input_tokens_stddev": np.std(num_tokens),
     }
 
 
-def
+def calculate_output_token_stats(
     column_config: LLMTextColumnConfig, df: pd.DataFrame
 ) -> dict[str, float | MissingValue]:
     try:

@@ -123,34 +132,32 @@ def calculate_completion_token_stats(
             lambda value: len(TOKENIZER.encode(str(value), disallowed_special=()))
         )
         return {
-            "
-            "
-            "
+            "output_tokens_mean": tokens_per_record.mean(),
+            "output_tokens_median": tokens_per_record.median(),
+            "output_tokens_stddev": tokens_per_record.std(),
         }
     except Exception as e:
-        logger.warning(
-            f"{WARNING_PREFIX} failed to calculate completion token stats for column {column_config.name}: {e}"
-        )
+        logger.warning(f"{WARNING_PREFIX} failed to calculate output token stats for column {column_config.name}: {e}")
         return {
-            "
-            "
-            "
+            "output_tokens_mean": MissingValue.CALCULATION_FAILED,
+            "output_tokens_median": MissingValue.CALCULATION_FAILED,
+            "output_tokens_stddev": MissingValue.CALCULATION_FAILED,
        }
 
 
 def calculate_token_stats(column_config: LLMTextColumnConfig, df: pd.DataFrame) -> dict[str, float | MissingValue]:
     return {
-        **
-        **
+        **calculate_input_token_stats(column_config, df),
+        **calculate_output_token_stats(column_config, df),
     }
 
 
-def calculate_validation_column_info(
+def calculate_validation_column_info(column_name: str, df: pd.DataFrame) -> dict[str, Any]:
     try:
-        return {"num_valid_records": df[
+        return {"num_valid_records": df[column_name].apply(lambda x: ensure_boolean(x["is_valid"])).sum()}
     except Exception as e:
         logger.warning(
-            f"{WARNING_PREFIX} failed to calculate code validation column info for column {
+            f"{WARNING_PREFIX} failed to calculate code validation column info for column {column_name}: {e}"
         )
         return {"num_valid_records": MissingValue.CALCULATION_FAILED}
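The token statistics rely on a `TOKENIZER.encode(..., disallowed_special=())` call in the tiktoken style. A standalone sketch of the mean/median/stddev calculation, assuming the `tiktoken` package and the `cl100k_base` encoding (the diff does not show which encoding the package actually loads):

```python
import numpy as np
import pandas as pd
import tiktoken

TOKENIZER = tiktoken.get_encoding("cl100k_base")

df = pd.DataFrame({"response": ["short answer", "a somewhat longer generated answer", "ok"]})

# Token count per record; disallowed_special=() encodes special-token text as plain text.
tokens_per_record = df["response"].apply(
    lambda value: len(TOKENIZER.encode(str(value), disallowed_special=()))
)

print({
    "output_tokens_mean": float(np.mean(tokens_per_record)),
    "output_tokens_median": float(np.median(tokens_per_record)),
    "output_tokens_stddev": float(np.std(tokens_per_record)),
})
```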
@@ -160,22 +167,33 @@ def convert_pyarrow_dtype_to_simple_dtype(pyarrow_dtype: pa.DataType) -> str:
         return f"list[{convert_pyarrow_dtype_to_simple_dtype(pyarrow_dtype.value_type)}]"
     if isinstance(pyarrow_dtype, pa.StructType):
         return "dict"
-
-
+    return convert_to_simple_dtype(str(pyarrow_dtype))
+
+
+def convert_to_simple_dtype(dtype: str) -> str:
+    if "int" in dtype:
         return "int"
-    if "double" in
+    if "double" in dtype:
         return "float"
-    if "float" in
+    if "float" in dtype:
         return "float"
-    if "
+    if "str" in dtype:
         return "string"
-    if "timestamp" in
+    if "timestamp" in dtype:
         return "timestamp"
-    if "time" in
+    if "time" in dtype:
         return "time"
-    if "date" in
+    if "date" in dtype:
         return "date"
-    return
+    return dtype
+
+
+def get_column_data_type_from_first_non_null_value(column_name: str, df: pd.DataFrame) -> str:
+    df_no_nulls = df[column_name].dropna()
+    if len(df_no_nulls) == 0:
+        return MissingValue.CALCULATION_FAILED
+    dtype = type(df_no_nulls.iloc[0]).__name__
+    return convert_to_simple_dtype(dtype)
 
 
 def ensure_hashable(x: Any) -> str:
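Because `convert_to_simple_dtype` operates on plain dtype strings, the same mapping serves both the PyArrow path (via `str(pyarrow_dtype)`) and the new first-non-null-value fallback (via Python type names such as `str`). A few illustrative calls, assuming the function as reconstructed above:

```python
print(convert_to_simple_dtype("int64"))          # "int"   (PyArrow dtype string)
print(convert_to_simple_dtype("double"))         # "float"
print(convert_to_simple_dtype("timestamp[us]"))  # "timestamp" (checked before "time")
print(convert_to_simple_dtype("str"))            # "string" (Python type name fallback)
print(convert_to_simple_dtype("bool"))           # "bool"  (no match; returned unchanged)
```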
@@ -207,3 +225,7 @@ def ensure_boolean(v: bool | str | int | None) -> bool:
     if v is None:
         return False
     raise ValueError(f"Invalid boolean value: {v}")
+
+
+def has_pyarrow_backend(df: pd.DataFrame) -> bool:
+    return all(isinstance(dtype, pd.ArrowDtype) for dtype in df.dtypes)
data_designer/engine/analysis/utils/judge_score_processing.py

@@ -3,7 +3,7 @@
 
 import logging
 from collections import defaultdict
-from typing import Any
+from typing import Any
 
 import pandas as pd
 

@@ -21,7 +21,7 @@ logger = logging.getLogger(__name__)
 
 def extract_judge_score_distributions(
     column_config: LLMJudgeColumnConfig, df: pd.DataFrame
-) ->
+) -> JudgeScoreDistributions | MissingValue:
     scores = defaultdict(list)
     reasoning = defaultdict(list)
 

@@ -32,7 +32,7 @@ def extract_judge_score_distributions(
 
     for score in column_config.scores:
         is_numerical = True
-        name = score.name
+        name = score.name
         for results in df[column_config.name]:
             try:
                 score = results[name].get("score", None)

@@ -79,10 +79,10 @@ def extract_judge_score_distributions(
 
 
 def sample_scores_and_reasoning(
-    scores: list[
+    scores: list[int | str],
     reasoning: list[str],
     num_samples: int,
-    random_seed:
+    random_seed: int | None = None,
 ) -> list[JudgeScoreSample]:
     if len(scores) != len(reasoning):
         raise ValueError("scores and reasoning must have the same length")
data_designer/engine/column_generators/generators/base.py

@@ -1,13 +1,20 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
+import functools
+import logging
 from abc import ABC, abstractmethod
 from typing import overload
 
 import pandas as pd
 
+from data_designer.config.column_types import COLUMN_TYPE_EMOJI_MAP
+from data_designer.config.models import BaseInferenceParams, ModelConfig
 from data_designer.config.utils.type_helpers import StrEnum
 from data_designer.engine.configurable_task import ConfigurableTask, ConfigurableTaskMetadata, DataT, TaskConfigT
+from data_designer.engine.models.facade import ModelFacade
+
+logger = logging.getLogger(__name__)
 
 
 class GenerationStrategy(StrEnum):

@@ -59,3 +66,30 @@ class FromScratchColumnGenerator(ColumnGenerator[TaskConfigT], ABC):
 
     @abstractmethod
     def generate_from_scratch(self, num_records: int) -> pd.DataFrame: ...
+
+
+class WithModelGeneration:
+    @functools.cached_property
+    def model(self) -> ModelFacade:
+        return self.resource_provider.model_registry.get_model(model_alias=self.config.model_alias)
+
+    @functools.cached_property
+    def model_config(self) -> ModelConfig:
+        return self.resource_provider.model_registry.get_model_config(model_alias=self.config.model_alias)
+
+    @functools.cached_property
+    def inference_parameters(self) -> BaseInferenceParams:
+        return self.model_config.inference_parameters
+
+    def log_pre_generation(self) -> None:
+        emoji = COLUMN_TYPE_EMOJI_MAP[self.config.column_type]
+        logger.info(f"{emoji} Preparing {self.config.column_type} column generation")
+        logger.info(f" |-- column name: {self.config.name!r}")
+        logger.info(f" |-- model config:\n{self.model_config.model_dump_json(indent=4)}")
+        if self.model_config.provider is None:
+            logger.info(f" |-- default model provider: {self._get_provider_name()!r}")
+
+    def _get_provider_name(self) -> str:
+        model_alias = self.model_config.alias
+        provider = self.resource_provider.model_registry.get_model_provider(model_alias=model_alias)
+        return provider.name
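`WithModelGeneration` is a bare mixin: it assumes the host generator class supplies `config` and `resource_provider`, and it memoizes registry lookups with `functools.cached_property`. A self-contained sketch of that pattern with hypothetical names:

```python
import functools


class FakeRegistry:
    def get_model(self, model_alias: str) -> str:
        print(f"looking up {model_alias!r}")  # happens once per Generator instance
        return f"model:{model_alias}"


class WithModel:
    """Mixin: assumes the combined class defines `registry` and `alias`."""

    @functools.cached_property
    def model(self) -> str:
        return self.registry.get_model(model_alias=self.alias)


class Generator(WithModel):
    def __init__(self) -> None:
        self.registry = FakeRegistry()
        self.alias = "gpt-x"


g = Generator()
print(g.model)  # first access performs the lookup
print(g.model)  # cached: no second "looking up" line
```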
data_designer/engine/column_generators/generators/embedding.py (new file)

@@ -0,0 +1,45 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+
+from pydantic import BaseModel, computed_field
+
+from data_designer.config.column_configs import EmbeddingColumnConfig
+from data_designer.engine.column_generators.generators.base import (
+    ColumnGenerator,
+    GenerationStrategy,
+    GeneratorMetadata,
+    WithModelGeneration,
+)
+from data_designer.engine.processing.utils import deserialize_json_values, parse_list_string
+from data_designer.engine.resources.resource_provider import ResourceType
+
+
+class EmbeddingGenerationResult(BaseModel):
+    embeddings: list[list[float]]
+
+    @computed_field
+    def num_embeddings(self) -> int:
+        return len(self.embeddings)
+
+    @computed_field
+    def dimension(self) -> int:
+        return len(self.embeddings[0]) if len(self.embeddings) > 0 else 0
+
+
+class EmbeddingCellGenerator(WithModelGeneration, ColumnGenerator[EmbeddingColumnConfig]):
+    @staticmethod
+    def metadata() -> GeneratorMetadata:
+        return GeneratorMetadata(
+            name="embedding_cell_generator",
+            description="Generate embeddings for a text column.",
+            generation_strategy=GenerationStrategy.CELL_BY_CELL,
+            required_resources=[ResourceType.MODEL_REGISTRY],
+        )
+
+    def generate(self, data: dict) -> dict:
+        deserialized_record = deserialize_json_values(data)
+        input_texts = parse_list_string(deserialized_record[self.config.target_column])
+        embeddings = self.model.generate_text_embeddings(input_texts=input_texts)
+        data[self.config.name] = EmbeddingGenerationResult(embeddings=embeddings).model_dump(mode="json")
+        return data