data-designer-engine 0.4.0rc2__tar.gz → 0.5.0rc1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/PKG-INFO +3 -2
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/pyproject.toml +21 -14
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/analysis/column_profilers/base.py +1 -2
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/analysis/dataset_profiler.py +1 -2
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/column_generators/generators/base.py +1 -6
- data_designer_engine-0.5.0rc1/src/data_designer/engine/column_generators/generators/custom.py +195 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/column_generators/generators/llm_completion.py +34 -4
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/column_generators/registry.py +3 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/column_generators/utils/errors.py +3 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/column_generators/utils/prompt_renderer.py +1 -1
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/dataset_builders/column_wise_builder.py +47 -10
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/dataset_builders/multi_column_configs.py +2 -2
- data_designer_engine-0.5.0rc1/src/data_designer/engine/dataset_builders/utils/progress_tracker.py +122 -0
- data_designer_engine-0.5.0rc1/src/data_designer/engine/mcp/__init__.py +30 -0
- data_designer_engine-0.5.0rc1/src/data_designer/engine/mcp/errors.py +22 -0
- data_designer_engine-0.5.0rc1/src/data_designer/engine/mcp/facade.py +485 -0
- data_designer_engine-0.5.0rc1/src/data_designer/engine/mcp/factory.py +46 -0
- data_designer_engine-0.5.0rc1/src/data_designer/engine/mcp/io.py +487 -0
- data_designer_engine-0.5.0rc1/src/data_designer/engine/mcp/registry.py +203 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/model_provider.py +68 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/models/facade.py +92 -30
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/models/factory.py +18 -1
- data_designer_engine-0.5.0rc1/src/data_designer/engine/models/utils.py +128 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/resources/resource_provider.py +72 -3
- data_designer_engine-0.5.0rc1/src/data_designer/engine/testing/fixtures.py +233 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/testing/stubs.py +1 -2
- data_designer_engine-0.5.0rc1/tests/conftest.py +7 -0
- data_designer_engine-0.5.0rc1/tests/engine/column_generators/generators/test_custom.py +336 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/column_generators/generators/test_llm_completion_generators.py +252 -16
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/dataset_builders/test_column_wise_builder.py +2 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/dataset_builders/utils/test_dag.py +1 -1
- data_designer_engine-0.5.0rc1/tests/engine/dataset_builders/utils/test_progress_tracker.py +290 -0
- data_designer_engine-0.5.0rc1/tests/engine/mcp/conftest.py +4 -0
- data_designer_engine-0.5.0rc1/tests/engine/mcp/test_mcp_facade.py +872 -0
- data_designer_engine-0.5.0rc1/tests/engine/mcp/test_mcp_factory.py +86 -0
- data_designer_engine-0.5.0rc1/tests/engine/mcp/test_mcp_io.py +996 -0
- data_designer_engine-0.5.0rc1/tests/engine/mcp/test_mcp_registry.py +281 -0
- data_designer_engine-0.5.0rc1/tests/engine/models/test_facade.py +1117 -0
- data_designer_engine-0.5.0rc1/tests/engine/models/test_model_utils.py +23 -0
- data_designer_engine-0.5.0rc1/tests/engine/resources/test_resource_provider.py +210 -0
- data_designer_engine-0.5.0rc1/tests/engine/test_model_provider.py +135 -0
- data_designer_engine-0.4.0rc2/src/data_designer/engine/_version.py +0 -34
- data_designer_engine-0.4.0rc2/src/data_designer/engine/models/utils.py +0 -38
- data_designer_engine-0.4.0rc2/tests/conftest.py +0 -16
- data_designer_engine-0.4.0rc2/tests/engine/models/test_facade.py +0 -233
- data_designer_engine-0.4.0rc2/tests/engine/models/test_model_utils.py +0 -36
- data_designer_engine-0.4.0rc2/tests/engine/resources/test_resource_provider.py +0 -40
- data_designer_engine-0.4.0rc2/tests/engine/test_model_provider.py +0 -61
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/.gitignore +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/README.md +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/__init__.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/analysis/column_profilers/judge_score_profiler.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/analysis/column_profilers/registry.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/analysis/column_statistics.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/analysis/errors.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/analysis/utils/column_statistics_calculations.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/analysis/utils/judge_score_processing.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/column_generators/__init__.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/column_generators/generators/__init__.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/column_generators/generators/embedding.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/column_generators/generators/expression.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/column_generators/generators/samplers.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/column_generators/generators/seed_dataset.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/column_generators/generators/validation.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/column_generators/utils/generator_classification.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/column_generators/utils/judge_score_factory.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/compiler.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/configurable_task.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/dataset_builders/artifact_storage.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/dataset_builders/errors.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/dataset_builders/utils/__init__.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/dataset_builders/utils/concurrency.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/dataset_builders/utils/config_compiler.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/dataset_builders/utils/dag.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/dataset_builders/utils/errors.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/errors.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/models/__init__.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/models/errors.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/models/litellm_overrides.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/models/parsers/__init__.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/models/parsers/errors.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/models/parsers/parser.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/models/parsers/postprocessors.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/models/parsers/tag_parsers.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/models/parsers/types.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/models/recipes/base.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/models/recipes/response_recipes.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/models/registry.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/models/telemetry.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/models/usage.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/processing/ginja/__init__.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/processing/ginja/ast.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/processing/ginja/environment.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/processing/ginja/exceptions.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/processing/ginja/record.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/processing/gsonschema/__init__.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/processing/gsonschema/exceptions.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/processing/gsonschema/schema_transformers.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/processing/gsonschema/types.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/processing/gsonschema/validators.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/processing/processors/base.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/processing/processors/drop_columns.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/processing/processors/registry.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/processing/processors/schema_transform.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/processing/utils.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/registry/base.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/registry/data_designer_registry.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/registry/errors.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/resources/managed_dataset_generator.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/resources/managed_dataset_repository.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/resources/managed_storage.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/resources/seed_reader.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/sampling_gen/column.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/sampling_gen/constraints.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/sampling_gen/data_sources/base.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/sampling_gen/data_sources/errors.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/sampling_gen/data_sources/sources.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/sampling_gen/entities/__init__.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/sampling_gen/entities/email_address_utils.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/sampling_gen/entities/errors.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/sampling_gen/entities/national_id_utils.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/sampling_gen/entities/person.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/sampling_gen/entities/phone_number.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/sampling_gen/errors.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/sampling_gen/generator.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/sampling_gen/jinja_utils.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/sampling_gen/people_gen.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/sampling_gen/person_constants.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/sampling_gen/schema.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/sampling_gen/schema_builder.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/sampling_gen/utils.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/secret_resolver.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/testing/__init__.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/testing/utils.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/validation.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/validators/__init__.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/validators/base.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/validators/local_callable.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/validators/python.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/validators/remote.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/src/data_designer/engine/validators/sql.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/analysis/column_profilers/test_base.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/analysis/column_profilers/test_judge_score_profiler.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/analysis/conftest.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/analysis/test_column_statistics_calculator.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/analysis/test_data/artifacts/dataset/column_configs.json +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/analysis/test_data/artifacts/dataset/dataset.json +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/analysis/test_data/artifacts/dataset/metadata.json +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/analysis/test_dataset_profiler.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/analysis/test_errors.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/analysis/utils/test_column_statistics_calculations.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/analysis/utils/test_judge_score_processing.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/column_generators/generators/__init__.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/column_generators/generators/test_column_generator_base.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/column_generators/generators/test_embedding.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/column_generators/generators/test_expression.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/column_generators/generators/test_samplers.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/column_generators/generators/test_seed_dataset.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/column_generators/generators/test_validation.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/column_generators/test_registry.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/column_generators/utils/test_column_generator_errors.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/column_generators/utils/test_generator_classification.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/column_generators/utils/test_judge_score_factory.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/column_generators/utils/test_prompt_renderer.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/conftest.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/dataset_builders/test_artifact_storage.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/dataset_builders/test_multi_column_configs.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/dataset_builders/utils/test_concurrency.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/dataset_builders/utils/test_config_compiler.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/dataset_builders/utils/test_dataset_batch_manager.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/models/conftest.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/models/parsers/test_parser.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/models/parsers/test_parsers_types.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/models/parsers/test_postprocessors.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/models/parsers/test_tag_parsers.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/models/recipes/test_recipe_base.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/models/recipes/test_response_recipes.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/models/stub_secrets.json +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/models/test_litellm_overrides.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/models/test_model_errors.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/models/test_model_registry.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/models/test_usage.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/processing/__init__.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/processing/ginja/__init__.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/processing/ginja/test_ast.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/processing/ginja/test_environment.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/processing/ginja/test_exceptions.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/processing/ginja/test_record.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/processing/gsonschema/__init__.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/processing/gsonschema/test_exceptions.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/processing/gsonschema/test_schema_transformers.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/processing/gsonschema/test_types.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/processing/gsonschema/test_validators.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/processing/processors/__init__.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/processing/processors/test_drop_columns.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/processing/processors/test_registry.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/processing/processors/test_schema_transform.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/processing/test_utils.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/registry/__init__.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/registry/conftest.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/registry/test_base.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/registry/test_data_designer_registry.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/registry/test_errors.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/resources/__init__.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/resources/conftest.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/resources/test_managed_dataset_generator.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/resources/test_managed_dataset_repository.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/resources/test_managed_storage.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/resources/test_seed_reader.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/sampling_gen/conftest.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/sampling_gen/data_sources/test_sampler_errors.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/sampling_gen/data_sources/test_sources.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/sampling_gen/entities/test_email_address_utils.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/sampling_gen/entities/test_national_id_utils.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/sampling_gen/entities/test_person.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/sampling_gen/entities/test_phone_number.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/sampling_gen/test_column.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/sampling_gen/test_constraints.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/sampling_gen/test_generator.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/sampling_gen/test_jinja_utils.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/sampling_gen/test_people_gen.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/sampling_gen/test_schema.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/sampling_gen/test_utils.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/test_compiler.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/test_configurable_task.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/test_dataset_metadata.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/test_engine_errors.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/test_secret_resolver.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/test_validation.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/validators/test_local_callable.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/validators/test_python.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/validators/test_remote.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/engine/validators/test_sql.py +0 -0
- {data_designer_engine-0.4.0rc2 → data_designer_engine-0.5.0rc1}/tests/test_plugin_manager.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: data-designer-engine
|
|
3
|
-
Version: 0.4.0rc2
|
|
3
|
+
Version: 0.5.0rc1
|
|
4
4
|
Summary: Generation engine for DataDesigner synthetic data generation
|
|
5
5
|
License-Expression: Apache-2.0
|
|
6
6
|
Classifier: Development Status :: 4 - Beta
|
|
@@ -14,7 +14,7 @@ Classifier: Programming Language :: Python :: 3.13
|
|
|
14
14
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
15
15
|
Requires-Python: >=3.10
|
|
16
16
|
Requires-Dist: anyascii<1,>=0.3.3
|
|
17
|
-
Requires-Dist: data-designer-config
|
|
17
|
+
Requires-Dist: data-designer-config==0.5.0rc1
|
|
18
18
|
Requires-Dist: duckdb<2,>=1.1.3
|
|
19
19
|
Requires-Dist: faker<21,>=20.1.0
|
|
20
20
|
Requires-Dist: httpx-retries<1,>=0.4.2
|
|
@@ -26,6 +26,7 @@ Requires-Dist: jsonschema<5,>=4.0.0
|
|
|
26
26
|
Requires-Dist: litellm<1.80.12,>=1.73.6
|
|
27
27
|
Requires-Dist: lxml<7,>=6.0.2
|
|
28
28
|
Requires-Dist: marko<3,>=2.1.2
|
|
29
|
+
Requires-Dist: mcp<2,>=1.26.0
|
|
29
30
|
Requires-Dist: networkx<4,>=3.0
|
|
30
31
|
Requires-Dist: ruff<1,>=0.14.10
|
|
31
32
|
Requires-Dist: scipy<2,>=1.11.0
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "data-designer-engine"
|
|
3
|
-
dynamic = ["version"]
|
|
3
|
+
dynamic = ["version", "dependencies"]
|
|
4
4
|
description = "Generation engine for DataDesigner synthetic data generation"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.10"
|
|
@@ -18,9 +18,22 @@ classifiers = [
|
|
|
18
18
|
"Programming Language :: Python :: 3.13",
|
|
19
19
|
]
|
|
20
20
|
|
|
21
|
+
[build-system]
|
|
22
|
+
requires = ["hatchling", "uv-dynamic-versioning>=0.7.0"]
|
|
23
|
+
build-backend = "hatchling.build"
|
|
24
|
+
|
|
25
|
+
[tool.hatch.version]
|
|
26
|
+
source = "uv-dynamic-versioning"
|
|
27
|
+
|
|
28
|
+
[tool.uv-dynamic-versioning]
|
|
29
|
+
vcs = "git"
|
|
30
|
+
style = "pep440"
|
|
31
|
+
bump = true
|
|
32
|
+
|
|
33
|
+
[tool.hatch.metadata.hooks.uv-dynamic-versioning]
|
|
21
34
|
dependencies = [
|
|
22
35
|
"anyascii>=0.3.3,<1",
|
|
23
|
-
"data-designer-config",
|
|
36
|
+
"data-designer-config=={{ version }}",
|
|
24
37
|
"duckdb>=1.1.3,<2",
|
|
25
38
|
"faker>=20.1.0,<21",
|
|
26
39
|
"httpx>=0.27.2,<1",
|
|
@@ -32,6 +45,7 @@ dependencies = [
|
|
|
32
45
|
"litellm>=1.73.6,<1.80.12",
|
|
33
46
|
"lxml>=6.0.2,<7",
|
|
34
47
|
"marko>=2.1.2,<3",
|
|
48
|
+
"mcp>=1.26.0,<2",
|
|
35
49
|
"networkx>=3.0,<4",
|
|
36
50
|
"ruff>=0.14.10,<1",
|
|
37
51
|
"scipy>=1.11.0,<2",
|
|
@@ -39,24 +53,17 @@ dependencies = [
|
|
|
39
53
|
"tiktoken>=0.8.0,<1",
|
|
40
54
|
]
|
|
41
55
|
|
|
42
|
-
[build-system]
|
|
43
|
-
requires = ["hatchling", "hatch-vcs"]
|
|
44
|
-
build-backend = "hatchling.build"
|
|
45
|
-
|
|
46
|
-
[tool.hatch.version]
|
|
47
|
-
source = "vcs"
|
|
48
|
-
fallback-version = "0.1.0.dev0"
|
|
49
|
-
raw-options = { root = "../.." }
|
|
50
|
-
|
|
51
|
-
[tool.hatch.build.hooks.vcs]
|
|
52
|
-
version-file = "src/data_designer/engine/_version.py"
|
|
53
|
-
|
|
54
56
|
[tool.hatch.build.targets.wheel]
|
|
55
57
|
packages = ["src/data_designer"]
|
|
56
58
|
|
|
57
59
|
[tool.ruff]
|
|
58
60
|
extend = "../../pyproject.toml"
|
|
59
61
|
|
|
62
|
+
[tool.pytest.ini_options]
|
|
63
|
+
testpaths = ["tests"]
|
|
64
|
+
asyncio_default_fixture_loop_scope = "session"
|
|
65
|
+
env = ["DISABLE_DATA_DESIGNER_PLUGINS=true"]
|
|
66
|
+
|
|
60
67
|
[tool.uv]
|
|
61
68
|
package = true
|
|
62
69
|
|
|
@@ -10,8 +10,7 @@ from typing import TYPE_CHECKING
|
|
|
10
10
|
from pydantic import BaseModel, model_validator
|
|
11
11
|
from typing_extensions import Self
|
|
12
12
|
|
|
13
|
-
from data_designer.config.base import ConfigBase
|
|
14
|
-
from data_designer.config.column_configs import SingleColumnConfig
|
|
13
|
+
from data_designer.config.base import ConfigBase, SingleColumnConfig
|
|
15
14
|
from data_designer.config.column_types import DataDesignerColumnType
|
|
16
15
|
from data_designer.engine.configurable_task import ConfigurableTask, TaskConfigT
|
|
17
16
|
from data_designer.lazy_heavy_imports import pd
|
|
@@ -12,8 +12,7 @@ from pydantic import Field, field_validator
|
|
|
12
12
|
|
|
13
13
|
from data_designer.config.analysis.column_profilers import ColumnProfilerConfigT
|
|
14
14
|
from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
|
|
15
|
-
from data_designer.config.base import ConfigBase
|
|
16
|
-
from data_designer.config.column_configs import SingleColumnConfig
|
|
15
|
+
from data_designer.config.base import ConfigBase, SingleColumnConfig
|
|
17
16
|
from data_designer.config.column_types import ColumnConfigT
|
|
18
17
|
from data_designer.engine.analysis.column_profilers.base import ColumnConfigWithDataFrame, ColumnProfiler
|
|
19
18
|
from data_designer.engine.analysis.column_statistics import get_column_statistics_calculator
|
|
@@ -6,9 +6,9 @@ from __future__ import annotations
|
|
|
6
6
|
import functools
|
|
7
7
|
import logging
|
|
8
8
|
from abc import ABC, abstractmethod
|
|
9
|
-
from enum import Enum
|
|
10
9
|
from typing import TYPE_CHECKING, overload
|
|
11
10
|
|
|
11
|
+
from data_designer.config.column_configs import GenerationStrategy
|
|
12
12
|
from data_designer.engine.configurable_task import ConfigurableTask, DataT, TaskConfigT
|
|
13
13
|
from data_designer.lazy_heavy_imports import pd
|
|
14
14
|
|
|
@@ -22,11 +22,6 @@ if TYPE_CHECKING:
|
|
|
22
22
|
logger = logging.getLogger(__name__)
|
|
23
23
|
|
|
24
24
|
|
|
25
|
-
class GenerationStrategy(str, Enum):
|
|
26
|
-
CELL_BY_CELL = "cell_by_cell"
|
|
27
|
-
FULL_COLUMN = "full_column"
|
|
28
|
-
|
|
29
|
-
|
|
30
25
|
class ColumnGenerator(ConfigurableTask[TaskConfigT], ABC):
|
|
31
26
|
@property
|
|
32
27
|
def can_generate_from_scratch(self) -> bool:
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
"""Custom column generator using user-provided callable functions."""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import inspect
|
|
9
|
+
import logging
|
|
10
|
+
from typing import TYPE_CHECKING, Any
|
|
11
|
+
|
|
12
|
+
from data_designer.config.column_configs import CustomColumnConfig, GenerationStrategy
|
|
13
|
+
from data_designer.engine.column_generators.generators.base import ColumnGenerator
|
|
14
|
+
from data_designer.engine.column_generators.utils.errors import CustomColumnGenerationError
|
|
15
|
+
from data_designer.lazy_heavy_imports import pd
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
import pandas as pd
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class CustomColumnGenerator(ColumnGenerator[CustomColumnConfig]):
|
|
24
|
+
"""Column generator that uses a user-provided callable function.
|
|
25
|
+
|
|
26
|
+
Supports two strategies based on config.strategy:
|
|
27
|
+
- cell_by_cell: Processes rows one at a time (dict -> dict), parallelized by framework.
|
|
28
|
+
- full_column: Processes entire batch (DataFrame -> DataFrame) for vectorized ops.
|
|
29
|
+
|
|
30
|
+
Supported function signatures (validated by parameter name):
|
|
31
|
+
- fn(row) -> dict # cell_by_cell, simple transform
|
|
32
|
+
- fn(row, generator_params) -> dict # cell_by_cell, with typed params
|
|
33
|
+
- fn(row, generator_params, models) -> dict # cell_by_cell, with LLM access
|
|
34
|
+
- fn(df) -> DataFrame # full_column, simple transform
|
|
35
|
+
- fn(df, generator_params) -> DataFrame # full_column, with typed params
|
|
36
|
+
- fn(df, generator_params, models) -> DataFrame # full_column, with LLM access
|
|
37
|
+
|
|
38
|
+
The models dict provides direct access to ModelFacade instances keyed by alias.
|
|
39
|
+
"""
|
|
40
|
+
|
|
41
|
+
def get_generation_strategy(self) -> GenerationStrategy:
    """Report the generation strategy declared on this generator's config."""
    configured_strategy = self.config.generation_strategy
    return configured_strategy
|
|
44
|
+
|
|
45
|
+
def generate(self, data: dict | pd.DataFrame) -> dict | pd.DataFrame:
    """Check that the input container matches the configured strategy, then generate.

    Args:
        data: A single record (dict) or a batch; anything that is not a dict is
            treated as a DataFrame.

    Returns:
        Whatever ``_generate`` returns for the given input.

    Raises:
        CustomColumnGenerationError: If a dict is received under the full_column
            strategy, or a non-dict is received under any other strategy.
    """
    received_dict = isinstance(data, dict)
    wants_full_column = self.config.generation_strategy == GenerationStrategy.FULL_COLUMN

    if wants_full_column:
        if received_dict:
            raise CustomColumnGenerationError(
                f"Custom generator {self.config.name!r} is configured for 'full_column' strategy "
                "but received a dict. Expected a DataFrame."
            )
    elif not received_dict:
        raise CustomColumnGenerationError(
            f"Custom generator {self.config.name!r} is configured for 'cell_by_cell' strategy "
            "but received a DataFrame. Expected a dict."
        )

    return self._generate(data, not received_dict)
|
|
63
|
+
|
|
64
|
+
def _generate(self, data: dict | pd.DataFrame, is_dataframe: bool) -> dict | pd.DataFrame:
    """Run the user function on ``data`` and validate what it returns.

    Args:
        data: The input record (dict) or batch (DataFrame).
        is_dataframe: True when ``data`` is handled as a DataFrame, False for a dict.

    Returns:
        The result of ``_validate_output`` applied to the user function's output.

    Raises:
        CustomColumnGenerationError: If required columns are missing from ``data``,
            if the user function raises, or if it returns the wrong container type.
    """
    # Snapshot the keys BEFORE invoking the user function, which may mutate `data` in place.
    if is_dataframe:
        expected_type, type_name = pd.DataFrame, "DataFrame"
        keys_before = set(data.columns)
    else:
        expected_type, type_name = dict, "dict"
        keys_before = set(data.keys())

    absent = set(self.config.required_columns) - keys_before
    if absent:
        raise CustomColumnGenerationError(
            f"Missing required columns for custom generator '{self.config.name}': {sorted(absent)}"
        )

    try:
        result = self._invoke_generator_function(data)
    except CustomColumnGenerationError:
        # Already a domain error; let it propagate without re-wrapping.
        raise
    except Exception as e:
        raise CustomColumnGenerationError(
            f"Custom generator function failed for column '{self.config.name}': {e}"
        ) from e

    if isinstance(result, expected_type):
        return self._validate_output(result, keys_before, is_dataframe)

    raise CustomColumnGenerationError(
        f"Custom generator for column '{self.config.name}' must return a {type_name}, "
        f"got {type(result).__name__}"
    )
|
|
98
|
+
|
|
99
|
+
def _validate_output(
    self, result: dict | pd.DataFrame, keys_before: set[str], is_dataframe: bool
) -> dict | pd.DataFrame:
    """Check the generated columns against the config and strip undeclared ones.

    Args:
        result: The container returned by the user function.
        keys_before: Keys/columns that were present before the user function ran.
        is_dataframe: True when ``result`` is a DataFrame, False when it is a dict.

    Returns:
        ``result`` without undeclared new columns. For a DataFrame the columns are
        dropped via ``DataFrame.drop``; for a dict the keys are deleted in place.

    Raises:
        CustomColumnGenerationError: If the primary column or a declared side-effect
            column is missing, or if a pre-existing column was removed.
    """
    column_name = self.config.name
    declared_side_effects = set(self.config.side_effect_columns)
    expected_new = {column_name} | declared_side_effects

    if is_dataframe:
        container_name = "DataFrame"
        result_keys = set(result.columns)
    else:
        container_name = "row"
        result_keys = set(result.keys())

    # The primary column must be present.
    if column_name not in result_keys:
        raise CustomColumnGenerationError(
            f"Custom generator for column '{column_name}' did not create the expected column. "
            f"The generator_function must add a key named '{column_name}' to the {container_name}."
        )

    # Every declared side-effect column must be present.
    absent_side_effects = declared_side_effects - result_keys
    if absent_side_effects:
        raise CustomColumnGenerationError(
            f"Custom generator for column '{column_name}' did not create declared side_effect_columns: "
            f"{sorted(absent_side_effects)}. Declared side_effect_columns must be added to the {container_name}."
        )

    # Nothing that existed beforehand may disappear.
    dropped = keys_before - result_keys
    if dropped:
        raise CustomColumnGenerationError(
            f"Custom generator for column '{column_name}' removed pre-existing columns: "
            f"{sorted(dropped)}. The generator_function must not remove any existing columns."
        )

    # New columns that were never declared are warned about and removed.
    extras = (result_keys - keys_before) - expected_new
    if not extras:
        return result

    logger.warning(
        f"⚠️ Custom generator for column '{column_name}' created undeclared columns: "
        f"{sorted(extras)}. These columns will be removed. "
        f"To keep additional columns, declare them in @custom_column_generator(side_effect_columns=[...])."
    )
    if is_dataframe:
        return result.drop(columns=list(extras))
    for key in extras:
        del result[key]
    return result
|
|
148
|
+
|
|
149
|
+
def _invoke_generator_function(self, data: dict | pd.DataFrame) -> dict | pd.DataFrame:
    """Call the user's generator function with as many arguments as it declares.

    One positional parameter receives only ``data``; two also receive
    ``config.generator_params``; any other count additionally receives the models dict.
    """
    arity = len(self._get_validated_params())
    if arity == 1:
        return self.config.generator_function(data)
    if arity == 2:
        return self.config.generator_function(data, self.config.generator_params)
    # Models are only built when the function declares a third parameter.
    models = self._build_models_dict()
    return self.config.generator_function(data, self.config.generator_params, models)
|
|
160
|
+
|
|
161
|
+
def _build_models_dict(self) -> dict[str, Any]:
    """Look up one model per configured alias and return them keyed by that alias."""
    models: dict[str, Any] = {}
    for alias in self.config.model_aliases:
        models[alias] = self.resource_provider.model_registry.get_model(model_alias=alias)
    return models
|
|
167
|
+
|
|
168
|
+
def _get_validated_params(self) -> list[inspect.Parameter]:
    """Return the generator function's positional parameters, validating the first one.

    Only positional-only and positional-or-keyword parameters are considered;
    ``*args``, keyword-only and ``**kwargs`` parameters are ignored.

    Returns:
        The positional parameters of ``config.generator_function`` in declaration order.

    Raises:
        CustomColumnGenerationError: If the function declares no positional parameter,
            or if its first positional parameter is not named ``df`` (full_column
            strategy) or ``row`` (any other strategy).
    """
    positional_kinds = (inspect.Parameter.POSITIONAL_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD)
    params = [
        p
        for p in inspect.signature(self.config.generator_function).parameters.values()
        if p.kind in positional_kinds
    ]
    # Parameter names are said to be validated by the decorator (not visible here);
    # this method only checks that the first parameter matches the strategy.
    is_full = self.config.generation_strategy == GenerationStrategy.FULL_COLUMN
    expected = "df" if is_full else "row"
    strategy_label = "full_column" if is_full else "cell_by_cell"

    # Guard against e.g. `def fn(*args)`: indexing params[0] would raise a bare IndexError.
    if not params:
        raise CustomColumnGenerationError(
            f"Generator '{self.config.name}': strategy is {strategy_label}, "
            f"first parameter must be '{expected}', but the function declares no positional parameters."
        )
    if params[0].name != expected:
        raise CustomColumnGenerationError(
            f"Generator '{self.config.name}': strategy is {strategy_label}, "
            f"first parameter must be '{expected}', got '{params[0].name}'."
        )
    return params
|
|
184
|
+
|
|
185
|
+
def log_pre_generation(self) -> None:
    """Log a summary of this custom column's configuration at INFO level.

    The side-effect columns, model aliases and generator params lines are only
    logged when the corresponding config value is truthy.
    """
    cfg = self.config
    logger.info(f"{cfg.get_column_emoji()} Custom column config for column '{cfg.name}'")
    logger.info(f" |-- generator_function: {cfg.generator_function.__name__!r}")
    logger.info(f" |-- generation_strategy: {cfg.generation_strategy!r}")
    logger.info(f" |-- required_columns: {cfg.required_columns}")
    if cfg.side_effect_columns:
        logger.info(f" |-- side_effect_columns: {cfg.side_effect_columns}")
    if cfg.model_aliases:
        logger.info(f" |-- model_aliases: {cfg.model_aliases}")
    if cfg.generator_params:
        logger.info(f" |-- generator_params: {cfg.generator_params}")
|
|
@@ -12,7 +12,8 @@ from data_designer.config.column_configs import (
|
|
|
12
12
|
LLMStructuredColumnConfig,
|
|
13
13
|
LLMTextColumnConfig,
|
|
14
14
|
)
|
|
15
|
-
from data_designer.config.utils.constants import
|
|
15
|
+
from data_designer.config.utils.constants import REASONING_CONTENT_COLUMN_POSTFIX, TRACE_COLUMN_POSTFIX
|
|
16
|
+
from data_designer.config.utils.trace_type import TraceType
|
|
16
17
|
from data_designer.engine.column_generators.generators.base import ColumnGeneratorWithModel, GenerationStrategy
|
|
17
18
|
from data_designer.engine.column_generators.utils.prompt_renderer import (
|
|
18
19
|
PromptType,
|
|
@@ -66,7 +67,7 @@ class ColumnGeneratorWithModelChatCompletion(ColumnGeneratorWithModel[TaskConfig
|
|
|
66
67
|
for context in self.config.multi_modal_context:
|
|
67
68
|
multi_modal_context.extend(context.get_contexts(deserialized_record))
|
|
68
69
|
|
|
69
|
-
response,
|
|
70
|
+
response, trace = self.model.generate(
|
|
70
71
|
prompt=self.prompt_renderer.render(
|
|
71
72
|
record=deserialized_record,
|
|
72
73
|
prompt_template=self.config.prompt,
|
|
@@ -79,6 +80,7 @@ class ColumnGeneratorWithModelChatCompletion(ColumnGeneratorWithModel[TaskConfig
|
|
|
79
80
|
),
|
|
80
81
|
parser=self.response_recipe.parse,
|
|
81
82
|
multi_modal_context=multi_modal_context,
|
|
83
|
+
tool_alias=self.config.tool_alias,
|
|
82
84
|
max_correction_steps=self.max_conversation_correction_steps,
|
|
83
85
|
max_conversation_restarts=self.max_conversation_restarts,
|
|
84
86
|
purpose=f"running generation for column '{self.config.name}'",
|
|
@@ -87,11 +89,39 @@ class ColumnGeneratorWithModelChatCompletion(ColumnGeneratorWithModel[TaskConfig
|
|
|
87
89
|
serialized_output = self.response_recipe.serialize_output(response)
|
|
88
90
|
data[self.config.name] = self._process_serialized_output(serialized_output)
|
|
89
91
|
|
|
90
|
-
|
|
91
|
-
|
|
92
|
+
effective_trace_type = self.config.with_trace
|
|
93
|
+
|
|
94
|
+
if effective_trace_type == TraceType.ALL_MESSAGES:
|
|
95
|
+
data[self.config.name + TRACE_COLUMN_POSTFIX] = [message.to_dict() for message in trace]
|
|
96
|
+
elif effective_trace_type == TraceType.LAST_MESSAGE:
|
|
97
|
+
last_assistant = next((m for m in reversed(trace) if m.role == "assistant"), None)
|
|
98
|
+
data[self.config.name + TRACE_COLUMN_POSTFIX] = [last_assistant.to_dict()] if last_assistant else []
|
|
99
|
+
|
|
100
|
+
if self.config.extract_reasoning_content:
|
|
101
|
+
data[self.config.name + REASONING_CONTENT_COLUMN_POSTFIX] = self._extract_reasoning_content(trace)
|
|
92
102
|
|
|
93
103
|
return data
|
|
94
104
|
|
|
105
|
+
def _extract_reasoning_content(self, trace: list) -> str | None:
    """Pull the reasoning text off the last assistant message in a trace.

    Args:
        trace: Messages from the generation; each is read for ``role`` and, for the
            chosen assistant message, ``reasoning_content``.

    Returns:
        The whitespace-stripped reasoning content of the last message whose role is
        "assistant", or None when there is no such message, its reasoning content
        is None, or it is empty after stripping.
    """
    final_assistant = next((msg for msg in reversed(trace) if msg.role == "assistant"), None)
    if final_assistant is None:
        return None

    raw_reasoning = final_assistant.reasoning_content
    if raw_reasoning is None:
        return None

    # An all-whitespace value collapses to None rather than an empty string.
    return raw_reasoning.strip() or None
|
|
124
|
+
|
|
95
125
|
def _process_serialized_output(self, serialized_output: str) -> str | dict | list:
    """Return the serialized model output unchanged.

    This is the hook subclasses can override to customize deserialization.
    """
    processed = serialized_output
    return processed
|
|
@@ -5,6 +5,7 @@ from __future__ import annotations
|
|
|
5
5
|
|
|
6
6
|
from data_designer.config.base import ConfigBase
|
|
7
7
|
from data_designer.config.column_configs import (
|
|
8
|
+
CustomColumnConfig,
|
|
8
9
|
EmbeddingColumnConfig,
|
|
9
10
|
ExpressionColumnConfig,
|
|
10
11
|
LLMCodeColumnConfig,
|
|
@@ -15,6 +16,7 @@ from data_designer.config.column_configs import (
|
|
|
15
16
|
)
|
|
16
17
|
from data_designer.config.column_types import DataDesignerColumnType
|
|
17
18
|
from data_designer.engine.column_generators.generators.base import ColumnGenerator
|
|
19
|
+
from data_designer.engine.column_generators.generators.custom import CustomColumnGenerator
|
|
18
20
|
from data_designer.engine.column_generators.generators.embedding import EmbeddingCellGenerator
|
|
19
21
|
from data_designer.engine.column_generators.generators.expression import ExpressionColumnGenerator
|
|
20
22
|
from data_designer.engine.column_generators.generators.llm_completion import (
|
|
@@ -40,6 +42,7 @@ class ColumnGeneratorRegistry(TaskRegistry[DataDesignerColumnType, ColumnGenerat
|
|
|
40
42
|
|
|
41
43
|
def create_default_column_generator_registry(with_plugins: bool = True) -> ColumnGeneratorRegistry:
|
|
42
44
|
registry = ColumnGeneratorRegistry()
|
|
45
|
+
registry.register(DataDesignerColumnType.CUSTOM, CustomColumnGenerator, CustomColumnConfig)
|
|
43
46
|
registry.register(DataDesignerColumnType.LLM_TEXT, LLMTextCellGenerator, LLMTextColumnConfig)
|
|
44
47
|
registry.register(DataDesignerColumnType.LLM_CODE, LLMCodeCellGenerator, LLMCodeColumnConfig)
|
|
45
48
|
registry.register(DataDesignerColumnType.LLM_JUDGE, LLMJudgeCellGenerator, LLMJudgeColumnConfig)
|
|
@@ -6,7 +6,7 @@ from __future__ import annotations
|
|
|
6
6
|
import json
|
|
7
7
|
import logging
|
|
8
8
|
|
|
9
|
-
from data_designer.config.
|
|
9
|
+
from data_designer.config.base import SingleColumnConfig
|
|
10
10
|
from data_designer.config.column_types import DataDesignerColumnType
|
|
11
11
|
from data_designer.config.models import ModelConfig
|
|
12
12
|
from data_designer.config.utils.code_lang import CodeLang
|
|
@@ -12,6 +12,7 @@ import uuid
|
|
|
12
12
|
from pathlib import Path
|
|
13
13
|
from typing import TYPE_CHECKING, Callable
|
|
14
14
|
|
|
15
|
+
from data_designer.config.column_configs import CustomColumnConfig
|
|
15
16
|
from data_designer.config.column_types import ColumnConfigT
|
|
16
17
|
from data_designer.config.config_builder import BuilderConfig
|
|
17
18
|
from data_designer.config.data_designer_config import DataDesignerConfig
|
|
@@ -34,6 +35,7 @@ from data_designer.engine.dataset_builders.multi_column_configs import MultiColu
|
|
|
34
35
|
from data_designer.engine.dataset_builders.utils.concurrency import ConcurrentThreadExecutor
|
|
35
36
|
from data_designer.engine.dataset_builders.utils.config_compiler import compile_dataset_builder_column_configs
|
|
36
37
|
from data_designer.engine.dataset_builders.utils.dataset_batch_manager import DatasetBatchManager
|
|
38
|
+
from data_designer.engine.dataset_builders.utils.progress_tracker import ProgressTracker
|
|
37
39
|
from data_designer.engine.models.telemetry import InferenceEvent, NemoSourceEnum, TaskStatusEnum, TelemetryHandler
|
|
38
40
|
from data_designer.engine.processing.processors.base import Processor
|
|
39
41
|
from data_designer.engine.processing.processors.drop_columns import DropColumnsProcessor
|
|
@@ -96,6 +98,7 @@ class ColumnWiseDatasetBuilder:
|
|
|
96
98
|
on_batch_complete: Callable[[Path], None] | None = None,
|
|
97
99
|
) -> Path:
|
|
98
100
|
self._run_model_health_check_if_needed()
|
|
101
|
+
self._run_mcp_tool_check_if_needed()
|
|
99
102
|
self._write_builder_config()
|
|
100
103
|
generators = self._initialize_generators()
|
|
101
104
|
start_time = time.perf_counter()
|
|
@@ -124,6 +127,7 @@ class ColumnWiseDatasetBuilder:
|
|
|
124
127
|
|
|
125
128
|
def build_preview(self, *, num_records: int) -> pd.DataFrame:
|
|
126
129
|
self._run_model_health_check_if_needed()
|
|
130
|
+
self._run_mcp_tool_check_if_needed()
|
|
127
131
|
|
|
128
132
|
generators = self._initialize_generators()
|
|
129
133
|
group_id = uuid.uuid4().hex
|
|
@@ -208,11 +212,26 @@ class ColumnWiseDatasetBuilder:
|
|
|
208
212
|
df = generator.generate(self.batch_manager.get_current_batch(as_dataframe=True))
|
|
209
213
|
self.batch_manager.update_records(df.to_dict(orient="records"))
|
|
210
214
|
|
|
211
|
-
def _run_model_health_check_if_needed(self) ->
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
215
|
+
def _run_model_health_check_if_needed(self) -> None:
    """Health-check every model alias referenced by the single-column configs.

    Aliases are collected from configs whose column type is model-generated and from
    ``CustomColumnConfig`` instances that declare ``model_aliases``. Nothing is
    checked when no alias is found.
    """
    aliases_to_check: set[str] = set()
    for column_config in self.single_column_configs:
        if column_type_is_model_generated(column_config.column_type):
            aliases_to_check.add(column_config.model_alias)
        if isinstance(column_config, CustomColumnConfig) and column_config.model_aliases:
            aliases_to_check.update(column_config.model_aliases)

    if not aliases_to_check:
        return
    self._resource_provider.model_registry.run_health_check(list(aliases_to_check))
|
|
225
|
+
|
|
226
|
+
def _run_mcp_tool_check_if_needed(self) -> None:
    """Health-check the MCP tool aliases used by LLM-generated columns.

    Collects the distinct truthy ``tool_alias`` values from
    ``llm_generated_column_configs`` and passes them, sorted, to the MCP registry's
    health check. Does nothing when no column declares a tool alias.

    Raises:
        DatasetGenerationError: If tool aliases are declared but the resource
            provider has no MCP registry.
    """
    requested: set = set()
    for column_config in self.llm_generated_column_configs:
        alias = getattr(column_config, "tool_alias", None)
        if alias:
            requested.add(alias)

    if not requested:
        return

    tool_aliases = sorted(requested)
    registry = self._resource_provider.mcp_registry
    if registry is None:
        raise DatasetGenerationError(f"Tool alias(es) {tool_aliases!r} specified but no MCPRegistry configured.")
    registry.run_health_check(tool_aliases)
|
|
216
235
|
|
|
217
236
|
def _fan_out_with_threads(self, generator: ColumnGeneratorWithModelRegistry, max_workers: int) -> None:
|
|
218
237
|
if generator.get_generation_strategy() != GenerationStrategy.CELL_BY_CELL:
|
|
@@ -221,16 +240,18 @@ class ColumnWiseDatasetBuilder:
|
|
|
221
240
|
"generator so concurrency through threads is not supported."
|
|
222
241
|
)
|
|
223
242
|
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
f"
|
|
243
|
+
progress_tracker = ProgressTracker(
|
|
244
|
+
total_records=self.batch_manager.num_records_batch,
|
|
245
|
+
label=f"{generator.config.column_type} column '{generator.config.name}'",
|
|
227
246
|
)
|
|
247
|
+
progress_tracker.log_start(max_workers)
|
|
248
|
+
|
|
228
249
|
settings = self._resource_provider.run_config
|
|
229
250
|
with ConcurrentThreadExecutor(
|
|
230
251
|
max_workers=max_workers,
|
|
231
252
|
column_name=generator.config.name,
|
|
232
|
-
result_callback=self.
|
|
233
|
-
error_callback=self.
|
|
253
|
+
result_callback=self._make_result_callback(progress_tracker),
|
|
254
|
+
error_callback=self._make_error_callback(progress_tracker),
|
|
234
255
|
shutdown_error_rate=settings.shutdown_error_rate,
|
|
235
256
|
shutdown_error_window=settings.shutdown_error_window,
|
|
236
257
|
disable_early_shutdown=settings.disable_early_shutdown,
|
|
@@ -238,10 +259,26 @@ class ColumnWiseDatasetBuilder:
|
|
|
238
259
|
for i, record in self.batch_manager.iter_current_batch():
|
|
239
260
|
executor.submit(lambda record: generator.generate(record), record, context={"index": i})
|
|
240
261
|
|
|
262
|
+
progress_tracker.log_final()
|
|
263
|
+
|
|
241
264
|
if len(self._records_to_drop) > 0:
|
|
242
265
|
self.batch_manager.drop_records(self._records_to_drop)
|
|
243
266
|
self._records_to_drop.clear()
|
|
244
267
|
|
|
268
|
+
def _make_result_callback(self, progress_tracker: ProgressTracker) -> Callable[[dict], None]:
    """Build a result callback that forwards to the worker handler, then records a success.

    Args:
        progress_tracker: Tracker whose ``record_success`` is called after each result
            has been handed to ``_worker_result_callback``.

    Returns:
        A callable taking the result dict and an optional keyword-only ``context``.
    """

    def _on_result(result: dict, *, context: dict | None = None) -> None:
        # Handle the result first; success is only recorded if the handler returns.
        self._worker_result_callback(result, context=context)
        progress_tracker.record_success()

    return _on_result
|
|
274
|
+
|
|
275
|
+
def _make_error_callback(self, progress_tracker: ProgressTracker) -> Callable[[Exception], None]:
    """Build an error callback that forwards to the worker handler, then records a failure.

    Args:
        progress_tracker: Tracker whose ``record_failure`` is called after each
            exception has been handed to ``_worker_error_callback``.

    Returns:
        A callable taking the exception and an optional keyword-only ``context``.
    """

    def _on_error(exc: Exception, *, context: dict | None = None) -> None:
        # Handle the error first; the failure is only recorded if the handler returns.
        self._worker_error_callback(exc, context=context)
        progress_tracker.record_failure()

    return _on_error
|
|
281
|
+
|
|
245
282
|
def _write_processed_batch(self, dataframe: pd.DataFrame) -> None:
|
|
246
283
|
self.batch_manager.update_records(dataframe.to_dict(orient="records"))
|
|
247
284
|
self.batch_manager.write()
|
|
@@ -8,8 +8,8 @@ from typing import TypeAlias
|
|
|
8
8
|
|
|
9
9
|
from pydantic import Field, field_validator
|
|
10
10
|
|
|
11
|
-
from data_designer.config.base import ConfigBase
|
|
12
|
-
from data_designer.config.column_configs import SamplerColumnConfig, SeedDatasetColumnConfig
|
|
11
|
+
from data_designer.config.base import ConfigBase, SingleColumnConfig
|
|
12
|
+
from data_designer.config.column_configs import SamplerColumnConfig, SeedDatasetColumnConfig
|
|
13
13
|
from data_designer.config.column_types import ColumnConfigT, DataDesignerColumnType
|
|
14
14
|
from data_designer.config.sampler_constraints import ColumnConstraintT
|
|
15
15
|
from data_designer.config.seed import SeedConfig
|