data-designer-engine 0.5.2__tar.gz → 0.5.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/.gitignore +6 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/PKG-INFO +4 -3
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/pyproject.toml +2 -1
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/column_generators/generators/base.py +68 -8
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/column_generators/generators/custom.py +60 -3
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/column_generators/generators/embedding.py +12 -2
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/column_generators/generators/image.py +20 -23
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/column_generators/generators/seed_dataset.py +5 -1
- data_designer_engine-0.5.3/src/data_designer/engine/dataset_builders/utils/completion_tracker.py +230 -0
- data_designer_engine-0.5.3/src/data_designer/engine/dataset_builders/utils/execution_graph.py +260 -0
- data_designer_engine-0.5.3/src/data_designer/engine/dataset_builders/utils/task_model.py +61 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/mcp/facade.py +59 -229
- data_designer_engine-0.5.3/src/data_designer/engine/models/clients/__init__.py +46 -0
- data_designer_engine-0.5.3/src/data_designer/engine/models/clients/adapters/__init__.py +8 -0
- data_designer_engine-0.5.3/src/data_designer/engine/models/clients/adapters/litellm_bridge.py +217 -0
- data_designer_engine-0.5.3/src/data_designer/engine/models/clients/base.py +47 -0
- data_designer_engine-0.5.3/src/data_designer/engine/models/clients/errors.py +229 -0
- data_designer_engine-0.5.3/src/data_designer/engine/models/clients/factory.py +50 -0
- data_designer_engine-0.5.3/src/data_designer/engine/models/clients/parsing.py +335 -0
- data_designer_engine-0.5.3/src/data_designer/engine/models/clients/types.py +158 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/models/errors.py +109 -6
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/models/facade.py +348 -485
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/models/factory.py +8 -2
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/models/litellm_overrides.py +21 -1
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/models/registry.py +29 -7
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/processing/gsonschema/validators.py +44 -1
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/storage/artifact_storage.py +21 -10
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/testing/__init__.py +2 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/testing/fixtures.py +38 -32
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/testing/stubs.py +62 -33
- data_designer_engine-0.5.3/tests/engine/column_generators/generators/test_async_generators.py +424 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/column_generators/generators/test_seed_dataset.py +2 -2
- data_designer_engine-0.5.3/tests/engine/dataset_builders/utils/test_completion_tracker.py +348 -0
- data_designer_engine-0.5.3/tests/engine/dataset_builders/utils/test_execution_graph.py +450 -0
- data_designer_engine-0.5.3/tests/engine/dataset_builders/utils/test_task_model.py +89 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/mcp/test_mcp_facade.py +74 -171
- data_designer_engine-0.5.3/tests/engine/models/clients/conftest.py +20 -0
- data_designer_engine-0.5.3/tests/engine/models/clients/test_client_errors.py +253 -0
- data_designer_engine-0.5.3/tests/engine/models/clients/test_litellm_bridge.py +428 -0
- data_designer_engine-0.5.3/tests/engine/models/clients/test_parsing.py +213 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/models/conftest.py +15 -1
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/models/test_facade.py +251 -487
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/models/test_litellm_overrides.py +43 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/processing/gsonschema/test_validators.py +75 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/storage/test_artifact_storage.py +55 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/README.md +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/__init__.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/analysis/column_profilers/base.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/analysis/column_profilers/judge_score_profiler.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/analysis/column_profilers/registry.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/analysis/column_statistics.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/analysis/dataset_profiler.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/analysis/errors.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/analysis/utils/column_statistics_calculations.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/analysis/utils/judge_score_processing.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/column_generators/__init__.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/column_generators/generators/__init__.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/column_generators/generators/expression.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/column_generators/generators/llm_completion.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/column_generators/generators/samplers.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/column_generators/generators/validation.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/column_generators/registry.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/column_generators/utils/errors.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/column_generators/utils/generator_classification.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/column_generators/utils/judge_score_factory.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/column_generators/utils/prompt_renderer.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/compiler.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/configurable_task.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/dataset_builders/column_wise_builder.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/dataset_builders/errors.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/dataset_builders/multi_column_configs.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/dataset_builders/utils/__init__.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/dataset_builders/utils/async_concurrency.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/dataset_builders/utils/concurrency.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/dataset_builders/utils/config_compiler.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/dataset_builders/utils/dag.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/dataset_builders/utils/errors.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/dataset_builders/utils/processor_runner.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/dataset_builders/utils/progress_tracker.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/errors.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/mcp/__init__.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/mcp/errors.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/mcp/factory.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/mcp/io.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/mcp/registry.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/model_provider.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/models/__init__.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/models/parsers/__init__.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/models/parsers/errors.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/models/parsers/parser.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/models/parsers/postprocessors.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/models/parsers/tag_parsers.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/models/parsers/types.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/models/recipes/base.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/models/recipes/response_recipes.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/models/telemetry.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/models/usage.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/models/utils.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/processing/ginja/__init__.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/processing/ginja/ast.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/processing/ginja/environment.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/processing/ginja/exceptions.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/processing/ginja/record.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/processing/gsonschema/__init__.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/processing/gsonschema/exceptions.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/processing/gsonschema/schema_transformers.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/processing/gsonschema/types.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/processing/processors/base.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/processing/processors/drop_columns.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/processing/processors/registry.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/processing/processors/schema_transform.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/processing/utils.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/registry/base.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/registry/data_designer_registry.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/registry/errors.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/resources/managed_dataset_generator.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/resources/managed_dataset_repository.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/resources/managed_storage.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/resources/resource_provider.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/resources/seed_reader.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/column.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/constraints.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/data_sources/base.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/data_sources/errors.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/data_sources/sources.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/entities/__init__.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/entities/email_address_utils.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/entities/errors.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/entities/national_id_utils.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/entities/person.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/entities/phone_number.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/errors.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/generator.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/jinja_utils.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/people_gen.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/person_constants.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/schema.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/schema_builder.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/utils.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/secret_resolver.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/storage/__init__.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/storage/media_storage.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/testing/utils.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/validation.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/validators/__init__.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/validators/base.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/validators/local_callable.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/validators/python.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/validators/remote.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/validators/sql.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/conftest.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/analysis/column_profilers/test_base.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/analysis/column_profilers/test_judge_score_profiler.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/analysis/conftest.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/analysis/test_column_statistics_calculator.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/analysis/test_data/artifacts/dataset/column_configs.json +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/analysis/test_data/artifacts/dataset/dataset.json +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/analysis/test_data/artifacts/dataset/metadata.json +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/analysis/test_dataset_profiler.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/analysis/test_errors.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/analysis/utils/test_column_statistics_calculations.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/analysis/utils/test_judge_score_processing.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/column_generators/generators/__init__.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/column_generators/generators/test_column_generator_base.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/column_generators/generators/test_custom.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/column_generators/generators/test_embedding.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/column_generators/generators/test_expression.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/column_generators/generators/test_image.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/column_generators/generators/test_llm_completion_generators.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/column_generators/generators/test_samplers.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/column_generators/generators/test_validation.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/column_generators/test_registry.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/column_generators/utils/test_column_generator_errors.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/column_generators/utils/test_generator_classification.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/column_generators/utils/test_judge_score_factory.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/column_generators/utils/test_prompt_renderer.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/conftest.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/dataset_builders/test_column_wise_builder.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/dataset_builders/test_multi_column_configs.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/dataset_builders/utils/test_async_concurrency.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/dataset_builders/utils/test_concurrency.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/dataset_builders/utils/test_config_compiler.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/dataset_builders/utils/test_dag.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/dataset_builders/utils/test_dataset_batch_manager.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/dataset_builders/utils/test_progress_tracker.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/mcp/conftest.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/mcp/test_mcp_factory.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/mcp/test_mcp_io.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/mcp/test_mcp_registry.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/models/parsers/test_parser.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/models/parsers/test_parsers_types.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/models/parsers/test_postprocessors.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/models/parsers/test_tag_parsers.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/models/recipes/test_recipe_base.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/models/recipes/test_response_recipes.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/models/stub_secrets.json +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/models/test_async_engine_switch.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/models/test_model_errors.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/models/test_model_registry.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/models/test_model_utils.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/models/test_usage.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/processing/__init__.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/processing/ginja/__init__.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/processing/ginja/test_ast.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/processing/ginja/test_environment.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/processing/ginja/test_exceptions.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/processing/ginja/test_record.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/processing/gsonschema/__init__.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/processing/gsonschema/test_exceptions.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/processing/gsonschema/test_schema_transformers.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/processing/gsonschema/test_types.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/processing/processors/__init__.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/processing/processors/test_drop_columns.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/processing/processors/test_registry.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/processing/processors/test_schema_transform.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/processing/test_utils.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/registry/__init__.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/registry/conftest.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/registry/test_base.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/registry/test_data_designer_registry.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/registry/test_errors.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/resources/__init__.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/resources/conftest.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/resources/test_managed_dataset_generator.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/resources/test_managed_dataset_repository.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/resources/test_managed_storage.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/resources/test_resource_provider.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/resources/test_seed_reader.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/sampling_gen/conftest.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/sampling_gen/data_sources/test_sampler_errors.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/sampling_gen/data_sources/test_sources.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/sampling_gen/entities/test_email_address_utils.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/sampling_gen/entities/test_national_id_utils.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/sampling_gen/entities/test_person.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/sampling_gen/entities/test_phone_number.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/sampling_gen/test_column.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/sampling_gen/test_constraints.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/sampling_gen/test_generator.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/sampling_gen/test_jinja_utils.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/sampling_gen/test_people_gen.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/sampling_gen/test_schema.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/sampling_gen/test_utils.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/storage/__init__.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/storage/test_media_storage.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/test_compiler.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/test_configurable_task.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/test_dataset_metadata.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/test_engine_errors.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/test_model_provider.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/test_secret_resolver.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/test_validation.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/validators/test_local_callable.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/validators/test_python.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/validators/test_remote.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/validators/test_sql.py +0 -0
- {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/test_plugin_manager.py +0 -0
|
@@ -99,8 +99,14 @@ NOTEPAD.md
|
|
|
99
99
|
# Build-time copy of README for data-designer package (copied from top-level during build)
|
|
100
100
|
packages/data-designer/README.md
|
|
101
101
|
|
|
102
|
+
# Notebook build cache
|
|
103
|
+
.notebook-cache/
|
|
104
|
+
|
|
102
105
|
# Cerebro knowledge base
|
|
103
106
|
.cerebro/
|
|
104
107
|
.cursor/rules/cerebro.mdc
|
|
105
108
|
.cursor/mcp.json
|
|
106
109
|
.claude/rules/cerebro.md
|
|
110
|
+
|
|
111
|
+
# Claude worktrees
|
|
112
|
+
.claude/worktrees/
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: data-designer-engine
|
|
3
|
-
Version: 0.5.2
|
|
3
|
+
Version: 0.5.3
|
|
4
4
|
Summary: Generation engine for DataDesigner synthetic data generation
|
|
5
5
|
License-Expression: Apache-2.0
|
|
6
6
|
Classifier: Development Status :: 4 - Beta
|
|
@@ -14,8 +14,9 @@ Classifier: Programming Language :: Python :: 3.13
|
|
|
14
14
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
15
15
|
Requires-Python: >=3.10
|
|
16
16
|
Requires-Dist: anyascii<1,>=0.3.3
|
|
17
|
-
Requires-Dist: data-designer-config==0.5.2
|
|
18
|
-
Requires-Dist:
|
|
17
|
+
Requires-Dist: chardet<6,>=3.0.2
|
|
18
|
+
Requires-Dist: data-designer-config==0.5.3
|
|
19
|
+
Requires-Dist: duckdb<2,>=1.5.0
|
|
19
20
|
Requires-Dist: faker<21,>=20.1.0
|
|
20
21
|
Requires-Dist: httpx-retries<1,>=0.4.2
|
|
21
22
|
Requires-Dist: httpx<1,>=0.27.2
|
|
@@ -33,8 +33,9 @@ bump = true
|
|
|
33
33
|
[tool.hatch.metadata.hooks.uv-dynamic-versioning]
|
|
34
34
|
dependencies = [
|
|
35
35
|
"anyascii>=0.3.3,<1",
|
|
36
|
+
"chardet>=3.0.2,<6", # Pulled in by sqlfluff; pin <6 to avoid RequestsDependencyWarning from requests<2.33
|
|
36
37
|
"data-designer-config=={{ version }}",
|
|
37
|
-
"duckdb>=1.
|
|
38
|
+
"duckdb>=1.5.0,<2",
|
|
38
39
|
"faker>=20.1.0,<21",
|
|
39
40
|
"httpx>=0.27.2,<1",
|
|
40
41
|
"httpx-retries>=0.4.2,<1",
|
|
@@ -4,15 +4,20 @@
|
|
|
4
4
|
from __future__ import annotations
|
|
5
5
|
|
|
6
6
|
import asyncio
|
|
7
|
+
import concurrent.futures
|
|
7
8
|
import functools
|
|
8
9
|
import logging
|
|
9
10
|
from abc import ABC, abstractmethod
|
|
10
|
-
from typing import TYPE_CHECKING, Any, overload
|
|
11
|
+
from typing import TYPE_CHECKING, Any, Coroutine, TypeVar, overload
|
|
11
12
|
|
|
12
13
|
from data_designer.config.column_configs import GenerationStrategy
|
|
13
14
|
from data_designer.engine.configurable_task import ConfigurableTask, DataT, TaskConfigT
|
|
14
15
|
from data_designer.logging import LOG_DOUBLE_INDENT, LOG_INDENT
|
|
15
16
|
|
|
17
|
+
_T = TypeVar("_T")
|
|
18
|
+
|
|
19
|
+
_SYNC_BRIDGE_TIMEOUT = 300
|
|
20
|
+
|
|
16
21
|
if TYPE_CHECKING:
|
|
17
22
|
import pandas as pd
|
|
18
23
|
|
|
@@ -23,33 +28,84 @@ if TYPE_CHECKING:
|
|
|
23
28
|
logger = logging.getLogger(__name__)
|
|
24
29
|
|
|
25
30
|
|
|
31
|
+
def _run_coroutine_sync(coro: Coroutine[Any, Any, _T]) -> _T:
    """Run an async coroutine to completion from synchronous code.

    - No running event loop in this thread → ``asyncio.run(coro)`` directly.
    - Running event loop (e.g. notebook/service) → ``asyncio.run(coro)`` on a
      single-worker background thread, waiting at most ``_SYNC_BRIDGE_TIMEOUT``
      seconds for it to finish.

    Args:
        coro: The coroutine object to execute.

    Returns:
        The value the coroutine returns.

    Raises:
        TimeoutError: If the background thread has not finished after
            ``_SYNC_BRIDGE_TIMEOUT`` seconds. The thread is not interrupted and
            keeps running.
        Exception: Anything the coroutine itself raises is propagated unchanged,
            including a ``TimeoutError`` raised inside the coroutine.
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread, so we may start one here.
        return asyncio.run(coro)

    # A loop is already running in this thread; asyncio.run() would refuse to
    # nest, so drive the coroutine on its own loop in a helper thread.
    pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    future = pool.submit(asyncio.run, coro)
    timed_out = False
    try:
        return future.result(timeout=_SYNC_BRIDGE_TIMEOUT)
    except concurrent.futures.TimeoutError as exc:
        if future.done():
            # Since Python 3.11 concurrent.futures.TimeoutError *is* the builtin
            # TimeoutError, so a TimeoutError raised by the coroutine also lands
            # here. The work has finished, so this is not a bridge timeout:
            # surface the coroutine's own outcome instead of relabelling it.
            return future.result()
        timed_out = True
        logger.warning(f"⚠️ Sync bridge timed out after {_SYNC_BRIDGE_TIMEOUT}s; background thread still running")
        raise TimeoutError(f"_run_coroutine_sync timed out after {_SYNC_BRIDGE_TIMEOUT}s") from exc
    finally:
        # After a timeout we must not block on the still-running worker.
        pool.shutdown(wait=not timed_out, cancel_futures=timed_out)
|
|
53
|
+
|
|
54
|
+
|
|
26
55
|
class ColumnGenerator(ConfigurableTask[TaskConfigT], ABC):
|
|
27
56
|
@property
def can_generate_from_scratch(self) -> bool:
    """Report whether this generator can generate from scratch.

    The base implementation always returns ``False``.
    """
    return False
|
|
30
59
|
|
|
60
|
+
@property
def is_order_dependent(self) -> bool:
    """Whether this generator's output depends on prior row-group calls.

    Example: SeedDatasetColumnGenerator tracks its position in the seed
    dataset, so row group N must complete before N+1 starts.

    Returns:
        ``False`` in this base implementation.
    """
    return False
|
|
68
|
+
|
|
69
|
+
def _is_overridden(self, method_name: str) -> bool:
    """Tell whether the instance's class replaces ``ColumnGenerator``'s ``method_name``.

    The attribute looked up on ``type(self)`` is compared by identity with the
    one defined on ``ColumnGenerator``; any difference counts as an override.
    """
    own_impl = getattr(type(self), method_name)
    base_impl = getattr(ColumnGenerator, method_name)
    return own_impl is not base_impl
|
|
72
|
+
|
|
31
73
|
@staticmethod
@abstractmethod
def get_generation_strategy() -> GenerationStrategy:
    """Return the ``GenerationStrategy`` of this generator (abstract: subclasses must implement it)."""
    ...
|
|
34
76
|
|
|
35
77
|
@overload
def generate(self, data: dict) -> dict: ...

@overload
def generate(self, data: pd.DataFrame) -> pd.DataFrame: ...

def generate(self, data: DataT) -> DataT:
    """Synchronous entry point; most concrete generators override it.

    This default exists for async-first subclasses that define only
    ``agenerate()``: it runs that coroutine to completion through
    ``_run_coroutine_sync``. If ``agenerate()`` has not been overridden either,
    there is nothing to delegate to and ``NotImplementedError`` is raised.
    """
    if self._is_overridden("agenerate"):
        return _run_coroutine_sync(self.agenerate(data))
    raise NotImplementedError(f"{type(self).__name__} must implement either generate() or agenerate()")
|
|
45
93
|
|
|
46
|
-
|
|
47
|
-
|
|
94
|
+
@overload
async def agenerate(self, data: dict) -> dict: ...

@overload
async def agenerate(self, data: pd.DataFrame) -> pd.DataFrame: ...

async def agenerate(self, data: DataT) -> DataT:
    """Asynchronous entry point that falls back to the sync ``generate()``.

    The default hands a copy of ``data`` to ``generate()`` on a worker thread via
    ``asyncio.to_thread``. Subclasses with native async support (e.g.
    ColumnGeneratorWithModelChatCompletion) should override this with a direct
    async implementation. If ``generate()`` has not been overridden either,
    ``NotImplementedError`` is raised.
    """
    if self._is_overridden("generate"):
        sync_impl = self.generate
        snapshot = data.copy()
        return await asyncio.to_thread(sync_impl, snapshot)
    raise NotImplementedError(f"{type(self).__name__} must implement either generate() or agenerate()")
|
|
53
109
|
|
|
54
110
|
def log_pre_generation(self) -> None:
|
|
55
111
|
"""A shared method to log info before the generator's `generate` method is called.
|
|
@@ -68,6 +124,10 @@ class FromScratchColumnGenerator(ColumnGenerator[TaskConfigT], ABC):
|
|
|
68
124
|
@abstractmethod
|
|
69
125
|
def generate_from_scratch(self, num_records: int) -> pd.DataFrame: ...
|
|
70
126
|
|
|
127
|
+
async def agenerate_from_scratch(self, num_records: int) -> pd.DataFrame:
    """Await the sync ``generate_from_scratch()`` by running it on a worker thread."""
    sync_impl = self.generate_from_scratch
    return await asyncio.to_thread(sync_impl, num_records)
|
|
130
|
+
|
|
71
131
|
|
|
72
132
|
class ColumnGeneratorWithModelRegistry(ColumnGenerator[TaskConfigT], ABC):
|
|
73
133
|
@property
|
|
@@ -5,6 +5,7 @@
|
|
|
5
5
|
|
|
6
6
|
from __future__ import annotations
|
|
7
7
|
|
|
8
|
+
import asyncio
|
|
8
9
|
import inspect
|
|
9
10
|
import logging
|
|
10
11
|
from typing import TYPE_CHECKING, Any
|
|
@@ -65,12 +66,57 @@ class CustomColumnGenerator(ColumnGenerator[CustomColumnConfig]):
|
|
|
65
66
|
|
|
66
67
|
return self._generate(data, is_dataframe)
|
|
67
68
|
|
|
69
|
+
async def agenerate(self, data: dict | pd.DataFrame) -> dict | pd.DataFrame | list[dict]:
    """Async generate — branches on strategy and detects coroutine functions.

    * ``FULL_COLUMN`` strategy, or a synchronous user function: the sync
      ``generate()`` runs on a worker thread with a copy of ``data``, so the
      caller's object is never mutated from another thread.
    * Cell-by-cell with an ``async def`` user function: the function is awaited
      directly and its result goes through ``_postprocess_result``.

    Args:
        data: The input record (dict) or DataFrame.

    Returns:
        The result of ``generate()`` for the threaded paths, or the
        post-processed result of the awaited user function.

    Raises:
        CustomColumnGenerationError: If required columns are missing or the
            user function raises on the native-async path.
    """
    if self.config.generation_strategy == GenerationStrategy.FULL_COLUMN:
        return await asyncio.to_thread(self.generate, data.copy())

    # The @custom_column_generator decorator wraps the user function in a sync
    # wrapper, so we must unwrap to detect async functions.
    fn_unwrapped = inspect.unwrap(self.config.generator_function)
    # inspect.iscoroutinefunction replaces asyncio.iscoroutinefunction, which is
    # deprecated as of Python 3.14.
    if not inspect.iscoroutinefunction(fn_unwrapped):
        # Copy like the full-column branch and the base class do, so the worker
        # thread does not share the caller's dict.
        return await asyncio.to_thread(self.generate, data.copy())

    missing = set(self.config.required_columns) - set(data.keys())
    if missing:
        raise CustomColumnGenerationError(
            f"Missing required columns for custom generator '{self.config.name}': {sorted(missing)}"
        )
    # Snapshot the keys before the user function runs; _postprocess_result
    # receives it as keys_before.
    keys_before = set(data.keys())

    try:
        result = await self._ainvoke_generator_function(data)
    except CustomColumnGenerationError:
        raise
    except Exception as e:
        logger.warning(
            f"⚠️ Custom generator function {self.config.generator_function.__name__!r} "
            f"failed for column '{self.config.name}'. This record will be skipped.\n{e}"
        )
        raise CustomColumnGenerationError(
            f"Custom generator function failed for column '{self.config.name}': {e}"
        ) from e

    return self._postprocess_result(result, is_dataframe=False, keys_before=keys_before)
|
|
100
|
+
|
|
101
|
+
async def _ainvoke_generator_function(self, data: dict) -> dict | pd.DataFrame:
|
|
102
|
+
"""Invoke an async user generator function with appropriate arguments.
|
|
103
|
+
|
|
104
|
+
The @custom_column_generator decorator's sync wrapper returns a coroutine
|
|
105
|
+
when the original function is async, so we await the wrapper's return value.
|
|
106
|
+
"""
|
|
107
|
+
params = self._get_validated_params()
|
|
108
|
+
fn = self.config.generator_function
|
|
109
|
+
if len(params) == 1:
|
|
110
|
+
return await fn(data)
|
|
111
|
+
elif len(params) == 2:
|
|
112
|
+
return await fn(data, self.config.generator_params)
|
|
113
|
+
else:
|
|
114
|
+
models = self._build_models_dict()
|
|
115
|
+
return await fn(data, self.config.generator_params, models)
|
|
116
|
+
|
|
68
117
|
def _generate(self, data: dict | pd.DataFrame, is_dataframe: bool) -> dict | pd.DataFrame | list[dict]:
|
|
69
118
|
"""Unified generation logic for both strategies."""
|
|
70
|
-
# Get columns/keys using unified accessor
|
|
71
119
|
get_keys = (lambda d: set(d.columns)) if is_dataframe else (lambda d: set(d.keys()))
|
|
72
|
-
expected_type = lazy.pd.DataFrame if is_dataframe else dict
|
|
73
|
-
type_name = "DataFrame" if is_dataframe else "dict"
|
|
74
120
|
|
|
75
121
|
# Check required columns
|
|
76
122
|
missing = set(self.config.required_columns) - get_keys(data)
|
|
@@ -96,6 +142,15 @@ class CustomColumnGenerator(ColumnGenerator[CustomColumnConfig]):
|
|
|
96
142
|
f"Custom generator function failed for column '{self.config.name}': {e}"
|
|
97
143
|
) from e
|
|
98
144
|
|
|
145
|
+
return self._postprocess_result(result, is_dataframe, keys_before)
|
|
146
|
+
|
|
147
|
+
def _postprocess_result(
|
|
148
|
+
self,
|
|
149
|
+
result: dict | pd.DataFrame | list[dict],
|
|
150
|
+
is_dataframe: bool,
|
|
151
|
+
keys_before: set[str],
|
|
152
|
+
) -> dict | pd.DataFrame | list[dict]:
|
|
153
|
+
"""Validate type and output columns of a generation result."""
|
|
99
154
|
# Cell-by-cell with allow_resize: accept dict or list[dict]
|
|
100
155
|
if not is_dataframe and self.config.allow_resize:
|
|
101
156
|
if isinstance(result, dict):
|
|
@@ -113,6 +168,8 @@ class CustomColumnGenerator(ColumnGenerator[CustomColumnConfig]):
|
|
|
113
168
|
)
|
|
114
169
|
|
|
115
170
|
# Validate return type for non-resize paths
|
|
171
|
+
expected_type = lazy.pd.DataFrame if is_dataframe else dict
|
|
172
|
+
type_name = "DataFrame" if is_dataframe else "dict"
|
|
116
173
|
if not isinstance(result, expected_type):
|
|
117
174
|
raise CustomColumnGenerationError(
|
|
118
175
|
f"Custom generator for column '{self.config.name}' must return a {type_name}, "
|
|
@@ -27,9 +27,19 @@ class EmbeddingCellGenerator(ColumnGeneratorWithModel[EmbeddingColumnConfig]):
|
|
|
27
27
|
def get_generation_strategy() -> GenerationStrategy:
|
|
28
28
|
return GenerationStrategy.CELL_BY_CELL
|
|
29
29
|
|
|
30
|
-
def
|
|
30
|
+
def _prepare_embedding_inputs(self, data: dict) -> list[str]:
    """Deserialize the record's JSON values and parse the target column into the texts to embed."""
    target_value = deserialize_json_values(data)[self.config.target_column]
    return parse_list_string(target_value)
|
|
33
|
+
|
|
34
|
+
def generate(self, data: dict) -> dict:
    """Embed the target column's texts synchronously and store the JSON-dumped result under the column name."""
    texts = self._prepare_embedding_inputs(data)
    vectors = self.model.generate_text_embeddings(input_texts=texts)
    payload = EmbeddingGenerationResult(embeddings=vectors)
    data[self.config.name] = payload.model_dump(mode="json")
    return data
|
|
39
|
+
|
|
40
|
+
async def agenerate(self, data: dict) -> dict:
    """Async counterpart of ``generate()`` that awaits ``model.agenerate_text_embeddings``."""
    texts = self._prepare_embedding_inputs(data)
    vectors = await self.model.agenerate_text_embeddings(input_texts=texts)
    payload = EmbeddingGenerationResult(embeddings=vectors)
    data[self.config.name] = payload.model_dump(mode="json")
    return data
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
|
|
4
4
|
from __future__ import annotations
|
|
5
5
|
|
|
6
|
+
import asyncio
|
|
6
7
|
from typing import TYPE_CHECKING
|
|
7
8
|
|
|
8
9
|
from data_designer.config.column_configs import ImageColumnConfig
|
|
@@ -31,46 +32,42 @@ class ImageCellGenerator(WithJinja2UserTemplateRendering, ColumnGeneratorWithMod
|
|
|
31
32
|
def get_generation_strategy() -> GenerationStrategy:
|
|
32
33
|
return GenerationStrategy.CELL_BY_CELL
|
|
33
34
|
|
|
34
|
-
def
|
|
35
|
-
"""
|
|
36
|
-
|
|
37
|
-
Args:
|
|
38
|
-
data: Record data
|
|
39
|
-
|
|
40
|
-
Returns:
|
|
41
|
-
Record with image path(s) (create mode) or base64 data (preview mode) added
|
|
42
|
-
"""
|
|
35
|
+
def _prepare_image_inputs(self, data: dict) -> tuple[str, list[dict] | None]:
    """Check required columns, render the prompt, and build the multi-modal context.

    Returns:
        ``(prompt, multi_modal_context)`` for the image model call.

    Raises:
        ValueError: If a required column is missing from ``data`` or the
            rendered prompt is empty or whitespace-only.
    """
    record = deserialize_json_values(data)

    absent = list(set(self.config.required_columns) - set(data.keys()))
    if absent:
        raise ValueError(
            f"There was an error preparing the Jinja2 expression template. "
            f"The following columns {absent} are missing!"
        )

    self.prepare_jinja2_template_renderer(self.config.prompt, list(record.keys()))
    rendered = self.render_template(record)
    if not (rendered and rendered.strip()):
        raise ValueError(f"Rendered prompt for column {self.config.name!r} is empty")

    return rendered, self._build_multi_modal_context(record)
|
|
64
50
|
|
|
65
|
-
|
|
51
|
+
def generate(self, data: dict) -> dict:
    """Generate image(s) synchronously and record what media storage returns for each.

    Every base64 image from the model is passed to
    ``media_storage.save_base64_image`` with the column name as subfolder; the
    list of returned values is stored in ``data`` under the column name.
    """
    prompt, multi_modal_context = self._prepare_image_inputs(data)
    stored = []
    for encoded in self.model.generate_image(prompt=prompt, multi_modal_context=multi_modal_context):
        stored.append(self.media_storage.save_base64_image(encoded, subfolder_name=self.config.name))
    data[self.config.name] = stored
    return data
|
|
75
61
|
|
|
62
|
+
async def agenerate(self, data: dict) -> dict:
    """Async counterpart of ``generate()`` that awaits ``model.agenerate_image``.

    The ``media_storage.save_base64_image`` calls run together on a worker
    thread via ``asyncio.to_thread``.
    """
    prompt, multi_modal_context = self._prepare_image_inputs(data)
    base64_images = await self.model.agenerate_image(prompt=prompt, multi_modal_context=multi_modal_context)

    def _store_all() -> list:
        return [
            self.media_storage.save_base64_image(encoded, subfolder_name=self.config.name)
            for encoded in base64_images
        ]

    data[self.config.name] = await asyncio.to_thread(_store_all)
    return data
|
|
@@ -29,6 +29,10 @@ class SeedDatasetColumnGenerator(FromScratchColumnGenerator[SeedDatasetMultiColu
|
|
|
29
29
|
def get_generation_strategy() -> GenerationStrategy:
|
|
30
30
|
return GenerationStrategy.FULL_COLUMN
|
|
31
31
|
|
|
32
|
+
@property
def is_order_dependent(self) -> bool:
    """Always ``True``: this generator reports that its output depends on prior row-group calls."""
    return True
|
|
35
|
+
|
|
32
36
|
@property
def num_records_sampled(self) -> int:
    """Read-only view of the ``_num_records_sampled`` counter."""
    return self._num_records_sampled
|
|
@@ -102,7 +106,7 @@ class SeedDatasetColumnGenerator(FromScratchColumnGenerator[SeedDatasetMultiColu
|
|
|
102
106
|
read_query = f"SELECT * FROM ({read_query}){shuffle_query}"
|
|
103
107
|
else:
|
|
104
108
|
read_query = f"SELECT * FROM '{self._dataset_uri}'{shuffle_query}"
|
|
105
|
-
self._batch_reader = self.duckdb_conn.query(read_query).
|
|
109
|
+
self._batch_reader = self.duckdb_conn.query(read_query).to_arrow_reader(batch_size=num_records)
|
|
106
110
|
|
|
107
111
|
def _sample_records(self, num_records: int) -> pd.DataFrame:
|
|
108
112
|
logger.info(f"🌱 Sampling {num_records} records from seed dataset")
|
data_designer_engine-0.5.3/src/data_designer/engine/dataset_builders/utils/completion_tracker.py
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
from collections import defaultdict
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
8
|
+
|
|
9
|
+
from data_designer.config.column_configs import GenerationStrategy
|
|
10
|
+
from data_designer.engine.dataset_builders.utils.task_model import SliceRef, Task
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from data_designer.engine.dataset_builders.utils.execution_graph import ExecutionGraph
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class CompletionTracker:
    """Record of finished work, keyed by (column, row_group, row_index).

    Row indices are 0-based and local to their row group.

    A tracker built through ``with_graph`` also maintains a frontier of
    dispatchable tasks, so ``get_ready_tasks`` costs O(frontier) instead of a
    scan over all columns x rows x row groups.
    """

    def __init__(self) -> None:
        # row_group → column → local row indices that are done
        self._completed: dict[int, dict[str, set[int]]] = defaultdict(lambda: defaultdict(set))
        # row_group → local row indices that were dropped
        self._dropped: dict[int, set[int]] = defaultdict(set)

        # Graph-enabled state; only populated by ``with_graph``.
        self._graph: ExecutionGraph | None = None
        self._row_group_sizes: dict[int, int] = {}
        # row_group → columns finished through ``mark_row_range_complete``
        self._batch_complete: dict[int, set[str]] = defaultdict(set)
        self._frontier: set[Task] = set()

    @classmethod
    def with_graph(cls, graph: ExecutionGraph, row_groups: list[tuple[int, int]]) -> CompletionTracker:
        """Build a frontier-enabled tracker from a graph and ``(row_group_id, size)`` pairs."""
        tracker = cls()
        tracker._graph = graph
        tracker._row_group_sizes = dict(row_groups)
        tracker._seed_frontier()
        return tracker

    def mark_cell_complete(self, column: str, row_group: int, row_index: int) -> None:
        """Record one finished cell of a cell-by-cell column and enqueue what it unblocks."""
        self._validate_row_group(row_group)
        self._validate_strategy(column, GenerationStrategy.CELL_BY_CELL, "mark_cell_complete")
        self._completed[row_group][column].add(row_index)
        if self._graph is None:
            return
        finished = Task(column=column, row_group=row_group, row_index=row_index, task_type="cell")
        self._frontier.discard(finished)
        self._enqueue_downstream(column, row_group, row_index=row_index)

    def mark_row_range_complete(self, column: str, row_group: int, row_group_size: int) -> None:
        """Record a full-column batch as done for every row of the row group.

        Raises:
            ValueError: In graph-enabled mode, if the row group is unknown, the
                column is not FULL_COLUMN, or ``row_group_size`` differs from
                the registered size.
        """
        expected = self._validate_row_group(row_group)
        self._validate_strategy(column, GenerationStrategy.FULL_COLUMN, "mark_row_range_complete")
        if expected is not None and row_group_size != expected:
            raise ValueError(f"Row-group size mismatch for rg={row_group}: got {row_group_size}, expected {expected}")
        self._completed[row_group][column] = set(range(row_group_size))
        self._batch_complete[row_group].add(column)
        if self._graph is None:
            return
        finished = Task(column=column, row_group=row_group, row_index=None, task_type="batch")
        self._frontier.discard(finished)
        self._enqueue_downstream(column, row_group, row_index=None)

    def is_complete(self, ref: SliceRef) -> bool:
        """Whether the single cell addressed by ``ref`` has been completed."""
        columns_done = self._completed.get(ref.row_group, {})
        return ref.row_index in columns_done.get(ref.column, set())

    def is_all_complete(self, cells: list[SliceRef]) -> bool:
        """Whether every referenced cell is done.

        A ref whose ``row_index`` is ``None`` stands for the whole batch of that
        column, which must have been completed via ``mark_row_range_complete``.
        """
        for ref in cells:
            if ref.row_index is None:
                done = ref.column in self._batch_complete.get(ref.row_group, set())
            else:
                done = self.is_complete(ref)
            if not done:
                return False
        return True

    def drop_row(self, row_group: int, row_index: int) -> None:
        """Mark a row as dropped and, in graph mode, update the frontier accordingly."""
        self._validate_row_group(row_group)
        self._dropped[row_group].add(row_index)
        if self._graph is None:
            return
        # Cell tasks of the dropped row are no longer dispatchable.
        self._frontier.difference_update(
            Task(column=col, row_group=row_group, row_index=row_index, task_type="cell")
            for col in self._graph.columns
        )
        # Fewer live rows may mean a batch task's upstreams are now complete.
        self._reevaluate_batch_tasks(row_group)

    def is_dropped(self, row_group: int, row_index: int) -> bool:
        """Whether ``row_index`` of ``row_group`` has been dropped."""
        return row_index in self._dropped.get(row_group, set())

    def is_row_group_complete(
        self,
        row_group: int,
        row_group_size: int,
        all_columns: list[str],
    ) -> bool:
        """Whether every non-dropped row has every listed column done."""
        dropped = self._dropped.get(row_group, set())
        completed = self._completed.get(row_group, {})
        live_rows = [ri for ri in range(row_group_size) if ri not in dropped]
        return all(ri in completed.get(col, set()) for ri in live_rows for col in all_columns)

    def get_ready_tasks(self, dispatched: set[Task]) -> list[Task]:
        """List the frontier tasks that are not already dispatched/in flight."""
        return [task for task in self._frontier if task not in dispatched]

    def _require_graph(self) -> ExecutionGraph:
        """Return the execution graph, raising ``RuntimeError`` when none is set."""
        if self._graph is None:
            raise RuntimeError("This method requires a graph to be set.")
        return self._graph

    def _seed_frontier(self) -> None:
        """Fill the frontier with the tasks of root columns (no upstream deps)."""
        graph = self._require_graph()
        for col in graph.get_root_columns():
            strategy = graph.get_strategy(col)
            for rg_id, rg_size in self._row_group_sizes.items():
                if strategy == GenerationStrategy.CELL_BY_CELL:
                    self._frontier.update(
                        Task(column=col, row_group=rg_id, row_index=ri, task_type="cell") for ri in range(rg_size)
                    )
                else:
                    self._frontier.add(Task(column=col, row_group=rg_id, row_index=None, task_type="batch"))

    def _enqueue_downstream(self, column: str, row_group: int, row_index: int | None) -> None:
        """Put downstream tasks that just became ready onto the frontier.

        ``row_index`` is the completed cell's row, or ``None`` when a whole
        batch completed.
        """
        graph = self._require_graph()
        rg_completed = self._completed.get(row_group, {})
        rg_dropped = self._dropped.get(row_group, set())
        rg_batch_complete = self._batch_complete.get(row_group, set())
        rg_size = self._row_group_sizes[row_group]

        for down in graph.get_downstream_columns(column):
            batch_ups, cell_ups = graph.split_upstream_by_strategy(down)

            # Every full-column upstream must have finished its batch first.
            if not all(up in rg_batch_complete for up in batch_ups):
                continue

            if graph.get_strategy(down) != GenerationStrategy.CELL_BY_CELL:
                # FULL_COLUMN downstream: ready once all cell upstreams are fully complete
                if down not in rg_batch_complete and self._are_cell_ups_complete(
                    cell_ups, rg_completed, rg_size, rg_dropped
                ):
                    self._frontier.add(Task(column=down, row_group=row_group, row_index=None, task_type="batch"))
                continue

            cell_up_completed = [rg_completed.get(up, set()) for up in cell_ups]
            down_completed = rg_completed.get(down, set())
            # A cell completion can only unblock its own row; a batch
            # completion may unblock any row of the group.
            candidate_rows = range(rg_size) if row_index is None else (row_index,)
            for ri in candidate_rows:
                if ri in rg_dropped or ri in down_completed:
                    continue
                if all(ri in done for done in cell_up_completed):
                    self._frontier.add(Task(column=down, row_group=row_group, row_index=ri, task_type="cell"))

    def _reevaluate_batch_tasks(self, row_group: int) -> None:
        """After a row drop, add any batch task whose upstreams are now complete."""
        graph = self._require_graph()
        rg_completed = self._completed.get(row_group, {})
        rg_dropped = self._dropped.get(row_group, set())
        rg_batch_complete = self._batch_complete.get(row_group, set())
        rg_size = self._row_group_sizes[row_group]

        for col in graph.get_topological_order():
            if graph.get_strategy(col) != GenerationStrategy.FULL_COLUMN or col in rg_batch_complete:
                continue
            batch_ups, cell_ups = graph.split_upstream_by_strategy(col)
            blocked = any(up not in rg_batch_complete for up in batch_ups)
            if not blocked and self._are_cell_ups_complete(cell_ups, rg_completed, rg_size, rg_dropped):
                self._frontier.add(Task(column=col, row_group=row_group, row_index=None, task_type="batch"))

    def _are_cell_ups_complete(
        self,
        cell_ups: list[str],
        rg_completed: dict[str, set[int]],
        rg_size: int,
        rg_dropped: set[int],
    ) -> bool:
        """Whether each cell-by-cell upstream column has all non-dropped rows complete."""
        live_rows = [ri for ri in range(rg_size) if ri not in rg_dropped]
        return all(rg_completed.get(up, set()).issuperset(live_rows) for up in cell_ups)

    def _validate_strategy(self, column: str, expected: GenerationStrategy, method: str) -> None:
        """In graph-enabled mode, raise ``ValueError`` unless *column* has the expected strategy."""
        if self._graph is None:
            return
        actual = self._graph.get_strategy(column)
        if actual == expected:
            return
        raise ValueError(f"{method}() requires {expected.value} strategy, but column '{column}' has {actual.value}")

    def _validate_row_group(self, row_group: int) -> int | None:
        """In graph-enabled mode, return the registered size of *row_group* or raise ``ValueError``.

        Returns ``None`` without checking anything when no graph is set.
        """
        if self._graph is None:
            return None
        size = self._row_group_sizes.get(row_group)
        if size is not None:
            return size
        known = sorted(self._row_group_sizes)
        raise ValueError(f"Unknown row_group {row_group}. Known row_groups: {known}")
|