data-designer 0.1.1__tar.gz → 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_designer-0.1.1 → data_designer-0.1.2}/PKG-INFO +3 -6
- {data_designer-0.1.1 → data_designer-0.1.2}/README.md +2 -5
- {data_designer-0.1.1 → data_designer-0.1.2}/docs/index.md +2 -2
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/_version.py +2 -2
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/column_configs.py +29 -4
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/datastore.py +70 -34
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/default_model_settings.py +1 -1
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/sampler_params.py +16 -2
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/interface/data_designer.py +2 -2
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/test_columns.py +120 -1
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/test_datastore.py +28 -18
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/test_default_model_settings.py +2 -2
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/analysis/column_profilers/test_base.py +12 -4
- {data_designer-0.1.1 → data_designer-0.1.2}/.github/workflows/build-docs.yml +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/.github/workflows/ci.yml +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/.github/workflows/dco-assistant.yml +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/.github/workflows/pack-tutorials.yml +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/.github/workflows/semantic-pull-requests.yml +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/.gitignore +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/.pre-commit-config.yaml +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/AGENTS.md +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/CLAUDE.md +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/CODE_OF_CONDUCT.md +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/CONTRIBUTING.md +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/DCO +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/LICENSE +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/Makefile +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/VERSIONING.md +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/docs/CONTRIBUTING.md +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/docs/assets/palette-favicon.png +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/docs/code_reference/column_configs.md +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/docs/code_reference/config_builder.md +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/docs/code_reference/data_designer_config.md +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/docs/code_reference/sampler_params.md +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/docs/code_reference/validator_params.md +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/docs/concepts/columns.md +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/docs/concepts/person_sampling.md +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/docs/concepts/plugins.md +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/docs/concepts/validators.md +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/docs/css/mkdocstrings.css +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/docs/css/style.css +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/docs/installation.md +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/docs/js/toc-toggle.js +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/docs/models/configure-model-settings-with-the-cli.md +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/docs/models/default-model-settings.md +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/docs/models/model-configs.md +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/docs/models/model-providers.md +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/docs/notebooks/.gitignore +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/docs/notebooks/1-the-basics.ipynb +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/docs/notebooks/2-structured-outputs-and-jinja-expressions.ipynb +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/docs/notebooks/3-seeding-with-a-dataset.ipynb +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/docs/notebooks/README.md +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/docs/notebooks/pyproject.toml +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/docs/quick-start.md +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/mkdocs.yml +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/pyproject.toml +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/scripts/update_license_headers.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/__init__.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/README.md +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/__init__.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/commands/__init__.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/commands/list.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/commands/models.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/commands/providers.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/commands/reset.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/controllers/__init__.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/controllers/model_controller.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/controllers/provider_controller.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/forms/__init__.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/forms/builder.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/forms/field.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/forms/form.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/forms/model_builder.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/forms/provider_builder.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/main.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/repositories/__init__.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/repositories/base.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/repositories/model_repository.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/repositories/provider_repository.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/services/__init__.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/services/model_service.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/services/provider_service.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/ui.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/utils.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/__init__.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/analysis/column_profilers.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/analysis/column_statistics.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/analysis/dataset_profiler.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/analysis/utils/errors.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/analysis/utils/reporting.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/base.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/column_types.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/config_builder.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/data_designer_config.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/dataset_builders.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/errors.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/interface.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/models.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/preview_results.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/processors.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/sampler_constraints.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/seed.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/utils/code_lang.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/utils/constants.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/utils/errors.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/utils/info.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/utils/io_helpers.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/utils/misc.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/utils/numerical_helpers.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/utils/type_helpers.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/utils/validation.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/utils/visualization.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/validator_params.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/__init__.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/analysis/column_profilers/base.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/analysis/column_profilers/judge_score_profiler.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/analysis/column_profilers/registry.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/analysis/column_statistics.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/analysis/dataset_profiler.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/analysis/errors.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/analysis/utils/column_statistics_calculations.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/analysis/utils/judge_score_processing.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/column_generators/__init__.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/column_generators/generators/__init__.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/column_generators/generators/base.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/column_generators/generators/expression.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/column_generators/generators/llm_generators.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/column_generators/generators/samplers.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/column_generators/generators/seed_dataset.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/column_generators/generators/validation.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/column_generators/registry.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/column_generators/utils/errors.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/column_generators/utils/judge_score_factory.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/column_generators/utils/prompt_renderer.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/configurable_task.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/dataset_builders/artifact_storage.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/dataset_builders/column_wise_builder.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/dataset_builders/errors.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/dataset_builders/multi_column_configs.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/dataset_builders/utils/__init__.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/dataset_builders/utils/concurrency.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/dataset_builders/utils/config_compiler.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/dataset_builders/utils/dag.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/dataset_builders/utils/errors.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/errors.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/model_provider.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/models/__init__.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/models/errors.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/models/facade.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/models/litellm_overrides.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/models/parsers/__init__.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/models/parsers/errors.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/models/parsers/parser.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/models/parsers/postprocessors.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/models/parsers/tag_parsers.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/models/parsers/types.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/models/recipes/base.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/models/recipes/response_recipes.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/models/registry.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/models/usage.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/models/utils.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/processing/ginja/__init__.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/processing/ginja/ast.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/processing/ginja/environment.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/processing/ginja/exceptions.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/processing/ginja/record.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/processing/gsonschema/__init__.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/processing/gsonschema/exceptions.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/processing/gsonschema/schema_transformers.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/processing/gsonschema/types.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/processing/gsonschema/validators.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/processing/processors/base.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/processing/processors/drop_columns.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/processing/processors/registry.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/processing/utils.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/registry/base.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/registry/data_designer_registry.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/registry/errors.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/resources/managed_dataset_generator.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/resources/managed_dataset_repository.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/resources/managed_storage.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/resources/resource_provider.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/resources/seed_dataset_data_store.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/column.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/constraints.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/data_sources/base.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/data_sources/errors.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/data_sources/sources.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/entities/__init__.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/entities/email_address_utils.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/entities/errors.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/entities/national_id_utils.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/entities/person.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/entities/phone_number.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/errors.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/generator.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/jinja_utils.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/people_gen.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/person_constants.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/schema.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/schema_builder.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/utils.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/secret_resolver.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/validators/__init__.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/validators/base.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/validators/local_callable.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/validators/python.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/validators/remote.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/validators/sql.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/errors.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/essentials/__init__.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/interface/__init__.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/interface/errors.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/interface/results.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/logging.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/plugin_manager.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/plugins/__init__.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/plugins/errors.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/plugins/plugin.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/plugins/registry.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/cli/commands/test_list_command.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/cli/commands/test_models_command.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/cli/commands/test_providers_command.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/cli/commands/test_reset_command.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/cli/conftest.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/cli/controllers/test_model_controller.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/cli/controllers/test_provider_controller.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/cli/forms/test_field.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/cli/forms/test_form.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/cli/forms/test_model_builder.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/cli/forms/test_provider_builder.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/cli/repositories/test_model_repository.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/cli/repositories/test_provider_repository.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/cli/services/test_model_service.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/cli/services/test_provider_service.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/cli/test_cli_utils.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/analysis/conftest.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/analysis/test_column_statistics.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/analysis/test_dataset_profiler_results.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/analysis/utils/test_reporting.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/test_config_builder.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/test_data_designer_config.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/test_models.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/test_processors.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/test_sampler_constraints.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/test_sampler_params.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/test_seed.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/test_validator_params.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/utils/__init__.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/utils/test_code_lang.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/utils/test_info.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/utils/test_io_helpers.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/utils/test_misc.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/utils/test_type_helpers.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/utils/test_validation.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/utils/test_visualization.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/conftest.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/analysis/column_profilers/test_judge_score_profiler.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/analysis/conftest.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/analysis/test_column_statistics_calculator.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/analysis/test_data/artifacts/dataset/column_configs.json +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/analysis/test_data/artifacts/dataset/dataset.json +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/analysis/test_data/artifacts/dataset/metadata.json +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/analysis/test_dataset_profiler.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/analysis/test_errors.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/analysis/utils/test_column_statistics_calculations.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/analysis/utils/test_judge_score_processing.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/column_generators/generators/__init__.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/column_generators/generators/test_column_generator_base.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/column_generators/generators/test_expression.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/column_generators/generators/test_llm_generators.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/column_generators/generators/test_samplers.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/column_generators/generators/test_seed_dataset.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/column_generators/generators/test_validation.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/column_generators/test_registry.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/column_generators/utils/test_column_generator_errors.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/column_generators/utils/test_judge_score_factory.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/column_generators/utils/test_prompt_renderer.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/conftest.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/dataset_builders/test_artifact_storage.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/dataset_builders/test_column_wise_builder.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/dataset_builders/test_multi_column_configs.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/dataset_builders/utils/test_concurrency.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/dataset_builders/utils/test_config_compiler.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/dataset_builders/utils/test_dag.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/dataset_builders/utils/test_dataset_batch_manager.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/models/conftest.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/models/parsers/test_parser.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/models/parsers/test_parsers_types.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/models/parsers/test_postprocessors.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/models/parsers/test_tag_parsers.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/models/recipes/test_recipe_base.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/models/recipes/test_response_recipes.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/models/stub_secrets.json +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/models/test_facade.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/models/test_litellm_overrides.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/models/test_model_errors.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/models/test_model_registry.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/models/test_model_utils.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/models/test_usage.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/processing/__init__.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/processing/ginja/__init__.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/processing/ginja/test_ast.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/processing/ginja/test_environment.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/processing/ginja/test_exceptions.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/processing/ginja/test_record.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/processing/gsonschema/__init__.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/processing/gsonschema/test_exceptions.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/processing/gsonschema/test_schema_transformers.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/processing/gsonschema/test_types.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/processing/gsonschema/test_validators.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/processing/processors/__init__.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/processing/processors/test_drop_columns.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/processing/processors/test_registry.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/processing/test_utils.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/registry/__init__.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/registry/conftest.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/registry/test_base.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/registry/test_data_designer_registry.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/registry/test_errors.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/resources/__init__.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/resources/conftest.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/resources/test_managed_dataset_generator.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/resources/test_managed_dataset_repository.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/resources/test_managed_storage.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/resources/test_resource_provider.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/sampling_gen/conftest.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/sampling_gen/data_sources/test_sampler_errors.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/sampling_gen/data_sources/test_sources.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/sampling_gen/entities/test_email_address_utils.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/sampling_gen/entities/test_national_id_utils.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/sampling_gen/entities/test_person.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/sampling_gen/entities/test_phone_number.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/sampling_gen/test_column.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/sampling_gen/test_constraints.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/sampling_gen/test_generator.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/sampling_gen/test_jinja_utils.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/sampling_gen/test_people_gen.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/sampling_gen/test_schema.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/sampling_gen/test_utils.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/test_configurable_task.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/test_engine_errors.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/test_model_provider.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/test_secret_resolver.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/validators/test_local_callable.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/validators/test_python.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/validators/test_remote.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/validators/test_sql.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/essentials/test_init.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/interface/test_data_designer.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/interface/test_results.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/plugins/test_plugin.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/plugins/test_plugin_registry.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/test_logging.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/tests/test_plugin_manager.py +0 -0
- {data_designer-0.1.1 → data_designer-0.1.2}/uv.lock +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: data-designer
|
|
3
|
-
Version: 0.1.1
|
|
3
|
+
Version: 0.1.2
|
|
4
4
|
Summary: General framework for synthetic data generation
|
|
5
5
|
License-Expression: Apache-2.0
|
|
6
6
|
License-File: LICENSE
|
|
@@ -97,8 +97,7 @@ export NVIDIA_API_KEY="your-api-key-here"
|
|
|
97
97
|
export OPENAI_API_KEY="your-openai-api-key-here"
|
|
98
98
|
```
|
|
99
99
|
|
|
100
|
-
### 3.
|
|
101
|
-
|
|
100
|
+
### 3. Start generating data!
|
|
102
101
|
```python
|
|
103
102
|
from data_designer.essentials import (
|
|
104
103
|
CategorySamplerParams,
|
|
@@ -139,8 +138,6 @@ preview = data_designer.preview(config_builder=config_builder)
|
|
|
139
138
|
preview.display_sample_record()
|
|
140
139
|
```
|
|
141
140
|
|
|
142
|
-
**That's it!** You've created a dataset.
|
|
143
|
-
|
|
144
141
|
---
|
|
145
142
|
|
|
146
143
|
## What's next?
|
|
@@ -148,7 +145,7 @@ preview.display_sample_record()
|
|
|
148
145
|
### 📚 Learn more
|
|
149
146
|
|
|
150
147
|
- **[Quick Start Guide](https://nvidia-nemo.github.io/DataDesigner/quick-start/)** – Detailed walkthrough with more examples
|
|
151
|
-
- **[Tutorial Notebooks](https://nvidia-nemo.github.io/DataDesigner/notebooks/
|
|
148
|
+
- **[Tutorial Notebooks](https://nvidia-nemo.github.io/DataDesigner/notebooks/)** – Step-by-step interactive tutorials
|
|
152
149
|
- **[Column Types](https://nvidia-nemo.github.io/DataDesigner/concepts/columns/)** – Explore samplers, LLM columns, validators, and more
|
|
153
150
|
- **[Validators](https://nvidia-nemo.github.io/DataDesigner/concepts/validators/)** – Learn how to validate generated data with Python, SQL, and remote validators
|
|
154
151
|
- **[Model Configuration](https://nvidia-nemo.github.io/DataDesigner/models/model-configs/)** – Configure custom models and providers
|
|
@@ -48,8 +48,7 @@ export NVIDIA_API_KEY="your-api-key-here"
|
|
|
48
48
|
export OPENAI_API_KEY="your-openai-api-key-here"
|
|
49
49
|
```
|
|
50
50
|
|
|
51
|
-
### 3.
|
|
52
|
-
|
|
51
|
+
### 3. Start generating data!
|
|
53
52
|
```python
|
|
54
53
|
from data_designer.essentials import (
|
|
55
54
|
CategorySamplerParams,
|
|
@@ -90,8 +89,6 @@ preview = data_designer.preview(config_builder=config_builder)
|
|
|
90
89
|
preview.display_sample_record()
|
|
91
90
|
```
|
|
92
91
|
|
|
93
|
-
**That's it!** You've created a dataset.
|
|
94
|
-
|
|
95
92
|
---
|
|
96
93
|
|
|
97
94
|
## What's next?
|
|
@@ -99,7 +96,7 @@ preview.display_sample_record()
|
|
|
99
96
|
### 📚 Learn more
|
|
100
97
|
|
|
101
98
|
- **[Quick Start Guide](https://nvidia-nemo.github.io/DataDesigner/quick-start/)** – Detailed walkthrough with more examples
|
|
102
|
-
- **[Tutorial Notebooks](https://nvidia-nemo.github.io/DataDesigner/notebooks/
|
|
99
|
+
- **[Tutorial Notebooks](https://nvidia-nemo.github.io/DataDesigner/notebooks/)** – Step-by-step interactive tutorials
|
|
103
100
|
- **[Column Types](https://nvidia-nemo.github.io/DataDesigner/concepts/columns/)** – Explore samplers, LLM columns, validators, and more
|
|
104
101
|
- **[Validators](https://nvidia-nemo.github.io/DataDesigner/concepts/validators/)** – Learn how to validate generated data with Python, SQL, and remote validators
|
|
105
102
|
- **[Model Configuration](https://nvidia-nemo.github.io/DataDesigner/models/model-configs/)** – Configure custom models and providers
|
|
@@ -34,11 +34,11 @@ Data Designer helps you create datasets through an intuitive, **iterative** proc
|
|
|
34
34
|
3. **🔁 Preview** your results and iterate
|
|
35
35
|
- Generate a preview dataset stored in memory for fast iteration
|
|
36
36
|
- Inspect sample records and analysis results to refine your configuration
|
|
37
|
-
- Try for yourself by running the [tutorial notebooks](notebooks/
|
|
37
|
+
- Try for yourself by running the [tutorial notebooks](notebooks/README.md)
|
|
38
38
|
4. **🖼️ Create** your dataset
|
|
39
39
|
- Generate your full dataset and save results to disk
|
|
40
40
|
- Access the generated dataset and associated artifacts for downstream use
|
|
41
|
-
- Give it a try by running the [tutorial notebooks](notebooks/
|
|
41
|
+
- Give it a try by running the [tutorial notebooks](notebooks/README.md)
|
|
42
42
|
|
|
43
43
|
## Library and Microservice
|
|
44
44
|
|
|
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
|
28
28
|
commit_id: COMMIT_ID
|
|
29
29
|
__commit_id__: COMMIT_ID
|
|
30
30
|
|
|
31
|
-
__version__ = version = '0.1.1'
|
|
32
|
-
__version_tuple__ = version_tuple = (0, 1, 1)
|
|
31
|
+
__version__ = version = '0.1.2'
|
|
32
|
+
__version_tuple__ = version_tuple = (0, 1, 2)
|
|
33
33
|
|
|
34
34
|
__commit_id__ = commit_id = None
|
|
@@ -2,9 +2,9 @@
|
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
4
|
from abc import ABC
|
|
5
|
-
from typing import Literal, Optional, Type, Union
|
|
5
|
+
from typing import Annotated, Literal, Optional, Type, Union
|
|
6
6
|
|
|
7
|
-
from pydantic import BaseModel, Field, model_validator
|
|
7
|
+
from pydantic import BaseModel, Discriminator, Field, model_validator
|
|
8
8
|
from typing_extensions import Self
|
|
9
9
|
|
|
10
10
|
from .base import ConfigBase
|
|
@@ -89,11 +89,36 @@ class SamplerColumnConfig(SingleColumnConfig):
|
|
|
89
89
|
"""
|
|
90
90
|
|
|
91
91
|
sampler_type: SamplerType
|
|
92
|
-
params: SamplerParamsT
|
|
93
|
-
conditional_params: dict[str, SamplerParamsT] = {}
|
|
92
|
+
params: Annotated[SamplerParamsT, Discriminator("sampler_type")]
|
|
93
|
+
conditional_params: dict[str, Annotated[SamplerParamsT, Discriminator("sampler_type")]] = {}
|
|
94
94
|
convert_to: Optional[str] = None
|
|
95
95
|
column_type: Literal["sampler"] = "sampler"
|
|
96
96
|
|
|
97
|
+
@model_validator(mode="before")
|
|
98
|
+
@classmethod
|
|
99
|
+
def inject_sampler_type_into_params(cls, data: dict) -> dict:
|
|
100
|
+
"""Inject sampler_type into params dict to enable discriminated union resolution.
|
|
101
|
+
|
|
102
|
+
This allows users to pass params as a simple dict without the sampler_type field,
|
|
103
|
+
which will be automatically added based on the outer sampler_type field.
|
|
104
|
+
"""
|
|
105
|
+
if isinstance(data, dict):
|
|
106
|
+
sampler_type = data.get("sampler_type")
|
|
107
|
+
params = data.get("params")
|
|
108
|
+
|
|
109
|
+
# If params is a dict and doesn't have sampler_type, inject it
|
|
110
|
+
if sampler_type and isinstance(params, dict) and "sampler_type" not in params:
|
|
111
|
+
data["params"] = {"sampler_type": sampler_type, **params}
|
|
112
|
+
|
|
113
|
+
# Handle conditional_params similarly
|
|
114
|
+
conditional_params = data.get("conditional_params")
|
|
115
|
+
if conditional_params and isinstance(conditional_params, dict):
|
|
116
|
+
for condition, cond_params in conditional_params.items():
|
|
117
|
+
if isinstance(cond_params, dict) and "sampler_type" not in cond_params:
|
|
118
|
+
data["conditional_params"][condition] = {"sampler_type": sampler_type, **cond_params}
|
|
119
|
+
|
|
120
|
+
return data
|
|
121
|
+
|
|
97
122
|
|
|
98
123
|
class LLMTextColumnConfig(SingleColumnConfig):
|
|
99
124
|
"""Configuration for text generation columns using Large Language Models.
|
|
@@ -31,34 +31,37 @@ class DatastoreSettings(BaseModel):
|
|
|
31
31
|
token: Optional[str] = Field(default=None, description="If needed, token to use for authentication.")
|
|
32
32
|
|
|
33
33
|
|
|
34
|
-
def get_file_column_names(file_path: Union[str, Path], file_type: str) -> list[str]:
|
|
35
|
-
"""
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
raise InvalidFilePathError(f"🛑 No files found matching pattern: {str(file_path)!r}")
|
|
41
|
-
logger.debug(f"0️⃣ Using the first matching file in {str(file_path)!r} to determine column names in seed dataset")
|
|
42
|
-
file_path = matching_files[0]
|
|
34
|
+
def get_file_column_names(file_reference: Union[str, Path, HfFileSystem], file_type: str) -> list[str]:
|
|
35
|
+
"""Get column names from a dataset file.
|
|
36
|
+
|
|
37
|
+
Args:
|
|
38
|
+
file_reference: Path to the dataset file, or an HfFileSystem object.
|
|
39
|
+
file_type: Type of the dataset file. Must be one of: 'parquet', 'json', 'jsonl', 'csv'.
|
|
43
40
|
|
|
41
|
+
Raises:
|
|
42
|
+
InvalidFilePathError: If the file type is not supported.
|
|
43
|
+
|
|
44
|
+
Returns:
|
|
45
|
+
List of column names.
|
|
46
|
+
"""
|
|
44
47
|
if file_type == "parquet":
|
|
45
48
|
try:
|
|
46
|
-
schema = pq.read_schema(
|
|
49
|
+
schema = pq.read_schema(file_reference)
|
|
47
50
|
if hasattr(schema, "names"):
|
|
48
51
|
return schema.names
|
|
49
52
|
else:
|
|
50
53
|
return [field.name for field in schema]
|
|
51
54
|
except Exception as e:
|
|
52
|
-
logger.warning(f"Failed to process parquet file {
|
|
55
|
+
logger.warning(f"Failed to process parquet file {file_reference}: {e}")
|
|
53
56
|
return []
|
|
54
57
|
elif file_type in ["json", "jsonl"]:
|
|
55
|
-
return pd.read_json(
|
|
58
|
+
return pd.read_json(file_reference, orient="records", lines=True, nrows=1).columns.tolist()
|
|
56
59
|
elif file_type == "csv":
|
|
57
60
|
try:
|
|
58
|
-
df = pd.read_csv(
|
|
61
|
+
df = pd.read_csv(file_reference, nrows=1)
|
|
59
62
|
return df.columns.tolist()
|
|
60
63
|
except (pd.errors.EmptyDataError, pd.errors.ParserError) as e:
|
|
61
|
-
logger.warning(f"Failed to process CSV file {
|
|
64
|
+
logger.warning(f"Failed to process CSV file {file_reference}: {e}")
|
|
62
65
|
return []
|
|
63
66
|
else:
|
|
64
67
|
raise InvalidFilePathError(f"🛑 Unsupported file type: {file_type!r}")
|
|
@@ -66,12 +69,36 @@ def get_file_column_names(file_path: Union[str, Path], file_type: str) -> list[s
|
|
|
66
69
|
|
|
67
70
|
def fetch_seed_dataset_column_names(seed_dataset_reference: SeedDatasetReference) -> list[str]:
|
|
68
71
|
if hasattr(seed_dataset_reference, "datastore_settings"):
|
|
69
|
-
return
|
|
72
|
+
return fetch_seed_dataset_column_names_from_datastore(
|
|
70
73
|
seed_dataset_reference.repo_id,
|
|
71
74
|
seed_dataset_reference.filename,
|
|
72
75
|
seed_dataset_reference.datastore_settings,
|
|
73
76
|
)
|
|
74
|
-
return
|
|
77
|
+
return fetch_seed_dataset_column_names_from_local_file(seed_dataset_reference.dataset)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def fetch_seed_dataset_column_names_from_datastore(
|
|
81
|
+
repo_id: str,
|
|
82
|
+
filename: str,
|
|
83
|
+
datastore_settings: Optional[Union[DatastoreSettings, dict]] = None,
|
|
84
|
+
) -> list[str]:
|
|
85
|
+
file_type = filename.split(".")[-1]
|
|
86
|
+
if f".{file_type}" not in VALID_DATASET_FILE_EXTENSIONS:
|
|
87
|
+
raise InvalidFileFormatError(f"🛑 Unsupported file type: {filename!r}")
|
|
88
|
+
|
|
89
|
+
datastore_settings = resolve_datastore_settings(datastore_settings)
|
|
90
|
+
fs = HfFileSystem(endpoint=datastore_settings.endpoint, token=datastore_settings.token, skip_instance_cache=True)
|
|
91
|
+
|
|
92
|
+
file_path = _extract_single_file_path_from_glob_pattern_if_present(f"datasets/{repo_id}/{filename}", fs=fs)
|
|
93
|
+
|
|
94
|
+
with fs.open(file_path) as f:
|
|
95
|
+
return get_file_column_names(f, file_type)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def fetch_seed_dataset_column_names_from_local_file(dataset_path: str | Path) -> list[str]:
|
|
99
|
+
dataset_path = _validate_dataset_path(dataset_path, allow_glob_pattern=True)
|
|
100
|
+
dataset_path = _extract_single_file_path_from_glob_pattern_if_present(dataset_path)
|
|
101
|
+
return get_file_column_names(dataset_path, str(dataset_path).split(".")[-1])
|
|
75
102
|
|
|
76
103
|
|
|
77
104
|
def resolve_datastore_settings(datastore_settings: DatastoreSettings | dict | None) -> DatastoreSettings:
|
|
@@ -114,25 +141,34 @@ def upload_to_hf_hub(
|
|
|
114
141
|
return f"{repo_id}/{filename}"
|
|
115
142
|
|
|
116
143
|
|
|
117
|
-
def
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
file_type = filename.split(".")[-1]
|
|
123
|
-
if f".{file_type}" not in VALID_DATASET_FILE_EXTENSIONS:
|
|
124
|
-
raise InvalidFileFormatError(f"🛑 Unsupported file type: {filename!r}")
|
|
125
|
-
|
|
126
|
-
datastore_settings = resolve_datastore_settings(datastore_settings)
|
|
127
|
-
fs = HfFileSystem(endpoint=datastore_settings.endpoint, token=datastore_settings.token, skip_instance_cache=True)
|
|
128
|
-
|
|
129
|
-
with fs.open(f"datasets/{repo_id}/{filename}") as f:
|
|
130
|
-
return get_file_column_names(f, file_type)
|
|
131
|
-
|
|
144
|
+
def _extract_single_file_path_from_glob_pattern_if_present(
|
|
145
|
+
file_path: str | Path,
|
|
146
|
+
fs: HfFileSystem | None = None,
|
|
147
|
+
) -> Path:
|
|
148
|
+
file_path = Path(file_path)
|
|
132
149
|
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
150
|
+
# no glob pattern
|
|
151
|
+
if "*" not in str(file_path):
|
|
152
|
+
return file_path
|
|
153
|
+
|
|
154
|
+
# glob pattern with HfFileSystem
|
|
155
|
+
if fs is not None:
|
|
156
|
+
file_to_check = None
|
|
157
|
+
file_extension = file_path.name.split(".")[-1]
|
|
158
|
+
for file in fs.ls(str(file_path.parent)):
|
|
159
|
+
filename = file["name"]
|
|
160
|
+
if filename.endswith(f".{file_extension}"):
|
|
161
|
+
file_to_check = filename
|
|
162
|
+
if file_to_check is None:
|
|
163
|
+
raise InvalidFilePathError(f"🛑 No files found matching pattern: {str(file_path)!r}")
|
|
164
|
+
logger.debug(f"Using the first matching file in {str(file_path)!r} to determine column names in seed dataset")
|
|
165
|
+
return Path(file_to_check)
|
|
166
|
+
|
|
167
|
+
# glob pattern with local file system
|
|
168
|
+
if not (matching_files := sorted(file_path.parent.glob(file_path.name))):
|
|
169
|
+
raise InvalidFilePathError(f"🛑 No files found matching pattern: {str(file_path)!r}")
|
|
170
|
+
logger.debug(f"Using the first matching file in {str(file_path)!r} to determine column names in seed dataset")
|
|
171
|
+
return matching_files[0]
|
|
136
172
|
|
|
137
173
|
|
|
138
174
|
def _validate_dataset_path(dataset_path: Union[str, Path], allow_glob_pattern: bool = False) -> Path:
|
{data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/default_model_settings.py
RENAMED
|
@@ -78,7 +78,7 @@ def get_default_model_configs() -> list[ModelConfig]:
|
|
|
78
78
|
return []
|
|
79
79
|
|
|
80
80
|
|
|
81
|
-
def get_defaul_model_providers_missing_api_keys() -> list[str]:
|
|
81
|
+
def get_default_model_providers_missing_api_keys() -> list[str]:
|
|
82
82
|
missing_api_keys = []
|
|
83
83
|
for predefined_provider in PREDEFINED_PROVIDERS:
|
|
84
84
|
if os.environ.get(predefined_provider["api_key"]) is None:
|
|
@@ -66,6 +66,7 @@ class CategorySamplerParams(ConfigBase):
|
|
|
66
66
|
"Larger values will be sampled with higher probability."
|
|
67
67
|
),
|
|
68
68
|
)
|
|
69
|
+
sampler_type: Literal[SamplerType.CATEGORY] = SamplerType.CATEGORY
|
|
69
70
|
|
|
70
71
|
@model_validator(mode="after")
|
|
71
72
|
def _normalize_weights_if_needed(self) -> Self:
|
|
@@ -106,6 +107,7 @@ class DatetimeSamplerParams(ConfigBase):
|
|
|
106
107
|
default="D",
|
|
107
108
|
description="Sampling units, e.g. the smallest possible time interval between samples.",
|
|
108
109
|
)
|
|
110
|
+
sampler_type: Literal[SamplerType.DATETIME] = SamplerType.DATETIME
|
|
109
111
|
|
|
110
112
|
@field_validator("start", "end")
|
|
111
113
|
@classmethod
|
|
@@ -136,6 +138,7 @@ class SubcategorySamplerParams(ConfigBase):
|
|
|
136
138
|
...,
|
|
137
139
|
description="Mapping from each value of parent category to a list of subcategory values.",
|
|
138
140
|
)
|
|
141
|
+
sampler_type: Literal[SamplerType.SUBCATEGORY] = SamplerType.SUBCATEGORY
|
|
139
142
|
|
|
140
143
|
|
|
141
144
|
class TimeDeltaSamplerParams(ConfigBase):
|
|
@@ -187,6 +190,7 @@ class TimeDeltaSamplerParams(ConfigBase):
|
|
|
187
190
|
default="D",
|
|
188
191
|
description="Sampling units, e.g. the smallest possible time interval between samples.",
|
|
189
192
|
)
|
|
193
|
+
sampler_type: Literal[SamplerType.TIMEDELTA] = SamplerType.TIMEDELTA
|
|
190
194
|
|
|
191
195
|
@model_validator(mode="after")
|
|
192
196
|
def _validate_min_less_than_max(self) -> Self:
|
|
@@ -219,6 +223,7 @@ class UUIDSamplerParams(ConfigBase):
|
|
|
219
223
|
default=False,
|
|
220
224
|
description="If true, all letters in the UUID will be capitalized.",
|
|
221
225
|
)
|
|
226
|
+
sampler_type: Literal[SamplerType.UUID] = SamplerType.UUID
|
|
222
227
|
|
|
223
228
|
@property
|
|
224
229
|
def last_index(self) -> int:
|
|
@@ -257,6 +262,7 @@ class ScipySamplerParams(ConfigBase):
|
|
|
257
262
|
decimal_places: Optional[int] = Field(
|
|
258
263
|
default=None, description="Number of decimal places to round the sampled values to."
|
|
259
264
|
)
|
|
265
|
+
sampler_type: Literal[SamplerType.SCIPY] = SamplerType.SCIPY
|
|
260
266
|
|
|
261
267
|
|
|
262
268
|
class BinomialSamplerParams(ConfigBase):
|
|
@@ -273,6 +279,7 @@ class BinomialSamplerParams(ConfigBase):
|
|
|
273
279
|
|
|
274
280
|
n: int = Field(..., description="Number of trials.")
|
|
275
281
|
p: float = Field(..., description="Probability of success on each trial.", ge=0.0, le=1.0)
|
|
282
|
+
sampler_type: Literal[SamplerType.BINOMIAL] = SamplerType.BINOMIAL
|
|
276
283
|
|
|
277
284
|
|
|
278
285
|
class BernoulliSamplerParams(ConfigBase):
|
|
@@ -288,6 +295,7 @@ class BernoulliSamplerParams(ConfigBase):
|
|
|
288
295
|
"""
|
|
289
296
|
|
|
290
297
|
p: float = Field(..., description="Probability of success.", ge=0.0, le=1.0)
|
|
298
|
+
sampler_type: Literal[SamplerType.BERNOULLI] = SamplerType.BERNOULLI
|
|
291
299
|
|
|
292
300
|
|
|
293
301
|
class BernoulliMixtureSamplerParams(ConfigBase):
|
|
@@ -327,6 +335,7 @@ class BernoulliMixtureSamplerParams(ConfigBase):
|
|
|
327
335
|
...,
|
|
328
336
|
description="Parameters of the scipy.stats distribution given in `dist_name`.",
|
|
329
337
|
)
|
|
338
|
+
sampler_type: Literal[SamplerType.BERNOULLI_MIXTURE] = SamplerType.BERNOULLI_MIXTURE
|
|
330
339
|
|
|
331
340
|
|
|
332
341
|
class GaussianSamplerParams(ConfigBase):
|
|
@@ -350,6 +359,7 @@ class GaussianSamplerParams(ConfigBase):
|
|
|
350
359
|
decimal_places: Optional[int] = Field(
|
|
351
360
|
default=None, description="Number of decimal places to round the sampled values to."
|
|
352
361
|
)
|
|
362
|
+
sampler_type: Literal[SamplerType.GAUSSIAN] = SamplerType.GAUSSIAN
|
|
353
363
|
|
|
354
364
|
|
|
355
365
|
class PoissonSamplerParams(ConfigBase):
|
|
@@ -369,6 +379,7 @@ class PoissonSamplerParams(ConfigBase):
|
|
|
369
379
|
"""
|
|
370
380
|
|
|
371
381
|
mean: float = Field(..., description="Mean number of events in a fixed interval.")
|
|
382
|
+
sampler_type: Literal[SamplerType.POISSON] = SamplerType.POISSON
|
|
372
383
|
|
|
373
384
|
|
|
374
385
|
class UniformSamplerParams(ConfigBase):
|
|
@@ -390,6 +401,7 @@ class UniformSamplerParams(ConfigBase):
|
|
|
390
401
|
decimal_places: Optional[int] = Field(
|
|
391
402
|
default=None, description="Number of decimal places to round the sampled values to."
|
|
392
403
|
)
|
|
404
|
+
sampler_type: Literal[SamplerType.UNIFORM] = SamplerType.UNIFORM
|
|
393
405
|
|
|
394
406
|
|
|
395
407
|
#########################################
|
|
@@ -470,11 +482,12 @@ class PersonSamplerParams(ConfigBase):
|
|
|
470
482
|
default=False,
|
|
471
483
|
description="If True, then append synthetic persona columns to each generated person.",
|
|
472
484
|
)
|
|
485
|
+
sampler_type: Literal[SamplerType.PERSON] = SamplerType.PERSON
|
|
473
486
|
|
|
474
487
|
@property
|
|
475
488
|
def generator_kwargs(self) -> list[str]:
|
|
476
489
|
"""Keyword arguments to pass to the person generator."""
|
|
477
|
-
return [f for f in list(PersonSamplerParams.model_fields) if f
|
|
490
|
+
return [f for f in list(PersonSamplerParams.model_fields) if f not in ("locale", "sampler_type")]
|
|
478
491
|
|
|
479
492
|
@property
|
|
480
493
|
def people_gen_key(self) -> str:
|
|
@@ -533,11 +546,12 @@ class PersonFromFakerSamplerParams(ConfigBase):
|
|
|
533
546
|
min_length=2,
|
|
534
547
|
max_length=2,
|
|
535
548
|
)
|
|
549
|
+
sampler_type: Literal[SamplerType.PERSON_FROM_FAKER] = SamplerType.PERSON_FROM_FAKER
|
|
536
550
|
|
|
537
551
|
@property
|
|
538
552
|
def generator_kwargs(self) -> list[str]:
|
|
539
553
|
"""Keyword arguments to pass to the person generator."""
|
|
540
|
-
return [f for f in list(PersonFromFakerSamplerParams.model_fields) if f
|
|
554
|
+
return [f for f in list(PersonFromFakerSamplerParams.model_fields) if f not in ("locale", "sampler_type")]
|
|
541
555
|
|
|
542
556
|
@property
|
|
543
557
|
def people_gen_key(self) -> str:
|
|
@@ -9,8 +9,8 @@ import pandas as pd
|
|
|
9
9
|
from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
|
|
10
10
|
from data_designer.config.config_builder import DataDesignerConfigBuilder
|
|
11
11
|
from data_designer.config.default_model_settings import (
|
|
12
|
-
get_defaul_model_providers_missing_api_keys,
|
|
13
12
|
get_default_model_configs,
|
|
13
|
+
get_default_model_providers_missing_api_keys,
|
|
14
14
|
get_default_provider_name,
|
|
15
15
|
get_default_providers,
|
|
16
16
|
resolve_seed_default_model_settings,
|
|
@@ -313,7 +313,7 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
|
|
|
313
313
|
if model_providers is None:
|
|
314
314
|
if can_run_data_designer_locally():
|
|
315
315
|
model_providers = get_default_providers()
|
|
316
|
-
missing_api_keys = get_defaul_model_providers_missing_api_keys()
|
|
316
|
+
missing_api_keys = get_default_model_providers_missing_api_keys()
|
|
317
317
|
if len(missing_api_keys) == len(PREDEFINED_PROVIDERS):
|
|
318
318
|
logger.warning(
|
|
319
319
|
"🚨 You are trying to use a default model provider but your API keys are missing."
|
|
@@ -23,7 +23,15 @@ from data_designer.config.column_types import (
|
|
|
23
23
|
get_column_display_order,
|
|
24
24
|
)
|
|
25
25
|
from data_designer.config.errors import InvalidConfigError
|
|
26
|
-
from data_designer.config.sampler_params import
|
|
26
|
+
from data_designer.config.sampler_params import (
|
|
27
|
+
CategorySamplerParams,
|
|
28
|
+
GaussianSamplerParams,
|
|
29
|
+
PersonFromFakerSamplerParams,
|
|
30
|
+
PersonSamplerParams,
|
|
31
|
+
SamplerType,
|
|
32
|
+
UniformSamplerParams,
|
|
33
|
+
UUIDSamplerParams,
|
|
34
|
+
)
|
|
27
35
|
from data_designer.config.utils.code_lang import CodeLang
|
|
28
36
|
from data_designer.config.utils.errors import UserJinjaTemplateSyntaxError
|
|
29
37
|
from data_designer.config.validator_params import CodeValidatorParams
|
|
@@ -324,3 +332,114 @@ def test_get_column_config_from_kwargs():
|
|
|
324
332
|
),
|
|
325
333
|
SeedDatasetColumnConfig,
|
|
326
334
|
)
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
def test_sampler_column_config_discriminated_union_with_dict_params():
|
|
338
|
+
"""Test that sampler_type field is automatically injected into params dict."""
|
|
339
|
+
config = SamplerColumnConfig(
|
|
340
|
+
name="test_uniform",
|
|
341
|
+
sampler_type=SamplerType.UNIFORM,
|
|
342
|
+
params={"low": 0.0, "high": 1.0, "decimal_places": 2},
|
|
343
|
+
)
|
|
344
|
+
assert config.name == "test_uniform"
|
|
345
|
+
assert config.sampler_type == SamplerType.UNIFORM
|
|
346
|
+
assert isinstance(config.params, UniformSamplerParams)
|
|
347
|
+
assert config.params.sampler_type == SamplerType.UNIFORM
|
|
348
|
+
assert config.params.low == 0.0
|
|
349
|
+
assert config.params.high == 1.0
|
|
350
|
+
assert config.params.decimal_places == 2
|
|
351
|
+
|
|
352
|
+
|
|
353
|
+
def test_sampler_column_config_discriminated_union_with_explicit_sampler_type():
|
|
354
|
+
"""Test that explicit sampler_type in params dict is preserved."""
|
|
355
|
+
config = SamplerColumnConfig(
|
|
356
|
+
name="test_category",
|
|
357
|
+
sampler_type=SamplerType.CATEGORY,
|
|
358
|
+
params={"sampler_type": "category", "values": ["A", "B", "C"], "weights": [0.5, 0.3, 0.2]},
|
|
359
|
+
)
|
|
360
|
+
assert config.name == "test_category"
|
|
361
|
+
assert config.sampler_type == SamplerType.CATEGORY
|
|
362
|
+
assert isinstance(config.params, CategorySamplerParams)
|
|
363
|
+
assert config.params.sampler_type == SamplerType.CATEGORY
|
|
364
|
+
assert config.params.values == ["A", "B", "C"]
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
def test_sampler_column_config_discriminated_union_serialization():
|
|
368
|
+
"""Test that discriminated union works correctly with serialization/deserialization."""
|
|
369
|
+
config = SamplerColumnConfig(
|
|
370
|
+
name="test_person",
|
|
371
|
+
sampler_type=SamplerType.PERSON,
|
|
372
|
+
params={"locale": "en_US", "sex": "Female", "age_range": [25, 45]},
|
|
373
|
+
)
|
|
374
|
+
|
|
375
|
+
# Serialize
|
|
376
|
+
serialized = config.model_dump()
|
|
377
|
+
assert "sampler_type" in serialized["params"]
|
|
378
|
+
assert serialized["params"]["sampler_type"] == "person"
|
|
379
|
+
|
|
380
|
+
# Deserialize
|
|
381
|
+
deserialized = SamplerColumnConfig(**serialized)
|
|
382
|
+
assert isinstance(deserialized.params, PersonSamplerParams)
|
|
383
|
+
assert deserialized.params.locale == "en_US"
|
|
384
|
+
assert deserialized.params.sex == "Female"
|
|
385
|
+
assert deserialized.params.age_range == [25, 45]
|
|
386
|
+
|
|
387
|
+
|
|
388
|
+
def test_sampler_column_config_discriminated_union_person_vs_person_from_faker():
|
|
389
|
+
"""Test that discriminated union correctly distinguishes between person and person_from_faker."""
|
|
390
|
+
# Test person sampler (managed datasets)
|
|
391
|
+
person_config = SamplerColumnConfig(
|
|
392
|
+
name="test_person",
|
|
393
|
+
sampler_type=SamplerType.PERSON,
|
|
394
|
+
params={"locale": "en_US", "sex": "Male", "age_range": [30, 50]},
|
|
395
|
+
)
|
|
396
|
+
assert isinstance(person_config.params, PersonSamplerParams)
|
|
397
|
+
assert person_config.params.sampler_type == SamplerType.PERSON
|
|
398
|
+
assert person_config.params.locale == "en_US"
|
|
399
|
+
|
|
400
|
+
# Test person_from_faker sampler (Faker-based)
|
|
401
|
+
person_faker_config = SamplerColumnConfig(
|
|
402
|
+
name="test_person_faker",
|
|
403
|
+
sampler_type=SamplerType.PERSON_FROM_FAKER,
|
|
404
|
+
params={"locale": "en_GB", "sex": "Female", "age_range": [20, 40]},
|
|
405
|
+
)
|
|
406
|
+
assert isinstance(person_faker_config.params, PersonFromFakerSamplerParams)
|
|
407
|
+
assert person_faker_config.params.sampler_type == SamplerType.PERSON_FROM_FAKER
|
|
408
|
+
assert person_faker_config.params.locale == "en_GB"
|
|
409
|
+
|
|
410
|
+
# Verify they are different types
|
|
411
|
+
assert type(person_config.params) is not type(person_faker_config.params)
|
|
412
|
+
assert isinstance(person_config.params, PersonSamplerParams)
|
|
413
|
+
assert isinstance(person_faker_config.params, PersonFromFakerSamplerParams)
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
def test_sampler_column_config_discriminated_union_with_conditional_params():
|
|
417
|
+
"""Test that sampler_type is injected into conditional_params as well."""
|
|
418
|
+
config = SamplerColumnConfig(
|
|
419
|
+
name="test_gaussian",
|
|
420
|
+
sampler_type=SamplerType.GAUSSIAN,
|
|
421
|
+
params={"mean": 0.0, "stddev": 1.0},
|
|
422
|
+
conditional_params={"age > 21": {"mean": 5.0, "stddev": 2.0}},
|
|
423
|
+
)
|
|
424
|
+
|
|
425
|
+
assert isinstance(config.params, GaussianSamplerParams)
|
|
426
|
+
assert config.params.mean == 0.0
|
|
427
|
+
assert config.params.stddev == 1.0
|
|
428
|
+
|
|
429
|
+
# Check conditional params
|
|
430
|
+
assert "age > 21" in config.conditional_params
|
|
431
|
+
cond_param = config.conditional_params["age > 21"]
|
|
432
|
+
assert isinstance(cond_param, GaussianSamplerParams)
|
|
433
|
+
assert cond_param.sampler_type == SamplerType.GAUSSIAN
|
|
434
|
+
assert cond_param.mean == 5.0
|
|
435
|
+
assert cond_param.stddev == 2.0
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
def test_sampler_column_config_discriminated_union_wrong_params_type():
|
|
439
|
+
"""Test that discriminated union rejects params that don't match the sampler_type."""
|
|
440
|
+
with pytest.raises(ValidationError):
|
|
441
|
+
SamplerColumnConfig(
|
|
442
|
+
name="test_wrong_params",
|
|
443
|
+
sampler_type=SamplerType.UNIFORM,
|
|
444
|
+
params={"values": ["A", "B"]}, # Category params for uniform sampler
|
|
445
|
+
)
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
-
from pathlib import Path
|
|
5
4
|
from unittest.mock import MagicMock, patch
|
|
6
5
|
|
|
7
6
|
import numpy as np
|
|
@@ -13,6 +12,7 @@ import pytest
|
|
|
13
12
|
from data_designer.config.datastore import (
|
|
14
13
|
DatastoreSettings,
|
|
15
14
|
fetch_seed_dataset_column_names,
|
|
15
|
+
fetch_seed_dataset_column_names_from_local_file,
|
|
16
16
|
get_file_column_names,
|
|
17
17
|
resolve_datastore_settings,
|
|
18
18
|
upload_to_hf_hub,
|
|
@@ -127,22 +127,6 @@ def test_get_file_column_names_unicode(tmp_path, file_type):
|
|
|
127
127
|
assert get_file_column_names(str(unicode_path), file_type) == df_unicode.columns.tolist()
|
|
128
128
|
|
|
129
129
|
|
|
130
|
-
@pytest.mark.parametrize("file_type", ["parquet", "csv", "json", "jsonl"])
|
|
131
|
-
def test_get_file_column_names_with_glob_pattern(tmp_path, file_type):
|
|
132
|
-
df = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]})
|
|
133
|
-
for i in range(5):
|
|
134
|
-
_write_file(df, tmp_path / f"{i}.{file_type}", file_type)
|
|
135
|
-
assert get_file_column_names(f"{tmp_path}/*.{file_type}", file_type) == ["col1", "col2"]
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
def test_get_file_column_names_with_glob_pattern_error(tmp_path):
|
|
139
|
-
df = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]})
|
|
140
|
-
for i in range(5):
|
|
141
|
-
_write_file(df, tmp_path / f"{i}.parquet", "parquet")
|
|
142
|
-
with pytest.raises(InvalidFilePathError, match="No files found matching pattern"):
|
|
143
|
-
get_file_column_names(f"{tmp_path}/*.csv", "csv")
|
|
144
|
-
|
|
145
|
-
|
|
146
130
|
def test_get_file_column_names_with_filesystem_parquet():
|
|
147
131
|
"""Test get_file_column_names with filesystem parameter for parquet files."""
|
|
148
132
|
mock_schema = MagicMock()
|
|
@@ -153,7 +137,7 @@ def test_get_file_column_names_with_filesystem_parquet():
|
|
|
153
137
|
result = get_file_column_names("datasets/test/file.parquet", "parquet")
|
|
154
138
|
|
|
155
139
|
assert result == ["col1", "col2", "col3"]
|
|
156
|
-
mock_read_schema.assert_called_once_with(
|
|
140
|
+
mock_read_schema.assert_called_once_with("datasets/test/file.parquet")
|
|
157
141
|
|
|
158
142
|
|
|
159
143
|
@pytest.mark.parametrize("file_type", ["json", "jsonl", "csv"])
|
|
@@ -274,3 +258,29 @@ def test_upload_to_hf_hub_error_handling(datastore_settings):
|
|
|
274
258
|
with patch("data_designer.config.datastore.Path.is_file", autospec=True) as mock_is_file:
|
|
275
259
|
mock_is_file.return_value = True
|
|
276
260
|
upload_to_hf_hub("test.text", "test.txt", "test/repo", datastore_settings)
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
@pytest.mark.parametrize("file_type", ["parquet", "json", "jsonl", "csv"])
|
|
264
|
+
def test_fetch_seed_dataset_column_names_from_local_file_with_glob(tmp_path, file_type):
|
|
265
|
+
"""Test fetch_seed_dataset_column_names_from_local_file with glob pattern matching multiple files."""
|
|
266
|
+
test_data = pd.DataFrame({"col1": [1, 2], "col2": [3, 4], "col3": [5, 6]})
|
|
267
|
+
|
|
268
|
+
# Create multiple files with the same schema
|
|
269
|
+
for i in range(3):
|
|
270
|
+
file_path = tmp_path / f"data_{i}.{file_type}"
|
|
271
|
+
_write_file(test_data, file_path, file_type)
|
|
272
|
+
|
|
273
|
+
# Test glob pattern that matches all files
|
|
274
|
+
glob_pattern = str(tmp_path / f"*.{file_type}")
|
|
275
|
+
result = fetch_seed_dataset_column_names_from_local_file(glob_pattern)
|
|
276
|
+
|
|
277
|
+
assert result == ["col1", "col2", "col3"]
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
@pytest.mark.parametrize("file_type", ["parquet", "csv"])
|
|
281
|
+
def test_fetch_seed_dataset_column_names_from_local_file_with_glob_no_matches(tmp_path, file_type):
|
|
282
|
+
"""Test fetch_seed_dataset_column_names_from_local_file with glob pattern that matches no files."""
|
|
283
|
+
glob_pattern = str(tmp_path / f"nonexistent_*.{file_type}")
|
|
284
|
+
|
|
285
|
+
with pytest.raises(InvalidFilePathError, match="does not contain files of type"):
|
|
286
|
+
fetch_seed_dataset_column_names_from_local_file(glob_pattern)
|
|
@@ -11,9 +11,9 @@ import yaml
|
|
|
11
11
|
from data_designer.config.default_model_settings import (
|
|
12
12
|
get_builtin_model_configs,
|
|
13
13
|
get_builtin_model_providers,
|
|
14
|
-
get_defaul_model_providers_missing_api_keys,
|
|
15
14
|
get_default_inference_parameters,
|
|
16
15
|
get_default_model_configs,
|
|
16
|
+
get_default_model_providers_missing_api_keys,
|
|
17
17
|
get_default_provider_name,
|
|
18
18
|
get_default_providers,
|
|
19
19
|
resolve_seed_default_model_settings,
|
|
@@ -152,4 +152,4 @@ def test_resolve_seed_default_model_settings(tmp_path: Path):
|
|
|
152
152
|
@patch("data_designer.config.default_model_settings.os.environ.get")
|
|
153
153
|
def test_get_default_model_providers_missing_api_keys(mock_environ_get):
|
|
154
154
|
mock_environ_get.return_value = None
|
|
155
|
-
assert get_defaul_model_providers_missing_api_keys() == ["NVIDIA_API_KEY", "OPENAI_API_KEY"]
|
|
155
|
+
assert get_default_model_providers_missing_api_keys() == ["NVIDIA_API_KEY", "OPENAI_API_KEY"]
|