data-designer 0.1.1__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (359):
  1. {data_designer-0.1.1 → data_designer-0.1.2}/PKG-INFO +3 -6
  2. {data_designer-0.1.1 → data_designer-0.1.2}/README.md +2 -5
  3. {data_designer-0.1.1 → data_designer-0.1.2}/docs/index.md +2 -2
  4. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/_version.py +2 -2
  5. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/column_configs.py +29 -4
  6. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/datastore.py +70 -34
  7. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/default_model_settings.py +1 -1
  8. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/sampler_params.py +16 -2
  9. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/interface/data_designer.py +2 -2
  10. {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/test_columns.py +120 -1
  11. {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/test_datastore.py +28 -18
  12. {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/test_default_model_settings.py +2 -2
  13. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/analysis/column_profilers/test_base.py +12 -4
  14. {data_designer-0.1.1 → data_designer-0.1.2}/.github/workflows/build-docs.yml +0 -0
  15. {data_designer-0.1.1 → data_designer-0.1.2}/.github/workflows/ci.yml +0 -0
  16. {data_designer-0.1.1 → data_designer-0.1.2}/.github/workflows/dco-assistant.yml +0 -0
  17. {data_designer-0.1.1 → data_designer-0.1.2}/.github/workflows/pack-tutorials.yml +0 -0
  18. {data_designer-0.1.1 → data_designer-0.1.2}/.github/workflows/semantic-pull-requests.yml +0 -0
  19. {data_designer-0.1.1 → data_designer-0.1.2}/.gitignore +0 -0
  20. {data_designer-0.1.1 → data_designer-0.1.2}/.pre-commit-config.yaml +0 -0
  21. {data_designer-0.1.1 → data_designer-0.1.2}/AGENTS.md +0 -0
  22. {data_designer-0.1.1 → data_designer-0.1.2}/CLAUDE.md +0 -0
  23. {data_designer-0.1.1 → data_designer-0.1.2}/CODE_OF_CONDUCT.md +0 -0
  24. {data_designer-0.1.1 → data_designer-0.1.2}/CONTRIBUTING.md +0 -0
  25. {data_designer-0.1.1 → data_designer-0.1.2}/DCO +0 -0
  26. {data_designer-0.1.1 → data_designer-0.1.2}/LICENSE +0 -0
  27. {data_designer-0.1.1 → data_designer-0.1.2}/Makefile +0 -0
  28. {data_designer-0.1.1 → data_designer-0.1.2}/VERSIONING.md +0 -0
  29. {data_designer-0.1.1 → data_designer-0.1.2}/docs/CONTRIBUTING.md +0 -0
  30. {data_designer-0.1.1 → data_designer-0.1.2}/docs/assets/palette-favicon.png +0 -0
  31. {data_designer-0.1.1 → data_designer-0.1.2}/docs/code_reference/column_configs.md +0 -0
  32. {data_designer-0.1.1 → data_designer-0.1.2}/docs/code_reference/config_builder.md +0 -0
  33. {data_designer-0.1.1 → data_designer-0.1.2}/docs/code_reference/data_designer_config.md +0 -0
  34. {data_designer-0.1.1 → data_designer-0.1.2}/docs/code_reference/sampler_params.md +0 -0
  35. {data_designer-0.1.1 → data_designer-0.1.2}/docs/code_reference/validator_params.md +0 -0
  36. {data_designer-0.1.1 → data_designer-0.1.2}/docs/concepts/columns.md +0 -0
  37. {data_designer-0.1.1 → data_designer-0.1.2}/docs/concepts/person_sampling.md +0 -0
  38. {data_designer-0.1.1 → data_designer-0.1.2}/docs/concepts/plugins.md +0 -0
  39. {data_designer-0.1.1 → data_designer-0.1.2}/docs/concepts/validators.md +0 -0
  40. {data_designer-0.1.1 → data_designer-0.1.2}/docs/css/mkdocstrings.css +0 -0
  41. {data_designer-0.1.1 → data_designer-0.1.2}/docs/css/style.css +0 -0
  42. {data_designer-0.1.1 → data_designer-0.1.2}/docs/installation.md +0 -0
  43. {data_designer-0.1.1 → data_designer-0.1.2}/docs/js/toc-toggle.js +0 -0
  44. {data_designer-0.1.1 → data_designer-0.1.2}/docs/models/configure-model-settings-with-the-cli.md +0 -0
  45. {data_designer-0.1.1 → data_designer-0.1.2}/docs/models/default-model-settings.md +0 -0
  46. {data_designer-0.1.1 → data_designer-0.1.2}/docs/models/model-configs.md +0 -0
  47. {data_designer-0.1.1 → data_designer-0.1.2}/docs/models/model-providers.md +0 -0
  48. {data_designer-0.1.1 → data_designer-0.1.2}/docs/notebooks/.gitignore +0 -0
  49. {data_designer-0.1.1 → data_designer-0.1.2}/docs/notebooks/1-the-basics.ipynb +0 -0
  50. {data_designer-0.1.1 → data_designer-0.1.2}/docs/notebooks/2-structured-outputs-and-jinja-expressions.ipynb +0 -0
  51. {data_designer-0.1.1 → data_designer-0.1.2}/docs/notebooks/3-seeding-with-a-dataset.ipynb +0 -0
  52. {data_designer-0.1.1 → data_designer-0.1.2}/docs/notebooks/README.md +0 -0
  53. {data_designer-0.1.1 → data_designer-0.1.2}/docs/notebooks/pyproject.toml +0 -0
  54. {data_designer-0.1.1 → data_designer-0.1.2}/docs/quick-start.md +0 -0
  55. {data_designer-0.1.1 → data_designer-0.1.2}/mkdocs.yml +0 -0
  56. {data_designer-0.1.1 → data_designer-0.1.2}/pyproject.toml +0 -0
  57. {data_designer-0.1.1 → data_designer-0.1.2}/scripts/update_license_headers.py +0 -0
  58. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/__init__.py +0 -0
  59. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/README.md +0 -0
  60. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/__init__.py +0 -0
  61. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/commands/__init__.py +0 -0
  62. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/commands/list.py +0 -0
  63. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/commands/models.py +0 -0
  64. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/commands/providers.py +0 -0
  65. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/commands/reset.py +0 -0
  66. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/controllers/__init__.py +0 -0
  67. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/controllers/model_controller.py +0 -0
  68. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/controllers/provider_controller.py +0 -0
  69. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/forms/__init__.py +0 -0
  70. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/forms/builder.py +0 -0
  71. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/forms/field.py +0 -0
  72. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/forms/form.py +0 -0
  73. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/forms/model_builder.py +0 -0
  74. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/forms/provider_builder.py +0 -0
  75. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/main.py +0 -0
  76. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/repositories/__init__.py +0 -0
  77. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/repositories/base.py +0 -0
  78. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/repositories/model_repository.py +0 -0
  79. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/repositories/provider_repository.py +0 -0
  80. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/services/__init__.py +0 -0
  81. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/services/model_service.py +0 -0
  82. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/services/provider_service.py +0 -0
  83. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/ui.py +0 -0
  84. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/cli/utils.py +0 -0
  85. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/__init__.py +0 -0
  86. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/analysis/column_profilers.py +0 -0
  87. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/analysis/column_statistics.py +0 -0
  88. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/analysis/dataset_profiler.py +0 -0
  89. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/analysis/utils/errors.py +0 -0
  90. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/analysis/utils/reporting.py +0 -0
  91. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/base.py +0 -0
  92. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/column_types.py +0 -0
  93. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/config_builder.py +0 -0
  94. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/data_designer_config.py +0 -0
  95. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/dataset_builders.py +0 -0
  96. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/errors.py +0 -0
  97. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/interface.py +0 -0
  98. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/models.py +0 -0
  99. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/preview_results.py +0 -0
  100. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/processors.py +0 -0
  101. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/sampler_constraints.py +0 -0
  102. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/seed.py +0 -0
  103. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/utils/code_lang.py +0 -0
  104. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/utils/constants.py +0 -0
  105. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/utils/errors.py +0 -0
  106. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/utils/info.py +0 -0
  107. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/utils/io_helpers.py +0 -0
  108. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/utils/misc.py +0 -0
  109. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/utils/numerical_helpers.py +0 -0
  110. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/utils/type_helpers.py +0 -0
  111. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/utils/validation.py +0 -0
  112. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/utils/visualization.py +0 -0
  113. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/config/validator_params.py +0 -0
  114. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/__init__.py +0 -0
  115. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/analysis/column_profilers/base.py +0 -0
  116. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/analysis/column_profilers/judge_score_profiler.py +0 -0
  117. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/analysis/column_profilers/registry.py +0 -0
  118. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/analysis/column_statistics.py +0 -0
  119. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/analysis/dataset_profiler.py +0 -0
  120. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/analysis/errors.py +0 -0
  121. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/analysis/utils/column_statistics_calculations.py +0 -0
  122. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/analysis/utils/judge_score_processing.py +0 -0
  123. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/column_generators/__init__.py +0 -0
  124. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/column_generators/generators/__init__.py +0 -0
  125. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/column_generators/generators/base.py +0 -0
  126. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/column_generators/generators/expression.py +0 -0
  127. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/column_generators/generators/llm_generators.py +0 -0
  128. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/column_generators/generators/samplers.py +0 -0
  129. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/column_generators/generators/seed_dataset.py +0 -0
  130. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/column_generators/generators/validation.py +0 -0
  131. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/column_generators/registry.py +0 -0
  132. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/column_generators/utils/errors.py +0 -0
  133. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/column_generators/utils/judge_score_factory.py +0 -0
  134. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/column_generators/utils/prompt_renderer.py +0 -0
  135. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/configurable_task.py +0 -0
  136. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/dataset_builders/artifact_storage.py +0 -0
  137. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/dataset_builders/column_wise_builder.py +0 -0
  138. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/dataset_builders/errors.py +0 -0
  139. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/dataset_builders/multi_column_configs.py +0 -0
  140. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/dataset_builders/utils/__init__.py +0 -0
  141. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/dataset_builders/utils/concurrency.py +0 -0
  142. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/dataset_builders/utils/config_compiler.py +0 -0
  143. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/dataset_builders/utils/dag.py +0 -0
  144. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +0 -0
  145. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/dataset_builders/utils/errors.py +0 -0
  146. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/errors.py +0 -0
  147. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/model_provider.py +0 -0
  148. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/models/__init__.py +0 -0
  149. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/models/errors.py +0 -0
  150. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/models/facade.py +0 -0
  151. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/models/litellm_overrides.py +0 -0
  152. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/models/parsers/__init__.py +0 -0
  153. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/models/parsers/errors.py +0 -0
  154. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/models/parsers/parser.py +0 -0
  155. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/models/parsers/postprocessors.py +0 -0
  156. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/models/parsers/tag_parsers.py +0 -0
  157. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/models/parsers/types.py +0 -0
  158. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/models/recipes/base.py +0 -0
  159. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/models/recipes/response_recipes.py +0 -0
  160. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/models/registry.py +0 -0
  161. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/models/usage.py +0 -0
  162. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/models/utils.py +0 -0
  163. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/processing/ginja/__init__.py +0 -0
  164. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/processing/ginja/ast.py +0 -0
  165. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/processing/ginja/environment.py +0 -0
  166. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/processing/ginja/exceptions.py +0 -0
  167. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/processing/ginja/record.py +0 -0
  168. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/processing/gsonschema/__init__.py +0 -0
  169. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/processing/gsonschema/exceptions.py +0 -0
  170. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/processing/gsonschema/schema_transformers.py +0 -0
  171. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/processing/gsonschema/types.py +0 -0
  172. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/processing/gsonschema/validators.py +0 -0
  173. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/processing/processors/base.py +0 -0
  174. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/processing/processors/drop_columns.py +0 -0
  175. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/processing/processors/registry.py +0 -0
  176. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/processing/utils.py +0 -0
  177. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/registry/base.py +0 -0
  178. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/registry/data_designer_registry.py +0 -0
  179. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/registry/errors.py +0 -0
  180. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/resources/managed_dataset_generator.py +0 -0
  181. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/resources/managed_dataset_repository.py +0 -0
  182. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/resources/managed_storage.py +0 -0
  183. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/resources/resource_provider.py +0 -0
  184. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/resources/seed_dataset_data_store.py +0 -0
  185. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/column.py +0 -0
  186. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/constraints.py +0 -0
  187. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/data_sources/base.py +0 -0
  188. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/data_sources/errors.py +0 -0
  189. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/data_sources/sources.py +0 -0
  190. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/entities/__init__.py +0 -0
  191. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
  192. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +0 -0
  193. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/entities/email_address_utils.py +0 -0
  194. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/entities/errors.py +0 -0
  195. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/entities/national_id_utils.py +0 -0
  196. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/entities/person.py +0 -0
  197. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/entities/phone_number.py +0 -0
  198. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/errors.py +0 -0
  199. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/generator.py +0 -0
  200. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/jinja_utils.py +0 -0
  201. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/people_gen.py +0 -0
  202. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/person_constants.py +0 -0
  203. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/schema.py +0 -0
  204. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/schema_builder.py +0 -0
  205. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/sampling_gen/utils.py +0 -0
  206. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/secret_resolver.py +0 -0
  207. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/validators/__init__.py +0 -0
  208. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/validators/base.py +0 -0
  209. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/validators/local_callable.py +0 -0
  210. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/validators/python.py +0 -0
  211. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/validators/remote.py +0 -0
  212. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/engine/validators/sql.py +0 -0
  213. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/errors.py +0 -0
  214. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/essentials/__init__.py +0 -0
  215. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/interface/__init__.py +0 -0
  216. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/interface/errors.py +0 -0
  217. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/interface/results.py +0 -0
  218. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/logging.py +0 -0
  219. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/plugin_manager.py +0 -0
  220. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/plugins/__init__.py +0 -0
  221. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/plugins/errors.py +0 -0
  222. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/plugins/plugin.py +0 -0
  223. {data_designer-0.1.1 → data_designer-0.1.2}/src/data_designer/plugins/registry.py +0 -0
  224. {data_designer-0.1.1 → data_designer-0.1.2}/tests/cli/commands/test_list_command.py +0 -0
  225. {data_designer-0.1.1 → data_designer-0.1.2}/tests/cli/commands/test_models_command.py +0 -0
  226. {data_designer-0.1.1 → data_designer-0.1.2}/tests/cli/commands/test_providers_command.py +0 -0
  227. {data_designer-0.1.1 → data_designer-0.1.2}/tests/cli/commands/test_reset_command.py +0 -0
  228. {data_designer-0.1.1 → data_designer-0.1.2}/tests/cli/conftest.py +0 -0
  229. {data_designer-0.1.1 → data_designer-0.1.2}/tests/cli/controllers/test_model_controller.py +0 -0
  230. {data_designer-0.1.1 → data_designer-0.1.2}/tests/cli/controllers/test_provider_controller.py +0 -0
  231. {data_designer-0.1.1 → data_designer-0.1.2}/tests/cli/forms/test_field.py +0 -0
  232. {data_designer-0.1.1 → data_designer-0.1.2}/tests/cli/forms/test_form.py +0 -0
  233. {data_designer-0.1.1 → data_designer-0.1.2}/tests/cli/forms/test_model_builder.py +0 -0
  234. {data_designer-0.1.1 → data_designer-0.1.2}/tests/cli/forms/test_provider_builder.py +0 -0
  235. {data_designer-0.1.1 → data_designer-0.1.2}/tests/cli/repositories/test_model_repository.py +0 -0
  236. {data_designer-0.1.1 → data_designer-0.1.2}/tests/cli/repositories/test_provider_repository.py +0 -0
  237. {data_designer-0.1.1 → data_designer-0.1.2}/tests/cli/services/test_model_service.py +0 -0
  238. {data_designer-0.1.1 → data_designer-0.1.2}/tests/cli/services/test_provider_service.py +0 -0
  239. {data_designer-0.1.1 → data_designer-0.1.2}/tests/cli/test_cli_utils.py +0 -0
  240. {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/analysis/conftest.py +0 -0
  241. {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/analysis/test_column_statistics.py +0 -0
  242. {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/analysis/test_dataset_profiler_results.py +0 -0
  243. {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/analysis/utils/test_reporting.py +0 -0
  244. {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/test_config_builder.py +0 -0
  245. {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/test_data_designer_config.py +0 -0
  246. {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/test_models.py +0 -0
  247. {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/test_processors.py +0 -0
  248. {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/test_sampler_constraints.py +0 -0
  249. {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/test_sampler_params.py +0 -0
  250. {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/test_seed.py +0 -0
  251. {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/test_validator_params.py +0 -0
  252. {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/utils/__init__.py +0 -0
  253. {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/utils/test_code_lang.py +0 -0
  254. {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/utils/test_info.py +0 -0
  255. {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/utils/test_io_helpers.py +0 -0
  256. {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/utils/test_misc.py +0 -0
  257. {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/utils/test_type_helpers.py +0 -0
  258. {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/utils/test_validation.py +0 -0
  259. {data_designer-0.1.1 → data_designer-0.1.2}/tests/config/utils/test_visualization.py +0 -0
  260. {data_designer-0.1.1 → data_designer-0.1.2}/tests/conftest.py +0 -0
  261. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/analysis/column_profilers/test_judge_score_profiler.py +0 -0
  262. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/analysis/conftest.py +0 -0
  263. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/analysis/test_column_statistics_calculator.py +0 -0
  264. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/analysis/test_data/artifacts/dataset/column_configs.json +0 -0
  265. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/analysis/test_data/artifacts/dataset/dataset.json +0 -0
  266. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/analysis/test_data/artifacts/dataset/metadata.json +0 -0
  267. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/analysis/test_dataset_profiler.py +0 -0
  268. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/analysis/test_errors.py +0 -0
  269. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/analysis/utils/test_column_statistics_calculations.py +0 -0
  270. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/analysis/utils/test_judge_score_processing.py +0 -0
  271. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/column_generators/generators/__init__.py +0 -0
  272. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/column_generators/generators/test_column_generator_base.py +0 -0
  273. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/column_generators/generators/test_expression.py +0 -0
  274. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/column_generators/generators/test_llm_generators.py +0 -0
  275. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/column_generators/generators/test_samplers.py +0 -0
  276. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/column_generators/generators/test_seed_dataset.py +0 -0
  277. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/column_generators/generators/test_validation.py +0 -0
  278. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/column_generators/test_registry.py +0 -0
  279. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/column_generators/utils/test_column_generator_errors.py +0 -0
  280. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/column_generators/utils/test_judge_score_factory.py +0 -0
  281. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/column_generators/utils/test_prompt_renderer.py +0 -0
  282. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/conftest.py +0 -0
  283. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/dataset_builders/test_artifact_storage.py +0 -0
  284. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/dataset_builders/test_column_wise_builder.py +0 -0
  285. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/dataset_builders/test_multi_column_configs.py +0 -0
  286. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/dataset_builders/utils/test_concurrency.py +0 -0
  287. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/dataset_builders/utils/test_config_compiler.py +0 -0
  288. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/dataset_builders/utils/test_dag.py +0 -0
  289. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/dataset_builders/utils/test_dataset_batch_manager.py +0 -0
  290. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/models/conftest.py +0 -0
  291. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/models/parsers/test_parser.py +0 -0
  292. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/models/parsers/test_parsers_types.py +0 -0
  293. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/models/parsers/test_postprocessors.py +0 -0
  294. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/models/parsers/test_tag_parsers.py +0 -0
  295. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/models/recipes/test_recipe_base.py +0 -0
  296. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/models/recipes/test_response_recipes.py +0 -0
  297. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/models/stub_secrets.json +0 -0
  298. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/models/test_facade.py +0 -0
  299. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/models/test_litellm_overrides.py +0 -0
  300. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/models/test_model_errors.py +0 -0
  301. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/models/test_model_registry.py +0 -0
  302. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/models/test_model_utils.py +0 -0
  303. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/models/test_usage.py +0 -0
  304. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/processing/__init__.py +0 -0
  305. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/processing/ginja/__init__.py +0 -0
  306. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/processing/ginja/test_ast.py +0 -0
  307. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/processing/ginja/test_environment.py +0 -0
  308. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/processing/ginja/test_exceptions.py +0 -0
  309. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/processing/ginja/test_record.py +0 -0
  310. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/processing/gsonschema/__init__.py +0 -0
  311. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/processing/gsonschema/test_exceptions.py +0 -0
  312. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/processing/gsonschema/test_schema_transformers.py +0 -0
  313. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/processing/gsonschema/test_types.py +0 -0
  314. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/processing/gsonschema/test_validators.py +0 -0
  315. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/processing/processors/__init__.py +0 -0
  316. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/processing/processors/test_drop_columns.py +0 -0
  317. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/processing/processors/test_registry.py +0 -0
  318. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/processing/test_utils.py +0 -0
  319. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/registry/__init__.py +0 -0
  320. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/registry/conftest.py +0 -0
  321. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/registry/test_base.py +0 -0
  322. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/registry/test_data_designer_registry.py +0 -0
  323. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/registry/test_errors.py +0 -0
  324. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/resources/__init__.py +0 -0
  325. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/resources/conftest.py +0 -0
  326. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/resources/test_managed_dataset_generator.py +0 -0
  327. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/resources/test_managed_dataset_repository.py +0 -0
  328. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/resources/test_managed_storage.py +0 -0
  329. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/resources/test_resource_provider.py +0 -0
  330. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/sampling_gen/conftest.py +0 -0
  331. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/sampling_gen/data_sources/test_sampler_errors.py +0 -0
  332. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/sampling_gen/data_sources/test_sources.py +0 -0
  333. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/sampling_gen/entities/test_email_address_utils.py +0 -0
  334. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/sampling_gen/entities/test_national_id_utils.py +0 -0
  335. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/sampling_gen/entities/test_person.py +0 -0
  336. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/sampling_gen/entities/test_phone_number.py +0 -0
  337. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/sampling_gen/test_column.py +0 -0
  338. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/sampling_gen/test_constraints.py +0 -0
  339. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/sampling_gen/test_generator.py +0 -0
  340. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/sampling_gen/test_jinja_utils.py +0 -0
  341. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/sampling_gen/test_people_gen.py +0 -0
  342. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/sampling_gen/test_schema.py +0 -0
  343. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/sampling_gen/test_utils.py +0 -0
  344. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/test_configurable_task.py +0 -0
  345. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/test_engine_errors.py +0 -0
  346. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/test_model_provider.py +0 -0
  347. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/test_secret_resolver.py +0 -0
  348. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/validators/test_local_callable.py +0 -0
  349. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/validators/test_python.py +0 -0
  350. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/validators/test_remote.py +0 -0
  351. {data_designer-0.1.1 → data_designer-0.1.2}/tests/engine/validators/test_sql.py +0 -0
  352. {data_designer-0.1.1 → data_designer-0.1.2}/tests/essentials/test_init.py +0 -0
  353. {data_designer-0.1.1 → data_designer-0.1.2}/tests/interface/test_data_designer.py +0 -0
  354. {data_designer-0.1.1 → data_designer-0.1.2}/tests/interface/test_results.py +0 -0
  355. {data_designer-0.1.1 → data_designer-0.1.2}/tests/plugins/test_plugin.py +0 -0
  356. {data_designer-0.1.1 → data_designer-0.1.2}/tests/plugins/test_plugin_registry.py +0 -0
  357. {data_designer-0.1.1 → data_designer-0.1.2}/tests/test_logging.py +0 -0
  358. {data_designer-0.1.1 → data_designer-0.1.2}/tests/test_plugin_manager.py +0 -0
  359. {data_designer-0.1.1 → data_designer-0.1.2}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-designer
3
- Version: 0.1.1
3
+ Version: 0.1.2
4
4
  Summary: General framework for synthetic data generation
5
5
  License-Expression: Apache-2.0
6
6
  License-File: LICENSE
@@ -97,8 +97,7 @@ export NVIDIA_API_KEY="your-api-key-here"
97
97
  export OPENAI_API_KEY="your-openai-api-key-here"
98
98
  ```
99
99
 
100
- ### 3. Generate your first dataset
101
-
100
+ ### 3. Start generating data!
102
101
  ```python
103
102
  from data_designer.essentials import (
104
103
  CategorySamplerParams,
@@ -139,8 +138,6 @@ preview = data_designer.preview(config_builder=config_builder)
139
138
  preview.display_sample_record()
140
139
  ```
141
140
 
142
- **That's it!** You've created a dataset.
143
-
144
141
  ---
145
142
 
146
143
  ## What's next?
@@ -148,7 +145,7 @@ preview.display_sample_record()
148
145
  ### 📚 Learn more
149
146
 
150
147
  - **[Quick Start Guide](https://nvidia-nemo.github.io/DataDesigner/quick-start/)** – Detailed walkthrough with more examples
151
- - **[Tutorial Notebooks](https://nvidia-nemo.github.io/DataDesigner/notebooks/intro/)** – Step-by-step interactive tutorials
148
+ - **[Tutorial Notebooks](https://nvidia-nemo.github.io/DataDesigner/notebooks/)** – Step-by-step interactive tutorials
152
149
  - **[Column Types](https://nvidia-nemo.github.io/DataDesigner/concepts/columns/)** – Explore samplers, LLM columns, validators, and more
153
150
  - **[Validators](https://nvidia-nemo.github.io/DataDesigner/concepts/validators/)** – Learn how to validate generated data with Python, SQL, and remote validators
154
151
  - **[Model Configuration](https://nvidia-nemo.github.io/DataDesigner/models/model-configs/)** – Configure custom models and providers
@@ -48,8 +48,7 @@ export NVIDIA_API_KEY="your-api-key-here"
48
48
  export OPENAI_API_KEY="your-openai-api-key-here"
49
49
  ```
50
50
 
51
- ### 3. Generate your first dataset
52
-
51
+ ### 3. Start generating data!
53
52
  ```python
54
53
  from data_designer.essentials import (
55
54
  CategorySamplerParams,
@@ -90,8 +89,6 @@ preview = data_designer.preview(config_builder=config_builder)
90
89
  preview.display_sample_record()
91
90
  ```
92
91
 
93
- **That's it!** You've created a dataset.
94
-
95
92
  ---
96
93
 
97
94
  ## What's next?
@@ -99,7 +96,7 @@ preview.display_sample_record()
99
96
  ### 📚 Learn more
100
97
 
101
98
  - **[Quick Start Guide](https://nvidia-nemo.github.io/DataDesigner/quick-start/)** – Detailed walkthrough with more examples
102
- - **[Tutorial Notebooks](https://nvidia-nemo.github.io/DataDesigner/notebooks/intro/)** – Step-by-step interactive tutorials
99
+ - **[Tutorial Notebooks](https://nvidia-nemo.github.io/DataDesigner/notebooks/)** – Step-by-step interactive tutorials
103
100
  - **[Column Types](https://nvidia-nemo.github.io/DataDesigner/concepts/columns/)** – Explore samplers, LLM columns, validators, and more
104
101
  - **[Validators](https://nvidia-nemo.github.io/DataDesigner/concepts/validators/)** – Learn how to validate generated data with Python, SQL, and remote validators
105
102
  - **[Model Configuration](https://nvidia-nemo.github.io/DataDesigner/models/model-configs/)** – Configure custom models and providers
@@ -34,11 +34,11 @@ Data Designer helps you create datasets through an intuitive, **iterative** proc
34
34
  3. **🔁 Preview** your results and iterate
35
35
  - Generate a preview dataset stored in memory for fast iteration
36
36
  - Inspect sample records and analysis results to refine your configuration
37
- - Try for yourself by running the [tutorial notebooks](notebooks/intro.md)
37
+ - Try for yourself by running the [tutorial notebooks](notebooks/README.md)
38
38
  4. **🖼️ Create** your dataset
39
39
  - Generate your full dataset and save results to disk
40
40
  - Access the generated dataset and associated artifacts for downstream use
41
- - Give it a try by running the [tutorial notebooks](notebooks/intro.md)
41
+ - Give it a try by running the [tutorial notebooks](notebooks/README.md)
42
42
 
43
43
  ## Library and Microservice
44
44
 
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.1.1'
32
- __version_tuple__ = version_tuple = (0, 1, 1)
31
+ __version__ = version = '0.1.2'
32
+ __version_tuple__ = version_tuple = (0, 1, 2)
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -2,9 +2,9 @@
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
4
  from abc import ABC
5
- from typing import Literal, Optional, Type, Union
5
+ from typing import Annotated, Literal, Optional, Type, Union
6
6
 
7
- from pydantic import BaseModel, Field, model_validator
7
+ from pydantic import BaseModel, Discriminator, Field, model_validator
8
8
  from typing_extensions import Self
9
9
 
10
10
  from .base import ConfigBase
@@ -89,11 +89,36 @@ class SamplerColumnConfig(SingleColumnConfig):
89
89
  """
90
90
 
91
91
  sampler_type: SamplerType
92
- params: SamplerParamsT
93
- conditional_params: dict[str, SamplerParamsT] = {}
92
+ params: Annotated[SamplerParamsT, Discriminator("sampler_type")]
93
+ conditional_params: dict[str, Annotated[SamplerParamsT, Discriminator("sampler_type")]] = {}
94
94
  convert_to: Optional[str] = None
95
95
  column_type: Literal["sampler"] = "sampler"
96
96
 
97
+ @model_validator(mode="before")
98
+ @classmethod
99
+ def inject_sampler_type_into_params(cls, data: dict) -> dict:
100
+ """Inject sampler_type into params dict to enable discriminated union resolution.
101
+
102
+ This allows users to pass params as a simple dict without the sampler_type field,
103
+ which will be automatically added based on the outer sampler_type field.
104
+ """
105
+ if isinstance(data, dict):
106
+ sampler_type = data.get("sampler_type")
107
+ params = data.get("params")
108
+
109
+ # If params is a dict and doesn't have sampler_type, inject it
110
+ if sampler_type and isinstance(params, dict) and "sampler_type" not in params:
111
+ data["params"] = {"sampler_type": sampler_type, **params}
112
+
113
+ # Handle conditional_params similarly
114
+ conditional_params = data.get("conditional_params")
115
+ if conditional_params and isinstance(conditional_params, dict):
116
+ for condition, cond_params in conditional_params.items():
117
+ if isinstance(cond_params, dict) and "sampler_type" not in cond_params:
118
+ data["conditional_params"][condition] = {"sampler_type": sampler_type, **cond_params}
119
+
120
+ return data
121
+
97
122
 
98
123
  class LLMTextColumnConfig(SingleColumnConfig):
99
124
  """Configuration for text generation columns using Large Language Models.
@@ -31,34 +31,37 @@ class DatastoreSettings(BaseModel):
31
31
  token: Optional[str] = Field(default=None, description="If needed, token to use for authentication.")
32
32
 
33
33
 
34
- def get_file_column_names(file_path: Union[str, Path], file_type: str) -> list[str]:
35
- """Extract column names based on file type. Supports glob patterns like '../path/*.parquet'."""
36
- file_path = Path(file_path)
37
- if "*" in str(file_path):
38
- matching_files = sorted(file_path.parent.glob(file_path.name))
39
- if not matching_files:
40
- raise InvalidFilePathError(f"🛑 No files found matching pattern: {str(file_path)!r}")
41
- logger.debug(f"0️⃣ Using the first matching file in {str(file_path)!r} to determine column names in seed dataset")
42
- file_path = matching_files[0]
34
+ def get_file_column_names(file_reference: Union[str, Path, HfFileSystem], file_type: str) -> list[str]:
35
+ """Get column names from a dataset file.
36
+
37
+ Args:
38
+ file_reference: Path to the dataset file, or an HfFileSystem object.
39
+ file_type: Type of the dataset file. Must be one of: 'parquet', 'json', 'jsonl', 'csv'.
43
40
 
41
+ Raises:
42
+ InvalidFilePathError: If the file type is not supported.
43
+
44
+ Returns:
45
+ List of column names.
46
+ """
44
47
  if file_type == "parquet":
45
48
  try:
46
- schema = pq.read_schema(file_path)
49
+ schema = pq.read_schema(file_reference)
47
50
  if hasattr(schema, "names"):
48
51
  return schema.names
49
52
  else:
50
53
  return [field.name for field in schema]
51
54
  except Exception as e:
52
- logger.warning(f"Failed to process parquet file {file_path}: {e}")
55
+ logger.warning(f"Failed to process parquet file {file_reference}: {e}")
53
56
  return []
54
57
  elif file_type in ["json", "jsonl"]:
55
- return pd.read_json(file_path, orient="records", lines=True, nrows=1).columns.tolist()
58
+ return pd.read_json(file_reference, orient="records", lines=True, nrows=1).columns.tolist()
56
59
  elif file_type == "csv":
57
60
  try:
58
- df = pd.read_csv(file_path, nrows=1)
61
+ df = pd.read_csv(file_reference, nrows=1)
59
62
  return df.columns.tolist()
60
63
  except (pd.errors.EmptyDataError, pd.errors.ParserError) as e:
61
- logger.warning(f"Failed to process CSV file {file_path}: {e}")
64
+ logger.warning(f"Failed to process CSV file {file_reference}: {e}")
62
65
  return []
63
66
  else:
64
67
  raise InvalidFilePathError(f"🛑 Unsupported file type: {file_type!r}")
@@ -66,12 +69,36 @@ def get_file_column_names(file_path: Union[str, Path], file_type: str) -> list[s
66
69
 
67
70
  def fetch_seed_dataset_column_names(seed_dataset_reference: SeedDatasetReference) -> list[str]:
68
71
  if hasattr(seed_dataset_reference, "datastore_settings"):
69
- return _fetch_seed_dataset_column_names_from_datastore(
72
+ return fetch_seed_dataset_column_names_from_datastore(
70
73
  seed_dataset_reference.repo_id,
71
74
  seed_dataset_reference.filename,
72
75
  seed_dataset_reference.datastore_settings,
73
76
  )
74
- return _fetch_seed_dataset_column_names_from_local_file(seed_dataset_reference.dataset)
77
+ return fetch_seed_dataset_column_names_from_local_file(seed_dataset_reference.dataset)
78
+
79
+
80
+ def fetch_seed_dataset_column_names_from_datastore(
81
+ repo_id: str,
82
+ filename: str,
83
+ datastore_settings: Optional[Union[DatastoreSettings, dict]] = None,
84
+ ) -> list[str]:
85
+ file_type = filename.split(".")[-1]
86
+ if f".{file_type}" not in VALID_DATASET_FILE_EXTENSIONS:
87
+ raise InvalidFileFormatError(f"🛑 Unsupported file type: {filename!r}")
88
+
89
+ datastore_settings = resolve_datastore_settings(datastore_settings)
90
+ fs = HfFileSystem(endpoint=datastore_settings.endpoint, token=datastore_settings.token, skip_instance_cache=True)
91
+
92
+ file_path = _extract_single_file_path_from_glob_pattern_if_present(f"datasets/{repo_id}/{filename}", fs=fs)
93
+
94
+ with fs.open(file_path) as f:
95
+ return get_file_column_names(f, file_type)
96
+
97
+
98
+ def fetch_seed_dataset_column_names_from_local_file(dataset_path: str | Path) -> list[str]:
99
+ dataset_path = _validate_dataset_path(dataset_path, allow_glob_pattern=True)
100
+ dataset_path = _extract_single_file_path_from_glob_pattern_if_present(dataset_path)
101
+ return get_file_column_names(dataset_path, str(dataset_path).split(".")[-1])
75
102
 
76
103
 
77
104
  def resolve_datastore_settings(datastore_settings: DatastoreSettings | dict | None) -> DatastoreSettings:
@@ -114,25 +141,34 @@ def upload_to_hf_hub(
114
141
  return f"{repo_id}/{filename}"
115
142
 
116
143
 
117
- def _fetch_seed_dataset_column_names_from_datastore(
118
- repo_id: str,
119
- filename: str,
120
- datastore_settings: Optional[Union[DatastoreSettings, dict]] = None,
121
- ) -> list[str]:
122
- file_type = filename.split(".")[-1]
123
- if f".{file_type}" not in VALID_DATASET_FILE_EXTENSIONS:
124
- raise InvalidFileFormatError(f"🛑 Unsupported file type: {filename!r}")
125
-
126
- datastore_settings = resolve_datastore_settings(datastore_settings)
127
- fs = HfFileSystem(endpoint=datastore_settings.endpoint, token=datastore_settings.token, skip_instance_cache=True)
128
-
129
- with fs.open(f"datasets/{repo_id}/{filename}") as f:
130
- return get_file_column_names(f, file_type)
131
-
144
+ def _extract_single_file_path_from_glob_pattern_if_present(
145
+ file_path: str | Path,
146
+ fs: HfFileSystem | None = None,
147
+ ) -> Path:
148
+ file_path = Path(file_path)
132
149
 
133
- def _fetch_seed_dataset_column_names_from_local_file(dataset_path: str | Path) -> list[str]:
134
- dataset_path = _validate_dataset_path(dataset_path, allow_glob_pattern=True)
135
- return get_file_column_names(dataset_path, str(dataset_path).split(".")[-1])
150
+ # no glob pattern
151
+ if "*" not in str(file_path):
152
+ return file_path
153
+
154
+ # glob pattern with HfFileSystem
155
+ if fs is not None:
156
+ file_to_check = None
157
+ file_extension = file_path.name.split(".")[-1]
158
+ for file in fs.ls(str(file_path.parent)):
159
+ filename = file["name"]
160
+ if filename.endswith(f".{file_extension}"):
161
+ file_to_check = filename
162
+ if file_to_check is None:
163
+ raise InvalidFilePathError(f"🛑 No files found matching pattern: {str(file_path)!r}")
164
+ logger.debug(f"Using the first matching file in {str(file_path)!r} to determine column names in seed dataset")
165
+ return Path(file_to_check)
166
+
167
+ # glob pattern with local file system
168
+ if not (matching_files := sorted(file_path.parent.glob(file_path.name))):
169
+ raise InvalidFilePathError(f"🛑 No files found matching pattern: {str(file_path)!r}")
170
+ logger.debug(f"Using the first matching file in {str(file_path)!r} to determine column names in seed dataset")
171
+ return matching_files[0]
136
172
 
137
173
 
138
174
  def _validate_dataset_path(dataset_path: Union[str, Path], allow_glob_pattern: bool = False) -> Path:
@@ -78,7 +78,7 @@ def get_default_model_configs() -> list[ModelConfig]:
78
78
  return []
79
79
 
80
80
 
81
- def get_defaul_model_providers_missing_api_keys() -> list[str]:
81
+ def get_default_model_providers_missing_api_keys() -> list[str]:
82
82
  missing_api_keys = []
83
83
  for predefined_provider in PREDEFINED_PROVIDERS:
84
84
  if os.environ.get(predefined_provider["api_key"]) is None:
@@ -66,6 +66,7 @@ class CategorySamplerParams(ConfigBase):
66
66
  "Larger values will be sampled with higher probability."
67
67
  ),
68
68
  )
69
+ sampler_type: Literal[SamplerType.CATEGORY] = SamplerType.CATEGORY
69
70
 
70
71
  @model_validator(mode="after")
71
72
  def _normalize_weights_if_needed(self) -> Self:
@@ -106,6 +107,7 @@ class DatetimeSamplerParams(ConfigBase):
106
107
  default="D",
107
108
  description="Sampling units, e.g. the smallest possible time interval between samples.",
108
109
  )
110
+ sampler_type: Literal[SamplerType.DATETIME] = SamplerType.DATETIME
109
111
 
110
112
  @field_validator("start", "end")
111
113
  @classmethod
@@ -136,6 +138,7 @@ class SubcategorySamplerParams(ConfigBase):
136
138
  ...,
137
139
  description="Mapping from each value of parent category to a list of subcategory values.",
138
140
  )
141
+ sampler_type: Literal[SamplerType.SUBCATEGORY] = SamplerType.SUBCATEGORY
139
142
 
140
143
 
141
144
  class TimeDeltaSamplerParams(ConfigBase):
@@ -187,6 +190,7 @@ class TimeDeltaSamplerParams(ConfigBase):
187
190
  default="D",
188
191
  description="Sampling units, e.g. the smallest possible time interval between samples.",
189
192
  )
193
+ sampler_type: Literal[SamplerType.TIMEDELTA] = SamplerType.TIMEDELTA
190
194
 
191
195
  @model_validator(mode="after")
192
196
  def _validate_min_less_than_max(self) -> Self:
@@ -219,6 +223,7 @@ class UUIDSamplerParams(ConfigBase):
219
223
  default=False,
220
224
  description="If true, all letters in the UUID will be capitalized.",
221
225
  )
226
+ sampler_type: Literal[SamplerType.UUID] = SamplerType.UUID
222
227
 
223
228
  @property
224
229
  def last_index(self) -> int:
@@ -257,6 +262,7 @@ class ScipySamplerParams(ConfigBase):
257
262
  decimal_places: Optional[int] = Field(
258
263
  default=None, description="Number of decimal places to round the sampled values to."
259
264
  )
265
+ sampler_type: Literal[SamplerType.SCIPY] = SamplerType.SCIPY
260
266
 
261
267
 
262
268
  class BinomialSamplerParams(ConfigBase):
@@ -273,6 +279,7 @@ class BinomialSamplerParams(ConfigBase):
273
279
 
274
280
  n: int = Field(..., description="Number of trials.")
275
281
  p: float = Field(..., description="Probability of success on each trial.", ge=0.0, le=1.0)
282
+ sampler_type: Literal[SamplerType.BINOMIAL] = SamplerType.BINOMIAL
276
283
 
277
284
 
278
285
  class BernoulliSamplerParams(ConfigBase):
@@ -288,6 +295,7 @@ class BernoulliSamplerParams(ConfigBase):
288
295
  """
289
296
 
290
297
  p: float = Field(..., description="Probability of success.", ge=0.0, le=1.0)
298
+ sampler_type: Literal[SamplerType.BERNOULLI] = SamplerType.BERNOULLI
291
299
 
292
300
 
293
301
  class BernoulliMixtureSamplerParams(ConfigBase):
@@ -327,6 +335,7 @@ class BernoulliMixtureSamplerParams(ConfigBase):
327
335
  ...,
328
336
  description="Parameters of the scipy.stats distribution given in `dist_name`.",
329
337
  )
338
+ sampler_type: Literal[SamplerType.BERNOULLI_MIXTURE] = SamplerType.BERNOULLI_MIXTURE
330
339
 
331
340
 
332
341
  class GaussianSamplerParams(ConfigBase):
@@ -350,6 +359,7 @@ class GaussianSamplerParams(ConfigBase):
350
359
  decimal_places: Optional[int] = Field(
351
360
  default=None, description="Number of decimal places to round the sampled values to."
352
361
  )
362
+ sampler_type: Literal[SamplerType.GAUSSIAN] = SamplerType.GAUSSIAN
353
363
 
354
364
 
355
365
  class PoissonSamplerParams(ConfigBase):
@@ -369,6 +379,7 @@ class PoissonSamplerParams(ConfigBase):
369
379
  """
370
380
 
371
381
  mean: float = Field(..., description="Mean number of events in a fixed interval.")
382
+ sampler_type: Literal[SamplerType.POISSON] = SamplerType.POISSON
372
383
 
373
384
 
374
385
  class UniformSamplerParams(ConfigBase):
@@ -390,6 +401,7 @@ class UniformSamplerParams(ConfigBase):
390
401
  decimal_places: Optional[int] = Field(
391
402
  default=None, description="Number of decimal places to round the sampled values to."
392
403
  )
404
+ sampler_type: Literal[SamplerType.UNIFORM] = SamplerType.UNIFORM
393
405
 
394
406
 
395
407
  #########################################
@@ -470,11 +482,12 @@ class PersonSamplerParams(ConfigBase):
470
482
  default=False,
471
483
  description="If True, then append synthetic persona columns to each generated person.",
472
484
  )
485
+ sampler_type: Literal[SamplerType.PERSON] = SamplerType.PERSON
473
486
 
474
487
  @property
475
488
  def generator_kwargs(self) -> list[str]:
476
489
  """Keyword arguments to pass to the person generator."""
477
- return [f for f in list(PersonSamplerParams.model_fields) if f != "locale"]
490
+ return [f for f in list(PersonSamplerParams.model_fields) if f not in ("locale", "sampler_type")]
478
491
 
479
492
  @property
480
493
  def people_gen_key(self) -> str:
@@ -533,11 +546,12 @@ class PersonFromFakerSamplerParams(ConfigBase):
533
546
  min_length=2,
534
547
  max_length=2,
535
548
  )
549
+ sampler_type: Literal[SamplerType.PERSON_FROM_FAKER] = SamplerType.PERSON_FROM_FAKER
536
550
 
537
551
  @property
538
552
  def generator_kwargs(self) -> list[str]:
539
553
  """Keyword arguments to pass to the person generator."""
540
- return [f for f in list(PersonFromFakerSamplerParams.model_fields) if f != "locale"]
554
+ return [f for f in list(PersonFromFakerSamplerParams.model_fields) if f not in ("locale", "sampler_type")]
541
555
 
542
556
  @property
543
557
  def people_gen_key(self) -> str:
@@ -9,8 +9,8 @@ import pandas as pd
9
9
  from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
10
10
  from data_designer.config.config_builder import DataDesignerConfigBuilder
11
11
  from data_designer.config.default_model_settings import (
12
- get_defaul_model_providers_missing_api_keys,
13
12
  get_default_model_configs,
13
+ get_default_model_providers_missing_api_keys,
14
14
  get_default_provider_name,
15
15
  get_default_providers,
16
16
  resolve_seed_default_model_settings,
@@ -313,7 +313,7 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
313
313
  if model_providers is None:
314
314
  if can_run_data_designer_locally():
315
315
  model_providers = get_default_providers()
316
- missing_api_keys = get_defaul_model_providers_missing_api_keys()
316
+ missing_api_keys = get_default_model_providers_missing_api_keys()
317
317
  if len(missing_api_keys) == len(PREDEFINED_PROVIDERS):
318
318
  logger.warning(
319
319
  "🚨 You are trying to use a default model provider but your API keys are missing."
@@ -23,7 +23,15 @@ from data_designer.config.column_types import (
23
23
  get_column_display_order,
24
24
  )
25
25
  from data_designer.config.errors import InvalidConfigError
26
- from data_designer.config.sampler_params import SamplerType, UUIDSamplerParams
26
+ from data_designer.config.sampler_params import (
27
+ CategorySamplerParams,
28
+ GaussianSamplerParams,
29
+ PersonFromFakerSamplerParams,
30
+ PersonSamplerParams,
31
+ SamplerType,
32
+ UniformSamplerParams,
33
+ UUIDSamplerParams,
34
+ )
27
35
  from data_designer.config.utils.code_lang import CodeLang
28
36
  from data_designer.config.utils.errors import UserJinjaTemplateSyntaxError
29
37
  from data_designer.config.validator_params import CodeValidatorParams
@@ -324,3 +332,114 @@ def test_get_column_config_from_kwargs():
324
332
  ),
325
333
  SeedDatasetColumnConfig,
326
334
  )
335
+
336
+
337
+ def test_sampler_column_config_discriminated_union_with_dict_params():
338
+ """Test that sampler_type field is automatically injected into params dict."""
339
+ config = SamplerColumnConfig(
340
+ name="test_uniform",
341
+ sampler_type=SamplerType.UNIFORM,
342
+ params={"low": 0.0, "high": 1.0, "decimal_places": 2},
343
+ )
344
+ assert config.name == "test_uniform"
345
+ assert config.sampler_type == SamplerType.UNIFORM
346
+ assert isinstance(config.params, UniformSamplerParams)
347
+ assert config.params.sampler_type == SamplerType.UNIFORM
348
+ assert config.params.low == 0.0
349
+ assert config.params.high == 1.0
350
+ assert config.params.decimal_places == 2
351
+
352
+
353
+ def test_sampler_column_config_discriminated_union_with_explicit_sampler_type():
354
+ """Test that explicit sampler_type in params dict is preserved."""
355
+ config = SamplerColumnConfig(
356
+ name="test_category",
357
+ sampler_type=SamplerType.CATEGORY,
358
+ params={"sampler_type": "category", "values": ["A", "B", "C"], "weights": [0.5, 0.3, 0.2]},
359
+ )
360
+ assert config.name == "test_category"
361
+ assert config.sampler_type == SamplerType.CATEGORY
362
+ assert isinstance(config.params, CategorySamplerParams)
363
+ assert config.params.sampler_type == SamplerType.CATEGORY
364
+ assert config.params.values == ["A", "B", "C"]
365
+
366
+
367
+ def test_sampler_column_config_discriminated_union_serialization():
368
+ """Test that discriminated union works correctly with serialization/deserialization."""
369
+ config = SamplerColumnConfig(
370
+ name="test_person",
371
+ sampler_type=SamplerType.PERSON,
372
+ params={"locale": "en_US", "sex": "Female", "age_range": [25, 45]},
373
+ )
374
+
375
+ # Serialize
376
+ serialized = config.model_dump()
377
+ assert "sampler_type" in serialized["params"]
378
+ assert serialized["params"]["sampler_type"] == "person"
379
+
380
+ # Deserialize
381
+ deserialized = SamplerColumnConfig(**serialized)
382
+ assert isinstance(deserialized.params, PersonSamplerParams)
383
+ assert deserialized.params.locale == "en_US"
384
+ assert deserialized.params.sex == "Female"
385
+ assert deserialized.params.age_range == [25, 45]
386
+
387
+
388
+ def test_sampler_column_config_discriminated_union_person_vs_person_from_faker():
389
+ """Test that discriminated union correctly distinguishes between person and person_from_faker."""
390
+ # Test person sampler (managed datasets)
391
+ person_config = SamplerColumnConfig(
392
+ name="test_person",
393
+ sampler_type=SamplerType.PERSON,
394
+ params={"locale": "en_US", "sex": "Male", "age_range": [30, 50]},
395
+ )
396
+ assert isinstance(person_config.params, PersonSamplerParams)
397
+ assert person_config.params.sampler_type == SamplerType.PERSON
398
+ assert person_config.params.locale == "en_US"
399
+
400
+ # Test person_from_faker sampler (Faker-based)
401
+ person_faker_config = SamplerColumnConfig(
402
+ name="test_person_faker",
403
+ sampler_type=SamplerType.PERSON_FROM_FAKER,
404
+ params={"locale": "en_GB", "sex": "Female", "age_range": [20, 40]},
405
+ )
406
+ assert isinstance(person_faker_config.params, PersonFromFakerSamplerParams)
407
+ assert person_faker_config.params.sampler_type == SamplerType.PERSON_FROM_FAKER
408
+ assert person_faker_config.params.locale == "en_GB"
409
+
410
+ # Verify they are different types
411
+ assert type(person_config.params) is not type(person_faker_config.params)
412
+ assert isinstance(person_config.params, PersonSamplerParams)
413
+ assert isinstance(person_faker_config.params, PersonFromFakerSamplerParams)
414
+
415
+
416
+ def test_sampler_column_config_discriminated_union_with_conditional_params():
417
+ """Test that sampler_type is injected into conditional_params as well."""
418
+ config = SamplerColumnConfig(
419
+ name="test_gaussian",
420
+ sampler_type=SamplerType.GAUSSIAN,
421
+ params={"mean": 0.0, "stddev": 1.0},
422
+ conditional_params={"age > 21": {"mean": 5.0, "stddev": 2.0}},
423
+ )
424
+
425
+ assert isinstance(config.params, GaussianSamplerParams)
426
+ assert config.params.mean == 0.0
427
+ assert config.params.stddev == 1.0
428
+
429
+ # Check conditional params
430
+ assert "age > 21" in config.conditional_params
431
+ cond_param = config.conditional_params["age > 21"]
432
+ assert isinstance(cond_param, GaussianSamplerParams)
433
+ assert cond_param.sampler_type == SamplerType.GAUSSIAN
434
+ assert cond_param.mean == 5.0
435
+ assert cond_param.stddev == 2.0
436
+
437
+
438
+ def test_sampler_column_config_discriminated_union_wrong_params_type():
439
+ """Test that discriminated union rejects params that don't match the sampler_type."""
440
+ with pytest.raises(ValidationError):
441
+ SamplerColumnConfig(
442
+ name="test_wrong_params",
443
+ sampler_type=SamplerType.UNIFORM,
444
+ params={"values": ["A", "B"]}, # Category params for uniform sampler
445
+ )
@@ -1,7 +1,6 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
- from pathlib import Path
5
4
  from unittest.mock import MagicMock, patch
6
5
 
7
6
  import numpy as np
@@ -13,6 +12,7 @@ import pytest
13
12
  from data_designer.config.datastore import (
14
13
  DatastoreSettings,
15
14
  fetch_seed_dataset_column_names,
15
+ fetch_seed_dataset_column_names_from_local_file,
16
16
  get_file_column_names,
17
17
  resolve_datastore_settings,
18
18
  upload_to_hf_hub,
@@ -127,22 +127,6 @@ def test_get_file_column_names_unicode(tmp_path, file_type):
127
127
  assert get_file_column_names(str(unicode_path), file_type) == df_unicode.columns.tolist()
128
128
 
129
129
 
130
- @pytest.mark.parametrize("file_type", ["parquet", "csv", "json", "jsonl"])
131
- def test_get_file_column_names_with_glob_pattern(tmp_path, file_type):
132
- df = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]})
133
- for i in range(5):
134
- _write_file(df, tmp_path / f"{i}.{file_type}", file_type)
135
- assert get_file_column_names(f"{tmp_path}/*.{file_type}", file_type) == ["col1", "col2"]
136
-
137
-
138
- def test_get_file_column_names_with_glob_pattern_error(tmp_path):
139
- df = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]})
140
- for i in range(5):
141
- _write_file(df, tmp_path / f"{i}.parquet", "parquet")
142
- with pytest.raises(InvalidFilePathError, match="No files found matching pattern"):
143
- get_file_column_names(f"{tmp_path}/*.csv", "csv")
144
-
145
-
146
130
  def test_get_file_column_names_with_filesystem_parquet():
147
131
  """Test get_file_column_names with filesystem parameter for parquet files."""
148
132
  mock_schema = MagicMock()
@@ -153,7 +137,7 @@ def test_get_file_column_names_with_filesystem_parquet():
153
137
  result = get_file_column_names("datasets/test/file.parquet", "parquet")
154
138
 
155
139
  assert result == ["col1", "col2", "col3"]
156
- mock_read_schema.assert_called_once_with(Path("datasets/test/file.parquet"))
140
+ mock_read_schema.assert_called_once_with("datasets/test/file.parquet")
157
141
 
158
142
 
159
143
  @pytest.mark.parametrize("file_type", ["json", "jsonl", "csv"])
@@ -274,3 +258,29 @@ def test_upload_to_hf_hub_error_handling(datastore_settings):
274
258
  with patch("data_designer.config.datastore.Path.is_file", autospec=True) as mock_is_file:
275
259
  mock_is_file.return_value = True
276
260
  upload_to_hf_hub("test.text", "test.txt", "test/repo", datastore_settings)
261
+
262
+
263
+ @pytest.mark.parametrize("file_type", ["parquet", "json", "jsonl", "csv"])
264
+ def test_fetch_seed_dataset_column_names_from_local_file_with_glob(tmp_path, file_type):
265
+ """Test fetch_seed_dataset_column_names_from_local_file with glob pattern matching multiple files."""
266
+ test_data = pd.DataFrame({"col1": [1, 2], "col2": [3, 4], "col3": [5, 6]})
267
+
268
+ # Create multiple files with the same schema
269
+ for i in range(3):
270
+ file_path = tmp_path / f"data_{i}.{file_type}"
271
+ _write_file(test_data, file_path, file_type)
272
+
273
+ # Test glob pattern that matches all files
274
+ glob_pattern = str(tmp_path / f"*.{file_type}")
275
+ result = fetch_seed_dataset_column_names_from_local_file(glob_pattern)
276
+
277
+ assert result == ["col1", "col2", "col3"]
278
+
279
+
280
+ @pytest.mark.parametrize("file_type", ["parquet", "csv"])
281
+ def test_fetch_seed_dataset_column_names_from_local_file_with_glob_no_matches(tmp_path, file_type):
282
+ """Test fetch_seed_dataset_column_names_from_local_file with glob pattern that matches no files."""
283
+ glob_pattern = str(tmp_path / f"nonexistent_*.{file_type}")
284
+
285
+ with pytest.raises(InvalidFilePathError, match="does not contain files of type"):
286
+ fetch_seed_dataset_column_names_from_local_file(glob_pattern)
@@ -11,9 +11,9 @@ import yaml
11
11
  from data_designer.config.default_model_settings import (
12
12
  get_builtin_model_configs,
13
13
  get_builtin_model_providers,
14
- get_defaul_model_providers_missing_api_keys,
15
14
  get_default_inference_parameters,
16
15
  get_default_model_configs,
16
+ get_default_model_providers_missing_api_keys,
17
17
  get_default_provider_name,
18
18
  get_default_providers,
19
19
  resolve_seed_default_model_settings,
@@ -152,4 +152,4 @@ def test_resolve_seed_default_model_settings(tmp_path: Path):
152
152
  @patch("data_designer.config.default_model_settings.os.environ.get")
153
153
  def test_get_default_model_providers_missing_api_keys(mock_environ_get):
154
154
  mock_environ_get.return_value = None
155
- assert get_defaul_model_providers_missing_api_keys() == ["NVIDIA_API_KEY", "OPENAI_API_KEY"]
155
+ assert get_default_model_providers_missing_api_keys() == ["NVIDIA_API_KEY", "OPENAI_API_KEY"]