data-designer 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (357)
  1. data_designer-0.1.0/.github/workflows/build-docs.yml +18 -0
  2. data_designer-0.1.0/.github/workflows/ci.yml +87 -0
  3. data_designer-0.1.0/.github/workflows/dco-assistant.yml +44 -0
  4. data_designer-0.1.0/.github/workflows/semantic-pull-requests.yml +26 -0
  5. data_designer-0.1.0/.gitignore +90 -0
  6. data_designer-0.1.0/.pre-commit-config.yaml +24 -0
  7. data_designer-0.1.0/AGENTS.md +417 -0
  8. data_designer-0.1.0/CLAUDE.md +3 -0
  9. data_designer-0.1.0/CODE_OF_CONDUCT.md +76 -0
  10. data_designer-0.1.0/CONTRIBUTING.md +242 -0
  11. data_designer-0.1.0/DCO +34 -0
  12. data_designer-0.1.0/LICENSE +201 -0
  13. data_designer-0.1.0/Makefile +116 -0
  14. data_designer-0.1.0/PKG-INFO +173 -0
  15. data_designer-0.1.0/README.md +124 -0
  16. data_designer-0.1.0/VERSIONING.md +90 -0
  17. data_designer-0.1.0/docs/CONTRIBUTING.md +1 -0
  18. data_designer-0.1.0/docs/assets/palette-favicon.png +0 -0
  19. data_designer-0.1.0/docs/code_reference/column_configs.md +8 -0
  20. data_designer-0.1.0/docs/code_reference/config_builder.md +10 -0
  21. data_designer-0.1.0/docs/code_reference/data_designer_config.md +7 -0
  22. data_designer-0.1.0/docs/code_reference/sampler_params.md +12 -0
  23. data_designer-0.1.0/docs/code_reference/validator_params.md +6 -0
  24. data_designer-0.1.0/docs/concepts/columns.md +136 -0
  25. data_designer-0.1.0/docs/concepts/persons.md +240 -0
  26. data_designer-0.1.0/docs/concepts/plugins.md +0 -0
  27. data_designer-0.1.0/docs/concepts/validators.md +340 -0
  28. data_designer-0.1.0/docs/css/mkdocstrings.css +80 -0
  29. data_designer-0.1.0/docs/css/style.css +184 -0
  30. data_designer-0.1.0/docs/index.md +48 -0
  31. data_designer-0.1.0/docs/installation.md +29 -0
  32. data_designer-0.1.0/docs/js/toc-toggle.js +22 -0
  33. data_designer-0.1.0/docs/models/configure-model-settings-with-the-cli.md +135 -0
  34. data_designer-0.1.0/docs/models/default-model-settings.md +96 -0
  35. data_designer-0.1.0/docs/models/model-configs.md +244 -0
  36. data_designer-0.1.0/docs/models/model-providers.md +50 -0
  37. data_designer-0.1.0/docs/notebooks/.gitignore +2 -0
  38. data_designer-0.1.0/docs/notebooks/1-the-basics.ipynb +1971 -0
  39. data_designer-0.1.0/docs/notebooks/2-structured-outputs-and-jinja-expressions.ipynb +1729 -0
  40. data_designer-0.1.0/docs/notebooks/3-seeding-with-a-dataset.ipynb +1913 -0
  41. data_designer-0.1.0/docs/notebooks/intro.md +120 -0
  42. data_designer-0.1.0/docs/quick-start.md +85 -0
  43. data_designer-0.1.0/mkdocs.yml +112 -0
  44. data_designer-0.1.0/pyproject.toml +138 -0
  45. data_designer-0.1.0/scripts/update_license_headers.py +215 -0
  46. data_designer-0.1.0/src/data_designer/__init__.py +15 -0
  47. data_designer-0.1.0/src/data_designer/_version.py +34 -0
  48. data_designer-0.1.0/src/data_designer/cli/README.md +236 -0
  49. data_designer-0.1.0/src/data_designer/cli/__init__.py +6 -0
  50. data_designer-0.1.0/src/data_designer/cli/commands/__init__.py +2 -0
  51. data_designer-0.1.0/src/data_designer/cli/commands/list.py +130 -0
  52. data_designer-0.1.0/src/data_designer/cli/commands/models.py +10 -0
  53. data_designer-0.1.0/src/data_designer/cli/commands/providers.py +11 -0
  54. data_designer-0.1.0/src/data_designer/cli/commands/reset.py +100 -0
  55. data_designer-0.1.0/src/data_designer/cli/controllers/__init__.py +7 -0
  56. data_designer-0.1.0/src/data_designer/cli/controllers/model_controller.py +246 -0
  57. data_designer-0.1.0/src/data_designer/cli/controllers/provider_controller.py +317 -0
  58. data_designer-0.1.0/src/data_designer/cli/forms/__init__.py +20 -0
  59. data_designer-0.1.0/src/data_designer/cli/forms/builder.py +51 -0
  60. data_designer-0.1.0/src/data_designer/cli/forms/field.py +180 -0
  61. data_designer-0.1.0/src/data_designer/cli/forms/form.py +59 -0
  62. data_designer-0.1.0/src/data_designer/cli/forms/model_builder.py +125 -0
  63. data_designer-0.1.0/src/data_designer/cli/forms/provider_builder.py +76 -0
  64. data_designer-0.1.0/src/data_designer/cli/main.py +44 -0
  65. data_designer-0.1.0/src/data_designer/cli/repositories/__init__.py +8 -0
  66. data_designer-0.1.0/src/data_designer/cli/repositories/base.py +39 -0
  67. data_designer-0.1.0/src/data_designer/cli/repositories/model_repository.py +42 -0
  68. data_designer-0.1.0/src/data_designer/cli/repositories/provider_repository.py +43 -0
  69. data_designer-0.1.0/src/data_designer/cli/services/__init__.py +7 -0
  70. data_designer-0.1.0/src/data_designer/cli/services/model_service.py +116 -0
  71. data_designer-0.1.0/src/data_designer/cli/services/provider_service.py +111 -0
  72. data_designer-0.1.0/src/data_designer/cli/ui.py +448 -0
  73. data_designer-0.1.0/src/data_designer/cli/utils.py +47 -0
  74. data_designer-0.1.0/src/data_designer/config/__init__.py +2 -0
  75. data_designer-0.1.0/src/data_designer/config/analysis/column_profilers.py +89 -0
  76. data_designer-0.1.0/src/data_designer/config/analysis/column_statistics.py +274 -0
  77. data_designer-0.1.0/src/data_designer/config/analysis/dataset_profiler.py +60 -0
  78. data_designer-0.1.0/src/data_designer/config/analysis/utils/errors.py +8 -0
  79. data_designer-0.1.0/src/data_designer/config/analysis/utils/reporting.py +188 -0
  80. data_designer-0.1.0/src/data_designer/config/base.py +68 -0
  81. data_designer-0.1.0/src/data_designer/config/column_configs.py +354 -0
  82. data_designer-0.1.0/src/data_designer/config/column_types.py +168 -0
  83. data_designer-0.1.0/src/data_designer/config/config_builder.py +660 -0
  84. data_designer-0.1.0/src/data_designer/config/data_designer_config.py +40 -0
  85. data_designer-0.1.0/src/data_designer/config/dataset_builders.py +11 -0
  86. data_designer-0.1.0/src/data_designer/config/datastore.py +151 -0
  87. data_designer-0.1.0/src/data_designer/config/default_model_settings.py +123 -0
  88. data_designer-0.1.0/src/data_designer/config/errors.py +19 -0
  89. data_designer-0.1.0/src/data_designer/config/interface.py +54 -0
  90. data_designer-0.1.0/src/data_designer/config/models.py +231 -0
  91. data_designer-0.1.0/src/data_designer/config/preview_results.py +32 -0
  92. data_designer-0.1.0/src/data_designer/config/processors.py +41 -0
  93. data_designer-0.1.0/src/data_designer/config/sampler_constraints.py +51 -0
  94. data_designer-0.1.0/src/data_designer/config/sampler_params.py +604 -0
  95. data_designer-0.1.0/src/data_designer/config/seed.py +145 -0
  96. data_designer-0.1.0/src/data_designer/config/utils/code_lang.py +83 -0
  97. data_designer-0.1.0/src/data_designer/config/utils/constants.py +313 -0
  98. data_designer-0.1.0/src/data_designer/config/utils/errors.py +19 -0
  99. data_designer-0.1.0/src/data_designer/config/utils/info.py +88 -0
  100. data_designer-0.1.0/src/data_designer/config/utils/io_helpers.py +273 -0
  101. data_designer-0.1.0/src/data_designer/config/utils/misc.py +81 -0
  102. data_designer-0.1.0/src/data_designer/config/utils/numerical_helpers.py +28 -0
  103. data_designer-0.1.0/src/data_designer/config/utils/type_helpers.py +100 -0
  104. data_designer-0.1.0/src/data_designer/config/utils/validation.py +336 -0
  105. data_designer-0.1.0/src/data_designer/config/utils/visualization.py +427 -0
  106. data_designer-0.1.0/src/data_designer/config/validator_params.py +96 -0
  107. data_designer-0.1.0/src/data_designer/engine/__init__.py +2 -0
  108. data_designer-0.1.0/src/data_designer/engine/analysis/column_profilers/base.py +55 -0
  109. data_designer-0.1.0/src/data_designer/engine/analysis/column_profilers/judge_score_profiler.py +160 -0
  110. data_designer-0.1.0/src/data_designer/engine/analysis/column_profilers/registry.py +20 -0
  111. data_designer-0.1.0/src/data_designer/engine/analysis/column_statistics.py +142 -0
  112. data_designer-0.1.0/src/data_designer/engine/analysis/dataset_profiler.py +125 -0
  113. data_designer-0.1.0/src/data_designer/engine/analysis/errors.py +7 -0
  114. data_designer-0.1.0/src/data_designer/engine/analysis/utils/column_statistics_calculations.py +209 -0
  115. data_designer-0.1.0/src/data_designer/engine/analysis/utils/judge_score_processing.py +128 -0
  116. data_designer-0.1.0/src/data_designer/engine/column_generators/__init__.py +2 -0
  117. data_designer-0.1.0/src/data_designer/engine/column_generators/generators/__init__.py +2 -0
  118. data_designer-0.1.0/src/data_designer/engine/column_generators/generators/base.py +61 -0
  119. data_designer-0.1.0/src/data_designer/engine/column_generators/generators/expression.py +63 -0
  120. data_designer-0.1.0/src/data_designer/engine/column_generators/generators/llm_generators.py +172 -0
  121. data_designer-0.1.0/src/data_designer/engine/column_generators/generators/samplers.py +75 -0
  122. data_designer-0.1.0/src/data_designer/engine/column_generators/generators/seed_dataset.py +149 -0
  123. data_designer-0.1.0/src/data_designer/engine/column_generators/generators/validation.py +147 -0
  124. data_designer-0.1.0/src/data_designer/engine/column_generators/registry.py +56 -0
  125. data_designer-0.1.0/src/data_designer/engine/column_generators/utils/errors.py +13 -0
  126. data_designer-0.1.0/src/data_designer/engine/column_generators/utils/judge_score_factory.py +57 -0
  127. data_designer-0.1.0/src/data_designer/engine/column_generators/utils/prompt_renderer.py +98 -0
  128. data_designer-0.1.0/src/data_designer/engine/configurable_task.py +82 -0
  129. data_designer-0.1.0/src/data_designer/engine/dataset_builders/artifact_storage.py +181 -0
  130. data_designer-0.1.0/src/data_designer/engine/dataset_builders/column_wise_builder.py +287 -0
  131. data_designer-0.1.0/src/data_designer/engine/dataset_builders/errors.py +13 -0
  132. data_designer-0.1.0/src/data_designer/engine/dataset_builders/multi_column_configs.py +44 -0
  133. data_designer-0.1.0/src/data_designer/engine/dataset_builders/utils/__init__.py +2 -0
  134. data_designer-0.1.0/src/data_designer/engine/dataset_builders/utils/concurrency.py +184 -0
  135. data_designer-0.1.0/src/data_designer/engine/dataset_builders/utils/config_compiler.py +60 -0
  136. data_designer-0.1.0/src/data_designer/engine/dataset_builders/utils/dag.py +56 -0
  137. data_designer-0.1.0/src/data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +190 -0
  138. data_designer-0.1.0/src/data_designer/engine/dataset_builders/utils/errors.py +13 -0
  139. data_designer-0.1.0/src/data_designer/engine/errors.py +49 -0
  140. data_designer-0.1.0/src/data_designer/engine/model_provider.py +75 -0
  141. data_designer-0.1.0/src/data_designer/engine/models/__init__.py +2 -0
  142. data_designer-0.1.0/src/data_designer/engine/models/errors.py +308 -0
  143. data_designer-0.1.0/src/data_designer/engine/models/facade.py +225 -0
  144. data_designer-0.1.0/src/data_designer/engine/models/litellm_overrides.py +162 -0
  145. data_designer-0.1.0/src/data_designer/engine/models/parsers/__init__.py +2 -0
  146. data_designer-0.1.0/src/data_designer/engine/models/parsers/errors.py +34 -0
  147. data_designer-0.1.0/src/data_designer/engine/models/parsers/parser.py +236 -0
  148. data_designer-0.1.0/src/data_designer/engine/models/parsers/postprocessors.py +93 -0
  149. data_designer-0.1.0/src/data_designer/engine/models/parsers/tag_parsers.py +60 -0
  150. data_designer-0.1.0/src/data_designer/engine/models/parsers/types.py +82 -0
  151. data_designer-0.1.0/src/data_designer/engine/models/recipes/base.py +79 -0
  152. data_designer-0.1.0/src/data_designer/engine/models/recipes/response_recipes.py +291 -0
  153. data_designer-0.1.0/src/data_designer/engine/models/registry.py +118 -0
  154. data_designer-0.1.0/src/data_designer/engine/models/usage.py +75 -0
  155. data_designer-0.1.0/src/data_designer/engine/models/utils.py +38 -0
  156. data_designer-0.1.0/src/data_designer/engine/processing/ginja/__init__.py +2 -0
  157. data_designer-0.1.0/src/data_designer/engine/processing/ginja/ast.py +64 -0
  158. data_designer-0.1.0/src/data_designer/engine/processing/ginja/environment.py +461 -0
  159. data_designer-0.1.0/src/data_designer/engine/processing/ginja/exceptions.py +54 -0
  160. data_designer-0.1.0/src/data_designer/engine/processing/ginja/record.py +30 -0
  161. data_designer-0.1.0/src/data_designer/engine/processing/gsonschema/__init__.py +2 -0
  162. data_designer-0.1.0/src/data_designer/engine/processing/gsonschema/exceptions.py +8 -0
  163. data_designer-0.1.0/src/data_designer/engine/processing/gsonschema/schema_transformers.py +81 -0
  164. data_designer-0.1.0/src/data_designer/engine/processing/gsonschema/types.py +8 -0
  165. data_designer-0.1.0/src/data_designer/engine/processing/gsonschema/validators.py +143 -0
  166. data_designer-0.1.0/src/data_designer/engine/processing/processors/base.py +15 -0
  167. data_designer-0.1.0/src/data_designer/engine/processing/processors/drop_columns.py +46 -0
  168. data_designer-0.1.0/src/data_designer/engine/processing/processors/registry.py +20 -0
  169. data_designer-0.1.0/src/data_designer/engine/processing/utils.py +120 -0
  170. data_designer-0.1.0/src/data_designer/engine/registry/base.py +97 -0
  171. data_designer-0.1.0/src/data_designer/engine/registry/data_designer_registry.py +37 -0
  172. data_designer-0.1.0/src/data_designer/engine/registry/errors.py +10 -0
  173. data_designer-0.1.0/src/data_designer/engine/resources/managed_dataset_generator.py +35 -0
  174. data_designer-0.1.0/src/data_designer/engine/resources/managed_dataset_repository.py +194 -0
  175. data_designer-0.1.0/src/data_designer/engine/resources/managed_storage.py +63 -0
  176. data_designer-0.1.0/src/data_designer/engine/resources/resource_provider.py +46 -0
  177. data_designer-0.1.0/src/data_designer/engine/resources/seed_dataset_data_store.py +66 -0
  178. data_designer-0.1.0/src/data_designer/engine/sampling_gen/column.py +89 -0
  179. data_designer-0.1.0/src/data_designer/engine/sampling_gen/constraints.py +95 -0
  180. data_designer-0.1.0/src/data_designer/engine/sampling_gen/data_sources/base.py +214 -0
  181. data_designer-0.1.0/src/data_designer/engine/sampling_gen/data_sources/errors.py +10 -0
  182. data_designer-0.1.0/src/data_designer/engine/sampling_gen/data_sources/sources.py +342 -0
  183. data_designer-0.1.0/src/data_designer/engine/sampling_gen/entities/__init__.py +2 -0
  184. data_designer-0.1.0/src/data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
  185. data_designer-0.1.0/src/data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +64 -0
  186. data_designer-0.1.0/src/data_designer/engine/sampling_gen/entities/email_address_utils.py +169 -0
  187. data_designer-0.1.0/src/data_designer/engine/sampling_gen/entities/errors.py +8 -0
  188. data_designer-0.1.0/src/data_designer/engine/sampling_gen/entities/national_id_utils.py +100 -0
  189. data_designer-0.1.0/src/data_designer/engine/sampling_gen/entities/person.py +142 -0
  190. data_designer-0.1.0/src/data_designer/engine/sampling_gen/entities/phone_number.py +122 -0
  191. data_designer-0.1.0/src/data_designer/engine/sampling_gen/errors.py +24 -0
  192. data_designer-0.1.0/src/data_designer/engine/sampling_gen/generator.py +121 -0
  193. data_designer-0.1.0/src/data_designer/engine/sampling_gen/jinja_utils.py +60 -0
  194. data_designer-0.1.0/src/data_designer/engine/sampling_gen/people_gen.py +203 -0
  195. data_designer-0.1.0/src/data_designer/engine/sampling_gen/person_constants.py +54 -0
  196. data_designer-0.1.0/src/data_designer/engine/sampling_gen/schema.py +143 -0
  197. data_designer-0.1.0/src/data_designer/engine/sampling_gen/schema_builder.py +59 -0
  198. data_designer-0.1.0/src/data_designer/engine/sampling_gen/utils.py +40 -0
  199. data_designer-0.1.0/src/data_designer/engine/secret_resolver.py +80 -0
  200. data_designer-0.1.0/src/data_designer/engine/validators/__init__.py +17 -0
  201. data_designer-0.1.0/src/data_designer/engine/validators/base.py +36 -0
  202. data_designer-0.1.0/src/data_designer/engine/validators/local_callable.py +34 -0
  203. data_designer-0.1.0/src/data_designer/engine/validators/python.py +245 -0
  204. data_designer-0.1.0/src/data_designer/engine/validators/remote.py +83 -0
  205. data_designer-0.1.0/src/data_designer/engine/validators/sql.py +60 -0
  206. data_designer-0.1.0/src/data_designer/errors.py +5 -0
  207. data_designer-0.1.0/src/data_designer/essentials/__init__.py +137 -0
  208. data_designer-0.1.0/src/data_designer/interface/__init__.py +2 -0
  209. data_designer-0.1.0/src/data_designer/interface/data_designer.py +351 -0
  210. data_designer-0.1.0/src/data_designer/interface/errors.py +16 -0
  211. data_designer-0.1.0/src/data_designer/interface/results.py +55 -0
  212. data_designer-0.1.0/src/data_designer/logging.py +161 -0
  213. data_designer-0.1.0/src/data_designer/plugin_manager.py +83 -0
  214. data_designer-0.1.0/src/data_designer/plugins/__init__.py +6 -0
  215. data_designer-0.1.0/src/data_designer/plugins/errors.py +10 -0
  216. data_designer-0.1.0/src/data_designer/plugins/plugin.py +69 -0
  217. data_designer-0.1.0/src/data_designer/plugins/registry.py +86 -0
  218. data_designer-0.1.0/tests/cli/commands/test_list_command.py +82 -0
  219. data_designer-0.1.0/tests/cli/commands/test_models_command.py +18 -0
  220. data_designer-0.1.0/tests/cli/commands/test_providers_command.py +18 -0
  221. data_designer-0.1.0/tests/cli/commands/test_reset_command.py +287 -0
  222. data_designer-0.1.0/tests/cli/conftest.py +93 -0
  223. data_designer-0.1.0/tests/cli/controllers/test_model_controller.py +159 -0
  224. data_designer-0.1.0/tests/cli/controllers/test_provider_controller.py +226 -0
  225. data_designer-0.1.0/tests/cli/forms/test_field.py +320 -0
  226. data_designer-0.1.0/tests/cli/forms/test_form.py +222 -0
  227. data_designer-0.1.0/tests/cli/forms/test_model_builder.py +344 -0
  228. data_designer-0.1.0/tests/cli/forms/test_provider_builder.py +233 -0
  229. data_designer-0.1.0/tests/cli/repositories/test_model_repository.py +34 -0
  230. data_designer-0.1.0/tests/cli/repositories/test_provider_repository.py +37 -0
  231. data_designer-0.1.0/tests/cli/services/test_model_service.py +134 -0
  232. data_designer-0.1.0/tests/cli/services/test_provider_service.py +167 -0
  233. data_designer-0.1.0/tests/cli/test_cli_utils.py +111 -0
  234. data_designer-0.1.0/tests/config/analysis/conftest.py +82 -0
  235. data_designer-0.1.0/tests/config/analysis/test_column_statistics.py +292 -0
  236. data_designer-0.1.0/tests/config/analysis/test_dataset_profiler_results.py +160 -0
  237. data_designer-0.1.0/tests/config/analysis/utils/test_reporting.py +292 -0
  238. data_designer-0.1.0/tests/config/test_columns.py +326 -0
  239. data_designer-0.1.0/tests/config/test_config_builder.py +761 -0
  240. data_designer-0.1.0/tests/config/test_data_designer_config.py +29 -0
  241. data_designer-0.1.0/tests/config/test_datastore.py +236 -0
  242. data_designer-0.1.0/tests/config/test_default_model_settings.py +149 -0
  243. data_designer-0.1.0/tests/config/test_models.py +253 -0
  244. data_designer-0.1.0/tests/config/test_processors.py +66 -0
  245. data_designer-0.1.0/tests/config/test_sampler_constraints.py +25 -0
  246. data_designer-0.1.0/tests/config/test_sampler_params.py +141 -0
  247. data_designer-0.1.0/tests/config/test_seed.py +101 -0
  248. data_designer-0.1.0/tests/config/test_validator_params.py +52 -0
  249. data_designer-0.1.0/tests/config/utils/__init__.py +2 -0
  250. data_designer-0.1.0/tests/config/utils/test_code_lang.py +37 -0
  251. data_designer-0.1.0/tests/config/utils/test_info.py +58 -0
  252. data_designer-0.1.0/tests/config/utils/test_io_helpers.py +172 -0
  253. data_designer-0.1.0/tests/config/utils/test_misc.py +75 -0
  254. data_designer-0.1.0/tests/config/utils/test_type_helpers.py +162 -0
  255. data_designer-0.1.0/tests/config/utils/test_validation.py +266 -0
  256. data_designer-0.1.0/tests/config/utils/test_visualization.py +77 -0
  257. data_designer-0.1.0/tests/conftest.py +312 -0
  258. data_designer-0.1.0/tests/engine/analysis/column_profilers/test_base.py +67 -0
  259. data_designer-0.1.0/tests/engine/analysis/column_profilers/test_judge_score_profiler.py +286 -0
  260. data_designer-0.1.0/tests/engine/analysis/conftest.py +153 -0
  261. data_designer-0.1.0/tests/engine/analysis/test_column_statistics_calculator.py +73 -0
  262. data_designer-0.1.0/tests/engine/analysis/test_data/artifacts/dataset/column_configs.json +142 -0
  263. data_designer-0.1.0/tests/engine/analysis/test_data/artifacts/dataset/dataset.json +2929 -0
  264. data_designer-0.1.0/tests/engine/analysis/test_data/artifacts/dataset/metadata.json +27 -0
  265. data_designer-0.1.0/tests/engine/analysis/test_dataset_profiler.py +160 -0
  266. data_designer-0.1.0/tests/engine/analysis/test_errors.py +59 -0
  267. data_designer-0.1.0/tests/engine/analysis/utils/test_column_statistics_calculations.py +307 -0
  268. data_designer-0.1.0/tests/engine/analysis/utils/test_judge_score_processing.py +164 -0
  269. data_designer-0.1.0/tests/engine/column_generators/generators/__init__.py +2 -0
  270. data_designer-0.1.0/tests/engine/column_generators/generators/test_column_generator_base.py +99 -0
  271. data_designer-0.1.0/tests/engine/column_generators/generators/test_expression.py +168 -0
  272. data_designer-0.1.0/tests/engine/column_generators/generators/test_llm_generators.py +278 -0
  273. data_designer-0.1.0/tests/engine/column_generators/generators/test_samplers.py +131 -0
  274. data_designer-0.1.0/tests/engine/column_generators/generators/test_seed_dataset.py +788 -0
  275. data_designer-0.1.0/tests/engine/column_generators/generators/test_validation.py +239 -0
  276. data_designer-0.1.0/tests/engine/column_generators/test_registry.py +39 -0
  277. data_designer-0.1.0/tests/engine/column_generators/utils/test_column_generator_errors.py +15 -0
  278. data_designer-0.1.0/tests/engine/column_generators/utils/test_judge_score_factory.py +75 -0
  279. data_designer-0.1.0/tests/engine/column_generators/utils/test_prompt_renderer.py +127 -0
  280. data_designer-0.1.0/tests/engine/conftest.py +52 -0
  281. data_designer-0.1.0/tests/engine/dataset_builders/test_artifact_storage.py +215 -0
  282. data_designer-0.1.0/tests/engine/dataset_builders/test_column_wise_builder.py +210 -0
  283. data_designer-0.1.0/tests/engine/dataset_builders/test_multi_column_configs.py +157 -0
  284. data_designer-0.1.0/tests/engine/dataset_builders/utils/test_concurrency.py +366 -0
  285. data_designer-0.1.0/tests/engine/dataset_builders/utils/test_config_compiler.py +85 -0
  286. data_designer-0.1.0/tests/engine/dataset_builders/utils/test_dag.py +113 -0
  287. data_designer-0.1.0/tests/engine/dataset_builders/utils/test_dataset_batch_manager.py +387 -0
  288. data_designer-0.1.0/tests/engine/models/conftest.py +62 -0
  289. data_designer-0.1.0/tests/engine/models/parsers/test_parser.py +173 -0
  290. data_designer-0.1.0/tests/engine/models/parsers/test_parsers_types.py +96 -0
  291. data_designer-0.1.0/tests/engine/models/parsers/test_postprocessors.py +122 -0
  292. data_designer-0.1.0/tests/engine/models/parsers/test_tag_parsers.py +118 -0
  293. data_designer-0.1.0/tests/engine/models/recipes/test_recipe_base.py +130 -0
  294. data_designer-0.1.0/tests/engine/models/recipes/test_response_recipes.py +257 -0
  295. data_designer-0.1.0/tests/engine/models/stub_secrets.json +3 -0
  296. data_designer-0.1.0/tests/engine/models/test_facade.py +174 -0
  297. data_designer-0.1.0/tests/engine/models/test_litellm_overrides.py +131 -0
  298. data_designer-0.1.0/tests/engine/models/test_model_errors.py +231 -0
  299. data_designer-0.1.0/tests/engine/models/test_model_registry.py +204 -0
  300. data_designer-0.1.0/tests/engine/models/test_model_utils.py +36 -0
  301. data_designer-0.1.0/tests/engine/models/test_usage.py +65 -0
  302. data_designer-0.1.0/tests/engine/processing/__init__.py +2 -0
  303. data_designer-0.1.0/tests/engine/processing/ginja/__init__.py +2 -0
  304. data_designer-0.1.0/tests/engine/processing/ginja/test_ast.py +124 -0
  305. data_designer-0.1.0/tests/engine/processing/ginja/test_environment.py +213 -0
  306. data_designer-0.1.0/tests/engine/processing/ginja/test_exceptions.py +21 -0
  307. data_designer-0.1.0/tests/engine/processing/ginja/test_record.py +25 -0
  308. data_designer-0.1.0/tests/engine/processing/gsonschema/__init__.py +2 -0
  309. data_designer-0.1.0/tests/engine/processing/gsonschema/test_exceptions.py +42 -0
  310. data_designer-0.1.0/tests/engine/processing/gsonschema/test_schema_transformers.py +368 -0
  311. data_designer-0.1.0/tests/engine/processing/gsonschema/test_types.py +109 -0
  312. data_designer-0.1.0/tests/engine/processing/gsonschema/test_validators.py +198 -0
  313. data_designer-0.1.0/tests/engine/processing/processors/__init__.py +2 -0
  314. data_designer-0.1.0/tests/engine/processing/processors/test_drop_columns.py +162 -0
  315. data_designer-0.1.0/tests/engine/processing/processors/test_registry.py +18 -0
  316. data_designer-0.1.0/tests/engine/processing/test_utils.py +118 -0
  317. data_designer-0.1.0/tests/engine/registry/__init__.py +2 -0
  318. data_designer-0.1.0/tests/engine/registry/conftest.py +37 -0
  319. data_designer-0.1.0/tests/engine/registry/test_base.py +227 -0
  320. data_designer-0.1.0/tests/engine/registry/test_data_designer_registry.py +215 -0
  321. data_designer-0.1.0/tests/engine/registry/test_errors.py +63 -0
  322. data_designer-0.1.0/tests/engine/resources/__init__.py +2 -0
  323. data_designer-0.1.0/tests/engine/resources/conftest.py +57 -0
  324. data_designer-0.1.0/tests/engine/resources/test_managed_dataset_generator.py +119 -0
  325. data_designer-0.1.0/tests/engine/resources/test_managed_dataset_repository.py +215 -0
  326. data_designer-0.1.0/tests/engine/resources/test_managed_storage.py +95 -0
  327. data_designer-0.1.0/tests/engine/resources/test_resource_provider.py +60 -0
  328. data_designer-0.1.0/tests/engine/sampling_gen/conftest.py +300 -0
  329. data_designer-0.1.0/tests/engine/sampling_gen/data_sources/test_sampler_errors.py +17 -0
  330. data_designer-0.1.0/tests/engine/sampling_gen/data_sources/test_sources.py +363 -0
  331. data_designer-0.1.0/tests/engine/sampling_gen/entities/test_email_address_utils.py +105 -0
  332. data_designer-0.1.0/tests/engine/sampling_gen/entities/test_national_id_utils.py +61 -0
  333. data_designer-0.1.0/tests/engine/sampling_gen/entities/test_person.py +286 -0
  334. data_designer-0.1.0/tests/engine/sampling_gen/entities/test_phone_number.py +94 -0
  335. data_designer-0.1.0/tests/engine/sampling_gen/test_column.py +101 -0
  336. data_designer-0.1.0/tests/engine/sampling_gen/test_constraints.py +100 -0
  337. data_designer-0.1.0/tests/engine/sampling_gen/test_generator.py +530 -0
  338. data_designer-0.1.0/tests/engine/sampling_gen/test_jinja_utils.py +113 -0
  339. data_designer-0.1.0/tests/engine/sampling_gen/test_people_gen.py +56 -0
  340. data_designer-0.1.0/tests/engine/sampling_gen/test_schema.py +255 -0
  341. data_designer-0.1.0/tests/engine/sampling_gen/test_utils.py +36 -0
  342. data_designer-0.1.0/tests/engine/test_configurable_task.py +163 -0
  343. data_designer-0.1.0/tests/engine/test_engine_errors.py +61 -0
  344. data_designer-0.1.0/tests/engine/test_model_provider.py +61 -0
  345. data_designer-0.1.0/tests/engine/test_secret_resolver.py +89 -0
  346. data_designer-0.1.0/tests/engine/validators/test_local_callable.py +33 -0
  347. data_designer-0.1.0/tests/engine/validators/test_python.py +121 -0
  348. data_designer-0.1.0/tests/engine/validators/test_remote.py +58 -0
  349. data_designer-0.1.0/tests/engine/validators/test_sql.py +22 -0
  350. data_designer-0.1.0/tests/essentials/test_init.py +317 -0
  351. data_designer-0.1.0/tests/interface/test_data_designer.py +479 -0
  352. data_designer-0.1.0/tests/interface/test_results.py +259 -0
  353. data_designer-0.1.0/tests/plugins/test_plugin.py +181 -0
  354. data_designer-0.1.0/tests/plugins/test_plugin_registry.py +289 -0
  355. data_designer-0.1.0/tests/test_logging.py +210 -0
  356. data_designer-0.1.0/tests/test_plugin_manager.py +232 -0
  357. data_designer-0.1.0/uv.lock +4979 -0
@@ -0,0 +1,18 @@
1
+ name: build_docs
2
+ on:
3
+ push:
4
+ branches:
5
+ - main
6
+ jobs:
7
+ deploy:
8
+ runs-on: ubuntu-latest
9
+ permissions:
10
+ contents: write
11
+ steps:
12
+ - uses: actions/checkout@v2
13
+ - uses: astral-sh/setup-uv@v6
14
+ with:
15
+ version: "0.9.5"
16
+ - run: uv python install
17
+ - run: uv sync --group docs
18
+ - run: uv run mkdocs gh-deploy --force --clean --verbose
@@ -0,0 +1,87 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [ main ]
6
+ pull_request:
7
+ branches: [ main ]
8
+ workflow_dispatch:
9
+
10
+ jobs:
11
+ test:
12
+ name: Test (Python ${{ matrix.python-version }} on ${{ matrix.os }})
13
+ runs-on: ${{ matrix.os }}
14
+ strategy:
15
+ fail-fast: false
16
+ matrix:
17
+ os: [ubuntu-latest, macos-latest]
18
+ python-version: ["3.10", "3.11", "3.12", "3.13"]
19
+
20
+ steps:
21
+ - name: Checkout code
22
+ uses: actions/checkout@v4
23
+
24
+ - name: Install uv
25
+ uses: astral-sh/setup-uv@v5
26
+ with:
27
+ version: "latest"
28
+ python-version: ${{ matrix.python-version }}
29
+ enable-cache: true
30
+
31
+ - name: Install dependencies
32
+ run: |
33
+ uv sync --group dev
34
+
35
+ - name: Run tests with coverage
36
+ run: |
37
+ uv run pytest -v --cov=data_designer --cov-report=term-missing --cov-report=xml --cov-fail-under=90
38
+
39
+ lint:
40
+ name: Lint and Format Check
41
+ runs-on: ubuntu-latest
42
+
43
+ steps:
44
+ - name: Checkout code
45
+ uses: actions/checkout@v4
46
+
47
+ - name: Install uv
48
+ uses: astral-sh/setup-uv@v5
49
+ with:
50
+ version: "latest"
51
+ python-version: "3.11"
52
+ enable-cache: true
53
+
54
+ - name: Install dependencies
55
+ run: |
56
+ uv sync --group dev
57
+
58
+ - name: Check formatting
59
+ run: |
60
+ uv run ruff format --check
61
+
62
+ - name: Run linter
63
+ run: |
64
+ uv run ruff check
65
+
66
+ license-headers:
67
+ name: Check License Headers
68
+ runs-on: ubuntu-latest
69
+
70
+ steps:
71
+ - name: Checkout code
72
+ uses: actions/checkout@v4
73
+
74
+ - name: Install uv
75
+ uses: astral-sh/setup-uv@v5
76
+ with:
77
+ version: "latest"
78
+ python-version: "3.11"
79
+ enable-cache: true
80
+
81
+ - name: Install dependencies
82
+ run: |
83
+ uv sync --group dev
84
+
85
+ - name: Check license headers
86
+ run: |
87
+ uv run python scripts/update_license_headers.py --check
@@ -0,0 +1,44 @@
1
+ name: "DCO Assistant"
2
+ on:
3
+ issue_comment:
4
+ types: [created]
5
+ pull_request_target:
6
+ types: [opened,closed,synchronize]
7
+
8
+ permissions:
9
+ actions: write
10
+ checks: none
11
+ contents: write
12
+ deployments: none
13
+ id-token: none
14
+ issues: none
15
+ discussions: none
16
+ packages: none
17
+ pages: none
18
+ pull-requests: write
19
+ repository-projects: none
20
+ security-events: none
21
+ statuses: write
22
+
23
+ jobs:
24
+ DCOAssistant:
25
+ if: github.repository_owner == 'NVIDIA-NeMo'
26
+ runs-on: ubuntu-latest
27
+ steps:
28
+ - name: "DCO Assistant"
29
+ if: (github.event.comment.body == 'recheck' || github.event.comment.body == 'I have read the Contributor Agreement including DCO and I hereby sign the Contributor Agreement and DCO') || github.event_name == 'pull_request_target'
30
+ uses: contributor-assistant/github-action@ca4a40a7d1004f18d9960b404b97e5f30a505a08
31
+ env:
32
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
33
+ PERSONAL_ACCESS_TOKEN: ${{ secrets.DCO_ASSISTANT_TOKEN }}
34
+ with:
35
+ path-to-signatures: "dco-signatures.json"
36
+ path-to-document: 'https://github.com/NVIDIA-NeMo/DataDesigner/blob/main/DCO'
37
+ branch: 'signatures'
38
+ allowlist: dependabot
39
+ create-file-commit-message: "chore: create file to store dco signatures"
40
+ signed-commit-message: "chore: $contributorName has signed the dco in #$pullRequestNo"
41
+ custom-notsigned-prcomment: "Thank you for your submission! We ask that $you sign our [Developer Certificate of Origin](https://github.com/NVIDIA-NeMo/DataDesigner/blob/main/DCO) before we can accept your contribution. You can sign the DCO by adding a comment below using this text:"
42
+ custom-pr-sign-comment: "I have read the DCO document and I hereby sign the DCO."
43
+ lock-pullrequest-aftermerge: false
44
+ use-dco-flag: true
@@ -0,0 +1,26 @@
1
+ name: Validate PR title
2
+
3
+ on:
4
+ pull_request_target:
5
+ types:
6
+ - opened
7
+ - edited
8
+ - synchronize
9
+ - reopened
10
+ pull_request:
11
+ types:
12
+ - opened
13
+ - edited
14
+ - synchronize
15
+ - reopened
16
+
17
+ defaults:
18
+ run:
19
+ shell: bash -x -e -u -o pipefail {0}
20
+
21
+ permissions:
22
+ pull-requests: read
23
+
24
+ jobs:
25
+ semantic-pull-request:
26
+ uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_semantic_pull_request.yml@v0.65.12
@@ -0,0 +1,90 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # Installer logs
30
+ pip-log.txt
31
+ pip-delete-this-directory.txt
32
+
33
+ # Unit test / coverage reports
34
+ htmlcov/
35
+ .coverage
36
+ .coverage.*
37
+ .cache
38
+ coverage.xml
39
+ .pytest_cache/
40
+
41
+ # mkdocs documentation
42
+ /site
43
+
44
+ # Jupyter Notebook
45
+ .ipynb_checkpoints
46
+
47
+ # IPython
48
+ profile_default/
49
+ ipython_config.py
50
+
51
+ # pyenv
52
+ .python-version
53
+
54
+ # uv
55
+ .venv/
56
+ .uv/
57
+
58
+ # Environments
59
+ .env
60
+ env/
61
+ venv/
62
+ ENV/
63
+ env.bak/
64
+ venv.bak/
65
+
66
+ # Ruff
67
+ .ruff_cache/
68
+
69
+ # IDEs
70
+ .vscode/
71
+ .idea/
72
+ *.swp
73
+ *.swo
74
+ *~
75
+ .DS_Store
76
+
77
+ # Build artifacts
78
+ *.whl
79
+ *.tar.gz
80
+ *.zip
81
+
82
+ # Auto-generated version file
83
+ src/data_designer/_version.py
84
+
85
+ # Local scratch space
86
+ .scratch/
87
+
88
+ .claude/
89
+
90
+ docs/examples/artifacts/
@@ -0,0 +1,24 @@
1
+ repos:
2
+ # General file checks
3
+ - repo: https://github.com/pre-commit/pre-commit-hooks
4
+ rev: v5.0.0
5
+ hooks:
6
+ - id: trailing-whitespace
7
+ - id: end-of-file-fixer
8
+ - id: check-yaml
9
+ - id: check-added-large-files
10
+ - id: check-json
11
+ - id: check-toml
12
+ - id: check-merge-conflict
13
+ - id: debug-statements
14
+ - id: mixed-line-ending
15
+
16
+ # Ruff - linting and formatting (using your existing ruff config)
17
+ - repo: https://github.com/astral-sh/ruff-pre-commit
18
+ rev: v0.12.3
19
+ hooks:
20
+ # Run the linter
21
+ - id: ruff
22
+ args: [--fix]
23
+ # Run the formatter
24
+ - id: ruff-format
@@ -0,0 +1,417 @@
1
+ # AGENTS.md
2
+
3
+ This file provides guidance to agents when working with code in this repository.
4
+
5
+ ## Project Overview
6
+
7
+ **DataDesigner** is an NVIDIA NeMo project for creating synthetic datasets from scratch. It's a comprehensive framework that generates structured data using multiple generation strategies:
8
+
9
+ - **Sampled data**: Built-in generators (UUID, DateTime, etc.) and Faker integration
10
+ - **LLM-generated content**: Text, code, and structured data via LiteLLM
11
+ - **Expression-based columns**: Derived columns using Jinja2 templates
12
+ - **Validation & scoring**: Python, SQL, and remote validators; LLM-based judge scoring
13
+ - **Seed dataset-based generation**: Generate from existing datasets
14
+
15
+ ### Architecture
16
+
17
+ The project follows a layered architecture:
18
+
19
+ 1. **Config Layer** ([src/data_designer/config/](src/data_designer/config/)): User-facing configuration API
20
+ - `config_builder.py`: Main builder API for constructing configurations
21
+ - `columns.py`: Column configuration types (Sampler, LLMText, LLMCode, LLMStructured, LLMJudge, Expression, Validation, SeedDataset)
22
+ - `models.py`: Model configurations and inference parameters
23
+ - `sampler_params.py`: Parametrized samplers (Uniform, Category, Person, DateTime, etc.)
24
+
25
+ 2. **Engine Layer** ([src/data_designer/engine/](src/data_designer/engine/)): Internal generation and processing
26
+ - `column_generators/`: Generates individual columns from configs
27
+ - `dataset_builders/`: Orchestrates full dataset generation with DAG-based dependency management
28
+ - `models/`: LLM integration via LiteLLM with response parsing
29
+ - `validators/`: Column validation (Python, SQL, Code, Remote)
30
+ - `sampling_gen/`: Sophisticated person/entity sampling
31
+
32
+ 3. **Interface Layer** ([src/data_designer/interface/](src/data_designer/interface/)): Public API
33
+ - `data_designer.py`: Main `DataDesigner` class (primary entry point)
34
+ - `results.py`: Result containers
35
+ - `errors.py`: Public error types
36
+
37
+ 4. **Essentials** ([src/data_designer/essentials/](src/data_designer/essentials/)): Convenience module re-exporting key classes for users
38
+
39
+ ### Key Design Patterns
40
+
41
+ - **Builder pattern**: Configuration construction via `DataDesignerConfigBuilder`
42
+ - **Registry pattern**: Plugin system for column generators, validators, and profilers
43
+ - **Strategy pattern**: Multiple generation approaches (sampled, LLM, expression, seed)
44
+ - **DAG-based execution**: Column dependencies managed as directed acyclic graph
45
+
46
+ ## Development Workflow
47
+
48
+ This project uses `uv` for dependency management and `make` for common tasks:
49
+
50
+ ```bash
51
+ # Install dependencies
52
+ uv sync
53
+
54
+ # Install with all optional extras (CI installs the dev group with: uv sync --group dev)
55
+ uv sync --all-extras
56
+
57
+ # Run the main module (if applicable)
58
+ uv run python -m data_designer
59
+ ```
60
+
61
+ ### Code Quality
62
+
63
+ ```bash
64
+ # Using Make (recommended)
65
+ make lint # Run ruff linter
66
+ make lint-fix # Fix linting issues automatically
67
+ make format # Format code with ruff
68
+ make format-check # Check code formatting without changes
69
+ make check-all # Run all checks (format-check + lint)
70
+ make check-all-fix # Run all checks with autofix (format + lint-fix)
71
+
72
+ # Direct commands
73
+ uv run ruff check # Lint all files
74
+ uv run ruff check --fix # Lint with autofix
75
+ uv run ruff format # Format all files
76
+ uv run ruff format --check # Check formatting
77
+ ```
78
+
79
+ ### Running Tests
80
+
81
+ ```bash
82
+ # Run all tests
83
+ uv run pytest
84
+
85
+ # Run tests with verbose output
86
+ uv run pytest -v
87
+
88
+ # Run a specific test file
89
+ uv run pytest tests/config/test_sampler_constraints.py
90
+
91
+ # Run tests with coverage
92
+ uv run pytest --cov=data_designer --cov-report=term-missing --cov-report=html
93
+
94
+ # Using Make
95
+ make test # Run all tests
96
+ make coverage # Run tests with coverage report
97
+ ```
98
+
99
+ ## Key Files
100
+
101
+ - [src/data_designer/interface/data_designer.py](src/data_designer/interface/data_designer.py) - Main entry point (`DataDesigner` class)
102
+ - [src/data_designer/config/config_builder.py](src/data_designer/config/config_builder.py) - Configuration API (`DataDesignerConfigBuilder`)
103
+ - [src/data_designer/engine/dataset_builders/column_wise_builder.py](src/data_designer/engine/dataset_builders/column_wise_builder.py) - Generation orchestrator
104
+ - [src/data_designer/essentials/\_\_init\_\_.py](src/data_designer/essentials/__init__.py) - User-facing API exports
105
+ - [pyproject.toml](pyproject.toml) - Project dependencies and tool configurations
106
+ - [Makefile](Makefile) - Common development commands
107
+
108
+ ## Working Guidelines
109
+
110
+ - **Comments**: Only insert comments when code is especially important to understand. For basic code blocks, comments aren't necessary. We want readable code without vacuous comments.
111
+ - **License headers**: All Python files must include the NVIDIA SPDX license header:
112
+ ```python
113
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
114
+ # SPDX-License-Identifier: Apache-2.0
115
+ ```
116
+ Use `make update-license-headers` to add headers to all files automatically.
117
+ - **Imports**: Avoid importing Python modules inside method definitions. Prefer module-level imports for better performance and clarity.
118
+ - **Type annotations**: ALWAYS add type annotations to all functions, methods, and class attributes (including tests).
119
+
120
+ ## Code Style
121
+
122
+ This project uses `ruff` (v0.12.3) for linting and formatting. Follow these guidelines to avoid linter errors:
123
+
124
+ ### General Formatting
125
+
126
+ - **Line length**: Maximum 120 characters per line
127
+ - **Quote style**: Always use double quotes (`"`) for strings
128
+ - **Indentation**: Use 4 spaces (never tabs)
129
+ - **Target version**: Python 3.11+
130
+
131
+ ### Type Annotations
132
+
133
+ Type annotations are REQUIRED for all code in this project. This is strictly enforced for code quality and maintainability.
134
+
135
+ - **ALWAYS** add type annotations to all functions, methods, and class attributes (including tests)
136
+ - Use built-in generic types when possible: `list` not `List`, `dict` not `Dict`, `set` not `Set`, `tuple` not `Tuple`
137
+ - Use modern union syntax with `|` for optional and union types (Python 3.10+):
138
+ - `str | None` not `Optional[str]`
139
+ - `int | str` not `Union[int, str]`
140
+ - Only import from `typing` when absolutely necessary for complex generic types
141
+ - For Pydantic models, use field-level type annotations
142
+
143
+ ```python
144
+ # Good
145
+ def process_items(items: list[str], max_count: int | None = None) -> dict[str, int]:
146
+ return {item: len(item) for item in items}
147
+
148
+ # Avoid - missing type annotations
149
+ def process_items(items, max_count=None):
150
+ return {item: len(item) for item in items}
151
+
152
+ # Avoid - old-style typing
153
+ from typing import List, Dict, Optional
154
+ def process_items(items: List[str], max_count: Optional[int] = None) -> Dict[str, int]:
155
+ return {item: len(item) for item in items}
156
+ ```
157
+
158
+ ### Import Style
159
+
160
+ - **ALWAYS** use absolute imports, never relative imports
161
+ - Place imports at module level, not inside functions
162
+ - Import sorting is handled by `ruff`'s `isort` - imports should be grouped and sorted:
163
+ 1. Standard library imports
164
+ 2. Third-party imports
165
+ 3. First-party imports (`data_designer`)
166
+ - Use standard import conventions (enforced by `ICN`)
167
+
168
+ ```python
169
+ # Good
170
+ from data_designer.config.config_builder import DataDesignerConfigBuilder
171
+
172
+ # Bad - relative import (will cause linter errors)
173
+ from .config_builder import DataDesignerConfigBuilder
174
+
175
+ # Good - imports at module level
176
+ from pathlib import Path
177
+
178
+ def process_file(filename: str) -> None:
179
+ path = Path(filename)
180
+
181
+ # Bad - import inside function
182
+ def process_file(filename: str) -> None:
183
+ from pathlib import Path
184
+ path = Path(filename)
185
+ ```
186
+
187
+ ### Naming Conventions (PEP 8)
188
+
189
+ Follow PEP 8 naming conventions:
190
+
191
+ - **Functions and variables**: `snake_case`
192
+ - **Classes**: `PascalCase`
193
+ - **Constants**: `UPPER_SNAKE_CASE`
194
+ - **Private attributes**: prefix with single underscore `_private_var`
195
+
196
+ ```python
197
+ # Good
198
+ class DatasetGenerator:
199
+ MAX_RETRIES = 3
200
+
201
+ def __init__(self) -> None:
202
+ self._cache: dict[str, str] = {}
203
+
204
+ def generate_dataset(self, config: dict[str, str]) -> pd.DataFrame:
205
+ pass
206
+
207
+ # Bad
208
+ class dataset_generator: # Should be PascalCase
209
+ maxRetries = 3 # Should be UPPER_SNAKE_CASE
210
+
211
+ def GenerateDataset(self, Config): # Should be snake_case
212
+ pass
213
+ ```
214
+
215
+ ### Common Pitfalls to Avoid
216
+
217
+ 1. **Mutable default arguments**:
218
+
219
+ ```python
220
+ # Bad - mutable default argument
221
+ def add_item(item: str, items: list[str] = []) -> list[str]:
222
+ items.append(item)
223
+ return items
224
+
225
+ # Good
226
+ def add_item(item: str, items: list[str] | None = None) -> list[str]:
227
+ if items is None:
228
+ items = []
229
+ items.append(item)
230
+ return items
231
+ ```
232
+
233
+ 2. **Unused imports and variables**:
234
+
235
+ ```python
236
+ # Bad - unused import
237
+ from pathlib import Path
238
+ from typing import Any # Not used
239
+
240
+ def process() -> None:
241
+ pass
242
+
243
+ # Good - only import what you use
244
+ from pathlib import Path
245
+
246
+ def process() -> None:
247
+ pass
248
+ ```
249
+
250
+ 3. **Simplify code where possible** (covered by the `SIM` rules, which are not yet enabled; see Active Linter Rules):
251
+
252
+ ```python
253
+ # Bad
254
+ if condition:
255
+ return True
256
+ else:
257
+ return False
258
+
259
+ # Good
260
+ return condition
261
+
262
+ # Bad
263
+ if key in my_dict:
264
+ value = my_dict[key]
265
+ else:
266
+ value = default
267
+
268
+ # Good
269
+ value = my_dict.get(key, default)
270
+ ```
271
+
272
+ 4. **Use comprehensions properly**:
273
+
274
+ ```python
275
+ # Bad
276
+ list([x for x in items]) # Unnecessary list() call
277
+
278
+ # Good
279
+ [x for x in items]
280
+
281
+ # Bad
282
+ dict([(k, v) for k, v in items])
283
+
284
+ # Good
285
+ {k: v for k, v in items}
286
+ ```
287
+
288
+ 5. **Proper return statements**:
289
+
290
+ ```python
291
+ # Bad - unnecessary else after return
292
+ def get_value(condition: bool) -> str:
293
+ if condition:
294
+ return "yes"
295
+ else:
296
+ return "no"
297
+
298
+ # Good
299
+ def get_value(condition: bool) -> str:
300
+ if condition:
301
+ return "yes"
302
+ return "no"
303
+ ```
304
+
305
+ ### Active Linter Rules
306
+
307
+ The following ruff linter rules are currently enabled (see [pyproject.toml](pyproject.toml)):
308
+
309
+ - `W`: pycodestyle warnings
310
+ - `F`: pyflakes (unused imports, undefined names)
311
+ - `I`: isort (import sorting)
312
+ - `ICN`: flake8-import-conventions (standard import names)
313
+ - `PIE`: flake8-pie (miscellaneous lints)
314
+
315
+ **Note**: Additional rules (E, N, UP, ANN, B, C4, DTZ, RET, SIM, PTH) are commented out but may be enabled in the future. Write code that would pass these checks for future-proofing.
316
+
317
+ ## Testing Patterns
318
+
319
+ The project uses `pytest` with the following patterns:
320
+
321
+ - **Fixtures**: Shared test data and configurations in [tests/conftest.py](tests/conftest.py)
322
+ - **Stub configs**: YAML-based configuration stubs for testing (see `stub_data_designer_config_str` fixture)
323
+ - **Mocking**: Use `unittest.mock.patch` for external services and dependencies
324
+ - **Async support**: pytest-asyncio for async tests (`asyncio_default_fixture_loop_scope = "session"`)
325
+ - **HTTP mocking**: pytest-httpx for mocking HTTP requests
326
+ - **Coverage**: Track test coverage with pytest-cov
327
+
328
+ Example test structure:
329
+
330
+ ```python
331
+ import pytest
332
+ from data_designer.config.config_builder import DataDesignerConfigBuilder
333
+
334
+ def test_something(stub_model_configs):
335
+ """Test description."""
336
+ builder = DataDesignerConfigBuilder(model_configs=stub_model_configs)
337
+ # ... test implementation
338
+ assert expected == actual
339
+ ```
340
+
341
+ ## Column Configuration Types
342
+
343
+ When working with column configurations, understand these key types:
344
+
345
+ - **`SamplerColumnConfig`**: Built-in samplers (UUID, Category, Uniform, Gaussian, Person, DateTime, etc.)
346
+ - **`LLMTextColumnConfig`**: LLM text generation with Jinja2 templating
347
+ - **`LLMCodeColumnConfig`**: Code generation with language specification
348
+ - **`LLMStructuredColumnConfig`**: Structured JSON generation with schema
349
+ - **`LLMJudgeColumnConfig`**: Judge/scoring columns for quality assessment
350
+ - **`ExpressionColumnConfig`**: Expression-based derived columns (Python eval or Jinja2)
351
+ - **`ValidationColumnConfig`**: Validation results (Python, SQL, Code, Remote validators)
352
+ - **`SeedDatasetColumnConfig`**: Data from seed datasets
353
+
354
+ See [src/data_designer/config/columns.py](src/data_designer/config/columns.py) for detailed schemas.
355
+
356
+ ## Model Configuration
357
+
358
+ Models are configured via `ModelConfig` with:
359
+
360
+ - `alias`: User-defined alias for the model
361
+ - `model`: Model ID (e.g., from build.nvidia.com)
362
+ - `inference_parameters`: Temperature, top_p, max_tokens (can be distribution-based)
363
+ - `system_prompt`: Optional system prompt
364
+ - `image_modality`: Support for image inputs
365
+
366
+ See [src/data_designer/config/models.py](src/data_designer/config/models.py) for details.
367
+
368
+ ## Registry System
369
+
370
+ The project uses a registry pattern for extensibility. Key registries:
371
+
372
+ - **Column generators**: [src/data_designer/engine/column_generators/registry.py](src/data_designer/engine/column_generators/registry.py)
373
+ - **Validators**: [src/data_designer/engine/validators/](src/data_designer/engine/validators/)
374
+ - **Column profilers**: [src/data_designer/engine/analysis/column_profilers/registry.py](src/data_designer/engine/analysis/column_profilers/registry.py)
375
+ - **Models**: [src/data_designer/engine/models/registry.py](src/data_designer/engine/models/registry.py)
376
+
377
+ When adding new generators or validators, register them appropriately.
378
+
379
+ ## Pre-commit Hooks
380
+
381
+ The project uses pre-commit hooks to enforce code quality. Install them with:
382
+
383
+ ```bash
384
+ uv run pre-commit install
385
+ ```
386
+
387
+ Hooks include:
388
+ - Trailing whitespace removal
389
+ - End-of-file fixer
390
+ - YAML/JSON/TOML validation
391
+ - Merge conflict detection
392
+ - Debug statement detection
393
+ - Ruff linting and formatting
394
+
395
+ ## Common Development Tasks
396
+
397
+ ```bash
398
+ # Clean up generated files
399
+ make clean
400
+
401
+ # Update license headers
402
+ make update-license-headers
403
+
404
+ # Run all checks before committing
405
+ make check-all-fix
406
+ make test
407
+
408
+ # Generate coverage report
409
+ make coverage
410
+ # View htmlcov/index.html in browser
411
+ ```
412
+
413
+ ## Additional Resources
414
+
415
+ - **README.md**: Installation and basic usage examples
416
+ - **src/data_designer/config/**: Configuration API documentation
417
+ - **tests/**: Comprehensive test suite with usage examples