data-designer-engine 0.5.2__tar.gz → 0.5.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (259)
  1. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/.gitignore +6 -0
  2. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/PKG-INFO +4 -3
  3. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/pyproject.toml +2 -1
  4. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/column_generators/generators/base.py +68 -8
  5. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/column_generators/generators/custom.py +60 -3
  6. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/column_generators/generators/embedding.py +12 -2
  7. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/column_generators/generators/image.py +20 -23
  8. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/column_generators/generators/seed_dataset.py +5 -1
  9. data_designer_engine-0.5.3/src/data_designer/engine/dataset_builders/utils/completion_tracker.py +230 -0
  10. data_designer_engine-0.5.3/src/data_designer/engine/dataset_builders/utils/execution_graph.py +260 -0
  11. data_designer_engine-0.5.3/src/data_designer/engine/dataset_builders/utils/task_model.py +61 -0
  12. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/mcp/facade.py +59 -229
  13. data_designer_engine-0.5.3/src/data_designer/engine/models/clients/__init__.py +46 -0
  14. data_designer_engine-0.5.3/src/data_designer/engine/models/clients/adapters/__init__.py +8 -0
  15. data_designer_engine-0.5.3/src/data_designer/engine/models/clients/adapters/litellm_bridge.py +217 -0
  16. data_designer_engine-0.5.3/src/data_designer/engine/models/clients/base.py +47 -0
  17. data_designer_engine-0.5.3/src/data_designer/engine/models/clients/errors.py +229 -0
  18. data_designer_engine-0.5.3/src/data_designer/engine/models/clients/factory.py +50 -0
  19. data_designer_engine-0.5.3/src/data_designer/engine/models/clients/parsing.py +335 -0
  20. data_designer_engine-0.5.3/src/data_designer/engine/models/clients/types.py +158 -0
  21. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/models/errors.py +109 -6
  22. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/models/facade.py +348 -485
  23. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/models/factory.py +8 -2
  24. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/models/litellm_overrides.py +21 -1
  25. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/models/registry.py +29 -7
  26. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/processing/gsonschema/validators.py +44 -1
  27. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/storage/artifact_storage.py +21 -10
  28. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/testing/__init__.py +2 -0
  29. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/testing/fixtures.py +38 -32
  30. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/testing/stubs.py +62 -33
  31. data_designer_engine-0.5.3/tests/engine/column_generators/generators/test_async_generators.py +424 -0
  32. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/column_generators/generators/test_seed_dataset.py +2 -2
  33. data_designer_engine-0.5.3/tests/engine/dataset_builders/utils/test_completion_tracker.py +348 -0
  34. data_designer_engine-0.5.3/tests/engine/dataset_builders/utils/test_execution_graph.py +450 -0
  35. data_designer_engine-0.5.3/tests/engine/dataset_builders/utils/test_task_model.py +89 -0
  36. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/mcp/test_mcp_facade.py +74 -171
  37. data_designer_engine-0.5.3/tests/engine/models/clients/conftest.py +20 -0
  38. data_designer_engine-0.5.3/tests/engine/models/clients/test_client_errors.py +253 -0
  39. data_designer_engine-0.5.3/tests/engine/models/clients/test_litellm_bridge.py +428 -0
  40. data_designer_engine-0.5.3/tests/engine/models/clients/test_parsing.py +213 -0
  41. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/models/conftest.py +15 -1
  42. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/models/test_facade.py +251 -487
  43. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/models/test_litellm_overrides.py +43 -0
  44. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/processing/gsonschema/test_validators.py +75 -0
  45. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/storage/test_artifact_storage.py +55 -0
  46. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/README.md +0 -0
  47. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/__init__.py +0 -0
  48. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/analysis/column_profilers/base.py +0 -0
  49. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/analysis/column_profilers/judge_score_profiler.py +0 -0
  50. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/analysis/column_profilers/registry.py +0 -0
  51. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/analysis/column_statistics.py +0 -0
  52. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/analysis/dataset_profiler.py +0 -0
  53. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/analysis/errors.py +0 -0
  54. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/analysis/utils/column_statistics_calculations.py +0 -0
  55. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/analysis/utils/judge_score_processing.py +0 -0
  56. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/column_generators/__init__.py +0 -0
  57. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/column_generators/generators/__init__.py +0 -0
  58. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/column_generators/generators/expression.py +0 -0
  59. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/column_generators/generators/llm_completion.py +0 -0
  60. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/column_generators/generators/samplers.py +0 -0
  61. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/column_generators/generators/validation.py +0 -0
  62. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/column_generators/registry.py +0 -0
  63. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/column_generators/utils/errors.py +0 -0
  64. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/column_generators/utils/generator_classification.py +0 -0
  65. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/column_generators/utils/judge_score_factory.py +0 -0
  66. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/column_generators/utils/prompt_renderer.py +0 -0
  67. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/compiler.py +0 -0
  68. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/configurable_task.py +0 -0
  69. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/dataset_builders/column_wise_builder.py +0 -0
  70. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/dataset_builders/errors.py +0 -0
  71. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/dataset_builders/multi_column_configs.py +0 -0
  72. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/dataset_builders/utils/__init__.py +0 -0
  73. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/dataset_builders/utils/async_concurrency.py +0 -0
  74. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/dataset_builders/utils/concurrency.py +0 -0
  75. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/dataset_builders/utils/config_compiler.py +0 -0
  76. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/dataset_builders/utils/dag.py +0 -0
  77. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +0 -0
  78. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/dataset_builders/utils/errors.py +0 -0
  79. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/dataset_builders/utils/processor_runner.py +0 -0
  80. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/dataset_builders/utils/progress_tracker.py +0 -0
  81. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/errors.py +0 -0
  82. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/mcp/__init__.py +0 -0
  83. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/mcp/errors.py +0 -0
  84. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/mcp/factory.py +0 -0
  85. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/mcp/io.py +0 -0
  86. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/mcp/registry.py +0 -0
  87. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/model_provider.py +0 -0
  88. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/models/__init__.py +0 -0
  89. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/models/parsers/__init__.py +0 -0
  90. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/models/parsers/errors.py +0 -0
  91. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/models/parsers/parser.py +0 -0
  92. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/models/parsers/postprocessors.py +0 -0
  93. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/models/parsers/tag_parsers.py +0 -0
  94. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/models/parsers/types.py +0 -0
  95. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/models/recipes/base.py +0 -0
  96. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/models/recipes/response_recipes.py +0 -0
  97. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/models/telemetry.py +0 -0
  98. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/models/usage.py +0 -0
  99. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/models/utils.py +0 -0
  100. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/processing/ginja/__init__.py +0 -0
  101. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/processing/ginja/ast.py +0 -0
  102. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/processing/ginja/environment.py +0 -0
  103. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/processing/ginja/exceptions.py +0 -0
  104. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/processing/ginja/record.py +0 -0
  105. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/processing/gsonschema/__init__.py +0 -0
  106. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/processing/gsonschema/exceptions.py +0 -0
  107. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/processing/gsonschema/schema_transformers.py +0 -0
  108. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/processing/gsonschema/types.py +0 -0
  109. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/processing/processors/base.py +0 -0
  110. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/processing/processors/drop_columns.py +0 -0
  111. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/processing/processors/registry.py +0 -0
  112. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/processing/processors/schema_transform.py +0 -0
  113. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/processing/utils.py +0 -0
  114. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/registry/base.py +0 -0
  115. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/registry/data_designer_registry.py +0 -0
  116. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/registry/errors.py +0 -0
  117. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/resources/managed_dataset_generator.py +0 -0
  118. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/resources/managed_dataset_repository.py +0 -0
  119. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/resources/managed_storage.py +0 -0
  120. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/resources/resource_provider.py +0 -0
  121. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/resources/seed_reader.py +0 -0
  122. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/column.py +0 -0
  123. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/constraints.py +0 -0
  124. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/data_sources/base.py +0 -0
  125. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/data_sources/errors.py +0 -0
  126. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/data_sources/sources.py +0 -0
  127. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/entities/__init__.py +0 -0
  128. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
  129. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +0 -0
  130. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/entities/email_address_utils.py +0 -0
  131. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/entities/errors.py +0 -0
  132. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/entities/national_id_utils.py +0 -0
  133. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/entities/person.py +0 -0
  134. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/entities/phone_number.py +0 -0
  135. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/errors.py +0 -0
  136. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/generator.py +0 -0
  137. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/jinja_utils.py +0 -0
  138. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/people_gen.py +0 -0
  139. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/person_constants.py +0 -0
  140. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/schema.py +0 -0
  141. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/schema_builder.py +0 -0
  142. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/sampling_gen/utils.py +0 -0
  143. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/secret_resolver.py +0 -0
  144. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/storage/__init__.py +0 -0
  145. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/storage/media_storage.py +0 -0
  146. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/testing/utils.py +0 -0
  147. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/validation.py +0 -0
  148. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/validators/__init__.py +0 -0
  149. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/validators/base.py +0 -0
  150. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/validators/local_callable.py +0 -0
  151. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/validators/python.py +0 -0
  152. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/validators/remote.py +0 -0
  153. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/src/data_designer/engine/validators/sql.py +0 -0
  154. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/conftest.py +0 -0
  155. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/analysis/column_profilers/test_base.py +0 -0
  156. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/analysis/column_profilers/test_judge_score_profiler.py +0 -0
  157. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/analysis/conftest.py +0 -0
  158. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/analysis/test_column_statistics_calculator.py +0 -0
  159. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/analysis/test_data/artifacts/dataset/column_configs.json +0 -0
  160. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/analysis/test_data/artifacts/dataset/dataset.json +0 -0
  161. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/analysis/test_data/artifacts/dataset/metadata.json +0 -0
  162. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/analysis/test_dataset_profiler.py +0 -0
  163. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/analysis/test_errors.py +0 -0
  164. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/analysis/utils/test_column_statistics_calculations.py +0 -0
  165. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/analysis/utils/test_judge_score_processing.py +0 -0
  166. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/column_generators/generators/__init__.py +0 -0
  167. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/column_generators/generators/test_column_generator_base.py +0 -0
  168. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/column_generators/generators/test_custom.py +0 -0
  169. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/column_generators/generators/test_embedding.py +0 -0
  170. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/column_generators/generators/test_expression.py +0 -0
  171. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/column_generators/generators/test_image.py +0 -0
  172. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/column_generators/generators/test_llm_completion_generators.py +0 -0
  173. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/column_generators/generators/test_samplers.py +0 -0
  174. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/column_generators/generators/test_validation.py +0 -0
  175. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/column_generators/test_registry.py +0 -0
  176. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/column_generators/utils/test_column_generator_errors.py +0 -0
  177. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/column_generators/utils/test_generator_classification.py +0 -0
  178. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/column_generators/utils/test_judge_score_factory.py +0 -0
  179. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/column_generators/utils/test_prompt_renderer.py +0 -0
  180. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/conftest.py +0 -0
  181. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/dataset_builders/test_column_wise_builder.py +0 -0
  182. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/dataset_builders/test_multi_column_configs.py +0 -0
  183. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/dataset_builders/utils/test_async_concurrency.py +0 -0
  184. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/dataset_builders/utils/test_concurrency.py +0 -0
  185. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/dataset_builders/utils/test_config_compiler.py +0 -0
  186. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/dataset_builders/utils/test_dag.py +0 -0
  187. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/dataset_builders/utils/test_dataset_batch_manager.py +0 -0
  188. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/dataset_builders/utils/test_progress_tracker.py +0 -0
  189. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/mcp/conftest.py +0 -0
  190. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/mcp/test_mcp_factory.py +0 -0
  191. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/mcp/test_mcp_io.py +0 -0
  192. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/mcp/test_mcp_registry.py +0 -0
  193. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/models/parsers/test_parser.py +0 -0
  194. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/models/parsers/test_parsers_types.py +0 -0
  195. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/models/parsers/test_postprocessors.py +0 -0
  196. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/models/parsers/test_tag_parsers.py +0 -0
  197. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/models/recipes/test_recipe_base.py +0 -0
  198. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/models/recipes/test_response_recipes.py +0 -0
  199. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/models/stub_secrets.json +0 -0
  200. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/models/test_async_engine_switch.py +0 -0
  201. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/models/test_model_errors.py +0 -0
  202. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/models/test_model_registry.py +0 -0
  203. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/models/test_model_utils.py +0 -0
  204. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/models/test_usage.py +0 -0
  205. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/processing/__init__.py +0 -0
  206. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/processing/ginja/__init__.py +0 -0
  207. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/processing/ginja/test_ast.py +0 -0
  208. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/processing/ginja/test_environment.py +0 -0
  209. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/processing/ginja/test_exceptions.py +0 -0
  210. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/processing/ginja/test_record.py +0 -0
  211. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/processing/gsonschema/__init__.py +0 -0
  212. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/processing/gsonschema/test_exceptions.py +0 -0
  213. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/processing/gsonschema/test_schema_transformers.py +0 -0
  214. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/processing/gsonschema/test_types.py +0 -0
  215. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/processing/processors/__init__.py +0 -0
  216. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/processing/processors/test_drop_columns.py +0 -0
  217. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/processing/processors/test_registry.py +0 -0
  218. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/processing/processors/test_schema_transform.py +0 -0
  219. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/processing/test_utils.py +0 -0
  220. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/registry/__init__.py +0 -0
  221. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/registry/conftest.py +0 -0
  222. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/registry/test_base.py +0 -0
  223. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/registry/test_data_designer_registry.py +0 -0
  224. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/registry/test_errors.py +0 -0
  225. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/resources/__init__.py +0 -0
  226. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/resources/conftest.py +0 -0
  227. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/resources/test_managed_dataset_generator.py +0 -0
  228. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/resources/test_managed_dataset_repository.py +0 -0
  229. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/resources/test_managed_storage.py +0 -0
  230. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/resources/test_resource_provider.py +0 -0
  231. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/resources/test_seed_reader.py +0 -0
  232. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/sampling_gen/conftest.py +0 -0
  233. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/sampling_gen/data_sources/test_sampler_errors.py +0 -0
  234. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/sampling_gen/data_sources/test_sources.py +0 -0
  235. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/sampling_gen/entities/test_email_address_utils.py +0 -0
  236. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/sampling_gen/entities/test_national_id_utils.py +0 -0
  237. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/sampling_gen/entities/test_person.py +0 -0
  238. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/sampling_gen/entities/test_phone_number.py +0 -0
  239. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/sampling_gen/test_column.py +0 -0
  240. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/sampling_gen/test_constraints.py +0 -0
  241. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/sampling_gen/test_generator.py +0 -0
  242. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/sampling_gen/test_jinja_utils.py +0 -0
  243. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/sampling_gen/test_people_gen.py +0 -0
  244. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/sampling_gen/test_schema.py +0 -0
  245. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/sampling_gen/test_utils.py +0 -0
  246. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/storage/__init__.py +0 -0
  247. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/storage/test_media_storage.py +0 -0
  248. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/test_compiler.py +0 -0
  249. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/test_configurable_task.py +0 -0
  250. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/test_dataset_metadata.py +0 -0
  251. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/test_engine_errors.py +0 -0
  252. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/test_model_provider.py +0 -0
  253. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/test_secret_resolver.py +0 -0
  254. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/test_validation.py +0 -0
  255. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/validators/test_local_callable.py +0 -0
  256. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/validators/test_python.py +0 -0
  257. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/validators/test_remote.py +0 -0
  258. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/engine/validators/test_sql.py +0 -0
  259. {data_designer_engine-0.5.2 → data_designer_engine-0.5.3}/tests/test_plugin_manager.py +0 -0
@@ -99,8 +99,14 @@ NOTEPAD.md
99
99
  # Build-time copy of README for data-designer package (copied from top-level during build)
100
100
  packages/data-designer/README.md
101
101
 
102
+ # Notebook build cache
103
+ .notebook-cache/
104
+
102
105
  # Cerebro knowledge base
103
106
  .cerebro/
104
107
  .cursor/rules/cerebro.mdc
105
108
  .cursor/mcp.json
106
109
  .claude/rules/cerebro.md
110
+
111
+ # Claude worktrees
112
+ .claude/worktrees/
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-designer-engine
3
- Version: 0.5.2
3
+ Version: 0.5.3
4
4
  Summary: Generation engine for DataDesigner synthetic data generation
5
5
  License-Expression: Apache-2.0
6
6
  Classifier: Development Status :: 4 - Beta
@@ -14,8 +14,9 @@ Classifier: Programming Language :: Python :: 3.13
14
14
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
15
15
  Requires-Python: >=3.10
16
16
  Requires-Dist: anyascii<1,>=0.3.3
17
- Requires-Dist: data-designer-config==0.5.2
18
- Requires-Dist: duckdb<2,>=1.1.3
17
+ Requires-Dist: chardet<6,>=3.0.2
18
+ Requires-Dist: data-designer-config==0.5.3
19
+ Requires-Dist: duckdb<2,>=1.5.0
19
20
  Requires-Dist: faker<21,>=20.1.0
20
21
  Requires-Dist: httpx-retries<1,>=0.4.2
21
22
  Requires-Dist: httpx<1,>=0.27.2
@@ -33,8 +33,9 @@ bump = true
33
33
  [tool.hatch.metadata.hooks.uv-dynamic-versioning]
34
34
  dependencies = [
35
35
  "anyascii>=0.3.3,<1",
36
+ "chardet>=3.0.2,<6", # Pulled in by sqlfluff; pin <6 to avoid RequestsDependencyWarning from requests<2.33
36
37
  "data-designer-config=={{ version }}",
37
- "duckdb>=1.1.3,<2",
38
+ "duckdb>=1.5.0,<2",
38
39
  "faker>=20.1.0,<21",
39
40
  "httpx>=0.27.2,<1",
40
41
  "httpx-retries>=0.4.2,<1",
@@ -4,15 +4,20 @@
4
4
  from __future__ import annotations
5
5
 
6
6
  import asyncio
7
+ import concurrent.futures
7
8
  import functools
8
9
  import logging
9
10
  from abc import ABC, abstractmethod
10
- from typing import TYPE_CHECKING, Any, overload
11
+ from typing import TYPE_CHECKING, Any, Coroutine, TypeVar, overload
11
12
 
12
13
  from data_designer.config.column_configs import GenerationStrategy
13
14
  from data_designer.engine.configurable_task import ConfigurableTask, DataT, TaskConfigT
14
15
  from data_designer.logging import LOG_DOUBLE_INDENT, LOG_INDENT
15
16
 
17
+ _T = TypeVar("_T")
18
+
19
+ _SYNC_BRIDGE_TIMEOUT = 300
20
+
16
21
  if TYPE_CHECKING:
17
22
  import pandas as pd
18
23
 
@@ -23,33 +28,84 @@ if TYPE_CHECKING:
23
28
  logger = logging.getLogger(__name__)
24
29
 
25
30
 
31
+ def _run_coroutine_sync(coro: Coroutine[Any, Any, _T]) -> _T:
32
+ """Run an async coroutine from sync context.
33
+
34
+ - No running event loop → ``asyncio.run(coro)``
35
+ - Running event loop (e.g. notebook/service) → run in a background thread
36
+ """
37
+ try:
38
+ asyncio.get_running_loop()
39
+ except RuntimeError:
40
+ return asyncio.run(coro)
41
+ pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)
42
+ future = pool.submit(asyncio.run, coro)
43
+ timed_out = False
44
+ try:
45
+ result = future.result(timeout=_SYNC_BRIDGE_TIMEOUT)
46
+ except concurrent.futures.TimeoutError as exc:
47
+ timed_out = True
48
+ logger.warning(f"⚠️ Sync bridge timed out after {_SYNC_BRIDGE_TIMEOUT}s; background thread still running")
49
+ raise TimeoutError(f"_run_coroutine_sync timed out after {_SYNC_BRIDGE_TIMEOUT}s") from exc
50
+ finally:
51
+ pool.shutdown(wait=not timed_out, cancel_futures=timed_out)
52
+ return result
53
+
54
+
26
55
  class ColumnGenerator(ConfigurableTask[TaskConfigT], ABC):
27
56
  @property
28
57
  def can_generate_from_scratch(self) -> bool:
29
58
  return False
30
59
 
60
+ @property
61
+ def is_order_dependent(self) -> bool:
62
+ """Whether this generator's output depends on prior row-group calls.
63
+
64
+ Example: SeedDatasetColumnGenerator tracks its position in the seed
65
+ dataset, so row group N must complete before N+1 starts.
66
+ """
67
+ return False
68
+
69
+ def _is_overridden(self, method_name: str) -> bool:
70
+ """Check if a subclass has overridden a base ColumnGenerator method."""
71
+ return getattr(type(self), method_name) is not getattr(ColumnGenerator, method_name)
72
+
31
73
  @staticmethod
32
74
  @abstractmethod
33
75
  def get_generation_strategy() -> GenerationStrategy: ...
34
76
 
35
77
  @overload
36
- @abstractmethod
37
78
  def generate(self, data: dict) -> dict: ...
38
79
 
39
80
  @overload
40
- @abstractmethod
41
81
  def generate(self, data: pd.DataFrame) -> pd.DataFrame: ...
42
82
 
43
- @abstractmethod
44
- def generate(self, data: DataT) -> DataT: ...
83
+ def generate(self, data: DataT) -> DataT:
84
+ """Sync generate overridden by most concrete generators.
85
+
86
+ Default bridges to ``agenerate()`` for async-first subclasses that only
87
+ implement ``agenerate()``. Raises ``NotImplementedError`` if neither
88
+ ``generate()`` nor ``agenerate()`` is overridden.
89
+ """
90
+ if not self._is_overridden("agenerate"):
91
+ raise NotImplementedError(f"{type(self).__name__} must implement either generate() or agenerate()")
92
+ return _run_coroutine_sync(self.agenerate(data))
45
93
 
46
- async def agenerate(self, data: dict) -> dict:
47
- """Async fallback delegates to sync generate via thread pool.
94
+ @overload
95
+ async def agenerate(self, data: dict) -> dict: ...
96
+
97
+ @overload
98
+ async def agenerate(self, data: pd.DataFrame) -> pd.DataFrame: ...
99
+
100
+ async def agenerate(self, data: DataT) -> DataT:
101
+ """Async generate — delegates to sync ``generate()`` via thread pool.
48
102
 
49
103
  Subclasses with native async support (e.g. ColumnGeneratorWithModelChatCompletion)
50
104
  should override this with a direct async implementation.
51
105
  """
52
- return await asyncio.to_thread(self.generate, data)
106
+ if not self._is_overridden("generate"):
107
+ raise NotImplementedError(f"{type(self).__name__} must implement either generate() or agenerate()")
108
+ return await asyncio.to_thread(self.generate, data.copy())
53
109
 
54
110
  def log_pre_generation(self) -> None:
55
111
  """A shared method to log info before the generator's `generate` method is called.
@@ -68,6 +124,10 @@ class FromScratchColumnGenerator(ColumnGenerator[TaskConfigT], ABC):
68
124
  @abstractmethod
69
125
  def generate_from_scratch(self, num_records: int) -> pd.DataFrame: ...
70
126
 
127
+ async def agenerate_from_scratch(self, num_records: int) -> pd.DataFrame:
128
+ """Async wrapper — wraps sync ``generate_from_scratch()`` in a thread."""
129
+ return await asyncio.to_thread(self.generate_from_scratch, num_records)
130
+
71
131
 
72
132
  class ColumnGeneratorWithModelRegistry(ColumnGenerator[TaskConfigT], ABC):
73
133
  @property
@@ -5,6 +5,7 @@
5
5
 
6
6
  from __future__ import annotations
7
7
 
8
+ import asyncio
8
9
  import inspect
9
10
  import logging
10
11
  from typing import TYPE_CHECKING, Any
@@ -65,12 +66,57 @@ class CustomColumnGenerator(ColumnGenerator[CustomColumnConfig]):
65
66
 
66
67
  return self._generate(data, is_dataframe)
67
68
 
69
+ async def agenerate(self, data: dict | pd.DataFrame) -> dict | pd.DataFrame | list[dict]:
70
+ """Async generate — branches on strategy and detects coroutine functions."""
71
+ is_full_column = self.config.generation_strategy == GenerationStrategy.FULL_COLUMN
72
+ if is_full_column:
73
+ return await asyncio.to_thread(self.generate, data.copy())
74
+ # The @custom_column_generator decorator wraps the user function in a sync
75
+ # wrapper, so we must unwrap to detect async functions.
76
+ fn_unwrapped = inspect.unwrap(self.config.generator_function)
77
+ if asyncio.iscoroutinefunction(fn_unwrapped):
78
+ missing = set(self.config.required_columns) - set(data.keys())
79
+ if missing:
80
+ raise CustomColumnGenerationError(
81
+ f"Missing required columns for custom generator '{self.config.name}': {sorted(missing)}"
82
+ )
83
+ keys_before = set(data.keys())
84
+
85
+ try:
86
+ result = await self._ainvoke_generator_function(data)
87
+ except CustomColumnGenerationError:
88
+ raise
89
+ except Exception as e:
90
+ logger.warning(
91
+ f"⚠️ Custom generator function {self.config.generator_function.__name__!r} "
92
+ f"failed for column '{self.config.name}'. This record will be skipped.\n{e}"
93
+ )
94
+ raise CustomColumnGenerationError(
95
+ f"Custom generator function failed for column '{self.config.name}': {e}"
96
+ ) from e
97
+
98
+ return self._postprocess_result(result, is_dataframe=False, keys_before=keys_before)
99
+ return await asyncio.to_thread(self.generate, data)
100
+
101
+ async def _ainvoke_generator_function(self, data: dict) -> dict | pd.DataFrame:
102
+ """Invoke an async user generator function with appropriate arguments.
103
+
104
+ The @custom_column_generator decorator's sync wrapper returns a coroutine
105
+ when the original function is async, so we await the wrapper's return value.
106
+ """
107
+ params = self._get_validated_params()
108
+ fn = self.config.generator_function
109
+ if len(params) == 1:
110
+ return await fn(data)
111
+ elif len(params) == 2:
112
+ return await fn(data, self.config.generator_params)
113
+ else:
114
+ models = self._build_models_dict()
115
+ return await fn(data, self.config.generator_params, models)
116
+
68
117
  def _generate(self, data: dict | pd.DataFrame, is_dataframe: bool) -> dict | pd.DataFrame | list[dict]:
69
118
  """Unified generation logic for both strategies."""
70
- # Get columns/keys using unified accessor
71
119
  get_keys = (lambda d: set(d.columns)) if is_dataframe else (lambda d: set(d.keys()))
72
- expected_type = lazy.pd.DataFrame if is_dataframe else dict
73
- type_name = "DataFrame" if is_dataframe else "dict"
74
120
 
75
121
  # Check required columns
76
122
  missing = set(self.config.required_columns) - get_keys(data)
@@ -96,6 +142,15 @@ class CustomColumnGenerator(ColumnGenerator[CustomColumnConfig]):
96
142
  f"Custom generator function failed for column '{self.config.name}': {e}"
97
143
  ) from e
98
144
 
145
+ return self._postprocess_result(result, is_dataframe, keys_before)
146
+
147
+ def _postprocess_result(
148
+ self,
149
+ result: dict | pd.DataFrame | list[dict],
150
+ is_dataframe: bool,
151
+ keys_before: set[str],
152
+ ) -> dict | pd.DataFrame | list[dict]:
153
+ """Validate type and output columns of a generation result."""
99
154
  # Cell-by-cell with allow_resize: accept dict or list[dict]
100
155
  if not is_dataframe and self.config.allow_resize:
101
156
  if isinstance(result, dict):
@@ -113,6 +168,8 @@ class CustomColumnGenerator(ColumnGenerator[CustomColumnConfig]):
113
168
  )
114
169
 
115
170
  # Validate return type for non-resize paths
171
+ expected_type = lazy.pd.DataFrame if is_dataframe else dict
172
+ type_name = "DataFrame" if is_dataframe else "dict"
116
173
  if not isinstance(result, expected_type):
117
174
  raise CustomColumnGenerationError(
118
175
  f"Custom generator for column '{self.config.name}' must return a {type_name}, "
@@ -27,9 +27,19 @@ class EmbeddingCellGenerator(ColumnGeneratorWithModel[EmbeddingColumnConfig]):
27
27
  def get_generation_strategy() -> GenerationStrategy:
28
28
  return GenerationStrategy.CELL_BY_CELL
29
29
 
30
- def generate(self, data: dict) -> dict:
30
+ def _prepare_embedding_inputs(self, data: dict) -> list[str]:
31
31
  deserialized_record = deserialize_json_values(data)
32
- input_texts = parse_list_string(deserialized_record[self.config.target_column])
32
+ return parse_list_string(deserialized_record[self.config.target_column])
33
+
34
+ def generate(self, data: dict) -> dict:
35
+ input_texts = self._prepare_embedding_inputs(data)
33
36
  embeddings = self.model.generate_text_embeddings(input_texts=input_texts)
34
37
  data[self.config.name] = EmbeddingGenerationResult(embeddings=embeddings).model_dump(mode="json")
35
38
  return data
39
+
40
+ async def agenerate(self, data: dict) -> dict:
41
+ """Native async generate using model.agenerate_text_embeddings."""
42
+ input_texts = self._prepare_embedding_inputs(data)
43
+ embeddings = await self.model.agenerate_text_embeddings(input_texts=input_texts)
44
+ data[self.config.name] = EmbeddingGenerationResult(embeddings=embeddings).model_dump(mode="json")
45
+ return data
@@ -3,6 +3,7 @@
3
3
 
4
4
  from __future__ import annotations
5
5
 
6
+ import asyncio
6
7
  from typing import TYPE_CHECKING
7
8
 
8
9
  from data_designer.config.column_configs import ImageColumnConfig
@@ -31,46 +32,42 @@ class ImageCellGenerator(WithJinja2UserTemplateRendering, ColumnGeneratorWithMod
31
32
  def get_generation_strategy() -> GenerationStrategy:
32
33
  return GenerationStrategy.CELL_BY_CELL
33
34
 
34
- def generate(self, data: dict) -> dict:
35
- """Generate image(s) and optionally save to disk.
36
-
37
- Args:
38
- data: Record data
39
-
40
- Returns:
41
- Record with image path(s) (create mode) or base64 data (preview mode) added
42
- """
35
+ def _prepare_image_inputs(self, data: dict) -> tuple[str, list[dict] | None]:
36
+ """Validate inputs and render prompt for image generation."""
43
37
  deserialized_record = deserialize_json_values(data)
44
-
45
- # Validate required columns
46
38
  missing_columns = list(set(self.config.required_columns) - set(data.keys()))
47
39
  if len(missing_columns) > 0:
48
- error_msg = (
40
+ raise ValueError(
49
41
  f"There was an error preparing the Jinja2 expression template. "
50
42
  f"The following columns {missing_columns} are missing!"
51
43
  )
52
- raise ValueError(error_msg)
53
-
54
- # Render prompt template
55
44
  self.prepare_jinja2_template_renderer(self.config.prompt, list(deserialized_record.keys()))
56
45
  prompt = self.render_template(deserialized_record)
57
-
58
- # Validate prompt is non-empty
59
46
  if not prompt or not prompt.strip():
60
47
  raise ValueError(f"Rendered prompt for column {self.config.name!r} is empty")
61
-
62
- # Process multi-modal context if provided
63
48
  multi_modal_context = self._build_multi_modal_context(deserialized_record)
49
+ return prompt, multi_modal_context
64
50
 
65
- # Generate images (returns list of base64 strings)
51
+ def generate(self, data: dict) -> dict:
52
+ """Generate image(s) and optionally save to disk."""
53
+ prompt, multi_modal_context = self._prepare_image_inputs(data)
66
54
  base64_images = self.model.generate_image(prompt=prompt, multi_modal_context=multi_modal_context)
67
-
68
- # Store via media storage (mode determines disk vs dataframe storage)
69
- # Use column name as subfolder to organize images
70
55
  results = [
71
56
  self.media_storage.save_base64_image(base64_image, subfolder_name=self.config.name)
72
57
  for base64_image in base64_images
73
58
  ]
74
59
  data[self.config.name] = results
60
+ return data
75
61
 
62
+ async def agenerate(self, data: dict) -> dict:
63
+ """Native async generate using model.agenerate_image."""
64
+ prompt, multi_modal_context = self._prepare_image_inputs(data)
65
+ base64_images = await self.model.agenerate_image(prompt=prompt, multi_modal_context=multi_modal_context)
66
+ results = await asyncio.to_thread(
67
+ lambda: [
68
+ self.media_storage.save_base64_image(base64_image, subfolder_name=self.config.name)
69
+ for base64_image in base64_images
70
+ ]
71
+ )
72
+ data[self.config.name] = results
76
73
  return data
@@ -29,6 +29,10 @@ class SeedDatasetColumnGenerator(FromScratchColumnGenerator[SeedDatasetMultiColu
29
29
  def get_generation_strategy() -> GenerationStrategy:
30
30
  return GenerationStrategy.FULL_COLUMN
31
31
 
32
+ @property
33
+ def is_order_dependent(self) -> bool:
34
+ return True
35
+
32
36
  @property
33
37
  def num_records_sampled(self) -> int:
34
38
  return self._num_records_sampled
@@ -102,7 +106,7 @@ class SeedDatasetColumnGenerator(FromScratchColumnGenerator[SeedDatasetMultiColu
102
106
  read_query = f"SELECT * FROM ({read_query}){shuffle_query}"
103
107
  else:
104
108
  read_query = f"SELECT * FROM '{self._dataset_uri}'{shuffle_query}"
105
- self._batch_reader = self.duckdb_conn.query(read_query).record_batch(batch_size=num_records)
109
+ self._batch_reader = self.duckdb_conn.query(read_query).to_arrow_reader(batch_size=num_records)
106
110
 
107
111
  def _sample_records(self, num_records: int) -> pd.DataFrame:
108
112
  logger.info(f"🌱 Sampling {num_records} records from seed dataset")
@@ -0,0 +1,230 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from __future__ import annotations
5
+
6
+ from collections import defaultdict
7
+ from typing import TYPE_CHECKING
8
+
9
+ from data_designer.config.column_configs import GenerationStrategy
10
+ from data_designer.engine.dataset_builders.utils.task_model import SliceRef, Task
11
+
12
+ if TYPE_CHECKING:
13
+ from data_designer.engine.dataset_builders.utils.execution_graph import ExecutionGraph
14
+
15
+
16
+ class CompletionTracker:
17
+ """Tracks which cells (column, row_group, row_index) are done.
18
+
19
+ Row indices are local to their row group (0-based).
20
+
21
+ Use ``with_graph`` to create a frontier-enabled tracker where
22
+ ``get_ready_tasks`` returns in O(frontier) instead of scanning all
23
+ columns x rows x row groups.
24
+ """
25
+
26
+ def __init__(self) -> None:
27
+ # row_group → column → set of completed local row indices
28
+ self._completed: dict[int, dict[str, set[int]]] = defaultdict(lambda: defaultdict(set))
29
+ # row_group → set of dropped row indices
30
+ self._dropped: dict[int, set[int]] = defaultdict(set)
31
+
32
+ self._graph: ExecutionGraph | None = None
33
+ self._row_group_sizes: dict[int, int] = {}
34
+ self._batch_complete: dict[int, set[str]] = defaultdict(set)
35
+ self._frontier: set[Task] = set()
36
+
37
+ @classmethod
38
+ def with_graph(cls, graph: ExecutionGraph, row_groups: list[tuple[int, int]]) -> CompletionTracker:
39
+ """Create a frontier-enabled tracker backed by an execution graph."""
40
+ tracker = cls()
41
+ tracker._graph = graph
42
+ tracker._row_group_sizes = {rg_id: size for rg_id, size in row_groups}
43
+ tracker._seed_frontier()
44
+ return tracker
45
+
46
+ def mark_cell_complete(self, column: str, row_group: int, row_index: int) -> None:
47
+ self._validate_row_group(row_group)
48
+ self._validate_strategy(column, GenerationStrategy.CELL_BY_CELL, "mark_cell_complete")
49
+ self._completed[row_group][column].add(row_index)
50
+ if self._graph is not None:
51
+ self._frontier.discard(Task(column=column, row_group=row_group, row_index=row_index, task_type="cell"))
52
+ self._enqueue_downstream(column, row_group, row_index=row_index)
53
+
54
+ def mark_row_range_complete(self, column: str, row_group: int, row_group_size: int) -> None:
55
+ expected = self._validate_row_group(row_group)
56
+ self._validate_strategy(column, GenerationStrategy.FULL_COLUMN, "mark_row_range_complete")
57
+ if expected is not None and row_group_size != expected:
58
+ raise ValueError(f"Row-group size mismatch for rg={row_group}: got {row_group_size}, expected {expected}")
59
+ self._completed[row_group][column] = set(range(row_group_size))
60
+ self._batch_complete[row_group].add(column)
61
+ if self._graph is not None:
62
+ self._frontier.discard(Task(column=column, row_group=row_group, row_index=None, task_type="batch"))
63
+ self._enqueue_downstream(column, row_group, row_index=None)
64
+
65
+ def is_complete(self, ref: SliceRef) -> bool:
66
+ return ref.row_index in self._completed.get(ref.row_group, {}).get(ref.column, set())
67
+
68
+ def is_all_complete(self, cells: list[SliceRef]) -> bool:
69
+ """Check whether all the given cells are done.
70
+
71
+ A ``row_index`` of ``None`` means the entire batch for that column must
72
+ have been completed via ``mark_row_range_complete``.
73
+ """
74
+ for ref in cells:
75
+ if ref.row_index is None:
76
+ if ref.column not in self._batch_complete.get(ref.row_group, set()):
77
+ return False
78
+ elif not self.is_complete(ref):
79
+ return False
80
+ return True
81
+
82
+ def drop_row(self, row_group: int, row_index: int) -> None:
83
+ self._validate_row_group(row_group)
84
+ self._dropped[row_group].add(row_index)
85
+ if self._graph is not None:
86
+ # Remove cell tasks for this row from the frontier
87
+ for col in self._graph.columns:
88
+ self._frontier.discard(Task(column=col, row_group=row_group, row_index=row_index, task_type="cell"))
89
+ # Dropping a row may unblock batch downstream tasks
90
+ self._reevaluate_batch_tasks(row_group)
91
+
92
+ def is_dropped(self, row_group: int, row_index: int) -> bool:
93
+ return row_index in self._dropped.get(row_group, set())
94
+
95
+ def is_row_group_complete(
96
+ self,
97
+ row_group: int,
98
+ row_group_size: int,
99
+ all_columns: list[str],
100
+ ) -> bool:
101
+ """All non-dropped rows have all columns done."""
102
+ dropped = self._dropped.get(row_group, set())
103
+ completed = self._completed.get(row_group, {})
104
+ for ri in range(row_group_size):
105
+ if ri in dropped:
106
+ continue
107
+ for col in all_columns:
108
+ if ri not in completed.get(col, set()):
109
+ return False
110
+ return True
111
+
112
+ def get_ready_tasks(self, dispatched: set[Task]) -> list[Task]:
113
+ """Return all currently dispatchable tasks from the frontier.
114
+
115
+ Excludes already-dispatched/in-flight tasks.
116
+ """
117
+ return [t for t in self._frontier if t not in dispatched]
118
+
119
+ def _seed_frontier(self) -> None:
120
+ """Populate the frontier with root tasks (columns with no upstream deps)."""
121
+ if self._graph is None:
122
+ raise RuntimeError("This method requires a graph to be set.")
123
+ for col in self._graph.get_root_columns():
124
+ strategy = self._graph.get_strategy(col)
125
+ for rg_id, rg_size in self._row_group_sizes.items():
126
+ if strategy == GenerationStrategy.CELL_BY_CELL:
127
+ for ri in range(rg_size):
128
+ self._frontier.add(Task(column=col, row_group=rg_id, row_index=ri, task_type="cell"))
129
+ else:
130
+ self._frontier.add(Task(column=col, row_group=rg_id, row_index=None, task_type="batch"))
131
+
132
+ def _enqueue_downstream(self, column: str, row_group: int, row_index: int | None) -> None:
133
+ """Add newly-ready downstream tasks to the frontier."""
134
+ if self._graph is None:
135
+ raise RuntimeError("This method requires a graph to be set.")
136
+ rg_completed = self._completed.get(row_group, {})
137
+ rg_dropped = self._dropped.get(row_group, set())
138
+ rg_batch_complete = self._batch_complete.get(row_group, set())
139
+ rg_size = self._row_group_sizes[row_group]
140
+
141
+ for down in self._graph.get_downstream_columns(column):
142
+ batch_ups, cell_ups = self._graph.split_upstream_by_strategy(down)
143
+
144
+ if any(up not in rg_batch_complete for up in batch_ups):
145
+ continue
146
+
147
+ down_strategy = self._graph.get_strategy(down)
148
+
149
+ if down_strategy == GenerationStrategy.CELL_BY_CELL:
150
+ cell_up_completed = [rg_completed.get(up, set()) for up in cell_ups]
151
+ if row_index is not None:
152
+ # Cell completion: only check the same row
153
+ down_completed = rg_completed.get(down, set())
154
+ if (
155
+ row_index not in rg_dropped
156
+ and row_index not in down_completed
157
+ and all(row_index in s for s in cell_up_completed)
158
+ ):
159
+ task = Task(column=down, row_group=row_group, row_index=row_index, task_type="cell")
160
+ self._frontier.add(task)
161
+ else:
162
+ # Batch completion: check all non-dropped, non-complete rows
163
+ down_completed = rg_completed.get(down, set())
164
+ for ri in range(rg_size):
165
+ if ri in rg_dropped or ri in down_completed:
166
+ continue
167
+ if all(ri in s for s in cell_up_completed):
168
+ task = Task(column=down, row_group=row_group, row_index=ri, task_type="cell")
169
+ self._frontier.add(task)
170
+ else:
171
+ # FULL_COLUMN downstream: ready when all cell upstreams are fully complete
172
+ if down not in rg_batch_complete and self._are_cell_ups_complete(
173
+ cell_ups, rg_completed, rg_size, rg_dropped
174
+ ):
175
+ task = Task(column=down, row_group=row_group, row_index=None, task_type="batch")
176
+ self._frontier.add(task)
177
+
178
+ def _reevaluate_batch_tasks(self, row_group: int) -> None:
179
+ """Check if any batch tasks became ready after a row was dropped."""
180
+ if self._graph is None:
181
+ raise RuntimeError("This method requires a graph to be set.")
182
+ rg_completed = self._completed.get(row_group, {})
183
+ rg_dropped = self._dropped.get(row_group, set())
184
+ rg_batch_complete = self._batch_complete.get(row_group, set())
185
+ rg_size = self._row_group_sizes[row_group]
186
+
187
+ for col in self._graph.get_topological_order():
188
+ if self._graph.get_strategy(col) != GenerationStrategy.FULL_COLUMN:
189
+ continue
190
+ if col in rg_batch_complete:
191
+ continue
192
+ batch_ups, cell_ups = self._graph.split_upstream_by_strategy(col)
193
+ if any(up not in rg_batch_complete for up in batch_ups):
194
+ continue
195
+ if self._are_cell_ups_complete(cell_ups, rg_completed, rg_size, rg_dropped):
196
+ task = Task(column=col, row_group=row_group, row_index=None, task_type="batch")
197
+ self._frontier.add(task)
198
+
199
+ def _are_cell_ups_complete(
200
+ self,
201
+ cell_ups: list[str],
202
+ rg_completed: dict[str, set[int]],
203
+ rg_size: int,
204
+ rg_dropped: set[int],
205
+ ) -> bool:
206
+ """Check all non-dropped rows are complete for each cell-by-cell upstream column."""
207
+ for up in cell_ups:
208
+ up_completed = rg_completed.get(up, set())
209
+ for ri in range(rg_size):
210
+ if ri not in rg_dropped and ri not in up_completed:
211
+ return False
212
+ return True
213
+
214
+ def _validate_strategy(self, column: str, expected: GenerationStrategy, method: str) -> None:
215
+ """Validate that *column* matches the expected strategy in graph-enabled mode."""
216
+ if self._graph is None:
217
+ return
218
+ actual = self._graph.get_strategy(column)
219
+ if actual != expected:
220
+ raise ValueError(f"{method}() requires {expected.value} strategy, but column '{column}' has {actual.value}")
221
+
222
+ def _validate_row_group(self, row_group: int) -> int | None:
223
+ """Validate row-group id in graph-enabled mode and return its expected size."""
224
+ if self._graph is None:
225
+ return None
226
+ expected = self._row_group_sizes.get(row_group)
227
+ if expected is None:
228
+ known = sorted(self._row_group_sizes)
229
+ raise ValueError(f"Unknown row_group {row_group}. Known row_groups: {known}")
230
+ return expected