data-designer-engine 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_designer_engine-0.4.0/.gitignore +100 -0
- data_designer_engine-0.4.0/PKG-INFO +50 -0
- data_designer_engine-0.4.0/README.md +15 -0
- data_designer_engine-0.4.0/pyproject.toml +64 -0
- data_designer_engine-0.4.0/src/data_designer/engine/__init__.py +2 -0
- data_designer_engine-0.4.0/src/data_designer/engine/_version.py +34 -0
- data_designer_engine-0.4.0/src/data_designer/engine/analysis/column_profilers/base.py +49 -0
- data_designer_engine-0.4.0/src/data_designer/engine/analysis/column_profilers/judge_score_profiler.py +153 -0
- data_designer_engine-0.4.0/src/data_designer/engine/analysis/column_profilers/registry.py +22 -0
- data_designer_engine-0.4.0/src/data_designer/engine/analysis/column_statistics.py +145 -0
- data_designer_engine-0.4.0/src/data_designer/engine/analysis/dataset_profiler.py +149 -0
- data_designer_engine-0.4.0/src/data_designer/engine/analysis/errors.py +9 -0
- data_designer_engine-0.4.0/src/data_designer/engine/analysis/utils/column_statistics_calculations.py +234 -0
- data_designer_engine-0.4.0/src/data_designer/engine/analysis/utils/judge_score_processing.py +132 -0
- data_designer_engine-0.4.0/src/data_designer/engine/column_generators/__init__.py +2 -0
- data_designer_engine-0.4.0/src/data_designer/engine/column_generators/generators/__init__.py +2 -0
- data_designer_engine-0.4.0/src/data_designer/engine/column_generators/generators/base.py +122 -0
- data_designer_engine-0.4.0/src/data_designer/engine/column_generators/generators/embedding.py +35 -0
- data_designer_engine-0.4.0/src/data_designer/engine/column_generators/generators/expression.py +55 -0
- data_designer_engine-0.4.0/src/data_designer/engine/column_generators/generators/llm_completion.py +116 -0
- data_designer_engine-0.4.0/src/data_designer/engine/column_generators/generators/samplers.py +69 -0
- data_designer_engine-0.4.0/src/data_designer/engine/column_generators/generators/seed_dataset.py +144 -0
- data_designer_engine-0.4.0/src/data_designer/engine/column_generators/generators/validation.py +140 -0
- data_designer_engine-0.4.0/src/data_designer/engine/column_generators/registry.py +60 -0
- data_designer_engine-0.4.0/src/data_designer/engine/column_generators/utils/errors.py +15 -0
- data_designer_engine-0.4.0/src/data_designer/engine/column_generators/utils/generator_classification.py +43 -0
- data_designer_engine-0.4.0/src/data_designer/engine/column_generators/utils/judge_score_factory.py +58 -0
- data_designer_engine-0.4.0/src/data_designer/engine/column_generators/utils/prompt_renderer.py +100 -0
- data_designer_engine-0.4.0/src/data_designer/engine/compiler.py +97 -0
- data_designer_engine-0.4.0/src/data_designer/engine/configurable_task.py +71 -0
- data_designer_engine-0.4.0/src/data_designer/engine/dataset_builders/artifact_storage.py +283 -0
- data_designer_engine-0.4.0/src/data_designer/engine/dataset_builders/column_wise_builder.py +354 -0
- data_designer_engine-0.4.0/src/data_designer/engine/dataset_builders/errors.py +15 -0
- data_designer_engine-0.4.0/src/data_designer/engine/dataset_builders/multi_column_configs.py +46 -0
- data_designer_engine-0.4.0/src/data_designer/engine/dataset_builders/utils/__init__.py +2 -0
- data_designer_engine-0.4.0/src/data_designer/engine/dataset_builders/utils/concurrency.py +212 -0
- data_designer_engine-0.4.0/src/data_designer/engine/dataset_builders/utils/config_compiler.py +62 -0
- data_designer_engine-0.4.0/src/data_designer/engine/dataset_builders/utils/dag.py +62 -0
- data_designer_engine-0.4.0/src/data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +200 -0
- data_designer_engine-0.4.0/src/data_designer/engine/dataset_builders/utils/errors.py +15 -0
- data_designer_engine-0.4.0/src/data_designer/engine/dataset_builders/utils/progress_tracker.py +122 -0
- data_designer_engine-0.4.0/src/data_designer/engine/errors.py +51 -0
- data_designer_engine-0.4.0/src/data_designer/engine/model_provider.py +77 -0
- data_designer_engine-0.4.0/src/data_designer/engine/models/__init__.py +2 -0
- data_designer_engine-0.4.0/src/data_designer/engine/models/errors.py +300 -0
- data_designer_engine-0.4.0/src/data_designer/engine/models/facade.py +284 -0
- data_designer_engine-0.4.0/src/data_designer/engine/models/factory.py +42 -0
- data_designer_engine-0.4.0/src/data_designer/engine/models/litellm_overrides.py +179 -0
- data_designer_engine-0.4.0/src/data_designer/engine/models/parsers/__init__.py +2 -0
- data_designer_engine-0.4.0/src/data_designer/engine/models/parsers/errors.py +34 -0
- data_designer_engine-0.4.0/src/data_designer/engine/models/parsers/parser.py +235 -0
- data_designer_engine-0.4.0/src/data_designer/engine/models/parsers/postprocessors.py +93 -0
- data_designer_engine-0.4.0/src/data_designer/engine/models/parsers/tag_parsers.py +62 -0
- data_designer_engine-0.4.0/src/data_designer/engine/models/parsers/types.py +84 -0
- data_designer_engine-0.4.0/src/data_designer/engine/models/recipes/base.py +81 -0
- data_designer_engine-0.4.0/src/data_designer/engine/models/recipes/response_recipes.py +293 -0
- data_designer_engine-0.4.0/src/data_designer/engine/models/registry.py +151 -0
- data_designer_engine-0.4.0/src/data_designer/engine/models/telemetry.py +362 -0
- data_designer_engine-0.4.0/src/data_designer/engine/models/usage.py +73 -0
- data_designer_engine-0.4.0/src/data_designer/engine/models/utils.py +101 -0
- data_designer_engine-0.4.0/src/data_designer/engine/processing/ginja/__init__.py +2 -0
- data_designer_engine-0.4.0/src/data_designer/engine/processing/ginja/ast.py +65 -0
- data_designer_engine-0.4.0/src/data_designer/engine/processing/ginja/environment.py +463 -0
- data_designer_engine-0.4.0/src/data_designer/engine/processing/ginja/exceptions.py +56 -0
- data_designer_engine-0.4.0/src/data_designer/engine/processing/ginja/record.py +32 -0
- data_designer_engine-0.4.0/src/data_designer/engine/processing/gsonschema/__init__.py +2 -0
- data_designer_engine-0.4.0/src/data_designer/engine/processing/gsonschema/exceptions.py +15 -0
- data_designer_engine-0.4.0/src/data_designer/engine/processing/gsonschema/schema_transformers.py +83 -0
- data_designer_engine-0.4.0/src/data_designer/engine/processing/gsonschema/types.py +10 -0
- data_designer_engine-0.4.0/src/data_designer/engine/processing/gsonschema/validators.py +202 -0
- data_designer_engine-0.4.0/src/data_designer/engine/processing/processors/base.py +13 -0
- data_designer_engine-0.4.0/src/data_designer/engine/processing/processors/drop_columns.py +42 -0
- data_designer_engine-0.4.0/src/data_designer/engine/processing/processors/registry.py +25 -0
- data_designer_engine-0.4.0/src/data_designer/engine/processing/processors/schema_transform.py +71 -0
- data_designer_engine-0.4.0/src/data_designer/engine/processing/utils.py +169 -0
- data_designer_engine-0.4.0/src/data_designer/engine/registry/base.py +99 -0
- data_designer_engine-0.4.0/src/data_designer/engine/registry/data_designer_registry.py +39 -0
- data_designer_engine-0.4.0/src/data_designer/engine/registry/errors.py +12 -0
- data_designer_engine-0.4.0/src/data_designer/engine/resources/managed_dataset_generator.py +39 -0
- data_designer_engine-0.4.0/src/data_designer/engine/resources/managed_dataset_repository.py +197 -0
- data_designer_engine-0.4.0/src/data_designer/engine/resources/managed_storage.py +65 -0
- data_designer_engine-0.4.0/src/data_designer/engine/resources/resource_provider.py +77 -0
- data_designer_engine-0.4.0/src/data_designer/engine/resources/seed_reader.py +154 -0
- data_designer_engine-0.4.0/src/data_designer/engine/sampling_gen/column.py +91 -0
- data_designer_engine-0.4.0/src/data_designer/engine/sampling_gen/constraints.py +100 -0
- data_designer_engine-0.4.0/src/data_designer/engine/sampling_gen/data_sources/base.py +217 -0
- data_designer_engine-0.4.0/src/data_designer/engine/sampling_gen/data_sources/errors.py +12 -0
- data_designer_engine-0.4.0/src/data_designer/engine/sampling_gen/data_sources/sources.py +347 -0
- data_designer_engine-0.4.0/src/data_designer/engine/sampling_gen/entities/__init__.py +2 -0
- data_designer_engine-0.4.0/src/data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
- data_designer_engine-0.4.0/src/data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +90 -0
- data_designer_engine-0.4.0/src/data_designer/engine/sampling_gen/entities/email_address_utils.py +171 -0
- data_designer_engine-0.4.0/src/data_designer/engine/sampling_gen/entities/errors.py +10 -0
- data_designer_engine-0.4.0/src/data_designer/engine/sampling_gen/entities/national_id_utils.py +102 -0
- data_designer_engine-0.4.0/src/data_designer/engine/sampling_gen/entities/person.py +144 -0
- data_designer_engine-0.4.0/src/data_designer/engine/sampling_gen/entities/phone_number.py +128 -0
- data_designer_engine-0.4.0/src/data_designer/engine/sampling_gen/errors.py +26 -0
- data_designer_engine-0.4.0/src/data_designer/engine/sampling_gen/generator.py +122 -0
- data_designer_engine-0.4.0/src/data_designer/engine/sampling_gen/jinja_utils.py +64 -0
- data_designer_engine-0.4.0/src/data_designer/engine/sampling_gen/people_gen.py +199 -0
- data_designer_engine-0.4.0/src/data_designer/engine/sampling_gen/person_constants.py +56 -0
- data_designer_engine-0.4.0/src/data_designer/engine/sampling_gen/schema.py +147 -0
- data_designer_engine-0.4.0/src/data_designer/engine/sampling_gen/schema_builder.py +61 -0
- data_designer_engine-0.4.0/src/data_designer/engine/sampling_gen/utils.py +46 -0
- data_designer_engine-0.4.0/src/data_designer/engine/secret_resolver.py +82 -0
- data_designer_engine-0.4.0/src/data_designer/engine/testing/__init__.py +12 -0
- data_designer_engine-0.4.0/src/data_designer/engine/testing/stubs.py +133 -0
- data_designer_engine-0.4.0/src/data_designer/engine/testing/utils.py +20 -0
- data_designer_engine-0.4.0/src/data_designer/engine/validation.py +367 -0
- data_designer_engine-0.4.0/src/data_designer/engine/validators/__init__.py +19 -0
- data_designer_engine-0.4.0/src/data_designer/engine/validators/base.py +38 -0
- data_designer_engine-0.4.0/src/data_designer/engine/validators/local_callable.py +39 -0
- data_designer_engine-0.4.0/src/data_designer/engine/validators/python.py +254 -0
- data_designer_engine-0.4.0/src/data_designer/engine/validators/remote.py +89 -0
- data_designer_engine-0.4.0/src/data_designer/engine/validators/sql.py +65 -0
- data_designer_engine-0.4.0/tests/conftest.py +16 -0
- data_designer_engine-0.4.0/tests/engine/analysis/column_profilers/test_base.py +54 -0
- data_designer_engine-0.4.0/tests/engine/analysis/column_profilers/test_judge_score_profiler.py +292 -0
- data_designer_engine-0.4.0/tests/engine/analysis/conftest.py +159 -0
- data_designer_engine-0.4.0/tests/engine/analysis/test_column_statistics_calculator.py +79 -0
- data_designer_engine-0.4.0/tests/engine/analysis/test_data/artifacts/dataset/column_configs.json +145 -0
- data_designer_engine-0.4.0/tests/engine/analysis/test_data/artifacts/dataset/dataset.json +2929 -0
- data_designer_engine-0.4.0/tests/engine/analysis/test_data/artifacts/dataset/metadata.json +27 -0
- data_designer_engine-0.4.0/tests/engine/analysis/test_dataset_profiler.py +130 -0
- data_designer_engine-0.4.0/tests/engine/analysis/test_errors.py +59 -0
- data_designer_engine-0.4.0/tests/engine/analysis/utils/test_column_statistics_calculations.py +357 -0
- data_designer_engine-0.4.0/tests/engine/analysis/utils/test_judge_score_processing.py +171 -0
- data_designer_engine-0.4.0/tests/engine/column_generators/generators/__init__.py +2 -0
- data_designer_engine-0.4.0/tests/engine/column_generators/generators/test_column_generator_base.py +85 -0
- data_designer_engine-0.4.0/tests/engine/column_generators/generators/test_embedding.py +47 -0
- data_designer_engine-0.4.0/tests/engine/column_generators/generators/test_expression.py +166 -0
- data_designer_engine-0.4.0/tests/engine/column_generators/generators/test_llm_completion_generators.py +353 -0
- data_designer_engine-0.4.0/tests/engine/column_generators/generators/test_samplers.py +131 -0
- data_designer_engine-0.4.0/tests/engine/column_generators/generators/test_seed_dataset.py +796 -0
- data_designer_engine-0.4.0/tests/engine/column_generators/generators/test_validation.py +248 -0
- data_designer_engine-0.4.0/tests/engine/column_generators/test_registry.py +39 -0
- data_designer_engine-0.4.0/tests/engine/column_generators/utils/test_column_generator_errors.py +15 -0
- data_designer_engine-0.4.0/tests/engine/column_generators/utils/test_generator_classification.py +32 -0
- data_designer_engine-0.4.0/tests/engine/column_generators/utils/test_judge_score_factory.py +97 -0
- data_designer_engine-0.4.0/tests/engine/column_generators/utils/test_prompt_renderer.py +127 -0
- data_designer_engine-0.4.0/tests/engine/conftest.py +61 -0
- data_designer_engine-0.4.0/tests/engine/dataset_builders/test_artifact_storage.py +362 -0
- data_designer_engine-0.4.0/tests/engine/dataset_builders/test_column_wise_builder.py +389 -0
- data_designer_engine-0.4.0/tests/engine/dataset_builders/test_multi_column_configs.py +158 -0
- data_designer_engine-0.4.0/tests/engine/dataset_builders/utils/test_concurrency.py +577 -0
- data_designer_engine-0.4.0/tests/engine/dataset_builders/utils/test_config_compiler.py +92 -0
- data_designer_engine-0.4.0/tests/engine/dataset_builders/utils/test_dag.py +113 -0
- data_designer_engine-0.4.0/tests/engine/dataset_builders/utils/test_dataset_batch_manager.py +421 -0
- data_designer_engine-0.4.0/tests/engine/dataset_builders/utils/test_progress_tracker.py +290 -0
- data_designer_engine-0.4.0/tests/engine/models/conftest.py +75 -0
- data_designer_engine-0.4.0/tests/engine/models/parsers/test_parser.py +175 -0
- data_designer_engine-0.4.0/tests/engine/models/parsers/test_parsers_types.py +96 -0
- data_designer_engine-0.4.0/tests/engine/models/parsers/test_postprocessors.py +122 -0
- data_designer_engine-0.4.0/tests/engine/models/parsers/test_tag_parsers.py +118 -0
- data_designer_engine-0.4.0/tests/engine/models/recipes/test_recipe_base.py +130 -0
- data_designer_engine-0.4.0/tests/engine/models/recipes/test_response_recipes.py +257 -0
- data_designer_engine-0.4.0/tests/engine/models/stub_secrets.json +3 -0
- data_designer_engine-0.4.0/tests/engine/models/test_facade.py +269 -0
- data_designer_engine-0.4.0/tests/engine/models/test_litellm_overrides.py +140 -0
- data_designer_engine-0.4.0/tests/engine/models/test_model_errors.py +231 -0
- data_designer_engine-0.4.0/tests/engine/models/test_model_registry.py +377 -0
- data_designer_engine-0.4.0/tests/engine/models/test_model_utils.py +23 -0
- data_designer_engine-0.4.0/tests/engine/models/test_usage.py +65 -0
- data_designer_engine-0.4.0/tests/engine/processing/__init__.py +2 -0
- data_designer_engine-0.4.0/tests/engine/processing/ginja/__init__.py +2 -0
- data_designer_engine-0.4.0/tests/engine/processing/ginja/test_ast.py +124 -0
- data_designer_engine-0.4.0/tests/engine/processing/ginja/test_environment.py +213 -0
- data_designer_engine-0.4.0/tests/engine/processing/ginja/test_exceptions.py +21 -0
- data_designer_engine-0.4.0/tests/engine/processing/ginja/test_record.py +25 -0
- data_designer_engine-0.4.0/tests/engine/processing/gsonschema/__init__.py +2 -0
- data_designer_engine-0.4.0/tests/engine/processing/gsonschema/test_exceptions.py +42 -0
- data_designer_engine-0.4.0/tests/engine/processing/gsonschema/test_schema_transformers.py +368 -0
- data_designer_engine-0.4.0/tests/engine/processing/gsonschema/test_types.py +109 -0
- data_designer_engine-0.4.0/tests/engine/processing/gsonschema/test_validators.py +229 -0
- data_designer_engine-0.4.0/tests/engine/processing/processors/__init__.py +2 -0
- data_designer_engine-0.4.0/tests/engine/processing/processors/test_drop_columns.py +162 -0
- data_designer_engine-0.4.0/tests/engine/processing/processors/test_registry.py +18 -0
- data_designer_engine-0.4.0/tests/engine/processing/processors/test_schema_transform.py +193 -0
- data_designer_engine-0.4.0/tests/engine/processing/test_utils.py +141 -0
- data_designer_engine-0.4.0/tests/engine/registry/__init__.py +2 -0
- data_designer_engine-0.4.0/tests/engine/registry/conftest.py +37 -0
- data_designer_engine-0.4.0/tests/engine/registry/test_base.py +227 -0
- data_designer_engine-0.4.0/tests/engine/registry/test_data_designer_registry.py +215 -0
- data_designer_engine-0.4.0/tests/engine/registry/test_errors.py +63 -0
- data_designer_engine-0.4.0/tests/engine/resources/__init__.py +2 -0
- data_designer_engine-0.4.0/tests/engine/resources/conftest.py +62 -0
- data_designer_engine-0.4.0/tests/engine/resources/test_managed_dataset_generator.py +125 -0
- data_designer_engine-0.4.0/tests/engine/resources/test_managed_dataset_repository.py +221 -0
- data_designer_engine-0.4.0/tests/engine/resources/test_managed_storage.py +95 -0
- data_designer_engine-0.4.0/tests/engine/resources/test_resource_provider.py +40 -0
- data_designer_engine-0.4.0/tests/engine/resources/test_seed_reader.py +58 -0
- data_designer_engine-0.4.0/tests/engine/sampling_gen/conftest.py +306 -0
- data_designer_engine-0.4.0/tests/engine/sampling_gen/data_sources/test_sampler_errors.py +17 -0
- data_designer_engine-0.4.0/tests/engine/sampling_gen/data_sources/test_sources.py +369 -0
- data_designer_engine-0.4.0/tests/engine/sampling_gen/entities/test_email_address_utils.py +105 -0
- data_designer_engine-0.4.0/tests/engine/sampling_gen/entities/test_national_id_utils.py +61 -0
- data_designer_engine-0.4.0/tests/engine/sampling_gen/entities/test_person.py +286 -0
- data_designer_engine-0.4.0/tests/engine/sampling_gen/entities/test_phone_number.py +94 -0
- data_designer_engine-0.4.0/tests/engine/sampling_gen/test_column.py +101 -0
- data_designer_engine-0.4.0/tests/engine/sampling_gen/test_constraints.py +107 -0
- data_designer_engine-0.4.0/tests/engine/sampling_gen/test_generator.py +536 -0
- data_designer_engine-0.4.0/tests/engine/sampling_gen/test_jinja_utils.py +119 -0
- data_designer_engine-0.4.0/tests/engine/sampling_gen/test_people_gen.py +56 -0
- data_designer_engine-0.4.0/tests/engine/sampling_gen/test_schema.py +255 -0
- data_designer_engine-0.4.0/tests/engine/sampling_gen/test_utils.py +43 -0
- data_designer_engine-0.4.0/tests/engine/test_compiler.py +146 -0
- data_designer_engine-0.4.0/tests/engine/test_configurable_task.py +130 -0
- data_designer_engine-0.4.0/tests/engine/test_dataset_metadata.py +56 -0
- data_designer_engine-0.4.0/tests/engine/test_engine_errors.py +61 -0
- data_designer_engine-0.4.0/tests/engine/test_model_provider.py +61 -0
- data_designer_engine-0.4.0/tests/engine/test_secret_resolver.py +89 -0
- data_designer_engine-0.4.0/tests/engine/test_validation.py +300 -0
- data_designer_engine-0.4.0/tests/engine/validators/test_local_callable.py +40 -0
- data_designer_engine-0.4.0/tests/engine/validators/test_python.py +123 -0
- data_designer_engine-0.4.0/tests/engine/validators/test_remote.py +64 -0
- data_designer_engine-0.4.0/tests/engine/validators/test_sql.py +22 -0
- data_designer_engine-0.4.0/tests/test_plugin_manager.py +124 -0
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
share/python-wheels/
|
|
24
|
+
*.egg-info/
|
|
25
|
+
.installed.cfg
|
|
26
|
+
*.egg
|
|
27
|
+
MANIFEST
|
|
28
|
+
|
|
29
|
+
# Installer logs
|
|
30
|
+
pip-log.txt
|
|
31
|
+
pip-delete-this-directory.txt
|
|
32
|
+
|
|
33
|
+
# Unit test / coverage reports
|
|
34
|
+
htmlcov/
|
|
35
|
+
.coverage
|
|
36
|
+
.coverage.*
|
|
37
|
+
.cache
|
|
38
|
+
coverage.xml
|
|
39
|
+
.pytest_cache/
|
|
40
|
+
|
|
41
|
+
# mkdocs documentation
|
|
42
|
+
/site
|
|
43
|
+
|
|
44
|
+
# Jupyter Notebook
|
|
45
|
+
.ipynb_checkpoints
|
|
46
|
+
|
|
47
|
+
# IPython
|
|
48
|
+
profile_default/
|
|
49
|
+
ipython_config.py
|
|
50
|
+
|
|
51
|
+
# pyenv
|
|
52
|
+
.python-version
|
|
53
|
+
|
|
54
|
+
# uv
|
|
55
|
+
.venv/
|
|
56
|
+
.uv/
|
|
57
|
+
|
|
58
|
+
# Environments
|
|
59
|
+
.env
|
|
60
|
+
env/
|
|
61
|
+
venv/
|
|
62
|
+
ENV/
|
|
63
|
+
env.bak/
|
|
64
|
+
venv.bak/
|
|
65
|
+
|
|
66
|
+
# Ruff
|
|
67
|
+
.ruff_cache/
|
|
68
|
+
|
|
69
|
+
# IDEs
|
|
70
|
+
.vscode/
|
|
71
|
+
.idea/
|
|
72
|
+
*.swp
|
|
73
|
+
*.swo
|
|
74
|
+
*~
|
|
75
|
+
.DS_Store
|
|
76
|
+
|
|
77
|
+
# Build artifacts
|
|
78
|
+
*.whl
|
|
79
|
+
*.tar.gz
|
|
80
|
+
*.zip
|
|
81
|
+
|
|
82
|
+
# Auto-generated version files
|
|
83
|
+
**/_version.py
|
|
84
|
+
|
|
85
|
+
# Local scratch space
|
|
86
|
+
.scratch/
|
|
87
|
+
|
|
88
|
+
docs/notebooks/
|
|
89
|
+
docs/notebook_source/*.ipynb
|
|
90
|
+
docs/notebook_source/*.csv
|
|
91
|
+
docs/**/artifacts/
|
|
92
|
+
|
|
93
|
+
tests_e2e/uv.lock
|
|
94
|
+
|
|
95
|
+
# Performance profiling
|
|
96
|
+
perf_*.txt
|
|
97
|
+
NOTEPAD.md
|
|
98
|
+
|
|
99
|
+
# Build-time copy of README for data-designer package (copied from top-level during build)
|
|
100
|
+
packages/data-designer/README.md
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: data-designer-engine
|
|
3
|
+
Version: 0.4.0
|
|
4
|
+
Summary: Generation engine for DataDesigner synthetic data generation
|
|
5
|
+
License-Expression: Apache-2.0
|
|
6
|
+
Classifier: Development Status :: 4 - Beta
|
|
7
|
+
Classifier: Intended Audience :: Developers
|
|
8
|
+
Classifier: Intended Audience :: Science/Research
|
|
9
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
14
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
15
|
+
Requires-Python: >=3.10
|
|
16
|
+
Requires-Dist: anyascii<1,>=0.3.3
|
|
17
|
+
Requires-Dist: data-designer-config
|
|
18
|
+
Requires-Dist: duckdb<2,>=1.1.3
|
|
19
|
+
Requires-Dist: faker<21,>=20.1.0
|
|
20
|
+
Requires-Dist: httpx-retries<1,>=0.4.2
|
|
21
|
+
Requires-Dist: httpx<1,>=0.27.2
|
|
22
|
+
Requires-Dist: huggingface-hub<2,>=1.0.1
|
|
23
|
+
Requires-Dist: json-repair<1,>=0.48.0
|
|
24
|
+
Requires-Dist: jsonpath-rust-bindings<2,>=1.0
|
|
25
|
+
Requires-Dist: jsonschema<5,>=4.0.0
|
|
26
|
+
Requires-Dist: litellm<1.80.12,>=1.73.6
|
|
27
|
+
Requires-Dist: lxml<7,>=6.0.2
|
|
28
|
+
Requires-Dist: marko<3,>=2.1.2
|
|
29
|
+
Requires-Dist: networkx<4,>=3.0
|
|
30
|
+
Requires-Dist: ruff<1,>=0.14.10
|
|
31
|
+
Requires-Dist: scipy<2,>=1.11.0
|
|
32
|
+
Requires-Dist: sqlfluff<4,>=3.2.0
|
|
33
|
+
Requires-Dist: tiktoken<1,>=0.8.0
|
|
34
|
+
Description-Content-Type: text/markdown
|
|
35
|
+
|
|
36
|
+
# data-designer-engine
|
|
37
|
+
|
|
38
|
+
Generation engine for the NeMo Data Designer synthetic data generation framework.
|
|
39
|
+
|
|
40
|
+
This package contains the execution engine that powers Data Designer. It depends on `data-designer-config` and includes heavy dependencies like pandas, numpy, and LLM integration via litellm.
|
|
41
|
+
|
|
42
|
+
## Installation
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
pip install data-designer-engine
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
This automatically installs `data-designer-config` as a dependency.
|
|
49
|
+
|
|
50
|
+
See the main [README.md](https://github.com/NVIDIA-NeMo/DataDesigner/blob/main/README.md) for more information.
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# data-designer-engine
|
|
2
|
+
|
|
3
|
+
Generation engine for the NeMo Data Designer synthetic data generation framework.
|
|
4
|
+
|
|
5
|
+
This package contains the execution engine that powers Data Designer. It depends on `data-designer-config` and includes heavy dependencies like pandas, numpy, and LLM integration via litellm.
|
|
6
|
+
|
|
7
|
+
## Installation
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install data-designer-engine
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
This automatically installs `data-designer-config` as a dependency.
|
|
14
|
+
|
|
15
|
+
See the main [README.md](https://github.com/NVIDIA-NeMo/DataDesigner/blob/main/README.md) for more information.
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "data-designer-engine"
|
|
3
|
+
dynamic = ["version"]
|
|
4
|
+
description = "Generation engine for DataDesigner synthetic data generation"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.10"
|
|
7
|
+
license = "Apache-2.0"
|
|
8
|
+
|
|
9
|
+
classifiers = [
|
|
10
|
+
"Development Status :: 4 - Beta",
|
|
11
|
+
"Intended Audience :: Developers",
|
|
12
|
+
"Intended Audience :: Science/Research",
|
|
13
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
14
|
+
"License :: OSI Approved :: Apache Software License",
|
|
15
|
+
"Programming Language :: Python :: 3.10",
|
|
16
|
+
"Programming Language :: Python :: 3.11",
|
|
17
|
+
"Programming Language :: Python :: 3.12",
|
|
18
|
+
"Programming Language :: Python :: 3.13",
|
|
19
|
+
]
|
|
20
|
+
|
|
21
|
+
dependencies = [
|
|
22
|
+
"anyascii>=0.3.3,<1",
|
|
23
|
+
"data-designer-config",
|
|
24
|
+
"duckdb>=1.1.3,<2",
|
|
25
|
+
"faker>=20.1.0,<21",
|
|
26
|
+
"httpx>=0.27.2,<1",
|
|
27
|
+
"httpx-retries>=0.4.2,<1",
|
|
28
|
+
"huggingface-hub>=1.0.1,<2",
|
|
29
|
+
"json-repair>=0.48.0,<1",
|
|
30
|
+
"jsonpath-rust-bindings>=1.0,<2",
|
|
31
|
+
"jsonschema>=4.0.0,<5",
|
|
32
|
+
"litellm>=1.73.6,<1.80.12",
|
|
33
|
+
"lxml>=6.0.2,<7",
|
|
34
|
+
"marko>=2.1.2,<3",
|
|
35
|
+
"networkx>=3.0,<4",
|
|
36
|
+
"ruff>=0.14.10,<1",
|
|
37
|
+
"scipy>=1.11.0,<2",
|
|
38
|
+
"sqlfluff>=3.2.0,<4",
|
|
39
|
+
"tiktoken>=0.8.0,<1",
|
|
40
|
+
]
|
|
41
|
+
|
|
42
|
+
[build-system]
|
|
43
|
+
requires = ["hatchling", "hatch-vcs"]
|
|
44
|
+
build-backend = "hatchling.build"
|
|
45
|
+
|
|
46
|
+
[tool.hatch.version]
|
|
47
|
+
source = "vcs"
|
|
48
|
+
fallback-version = "0.1.0.dev0"
|
|
49
|
+
raw-options = { root = "../.." }
|
|
50
|
+
|
|
51
|
+
[tool.hatch.build.hooks.vcs]
|
|
52
|
+
version-file = "src/data_designer/engine/_version.py"
|
|
53
|
+
|
|
54
|
+
[tool.hatch.build.targets.wheel]
|
|
55
|
+
packages = ["src/data_designer"]
|
|
56
|
+
|
|
57
|
+
[tool.ruff]
|
|
58
|
+
extend = "../../pyproject.toml"
|
|
59
|
+
|
|
60
|
+
[tool.uv]
|
|
61
|
+
package = true
|
|
62
|
+
|
|
63
|
+
[tool.uv.sources]
|
|
64
|
+
data-designer-config = { workspace = true }
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# file generated by setuptools-scm
|
|
2
|
+
# don't change, don't track in version control
|
|
3
|
+
|
|
4
|
+
# Names exported by this generated version module.
__all__ = [
    "__version__",
    "__version_tuple__",
    "version",
    "version_tuple",
    "__commit_id__",
    "commit_id",
]

# Hard-coded to False, so the `typing` imports in the first branch are never
# executed at runtime and the `else` branch always runs.
# NOTE(review): presumably static type checkers still treat the name
# TYPE_CHECKING as true and read the first branch — confirm.
TYPE_CHECKING = False
if TYPE_CHECKING:
    from typing import Tuple
    from typing import Union

    VERSION_TUPLE = Tuple[Union[int, str], ...]
    COMMIT_ID = Union[str, None]
else:
    # Runtime placeholders for the aliases used in the annotations below.
    VERSION_TUPLE = object
    COMMIT_ID = object

# Annotation-only declarations; the values are bound further down.
version: str
__version__: str
__version_tuple__: VERSION_TUPLE
version_tuple: VERSION_TUPLE
commit_id: COMMIT_ID
__commit_id__: COMMIT_ID

# Package version as a string and as a tuple; each value is bound to both the
# dunder name and the plain name.
__version__ = version = '0.4.0'
__version_tuple__ = version_tuple = (0, 4, 0)

# No commit id is recorded in this file.
__commit_id__ = commit_id = None
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
from abc import ABC, abstractmethod
|
|
8
|
+
from typing import TYPE_CHECKING
|
|
9
|
+
|
|
10
|
+
from pydantic import BaseModel, model_validator
|
|
11
|
+
from typing_extensions import Self
|
|
12
|
+
|
|
13
|
+
from data_designer.config.base import ConfigBase
|
|
14
|
+
from data_designer.config.column_configs import SingleColumnConfig
|
|
15
|
+
from data_designer.config.column_types import DataDesignerColumnType
|
|
16
|
+
from data_designer.engine.configurable_task import ConfigurableTask, TaskConfigT
|
|
17
|
+
from data_designer.lazy_heavy_imports import pd
|
|
18
|
+
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
import pandas as pd
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class ColumnConfigWithDataFrame(ConfigBase):
    """Bundle a single column's config with the DataFrame that must contain that column.

    Validation fails if the configured column name is not one of the DataFrame's columns.
    """

    column_config: SingleColumnConfig
    df: pd.DataFrame

    @model_validator(mode="after")
    def validate_column_exists(self) -> Self:
        """Check that the configured column is present in ``df``.

        Returns:
            The validated instance, unchanged.

        Raises:
            ValueError: If ``column_config.name`` is not in ``df.columns``.
        """
        column_name = self.column_config.name
        if column_name in self.df.columns:
            return self
        raise ValueError(f"Column {column_name!r} not found in DataFrame")

    def as_tuple(self) -> tuple[SingleColumnConfig, pd.DataFrame]:
        """Return ``(column_config, df)`` so callers can unpack both in one statement."""
        config, frame = self.column_config, self.df
        return (config, frame)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class ColumnProfiler(ConfigurableTask[TaskConfigT], ABC):
    """Abstract base for tasks that profile one column of a dataset.

    Subclasses declare which column types they apply to and implement ``profile``.
    """

    @staticmethod
    @abstractmethod
    def get_applicable_column_types() -> list[DataDesignerColumnType]:
        """Return the column types this profiler can be applied to during dataset profiling."""

    @abstractmethod
    def profile(self, column_config_with_df: ColumnConfigWithDataFrame) -> BaseModel:
        """Profile the column described by ``column_config_with_df``.

        Args:
            column_config_with_df: The column's config paired with the DataFrame holding it.

        Returns:
            A pydantic model with the profiling results.
        """

    def _initialize(self) -> None:
        """Log that this profiler is being initialized."""
        # NOTE(review): presumably called by ConfigurableTask during setup — confirm.
        profiler_name = self.name
        logger.info(f"💫 Initializing column profiler: '{profiler_name}'")
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
import random
|
|
8
|
+
from typing import TYPE_CHECKING
|
|
9
|
+
|
|
10
|
+
from data_designer.config.analysis.column_profilers import (
|
|
11
|
+
JudgeScoreProfilerConfig,
|
|
12
|
+
JudgeScoreProfilerResults,
|
|
13
|
+
JudgeScoreSummary,
|
|
14
|
+
)
|
|
15
|
+
from data_designer.config.analysis.column_statistics import (
|
|
16
|
+
ColumnDistributionType,
|
|
17
|
+
MissingValue,
|
|
18
|
+
)
|
|
19
|
+
from data_designer.config.column_types import DataDesignerColumnType
|
|
20
|
+
from data_designer.engine.analysis.column_profilers.base import ColumnConfigWithDataFrame, ColumnProfiler
|
|
21
|
+
from data_designer.engine.analysis.utils.judge_score_processing import (
|
|
22
|
+
extract_judge_score_distributions,
|
|
23
|
+
sample_scores_and_reasoning,
|
|
24
|
+
)
|
|
25
|
+
from data_designer.engine.models.recipes.response_recipes import TextResponseRecipe
|
|
26
|
+
|
|
27
|
+
if TYPE_CHECKING:
|
|
28
|
+
from data_designer.config.analysis.column_profilers import JudgeScoreSample
|
|
29
|
+
from data_designer.config.analysis.column_statistics import (
|
|
30
|
+
CategoricalDistribution,
|
|
31
|
+
CategoricalHistogramData,
|
|
32
|
+
NumericalDistribution,
|
|
33
|
+
)
|
|
34
|
+
from data_designer.engine.models.facade import ModelFacade
|
|
35
|
+
|
|
36
|
+
logger = logging.getLogger(__name__)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class JudgeScoreProfiler(ColumnProfiler[JudgeScoreProfilerConfig]):
    """Profile LLM-as-judge columns by summarizing every judge score with a model.

    For each score defined on the column, a sample of scores and their reasoning
    is drawn and a model is asked to condense that reasoning, together with the
    score distribution, into a short natural-language summary.
    """

    @staticmethod
    def get_applicable_column_types() -> list[DataDesignerColumnType]:
        """Return the column types this profiler applies to (LLM-judge columns only)."""
        return [DataDesignerColumnType.LLM_JUDGE]

    def get_model(self, model_alias: str) -> ModelFacade:
        """Fetch the model registered under ``model_alias`` from the model registry."""
        return self.resource_provider.model_registry.get_model(model_alias=model_alias)

    def profile(self, column_config_with_df: ColumnConfigWithDataFrame) -> JudgeScoreProfilerResults:
        """Extract judge score distributions and, when configured, summarize each score.

        Args:
            column_config_with_df: The judge column configuration bundled with its dataframe.

        Returns:
            Profiler results carrying the score distributions and one summary per
            score. ``summaries`` is empty when no sample size is configured or the
            distributions could not be extracted (``MissingValue``).
        """
        column_config, df = column_config_with_df.as_tuple()

        logger.info(
            f"{column_config.get_column_emoji()} Analyzing LLM-as-judge scores for column: '{column_config.name}'"
        )

        score_distributions = extract_judge_score_distributions(column_config, df)

        # Nothing to summarize: either summarization is switched off or there is no score data.
        if self.config.summary_score_sample_size is None or isinstance(score_distributions, MissingValue):
            return JudgeScoreProfilerResults(
                summaries={},
                column_name=column_config.name,
                score_distributions=score_distributions,
            )

        score_summaries = {}
        for score in column_config.scores:
            score_name = score.name
            logger.info(f"{random.choice(['👩⚖️', '👨⚖️'])} Summarizing LLM-as-judge score: '{score_name}'")
            score_sample = sample_scores_and_reasoning(
                scores=score_distributions.scores[score_name],
                reasoning=score_distributions.reasoning[score_name],
                num_samples=self.config.summary_score_sample_size,
            )

            score_summaries[score_name] = self._summarize_score_sample(
                name=score_name,
                sample=score_sample,
                histogram=score_distributions.histograms[score_name],
                distribution=score_distributions.distributions[score_name],
                distribution_type=score_distributions.distribution_types[score_name],
            )

        return JudgeScoreProfilerResults(
            column_name=column_config.name,
            summaries=score_summaries,
            score_distributions=score_distributions,
        )

    def _summarize_score_sample(
        self,
        name: str,
        sample: list[JudgeScoreSample],
        histogram: CategoricalHistogramData,
        distribution: CategoricalDistribution | NumericalDistribution | MissingValue,
        distribution_type: ColumnDistributionType,
    ) -> JudgeScoreSummary:
        """Ask the configured model for a short summary of one judge score.

        Args:
            name: Name of the score being summarized.
            sample: Sampled scores with their reasoning text.
            histogram: Category labels and counts for the score.
            distribution: Distribution statistics, or ``MissingValue`` when unavailable.
            distribution_type: Whether ``distribution`` is categorical or numerical.

        Returns:
            A ``JudgeScoreSummary``. When there is nothing to summarize, or the model
            call fails, the summary text explains that instead of raising.
        """
        if isinstance(distribution, MissingValue) or not sample:
            return JudgeScoreSummary(
                score_name=name,
                summary="No judge score information available to summarize.",
                score_samples=sample,
            )

        category_info = []
        total_count = sum(histogram.counts)
        for cat, count in zip(histogram.categories, histogram.counts):
            # Guard against an all-zero histogram: this runs outside the try block
            # below, so a ZeroDivisionError here would abort the whole profiling run.
            percentage = (count / total_count) * 100 if total_count else 0.0
            category_info.append(f"{cat}: {count} records ({percentage:.1f}%)")

        distribution_context = f"Score distribution - {', '.join(category_info)}, "
        if distribution_type == ColumnDistributionType.CATEGORICAL:
            distribution_context += f"Most common value: {distribution.most_common_value}. "
        if distribution_type == ColumnDistributionType.NUMERICAL:
            distribution_context += f"Mean score: {distribution.mean:.2f}. "

        logger.info(f" |-- number of score samples: {len(sample)}")
        logger.info(f" |-- {distribution_context.lower()}")

        combined_reasoning = "\n".join([r.reasoning for r in sample])
        prompt = (
            f"Based on the following evaluator reasoning for the '{name}' criterion, "
            "provide a concise summary that captures both the strengths and areas for improvement mentioned. "
            "Be specific about what worked well and what needs improvement.\n\n"
            f"Overall distribution of scores: {distribution_context}"
            f"\nA sample of reasoning:\n{combined_reasoning}\n\n"
            "Do not include any titles like `Summary` or `Summary:`. "
            "Do not wrap the summary in quotation marks. "
            "YOU WILL PRODUCE LESS THAN 75 WORDS in a readable sentence format. "
            "No need to use bullets or headers. Write naturally."
        )

        system_prompt = (
            "You are an expert at distilling complex feedback into concise summaries. "
            "Focus on specificity and balance, incorporating both the distribution context and individual reasoning examples."
        )

        # Summarization is best-effort: a failing model call must not fail the profile,
        # so the error is reported in the summary text and logged for diagnosis.
        try:
            model = self.get_model(self.config.model_alias)
            recipe = TextResponseRecipe()
            summary, _ = model.generate(
                prompt=recipe.apply_recipe_to_user_prompt(prompt),
                system_prompt=recipe.apply_recipe_to_system_prompt(system_prompt),
                parser=recipe.parse,
            )
            return JudgeScoreSummary(
                score_name=name,
                summary=summary.strip(),
                score_samples=sample,
            )
        except Exception as e:
            logger.warning(f"Score summarization failed for '{name}': {e}")
            return JudgeScoreSummary(
                score_name=name,
                summary=f"Score summarization failed: {e}",
                score_samples=sample,
            )
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
from data_designer.config.analysis.column_profilers import ColumnProfilerType
|
|
7
|
+
from data_designer.config.base import ConfigBase
|
|
8
|
+
from data_designer.engine.analysis.column_profilers.base import ColumnProfiler
|
|
9
|
+
from data_designer.engine.analysis.column_profilers.judge_score_profiler import (
|
|
10
|
+
JudgeScoreProfiler,
|
|
11
|
+
JudgeScoreProfilerConfig,
|
|
12
|
+
)
|
|
13
|
+
from data_designer.engine.registry.base import TaskRegistry
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ColumnProfilerRegistry(TaskRegistry[ColumnProfilerType, ColumnProfiler, ConfigBase]):
    """Task registry parameterized for column profilers, keyed by ``ColumnProfilerType``."""
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def create_default_column_profiler_registry() -> ColumnProfilerRegistry:
    """Build a column profiler registry with the judge-score profiler registered."""
    default_registry = ColumnProfilerRegistry()
    default_registry.register(
        ColumnProfilerType.JUDGE_SCORE,
        JudgeScoreProfiler,
        JudgeScoreProfilerConfig,
        False,  # NOTE(review): meaning of this positional flag is set by TaskRegistry.register — confirm
    )
    return default_registry
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
from typing import TYPE_CHECKING, Any, TypeAlias
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel
|
|
10
|
+
from typing_extensions import Self
|
|
11
|
+
|
|
12
|
+
from data_designer.config.analysis.column_statistics import (
|
|
13
|
+
DEFAULT_COLUMN_STATISTICS_MAP,
|
|
14
|
+
ColumnStatisticsT,
|
|
15
|
+
GeneralColumnStatistics,
|
|
16
|
+
)
|
|
17
|
+
from data_designer.config.column_types import ColumnConfigT, DataDesignerColumnType
|
|
18
|
+
from data_designer.config.sampler_params import SamplerType, is_numerical_sampler_type
|
|
19
|
+
from data_designer.engine.analysis.column_profilers.base import ColumnConfigWithDataFrame
|
|
20
|
+
from data_designer.engine.analysis.utils.column_statistics_calculations import (
|
|
21
|
+
ColumnDistributionType,
|
|
22
|
+
calculate_column_distribution,
|
|
23
|
+
calculate_general_column_info,
|
|
24
|
+
calculate_token_stats,
|
|
25
|
+
calculate_validation_column_info,
|
|
26
|
+
)
|
|
27
|
+
from data_designer.lazy_heavy_imports import pd
|
|
28
|
+
|
|
29
|
+
if TYPE_CHECKING:
|
|
30
|
+
import pandas as pd
|
|
31
|
+
|
|
32
|
+
logger = logging.getLogger(__name__)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class GeneralColumnStatisticsCalculator(BaseModel):
    """Base calculator that computes the statistics object for a single column.

    Subclasses contribute extra statistics by defining methods whose names start
    with ``calculate_``; each such method returns a dict of statistics fields.
    """

    # The column configuration bundled with the dataframe to compute statistics from.
    column_config_with_df: ColumnConfigWithDataFrame

    @property
    def column_config(self) -> ColumnConfigT:
        """The column configuration being analyzed."""
        return self.column_config_with_df.column_config

    @property
    def df(self) -> pd.DataFrame:
        """The dataframe holding the column's data."""
        return self.column_config_with_df.df

    @property
    def column_statistics_type(self) -> type[ColumnStatisticsT]:
        """Statistics model class for this column type, defaulting to ``GeneralColumnStatistics``."""
        return DEFAULT_COLUMN_STATISTICS_MAP.get(self.column_config.column_type, GeneralColumnStatistics)

    def calculate(self) -> ColumnStatisticsT:
        """Calculate all the column statistics fields for the given column configuration and dataset profiler.

        This method dynamically collects all class methods prefixed with 'calculate_' and invokes them to
        compute various column statistics, aggregating their results into a single statistics object.

        Returns:
            An instance of ``column_statistics_type`` populated with the merged results.
        """
        # ``dir`` returns names in sorted order, so methods run in a deterministic sequence.
        calculate_methods = [
            name for name in dir(self) if name.startswith("calculate_") and callable(getattr(self, name))
        ]
        return self.column_statistics_type(
            column_name=self.column_config.name,
            **{k: v for name in calculate_methods for k, v in getattr(self, name)().items()},
        )

    def calculate_general_column_info(self) -> dict[str, Any]:
        """Return the general column info computed from the column name and dataframe."""
        return calculate_general_column_info(self.column_config.name, self.df)

    def __repr__(self) -> str:
        """Render the model's JSON-mode fields one per line inside ``ClassName(...)``."""
        params = []
        for field, value in self.model_dump(mode="json").items():
            params.append(f" {field}: {value}")
        params_str = "\n".join(params)
        return f"{self.__class__.__name__}(\n{params_str}\n)"
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class LLMTextColumnStatisticsCalculator(GeneralColumnStatisticsCalculator):
    """Statistics calculator for LLM text columns; adds token statistics."""

    def calculate_token_stats(self) -> dict[str, Any]:
        """Return token statistics computed from the column config and dataframe."""
        config, frame = self.column_config, self.df
        return calculate_token_stats(config, frame)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class LLMCodeColumnStatisticsCalculator(LLMTextColumnStatisticsCalculator):
    """Statistics calculator for LLM code columns; inherits the LLM text behavior unchanged."""
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class LLMStructuredColumnStatisticsCalculator(LLMTextColumnStatisticsCalculator):
    """Statistics calculator for LLM structured columns; inherits the LLM text behavior unchanged."""
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class LLMJudgedColumnStatisticsCalculator(LLMTextColumnStatisticsCalculator):
    """Statistics calculator for LLM judge columns; inherits the LLM text behavior unchanged."""
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class SamplerColumnStatisticsCalculator(GeneralColumnStatisticsCalculator):
    """Statistics calculator for sampler columns; adds the sampler type and distribution."""

    def calculate_sampler_distribution(self) -> dict[str, Any]:
        """Return the sampler type plus distribution fields for the column.

        Category and subcategory samplers are treated as categorical, numerical
        samplers as numerical. Any other sampler type reports
        ``ColumnDistributionType.OTHER`` with no distribution.
        """
        sampler_type = self.column_config.sampler_type

        if sampler_type in (SamplerType.CATEGORY, SamplerType.SUBCATEGORY):
            dist_type = ColumnDistributionType.CATEGORICAL
        elif is_numerical_sampler_type(sampler_type):
            dist_type = ColumnDistributionType.NUMERICAL
        else:
            # No distribution is computed for other sampler types.
            return {
                "sampler_type": SamplerType(sampler_type),
                "distribution_type": ColumnDistributionType.OTHER,
                "distribution": None,
            }

        return {
            "sampler_type": SamplerType(sampler_type),
            **calculate_column_distribution(self.column_config.name, self.df, dist_type),
        }
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
class SeedDatasetColumnStatisticsCalculator(GeneralColumnStatisticsCalculator):
    """Statistics calculator for seed dataset columns; uses only the general statistics."""
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
class ValidationColumnStatisticsCalculator(GeneralColumnStatisticsCalculator):
    """Statistics calculator for validation columns; adds validation-specific info."""

    def calculate_validation_column_info(self) -> dict[str, Any]:
        """Return validation column info computed from the column name and dataframe."""
        column_name = self.column_config.name
        return calculate_validation_column_info(column_name, self.df)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
class ExpressionColumnStatisticsCalculator(GeneralColumnStatisticsCalculator):
    """Statistics calculator for expression columns; uses only the general statistics."""
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
# Union of every concrete statistics calculator defined in this module.
ColumnStatisticsCalculatorT: TypeAlias = (
    ExpressionColumnStatisticsCalculator
    | ValidationColumnStatisticsCalculator
    | GeneralColumnStatisticsCalculator
    | LLMCodeColumnStatisticsCalculator
    | LLMJudgedColumnStatisticsCalculator
    | LLMStructuredColumnStatisticsCalculator
    | LLMTextColumnStatisticsCalculator
    | SamplerColumnStatisticsCalculator
    | SeedDatasetColumnStatisticsCalculator
)

# Calculator class to use for each column type; column types not listed here
# fall back to GeneralColumnStatisticsCalculator in get_column_statistics_calculator.
DEFAULT_COLUMN_STATISTICS_CALCULATOR_MAP: dict[DataDesignerColumnType, type[ColumnStatisticsCalculatorT]] = {
    DataDesignerColumnType.EXPRESSION: ExpressionColumnStatisticsCalculator,
    DataDesignerColumnType.VALIDATION: ValidationColumnStatisticsCalculator,
    DataDesignerColumnType.LLM_CODE: LLMCodeColumnStatisticsCalculator,
    DataDesignerColumnType.LLM_JUDGE: LLMJudgedColumnStatisticsCalculator,
    DataDesignerColumnType.LLM_STRUCTURED: LLMStructuredColumnStatisticsCalculator,
    DataDesignerColumnType.LLM_TEXT: LLMTextColumnStatisticsCalculator,
    DataDesignerColumnType.SAMPLER: SamplerColumnStatisticsCalculator,
    DataDesignerColumnType.SEED_DATASET: SeedDatasetColumnStatisticsCalculator,
}
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def get_column_statistics_calculator(column_type: DataDesignerColumnType) -> type[ColumnStatisticsCalculatorT]:
    """Return the statistics calculator class registered for ``column_type``.

    Args:
        column_type: The column type to look up.

    Returns:
        The calculator class (not an instance) mapped to ``column_type``, or
        ``GeneralColumnStatisticsCalculator`` when the type has no dedicated entry.
    """
    return DEFAULT_COLUMN_STATISTICS_CALCULATOR_MAP.get(column_type, GeneralColumnStatisticsCalculator)
|