data-designer 0.2.3__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. data_designer/_version.py +2 -2
  2. data_designer/cli/forms/model_builder.py +2 -2
  3. data_designer/config/config_builder.py +30 -113
  4. data_designer/config/errors.py +3 -0
  5. data_designer/config/exports.py +8 -6
  6. data_designer/config/models.py +7 -18
  7. data_designer/config/run_config.py +34 -0
  8. data_designer/config/seed.py +16 -46
  9. data_designer/config/seed_source.py +84 -0
  10. data_designer/config/utils/constants.py +27 -2
  11. data_designer/config/utils/io_helpers.py +0 -20
  12. data_designer/engine/column_generators/generators/seed_dataset.py +5 -5
  13. data_designer/engine/column_generators/generators/validation.py +3 -0
  14. data_designer/engine/column_generators/registry.py +1 -1
  15. data_designer/engine/compiler.py +69 -0
  16. data_designer/engine/dataset_builders/column_wise_builder.py +3 -0
  17. data_designer/engine/dataset_builders/utils/config_compiler.py +1 -1
  18. data_designer/engine/models/facade.py +2 -0
  19. data_designer/engine/processing/gsonschema/validators.py +55 -0
  20. data_designer/engine/resources/resource_provider.py +17 -5
  21. data_designer/engine/resources/seed_reader.py +149 -0
  22. data_designer/essentials/__init__.py +2 -0
  23. data_designer/interface/data_designer.py +72 -62
  24. data_designer/plugin_manager.py +1 -1
  25. data_designer/plugins/errors.py +3 -0
  26. data_designer/plugins/plugin.py +82 -12
  27. data_designer/plugins/testing/__init__.py +8 -0
  28. data_designer/plugins/testing/stubs.py +145 -0
  29. data_designer/plugins/testing/utils.py +11 -0
  30. {data_designer-0.2.3.dist-info → data_designer-0.3.1.dist-info}/METADATA +3 -3
  31. {data_designer-0.2.3.dist-info → data_designer-0.3.1.dist-info}/RECORD +35 -30
  32. data_designer/config/datastore.py +0 -187
  33. data_designer/engine/resources/seed_dataset_data_store.py +0 -84
  34. /data_designer/{config/utils → engine}/validation.py +0 -0
  35. {data_designer-0.2.3.dist-info → data_designer-0.3.1.dist-info}/WHEEL +0 -0
  36. {data_designer-0.2.3.dist-info → data_designer-0.3.1.dist-info}/entry_points.txt +0 -0
  37. {data_designer-0.2.3.dist-info → data_designer-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,8 +1,8 @@
1
1
  data_designer/__init__.py,sha256=iCeqRnb640RrL2QpA630GY5Ng7JiDt83Vq0DwLnNugU,461
2
- data_designer/_version.py,sha256=kBRz0P2plw1eVdIpt70W6m1LMbEIhLY3RyOfVGdubaI,704
2
+ data_designer/_version.py,sha256=gGLpQUQx-ty9SEy9PYw9OgJWWzJLBnCpfJOfzL7SjlI,704
3
3
  data_designer/errors.py,sha256=Z4eN9XwzZvGRdBluSNoSqQYkPPzNQIDf0ET_OqWRZh8,179
4
4
  data_designer/logging.py,sha256=ZsruJ0tx_4NK0PIMyxCZJJ0wJugoDff9UP3PbsdEDxQ,5341
5
- data_designer/plugin_manager.py,sha256=xaMX274gdlYLNNPIrAOmJNLaZlG_0ROJ0H29v8t2aKs,2604
5
+ data_designer/plugin_manager.py,sha256=NCYhBtMqbIGSMZwTY2F2NkkkdgBGPqnxZCmeiQzEH80,2604
6
6
  data_designer/cli/README.md,sha256=uPE3KdlF5Y3H8pQc8c6ZZ3h6YSFXNQW-iEXGQJuVnI4,9026
7
7
  data_designer/cli/__init__.py,sha256=kTfolrDol0izniNPXtuaUJ_oXRfJ-jGUPuVR5IwibEM,210
8
8
  data_designer/cli/main.py,sha256=v_vjyHbEF0n0Np-jc5KigtNFZflHR7tAnnea5mDzXHI,1933
@@ -22,7 +22,7 @@ data_designer/cli/forms/__init__.py,sha256=BGLbNJCHCgYiQWoAdTbUjzqgVlJymTQOV8sNW
22
22
  data_designer/cli/forms/builder.py,sha256=QMCutZb7l3DeL4nXFGCUaiS1bxBu1BdaBWwlb1rmiIE,1690
23
23
  data_designer/cli/forms/field.py,sha256=8EYfaqxSynyhPreFOa9JGnsTkeSXxwpRzV8xJb98FGg,7502
24
24
  data_designer/cli/forms/form.py,sha256=f6_LdSlk4kddB9a4mGotA-VlR2mlXAU_9RtLkbliI38,2025
25
- data_designer/cli/forms/model_builder.py,sha256=SEfJuhw22bYjRc0XvLZsorXc8EpEsugHlpoC0OHxhKo,13345
25
+ data_designer/cli/forms/model_builder.py,sha256=voBv7_e0M3CwXxwuv75OSG0_CJQ0thkVVA8G7eCK798,13313
26
26
  data_designer/cli/forms/provider_builder.py,sha256=xphQlNlnfDLm0XwqbPC6SJ3wXwlU45xVo_35Pe1EBdU,2895
27
27
  data_designer/cli/repositories/__init__.py,sha256=RBOWAkIOOpr-L-kVz-PDIPmMXdlGLCinxkwvKS6bAB4,434
28
28
  data_designer/cli/repositories/base.py,sha256=LQ0i_KrTdhS5o48qphlr4uWoAVrz02Lf_ZH1JIhcuBQ,1054
@@ -37,20 +37,21 @@ data_designer/config/__init__.py,sha256=9eG4WHKyrJcNoK4GEz6BCw_E0Ewo9elQoDN4TLMb
37
37
  data_designer/config/base.py,sha256=ypam9XX6dg8Q_55su84WmVExNXsmt5jb3eeW3JLlHwc,2396
38
38
  data_designer/config/column_configs.py,sha256=pjpy5z3Kk7i4WmIjOxdiW5Awpjy5CxQSy0YMy0QxtvA,18961
39
39
  data_designer/config/column_types.py,sha256=EILVM42d4TMl2xbSj5htMsenJwybCHIc_G8AUXyrjWU,7197
40
- data_designer/config/config_builder.py,sha256=n8in3O-hR2j3wJBnZMCoT5NawlobJDWTyNZCIYSgWIo,29241
40
+ data_designer/config/config_builder.py,sha256=eO7iXO4WeF3h4iJdH203ZFHET9Vj4XAgoIPEJpQDbkg,25371
41
41
  data_designer/config/data_designer_config.py,sha256=D2b4Dl8pR6kCkvPoZ3APxC9pVBqXi5EJMVK1WBZ6ni8,1886
42
42
  data_designer/config/dataset_builders.py,sha256=1pNFy_pkQ5lJ6AVZ43AeTuSbz6yC_l7Ndcyp5yaT8hQ,327
43
- data_designer/config/datastore.py,sha256=gEHR2hYlJwD_vzjuaSOMRiYjtwdQhyO9q1afZDrhBCo,7586
44
43
  data_designer/config/default_model_settings.py,sha256=YqucXdOdXV6-J2jOc3gSSjbtfXDVbKfmG94neC2Ynaw,4457
45
- data_designer/config/errors.py,sha256=MNMnqh8G1XzXAMeJ5ju6zkBiIH2aVgyITnzYJbGEwFY,461
46
- data_designer/config/exports.py,sha256=jPGjuFIpfApwygiTVq7lOI271guJF1oaMwcoyycXV3w,4716
44
+ data_designer/config/errors.py,sha256=MvrZd7tLMaWKjnKHj_GHIap2mkb20O7x56H4c1hIyZI,519
45
+ data_designer/config/exports.py,sha256=OFh-iHtkjqWPOtjUc1esRRaAssL3XVImOTPp0yWWSW0,4715
47
46
  data_designer/config/interface.py,sha256=ery8a93pnCW1JPbgtiaRsMKSR8Q2o7rDmsZfVYbfkeE,1619
48
- data_designer/config/models.py,sha256=_uLOh2TutJV3Fq_8YyAi5E7G37j47j64zcrCYnzpjbo,15713
47
+ data_designer/config/models.py,sha256=EQCzN_pNIZ2enarKSs0Ry40oNzVFGr188jwplOQQGCI,15330
49
48
  data_designer/config/preview_results.py,sha256=bPRKX1enzNTY240ixc8jZVgO7EDHABZ1_er0TabhLZg,1345
50
49
  data_designer/config/processors.py,sha256=bA6SVF1kmAJSshmWseLE6HzlEBAsH9FtUtNJk0QzJtU,5987
50
+ data_designer/config/run_config.py,sha256=6CDHjcmkZmzP3NSlaeyVkLbRVKsb7ry3b0ms6T-A7rc,1418
51
51
  data_designer/config/sampler_constraints.py,sha256=Q8-JrwTD69AJy8cvs_-0yf4yOBGemLwLZNmk-8Y5wPk,1156
52
52
  data_designer/config/sampler_params.py,sha256=-MLNFDqattNWrHuWPYyGTe2YdbaGMH-JKTCzxq1ji3E,27838
53
- data_designer/config/seed.py,sha256=n4iHDBkUlwNJSXqDu6BqD6uZZeFtLu6x1hyyOhcG9zM,5503
53
+ data_designer/config/seed.py,sha256=FEi3O5eNMk3qLiAHNAKFyJ5ijAk0e0MaHcYpfo1pa9c,4621
54
+ data_designer/config/seed_source.py,sha256=X1Kj-Q6qxX2cnC3dlDUl-dAGuPecgnqI8CI6wzh3KkY,2598
54
55
  data_designer/config/validator_params.py,sha256=BSDNVZQvXB4hmhuX4EnJ89pR-1hdEfI_KWYO8POQlMk,3906
55
56
  data_designer/config/analysis/__init__.py,sha256=9eG4WHKyrJcNoK4GEz6BCw_E0Ewo9elQoDN4TLMbAog,137
56
57
  data_designer/config/analysis/column_profilers.py,sha256=PcFR6uhlJzz8VMYHYoYYMZ6Rj6zrk0RbjomSHDn9GDI,6215
@@ -59,20 +60,21 @@ data_designer/config/analysis/dataset_profiler.py,sha256=Omst6qNsUi9sdbV7c0kASK6
59
60
  data_designer/config/analysis/utils/errors.py,sha256=UbWKIo8uMNqGXJKf4XR6PbuDDZnBQ7qHyEdq8Oh97QY,290
60
61
  data_designer/config/analysis/utils/reporting.py,sha256=8tds-g6q1NLW8xHzT9OukgEWA1Y5qAnMq-BjP4u_Frk,7030
61
62
  data_designer/config/utils/code_lang.py,sha256=szSihIdVZsjgUOcbxndgI3P4GNN51DUeNlk5Z8nEYro,2349
62
- data_designer/config/utils/constants.py,sha256=MMb5-aJOHfsvfptCFxv2AyzOeryeDNsqoce9B-wCTJk,7880
63
+ data_designer/config/utils/constants.py,sha256=mNP1r3ucImsB743UFbjd-VFc0ZcSuqOQXRu4jVBR8FI,8884
63
64
  data_designer/config/utils/errors.py,sha256=X8_ghPqHKWN6qDMW0WhBoxFNr9MNygOEawq0oGigPD8,479
64
65
  data_designer/config/utils/info.py,sha256=AW8GnmxGX-LahQCDT9BPl8eqUz3ymNr647VHHj5gBHg,3428
65
- data_designer/config/utils/io_helpers.py,sha256=2jpR0F0n4m5qBJCA6GXqEEUDyh289H0AsIBkC6RZn2E,9173
66
+ data_designer/config/utils/io_helpers.py,sha256=xIN4FicjITOoNZB-nX5LnVr45-lvhsSYenlerXlq5_8,8331
66
67
  data_designer/config/utils/misc.py,sha256=qH2VgPYmr1FZTX_r8rXmtNTdKpCs0rg7aEos0w7ylNo,2467
67
68
  data_designer/config/utils/numerical_helpers.py,sha256=YujDvTOkm0FW3V5G6Ja_Etf7d8tJ71Rpj_BHRu5JlRY,797
68
69
  data_designer/config/utils/type_helpers.py,sha256=KOnxSYDHefadfOrlzRk-XixvwbZ31Ql7wOXAdQh0qoU,4018
69
- data_designer/config/utils/validation.py,sha256=VvQ2vYZKSp-nPSukGzKv9rXXkpr5Spz3fkIX8HC28lk,14440
70
70
  data_designer/config/utils/visualization.py,sha256=ql0rnRIF7AhkuyS6zE35FockYzCkDRamzFUs6J7g23I,18186
71
71
  data_designer/engine/__init__.py,sha256=9eG4WHKyrJcNoK4GEz6BCw_E0Ewo9elQoDN4TLMbAog,137
72
+ data_designer/engine/compiler.py,sha256=dQOCnjIU3SC6TrwpJVY8Nnv2Ql8-_aEAeY9TdewO6T0,2853
72
73
  data_designer/engine/configurable_task.py,sha256=nLUUzmVQ_RcX4l97wwROspFeYtVfdh0llxiau49d5Ic,3118
73
74
  data_designer/engine/errors.py,sha256=DUoKhQCSwIBoLSQGv7dstzO3DFGDRqW3MBoWnRPcm1I,1262
74
75
  data_designer/engine/model_provider.py,sha256=w_7EZpDKgOLdzmCWJ6v6oKdM0GdRijir3iK102bBtg8,2782
75
76
  data_designer/engine/secret_resolver.py,sha256=IyvLvx_me9oiLk6uaVddTnd01Pz7qeYsleOhtKtcm3A,2427
77
+ data_designer/engine/validation.py,sha256=VvQ2vYZKSp-nPSukGzKv9rXXkpr5Spz3fkIX8HC28lk,14440
76
78
  data_designer/engine/analysis/column_statistics.py,sha256=M_IxRwPeIaaN37UtPFMW09eXYQ-hEgu_NnLzlQj8AtY,5752
77
79
  data_designer/engine/analysis/dataset_profiler.py,sha256=jmK0VFKH6mVkRaAA-upvCFBtxVeT-02ss9X0gRInP0A,7238
78
80
  data_designer/engine/analysis/errors.py,sha256=VBeKREcPcInWhjAo3U2x_9UnJBi8zcGnUjLXNippPtA,255
@@ -82,31 +84,31 @@ data_designer/engine/analysis/column_profilers/registry.py,sha256=GpudnadaJxb8ub
82
84
  data_designer/engine/analysis/utils/column_statistics_calculations.py,sha256=B5g1j8f0z1AFAEUVXf3roLZUI--jEs01Tj9g9yREWxk,8851
83
85
  data_designer/engine/analysis/utils/judge_score_processing.py,sha256=rl11e3PxAOQPDSmK9G9kxOMUCJrZddQusUobDKZhIzw,4758
84
86
  data_designer/engine/column_generators/__init__.py,sha256=9eG4WHKyrJcNoK4GEz6BCw_E0Ewo9elQoDN4TLMbAog,137
85
- data_designer/engine/column_generators/registry.py,sha256=Eg6tqNM7mmEPNom1fWF9S5D3qABpMennOHGEGePwJN0,3060
87
+ data_designer/engine/column_generators/registry.py,sha256=qiC7B1MvcEgzfPtWBPHqF9vNpWGHpUg9YekR-lAZJl0,3060
86
88
  data_designer/engine/column_generators/generators/__init__.py,sha256=9eG4WHKyrJcNoK4GEz6BCw_E0Ewo9elQoDN4TLMbAog,137
87
89
  data_designer/engine/column_generators/generators/base.py,sha256=zurwtamM2l3shLa4SLjUOE0zOTDozQ5wPGAvDkrNYqE,3231
88
90
  data_designer/engine/column_generators/generators/embedding.py,sha256=xYnFWRJ2W7JuwK8CRIUhv4QiT_DCGDuQkuHFKXCxrow,1724
89
91
  data_designer/engine/column_generators/generators/expression.py,sha256=7xniEj8aPscWDYLrnNbG2mF3s08C7aR8ZgNUCzr_x8g,2539
90
92
  data_designer/engine/column_generators/generators/llm_completion.py,sha256=XqpXzYczbZ6efUIVuvcm2O_mXBnXCMAvcjeyaB5dIFA,5301
91
93
  data_designer/engine/column_generators/generators/samplers.py,sha256=YHoTWi8Wo9TyR-98I-rOHJUHOIJXxvbil-PrhNKMWxQ,3579
92
- data_designer/engine/column_generators/generators/seed_dataset.py,sha256=QUegAT55AxyBHY5VhAtJKv9BRgGJ2jxN0Yff7YvkLDI,7018
93
- data_designer/engine/column_generators/generators/validation.py,sha256=MbDFXzieftv6-77rRdltNUnquUe5FxCVkBEHsAwvwh4,6591
94
+ data_designer/engine/column_generators/generators/seed_dataset.py,sha256=6fG1ybsil7kqKfEL8AXS7L96tdloWbh8X9nYzf803AY,6996
95
+ data_designer/engine/column_generators/generators/validation.py,sha256=QxDE4HIBLtZ-_12eYwJxPXOaYo81vCDifM5YHdk7tow,6772
94
96
  data_designer/engine/column_generators/utils/errors.py,sha256=ugNwaqnPdrPZI7YnKLbYwFjYUSm0WAzgaVu_u6i5Rc8,365
95
97
  data_designer/engine/column_generators/utils/judge_score_factory.py,sha256=umo8-iMWbvkAztWkB5m_pU1cY1eBpR5L2gHt_fuZPD4,2100
96
98
  data_designer/engine/column_generators/utils/prompt_renderer.py,sha256=d4tbyPsgmFDikW3nxL5is9RNaajMkoPDCrfkQkxw7rc,4760
97
99
  data_designer/engine/dataset_builders/artifact_storage.py,sha256=mVCqcW8shylofi_pjYEeHUa9Mo-tjIcl4nR8D8oy2bw,8420
98
- data_designer/engine/dataset_builders/column_wise_builder.py,sha256=h6R6YfU2sfhxewIyTaLdcgSI6FpfIouyc1qdfnWfUZ0,14801
100
+ data_designer/engine/dataset_builders/column_wise_builder.py,sha256=UDZce0nxh6t3d6PgVlJdOJl0T7tseAdGocLHeGdJP5I,14983
99
101
  data_designer/engine/dataset_builders/errors.py,sha256=1kChleChG4rASWIiL4Bel6Ox6aFZjQUrh5ogPt1CDWo,359
100
102
  data_designer/engine/dataset_builders/multi_column_configs.py,sha256=t28fhI-WRIBohFnAJ80l5EAETEDB5rJ5RSWInMiRfyE,1619
101
103
  data_designer/engine/dataset_builders/utils/__init__.py,sha256=9eG4WHKyrJcNoK4GEz6BCw_E0Ewo9elQoDN4TLMbAog,137
102
104
  data_designer/engine/dataset_builders/utils/concurrency.py,sha256=Qvf5H-FdUCikIocOry_E3jkSC7qVZQjAXOY9dxdtMGg,7336
103
- data_designer/engine/dataset_builders/utils/config_compiler.py,sha256=verC3CBA0MjuTQN32RBX10fFvVOefG-DnPDF5Ql2hjg,2402
105
+ data_designer/engine/dataset_builders/utils/config_compiler.py,sha256=e_Y9WdcHE-H-fGwm-4Qgs2FlD6qO08Gw2SLafUtoqTE,2400
104
106
  data_designer/engine/dataset_builders/utils/dag.py,sha256=8h7jEu0XiYGSKHIe4CGFi6SC9HGyAgvkD23ZECNWDC0,2388
105
107
  data_designer/engine/dataset_builders/utils/dataset_batch_manager.py,sha256=xLN_cAu5xb4V-RjSPezNivOkCAigA3-qNfv_kAWRAHs,7769
106
108
  data_designer/engine/dataset_builders/utils/errors.py,sha256=qW_TFOKNVODbb8bYrUlbqMAkheDAg12DDo3RmAhHrCg,370
107
109
  data_designer/engine/models/__init__.py,sha256=9eG4WHKyrJcNoK4GEz6BCw_E0Ewo9elQoDN4TLMbAog,137
108
110
  data_designer/engine/models/errors.py,sha256=AQlZ-cf0IqFW-e-K9HZFH3YhXXOTLLI2eZCXr7_58Yk,12174
109
- data_designer/engine/models/facade.py,sha256=sqTSqW7jQ1vbRc1fCOoKuhb7vTVil5Z8RqN_NBp6exY,12410
111
+ data_designer/engine/models/facade.py,sha256=rONeAGTRwvoYBN3Dyq3Xe012NNOpVQj-pj3N3deGpsU,12528
110
112
  data_designer/engine/models/litellm_overrides.py,sha256=FpCztZQrHYGrVweLR0_NAqxVRs-UXmpkMPdFWqciu84,5539
111
113
  data_designer/engine/models/registry.py,sha256=-TbGhvs8WRq6f7z6cH_DDdo7uhs4Hb5qkJce_Y4UBWM,6840
112
114
  data_designer/engine/models/telemetry.py,sha256=3g4jDz8xxOOkPtIYit94c4D4mGUwgfiCDaDdnbTLhFQ,12407
@@ -130,7 +132,7 @@ data_designer/engine/processing/gsonschema/__init__.py,sha256=9eG4WHKyrJcNoK4GEz
130
132
  data_designer/engine/processing/gsonschema/exceptions.py,sha256=IoMlQE-eRJcBUlzKnkCCBSVSlGjsoYZSE0OVwcikxlI,281
131
133
  data_designer/engine/processing/gsonschema/schema_transformers.py,sha256=__-dfrCFxDs5-XcTzi1Z-FZL9z0eWUS7Zppr32OxgiY,3066
132
134
  data_designer/engine/processing/gsonschema/types.py,sha256=-x_K2HrVnZ_Z7fzYl4T2Gd7QHf6B6ADvn7E7iYvw5Kc,313
133
- data_designer/engine/processing/gsonschema/validators.py,sha256=5Jh864KnA5gWBeLbpz1cE5Kk_GMxI6kPWvunAbLI3vI,4704
135
+ data_designer/engine/processing/gsonschema/validators.py,sha256=jmY51MJC0kfBExAXGED9Uqc9sxT47_sZ1K3X7OuvCXM,6848
134
136
  data_designer/engine/processing/processors/base.py,sha256=WJl7_0dtiUppjfY-lrQ3lDiIgYqRDSEYUwSAQNN7nFE,548
135
137
  data_designer/engine/processing/processors/drop_columns.py,sha256=MIb_CVrpoM3kyN5-8dHZrdFAAUiCCWgDEyQjAk8nZqE,2060
136
138
  data_designer/engine/processing/processors/registry.py,sha256=nhB1O4b0wSUkWQeleV9l1MykwZD-dSvY0ydqmSscEY8,1056
@@ -141,8 +143,8 @@ data_designer/engine/registry/errors.py,sha256=nO794QVy4DovKGKWEjycVDN9cdDlH-skb
141
143
  data_designer/engine/resources/managed_dataset_generator.py,sha256=KXrWdgod-NFaCZvmWSwoJKp2daQgqf8XBIVXvrk6fHI,1369
142
144
  data_designer/engine/resources/managed_dataset_repository.py,sha256=IzolKh2n_8BLd0XRuWEZhddYpXqc9YYpVUFC7DH7Qz8,7547
143
145
  data_designer/engine/resources/managed_storage.py,sha256=BOdOtUChfNJL6S2Cxw_hrQF28ZnJvObtTIVNkQMUgX0,2079
144
- data_designer/engine/resources/resource_provider.py,sha256=CbB2D538ECGkvyHF1V63_TDn-wStCoklV7bF0y4mabY,1859
145
- data_designer/engine/resources/seed_dataset_data_store.py,sha256=dM2HgfyUgbF7MidN8dn5S-LAR0GVPJfjqXpDPTP2XoA,3035
146
+ data_designer/engine/resources/resource_provider.py,sha256=guDKOGenQfi2VHH6ZevVbGwKmG9nTtNaW2TEWybhcw8,2318
147
+ data_designer/engine/resources/seed_reader.py,sha256=-UgqYH_iAaMrw73y09p2rskTiu3q9-XhNKJoTM3KlRo,5591
146
148
  data_designer/engine/sampling_gen/column.py,sha256=gDIPth7vK2797rGtLhf_kVGMAC-khefKHodeeDoqV-I,3946
147
149
  data_designer/engine/sampling_gen/constraints.py,sha256=e-gLItnSobUR8eSfQdkXkmvrBYLScpBcE97Xd0H2wV8,3004
148
150
  data_designer/engine/sampling_gen/errors.py,sha256=UBZBtosD07EisCdeo8r-Uq4h0QL3tYS1qwtEmca8_jM,828
@@ -170,17 +172,20 @@ data_designer/engine/validators/local_callable.py,sha256=oCUXj_NRt0gVqUIh0fLrvw-
170
172
  data_designer/engine/validators/python.py,sha256=jAp1u8yLjqfebh60xGapkHVjMz58WHB0QjfMc2zQCaY,7894
171
173
  data_designer/engine/validators/remote.py,sha256=jtDIvWzfHh17m2ac_Fp93p49Th8RlkBzzih2jiqD7gk,2929
172
174
  data_designer/engine/validators/sql.py,sha256=bxbyxPxDT9yuwjhABVEY40iR1pzWRFi65WU4tPgG2bE,2250
173
- data_designer/essentials/__init__.py,sha256=eHuZFJTmeRf_b6KQZ2vZeqy1afJ7y7RMTm7q4Jrg58s,1012
175
+ data_designer/essentials/__init__.py,sha256=viyPs-sIVMuHuFpWhZVKtwRsDQFMsmtw6B7veK_No_I,1094
174
176
  data_designer/interface/__init__.py,sha256=9eG4WHKyrJcNoK4GEz6BCw_E0Ewo9elQoDN4TLMbAog,137
175
- data_designer/interface/data_designer.py,sha256=O6PehBIdL4_2d9rFW86J9b3jfJ_CJmFId8T2AviM2zM,16844
177
+ data_designer/interface/data_designer.py,sha256=jiPI0vbzTAT4Oug4W4RCBS69bCQOfCE9awfClwBeDnI,17175
176
178
  data_designer/interface/errors.py,sha256=jagKT3tPUnYq4e3e6AkTnBkcayHyEfxjPMBzx-GEKe4,565
177
179
  data_designer/interface/results.py,sha256=zYVX589OUyFuB-8XLmjjdKk3hCDNKu189sH-gOOFreQ,3511
178
180
  data_designer/plugins/__init__.py,sha256=c_V7q4QhfVoNf_uc9UwmXCsWqwtyWogI7YoN_0PzzE4,234
179
- data_designer/plugins/errors.py,sha256=yPIHpSddEr-o9ZcNVibb2hI-73O15Kg_Od8SlmQlnRs,297
180
- data_designer/plugins/plugin.py,sha256=a2KfoCNhYa8U0uQrPSBWfuyjXOb5WeITzFRpEdZFo6s,2516
181
+ data_designer/plugins/errors.py,sha256=nljerskefztjy84UARpw3ogm4GI8CmJCi4FrfaVJI2w,345
182
+ data_designer/plugins/plugin.py,sha256=n4nlFHZZ0INY7vFJylTGoh1ij9k3TSVw-15OGSdROGE,5355
181
183
  data_designer/plugins/registry.py,sha256=c0X03TnA_J60RWpxaVJEmtIXKvA9up-LznrUHXDcYxg,3012
182
- data_designer-0.2.3.dist-info/METADATA,sha256=JUXSRI21S_Sp_Jwh060AHSCvqEERjHXG9sNhwxJUfSU,7636
183
- data_designer-0.2.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
184
- data_designer-0.2.3.dist-info/entry_points.txt,sha256=NWWWidyDxN6CYX6y664PhBYMhbaYTQTyprqfYAgkyCg,57
185
- data_designer-0.2.3.dist-info/licenses/LICENSE,sha256=cSWJDwVqHyQgly8Zmt3pqXJ2eQbZVYwN9qd0NMssxXY,11336
186
- data_designer-0.2.3.dist-info/RECORD,,
184
+ data_designer/plugins/testing/__init__.py,sha256=fyFqJcdyYcU5nj55RjqD6NjkXl4nhNZw5Bd6-sLnnjQ,255
185
+ data_designer/plugins/testing/stubs.py,sha256=IqriaMWtQbhPSoWAkbcKsTga-Xi5rTZBlW70IC0cpd0,4699
186
+ data_designer/plugins/testing/utils.py,sha256=2Pk0BJ7tx6gKLCoJr72EqpqfTWzHnlk_Lkwi89aDQQs,578
187
+ data_designer-0.3.1.dist-info/METADATA,sha256=WRtWyrBDkHsWSp50epjKL_3s8kzWK8_qK45lxYqBKWU,7652
188
+ data_designer-0.3.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
189
+ data_designer-0.3.1.dist-info/entry_points.txt,sha256=NWWWidyDxN6CYX6y664PhBYMhbaYTQTyprqfYAgkyCg,57
190
+ data_designer-0.3.1.dist-info/licenses/LICENSE,sha256=cSWJDwVqHyQgly8Zmt3pqXJ2eQbZVYwN9qd0NMssxXY,11336
191
+ data_designer-0.3.1.dist-info/RECORD,,
@@ -1,187 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
- from __future__ import annotations
5
-
6
- import logging
7
- from pathlib import Path
8
- from typing import TYPE_CHECKING
9
-
10
- import pandas as pd
11
- import pyarrow.parquet as pq
12
- from huggingface_hub import HfApi, HfFileSystem
13
- from pydantic import BaseModel, Field
14
-
15
- from data_designer.config.errors import InvalidConfigError, InvalidFileFormatError, InvalidFilePathError
16
- from data_designer.config.utils.io_helpers import VALID_DATASET_FILE_EXTENSIONS, validate_path_contains_files_of_type
17
-
18
- if TYPE_CHECKING:
19
- from data_designer.config.seed import SeedDatasetReference
20
-
21
- logger = logging.getLogger(__name__)
22
-
23
-
24
- class DatastoreSettings(BaseModel):
25
- """Configuration for interacting with a datastore."""
26
-
27
- endpoint: str = Field(
28
- ...,
29
- description="Datastore endpoint. Use 'https://huggingface.co' for the Hugging Face Hub.",
30
- )
31
- token: str | None = Field(default=None, description="If needed, token to use for authentication.")
32
-
33
-
34
- def get_file_column_names(file_reference: str | Path | HfFileSystem, file_type: str) -> list[str]:
35
- """Get column names from a dataset file.
36
-
37
- Args:
38
- file_reference: Path to the dataset file, or an HfFileSystem object.
39
- file_type: Type of the dataset file. Must be one of: 'parquet', 'json', 'jsonl', 'csv'.
40
-
41
- Raises:
42
- InvalidFilePathError: If the file type is not supported.
43
-
44
- Returns:
45
- List of column names.
46
- """
47
- if file_type == "parquet":
48
- try:
49
- schema = pq.read_schema(file_reference)
50
- if hasattr(schema, "names"):
51
- return schema.names
52
- else:
53
- return [field.name for field in schema]
54
- except Exception as e:
55
- logger.warning(f"Failed to process parquet file {file_reference}: {e}")
56
- return []
57
- elif file_type in ["json", "jsonl"]:
58
- return pd.read_json(file_reference, orient="records", lines=True, nrows=1).columns.tolist()
59
- elif file_type == "csv":
60
- try:
61
- df = pd.read_csv(file_reference, nrows=1)
62
- return df.columns.tolist()
63
- except (pd.errors.EmptyDataError, pd.errors.ParserError) as e:
64
- logger.warning(f"Failed to process CSV file {file_reference}: {e}")
65
- return []
66
- else:
67
- raise InvalidFilePathError(f"🛑 Unsupported file type: {file_type!r}")
68
-
69
-
70
- def fetch_seed_dataset_column_names(seed_dataset_reference: SeedDatasetReference) -> list[str]:
71
- if hasattr(seed_dataset_reference, "datastore_settings"):
72
- return fetch_seed_dataset_column_names_from_datastore(
73
- seed_dataset_reference.repo_id,
74
- seed_dataset_reference.filename,
75
- seed_dataset_reference.datastore_settings,
76
- )
77
- return fetch_seed_dataset_column_names_from_local_file(seed_dataset_reference.dataset)
78
-
79
-
80
- def fetch_seed_dataset_column_names_from_datastore(
81
- repo_id: str,
82
- filename: str,
83
- datastore_settings: DatastoreSettings | dict | None = None,
84
- ) -> list[str]:
85
- file_type = filename.split(".")[-1]
86
- if f".{file_type}" not in VALID_DATASET_FILE_EXTENSIONS:
87
- raise InvalidFileFormatError(f"🛑 Unsupported file type: {filename!r}")
88
-
89
- datastore_settings = resolve_datastore_settings(datastore_settings)
90
- fs = HfFileSystem(endpoint=datastore_settings.endpoint, token=datastore_settings.token, skip_instance_cache=True)
91
-
92
- file_path = _extract_single_file_path_from_glob_pattern_if_present(f"datasets/{repo_id}/{filename}", fs=fs)
93
-
94
- with fs.open(file_path) as f:
95
- return get_file_column_names(f, file_type)
96
-
97
-
98
- def fetch_seed_dataset_column_names_from_local_file(dataset_path: str | Path) -> list[str]:
99
- dataset_path = _validate_dataset_path(dataset_path, allow_glob_pattern=True)
100
- dataset_path = _extract_single_file_path_from_glob_pattern_if_present(dataset_path)
101
- return get_file_column_names(dataset_path, str(dataset_path).split(".")[-1])
102
-
103
-
104
- def resolve_datastore_settings(datastore_settings: DatastoreSettings | dict | None) -> DatastoreSettings:
105
- if datastore_settings is None:
106
- raise InvalidConfigError("🛑 Datastore settings are required in order to upload datasets to the datastore.")
107
- if isinstance(datastore_settings, DatastoreSettings):
108
- return datastore_settings
109
- elif isinstance(datastore_settings, dict):
110
- return DatastoreSettings.model_validate(datastore_settings)
111
- else:
112
- raise InvalidConfigError(
113
- "🛑 Invalid datastore settings format. Must be DatastoreSettings object or dictionary."
114
- )
115
-
116
-
117
- def upload_to_hf_hub(
118
- dataset_path: str | Path,
119
- filename: str,
120
- repo_id: str,
121
- datastore_settings: DatastoreSettings,
122
- **kwargs,
123
- ) -> str:
124
- datastore_settings = resolve_datastore_settings(datastore_settings)
125
- dataset_path = _validate_dataset_path(dataset_path)
126
- filename_ext = filename.split(".")[-1].lower()
127
- if dataset_path.suffix.lower()[1:] != filename_ext:
128
- raise InvalidFileFormatError(
129
- f"🛑 Dataset file extension {dataset_path.suffix!r} does not match `filename` extension .{filename_ext!r}"
130
- )
131
-
132
- hfapi = HfApi(endpoint=datastore_settings.endpoint, token=datastore_settings.token)
133
- hfapi.create_repo(repo_id, exist_ok=True, repo_type="dataset")
134
- hfapi.upload_file(
135
- path_or_fileobj=dataset_path,
136
- path_in_repo=filename,
137
- repo_id=repo_id,
138
- repo_type="dataset",
139
- **kwargs,
140
- )
141
- return f"{repo_id}/{filename}"
142
-
143
-
144
- def _extract_single_file_path_from_glob_pattern_if_present(
145
- file_path: str | Path,
146
- fs: HfFileSystem | None = None,
147
- ) -> Path:
148
- file_path = Path(file_path)
149
-
150
- # no glob pattern
151
- if "*" not in str(file_path):
152
- return file_path
153
-
154
- # glob pattern with HfFileSystem
155
- if fs is not None:
156
- file_to_check = None
157
- file_extension = file_path.name.split(".")[-1]
158
- for file in fs.ls(str(file_path.parent)):
159
- filename = file["name"]
160
- if filename.endswith(f".{file_extension}"):
161
- file_to_check = filename
162
- if file_to_check is None:
163
- raise InvalidFilePathError(f"🛑 No files found matching pattern: {str(file_path)!r}")
164
- logger.debug(f"Using the first matching file in {str(file_path)!r} to determine column names in seed dataset")
165
- return Path(file_to_check)
166
-
167
- # glob pattern with local file system
168
- if not (matching_files := sorted(file_path.parent.glob(file_path.name))):
169
- raise InvalidFilePathError(f"🛑 No files found matching pattern: {str(file_path)!r}")
170
- logger.debug(f"Using the first matching file in {str(file_path)!r} to determine column names in seed dataset")
171
- return matching_files[0]
172
-
173
-
174
- def _validate_dataset_path(dataset_path: str | Path, allow_glob_pattern: bool = False) -> Path:
175
- if allow_glob_pattern and "*" in str(dataset_path):
176
- parts = str(dataset_path).split("*.")
177
- file_path = parts[0]
178
- file_extension = parts[-1]
179
- validate_path_contains_files_of_type(file_path, file_extension)
180
- return Path(dataset_path)
181
- if not Path(dataset_path).is_file():
182
- raise InvalidFilePathError("🛑 To upload a dataset to the datastore, you must provide a valid file path.")
183
- if not Path(dataset_path).name.endswith(tuple(VALID_DATASET_FILE_EXTENSIONS)):
184
- raise InvalidFileFormatError(
185
- "🛑 Dataset files must be in `parquet`, `csv`, or `json` (orient='records', lines=True) format."
186
- )
187
- return Path(dataset_path)
@@ -1,84 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
- from abc import ABC, abstractmethod
5
-
6
- import duckdb
7
- from huggingface_hub import HfApi, HfFileSystem
8
-
9
- from data_designer.logging import quiet_noisy_logger
10
-
11
- quiet_noisy_logger("httpx")
12
-
13
- _HF_DATASETS_PREFIX = "hf://datasets/"
14
-
15
-
16
- class MalformedFileIdError(Exception):
17
- """Raised when file_id format is invalid."""
18
-
19
-
20
- class SeedDatasetDataStore(ABC):
21
- """Abstract base class for dataset storage implementations."""
22
-
23
- @abstractmethod
24
- def create_duckdb_connection(self) -> duckdb.DuckDBPyConnection: ...
25
-
26
- @abstractmethod
27
- def get_dataset_uri(self, file_id: str) -> str: ...
28
-
29
-
30
- class LocalSeedDatasetDataStore(SeedDatasetDataStore):
31
- """Local filesystem-based dataset storage."""
32
-
33
- def create_duckdb_connection(self) -> duckdb.DuckDBPyConnection:
34
- return duckdb.connect()
35
-
36
- def get_dataset_uri(self, file_id: str) -> str:
37
- return file_id
38
-
39
-
40
- class HfHubSeedDatasetDataStore(SeedDatasetDataStore):
41
- """Hugging Face and Data Store dataset storage."""
42
-
43
- def __init__(self, endpoint: str, token: str | None):
44
- self.hfapi = HfApi(endpoint=endpoint, token=token)
45
- self.endpoint = endpoint
46
- self.token = token
47
-
48
- def create_duckdb_connection(self) -> duckdb.DuckDBPyConnection:
49
- """Create a DuckDB connection with a fresh HfFileSystem registered.
50
-
51
- Creates a new HfFileSystem instance for each connection to ensure file metadata
52
- is fetched fresh from the datastore, avoiding cache-related issues when reading
53
- recently updated parquet files.
54
-
55
- Returns:
56
- A DuckDB connection with the HfFileSystem registered for hf:// URI support.
57
- """
58
- # Use skip_instance_cache to avoid fsspec-level caching
59
- hffs = HfFileSystem(endpoint=self.endpoint, token=self.token, skip_instance_cache=True)
60
-
61
- # Clear all internal caches to avoid stale metadata issues
62
- # HfFileSystem caches file metadata (size, etc.) which can become stale when files are re-uploaded
63
- if hasattr(hffs, "dircache"):
64
- hffs.dircache.clear()
65
-
66
- conn = duckdb.connect()
67
- conn.register_filesystem(hffs)
68
- return conn
69
-
70
- def get_dataset_uri(self, file_id: str) -> str:
71
- identifier = file_id.removeprefix(_HF_DATASETS_PREFIX)
72
- repo_id, filename = self._get_repo_id_and_filename(identifier)
73
- return f"{_HF_DATASETS_PREFIX}{repo_id}/{filename}"
74
-
75
- def _get_repo_id_and_filename(self, identifier: str) -> tuple[str, str]:
76
- """Extract repo_id and filename from identifier."""
77
- parts = identifier.split("/", 2)
78
- if len(parts) < 3:
79
- raise MalformedFileIdError(
80
- "Could not extract repo id and filename from file_id, "
81
- "expected 'hf://datasets/{repo-namespace}/{repo-name}/{filename}'"
82
- )
83
- repo_ns, repo_name, filename = parts
84
- return f"{repo_ns}/{repo_name}", filename
File without changes