data-designer 0.2.3__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_designer/_version.py +2 -2
- data_designer/cli/forms/model_builder.py +2 -2
- data_designer/config/config_builder.py +30 -113
- data_designer/config/errors.py +3 -0
- data_designer/config/exports.py +8 -6
- data_designer/config/models.py +7 -18
- data_designer/config/run_config.py +34 -0
- data_designer/config/seed.py +16 -46
- data_designer/config/seed_source.py +73 -0
- data_designer/config/utils/constants.py +27 -2
- data_designer/config/utils/io_helpers.py +0 -20
- data_designer/engine/column_generators/generators/seed_dataset.py +5 -5
- data_designer/engine/column_generators/generators/validation.py +3 -0
- data_designer/engine/column_generators/registry.py +1 -1
- data_designer/engine/compiler.py +69 -0
- data_designer/engine/dataset_builders/column_wise_builder.py +3 -0
- data_designer/engine/dataset_builders/utils/config_compiler.py +1 -1
- data_designer/engine/models/facade.py +2 -0
- data_designer/engine/processing/gsonschema/validators.py +55 -0
- data_designer/engine/resources/resource_provider.py +17 -5
- data_designer/engine/resources/seed_reader.py +149 -0
- data_designer/essentials/__init__.py +2 -0
- data_designer/interface/data_designer.py +72 -62
- data_designer/plugin_manager.py +1 -1
- data_designer/plugins/errors.py +3 -0
- data_designer/plugins/plugin.py +82 -12
- data_designer/plugins/testing/__init__.py +8 -0
- data_designer/plugins/testing/stubs.py +145 -0
- data_designer/plugins/testing/utils.py +11 -0
- {data_designer-0.2.3.dist-info → data_designer-0.3.0.dist-info}/METADATA +3 -3
- {data_designer-0.2.3.dist-info → data_designer-0.3.0.dist-info}/RECORD +35 -30
- data_designer/config/datastore.py +0 -187
- data_designer/engine/resources/seed_dataset_data_store.py +0 -84
- /data_designer/{config/utils → engine}/validation.py +0 -0
- {data_designer-0.2.3.dist-info → data_designer-0.3.0.dist-info}/WHEEL +0 -0
- {data_designer-0.2.3.dist-info → data_designer-0.3.0.dist-info}/entry_points.txt +0 -0
- {data_designer-0.2.3.dist-info → data_designer-0.3.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
data_designer/__init__.py,sha256=iCeqRnb640RrL2QpA630GY5Ng7JiDt83Vq0DwLnNugU,461
|
|
2
|
-
data_designer/_version.py,sha256=
|
|
2
|
+
data_designer/_version.py,sha256=5zTqm8rgXsWYBpB2M3Zw_K1D-aV8wP7NsBLrmMKkrAQ,704
|
|
3
3
|
data_designer/errors.py,sha256=Z4eN9XwzZvGRdBluSNoSqQYkPPzNQIDf0ET_OqWRZh8,179
|
|
4
4
|
data_designer/logging.py,sha256=ZsruJ0tx_4NK0PIMyxCZJJ0wJugoDff9UP3PbsdEDxQ,5341
|
|
5
|
-
data_designer/plugin_manager.py,sha256=
|
|
5
|
+
data_designer/plugin_manager.py,sha256=NCYhBtMqbIGSMZwTY2F2NkkkdgBGPqnxZCmeiQzEH80,2604
|
|
6
6
|
data_designer/cli/README.md,sha256=uPE3KdlF5Y3H8pQc8c6ZZ3h6YSFXNQW-iEXGQJuVnI4,9026
|
|
7
7
|
data_designer/cli/__init__.py,sha256=kTfolrDol0izniNPXtuaUJ_oXRfJ-jGUPuVR5IwibEM,210
|
|
8
8
|
data_designer/cli/main.py,sha256=v_vjyHbEF0n0Np-jc5KigtNFZflHR7tAnnea5mDzXHI,1933
|
|
@@ -22,7 +22,7 @@ data_designer/cli/forms/__init__.py,sha256=BGLbNJCHCgYiQWoAdTbUjzqgVlJymTQOV8sNW
|
|
|
22
22
|
data_designer/cli/forms/builder.py,sha256=QMCutZb7l3DeL4nXFGCUaiS1bxBu1BdaBWwlb1rmiIE,1690
|
|
23
23
|
data_designer/cli/forms/field.py,sha256=8EYfaqxSynyhPreFOa9JGnsTkeSXxwpRzV8xJb98FGg,7502
|
|
24
24
|
data_designer/cli/forms/form.py,sha256=f6_LdSlk4kddB9a4mGotA-VlR2mlXAU_9RtLkbliI38,2025
|
|
25
|
-
data_designer/cli/forms/model_builder.py,sha256=
|
|
25
|
+
data_designer/cli/forms/model_builder.py,sha256=voBv7_e0M3CwXxwuv75OSG0_CJQ0thkVVA8G7eCK798,13313
|
|
26
26
|
data_designer/cli/forms/provider_builder.py,sha256=xphQlNlnfDLm0XwqbPC6SJ3wXwlU45xVo_35Pe1EBdU,2895
|
|
27
27
|
data_designer/cli/repositories/__init__.py,sha256=RBOWAkIOOpr-L-kVz-PDIPmMXdlGLCinxkwvKS6bAB4,434
|
|
28
28
|
data_designer/cli/repositories/base.py,sha256=LQ0i_KrTdhS5o48qphlr4uWoAVrz02Lf_ZH1JIhcuBQ,1054
|
|
@@ -37,20 +37,21 @@ data_designer/config/__init__.py,sha256=9eG4WHKyrJcNoK4GEz6BCw_E0Ewo9elQoDN4TLMb
|
|
|
37
37
|
data_designer/config/base.py,sha256=ypam9XX6dg8Q_55su84WmVExNXsmt5jb3eeW3JLlHwc,2396
|
|
38
38
|
data_designer/config/column_configs.py,sha256=pjpy5z3Kk7i4WmIjOxdiW5Awpjy5CxQSy0YMy0QxtvA,18961
|
|
39
39
|
data_designer/config/column_types.py,sha256=EILVM42d4TMl2xbSj5htMsenJwybCHIc_G8AUXyrjWU,7197
|
|
40
|
-
data_designer/config/config_builder.py,sha256=
|
|
40
|
+
data_designer/config/config_builder.py,sha256=eO7iXO4WeF3h4iJdH203ZFHET9Vj4XAgoIPEJpQDbkg,25371
|
|
41
41
|
data_designer/config/data_designer_config.py,sha256=D2b4Dl8pR6kCkvPoZ3APxC9pVBqXi5EJMVK1WBZ6ni8,1886
|
|
42
42
|
data_designer/config/dataset_builders.py,sha256=1pNFy_pkQ5lJ6AVZ43AeTuSbz6yC_l7Ndcyp5yaT8hQ,327
|
|
43
|
-
data_designer/config/datastore.py,sha256=gEHR2hYlJwD_vzjuaSOMRiYjtwdQhyO9q1afZDrhBCo,7586
|
|
44
43
|
data_designer/config/default_model_settings.py,sha256=YqucXdOdXV6-J2jOc3gSSjbtfXDVbKfmG94neC2Ynaw,4457
|
|
45
|
-
data_designer/config/errors.py,sha256=
|
|
46
|
-
data_designer/config/exports.py,sha256=
|
|
44
|
+
data_designer/config/errors.py,sha256=MvrZd7tLMaWKjnKHj_GHIap2mkb20O7x56H4c1hIyZI,519
|
|
45
|
+
data_designer/config/exports.py,sha256=OFh-iHtkjqWPOtjUc1esRRaAssL3XVImOTPp0yWWSW0,4715
|
|
47
46
|
data_designer/config/interface.py,sha256=ery8a93pnCW1JPbgtiaRsMKSR8Q2o7rDmsZfVYbfkeE,1619
|
|
48
|
-
data_designer/config/models.py,sha256=
|
|
47
|
+
data_designer/config/models.py,sha256=EQCzN_pNIZ2enarKSs0Ry40oNzVFGr188jwplOQQGCI,15330
|
|
49
48
|
data_designer/config/preview_results.py,sha256=bPRKX1enzNTY240ixc8jZVgO7EDHABZ1_er0TabhLZg,1345
|
|
50
49
|
data_designer/config/processors.py,sha256=bA6SVF1kmAJSshmWseLE6HzlEBAsH9FtUtNJk0QzJtU,5987
|
|
50
|
+
data_designer/config/run_config.py,sha256=6CDHjcmkZmzP3NSlaeyVkLbRVKsb7ry3b0ms6T-A7rc,1418
|
|
51
51
|
data_designer/config/sampler_constraints.py,sha256=Q8-JrwTD69AJy8cvs_-0yf4yOBGemLwLZNmk-8Y5wPk,1156
|
|
52
52
|
data_designer/config/sampler_params.py,sha256=-MLNFDqattNWrHuWPYyGTe2YdbaGMH-JKTCzxq1ji3E,27838
|
|
53
|
-
data_designer/config/seed.py,sha256=
|
|
53
|
+
data_designer/config/seed.py,sha256=FEi3O5eNMk3qLiAHNAKFyJ5ijAk0e0MaHcYpfo1pa9c,4621
|
|
54
|
+
data_designer/config/seed_source.py,sha256=ajdlhpDSnXJZYRMdP-MpBc5q0hKE_niMdo75FhHyvp8,2240
|
|
54
55
|
data_designer/config/validator_params.py,sha256=BSDNVZQvXB4hmhuX4EnJ89pR-1hdEfI_KWYO8POQlMk,3906
|
|
55
56
|
data_designer/config/analysis/__init__.py,sha256=9eG4WHKyrJcNoK4GEz6BCw_E0Ewo9elQoDN4TLMbAog,137
|
|
56
57
|
data_designer/config/analysis/column_profilers.py,sha256=PcFR6uhlJzz8VMYHYoYYMZ6Rj6zrk0RbjomSHDn9GDI,6215
|
|
@@ -59,20 +60,21 @@ data_designer/config/analysis/dataset_profiler.py,sha256=Omst6qNsUi9sdbV7c0kASK6
|
|
|
59
60
|
data_designer/config/analysis/utils/errors.py,sha256=UbWKIo8uMNqGXJKf4XR6PbuDDZnBQ7qHyEdq8Oh97QY,290
|
|
60
61
|
data_designer/config/analysis/utils/reporting.py,sha256=8tds-g6q1NLW8xHzT9OukgEWA1Y5qAnMq-BjP4u_Frk,7030
|
|
61
62
|
data_designer/config/utils/code_lang.py,sha256=szSihIdVZsjgUOcbxndgI3P4GNN51DUeNlk5Z8nEYro,2349
|
|
62
|
-
data_designer/config/utils/constants.py,sha256=
|
|
63
|
+
data_designer/config/utils/constants.py,sha256=mNP1r3ucImsB743UFbjd-VFc0ZcSuqOQXRu4jVBR8FI,8884
|
|
63
64
|
data_designer/config/utils/errors.py,sha256=X8_ghPqHKWN6qDMW0WhBoxFNr9MNygOEawq0oGigPD8,479
|
|
64
65
|
data_designer/config/utils/info.py,sha256=AW8GnmxGX-LahQCDT9BPl8eqUz3ymNr647VHHj5gBHg,3428
|
|
65
|
-
data_designer/config/utils/io_helpers.py,sha256=
|
|
66
|
+
data_designer/config/utils/io_helpers.py,sha256=xIN4FicjITOoNZB-nX5LnVr45-lvhsSYenlerXlq5_8,8331
|
|
66
67
|
data_designer/config/utils/misc.py,sha256=qH2VgPYmr1FZTX_r8rXmtNTdKpCs0rg7aEos0w7ylNo,2467
|
|
67
68
|
data_designer/config/utils/numerical_helpers.py,sha256=YujDvTOkm0FW3V5G6Ja_Etf7d8tJ71Rpj_BHRu5JlRY,797
|
|
68
69
|
data_designer/config/utils/type_helpers.py,sha256=KOnxSYDHefadfOrlzRk-XixvwbZ31Ql7wOXAdQh0qoU,4018
|
|
69
|
-
data_designer/config/utils/validation.py,sha256=VvQ2vYZKSp-nPSukGzKv9rXXkpr5Spz3fkIX8HC28lk,14440
|
|
70
70
|
data_designer/config/utils/visualization.py,sha256=ql0rnRIF7AhkuyS6zE35FockYzCkDRamzFUs6J7g23I,18186
|
|
71
71
|
data_designer/engine/__init__.py,sha256=9eG4WHKyrJcNoK4GEz6BCw_E0Ewo9elQoDN4TLMbAog,137
|
|
72
|
+
data_designer/engine/compiler.py,sha256=dQOCnjIU3SC6TrwpJVY8Nnv2Ql8-_aEAeY9TdewO6T0,2853
|
|
72
73
|
data_designer/engine/configurable_task.py,sha256=nLUUzmVQ_RcX4l97wwROspFeYtVfdh0llxiau49d5Ic,3118
|
|
73
74
|
data_designer/engine/errors.py,sha256=DUoKhQCSwIBoLSQGv7dstzO3DFGDRqW3MBoWnRPcm1I,1262
|
|
74
75
|
data_designer/engine/model_provider.py,sha256=w_7EZpDKgOLdzmCWJ6v6oKdM0GdRijir3iK102bBtg8,2782
|
|
75
76
|
data_designer/engine/secret_resolver.py,sha256=IyvLvx_me9oiLk6uaVddTnd01Pz7qeYsleOhtKtcm3A,2427
|
|
77
|
+
data_designer/engine/validation.py,sha256=VvQ2vYZKSp-nPSukGzKv9rXXkpr5Spz3fkIX8HC28lk,14440
|
|
76
78
|
data_designer/engine/analysis/column_statistics.py,sha256=M_IxRwPeIaaN37UtPFMW09eXYQ-hEgu_NnLzlQj8AtY,5752
|
|
77
79
|
data_designer/engine/analysis/dataset_profiler.py,sha256=jmK0VFKH6mVkRaAA-upvCFBtxVeT-02ss9X0gRInP0A,7238
|
|
78
80
|
data_designer/engine/analysis/errors.py,sha256=VBeKREcPcInWhjAo3U2x_9UnJBi8zcGnUjLXNippPtA,255
|
|
@@ -82,31 +84,31 @@ data_designer/engine/analysis/column_profilers/registry.py,sha256=GpudnadaJxb8ub
|
|
|
82
84
|
data_designer/engine/analysis/utils/column_statistics_calculations.py,sha256=B5g1j8f0z1AFAEUVXf3roLZUI--jEs01Tj9g9yREWxk,8851
|
|
83
85
|
data_designer/engine/analysis/utils/judge_score_processing.py,sha256=rl11e3PxAOQPDSmK9G9kxOMUCJrZddQusUobDKZhIzw,4758
|
|
84
86
|
data_designer/engine/column_generators/__init__.py,sha256=9eG4WHKyrJcNoK4GEz6BCw_E0Ewo9elQoDN4TLMbAog,137
|
|
85
|
-
data_designer/engine/column_generators/registry.py,sha256=
|
|
87
|
+
data_designer/engine/column_generators/registry.py,sha256=qiC7B1MvcEgzfPtWBPHqF9vNpWGHpUg9YekR-lAZJl0,3060
|
|
86
88
|
data_designer/engine/column_generators/generators/__init__.py,sha256=9eG4WHKyrJcNoK4GEz6BCw_E0Ewo9elQoDN4TLMbAog,137
|
|
87
89
|
data_designer/engine/column_generators/generators/base.py,sha256=zurwtamM2l3shLa4SLjUOE0zOTDozQ5wPGAvDkrNYqE,3231
|
|
88
90
|
data_designer/engine/column_generators/generators/embedding.py,sha256=xYnFWRJ2W7JuwK8CRIUhv4QiT_DCGDuQkuHFKXCxrow,1724
|
|
89
91
|
data_designer/engine/column_generators/generators/expression.py,sha256=7xniEj8aPscWDYLrnNbG2mF3s08C7aR8ZgNUCzr_x8g,2539
|
|
90
92
|
data_designer/engine/column_generators/generators/llm_completion.py,sha256=XqpXzYczbZ6efUIVuvcm2O_mXBnXCMAvcjeyaB5dIFA,5301
|
|
91
93
|
data_designer/engine/column_generators/generators/samplers.py,sha256=YHoTWi8Wo9TyR-98I-rOHJUHOIJXxvbil-PrhNKMWxQ,3579
|
|
92
|
-
data_designer/engine/column_generators/generators/seed_dataset.py,sha256=
|
|
93
|
-
data_designer/engine/column_generators/generators/validation.py,sha256=
|
|
94
|
+
data_designer/engine/column_generators/generators/seed_dataset.py,sha256=6fG1ybsil7kqKfEL8AXS7L96tdloWbh8X9nYzf803AY,6996
|
|
95
|
+
data_designer/engine/column_generators/generators/validation.py,sha256=QxDE4HIBLtZ-_12eYwJxPXOaYo81vCDifM5YHdk7tow,6772
|
|
94
96
|
data_designer/engine/column_generators/utils/errors.py,sha256=ugNwaqnPdrPZI7YnKLbYwFjYUSm0WAzgaVu_u6i5Rc8,365
|
|
95
97
|
data_designer/engine/column_generators/utils/judge_score_factory.py,sha256=umo8-iMWbvkAztWkB5m_pU1cY1eBpR5L2gHt_fuZPD4,2100
|
|
96
98
|
data_designer/engine/column_generators/utils/prompt_renderer.py,sha256=d4tbyPsgmFDikW3nxL5is9RNaajMkoPDCrfkQkxw7rc,4760
|
|
97
99
|
data_designer/engine/dataset_builders/artifact_storage.py,sha256=mVCqcW8shylofi_pjYEeHUa9Mo-tjIcl4nR8D8oy2bw,8420
|
|
98
|
-
data_designer/engine/dataset_builders/column_wise_builder.py,sha256=
|
|
100
|
+
data_designer/engine/dataset_builders/column_wise_builder.py,sha256=UDZce0nxh6t3d6PgVlJdOJl0T7tseAdGocLHeGdJP5I,14983
|
|
99
101
|
data_designer/engine/dataset_builders/errors.py,sha256=1kChleChG4rASWIiL4Bel6Ox6aFZjQUrh5ogPt1CDWo,359
|
|
100
102
|
data_designer/engine/dataset_builders/multi_column_configs.py,sha256=t28fhI-WRIBohFnAJ80l5EAETEDB5rJ5RSWInMiRfyE,1619
|
|
101
103
|
data_designer/engine/dataset_builders/utils/__init__.py,sha256=9eG4WHKyrJcNoK4GEz6BCw_E0Ewo9elQoDN4TLMbAog,137
|
|
102
104
|
data_designer/engine/dataset_builders/utils/concurrency.py,sha256=Qvf5H-FdUCikIocOry_E3jkSC7qVZQjAXOY9dxdtMGg,7336
|
|
103
|
-
data_designer/engine/dataset_builders/utils/config_compiler.py,sha256=
|
|
105
|
+
data_designer/engine/dataset_builders/utils/config_compiler.py,sha256=e_Y9WdcHE-H-fGwm-4Qgs2FlD6qO08Gw2SLafUtoqTE,2400
|
|
104
106
|
data_designer/engine/dataset_builders/utils/dag.py,sha256=8h7jEu0XiYGSKHIe4CGFi6SC9HGyAgvkD23ZECNWDC0,2388
|
|
105
107
|
data_designer/engine/dataset_builders/utils/dataset_batch_manager.py,sha256=xLN_cAu5xb4V-RjSPezNivOkCAigA3-qNfv_kAWRAHs,7769
|
|
106
108
|
data_designer/engine/dataset_builders/utils/errors.py,sha256=qW_TFOKNVODbb8bYrUlbqMAkheDAg12DDo3RmAhHrCg,370
|
|
107
109
|
data_designer/engine/models/__init__.py,sha256=9eG4WHKyrJcNoK4GEz6BCw_E0Ewo9elQoDN4TLMbAog,137
|
|
108
110
|
data_designer/engine/models/errors.py,sha256=AQlZ-cf0IqFW-e-K9HZFH3YhXXOTLLI2eZCXr7_58Yk,12174
|
|
109
|
-
data_designer/engine/models/facade.py,sha256=
|
|
111
|
+
data_designer/engine/models/facade.py,sha256=rONeAGTRwvoYBN3Dyq3Xe012NNOpVQj-pj3N3deGpsU,12528
|
|
110
112
|
data_designer/engine/models/litellm_overrides.py,sha256=FpCztZQrHYGrVweLR0_NAqxVRs-UXmpkMPdFWqciu84,5539
|
|
111
113
|
data_designer/engine/models/registry.py,sha256=-TbGhvs8WRq6f7z6cH_DDdo7uhs4Hb5qkJce_Y4UBWM,6840
|
|
112
114
|
data_designer/engine/models/telemetry.py,sha256=3g4jDz8xxOOkPtIYit94c4D4mGUwgfiCDaDdnbTLhFQ,12407
|
|
@@ -130,7 +132,7 @@ data_designer/engine/processing/gsonschema/__init__.py,sha256=9eG4WHKyrJcNoK4GEz
|
|
|
130
132
|
data_designer/engine/processing/gsonschema/exceptions.py,sha256=IoMlQE-eRJcBUlzKnkCCBSVSlGjsoYZSE0OVwcikxlI,281
|
|
131
133
|
data_designer/engine/processing/gsonschema/schema_transformers.py,sha256=__-dfrCFxDs5-XcTzi1Z-FZL9z0eWUS7Zppr32OxgiY,3066
|
|
132
134
|
data_designer/engine/processing/gsonschema/types.py,sha256=-x_K2HrVnZ_Z7fzYl4T2Gd7QHf6B6ADvn7E7iYvw5Kc,313
|
|
133
|
-
data_designer/engine/processing/gsonschema/validators.py,sha256=
|
|
135
|
+
data_designer/engine/processing/gsonschema/validators.py,sha256=jmY51MJC0kfBExAXGED9Uqc9sxT47_sZ1K3X7OuvCXM,6848
|
|
134
136
|
data_designer/engine/processing/processors/base.py,sha256=WJl7_0dtiUppjfY-lrQ3lDiIgYqRDSEYUwSAQNN7nFE,548
|
|
135
137
|
data_designer/engine/processing/processors/drop_columns.py,sha256=MIb_CVrpoM3kyN5-8dHZrdFAAUiCCWgDEyQjAk8nZqE,2060
|
|
136
138
|
data_designer/engine/processing/processors/registry.py,sha256=nhB1O4b0wSUkWQeleV9l1MykwZD-dSvY0ydqmSscEY8,1056
|
|
@@ -141,8 +143,8 @@ data_designer/engine/registry/errors.py,sha256=nO794QVy4DovKGKWEjycVDN9cdDlH-skb
|
|
|
141
143
|
data_designer/engine/resources/managed_dataset_generator.py,sha256=KXrWdgod-NFaCZvmWSwoJKp2daQgqf8XBIVXvrk6fHI,1369
|
|
142
144
|
data_designer/engine/resources/managed_dataset_repository.py,sha256=IzolKh2n_8BLd0XRuWEZhddYpXqc9YYpVUFC7DH7Qz8,7547
|
|
143
145
|
data_designer/engine/resources/managed_storage.py,sha256=BOdOtUChfNJL6S2Cxw_hrQF28ZnJvObtTIVNkQMUgX0,2079
|
|
144
|
-
data_designer/engine/resources/resource_provider.py,sha256=
|
|
145
|
-
data_designer/engine/resources/
|
|
146
|
+
data_designer/engine/resources/resource_provider.py,sha256=guDKOGenQfi2VHH6ZevVbGwKmG9nTtNaW2TEWybhcw8,2318
|
|
147
|
+
data_designer/engine/resources/seed_reader.py,sha256=-UgqYH_iAaMrw73y09p2rskTiu3q9-XhNKJoTM3KlRo,5591
|
|
146
148
|
data_designer/engine/sampling_gen/column.py,sha256=gDIPth7vK2797rGtLhf_kVGMAC-khefKHodeeDoqV-I,3946
|
|
147
149
|
data_designer/engine/sampling_gen/constraints.py,sha256=e-gLItnSobUR8eSfQdkXkmvrBYLScpBcE97Xd0H2wV8,3004
|
|
148
150
|
data_designer/engine/sampling_gen/errors.py,sha256=UBZBtosD07EisCdeo8r-Uq4h0QL3tYS1qwtEmca8_jM,828
|
|
@@ -170,17 +172,20 @@ data_designer/engine/validators/local_callable.py,sha256=oCUXj_NRt0gVqUIh0fLrvw-
|
|
|
170
172
|
data_designer/engine/validators/python.py,sha256=jAp1u8yLjqfebh60xGapkHVjMz58WHB0QjfMc2zQCaY,7894
|
|
171
173
|
data_designer/engine/validators/remote.py,sha256=jtDIvWzfHh17m2ac_Fp93p49Th8RlkBzzih2jiqD7gk,2929
|
|
172
174
|
data_designer/engine/validators/sql.py,sha256=bxbyxPxDT9yuwjhABVEY40iR1pzWRFi65WU4tPgG2bE,2250
|
|
173
|
-
data_designer/essentials/__init__.py,sha256=
|
|
175
|
+
data_designer/essentials/__init__.py,sha256=viyPs-sIVMuHuFpWhZVKtwRsDQFMsmtw6B7veK_No_I,1094
|
|
174
176
|
data_designer/interface/__init__.py,sha256=9eG4WHKyrJcNoK4GEz6BCw_E0Ewo9elQoDN4TLMbAog,137
|
|
175
|
-
data_designer/interface/data_designer.py,sha256=
|
|
177
|
+
data_designer/interface/data_designer.py,sha256=jiPI0vbzTAT4Oug4W4RCBS69bCQOfCE9awfClwBeDnI,17175
|
|
176
178
|
data_designer/interface/errors.py,sha256=jagKT3tPUnYq4e3e6AkTnBkcayHyEfxjPMBzx-GEKe4,565
|
|
177
179
|
data_designer/interface/results.py,sha256=zYVX589OUyFuB-8XLmjjdKk3hCDNKu189sH-gOOFreQ,3511
|
|
178
180
|
data_designer/plugins/__init__.py,sha256=c_V7q4QhfVoNf_uc9UwmXCsWqwtyWogI7YoN_0PzzE4,234
|
|
179
|
-
data_designer/plugins/errors.py,sha256=
|
|
180
|
-
data_designer/plugins/plugin.py,sha256=
|
|
181
|
+
data_designer/plugins/errors.py,sha256=nljerskefztjy84UARpw3ogm4GI8CmJCi4FrfaVJI2w,345
|
|
182
|
+
data_designer/plugins/plugin.py,sha256=n4nlFHZZ0INY7vFJylTGoh1ij9k3TSVw-15OGSdROGE,5355
|
|
181
183
|
data_designer/plugins/registry.py,sha256=c0X03TnA_J60RWpxaVJEmtIXKvA9up-LznrUHXDcYxg,3012
|
|
182
|
-
data_designer
|
|
183
|
-
data_designer
|
|
184
|
-
data_designer
|
|
185
|
-
data_designer-0.
|
|
186
|
-
data_designer-0.
|
|
184
|
+
data_designer/plugins/testing/__init__.py,sha256=fyFqJcdyYcU5nj55RjqD6NjkXl4nhNZw5Bd6-sLnnjQ,255
|
|
185
|
+
data_designer/plugins/testing/stubs.py,sha256=IqriaMWtQbhPSoWAkbcKsTga-Xi5rTZBlW70IC0cpd0,4699
|
|
186
|
+
data_designer/plugins/testing/utils.py,sha256=2Pk0BJ7tx6gKLCoJr72EqpqfTWzHnlk_Lkwi89aDQQs,578
|
|
187
|
+
data_designer-0.3.0.dist-info/METADATA,sha256=lb_Mn3vMSrflnJZYdRaWMBXK69VrTa3xZdvyRj8lqB8,7652
|
|
188
|
+
data_designer-0.3.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
189
|
+
data_designer-0.3.0.dist-info/entry_points.txt,sha256=NWWWidyDxN6CYX6y664PhBYMhbaYTQTyprqfYAgkyCg,57
|
|
190
|
+
data_designer-0.3.0.dist-info/licenses/LICENSE,sha256=cSWJDwVqHyQgly8Zmt3pqXJ2eQbZVYwN9qd0NMssxXY,11336
|
|
191
|
+
data_designer-0.3.0.dist-info/RECORD,,
|
|
@@ -1,187 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
-
|
|
4
|
-
from __future__ import annotations
|
|
5
|
-
|
|
6
|
-
import logging
|
|
7
|
-
from pathlib import Path
|
|
8
|
-
from typing import TYPE_CHECKING
|
|
9
|
-
|
|
10
|
-
import pandas as pd
|
|
11
|
-
import pyarrow.parquet as pq
|
|
12
|
-
from huggingface_hub import HfApi, HfFileSystem
|
|
13
|
-
from pydantic import BaseModel, Field
|
|
14
|
-
|
|
15
|
-
from data_designer.config.errors import InvalidConfigError, InvalidFileFormatError, InvalidFilePathError
|
|
16
|
-
from data_designer.config.utils.io_helpers import VALID_DATASET_FILE_EXTENSIONS, validate_path_contains_files_of_type
|
|
17
|
-
|
|
18
|
-
if TYPE_CHECKING:
|
|
19
|
-
from data_designer.config.seed import SeedDatasetReference
|
|
20
|
-
|
|
21
|
-
logger = logging.getLogger(__name__)
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
class DatastoreSettings(BaseModel):
|
|
25
|
-
"""Configuration for interacting with a datastore."""
|
|
26
|
-
|
|
27
|
-
endpoint: str = Field(
|
|
28
|
-
...,
|
|
29
|
-
description="Datastore endpoint. Use 'https://huggingface.co' for the Hugging Face Hub.",
|
|
30
|
-
)
|
|
31
|
-
token: str | None = Field(default=None, description="If needed, token to use for authentication.")
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
def get_file_column_names(file_reference: str | Path | HfFileSystem, file_type: str) -> list[str]:
|
|
35
|
-
"""Get column names from a dataset file.
|
|
36
|
-
|
|
37
|
-
Args:
|
|
38
|
-
file_reference: Path to the dataset file, or an HfFileSystem object.
|
|
39
|
-
file_type: Type of the dataset file. Must be one of: 'parquet', 'json', 'jsonl', 'csv'.
|
|
40
|
-
|
|
41
|
-
Raises:
|
|
42
|
-
InvalidFilePathError: If the file type is not supported.
|
|
43
|
-
|
|
44
|
-
Returns:
|
|
45
|
-
List of column names.
|
|
46
|
-
"""
|
|
47
|
-
if file_type == "parquet":
|
|
48
|
-
try:
|
|
49
|
-
schema = pq.read_schema(file_reference)
|
|
50
|
-
if hasattr(schema, "names"):
|
|
51
|
-
return schema.names
|
|
52
|
-
else:
|
|
53
|
-
return [field.name for field in schema]
|
|
54
|
-
except Exception as e:
|
|
55
|
-
logger.warning(f"Failed to process parquet file {file_reference}: {e}")
|
|
56
|
-
return []
|
|
57
|
-
elif file_type in ["json", "jsonl"]:
|
|
58
|
-
return pd.read_json(file_reference, orient="records", lines=True, nrows=1).columns.tolist()
|
|
59
|
-
elif file_type == "csv":
|
|
60
|
-
try:
|
|
61
|
-
df = pd.read_csv(file_reference, nrows=1)
|
|
62
|
-
return df.columns.tolist()
|
|
63
|
-
except (pd.errors.EmptyDataError, pd.errors.ParserError) as e:
|
|
64
|
-
logger.warning(f"Failed to process CSV file {file_reference}: {e}")
|
|
65
|
-
return []
|
|
66
|
-
else:
|
|
67
|
-
raise InvalidFilePathError(f"🛑 Unsupported file type: {file_type!r}")
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
def fetch_seed_dataset_column_names(seed_dataset_reference: SeedDatasetReference) -> list[str]:
|
|
71
|
-
if hasattr(seed_dataset_reference, "datastore_settings"):
|
|
72
|
-
return fetch_seed_dataset_column_names_from_datastore(
|
|
73
|
-
seed_dataset_reference.repo_id,
|
|
74
|
-
seed_dataset_reference.filename,
|
|
75
|
-
seed_dataset_reference.datastore_settings,
|
|
76
|
-
)
|
|
77
|
-
return fetch_seed_dataset_column_names_from_local_file(seed_dataset_reference.dataset)
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
def fetch_seed_dataset_column_names_from_datastore(
|
|
81
|
-
repo_id: str,
|
|
82
|
-
filename: str,
|
|
83
|
-
datastore_settings: DatastoreSettings | dict | None = None,
|
|
84
|
-
) -> list[str]:
|
|
85
|
-
file_type = filename.split(".")[-1]
|
|
86
|
-
if f".{file_type}" not in VALID_DATASET_FILE_EXTENSIONS:
|
|
87
|
-
raise InvalidFileFormatError(f"🛑 Unsupported file type: {filename!r}")
|
|
88
|
-
|
|
89
|
-
datastore_settings = resolve_datastore_settings(datastore_settings)
|
|
90
|
-
fs = HfFileSystem(endpoint=datastore_settings.endpoint, token=datastore_settings.token, skip_instance_cache=True)
|
|
91
|
-
|
|
92
|
-
file_path = _extract_single_file_path_from_glob_pattern_if_present(f"datasets/{repo_id}/{filename}", fs=fs)
|
|
93
|
-
|
|
94
|
-
with fs.open(file_path) as f:
|
|
95
|
-
return get_file_column_names(f, file_type)
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
def fetch_seed_dataset_column_names_from_local_file(dataset_path: str | Path) -> list[str]:
|
|
99
|
-
dataset_path = _validate_dataset_path(dataset_path, allow_glob_pattern=True)
|
|
100
|
-
dataset_path = _extract_single_file_path_from_glob_pattern_if_present(dataset_path)
|
|
101
|
-
return get_file_column_names(dataset_path, str(dataset_path).split(".")[-1])
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
def resolve_datastore_settings(datastore_settings: DatastoreSettings | dict | None) -> DatastoreSettings:
|
|
105
|
-
if datastore_settings is None:
|
|
106
|
-
raise InvalidConfigError("🛑 Datastore settings are required in order to upload datasets to the datastore.")
|
|
107
|
-
if isinstance(datastore_settings, DatastoreSettings):
|
|
108
|
-
return datastore_settings
|
|
109
|
-
elif isinstance(datastore_settings, dict):
|
|
110
|
-
return DatastoreSettings.model_validate(datastore_settings)
|
|
111
|
-
else:
|
|
112
|
-
raise InvalidConfigError(
|
|
113
|
-
"🛑 Invalid datastore settings format. Must be DatastoreSettings object or dictionary."
|
|
114
|
-
)
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
def upload_to_hf_hub(
|
|
118
|
-
dataset_path: str | Path,
|
|
119
|
-
filename: str,
|
|
120
|
-
repo_id: str,
|
|
121
|
-
datastore_settings: DatastoreSettings,
|
|
122
|
-
**kwargs,
|
|
123
|
-
) -> str:
|
|
124
|
-
datastore_settings = resolve_datastore_settings(datastore_settings)
|
|
125
|
-
dataset_path = _validate_dataset_path(dataset_path)
|
|
126
|
-
filename_ext = filename.split(".")[-1].lower()
|
|
127
|
-
if dataset_path.suffix.lower()[1:] != filename_ext:
|
|
128
|
-
raise InvalidFileFormatError(
|
|
129
|
-
f"🛑 Dataset file extension {dataset_path.suffix!r} does not match `filename` extension .{filename_ext!r}"
|
|
130
|
-
)
|
|
131
|
-
|
|
132
|
-
hfapi = HfApi(endpoint=datastore_settings.endpoint, token=datastore_settings.token)
|
|
133
|
-
hfapi.create_repo(repo_id, exist_ok=True, repo_type="dataset")
|
|
134
|
-
hfapi.upload_file(
|
|
135
|
-
path_or_fileobj=dataset_path,
|
|
136
|
-
path_in_repo=filename,
|
|
137
|
-
repo_id=repo_id,
|
|
138
|
-
repo_type="dataset",
|
|
139
|
-
**kwargs,
|
|
140
|
-
)
|
|
141
|
-
return f"{repo_id}/{filename}"
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
def _extract_single_file_path_from_glob_pattern_if_present(
|
|
145
|
-
file_path: str | Path,
|
|
146
|
-
fs: HfFileSystem | None = None,
|
|
147
|
-
) -> Path:
|
|
148
|
-
file_path = Path(file_path)
|
|
149
|
-
|
|
150
|
-
# no glob pattern
|
|
151
|
-
if "*" not in str(file_path):
|
|
152
|
-
return file_path
|
|
153
|
-
|
|
154
|
-
# glob pattern with HfFileSystem
|
|
155
|
-
if fs is not None:
|
|
156
|
-
file_to_check = None
|
|
157
|
-
file_extension = file_path.name.split(".")[-1]
|
|
158
|
-
for file in fs.ls(str(file_path.parent)):
|
|
159
|
-
filename = file["name"]
|
|
160
|
-
if filename.endswith(f".{file_extension}"):
|
|
161
|
-
file_to_check = filename
|
|
162
|
-
if file_to_check is None:
|
|
163
|
-
raise InvalidFilePathError(f"🛑 No files found matching pattern: {str(file_path)!r}")
|
|
164
|
-
logger.debug(f"Using the first matching file in {str(file_path)!r} to determine column names in seed dataset")
|
|
165
|
-
return Path(file_to_check)
|
|
166
|
-
|
|
167
|
-
# glob pattern with local file system
|
|
168
|
-
if not (matching_files := sorted(file_path.parent.glob(file_path.name))):
|
|
169
|
-
raise InvalidFilePathError(f"🛑 No files found matching pattern: {str(file_path)!r}")
|
|
170
|
-
logger.debug(f"Using the first matching file in {str(file_path)!r} to determine column names in seed dataset")
|
|
171
|
-
return matching_files[0]
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
def _validate_dataset_path(dataset_path: str | Path, allow_glob_pattern: bool = False) -> Path:
|
|
175
|
-
if allow_glob_pattern and "*" in str(dataset_path):
|
|
176
|
-
parts = str(dataset_path).split("*.")
|
|
177
|
-
file_path = parts[0]
|
|
178
|
-
file_extension = parts[-1]
|
|
179
|
-
validate_path_contains_files_of_type(file_path, file_extension)
|
|
180
|
-
return Path(dataset_path)
|
|
181
|
-
if not Path(dataset_path).is_file():
|
|
182
|
-
raise InvalidFilePathError("🛑 To upload a dataset to the datastore, you must provide a valid file path.")
|
|
183
|
-
if not Path(dataset_path).name.endswith(tuple(VALID_DATASET_FILE_EXTENSIONS)):
|
|
184
|
-
raise InvalidFileFormatError(
|
|
185
|
-
"🛑 Dataset files must be in `parquet`, `csv`, or `json` (orient='records', lines=True) format."
|
|
186
|
-
)
|
|
187
|
-
return Path(dataset_path)
|
|
@@ -1,84 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
-
|
|
4
|
-
from abc import ABC, abstractmethod
|
|
5
|
-
|
|
6
|
-
import duckdb
|
|
7
|
-
from huggingface_hub import HfApi, HfFileSystem
|
|
8
|
-
|
|
9
|
-
from data_designer.logging import quiet_noisy_logger
|
|
10
|
-
|
|
11
|
-
quiet_noisy_logger("httpx")
|
|
12
|
-
|
|
13
|
-
_HF_DATASETS_PREFIX = "hf://datasets/"
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
class MalformedFileIdError(Exception):
|
|
17
|
-
"""Raised when file_id format is invalid."""
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
class SeedDatasetDataStore(ABC):
|
|
21
|
-
"""Abstract base class for dataset storage implementations."""
|
|
22
|
-
|
|
23
|
-
@abstractmethod
|
|
24
|
-
def create_duckdb_connection(self) -> duckdb.DuckDBPyConnection: ...
|
|
25
|
-
|
|
26
|
-
@abstractmethod
|
|
27
|
-
def get_dataset_uri(self, file_id: str) -> str: ...
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
class LocalSeedDatasetDataStore(SeedDatasetDataStore):
|
|
31
|
-
"""Local filesystem-based dataset storage."""
|
|
32
|
-
|
|
33
|
-
def create_duckdb_connection(self) -> duckdb.DuckDBPyConnection:
|
|
34
|
-
return duckdb.connect()
|
|
35
|
-
|
|
36
|
-
def get_dataset_uri(self, file_id: str) -> str:
|
|
37
|
-
return file_id
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
class HfHubSeedDatasetDataStore(SeedDatasetDataStore):
|
|
41
|
-
"""Hugging Face and Data Store dataset storage."""
|
|
42
|
-
|
|
43
|
-
def __init__(self, endpoint: str, token: str | None):
|
|
44
|
-
self.hfapi = HfApi(endpoint=endpoint, token=token)
|
|
45
|
-
self.endpoint = endpoint
|
|
46
|
-
self.token = token
|
|
47
|
-
|
|
48
|
-
def create_duckdb_connection(self) -> duckdb.DuckDBPyConnection:
|
|
49
|
-
"""Create a DuckDB connection with a fresh HfFileSystem registered.
|
|
50
|
-
|
|
51
|
-
Creates a new HfFileSystem instance for each connection to ensure file metadata
|
|
52
|
-
is fetched fresh from the datastore, avoiding cache-related issues when reading
|
|
53
|
-
recently updated parquet files.
|
|
54
|
-
|
|
55
|
-
Returns:
|
|
56
|
-
A DuckDB connection with the HfFileSystem registered for hf:// URI support.
|
|
57
|
-
"""
|
|
58
|
-
# Use skip_instance_cache to avoid fsspec-level caching
|
|
59
|
-
hffs = HfFileSystem(endpoint=self.endpoint, token=self.token, skip_instance_cache=True)
|
|
60
|
-
|
|
61
|
-
# Clear all internal caches to avoid stale metadata issues
|
|
62
|
-
# HfFileSystem caches file metadata (size, etc.) which can become stale when files are re-uploaded
|
|
63
|
-
if hasattr(hffs, "dircache"):
|
|
64
|
-
hffs.dircache.clear()
|
|
65
|
-
|
|
66
|
-
conn = duckdb.connect()
|
|
67
|
-
conn.register_filesystem(hffs)
|
|
68
|
-
return conn
|
|
69
|
-
|
|
70
|
-
def get_dataset_uri(self, file_id: str) -> str:
|
|
71
|
-
identifier = file_id.removeprefix(_HF_DATASETS_PREFIX)
|
|
72
|
-
repo_id, filename = self._get_repo_id_and_filename(identifier)
|
|
73
|
-
return f"{_HF_DATASETS_PREFIX}{repo_id}/{filename}"
|
|
74
|
-
|
|
75
|
-
def _get_repo_id_and_filename(self, identifier: str) -> tuple[str, str]:
|
|
76
|
-
"""Extract repo_id and filename from identifier."""
|
|
77
|
-
parts = identifier.split("/", 2)
|
|
78
|
-
if len(parts) < 3:
|
|
79
|
-
raise MalformedFileIdError(
|
|
80
|
-
"Could not extract repo id and filename from file_id, "
|
|
81
|
-
"expected 'hf://datasets/{repo-namespace}/{repo-name}/{filename}'"
|
|
82
|
-
)
|
|
83
|
-
repo_ns, repo_name, filename = parts
|
|
84
|
-
return f"{repo_ns}/{repo_name}", filename
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|