workbench 0.8.198__py3-none-any.whl → 0.8.203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- workbench/algorithms/dataframe/proximity.py +11 -4
- workbench/api/__init__.py +2 -1
- workbench/api/df_store.py +17 -108
- workbench/api/feature_set.py +48 -11
- workbench/api/model.py +1 -1
- workbench/api/parameter_store.py +3 -52
- workbench/core/artifacts/__init__.py +11 -2
- workbench/core/artifacts/artifact.py +5 -5
- workbench/core/artifacts/df_store_core.py +114 -0
- workbench/core/artifacts/endpoint_core.py +261 -78
- workbench/core/artifacts/feature_set_core.py +69 -1
- workbench/core/artifacts/model_core.py +48 -14
- workbench/core/artifacts/parameter_store_core.py +98 -0
- workbench/core/transforms/features_to_model/features_to_model.py +50 -33
- workbench/core/transforms/pandas_transforms/pandas_to_features.py +11 -2
- workbench/core/views/view.py +2 -2
- workbench/model_scripts/chemprop/chemprop.template +933 -0
- workbench/model_scripts/chemprop/generated_model_script.py +933 -0
- workbench/model_scripts/chemprop/requirements.txt +11 -0
- workbench/model_scripts/custom_models/chem_info/fingerprints.py +134 -0
- workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -1
- workbench/model_scripts/custom_models/proximity/proximity.py +11 -4
- workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +11 -5
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +11 -5
- workbench/model_scripts/custom_models/uq_models/ngboost.template +11 -5
- workbench/model_scripts/custom_models/uq_models/proximity.py +11 -4
- workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +11 -5
- workbench/model_scripts/pytorch_model/generated_model_script.py +365 -173
- workbench/model_scripts/pytorch_model/pytorch.template +362 -170
- workbench/model_scripts/scikit_learn/generated_model_script.py +302 -0
- workbench/model_scripts/script_generation.py +10 -7
- workbench/model_scripts/uq_models/generated_model_script.py +43 -27
- workbench/model_scripts/uq_models/mapie.template +40 -24
- workbench/model_scripts/xgb_model/generated_model_script.py +36 -7
- workbench/model_scripts/xgb_model/xgb_model.template +36 -7
- workbench/repl/workbench_shell.py +14 -5
- workbench/resources/open_source_api.key +1 -1
- workbench/scripts/endpoint_test.py +162 -0
- workbench/scripts/{lambda_launcher.py → lambda_test.py} +10 -0
- workbench/utils/chemprop_utils.py +761 -0
- workbench/utils/pytorch_utils.py +527 -0
- workbench/utils/xgboost_model_utils.py +10 -5
- workbench/web_interface/components/model_plot.py +7 -1
- {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/METADATA +3 -3
- {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/RECORD +49 -43
- {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/entry_points.txt +2 -1
- workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
- workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -280
- workbench/model_scripts/__pycache__/script_generation.cpython-312.pyc +0 -0
- workbench/model_scripts/__pycache__/script_generation.cpython-313.pyc +0 -0
- {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/WHEEL +0 -0
- {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/top_level.txt +0 -0
|
@@ -6,7 +6,7 @@ workbench/algorithms/dataframe/data_source_eda.py,sha256=WgVL6tzBCw1tznQr8RQ6daQ
|
|
|
6
6
|
workbench/algorithms/dataframe/feature_space_proximity.py,sha256=6RxzvbpLdDkHMm1D49Nv59SFcyYUj8bisd6_5EpBEGI,3515
|
|
7
7
|
workbench/algorithms/dataframe/fingerprint_proximity.py,sha256=nGxfmYQ3bfMtvs90s4p7gaY9DN4gijdDU7R6B2lRHgo,5825
|
|
8
8
|
workbench/algorithms/dataframe/projection_2d.py,sha256=zK4hc0OQrySmfcfFg8y0GxEL34uDNqvZL4OgttB9vRs,7834
|
|
9
|
-
workbench/algorithms/dataframe/proximity.py,sha256=
|
|
9
|
+
workbench/algorithms/dataframe/proximity.py,sha256=dPTYD1N-JTIqg6iL7ak_JSouaCdfmBPjG08IRRvTLXU,15836
|
|
10
10
|
workbench/algorithms/dataframe/storage/aggregation.py,sha256=VuTb7A6Vh6IS5djZeItvOLnnEOlf7tzMQ8OaYIuftvU,2852
|
|
11
11
|
workbench/algorithms/dataframe/storage/feature_resolution.py,sha256=w_iLf8EFTg7Jc5laH-bsq8MEtZVqcg05W-GihCqR-r4,9450
|
|
12
12
|
workbench/algorithms/dataframe/storage/feature_spider.py,sha256=uIZ4JHIKuhpy08wBFReSrohb5DGxx8vGroHUbjPm1jE,14353
|
|
@@ -27,17 +27,17 @@ workbench/algorithms/sql/descriptive_stats.py,sha256=VxSR5zQi8NmAWrJvOCO3wrmgVHY
|
|
|
27
27
|
workbench/algorithms/sql/outliers.py,sha256=2hoilOk0gaz9pwrnGEBY2y7M-UqFED3KO_mFm_0_3ac,10645
|
|
28
28
|
workbench/algorithms/sql/sample_rows.py,sha256=SRYoGb24QP_iPvOoW9bGZ95yZuseYDtyoNhilfoLu34,2688
|
|
29
29
|
workbench/algorithms/sql/value_counts.py,sha256=F-rZoLTTKv1cHYl2_tDlvWDjczy76uLTr3EMHa-WrEk,3340
|
|
30
|
-
workbench/api/__init__.py,sha256=
|
|
30
|
+
workbench/api/__init__.py,sha256=KDKzFb4SL8AArtd9ucTkFYdCxbsBMbK1ZMkj0G2rACY,1065
|
|
31
31
|
workbench/api/compound.py,sha256=kf5EaM5qjWwsZutcxqj9IC_MPnDV1uVHDMns9OA_GOo,2545
|
|
32
32
|
workbench/api/data_source.py,sha256=Ngz36YZWxFfpJbmURhM1LQPYjh5kdpZNGo6_fCRePbA,8321
|
|
33
|
-
workbench/api/df_store.py,sha256=
|
|
33
|
+
workbench/api/df_store.py,sha256=1qSYM3Xb4MwMMTMaF3CX0hOCEzhIbnra5Deivg4cryk,3014
|
|
34
34
|
workbench/api/endpoint.py,sha256=spLse2UoAsZdu_ZxmAvMJX_aX-zutAsQ5_SPm9Xt-nA,3839
|
|
35
|
-
workbench/api/feature_set.py,sha256=
|
|
35
|
+
workbench/api/feature_set.py,sha256=K0Sl59yAf_qZr8EH4rPjDotezCwP5Q7aG38FGaF4zi0,8062
|
|
36
36
|
workbench/api/graph_store.py,sha256=LremJyPrQFgsHb7hxsctuCsoxx3p7TKtaY5qALHe6pc,4372
|
|
37
37
|
workbench/api/meta.py,sha256=1_9989cPvf3hd3tA-83hLijOGNnhwXAF8aZF45adeDQ,8596
|
|
38
|
-
workbench/api/model.py,sha256=
|
|
38
|
+
workbench/api/model.py,sha256=fncUc8MJwXyteKeXOlAy5IMjE48sH_VmDBi3P2MPGG4,4458
|
|
39
39
|
workbench/api/monitor.py,sha256=Cez89Uac7Tzt47FxkjoX-YDGccEhvBcxw3sZFtw4ud8,4506
|
|
40
|
-
workbench/api/parameter_store.py,sha256=
|
|
40
|
+
workbench/api/parameter_store.py,sha256=_3MmPxKiVy7_OIgCSRlUv9xbk8nuiOWiCtZgT-AxN1k,2574
|
|
41
41
|
workbench/api/pipeline.py,sha256=MSYGrDSXrRB_oQELtAlOwBfxSBTw3REAkHy5XBHau0Y,6261
|
|
42
42
|
workbench/cached/__init__.py,sha256=wvTyIFvusv2HjU3yop6OSr3js5_-SZuR8nPmlCuZQJ4,525
|
|
43
43
|
workbench/cached/cached_data_source.py,sha256=A0o4H9g1aEms8HkOHWnb46vJ5fx6ebs1aCYaQcf8gPI,2649
|
|
@@ -47,24 +47,24 @@ workbench/cached/cached_meta.py,sha256=DTlnb6jblviVmSg9w0F6LRVIuQ_lWBNqGh8vqKP5B
|
|
|
47
47
|
workbench/cached/cached_model.py,sha256=iMc_fySUE5qau3feduVXMNb24JY0sBjt1g6WeLLciXc,4348
|
|
48
48
|
workbench/cached/cached_pipeline.py,sha256=QOVnEKu5RbIdlNpJUi-0Ebh0_-C68RigSPwKh4dvZTM,1948
|
|
49
49
|
workbench/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
50
|
-
workbench/core/artifacts/__init__.py,sha256=
|
|
51
|
-
workbench/core/artifacts/artifact.py,sha256=
|
|
50
|
+
workbench/core/artifacts/__init__.py,sha256=ukcgbYlI9m99bzwaBNO01K1h0-cQkzsbh_jT_GyQ-LY,1034
|
|
51
|
+
workbench/core/artifacts/artifact.py,sha256=scWUbX2Sk1rxT8VEm_Z7YTxbOzkDASNyqqXB56xLZ2w,17721
|
|
52
52
|
workbench/core/artifacts/athena_source.py,sha256=RNmCe7s6uH4gVHpcdJcL84aSbF5Q1ahJBLLGwHYRXEU,26081
|
|
53
53
|
workbench/core/artifacts/cached_artifact_mixin.py,sha256=ngqFLZ4cQx_TFouXZgXZQsv_7W6XCvxVGXXSfzzaft8,3775
|
|
54
54
|
workbench/core/artifacts/data_capture_core.py,sha256=q8f79rRTYiZ7T4IQRWXl8ZvPpcvZyNxYERwvo8o0OQc,14858
|
|
55
55
|
workbench/core/artifacts/data_source_abstract.py,sha256=5IRCzFVK-17cd4NXPMRfx99vQAmQ0WHE5jcm5RfsVTg,10619
|
|
56
56
|
workbench/core/artifacts/data_source_factory.py,sha256=YL_tA5fsgubbB3dPF6T4tO0rGgz-6oo3ge4i_YXVC-M,2380
|
|
57
|
-
workbench/core/artifacts/
|
|
58
|
-
workbench/core/artifacts/
|
|
59
|
-
workbench/core/artifacts/
|
|
57
|
+
workbench/core/artifacts/df_store_core.py,sha256=AueNr_JvuLLu_ByE7cb3u-isH9u0Q7cMP-UCgCX-Ctg,3536
|
|
58
|
+
workbench/core/artifacts/endpoint_core.py,sha256=oWWJSXSod5JzI7b4JvoxKWm46lv0FNZZf_FIZR4ZP9Q,60832
|
|
59
|
+
workbench/core/artifacts/feature_set_core.py,sha256=wZy-02WXWmSBet5t8mWXFRdv9O4MtW3hWqJuVv7Kok0,39330
|
|
60
|
+
workbench/core/artifacts/model_core.py,sha256=QIgV5MJr8aDY63in83thdNc5-bzkWLn5f5vvsS4aNYo,52348
|
|
60
61
|
workbench/core/artifacts/monitor_core.py,sha256=M307yz7tEzOEHgv-LmtVy9jKjSbM98fHW3ckmNYrwlU,27897
|
|
62
|
+
workbench/core/artifacts/parameter_store_core.py,sha256=sHvjJMuybM4qdcKhH-Sx6Ur6Yn5ozA3QHwtidsnhyG8,2867
|
|
61
63
|
workbench/core/cloud_platform/cloud_meta.py,sha256=-g4-LTC3D0PXb3VfaXdLR1ERijKuHdffeMK_zhD-koQ,8809
|
|
62
64
|
workbench/core/cloud_platform/aws/README.md,sha256=QT5IQXoUHbIA0qQ2wO6_2P2lYjYQFVYuezc22mWY4i8,97
|
|
63
65
|
workbench/core/cloud_platform/aws/aws_account_clamp.py,sha256=V5iVsoGvSRilARtTdExnt27QptzAcJaW0s3nm2B8-ow,8286
|
|
64
|
-
workbench/core/cloud_platform/aws/aws_df_store.py,sha256=utRIlTCPwFneHHZ8_Z3Hw3rOJSeryiFA4wBtucxULRQ,15055
|
|
65
66
|
workbench/core/cloud_platform/aws/aws_graph_store.py,sha256=ytYxQTplUmeWbsPmxyZbf6mO9qyTl60ewlJG8MyfyEY,9414
|
|
66
67
|
workbench/core/cloud_platform/aws/aws_meta.py,sha256=eY9Pn6pl2yAyseACFb2nitR-0vLwG4i8CSEXe8Iaswc,34778
|
|
67
|
-
workbench/core/cloud_platform/aws/aws_parameter_store.py,sha256=9ekuMOQFHFMIEV68UbHhS_fLB9iqG5Hvu4EV6iamEpk,10400
|
|
68
68
|
workbench/core/cloud_platform/aws/aws_secrets_manager.py,sha256=TUnddp1gX-OwxJ_oO5ONh7OI4Z2HC_6euGkJ-himCCk,8615
|
|
69
69
|
workbench/core/cloud_platform/aws/aws_session.py,sha256=2Gc_k4Q87BBeQDgXgVR-w-qmsF6ncZR8wvTeNnixM6k,6926
|
|
70
70
|
workbench/core/cloud_platform/aws/cache_dataframe.py,sha256=VnObkVqcjg7v4fegrIkXR1j-K2AHTBpSAoriUXDe12A,2314
|
|
@@ -102,14 +102,14 @@ workbench/core/transforms/features_to_features/__init__.py,sha256=47DEQpj8HBSa-_
|
|
|
102
102
|
workbench/core/transforms/features_to_features/heavy/emr/Readme.md,sha256=YtQgCEQeKe0CQXQkhzMTYq9xOtCsCYb5P5LW2BmRKWQ,68
|
|
103
103
|
workbench/core/transforms/features_to_features/heavy/glue/Readme.md,sha256=TuyCatWfoDr99zUwvOcxf-TqMkQzaMqXlj5nmFcRzfo,48
|
|
104
104
|
workbench/core/transforms/features_to_model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
105
|
-
workbench/core/transforms/features_to_model/features_to_model.py,sha256=
|
|
105
|
+
workbench/core/transforms/features_to_model/features_to_model.py,sha256=JdKKz3eKrKhicA1WxTfmb1IqQNCdHJE0CKDs66bLHYU,21071
|
|
106
106
|
workbench/core/transforms/model_to_endpoint/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
107
107
|
workbench/core/transforms/model_to_endpoint/model_to_endpoint.py,sha256=TIYXvuK0s383PwJ4iS6fCRhuif6oIxsoWb4CpMGJjY4,6358
|
|
108
108
|
workbench/core/transforms/pandas_transforms/__init__.py,sha256=xL4MT8-fZ1SFqDbTLc8XyxjupHtB1YR6Ej0AC2nwd7I,894
|
|
109
109
|
workbench/core/transforms/pandas_transforms/data_to_pandas.py,sha256=sJHPeuNF8Q8aQqgRnkdWkyvur5cbggdUVIwR-xF3Dlo,3621
|
|
110
110
|
workbench/core/transforms/pandas_transforms/features_to_pandas.py,sha256=af6xdPt2V4zhh-SzQa_UYxdmNMzMLXbrbsznV5QoIJg,3441
|
|
111
111
|
workbench/core/transforms/pandas_transforms/pandas_to_data.py,sha256=cqo6hQmzUGUFACvNuVLZQdgrlXrQIu4NjqK-ujPmoIc,9181
|
|
112
|
-
workbench/core/transforms/pandas_transforms/pandas_to_features.py,sha256=
|
|
112
|
+
workbench/core/transforms/pandas_transforms/pandas_to_features.py,sha256=AqXS4ZND7lg94enclRP9wGBrYm4AmhL3c--q0o-6_JM,21972
|
|
113
113
|
workbench/core/transforms/pandas_transforms/pandas_to_features_chunked.py,sha256=0R8mQlWfbIlTVmYUmrtu2gsw0AE815k6kqPgpd0bmyQ,4422
|
|
114
114
|
workbench/core/views/__init__.py,sha256=UZJMAJBCMVM3uSYmnFg8c2LWtdu9-479WNAdVMIohAc,962
|
|
115
115
|
workbench/core/views/column_subset_view.py,sha256=vGDKTTGrPIY-IFOeWvudJrhKiq0OjWDp5rTuuj-X40U,4261
|
|
@@ -119,54 +119,58 @@ workbench/core/views/display_view.py,sha256=9K4O77ZnKOh93aMRhxcQJQ1lqScLhuJnU_tH
|
|
|
119
119
|
workbench/core/views/inference_view.py,sha256=9s70M0dFdGq0tWvzMZfgUK7EPKtuvcQhux0uyRZuuLM,3293
|
|
120
120
|
workbench/core/views/pandas_to_view.py,sha256=20uCsnG2iMh-U1VxqVUUtnrWAY98SeuHjmfJK_wcq1I,6422
|
|
121
121
|
workbench/core/views/training_view.py,sha256=7HwhbQhDBhT3Zo_gssS-b4eueJ0h9nqqT8YGFSuaEcU,9016
|
|
122
|
-
workbench/core/views/view.py,sha256=
|
|
122
|
+
workbench/core/views/view.py,sha256=DvmEA1xdvL980GET_cnbmHzqSy6IhlNaZcoQnVTtYis,13534
|
|
123
123
|
workbench/core/views/view_utils.py,sha256=CwOlpqXpumCr6REi-ey7Qjz5_tpg-s4oWHmlOVu8POQ,12270
|
|
124
124
|
workbench/core/views/storage/mdq_view.py,sha256=qf_ep1KwaXOIfO930laEwNIiCYP7VNOqjE3VdHfopRE,5195
|
|
125
|
-
workbench/model_scripts/script_generation.py,sha256=
|
|
126
|
-
workbench/model_scripts/
|
|
127
|
-
workbench/model_scripts/
|
|
125
|
+
workbench/model_scripts/script_generation.py,sha256=_AhzM2qzjBuI7pIaXBRZ1YOOs2lwsKQGVM_ovL6T1bo,8135
|
|
126
|
+
workbench/model_scripts/chemprop/chemprop.template,sha256=NR1jMb-IPxBAaQ-KiPR09ylL_gTIC35lZwBpBQPtzig,38109
|
|
127
|
+
workbench/model_scripts/chemprop/generated_model_script.py,sha256=Cxfbu7mNf_HLBCzlsOOXR1u1Y-eHMma63YWM9l8ku44,38206
|
|
128
|
+
workbench/model_scripts/chemprop/requirements.txt,sha256=PIuUdPAeDUH3I2M_5nIrCnCfs3FL1l9V5kzHqgCcu7s,281
|
|
128
129
|
workbench/model_scripts/custom_models/chem_info/Readme.md,sha256=mH1lxJ4Pb7F5nBnVXaiuxpi8zS_yjUw_LBJepVKXhlA,574
|
|
130
|
+
workbench/model_scripts/custom_models/chem_info/fingerprints.py,sha256=Qvs8jaUwguWUq3Q3j695MY0t0Wk3BvroW-oWBwalMUo,5255
|
|
129
131
|
workbench/model_scripts/custom_models/chem_info/mol_descriptors.py,sha256=c8gkHZ-8s3HJaW9zN9pnYGK7YVW8Y0xFqQ1G_ysrF2Y,18789
|
|
130
132
|
workbench/model_scripts/custom_models/chem_info/mol_standardize.py,sha256=qPLCdVMSXMOWN-01O1isg2zq7eQyFAI0SNatHkRq1uw,17524
|
|
131
133
|
workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py,sha256=xljMjdfh4Idi4v1Afq1zZxvF1SDa7pDOLSAhvGBEj88,2891
|
|
132
|
-
workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py,sha256=
|
|
134
|
+
workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py,sha256=LqVh_AHObo0uxHt_uNmeemScTLjM2j9C3I_QFJXdmUI,3232
|
|
133
135
|
workbench/model_scripts/custom_models/chem_info/requirements.txt,sha256=7HBUzvNiM8lOir-UfQabXYlUp3gxdGJ42u18EuSMGjc,39
|
|
134
136
|
workbench/model_scripts/custom_models/meta_endpoints/example.py,sha256=hzOAuLhIGB8vei-555ruNxpsE1GhuByHGjGB0zw8GSs,1726
|
|
135
137
|
workbench/model_scripts/custom_models/network_security/Readme.md,sha256=Z2gtiu0hLHvEJ1x-_oFq3qJZcsK81sceBAGAGltpqQ8,222
|
|
136
138
|
workbench/model_scripts/custom_models/proximity/Readme.md,sha256=RlMFAJZgAT2mCgDk-UwR_R0Y_NbCqeI5-8DUsxsbpWQ,289
|
|
137
139
|
workbench/model_scripts/custom_models/proximity/feature_space_proximity.template,sha256=eOllmqB20BWtTiV53dgpIqXKtgSbPFDW_zf8PvM3oF0,4813
|
|
138
|
-
workbench/model_scripts/custom_models/proximity/proximity.py,sha256=
|
|
140
|
+
workbench/model_scripts/custom_models/proximity/proximity.py,sha256=dPTYD1N-JTIqg6iL7ak_JSouaCdfmBPjG08IRRvTLXU,15836
|
|
139
141
|
workbench/model_scripts/custom_models/proximity/requirements.txt,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
140
142
|
workbench/model_scripts/custom_models/uq_models/Readme.md,sha256=UVpL-lvtTrLqwBeQFinLhd_uNrEw4JUlggIdUSDrd-w,188
|
|
141
143
|
workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template,sha256=ca3CaAk6HVuNv1HnPgABTzRY3oDrRxomjgD4V1ZDwoc,6448
|
|
142
|
-
workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template,sha256=
|
|
144
|
+
workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template,sha256=449Enh4-7RrMrxt1oS_SHJHGV8yYcFlWHsLrCVTFQGI,13778
|
|
143
145
|
workbench/model_scripts/custom_models/uq_models/gaussian_process.template,sha256=3nMlCi8nEbc4N-MQTzjfIcljfDQkUmWeLBfmd18m5fg,6632
|
|
144
|
-
workbench/model_scripts/custom_models/uq_models/meta_uq.template,sha256=
|
|
145
|
-
workbench/model_scripts/custom_models/uq_models/ngboost.template,sha256=
|
|
146
|
-
workbench/model_scripts/custom_models/uq_models/proximity.py,sha256=
|
|
146
|
+
workbench/model_scripts/custom_models/uq_models/meta_uq.template,sha256=RIC90o9iI37ylOOJBUVDVF2FmYs9kJl8AifL-AYIwAI,14282
|
|
147
|
+
workbench/model_scripts/custom_models/uq_models/ngboost.template,sha256=_ukYcsL4pnWvFV1oA89_wfVpxWbvoEx6MGwKxc38kSI,8512
|
|
148
|
+
workbench/model_scripts/custom_models/uq_models/proximity.py,sha256=dPTYD1N-JTIqg6iL7ak_JSouaCdfmBPjG08IRRvTLXU,15836
|
|
147
149
|
workbench/model_scripts/custom_models/uq_models/requirements.txt,sha256=fw7T7t_YJAXK3T6Ysbesxh_Agx_tv0oYx72cEBTqRDY,98
|
|
148
150
|
workbench/model_scripts/custom_script_example/custom_model_script.py,sha256=T8aydawgRVAdSlDimoWpXxG2YuWWQkbcjBVjAeSG2_0,6408
|
|
149
151
|
workbench/model_scripts/custom_script_example/requirements.txt,sha256=jWlGc7HH7vqyukTm38LN4EyDi8jDUPEay4n45z-30uc,104
|
|
150
|
-
workbench/model_scripts/ensemble_xgb/ensemble_xgb.template,sha256=
|
|
152
|
+
workbench/model_scripts/ensemble_xgb/ensemble_xgb.template,sha256=lMEx0IkawcpTI52gSjCp1Wr0g2vWd4kIGuIqjXhA2GA,10671
|
|
151
153
|
workbench/model_scripts/ensemble_xgb/requirements.txt,sha256=jWlGc7HH7vqyukTm38LN4EyDi8jDUPEay4n45z-30uc,104
|
|
152
|
-
workbench/model_scripts/pytorch_model/generated_model_script.py,sha256=
|
|
153
|
-
workbench/model_scripts/pytorch_model/pytorch.template,sha256=
|
|
154
|
+
workbench/model_scripts/pytorch_model/generated_model_script.py,sha256=nst6kRN8T_LmmDANAaFYSC9GdGQtrDYdVBs4mU1RJ-U,32883
|
|
155
|
+
workbench/model_scripts/pytorch_model/pytorch.template,sha256=PFmGO_jP8S6RKvAzAXiuogkVXYTb5MKajJk_57qQDcc,30718
|
|
154
156
|
workbench/model_scripts/pytorch_model/requirements.txt,sha256=ICS5nW0wix44EJO2tJszJSaUrSvhSfdedn6FcRInGx4,181
|
|
157
|
+
workbench/model_scripts/scikit_learn/generated_model_script.py,sha256=xhQIglpAgPRCH9iwI3wI0N0V6p9AgqW0mVOMuSXzUCk,17187
|
|
155
158
|
workbench/model_scripts/scikit_learn/requirements.txt,sha256=aVvwiJ3LgBUhM_PyFlb2gHXu_kpGPho3ANBzlOkfcvs,107
|
|
156
159
|
workbench/model_scripts/scikit_learn/scikit_learn.template,sha256=QQvqx-eX9ZTbYmyupq6R6vIQwosmsmY_MRBPaHyfjdk,12586
|
|
157
|
-
workbench/model_scripts/uq_models/generated_model_script.py,sha256=
|
|
158
|
-
workbench/model_scripts/uq_models/mapie.template,sha256=
|
|
160
|
+
workbench/model_scripts/uq_models/generated_model_script.py,sha256=caAXcK03XQQcPo2rvFJtZqnwQpLAz7v0CQWBWDO2Dts,27866
|
|
161
|
+
workbench/model_scripts/uq_models/mapie.template,sha256=on3I40D7zyNfvfqBf5k8VXCFtmepcxKmqVWCH5Q9S84,23432
|
|
159
162
|
workbench/model_scripts/uq_models/requirements.txt,sha256=fw7T7t_YJAXK3T6Ysbesxh_Agx_tv0oYx72cEBTqRDY,98
|
|
160
|
-
workbench/model_scripts/xgb_model/generated_model_script.py,sha256=
|
|
163
|
+
workbench/model_scripts/xgb_model/generated_model_script.py,sha256=qUGg5R-boaswzXtgKp_J7JPxFzMdRNv51QeF-lMWL-4,19334
|
|
161
164
|
workbench/model_scripts/xgb_model/requirements.txt,sha256=jWlGc7HH7vqyukTm38LN4EyDi8jDUPEay4n45z-30uc,104
|
|
162
|
-
workbench/model_scripts/xgb_model/xgb_model.template,sha256=
|
|
165
|
+
workbench/model_scripts/xgb_model/xgb_model.template,sha256=gOXHsymCZjde6L2LvrlTtMRprJ-mXczpE4ZB8mhZZ0s,19168
|
|
163
166
|
workbench/repl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
164
|
-
workbench/repl/workbench_shell.py,sha256=
|
|
165
|
-
workbench/resources/open_source_api.key,sha256=
|
|
167
|
+
workbench/repl/workbench_shell.py,sha256=__FOnBqe3I6Luzb-N9mAecOUfcPEkctzxBfJSKTqDDA,22504
|
|
168
|
+
workbench/resources/open_source_api.key,sha256=vi9099CjkNnZ1IXB6AQWcG83iFYn2db0iTfTlpGVA1o,432
|
|
166
169
|
workbench/resources/signature_verify_pub.pem,sha256=V3-u-3_z2PH-805ybkKvzDOBwAbvHxcKn0jLBImEtzM,272
|
|
167
170
|
workbench/scripts/check_double_bond_stereo.py,sha256=p5hnL54Weq77ES0HCELq9JeoM-PyUGkvVSeWYF2dKyo,7776
|
|
171
|
+
workbench/scripts/endpoint_test.py,sha256=G4GdQMa7KlKX7WiUSFX_OHAzDdCyf8ZbVYbZBkAPiSo,5339
|
|
168
172
|
workbench/scripts/glue_launcher.py,sha256=bIKQvfGxpAhzbeNvTnHfRW_5kQhY-169_868ZnCejJk,10692
|
|
169
|
-
workbench/scripts/
|
|
173
|
+
workbench/scripts/lambda_test.py,sha256=SLAPIXeGQn82neQ6-Hif3VS3LWLwT0-dGw8yWw2aXRQ,2077
|
|
170
174
|
workbench/scripts/ml_pipeline_batch.py,sha256=1T5JnLlUJR7bwAGBLHmLPOuj1xFRqVIQX8PsuDhHy8o,4907
|
|
171
175
|
workbench/scripts/ml_pipeline_sqs.py,sha256=5c8qX-SoV4htOUcSXk4OzD7BQskCnaA7cLMiF4Et24c,6666
|
|
172
176
|
workbench/scripts/monitor_cloud_watch.py,sha256=s7MY4bsHts0nup9G0lWESCvgJZ9Mw1Eo-c8aKRgLjMw,9235
|
|
@@ -198,6 +202,7 @@ workbench/utils/athena_utils.py,sha256=DDyLhJujzh1PfejtGU7ZzOf5hLPOgoXmi4Lrn-_AJ
|
|
|
198
202
|
workbench/utils/aws_utils.py,sha256=x8c_WxtdSKmBqNg8P_Z6K2m4AsSMEiD_kh2nVaUZ28c,22077
|
|
199
203
|
workbench/utils/bulk_utils.py,sha256=s1lYN2Uk536MNGetekLYL_VL0N34hUjk1FX9BAz3Qu0,1182
|
|
200
204
|
workbench/utils/cache.py,sha256=0R5RXYEz_XHARK3anmQC4VRMawMks_cJ8S4vwC2roAE,5524
|
|
205
|
+
workbench/utils/chemprop_utils.py,sha256=0eszF9K2DYB5bOxbWSomr9SuX3QANdF7ROmWa0tikzY,28805
|
|
201
206
|
workbench/utils/cloudwatch_handler.py,sha256=t0L280Qa1nMq95dwnf8lB5g8FHrQAyGY5S4JwP3yIa8,5165
|
|
202
207
|
workbench/utils/cloudwatch_utils.py,sha256=wXSqKcJlSnHyC0D6d4RsH8wwmx_0CsffcetUgXlZ_78,4828
|
|
203
208
|
workbench/utils/color_utils.py,sha256=TmDGLK44t975lkfjt_1O-ee02QxrKfke7vPuXb-V-Uo,11779
|
|
@@ -226,6 +231,7 @@ workbench/utils/pipeline_utils.py,sha256=yzR5tgAzz6zNqvxzZR6YqsbS7r3QDKzBXozaM_A
|
|
|
226
231
|
workbench/utils/plot_utils.py,sha256=yFveic-4aY7lKT-CPhYdbIkBr-mZqjbhaRmCySWG_kE,6537
|
|
227
232
|
workbench/utils/plugin_manager.py,sha256=JWfyFHQih_J_MMtAT1cgjGVnNVPk9bM917LkfH8Z-_A,13873
|
|
228
233
|
workbench/utils/prox_utils.py,sha256=V0YSxI6lboZl8Bed1GUobFqfMhfpehn2FtgqHpkuhDQ,6170
|
|
234
|
+
workbench/utils/pytorch_utils.py,sha256=ig91xlAaWaCp06N4Ml2yoteDQGMJkAfysktbFEImNII,20260
|
|
229
235
|
workbench/utils/redis_cache.py,sha256=39LFSWmOlNNcah02D3sBnmibc-DPeKC3SNq71K4HaB4,12893
|
|
230
236
|
workbench/utils/repl_utils.py,sha256=rWOMv2HiEIp8ZL6Ps6DlwiJlGr-pOhv9OZQhm3aR-1A,4668
|
|
231
237
|
workbench/utils/s3_utils.py,sha256=Xme_o_cftC_jWnw6R9YKS6-6C11zaCBAoQDlY3dZb5o,7337
|
|
@@ -240,7 +246,7 @@ workbench/utils/workbench_cache.py,sha256=IQchxB81iR4eVggHBxUJdXxUCRkqWz1jKe5gxN
|
|
|
240
246
|
workbench/utils/workbench_event_bridge.py,sha256=z1GmXOB-Qs7VOgC6Hjnp2DI9nSEWepaSXejACxTIR7o,4150
|
|
241
247
|
workbench/utils/workbench_logging.py,sha256=WCuMWhQwibrvcGAyj96h2wowh6dH7zNlDJ7sWUzdCeI,10263
|
|
242
248
|
workbench/utils/workbench_sqs.py,sha256=RwM80z7YWwdtMaCKh7KWF8v38f7eBRU7kyC7ZhTRuI0,2072
|
|
243
|
-
workbench/utils/xgboost_model_utils.py,sha256=
|
|
249
|
+
workbench/utils/xgboost_model_utils.py,sha256=Zs3nTqZRDm2rbziuFVg5XzYyjf6TwBUltqmb0PmP4H8,25046
|
|
244
250
|
workbench/utils/chem_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
245
251
|
workbench/utils/chem_utils/fingerprints.py,sha256=Qvs8jaUwguWUq3Q3j695MY0t0Wk3BvroW-oWBwalMUo,5255
|
|
246
252
|
workbench/utils/chem_utils/misc.py,sha256=Nevf8_opu-uIPrv_1_0ubuFVVo2_fGUkMoLAHB3XAeo,7372
|
|
@@ -256,7 +262,7 @@ workbench/web_interface/components/component_interface.py,sha256=QCPWqiZLkVsAEzQ
|
|
|
256
262
|
workbench/web_interface/components/correlation_matrix.py,sha256=Lv4vRta5-TdxBsu0G8Ea7hyyR3XyPes-k5AfL6qZWEc,6376
|
|
257
263
|
workbench/web_interface/components/data_details_markdown.py,sha256=axDs6eXniglBmvFwIKjpJ5oyT-3D4FO9IcfA_cl-EJ8,9706
|
|
258
264
|
workbench/web_interface/components/endpoint_metric_plots.py,sha256=H0cXuj9UQrrh_2JvRHtq7O8pMXFXKs7o9XpzySENylw,3441
|
|
259
|
-
workbench/web_interface/components/model_plot.py,sha256=
|
|
265
|
+
workbench/web_interface/components/model_plot.py,sha256=9KSILXvq1L_DUZszj5ozWwi43jEtJlpWdqSs3mXBPeQ,2774
|
|
260
266
|
workbench/web_interface/components/plugin_interface.py,sha256=jGRq4igUTVXUT4sDqqsKKI2yjilV0ORNBQq6CjEWE84,9563
|
|
261
267
|
workbench/web_interface/components/plugin_unit_test.py,sha256=Lx3HhIMHzrwDUYs2bADSFYzQq3sFHS9RyA415hyUOdc,7747
|
|
262
268
|
workbench/web_interface/components/regression_plot.py,sha256=k18Bd0fcH7ig6kL5GqC_dINci3_YLle_fSEM32zXtzY,3342
|
|
@@ -285,9 +291,9 @@ workbench/web_interface/page_views/main_page.py,sha256=X4-KyGTKLAdxR-Zk2niuLJB2Y
|
|
|
285
291
|
workbench/web_interface/page_views/models_page_view.py,sha256=M0bdC7bAzLyIaE2jviY12FF4abdMFZmg6sFuOY_LaGI,2650
|
|
286
292
|
workbench/web_interface/page_views/page_view.py,sha256=Gh6YnpOGlUejx-bHZAf5pzqoQ1H1R0OSwOpGhOBO06w,455
|
|
287
293
|
workbench/web_interface/page_views/pipelines_page_view.py,sha256=v2pxrIbsHBcYiblfius3JK766NZ7ciD2yPx0t3E5IJo,2656
|
|
288
|
-
workbench-0.8.
|
|
289
|
-
workbench-0.8.
|
|
290
|
-
workbench-0.8.
|
|
291
|
-
workbench-0.8.
|
|
292
|
-
workbench-0.8.
|
|
293
|
-
workbench-0.8.
|
|
294
|
+
workbench-0.8.203.dist-info/licenses/LICENSE,sha256=RTBoTMeEwTgEhS-n8vgQ-VUo5qig0PWVd8xFPKU6Lck,1080
|
|
295
|
+
workbench-0.8.203.dist-info/METADATA,sha256=qC58O-dE5_EMFpEJWDa9fyPSNwRt-n6K7krsrsJP13I,10500
|
|
296
|
+
workbench-0.8.203.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
297
|
+
workbench-0.8.203.dist-info/entry_points.txt,sha256=j02NCuno2Y_BuE4jEvw-IL73WZ9lkTpLwom29uKcLCw,458
|
|
298
|
+
workbench-0.8.203.dist-info/top_level.txt,sha256=Dhy72zTxaA_o_yRkPZx5zw-fwumnjGaeGf0hBN3jc_w,10
|
|
299
|
+
workbench-0.8.203.dist-info/RECORD,,
|
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
[console_scripts]
|
|
2
2
|
cloud_watch = workbench.scripts.monitor_cloud_watch:main
|
|
3
|
+
endpoint_test = workbench.scripts.endpoint_test:main
|
|
3
4
|
glue_launcher = workbench.scripts.glue_launcher:main
|
|
4
|
-
|
|
5
|
+
lambda_test = workbench.scripts.lambda_test:main
|
|
5
6
|
ml_pipeline_batch = workbench.scripts.ml_pipeline_batch:main
|
|
6
7
|
ml_pipeline_sqs = workbench.scripts.ml_pipeline_sqs:main
|
|
7
8
|
workbench = workbench.repl.workbench_shell:launch_shell
|
|
@@ -1,404 +0,0 @@
|
|
|
1
|
-
"""AWSDFStore: Fast/efficient storage of DataFrames using AWS S3/Parquet/Snappy"""
|
|
2
|
-
|
|
3
|
-
from datetime import datetime
|
|
4
|
-
from typing import Union
|
|
5
|
-
import logging
|
|
6
|
-
import awswrangler as wr
|
|
7
|
-
import pandas as pd
|
|
8
|
-
import re
|
|
9
|
-
from urllib.parse import urlparse
|
|
10
|
-
|
|
11
|
-
# Workbench Imports
|
|
12
|
-
from workbench.core.cloud_platform.aws.aws_account_clamp import AWSAccountClamp
|
|
13
|
-
from workbench.utils.config_manager import ConfigManager
|
|
14
|
-
from workbench.utils.aws_utils import not_found_returns_none
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
class AWSDFStore:
|
|
18
|
-
"""AWSDFStore: Fast/efficient storage of DataFrames using AWS S3/Parquet/Snappy
|
|
19
|
-
|
|
20
|
-
Common Usage:
|
|
21
|
-
```python
|
|
22
|
-
df_store = AWSDFStore()
|
|
23
|
-
|
|
24
|
-
# List Data
|
|
25
|
-
df_store.list()
|
|
26
|
-
|
|
27
|
-
# Add DataFrame
|
|
28
|
-
df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
|
|
29
|
-
df_store.upsert("/test/my_data", df)
|
|
30
|
-
|
|
31
|
-
# Retrieve DataFrame
|
|
32
|
-
df = df_store.get("/test/my_data")
|
|
33
|
-
print(df)
|
|
34
|
-
|
|
35
|
-
# Delete Data
|
|
36
|
-
df_store.delete("/test/my_data")
|
|
37
|
-
```
|
|
38
|
-
"""
|
|
39
|
-
|
|
40
|
-
def __init__(self, path_prefix: Union[str, None] = None):
|
|
41
|
-
"""AWSDFStore Init Method
|
|
42
|
-
|
|
43
|
-
Args:
|
|
44
|
-
path_prefix (Union[str, None], optional): Path prefix for storage locations (Defaults to None)
|
|
45
|
-
"""
|
|
46
|
-
self.log = logging.getLogger("workbench")
|
|
47
|
-
self._base_prefix = "df_store/"
|
|
48
|
-
self.path_prefix = self._base_prefix + path_prefix if path_prefix else self._base_prefix
|
|
49
|
-
self.path_prefix = re.sub(r"/+", "/", self.path_prefix) # Collapse slashes
|
|
50
|
-
|
|
51
|
-
# Get the Workbench Bucket
|
|
52
|
-
config = ConfigManager()
|
|
53
|
-
self.workbench_bucket = config.get_config("WORKBENCH_BUCKET")
|
|
54
|
-
|
|
55
|
-
# Get the S3 Client
|
|
56
|
-
self.boto3_session = AWSAccountClamp().boto3_session
|
|
57
|
-
self.s3_client = self.boto3_session.client("s3")
|
|
58
|
-
|
|
59
|
-
def list(self, include_cache: bool = False) -> list:
|
|
60
|
-
"""List all objects in the data_store prefix
|
|
61
|
-
|
|
62
|
-
Args:
|
|
63
|
-
include_cache (bool, optional): Include cache objects in the list (Defaults to False)
|
|
64
|
-
|
|
65
|
-
Returns:
|
|
66
|
-
list: A list of all the objects in the data_store prefix.
|
|
67
|
-
"""
|
|
68
|
-
df = self.summary(include_cache=include_cache)
|
|
69
|
-
return df["location"].tolist()
|
|
70
|
-
|
|
71
|
-
def last_modified(self, location: str) -> Union[datetime, None]:
|
|
72
|
-
"""Return the last modified date of a graph.
|
|
73
|
-
|
|
74
|
-
Args:
|
|
75
|
-
location (str): Logical location of the graph.
|
|
76
|
-
|
|
77
|
-
Returns:
|
|
78
|
-
Union[datetime, None]: Last modified datetime or None if not found.
|
|
79
|
-
"""
|
|
80
|
-
s3_uri = self._generate_s3_uri(location)
|
|
81
|
-
bucket, key = self._parse_s3_uri(s3_uri)
|
|
82
|
-
|
|
83
|
-
try:
|
|
84
|
-
response = self.s3_client.head_object(Bucket=bucket, Key=key)
|
|
85
|
-
return response["LastModified"]
|
|
86
|
-
except self.s3_client.exceptions.ClientError:
|
|
87
|
-
return None
|
|
88
|
-
|
|
89
|
-
def summary(self, include_cache: bool = False) -> pd.DataFrame:
|
|
90
|
-
"""Return a nicely formatted summary of object locations, sizes (in MB), and modified dates.
|
|
91
|
-
|
|
92
|
-
Args:
|
|
93
|
-
include_cache (bool, optional): Include cache objects in the summary (Defaults to False)
|
|
94
|
-
"""
|
|
95
|
-
df = self.details(include_cache=include_cache)
|
|
96
|
-
|
|
97
|
-
# Create a formatted DataFrame
|
|
98
|
-
formatted_df = pd.DataFrame(
|
|
99
|
-
{
|
|
100
|
-
"location": df["location"],
|
|
101
|
-
"size (MB)": (df["size"] / (1024 * 1024)).round(2), # Convert size to MB
|
|
102
|
-
"modified": pd.to_datetime(df["modified"]).dt.strftime("%Y-%m-%d %H:%M:%S"), # Format date
|
|
103
|
-
}
|
|
104
|
-
)
|
|
105
|
-
return formatted_df
|
|
106
|
-
|
|
107
|
-
def details(self, include_cache: bool = False) -> pd.DataFrame:
|
|
108
|
-
"""Return detailed metadata for all objects, optionally excluding the specified prefix.
|
|
109
|
-
|
|
110
|
-
Args:
|
|
111
|
-
include_cache (bool, optional): Include cache objects in the details (Defaults to False)
|
|
112
|
-
"""
|
|
113
|
-
try:
|
|
114
|
-
response = self.s3_client.list_objects_v2(Bucket=self.workbench_bucket, Prefix=self.path_prefix)
|
|
115
|
-
if "Contents" not in response:
|
|
116
|
-
return pd.DataFrame(columns=["location", "s3_file", "size", "modified"])
|
|
117
|
-
|
|
118
|
-
# Collect details for each object
|
|
119
|
-
data = []
|
|
120
|
-
for obj in response["Contents"]:
|
|
121
|
-
full_key = obj["Key"]
|
|
122
|
-
|
|
123
|
-
# Reverse logic: Strip the bucket/prefix in the front and .parquet in the end
|
|
124
|
-
location = full_key.replace(f"{self.path_prefix}", "/").split(".parquet")[0]
|
|
125
|
-
s3_file = f"s3://{self.workbench_bucket}/{full_key}"
|
|
126
|
-
size = obj["Size"]
|
|
127
|
-
modified = obj["LastModified"]
|
|
128
|
-
data.append([location, s3_file, size, modified])
|
|
129
|
-
|
|
130
|
-
# Create the DataFrame
|
|
131
|
-
df = pd.DataFrame(data, columns=["location", "s3_file", "size", "modified"])
|
|
132
|
-
|
|
133
|
-
# Apply the exclude_prefix filter if set
|
|
134
|
-
cache_prefix = "/workbench/dataframe_cache/"
|
|
135
|
-
if not include_cache:
|
|
136
|
-
df = df[~df["location"].str.startswith(cache_prefix)]
|
|
137
|
-
|
|
138
|
-
return df
|
|
139
|
-
|
|
140
|
-
except Exception as e:
|
|
141
|
-
self.log.error(f"Failed to get object details: {e}")
|
|
142
|
-
return pd.DataFrame(columns=["location", "s3_file", "size", "created", "modified"])
|
|
143
|
-
|
|
144
|
-
def check(self, location: str) -> bool:
    """Report whether any object is stored for the given location.

    Args:
        location (str): The location of the data to check.

    Returns:
        bool: True if the data exists, False otherwise.
    """
    # Build "<path_prefix>/<location>.parquet/" and squash any repeated slashes
    target_prefix = re.sub(r"/+", "/", f"{self.path_prefix}/{location}.parquet/")

    # A single key under that prefix is enough to prove existence
    listing = self.s3_client.list_objects_v2(
        Bucket=self.workbench_bucket,
        Prefix=target_prefix,
        MaxKeys=1,
    )
    return "Contents" in listing
|
|
160
|
-
|
|
161
|
-
@not_found_returns_none
def get(self, location: str) -> Union[pd.DataFrame, None]:
    """Load the DataFrame stored at a location in AWS S3.

    Args:
        location (str): The location of the data to retrieve.

    Returns:
        pd.DataFrame: The retrieved DataFrame or None if not found.
    """
    # The "not found" -> None behaviour is supplied by the decorator
    return wr.s3.read_parquet(self._generate_s3_uri(location))
|
|
173
|
-
|
|
174
|
-
def upsert(self, location: str, data: Union[pd.DataFrame, pd.Series]):
    """Store a DataFrame or Series in AWS S3, replacing whatever is already there.

    Args:
        location (str): The location of the data.
        data (Union[pd.DataFrame, pd.Series]): The data to be stored.

    Raises:
        ValueError: If ``data`` is neither a DataFrame nor a Series.
        Exception: Any error raised while writing is logged and re-raised.
    """
    # Normalise the input to a DataFrame; anything else is rejected
    if isinstance(data, pd.Series):
        frame = data.to_frame()
    elif isinstance(data, pd.DataFrame):
        frame = data
    else:
        raise ValueError("Only Pandas DataFrame or Series objects are supported.")

    # Object columns become strings so PyArrow does not have to infer their type
    frame = self.type_convert_before_parquet(frame)

    # Write (overwrite) the parquet dataset at the target URI
    target_uri = self._generate_s3_uri(location)
    try:
        wr.s3.to_parquet(df=frame, path=target_uri, dataset=True, mode="overwrite", index=True)
        self.log.info(f"Dataframe cached {target_uri}...")
    except Exception as err:
        self.log.error(f"Failed to cache dataframe '{target_uri}': {err}")
        raise
|
|
200
|
-
|
|
201
|
-
@staticmethod
|
|
202
|
-
def type_convert_before_parquet(df: pd.DataFrame) -> pd.DataFrame:
|
|
203
|
-
# Convert object columns to string type to avoid PyArrow type inference issues.
|
|
204
|
-
df = df.copy()
|
|
205
|
-
object_cols = df.select_dtypes(include=["object"]).columns
|
|
206
|
-
df[object_cols] = df[object_cols].astype("str")
|
|
207
|
-
return df
|
|
208
|
-
|
|
209
|
-
def delete(self, location: str):
    """Delete a DataFrame from the AWS S3.

    Failures while deleting are logged rather than raised.

    Args:
        location (str): The location of the data to delete.
    """
    # upsert() writes with dataset=True, so the objects live *under*
    # "<location>.parquet/". Terminating the prefix with "/" (as check() does)
    # stops the prefix match from also catching sibling datasets whose key
    # merely starts with "<location>.parquet".
    s3_uri = f"{self._generate_s3_uri(location)}/"

    # Check if the folder (prefix) exists in S3
    if not wr.s3.list_objects(s3_uri):
        self.log.info(f"Data '{location}' does not exist in S3...")
        return

    # Delete the data from S3
    try:
        wr.s3.delete_objects(s3_uri)
        self.log.info(f"Data '{location}' deleted successfully from S3.")
    except Exception as e:
        self.log.error(f"Failed to delete data '{location}': {e}")
|
|
228
|
-
|
|
229
|
-
def delete_recursive(self, location: str):
    """Recursively delete all data under the specified location in AWS S3.

    Failures are logged rather than raised, including per-key failures
    reported by S3 in an otherwise successful delete response.

    Args:
        location (str): The location prefix of the data to delete.
    """
    # Construct the full prefix for S3
    s3_prefix = re.sub(r"/+", "/", f"{self.path_prefix}/{location}")  # Collapse slashes
    s3_prefix = s3_prefix.rstrip("/") + "/"  # Ensure the prefix ends with a slash

    try:
        # Gather every key under the prefix; list_objects_v2 returns the listing
        # in pages, so follow the continuation token until it is exhausted
        keys = []
        list_kwargs = {"Bucket": self.workbench_bucket, "Prefix": s3_prefix}
        while True:
            listing = self.s3_client.list_objects_v2(**list_kwargs)
            keys.extend({"Key": obj["Key"]} for obj in listing.get("Contents", []))
            if not listing.get("IsTruncated"):
                break
            list_kwargs["ContinuationToken"] = listing["NextContinuationToken"]

        if not keys:
            self.log.info(f"No data found under '{s3_prefix}' to delete.")
            return

        # delete_objects accepts a bounded number of keys per request, so send batches
        batch_size = 1000
        for start in range(0, len(keys), batch_size):
            batch = keys[start:start + batch_size]
            result = self.s3_client.delete_objects(Bucket=self.workbench_bucket, Delete={"Objects": batch})
            for deleted in result.get("Deleted", []):
                self.log.info(f"Deleted: {deleted['Key']}")
            # A successful call can still report keys that could not be removed
            for failure in result.get("Errors", []):
                self.log.error(f"Failed to delete '{failure.get('Key')}': {failure.get('Message')}")

    except Exception as e:
        self.log.error(f"Failed to delete data recursively at '{location}': {e}")
|
|
254
|
-
|
|
255
|
-
def list_subfiles(self, prefix: str) -> list:
    """Return a list of file locations with the given prefix.

    Args:
        prefix (str): Only include files with the given prefix

    Returns:
        list: List of file locations (paths), one entry per S3 object found.
            An empty list is returned when nothing matches or when the listing
            fails (the failure is logged, not raised).
    """
    try:
        full_prefix = f"{self.path_prefix}{prefix.lstrip('/')}"

        # list_objects_v2 returns the listing in pages, so follow the
        # continuation token until every matching key has been seen
        contents = []
        list_kwargs = {"Bucket": self.workbench_bucket, "Prefix": full_prefix}
        while True:
            response = self.s3_client.list_objects_v2(**list_kwargs)
            contents.extend(response.get("Contents", []))
            if not response.get("IsTruncated"):
                break
            list_kwargs["ContinuationToken"] = response["NextContinuationToken"]

        # Swap the leading path prefix for "/" (first occurrence only) and drop
        # everything from ".parquet" onwards to recover the location
        return [obj["Key"].replace(f"{self.path_prefix}", "/", 1).split(".parquet")[0] for obj in contents]

    except Exception as e:
        self.log.error(f"Failed to list subfiles: {e}")
        return []
|
|
280
|
-
|
|
281
|
-
def _generate_s3_uri(self, location: str) -> str:
    """Build the "s3://" URI of the parquet data for the given location."""
    raw_path = f"{self.workbench_bucket}/{self.path_prefix}/{location}.parquet"
    # Collapse repeated slashes before the scheme is added, so "s3://" stays intact
    collapsed = re.sub(r"/+", "/", raw_path)
    return "s3://" + collapsed
|
|
285
|
-
|
|
286
|
-
def _parse_s3_uri(self, s3_uri: str) -> tuple:
    """Split an S3 URI into its (bucket, key) pair.

    Raises:
        ValueError: If the URI scheme is not "s3".
    """
    parts = urlparse(s3_uri)
    if parts.scheme == "s3":
        # netloc is the bucket; the path minus its leading slash(es) is the key
        return parts.netloc, parts.path.lstrip("/")
    raise ValueError(f"Invalid S3 URI: {s3_uri}")
|
|
292
|
-
|
|
293
|
-
def __repr__(self):
    """Return a string representation of the AWSDFStore object."""
    # Start from the summary() table and reshape it into aligned, header-less text
    table = self.summary()

    # Nothing stored: return a fixed message instead of an empty table
    if table.empty:
        return "AWSDFStore: No data objects found in the store."

    # Left-justify locations to the longest one plus 2 characters of padding
    pad_width = table["location"].str.len().max() + 2
    table["location"] = table["location"].str.ljust(pad_width)

    # Render the size with two decimals and an "MB" suffix
    table["size (MB)"] = table["size (MB)"].apply(lambda mb: f"{mb:.2f} MB")

    # Wrap the modified date in parentheses, with a leading space as separator
    table["modified"] = table["modified"].apply(lambda stamp: f" ({stamp})")

    # Emit the rows only: no index, no header
    return table.to_string(index=False, header=False)
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
if __name__ == "__main__":
|
|
317
|
-
"""Exercise the AWSDFStore Class"""
|
|
318
|
-
import time
|
|
319
|
-
|
|
320
|
-
# Create a AWSDFStore manager
|
|
321
|
-
df_store = AWSDFStore()
|
|
322
|
-
|
|
323
|
-
# Details of the Dataframe Store
|
|
324
|
-
print("Detailed Data...")
|
|
325
|
-
print(df_store.details())
|
|
326
|
-
|
|
327
|
-
# List all objects
|
|
328
|
-
print("List Data...")
|
|
329
|
-
print(df_store.list())
|
|
330
|
-
|
|
331
|
-
# Add a new DataFrame
|
|
332
|
-
my_df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
|
|
333
|
-
df_store.upsert("/testing/test_data", my_df)
|
|
334
|
-
|
|
335
|
-
# Check the last modified date
|
|
336
|
-
print("Last Modified Date:")
|
|
337
|
-
print(df_store.last_modified("/testing/test_data"))
|
|
338
|
-
|
|
339
|
-
# Get the DataFrame
|
|
340
|
-
print(f"Getting data 'test_data':\n{df_store.get('/testing/test_data')}")
|
|
341
|
-
|
|
342
|
-
# Now let's test adding a Series
|
|
343
|
-
series = pd.Series([1, 2, 3, 4], name="Series")
|
|
344
|
-
df_store.upsert("/testing/test_series", series)
|
|
345
|
-
print(f"Getting data 'test_series':\n{df_store.get('/testing/test_series')}")
|
|
346
|
-
|
|
347
|
-
# Summary of the data
|
|
348
|
-
print("Summary Data...")
|
|
349
|
-
print(df_store.summary())
|
|
350
|
-
|
|
351
|
-
# Repr of the AWSDFStore object
|
|
352
|
-
print("AWSDFStore Object:")
|
|
353
|
-
print(df_store)
|
|
354
|
-
|
|
355
|
-
# Check if the data exists
|
|
356
|
-
print("Check if data exists...")
|
|
357
|
-
print(df_store.check("/testing/test_data"))
|
|
358
|
-
print(df_store.check("/testing/test_series"))
|
|
359
|
-
|
|
360
|
-
# Time the check
|
|
361
|
-
start_time = time.time()
|
|
362
|
-
print(df_store.check("/testing/test_data"))
|
|
363
|
-
print("--- Check %s seconds ---" % (time.time() - start_time))
|
|
364
|
-
|
|
365
|
-
# Test list_subfiles
|
|
366
|
-
print("List Subfiles:")
|
|
367
|
-
print(df_store.list_subfiles("/testing"))
|
|
368
|
-
|
|
369
|
-
# Now delete the test data
|
|
370
|
-
df_store.delete("/testing/test_data")
|
|
371
|
-
df_store.delete("/testing/test_series")
|
|
372
|
-
|
|
373
|
-
# Check if the data exists
|
|
374
|
-
print("Check if data exists...")
|
|
375
|
-
print(df_store.check("/testing/test_data"))
|
|
376
|
-
print(df_store.check("/testing/test_series"))
|
|
377
|
-
|
|
378
|
-
# Add a bunch of dataframes and then test recursive delete
|
|
379
|
-
for i in range(10):
|
|
380
|
-
df_store.upsert(f"/testing/data_{i}", pd.DataFrame({"A": [1, 2], "B": [3, 4]}))
|
|
381
|
-
print("Before Recursive Delete:")
|
|
382
|
-
print(df_store.summary())
|
|
383
|
-
df_store.delete_recursive("/testing")
|
|
384
|
-
print("After Recursive Delete:")
|
|
385
|
-
print(df_store.summary())
|
|
386
|
-
|
|
387
|
-
# Get a non-existent DataFrame
|
|
388
|
-
print("Getting non-existent data...")
|
|
389
|
-
print(df_store.get("/testing/no_where"))
|
|
390
|
-
|
|
391
|
-
# Test path_prefix
|
|
392
|
-
df_store = AWSDFStore(path_prefix="/super/test")
|
|
393
|
-
print(df_store.path_prefix)
|
|
394
|
-
df_store.upsert("test_data", my_df)
|
|
395
|
-
print(df_store.get("test_data"))
|
|
396
|
-
print(df_store.summary())
|
|
397
|
-
df_store.delete("test_data")
|
|
398
|
-
print(df_store.summary())
|
|
399
|
-
|
|
400
|
-
# Test columns with Spaces in them
|
|
401
|
-
my_df = pd.DataFrame({"My A": [1, 2], "My B": [3, 4]})
|
|
402
|
-
df_store.upsert("/testing/test_data", my_df)
|
|
403
|
-
my_df = df_store.get("/testing/test_data")
|
|
404
|
-
print(my_df)
|