mlrun 1.6.4rc2__py3-none-any.whl → 1.7.0rc20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic.
- mlrun/__init__.py +11 -1
- mlrun/__main__.py +26 -112
- mlrun/alerts/__init__.py +15 -0
- mlrun/alerts/alert.py +144 -0
- mlrun/api/schemas/__init__.py +5 -4
- mlrun/artifacts/__init__.py +8 -3
- mlrun/artifacts/base.py +46 -257
- mlrun/artifacts/dataset.py +11 -192
- mlrun/artifacts/manager.py +47 -48
- mlrun/artifacts/model.py +31 -159
- mlrun/artifacts/plots.py +23 -380
- mlrun/common/constants.py +69 -0
- mlrun/common/db/sql_session.py +2 -3
- mlrun/common/formatters/__init__.py +19 -0
- mlrun/common/formatters/artifact.py +21 -0
- mlrun/common/formatters/base.py +78 -0
- mlrun/common/formatters/function.py +41 -0
- mlrun/common/formatters/pipeline.py +53 -0
- mlrun/common/formatters/project.py +51 -0
- mlrun/common/helpers.py +1 -2
- mlrun/common/model_monitoring/helpers.py +9 -5
- mlrun/{runtimes → common/runtimes}/constants.py +37 -9
- mlrun/common/schemas/__init__.py +24 -4
- mlrun/common/schemas/alert.py +203 -0
- mlrun/common/schemas/api_gateway.py +148 -0
- mlrun/common/schemas/artifact.py +18 -8
- mlrun/common/schemas/auth.py +11 -5
- mlrun/common/schemas/background_task.py +1 -1
- mlrun/common/schemas/client_spec.py +4 -1
- mlrun/common/schemas/feature_store.py +16 -16
- mlrun/common/schemas/frontend_spec.py +8 -7
- mlrun/common/schemas/function.py +5 -1
- mlrun/common/schemas/hub.py +11 -18
- mlrun/common/schemas/memory_reports.py +2 -2
- mlrun/common/schemas/model_monitoring/__init__.py +18 -3
- mlrun/common/schemas/model_monitoring/constants.py +83 -26
- mlrun/common/schemas/model_monitoring/grafana.py +13 -9
- mlrun/common/schemas/model_monitoring/model_endpoints.py +99 -16
- mlrun/common/schemas/notification.py +4 -4
- mlrun/common/schemas/object.py +2 -2
- mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
- mlrun/common/schemas/pipeline.py +1 -10
- mlrun/common/schemas/project.py +24 -23
- mlrun/common/schemas/runtime_resource.py +8 -12
- mlrun/common/schemas/schedule.py +3 -3
- mlrun/common/schemas/tag.py +1 -2
- mlrun/common/schemas/workflow.py +2 -2
- mlrun/common/types.py +7 -1
- mlrun/config.py +54 -17
- mlrun/data_types/to_pandas.py +10 -12
- mlrun/datastore/__init__.py +5 -8
- mlrun/datastore/alibaba_oss.py +130 -0
- mlrun/datastore/azure_blob.py +17 -5
- mlrun/datastore/base.py +62 -39
- mlrun/datastore/datastore.py +28 -9
- mlrun/datastore/datastore_profile.py +146 -20
- mlrun/datastore/filestore.py +0 -1
- mlrun/datastore/google_cloud_storage.py +6 -2
- mlrun/datastore/hdfs.py +56 -0
- mlrun/datastore/inmem.py +2 -2
- mlrun/datastore/redis.py +6 -2
- mlrun/datastore/s3.py +9 -0
- mlrun/datastore/snowflake_utils.py +43 -0
- mlrun/datastore/sources.py +201 -96
- mlrun/datastore/spark_utils.py +1 -2
- mlrun/datastore/store_resources.py +7 -7
- mlrun/datastore/targets.py +358 -104
- mlrun/datastore/utils.py +72 -58
- mlrun/datastore/v3io.py +5 -1
- mlrun/db/base.py +185 -35
- mlrun/db/factory.py +1 -1
- mlrun/db/httpdb.py +614 -179
- mlrun/db/nopdb.py +210 -26
- mlrun/errors.py +12 -1
- mlrun/execution.py +41 -24
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +40 -72
- mlrun/feature_store/common.py +1 -1
- mlrun/feature_store/feature_set.py +76 -55
- mlrun/feature_store/feature_vector.py +28 -30
- mlrun/feature_store/ingestion.py +7 -6
- mlrun/feature_store/retrieval/base.py +16 -11
- mlrun/feature_store/retrieval/conversion.py +11 -13
- mlrun/feature_store/retrieval/dask_merger.py +2 -0
- mlrun/feature_store/retrieval/job.py +9 -3
- mlrun/feature_store/retrieval/local_merger.py +2 -0
- mlrun/feature_store/retrieval/spark_merger.py +34 -24
- mlrun/feature_store/steps.py +37 -34
- mlrun/features.py +9 -20
- mlrun/frameworks/_common/artifacts_library.py +9 -9
- mlrun/frameworks/_common/mlrun_interface.py +5 -5
- mlrun/frameworks/_common/model_handler.py +48 -48
- mlrun/frameworks/_common/plan.py +2 -3
- mlrun/frameworks/_common/producer.py +3 -4
- mlrun/frameworks/_common/utils.py +5 -5
- mlrun/frameworks/_dl_common/loggers/logger.py +6 -7
- mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +9 -9
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +23 -47
- mlrun/frameworks/_ml_common/artifacts_library.py +1 -2
- mlrun/frameworks/_ml_common/loggers/logger.py +3 -4
- mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +4 -5
- mlrun/frameworks/_ml_common/model_handler.py +24 -24
- mlrun/frameworks/_ml_common/pkl_model_server.py +2 -2
- mlrun/frameworks/_ml_common/plan.py +1 -1
- mlrun/frameworks/_ml_common/plans/calibration_curve_plan.py +2 -3
- mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +2 -3
- mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
- mlrun/frameworks/_ml_common/plans/feature_importance_plan.py +3 -3
- mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
- mlrun/frameworks/_ml_common/utils.py +4 -4
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +9 -9
- mlrun/frameworks/huggingface/model_server.py +4 -4
- mlrun/frameworks/lgbm/__init__.py +33 -33
- mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
- mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -5
- mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -5
- mlrun/frameworks/lgbm/mlrun_interfaces/booster_mlrun_interface.py +1 -3
- mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +6 -6
- mlrun/frameworks/lgbm/model_handler.py +10 -10
- mlrun/frameworks/lgbm/model_server.py +6 -6
- mlrun/frameworks/lgbm/utils.py +5 -5
- mlrun/frameworks/onnx/dataset.py +8 -8
- mlrun/frameworks/onnx/mlrun_interface.py +3 -3
- mlrun/frameworks/onnx/model_handler.py +6 -6
- mlrun/frameworks/onnx/model_server.py +7 -7
- mlrun/frameworks/parallel_coordinates.py +4 -3
- mlrun/frameworks/pytorch/__init__.py +18 -18
- mlrun/frameworks/pytorch/callbacks/callback.py +4 -5
- mlrun/frameworks/pytorch/callbacks/logging_callback.py +17 -17
- mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +11 -11
- mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +23 -29
- mlrun/frameworks/pytorch/callbacks_handler.py +38 -38
- mlrun/frameworks/pytorch/mlrun_interface.py +20 -20
- mlrun/frameworks/pytorch/model_handler.py +17 -17
- mlrun/frameworks/pytorch/model_server.py +7 -7
- mlrun/frameworks/sklearn/__init__.py +13 -13
- mlrun/frameworks/sklearn/estimator.py +4 -4
- mlrun/frameworks/sklearn/metrics_library.py +14 -14
- mlrun/frameworks/sklearn/mlrun_interface.py +3 -6
- mlrun/frameworks/sklearn/model_handler.py +2 -2
- mlrun/frameworks/tf_keras/__init__.py +10 -7
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +15 -15
- mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +11 -11
- mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +19 -23
- mlrun/frameworks/tf_keras/mlrun_interface.py +9 -11
- mlrun/frameworks/tf_keras/model_handler.py +14 -14
- mlrun/frameworks/tf_keras/model_server.py +6 -6
- mlrun/frameworks/xgboost/__init__.py +13 -13
- mlrun/frameworks/xgboost/model_handler.py +6 -6
- mlrun/k8s_utils.py +14 -16
- mlrun/launcher/__init__.py +1 -1
- mlrun/launcher/base.py +16 -15
- mlrun/launcher/client.py +8 -6
- mlrun/launcher/factory.py +1 -1
- mlrun/launcher/local.py +17 -11
- mlrun/launcher/remote.py +16 -10
- mlrun/lists.py +7 -6
- mlrun/model.py +238 -73
- mlrun/model_monitoring/__init__.py +1 -1
- mlrun/model_monitoring/api.py +138 -315
- mlrun/model_monitoring/application.py +5 -296
- mlrun/model_monitoring/applications/__init__.py +24 -0
- mlrun/model_monitoring/applications/_application_steps.py +157 -0
- mlrun/model_monitoring/applications/base.py +282 -0
- mlrun/model_monitoring/applications/context.py +214 -0
- mlrun/model_monitoring/applications/evidently_base.py +211 -0
- mlrun/model_monitoring/applications/histogram_data_drift.py +349 -0
- mlrun/model_monitoring/applications/results.py +99 -0
- mlrun/model_monitoring/controller.py +104 -84
- mlrun/model_monitoring/controller_handler.py +13 -5
- mlrun/model_monitoring/db/__init__.py +18 -0
- mlrun/model_monitoring/{stores → db/stores}/__init__.py +43 -36
- mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
- mlrun/model_monitoring/{stores/model_endpoint_store.py → db/stores/base/store.py} +64 -40
- mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
- mlrun/model_monitoring/{stores → db/stores/sqldb}/models/base.py +109 -5
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +88 -0
- mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +684 -0
- mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
- mlrun/model_monitoring/{stores/kv_model_endpoint_store.py → db/stores/v3io_kv/kv_store.py} +310 -165
- mlrun/model_monitoring/db/tsdb/__init__.py +100 -0
- mlrun/model_monitoring/db/tsdb/base.py +329 -0
- mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
- mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +240 -0
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +45 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +397 -0
- mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +117 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +630 -0
- mlrun/model_monitoring/evidently_application.py +6 -118
- mlrun/model_monitoring/features_drift_table.py +134 -106
- mlrun/model_monitoring/helpers.py +127 -28
- mlrun/model_monitoring/metrics/__init__.py +13 -0
- mlrun/model_monitoring/metrics/histogram_distance.py +127 -0
- mlrun/model_monitoring/model_endpoint.py +3 -2
- mlrun/model_monitoring/prometheus.py +1 -4
- mlrun/model_monitoring/stream_processing.py +62 -231
- mlrun/model_monitoring/tracking_policy.py +9 -2
- mlrun/model_monitoring/writer.py +152 -124
- mlrun/package/__init__.py +6 -6
- mlrun/package/context_handler.py +5 -5
- mlrun/package/packager.py +7 -7
- mlrun/package/packagers/default_packager.py +6 -6
- mlrun/package/packagers/numpy_packagers.py +15 -15
- mlrun/package/packagers/pandas_packagers.py +5 -5
- mlrun/package/packagers/python_standard_library_packagers.py +10 -10
- mlrun/package/packagers_manager.py +19 -23
- mlrun/package/utils/_formatter.py +6 -6
- mlrun/package/utils/_pickler.py +2 -2
- mlrun/package/utils/_supported_format.py +4 -4
- mlrun/package/utils/log_hint_utils.py +2 -2
- mlrun/package/utils/type_hint_utils.py +4 -9
- mlrun/platforms/__init__.py +11 -10
- mlrun/platforms/iguazio.py +24 -203
- mlrun/projects/operations.py +35 -21
- mlrun/projects/pipelines.py +68 -99
- mlrun/projects/project.py +830 -266
- mlrun/render.py +3 -11
- mlrun/run.py +162 -166
- mlrun/runtimes/__init__.py +62 -7
- mlrun/runtimes/base.py +39 -32
- mlrun/runtimes/daskjob.py +8 -8
- mlrun/runtimes/databricks_job/databricks_cancel_task.py +1 -1
- mlrun/runtimes/databricks_job/databricks_runtime.py +7 -7
- mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
- mlrun/runtimes/funcdoc.py +0 -28
- mlrun/runtimes/function_reference.py +1 -1
- mlrun/runtimes/kubejob.py +28 -122
- mlrun/runtimes/local.py +6 -3
- mlrun/runtimes/mpijob/__init__.py +0 -20
- mlrun/runtimes/mpijob/abstract.py +9 -10
- mlrun/runtimes/mpijob/v1.py +1 -1
- mlrun/{model_monitoring/stores/models/sqlite.py → runtimes/nuclio/__init__.py} +7 -9
- mlrun/runtimes/nuclio/api_gateway.py +709 -0
- mlrun/runtimes/nuclio/application/__init__.py +15 -0
- mlrun/runtimes/nuclio/application/application.py +523 -0
- mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
- mlrun/runtimes/{function.py → nuclio/function.py} +112 -73
- mlrun/runtimes/{nuclio.py → nuclio/nuclio.py} +6 -6
- mlrun/runtimes/{serving.py → nuclio/serving.py} +45 -51
- mlrun/runtimes/pod.py +286 -88
- mlrun/runtimes/remotesparkjob.py +2 -2
- mlrun/runtimes/sparkjob/spark3job.py +51 -34
- mlrun/runtimes/utils.py +7 -75
- mlrun/secrets.py +9 -5
- mlrun/serving/remote.py +2 -7
- mlrun/serving/routers.py +13 -10
- mlrun/serving/server.py +22 -26
- mlrun/serving/states.py +99 -25
- mlrun/serving/utils.py +3 -3
- mlrun/serving/v1_serving.py +6 -7
- mlrun/serving/v2_serving.py +59 -20
- mlrun/track/tracker.py +2 -1
- mlrun/track/tracker_manager.py +3 -3
- mlrun/track/trackers/mlflow_tracker.py +1 -2
- mlrun/utils/async_http.py +5 -7
- mlrun/utils/azure_vault.py +1 -1
- mlrun/utils/clones.py +1 -2
- mlrun/utils/condition_evaluator.py +3 -3
- mlrun/utils/db.py +3 -3
- mlrun/utils/helpers.py +183 -197
- mlrun/utils/http.py +2 -5
- mlrun/utils/logger.py +76 -14
- mlrun/utils/notifications/notification/__init__.py +17 -12
- mlrun/utils/notifications/notification/base.py +14 -2
- mlrun/utils/notifications/notification/console.py +2 -0
- mlrun/utils/notifications/notification/git.py +3 -1
- mlrun/utils/notifications/notification/ipython.py +3 -1
- mlrun/utils/notifications/notification/slack.py +101 -21
- mlrun/utils/notifications/notification/webhook.py +11 -1
- mlrun/utils/notifications/notification_pusher.py +155 -30
- mlrun/utils/retryer.py +208 -0
- mlrun/utils/singleton.py +1 -1
- mlrun/utils/v3io_clients.py +2 -4
- mlrun/utils/version/version.json +2 -2
- mlrun/utils/version/version.py +2 -6
- {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/METADATA +31 -19
- mlrun-1.7.0rc20.dist-info/RECORD +353 -0
- mlrun/kfpops.py +0 -868
- mlrun/model_monitoring/batch.py +0 -1095
- mlrun/model_monitoring/stores/models/__init__.py +0 -27
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -384
- mlrun/platforms/other.py +0 -306
- mlrun-1.6.4rc2.dist-info/RECORD +0 -314
- {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/LICENSE +0 -0
- {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/WHEEL +0 -0
- {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/entry_points.txt +0 -0
- {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/top_level.txt +0 -0
mlrun/datastore/base.py
CHANGED
@@ -144,6 +144,10 @@ class DataStore:
     def url(self):
         return f"{self.kind}://{self.endpoint}"

+    @property
+    def spark_url(self):
+        return self.url
+
     def get(self, key, size=None, offset=0):
         pass

@@ -175,11 +179,23 @@ class DataStore:
         return {}

     @staticmethod
-    def _parquet_reader(
+    def _parquet_reader(
+        df_module,
+        url,
+        file_system,
+        time_column,
+        start_time,
+        end_time,
+        additional_filters,
+    ):
         from storey.utils import find_filters, find_partitions

         def set_filters(
-            partitions_time_attributes,
+            partitions_time_attributes,
+            start_time_inner,
+            end_time_inner,
+            filters_inner,
+            kwargs,
         ):
             filters = []
             find_filters(
@@ -189,20 +205,23 @@ class DataStore:
                 filters,
                 time_column,
             )
+            if filters and filters_inner:
+                filters[0] += filters_inner
+
             kwargs["filters"] = filters

         def reader(*args, **kwargs):
-            if start_time or end_time:
-
-
-
-
-
+            if time_column is None and (start_time or end_time):
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "When providing start_time or end_time, must provide time_column"
+                )
+            if start_time or end_time or additional_filters:
                 partitions_time_attributes = find_partitions(url, file_system)
                 set_filters(
                     partitions_time_attributes,
                     start_time,
                     end_time,
+                    additional_filters,
                     kwargs,
                 )
                 try:
@@ -213,6 +232,7 @@ class DataStore:
                 ):
                     raise ex

+                # TODO: fix timezone issue (ML-6308)
                 if start_time.tzinfo:
                     start_time_inner = start_time.replace(tzinfo=None)
                     end_time_inner = end_time.replace(tzinfo=None)
@@ -224,6 +244,7 @@ class DataStore:
                     partitions_time_attributes,
                     start_time_inner,
                     end_time_inner,
+                    additional_filters,
                     kwargs,
                 )
             return df_module.read_parquet(*args, **kwargs)
@@ -242,6 +263,7 @@ class DataStore:
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
         df_module = df_module or pd
@@ -306,7 +328,13 @@ class DataStore:
             kwargs["columns"] = columns

             reader = self._parquet_reader(
-                df_module,
+                df_module,
+                url,
+                file_system,
+                time_column,
+                start_time,
+                end_time,
+                additional_filters,
             )

         elif file_url.endswith(".json") or format == "json":
@@ -317,31 +345,17 @@ class DataStore:
             raise Exception(f"File type unhandled {url}")

         if file_system:
-
-
-
-
-
-
-
-
-
-
-
-            # Pass the underlying file system
-            kwargs["filesystem"] = file_system
-        elif storage_options:
-            kwargs["storage_options"] = storage_options
-            df = reader(url, **kwargs)
-        else:
-            file = url
-            # Workaround for ARROW-12472 affecting pyarrow 3.x and 4.x.
-            if file_system.protocol != "file":
-                # If not dir, use file_system.open() to avoid regression when pandas < 1.2 and does not
-                # support the storage_options parameter.
-                file = file_system.open(url)
-
-            df = reader(file, **kwargs)
+            storage_options = self.get_storage_options()
+            if url.startswith("ds://"):
+                parsed_url = urllib.parse.urlparse(url)
+                url = parsed_url.path
+                if self.using_bucket:
+                    url = url[1:]
+                # Pass the underlying file system
+                kwargs["filesystem"] = file_system
+            elif storage_options:
+                kwargs["storage_options"] = storage_options
+            df = reader(url, **kwargs)
         else:
             temp_file = tempfile.NamedTemporaryFile(delete=False)
             self.download(self._join(subpath), temp_file.name)
@@ -399,14 +413,15 @@ class DataItem:


         # reading run results using DataItem (run.artifact())
-        train_run = train_iris_func.run(
-
+        train_run = train_iris_func.run(
+            inputs={"dataset": dataset}, params={"label_column": "label"}
+        )

-        train_run.artifact(
-        test_set = train_run.artifact(
+        train_run.artifact("confusion-matrix").show()
+        test_set = train_run.artifact("test_set").as_df()

         # create and use DataItem from uri
-        data = mlrun.get_dataitem(
+        data = mlrun.get_dataitem("http://xyz/data.json").get()
     """

     def __init__(
@@ -548,6 +563,7 @@ class DataItem:
         time_column=None,
         start_time=None,
         end_time=None,
+        additional_filters=None,
         **kwargs,
     ):
         """return a dataframe object (generated from the dataitem).
@@ -559,6 +575,12 @@ class DataItem:
         :param end_time:      filters out data after this time
         :param time_column:   Store timestamp_key will be used if None.
                               The results will be filtered by this column and start_time & end_time.
+        :param additional_filters: List of additional_filter conditions as tuples.
+                              Each tuple should be in the format (column_name, operator, value).
+                              Supported operators: "=", ">=", "<=", ">", "<".
+                              Example: [("Product", "=", "Computer")]
+                              For all supported filters, please see:
+                              https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
         """
         df = self._store.as_df(
             self._url,
@@ -569,6 +591,7 @@ class DataItem:
             time_column=time_column,
             start_time=start_time,
             end_time=end_time,
+            additional_filters=additional_filters,
             **kwargs,
         )
         return df
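The new `additional_filters` argument is threaded from `DataItem.as_df()` down into the parquet reader. A minimal usage sketch based on the docstring above; the path and column name are illustrative only:

import mlrun

item = mlrun.get_dataitem("s3://my-bucket/sales.parquet")  # hypothetical dataset
# Each filter is a (column_name, operator, value) tuple; supported operators: "=", ">=", "<=", ">", "<"
df = item.as_df(additional_filters=[("Product", "=", "Computer")])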
mlrun/datastore/datastore.py
CHANGED
@@ -94,6 +94,14 @@ def schema_to_store(schema):
        from .dbfs_store import DBFSStore

        return DBFSStore
+    elif schema == "hdfs":
+        from .hdfs import HdfsStore
+
+        return HdfsStore
+    elif schema == "oss":
+        from .alibaba_oss import OSSStore
+
+        return OSSStore
    else:
        raise ValueError(f"unsupported store scheme ({schema})")

@@ -170,7 +178,7 @@ class StoreManager:
            raise mlrun.errors.MLRunInvalidArgumentError(
                f"resource {url} does not have a valid/persistent offline target"
            )
-        return resource, target
+        return resource, target or ""

    def object(
        self, url, key="", project="", allow_empty_resources=None, secrets: dict = None
@@ -182,14 +190,21 @@ class StoreManager:
            url, project, allow_empty_resources, secrets
        )

-        store, subpath = self.get_or_create_store(
+        store, subpath, url = self.get_or_create_store(
            url, secrets=secrets, project_name=project
        )
-        return DataItem(
+        return DataItem(
+            key,
+            store,
+            subpath,
+            url,
+            meta=meta,
+            artifact_url=artifact_url,
+        )

    def get_or_create_store(
        self, url, secrets: dict = None, project_name=""
-    ) -> (DataStore, str):
+    ) -> (DataStore, str, str):
        schema, endpoint, parsed_url = parse_url(url)
        subpath = parsed_url.path
        store_key = f"{schema}://{endpoint}"
@@ -206,17 +221,22 @@ class StoreManager:

        if schema == "memory":
            subpath = url[len("memory://") :]
-            return in_memory_store, subpath
+            return in_memory_store, subpath, url
+
+        elif schema in get_local_file_schema():
+            # parse_url() will drop the windows drive-letter from the path for url like "c:\a\b".
+            # As a workaround, we set subpath to the url.
+            subpath = url.replace("file://", "", 1)

        if not schema and endpoint:
            if endpoint in self._stores.keys():
-                return self._stores[endpoint], subpath
+                return self._stores[endpoint], subpath, url
            else:
                raise ValueError(f"no such store ({endpoint})")

        if not secrets and not mlrun.config.is_running_as_api():
            if store_key in self._stores.keys():
-                return self._stores[store_key], subpath
+                return self._stores[store_key], subpath, url

        # support u/p embedding in url (as done in redis) by setting netloc as the "endpoint" parameter
        # when running on server we don't cache the datastore, because there are multiple users and we don't want to
@@ -226,8 +246,7 @@ class StoreManager:
        )
        if not secrets and not mlrun.config.is_running_as_api():
            self._stores[store_key] = store
-
-        return store, url if store.kind == "file" else subpath
+        return store, subpath, url

    def reset_secrets(self):
        self._secrets = {}
mlrun/datastore/datastore_profile.py
CHANGED

@@ -16,6 +16,7 @@ import ast
 import base64
 import json
 import typing
+import warnings
 from urllib.parse import ParseResult, urlparse, urlunparse

 import pydantic
@@ -30,12 +31,13 @@ from ..secrets import get_secret_or_env
 class DatastoreProfile(pydantic.BaseModel):
     type: str
     name: str
-    _private_attributes:
+    _private_attributes: list = ()

     class Config:
         extra = pydantic.Extra.forbid

     @pydantic.validator("name")
+    @classmethod
     def lower_case(cls, v):
         return v.lower()

@@ -68,6 +70,9 @@ class TemporaryClientDatastoreProfiles(metaclass=mlrun.utils.singleton.Singleton
     def get(self, key):
         return self._data.get(key, None)

+    def remove(self, key):
+        self._data.pop(key, None)
+

 class DatastoreProfileBasic(DatastoreProfile):
     type: str = pydantic.Field("basic")
@@ -79,13 +84,37 @@ class DatastoreProfileBasic(DatastoreProfile):
 class DatastoreProfileKafkaTarget(DatastoreProfile):
     type: str = pydantic.Field("kafka_target")
     _private_attributes = "kwargs_private"
-    bootstrap_servers: str
+    bootstrap_servers: typing.Optional[str] = None
+    brokers: typing.Optional[str] = None
     topic: str
-    kwargs_public: typing.Optional[
-    kwargs_private: typing.Optional[
+    kwargs_public: typing.Optional[dict]
+    kwargs_private: typing.Optional[dict]
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        if not self.brokers and not self.bootstrap_servers:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "DatastoreProfileKafkaTarget requires the 'brokers' field to be set"
+            )
+
+        if self.bootstrap_servers:
+            if self.brokers:
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "DatastoreProfileKafkaTarget cannot be created with both 'brokers' and 'bootstrap_servers'"
+                )
+            else:
+                self.brokers = self.bootstrap_servers
+                self.bootstrap_servers = None
+                warnings.warn(
+                    "'bootstrap_servers' parameter is deprecated in 1.7.0 and will be removed in 1.9.0, "
+                    "use 'brokers' instead.",
+                    # TODO: Remove this in 1.9.0
+                    FutureWarning,
+                )

     def attributes(self):
-        attributes = {"
+        attributes = {"brokers": self.brokers or self.bootstrap_servers}
         if self.kwargs_public:
             attributes = merge(attributes, self.kwargs_public)
         if self.kwargs_private:
@@ -96,15 +125,15 @@ class DatastoreProfileKafkaTarget(DatastoreProfile):
 class DatastoreProfileKafkaSource(DatastoreProfile):
     type: str = pydantic.Field("kafka_source")
     _private_attributes = ("kwargs_private", "sasl_user", "sasl_pass")
-    brokers: typing.Union[str,
-    topics: typing.Union[str,
+    brokers: typing.Union[str, list[str]]
+    topics: typing.Union[str, list[str]]
     group: typing.Optional[str] = "serving"
     initial_offset: typing.Optional[str] = "earliest"
-    partitions: typing.Optional[typing.Union[str,
+    partitions: typing.Optional[typing.Union[str, list[str]]]
     sasl_user: typing.Optional[str]
     sasl_pass: typing.Optional[str]
-    kwargs_public: typing.Optional[
-    kwargs_private: typing.Optional[
+    kwargs_public: typing.Optional[dict]
+    kwargs_private: typing.Optional[dict]

     def attributes(self):
         attributes = {}
@@ -132,6 +161,22 @@ class DatastoreProfileKafkaSource(DatastoreProfile):
         return attributes


+class DatastoreProfileV3io(DatastoreProfile):
+    type: str = pydantic.Field("v3io")
+    v3io_access_key: typing.Optional[str] = None
+    _private_attributes = "v3io_access_key"
+
+    def url(self, subpath):
+        subpath = subpath.lstrip("/")
+        return f"v3io:///{subpath}"
+
+    def secrets(self) -> dict:
+        res = {}
+        if self.v3io_access_key:
+            res["V3IO_ACCESS_KEY"] = self.v3io_access_key
+        return res
+
+
 class DatastoreProfileS3(DatastoreProfile):
     type: str = pydantic.Field("s3")
     _private_attributes = ("access_key_id", "secret_key")
@@ -141,6 +186,18 @@ class DatastoreProfileS3(DatastoreProfile):
     assume_role_arn: typing.Optional[str] = None
     access_key_id: typing.Optional[str] = None
     secret_key: typing.Optional[str] = None
+    bucket: typing.Optional[str] = None
+
+    @pydantic.validator("bucket")
+    @classmethod
+    def check_bucket(cls, v):
+        if not v:
+            warnings.warn(
+                "The 'bucket' attribute will be mandatory starting from version 1.9",
+                FutureWarning,
+                stacklevel=2,
+            )
+        return v

     def secrets(self) -> dict:
         res = {}
@@ -156,10 +213,16 @@ class DatastoreProfileS3(DatastoreProfile):
             res["AWS_PROFILE"] = self.profile_name
         if self.assume_role_arn:
             res["MLRUN_AWS_ROLE_ARN"] = self.assume_role_arn
-            return res
+        return res

     def url(self, subpath):
-
+        # TODO: There is an inconsistency with DatastoreProfileGCS. In DatastoreProfileGCS,
+        # we assume that the subpath can begin without a '/' character,
+        # while here we assume it always starts with one.
+        if self.bucket:
+            return f"s3://{self.bucket}{subpath}"
+        else:
+            return f"s3:/{subpath}"


 class DatastoreProfileRedis(DatastoreProfile):
@@ -199,7 +262,7 @@ class DatastoreProfileRedis(DatastoreProfile):
             res["REDIS_USER"] = self.username
         if self.password:
             res["REDIS_PASSWORD"] = self.password
-            return res
+        return res

     def url(self, subpath):
         return self.endpoint_url + subpath
@@ -220,26 +283,44 @@ class DatastoreProfileDBFS(DatastoreProfile):
             res["DATABRICKS_TOKEN"] = self.token
         if self.endpoint_url:
             res["DATABRICKS_HOST"] = self.endpoint_url
-            return res
+        return res


 class DatastoreProfileGCS(DatastoreProfile):
     type: str = pydantic.Field("gcs")
     _private_attributes = ("gcp_credentials",)
     credentials_path: typing.Optional[str] = None  # path to file.
-    gcp_credentials: typing.Optional[typing.Union[str,
+    gcp_credentials: typing.Optional[typing.Union[str, dict]] = None
+    bucket: typing.Optional[str] = None
+
+    @pydantic.validator("bucket")
+    @classmethod
+    def check_bucket(cls, v):
+        if not v:
+            warnings.warn(
+                "The 'bucket' attribute will be mandatory starting from version 1.9",
+                FutureWarning,
+                stacklevel=2,
+            )
+        return v

     @pydantic.validator("gcp_credentials", pre=True, always=True)
+    @classmethod
     def convert_dict_to_json(cls, v):
         if isinstance(v, dict):
             return json.dumps(v)
         return v

     def url(self, subpath) -> str:
+        # TODO: but there's something wrong with the subpath being assumed to not start with a slash here,
+        # but the opposite assumption is made in S3.
         if subpath.startswith("/"):
             # in gcs the path after schema is starts with bucket, wherefore it should not start with "/".
             subpath = subpath[1:]
-
+        if self.bucket:
+            return f"gcs://{self.bucket}/{subpath}"
+        else:
+            return f"gcs://{subpath}"

     def secrets(self) -> dict:
         res = {}
@@ -247,7 +328,7 @@ class DatastoreProfileGCS(DatastoreProfile):
             res["GOOGLE_APPLICATION_CREDENTIALS"] = self.credentials_path
         if self.gcp_credentials:
             res["GCP_CREDENTIALS"] = self.gcp_credentials
-            return res
+        return res


 class DatastoreProfileAzureBlob(DatastoreProfile):
@@ -267,12 +348,27 @@ class DatastoreProfileAzureBlob(DatastoreProfile):
     client_secret: typing.Optional[str] = None
     sas_token: typing.Optional[str] = None
     credential: typing.Optional[str] = None
+    container: typing.Optional[str] = None
+
+    @pydantic.validator("container")
+    @classmethod
+    def check_container(cls, v):
+        if not v:
+            warnings.warn(
+                "The 'container' attribute will be mandatory starting from version 1.9",
+                FutureWarning,
+                stacklevel=2,
+            )
+        return v

     def url(self, subpath) -> str:
         if subpath.startswith("/"):
-            # in azure the path after schema is starts with
+            # in azure the path after schema is starts with container, wherefore it should not start with "/".
             subpath = subpath[1:]
-
+        if self.container:
+            return f"az://{self.container}/{subpath}"
+        else:
+            return f"az://{subpath}"

     def secrets(self) -> dict:
         res = {}
@@ -292,7 +388,31 @@ class DatastoreProfileAzureBlob(DatastoreProfile):
             res["sas_token"] = self.sas_token
         if self.credential:
             res["credential"] = self.credential
-            return res
+        return res
+
+
+class DatastoreProfileHdfs(DatastoreProfile):
+    type: str = pydantic.Field("hdfs")
+    _private_attributes = "token"
+    host: typing.Optional[str] = None
+    port: typing.Optional[int] = None
+    http_port: typing.Optional[int] = None
+    user: typing.Optional[str] = None
+
+    def secrets(self) -> dict:
+        res = {}
+        if self.host:
+            res["HDFS_HOST"] = self.host
+        if self.port:
+            res["HDFS_PORT"] = self.port
+        if self.port:
+            res["HDFS_HTTP_PORT"] = self.http_port
+        if self.user:
+            res["HDFS_USER"] = self.user
+        return res or None
+
+    def url(self, subpath):
+        return f"hdfs://{self.host}:{self.http_port}{subpath}"


 class DatastoreProfile2Json(pydantic.BaseModel):
@@ -346,6 +466,7 @@ class DatastoreProfile2Json(pydantic.BaseModel):
         decoded_dict = {k: safe_literal_eval(v) for k, v in decoded_dict.items()}
         datastore_type = decoded_dict.get("type")
         ds_profile_factory = {
+            "v3io": DatastoreProfileV3io,
             "s3": DatastoreProfileS3,
             "redis": DatastoreProfileRedis,
             "basic": DatastoreProfileBasic,
@@ -354,6 +475,7 @@ class DatastoreProfile2Json(pydantic.BaseModel):
             "dbfs": DatastoreProfileDBFS,
             "gcs": DatastoreProfileGCS,
             "az": DatastoreProfileAzureBlob,
+            "hdfs": DatastoreProfileHdfs,
         }
         if datastore_type in ds_profile_factory:
             return ds_profile_factory[datastore_type].parse_obj(decoded_dict)
@@ -418,3 +540,7 @@ def register_temporary_client_datastore_profile(profile: DatastoreProfile):
     It's beneficial for testing purposes.
     """
     TemporaryClientDatastoreProfiles().add(profile)
+
+
+def remove_temporary_client_datastore_profile(profile_name: str):
+    TemporaryClientDatastoreProfiles().remove(profile_name)
mlrun/datastore/filestore.py
CHANGED
mlrun/datastore/google_cloud_storage.py
CHANGED

@@ -132,13 +132,13 @@ class GoogleCloudStorageStore(DataStore):
         self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)

     def get_spark_options(self):
-        res =
+        res = {}
         st = self.get_storage_options()
         if "token" in st:
             res = {"spark.hadoop.google.cloud.auth.service.account.enable": "true"}
             if isinstance(st["token"], str):
                 # Token is a filename, read json from it
-                with open(st["token"]
+                with open(st["token"]) as file:
                     credentials = json.load(file)
             else:
                 # Token is a dictionary, use it directly
@@ -161,3 +161,7 @@ class GoogleCloudStorageStore(DataStore):
         if "client_id" in credentials:
             res["spark.hadoop.fs.gs.client.id"] = credentials["client_id"]
         return res
+
+    @property
+    def spark_url(self):
+        return f"gs://{self.endpoint}"
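The new `spark_url` property gives Spark-oriented code the URL form its connectors expect (the base DataStore returns `self.url`, while the GCS store rewrites the scheme). A hedged sketch of the contract; `._store` is an internal attribute used here only for illustration:

import mlrun

item = mlrun.get_dataitem("gcs://my-bucket/path/data.parquet")  # hypothetical object
print(item._store.spark_url)  # e.g. "gs://my-bucket", the scheme Spark's GCS connector expects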
mlrun/datastore/hdfs.py
ADDED
@@ -0,0 +1,56 @@
+# Copyright 2024 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from urllib.parse import urlparse
+
+import fsspec
+
+from mlrun.datastore.base import DataStore
+
+
+class HdfsStore(DataStore):
+    def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
+        super().__init__(parent, name, schema, endpoint, secrets)
+
+        self.host = self._get_secret_or_env("HDFS_HOST")
+        self.port = self._get_secret_or_env("HDFS_PORT")
+        self.http_port = self._get_secret_or_env("HDFS_HTTP_PORT")
+        self.user = self._get_secret_or_env("HDFS_USER")
+        if not self.user:
+            self.user = os.environ.get("HADOOP_USER_NAME", os.environ.get("USER"))
+
+        self._filesystem = None
+
+    @property
+    def filesystem(self):
+        if not self._filesystem:
+            self._filesystem = fsspec.filesystem(
+                "webhdfs",
+                host=self.host,
+                port=self.http_port,
+                user=self.user,
+            )
+        return self._filesystem
+
+    @property
+    def url(self):
+        return f"webhdfs://{self.host}:{self.http_port}"
+
+    @property
+    def spark_url(self):
+        return f"hdfs://{self.host}:{self.port}"
+
+    def rm(self, url, recursive=False, maxdepth=None):
+        path = urlparse(url).path
+        self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
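A sketch of how the new HDFS store might be used from a client, assuming example host/port values and that the environment (or project secrets) carries the keys HdfsStore reads via `_get_secret_or_env()`:

import os
import mlrun

# Assumed example values; names match the keys read by HdfsStore above
os.environ["HDFS_HOST"] = "namenode.example.com"
os.environ["HDFS_PORT"] = "8020"       # feeds spark_url (hdfs://host:port)
os.environ["HDFS_HTTP_PORT"] = "9870"  # feeds the WebHDFS fsspec filesystem and url
os.environ["HDFS_USER"] = "hadoop"

# "hdfs" was added to schema_to_store(), so hdfs:// URLs resolve to HdfsStore
df = mlrun.get_dataitem("hdfs://namenode.example.com/data/iris.parquet").as_df()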
mlrun/datastore/inmem.py
CHANGED
@@ -80,8 +80,8 @@ class InMemoryStore(DataStore):
             reader = df_module.read_json
         else:
             raise mlrun.errors.MLRunInvalidArgumentError(f"file type unhandled {url}")
-        # InMemoryStore store
-        for field in ["time_column", "start_time", "end_time"]:
+        # InMemoryStore store – don't pass filters
+        for field in ["time_column", "start_time", "end_time", "additional_filters"]:
             kwargs.pop(field, None)

         return reader(item, **kwargs)