mlrun 1.7.0rc5__py3-none-any.whl → 1.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic.
- mlrun/__init__.py +11 -1
- mlrun/__main__.py +39 -121
- mlrun/{datastore/helpers.py → alerts/__init__.py} +2 -5
- mlrun/alerts/alert.py +248 -0
- mlrun/api/schemas/__init__.py +4 -3
- mlrun/artifacts/__init__.py +8 -3
- mlrun/artifacts/base.py +39 -254
- mlrun/artifacts/dataset.py +9 -190
- mlrun/artifacts/manager.py +73 -46
- mlrun/artifacts/model.py +30 -158
- mlrun/artifacts/plots.py +23 -380
- mlrun/common/constants.py +73 -2
- mlrun/common/db/sql_session.py +3 -2
- mlrun/common/formatters/__init__.py +21 -0
- mlrun/common/formatters/artifact.py +46 -0
- mlrun/common/formatters/base.py +113 -0
- mlrun/common/formatters/feature_set.py +44 -0
- mlrun/common/formatters/function.py +46 -0
- mlrun/common/formatters/pipeline.py +53 -0
- mlrun/common/formatters/project.py +51 -0
- mlrun/common/formatters/run.py +29 -0
- mlrun/common/helpers.py +11 -1
- mlrun/{runtimes → common/runtimes}/constants.py +32 -4
- mlrun/common/schemas/__init__.py +21 -4
- mlrun/common/schemas/alert.py +202 -0
- mlrun/common/schemas/api_gateway.py +113 -2
- mlrun/common/schemas/artifact.py +28 -1
- mlrun/common/schemas/auth.py +11 -0
- mlrun/common/schemas/client_spec.py +2 -1
- mlrun/common/schemas/common.py +7 -4
- mlrun/common/schemas/constants.py +3 -0
- mlrun/common/schemas/feature_store.py +58 -28
- mlrun/common/schemas/frontend_spec.py +8 -0
- mlrun/common/schemas/function.py +11 -0
- mlrun/common/schemas/hub.py +7 -9
- mlrun/common/schemas/model_monitoring/__init__.py +21 -4
- mlrun/common/schemas/model_monitoring/constants.py +136 -42
- mlrun/common/schemas/model_monitoring/grafana.py +9 -5
- mlrun/common/schemas/model_monitoring/model_endpoints.py +89 -41
- mlrun/common/schemas/notification.py +69 -12
- mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
- mlrun/common/schemas/pipeline.py +7 -0
- mlrun/common/schemas/project.py +67 -16
- mlrun/common/schemas/runs.py +17 -0
- mlrun/common/schemas/schedule.py +1 -1
- mlrun/common/schemas/workflow.py +10 -2
- mlrun/common/types.py +14 -1
- mlrun/config.py +224 -58
- mlrun/data_types/data_types.py +11 -1
- mlrun/data_types/spark.py +5 -4
- mlrun/data_types/to_pandas.py +75 -34
- mlrun/datastore/__init__.py +8 -10
- mlrun/datastore/alibaba_oss.py +131 -0
- mlrun/datastore/azure_blob.py +131 -43
- mlrun/datastore/base.py +107 -47
- mlrun/datastore/datastore.py +17 -7
- mlrun/datastore/datastore_profile.py +91 -7
- mlrun/datastore/dbfs_store.py +3 -7
- mlrun/datastore/filestore.py +1 -3
- mlrun/datastore/google_cloud_storage.py +92 -32
- mlrun/datastore/hdfs.py +5 -0
- mlrun/datastore/inmem.py +6 -3
- mlrun/datastore/redis.py +3 -2
- mlrun/datastore/s3.py +30 -12
- mlrun/datastore/snowflake_utils.py +45 -0
- mlrun/datastore/sources.py +274 -59
- mlrun/datastore/spark_utils.py +30 -0
- mlrun/datastore/store_resources.py +9 -7
- mlrun/datastore/storeytargets.py +151 -0
- mlrun/datastore/targets.py +374 -102
- mlrun/datastore/utils.py +68 -5
- mlrun/datastore/v3io.py +28 -50
- mlrun/db/auth_utils.py +152 -0
- mlrun/db/base.py +231 -22
- mlrun/db/factory.py +1 -4
- mlrun/db/httpdb.py +864 -228
- mlrun/db/nopdb.py +268 -16
- mlrun/errors.py +35 -5
- mlrun/execution.py +111 -38
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +46 -53
- mlrun/feature_store/common.py +6 -11
- mlrun/feature_store/feature_set.py +48 -23
- mlrun/feature_store/feature_vector.py +13 -2
- mlrun/feature_store/ingestion.py +7 -6
- mlrun/feature_store/retrieval/base.py +9 -4
- mlrun/feature_store/retrieval/dask_merger.py +2 -0
- mlrun/feature_store/retrieval/job.py +13 -4
- mlrun/feature_store/retrieval/local_merger.py +2 -0
- mlrun/feature_store/retrieval/spark_merger.py +24 -32
- mlrun/feature_store/steps.py +38 -19
- mlrun/features.py +6 -14
- mlrun/frameworks/_common/plan.py +3 -3
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +7 -12
- mlrun/frameworks/_ml_common/plan.py +1 -1
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +2 -2
- mlrun/frameworks/lgbm/__init__.py +1 -1
- mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
- mlrun/frameworks/lgbm/model_handler.py +1 -1
- mlrun/frameworks/parallel_coordinates.py +4 -4
- mlrun/frameworks/pytorch/__init__.py +2 -2
- mlrun/frameworks/sklearn/__init__.py +1 -1
- mlrun/frameworks/sklearn/mlrun_interface.py +13 -3
- mlrun/frameworks/tf_keras/__init__.py +5 -2
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +1 -1
- mlrun/frameworks/tf_keras/mlrun_interface.py +2 -2
- mlrun/frameworks/xgboost/__init__.py +1 -1
- mlrun/k8s_utils.py +57 -12
- mlrun/launcher/__init__.py +1 -1
- mlrun/launcher/base.py +6 -5
- mlrun/launcher/client.py +13 -11
- mlrun/launcher/factory.py +1 -1
- mlrun/launcher/local.py +15 -5
- mlrun/launcher/remote.py +10 -3
- mlrun/lists.py +6 -2
- mlrun/model.py +297 -48
- mlrun/model_monitoring/__init__.py +1 -1
- mlrun/model_monitoring/api.py +152 -357
- mlrun/model_monitoring/applications/__init__.py +10 -0
- mlrun/model_monitoring/applications/_application_steps.py +190 -0
- mlrun/model_monitoring/applications/base.py +108 -0
- mlrun/model_monitoring/applications/context.py +341 -0
- mlrun/model_monitoring/{evidently_application.py → applications/evidently_base.py} +27 -22
- mlrun/model_monitoring/applications/histogram_data_drift.py +227 -91
- mlrun/model_monitoring/applications/results.py +99 -0
- mlrun/model_monitoring/controller.py +130 -303
- mlrun/model_monitoring/{stores/models/sqlite.py → db/__init__.py} +5 -10
- mlrun/model_monitoring/db/stores/__init__.py +136 -0
- mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
- mlrun/model_monitoring/db/stores/base/store.py +213 -0
- mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +190 -0
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +103 -0
- mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +659 -0
- mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +726 -0
- mlrun/model_monitoring/db/tsdb/__init__.py +105 -0
- mlrun/model_monitoring/db/tsdb/base.py +448 -0
- mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
- mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +298 -0
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +42 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +522 -0
- mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +158 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +849 -0
- mlrun/model_monitoring/features_drift_table.py +34 -22
- mlrun/model_monitoring/helpers.py +177 -39
- mlrun/model_monitoring/model_endpoint.py +3 -2
- mlrun/model_monitoring/stream_processing.py +165 -398
- mlrun/model_monitoring/tracking_policy.py +7 -1
- mlrun/model_monitoring/writer.py +161 -125
- mlrun/package/packagers/default_packager.py +2 -2
- mlrun/package/packagers_manager.py +1 -0
- mlrun/package/utils/_formatter.py +2 -2
- mlrun/platforms/__init__.py +11 -10
- mlrun/platforms/iguazio.py +67 -228
- mlrun/projects/__init__.py +6 -1
- mlrun/projects/operations.py +47 -20
- mlrun/projects/pipelines.py +396 -249
- mlrun/projects/project.py +1125 -414
- mlrun/render.py +28 -22
- mlrun/run.py +207 -180
- mlrun/runtimes/__init__.py +76 -11
- mlrun/runtimes/base.py +40 -14
- mlrun/runtimes/daskjob.py +9 -2
- mlrun/runtimes/databricks_job/databricks_runtime.py +1 -0
- mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
- mlrun/runtimes/funcdoc.py +1 -29
- mlrun/runtimes/kubejob.py +34 -128
- mlrun/runtimes/local.py +39 -10
- mlrun/runtimes/mpijob/__init__.py +0 -20
- mlrun/runtimes/mpijob/abstract.py +8 -8
- mlrun/runtimes/mpijob/v1.py +1 -1
- mlrun/runtimes/nuclio/api_gateway.py +646 -177
- mlrun/runtimes/nuclio/application/__init__.py +15 -0
- mlrun/runtimes/nuclio/application/application.py +758 -0
- mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
- mlrun/runtimes/nuclio/function.py +188 -68
- mlrun/runtimes/nuclio/serving.py +57 -60
- mlrun/runtimes/pod.py +191 -58
- mlrun/runtimes/remotesparkjob.py +11 -8
- mlrun/runtimes/sparkjob/spark3job.py +17 -18
- mlrun/runtimes/utils.py +40 -73
- mlrun/secrets.py +6 -2
- mlrun/serving/__init__.py +8 -1
- mlrun/serving/remote.py +2 -3
- mlrun/serving/routers.py +89 -64
- mlrun/serving/server.py +54 -26
- mlrun/serving/states.py +187 -56
- mlrun/serving/utils.py +19 -11
- mlrun/serving/v2_serving.py +136 -63
- mlrun/track/tracker.py +2 -1
- mlrun/track/trackers/mlflow_tracker.py +5 -0
- mlrun/utils/async_http.py +26 -6
- mlrun/utils/db.py +18 -0
- mlrun/utils/helpers.py +375 -105
- mlrun/utils/http.py +2 -2
- mlrun/utils/logger.py +75 -9
- mlrun/utils/notifications/notification/__init__.py +14 -10
- mlrun/utils/notifications/notification/base.py +48 -0
- mlrun/utils/notifications/notification/console.py +2 -0
- mlrun/utils/notifications/notification/git.py +24 -1
- mlrun/utils/notifications/notification/ipython.py +2 -0
- mlrun/utils/notifications/notification/slack.py +96 -21
- mlrun/utils/notifications/notification/webhook.py +63 -2
- mlrun/utils/notifications/notification_pusher.py +146 -16
- mlrun/utils/regex.py +9 -0
- mlrun/utils/retryer.py +3 -2
- mlrun/utils/v3io_clients.py +2 -3
- mlrun/utils/version/version.json +2 -2
- mlrun-1.7.2.dist-info/METADATA +390 -0
- mlrun-1.7.2.dist-info/RECORD +351 -0
- {mlrun-1.7.0rc5.dist-info → mlrun-1.7.2.dist-info}/WHEEL +1 -1
- mlrun/feature_store/retrieval/conversion.py +0 -271
- mlrun/kfpops.py +0 -868
- mlrun/model_monitoring/application.py +0 -310
- mlrun/model_monitoring/batch.py +0 -974
- mlrun/model_monitoring/controller_handler.py +0 -37
- mlrun/model_monitoring/prometheus.py +0 -216
- mlrun/model_monitoring/stores/__init__.py +0 -111
- mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -574
- mlrun/model_monitoring/stores/model_endpoint_store.py +0 -145
- mlrun/model_monitoring/stores/models/__init__.py +0 -27
- mlrun/model_monitoring/stores/models/base.py +0 -84
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -382
- mlrun/platforms/other.py +0 -305
- mlrun-1.7.0rc5.dist-info/METADATA +0 -269
- mlrun-1.7.0rc5.dist-info/RECORD +0 -323
- {mlrun-1.7.0rc5.dist-info → mlrun-1.7.2.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc5.dist-info → mlrun-1.7.2.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc5.dist-info → mlrun-1.7.2.dist-info}/top_level.txt +0 -0
mlrun/datastore/base.py
CHANGED

@@ -24,20 +24,16 @@ import pandas as pd
 import pyarrow
 import pytz
 import requests
-import urllib3
 from deprecated import deprecated

+import mlrun.config
 import mlrun.errors
 from mlrun.errors import err_to_str
-from mlrun.utils import StorePrefix,
+from mlrun.utils import StorePrefix, is_jupyter, logger

 from .store_resources import is_store_uri, parse_store_uri
 from .utils import filter_df_start_end_time, select_columns_from_df

-verify_ssl = False
-if not verify_ssl:
-    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-

 class FileStats:
     def __init__(self, size, modified, content_type=None):

@@ -160,6 +156,18 @@ class DataStore:
     def put(self, key, data, append=False):
         pass

+    def _prepare_put_data(self, data, append=False):
+        mode = "a" if append else "w"
+        if isinstance(data, bytearray):
+            data = bytes(data)
+
+        if isinstance(data, bytes):
+            return data, f"{mode}b"
+        elif isinstance(data, str):
+            return data, mode
+        else:
+            raise TypeError(f"Unable to put a value of type {type(self).__name__}")
+
     def stat(self, key):
         pass

@@ -182,11 +190,23 @@ class DataStore:
         return {}

     @staticmethod
-    def _parquet_reader(
+    def _parquet_reader(
+        df_module,
+        url,
+        file_system,
+        time_column,
+        start_time,
+        end_time,
+        additional_filters,
+    ):
         from storey.utils import find_filters, find_partitions

         def set_filters(
-            partitions_time_attributes,
+            partitions_time_attributes,
+            start_time_inner,
+            end_time_inner,
+            filters_inner,
+            kwargs,
         ):
             filters = []
             find_filters(

@@ -196,20 +216,32 @@ class DataStore:
                 filters,
                 time_column,
             )
+            if filters and filters_inner:
+                filters[0] += filters_inner
+
             kwargs["filters"] = filters

         def reader(*args, **kwargs):
-            if start_time or end_time:
-
-
-
-
+            if time_column is None and (start_time or end_time):
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "When providing start_time or end_time, must provide time_column"
+                )
+            if (
+                start_time
+                and end_time
+                and start_time.utcoffset() != end_time.utcoffset()
+            ):
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "start_time and end_time must have the same time zone"
+                )

+            if start_time or end_time or additional_filters:
                 partitions_time_attributes = find_partitions(url, file_system)
                 set_filters(
                     partitions_time_attributes,
                     start_time,
                     end_time,
+                    additional_filters,
                     kwargs,
                 )
             try:

@@ -220,17 +252,23 @@ class DataStore:
                 ):
                     raise ex

-
-
-
-
-
-
+                start_time_inner = None
+                if start_time:
+                    start_time_inner = start_time.replace(
+                        tzinfo=None if start_time.tzinfo else pytz.utc
+                    )
+
+                end_time_inner = None
+                if end_time:
+                    end_time_inner = end_time.replace(
+                        tzinfo=None if end_time.tzinfo else pytz.utc
+                    )

                 set_filters(
                     partitions_time_attributes,
                     start_time_inner,
                     end_time_inner,
+                    additional_filters,
                     kwargs,
                 )
                 return df_module.read_parquet(*args, **kwargs)

@@ -249,6 +287,7 @@ class DataStore:
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
         df_module = df_module or pd

@@ -304,16 +343,18 @@ class DataStore:
                 dfs.append(df_module.read_csv(*updated_args, **kwargs))
             return df_module.concat(dfs)

-        elif (
-            file_url.endswith(".parquet")
-            or file_url.endswith(".pq")
-            or format == "parquet"
-        ):
+        elif mlrun.utils.helpers.is_parquet_file(file_url, format):
             if columns:
                 kwargs["columns"] = columns

             reader = self._parquet_reader(
-                df_module,
+                df_module,
+                url,
+                file_system,
+                time_column,
+                start_time,
+                end_time,
+                additional_filters,
             )

         elif file_url.endswith(".json") or format == "json":

@@ -365,7 +406,10 @@ class DataStore:
         }

     def rm(self, path, recursive=False, maxdepth=None):
-
+        try:
+            self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
+        except FileNotFoundError:
+            pass

     @staticmethod
     def _is_dd(df_module):

@@ -392,14 +436,15 @@ class DataItem:


         # reading run results using DataItem (run.artifact())
-        train_run = train_iris_func.run(
-
+        train_run = train_iris_func.run(
+            inputs={"dataset": dataset}, params={"label_column": "label"}
+        )

-        train_run.artifact(
-        test_set = train_run.artifact(
+        train_run.artifact("confusion-matrix").show()
+        test_set = train_run.artifact("test_set").as_df()

         # create and use DataItem from uri
-        data = mlrun.get_dataitem(
+        data = mlrun.get_dataitem("http://xyz/data.json").get()
     """

     def __init__(

@@ -541,6 +586,7 @@ class DataItem:
         time_column=None,
         start_time=None,
         end_time=None,
+        additional_filters=None,
         **kwargs,
     ):
         """return a dataframe object (generated from the dataitem).

@@ -552,6 +598,12 @@ class DataItem:
         :param end_time:    filters out data after this time
         :param time_column: Store timestamp_key will be used if None.
                             The results will be filtered by this column and start_time & end_time.
+        :param additional_filters: List of additional_filter conditions as tuples.
+                            Each tuple should be in the format (column_name, operator, value).
+                            Supported operators: "=", ">=", "<=", ">", "<".
+                            Example: [("Product", "=", "Computer")]
+                            For all supported filters, please see:
+                            https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
         """
         df = self._store.as_df(
             self._url,

@@ -562,18 +614,19 @@ class DataItem:
             time_column=time_column,
             start_time=start_time,
             end_time=end_time,
+            additional_filters=additional_filters,
             **kwargs,
         )
         return df

-    def show(self, format=None):
+    def show(self, format: Optional[str] = None) -> None:
         """show the data object content in Jupyter

         :param format: format to use (when there is no/wrong suffix), e.g. 'png'
         """
-        if not
+        if not is_jupyter:
             logger.warning(
-                "Jupyter
+                "Jupyter was not detected. `.show()` displays only inside Jupyter."
             )
             return

@@ -633,17 +686,6 @@ def basic_auth_header(user, password):
     return {"Authorization": authstr}


-def http_get(url, headers=None, auth=None):
-    try:
-        response = requests.get(url, headers=headers, auth=auth, verify=verify_ssl)
-    except OSError as exc:
-        raise OSError(f"error: cannot connect to {url}: {err_to_str(exc)}")
-
-    mlrun.errors.raise_for_status(response)
-
-    return response.content
-
-
 class HttpStore(DataStore):
     def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
         super().__init__(parent, name, schema, endpoint, secrets)

@@ -671,7 +713,7 @@ class HttpStore(DataStore):
         raise ValueError("unimplemented")

     def get(self, key, size=None, offset=0):
-        data =
+        data = self._http_get(self.url + self._join(key), self._headers, self.auth)
         if offset:
             data = data[offset:]
         if size:

@@ -691,13 +733,31 @@ class HttpStore(DataStore):
                 f"schema as it is not secure and is not recommended."
             )

+    def _http_get(
+        self,
+        url,
+        headers=None,
+        auth=None,
+    ):
+        # import here to prevent import cycle
+        from mlrun.config import config as mlconf
+
+        verify_ssl = mlconf.httpdb.http.verify
+        try:
+            response = requests.get(url, headers=headers, auth=auth, verify=verify_ssl)
+        except OSError as exc:
+            raise OSError(f"error: cannot connect to {url}: {err_to_str(exc)}")
+
+        mlrun.errors.raise_for_status(response)
+        return response.content
+

 # This wrapper class is designed to extract the 'ds' schema and profile name from URL-formatted paths.
 # Within fsspec, the AbstractFileSystem::_strip_protocol() internal method is used to handle complete URL paths.
 # As an example, it converts an S3 URL 's3://s3bucket/path' to just 's3bucket/path'.
 # Since 'ds' schemas are not inherently processed by fsspec, we have adapted the _strip_protocol()
 # method specifically to strip away the 'ds' schema as required.
-def
+def make_datastore_schema_sanitizer(cls, using_bucket=False, *args, **kwargs):
     if not issubclass(cls, fsspec.AbstractFileSystem):
         raise ValueError("Class must be a subclass of fsspec.AbstractFileSystem")
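
The `additional_filters` argument introduced above is threaded from `DataItem.as_df()` through `DataStore.as_df()` into `_parquet_reader`, where it is merged into the pyarrow parquet filters. A minimal usage sketch; the dataset URL, column names, and filter values are hypothetical:

```python
import mlrun

# Hypothetical parquet dataset; additional_filters takes (column, operator, value)
# tuples as documented in the new as_df() docstring above.
item = mlrun.get_dataitem("s3://my-bucket/sales.parquet")
df = item.as_df(
    time_column="timestamp",
    additional_filters=[("Product", "=", "Computer")],
)
```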
mlrun/datastore/datastore.py
CHANGED

@@ -21,7 +21,7 @@ from mlrun.datastore.datastore_profile import datastore_profile_read
 from mlrun.errors import err_to_str
 from mlrun.utils.helpers import get_local_file_schema

-from ..utils import DB_SCHEMA,
+from ..utils import DB_SCHEMA, RunKeys
 from .base import DataItem, DataStore, HttpStore
 from .filestore import FileStore
 from .inmem import InMemoryStore

@@ -32,6 +32,8 @@ in_memory_store = InMemoryStore()


 def parse_url(url):
+    if url and url.startswith("v3io://") and not url.startswith("v3io:///"):
+        url = url.replace("v3io://", "v3io:///", 1)
     parsed_url = urlparse(url)
     schema = parsed_url.scheme.lower()
     endpoint = parsed_url.hostname

@@ -94,10 +96,14 @@ def schema_to_store(schema):
         from .dbfs_store import DBFSStore

         return DBFSStore
-    elif schema
+    elif schema in ["hdfs", "webhdfs"]:
         from .hdfs import HdfsStore

         return HdfsStore
+    elif schema == "oss":
+        from .alibaba_oss import OSSStore
+
+        return OSSStore
     else:
         raise ValueError(f"unsupported store scheme ({schema})")

@@ -129,7 +135,7 @@ class StoreManager:
         return self._db

     def from_dict(self, struct: dict):
-        stor_list = struct.get(
+        stor_list = struct.get(RunKeys.data_stores)
         if stor_list and isinstance(stor_list, list):
             for stor in stor_list:
                 schema, endpoint, parsed_url = parse_url(stor.get("url"))

@@ -141,7 +147,7 @@ class StoreManager:
             self._stores[stor["name"]] = new_stor

     def to_dict(self, struct):
-        struct[
+        struct[RunKeys.data_stores] = [
             stor.to_dict() for stor in self._stores.values() if stor.from_spec
         ]

@@ -203,7 +209,7 @@ class StoreManager:
     ) -> (DataStore, str, str):
         schema, endpoint, parsed_url = parse_url(url)
         subpath = parsed_url.path
-        store_key = f"{schema}://{endpoint}"
+        store_key = f"{schema}://{endpoint}" if endpoint else f"{schema}://"

         if schema == "ds":
             datastore_profile = datastore_profile_read(url, project_name, secrets)

@@ -219,6 +225,11 @@ class StoreManager:
             subpath = url[len("memory://") :]
             return in_memory_store, subpath, url

+        elif schema in get_local_file_schema():
+            # parse_url() will drop the windows drive-letter from the path for url like "c:\a\b".
+            # As a workaround, we set subpath to the url.
+            subpath = url.replace("file://", "", 1)
+
         if not schema and endpoint:
             if endpoint in self._stores.keys():
                 return self._stores[endpoint], subpath, url

@@ -237,8 +248,7 @@ class StoreManager:
         )
         if not secrets and not mlrun.config.is_running_as_api():
             self._stores[store_key] = store
-
-        return store, url if store.kind == "file" else subpath, url
+        return store, subpath, url

     def reset_secrets(self):
         self._secrets = {}
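
The `parse_url()` change above normalizes `v3io://` URLs to the triple-slash form before parsing. A small illustration of the underlying `urlparse` behavior that motivates it; the paths are made up:

```python
from urllib.parse import urlparse

# With only two slashes, urlparse() treats the first path segment as the hostname,
# so "projects" would be dropped from the path.
print(urlparse("v3io://projects/my-proj/artifacts").path)   # /my-proj/artifacts
print(urlparse("v3io:///projects/my-proj/artifacts").path)  # /projects/my-proj/artifacts
```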
mlrun/datastore/datastore_profile.py
CHANGED

@@ -16,6 +16,7 @@ import ast
 import base64
 import json
 import typing
+import warnings
 from urllib.parse import ParseResult, urlparse, urlunparse

 import pydantic

@@ -36,6 +37,7 @@ class DatastoreProfile(pydantic.BaseModel):
         extra = pydantic.Extra.forbid

     @pydantic.validator("name")
+    @classmethod
     def lower_case(cls, v):
         return v.lower()

@@ -68,6 +70,9 @@ class TemporaryClientDatastoreProfiles(metaclass=mlrun.utils.singleton.Singleton
     def get(self, key):
         return self._data.get(key, None)

+    def remove(self, key):
+        self._data.pop(key, None)
+

 class DatastoreProfileBasic(DatastoreProfile):
     type: str = pydantic.Field("basic")

@@ -79,13 +84,37 @@ class DatastoreProfileBasic(DatastoreProfile):
 class DatastoreProfileKafkaTarget(DatastoreProfile):
     type: str = pydantic.Field("kafka_target")
     _private_attributes = "kwargs_private"
-    bootstrap_servers: str
+    bootstrap_servers: typing.Optional[str] = None
+    brokers: typing.Optional[str] = None
     topic: str
     kwargs_public: typing.Optional[dict]
     kwargs_private: typing.Optional[dict]

+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        if not self.brokers and not self.bootstrap_servers:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "DatastoreProfileKafkaTarget requires the 'brokers' field to be set"
+            )
+
+        if self.bootstrap_servers:
+            if self.brokers:
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "DatastoreProfileKafkaTarget cannot be created with both 'brokers' and 'bootstrap_servers'"
+                )
+            else:
+                self.brokers = self.bootstrap_servers
+                self.bootstrap_servers = None
+                warnings.warn(
+                    "'bootstrap_servers' parameter is deprecated in 1.7.0 and will be removed in 1.9.0, "
+                    "use 'brokers' instead.",
+                    # TODO: Remove this in 1.9.0
+                    FutureWarning,
+                )
+
     def attributes(self):
-        attributes = {"
+        attributes = {"brokers": self.brokers or self.bootstrap_servers}
         if self.kwargs_public:
             attributes = merge(attributes, self.kwargs_public)
         if self.kwargs_private:

@@ -157,6 +186,18 @@ class DatastoreProfileS3(DatastoreProfile):
     assume_role_arn: typing.Optional[str] = None
     access_key_id: typing.Optional[str] = None
     secret_key: typing.Optional[str] = None
+    bucket: typing.Optional[str] = None
+
+    @pydantic.validator("bucket")
+    @classmethod
+    def check_bucket(cls, v):
+        if not v:
+            warnings.warn(
+                "The 'bucket' attribute will be mandatory starting from version 1.9",
+                FutureWarning,
+                stacklevel=2,
+            )
+        return v

     def secrets(self) -> dict:
         res = {}

@@ -175,7 +216,13 @@ class DatastoreProfileS3(DatastoreProfile):
         return res

     def url(self, subpath):
-
+        # TODO: There is an inconsistency with DatastoreProfileGCS. In DatastoreProfileGCS,
+        # we assume that the subpath can begin without a '/' character,
+        # while here we assume it always starts with one.
+        if self.bucket:
+            return f"s3://{self.bucket}{subpath}"
+        else:
+            return f"s3:/{subpath}"


 class DatastoreProfileRedis(DatastoreProfile):

@@ -244,18 +291,36 @@ class DatastoreProfileGCS(DatastoreProfile):
     _private_attributes = ("gcp_credentials",)
     credentials_path: typing.Optional[str] = None  # path to file.
     gcp_credentials: typing.Optional[typing.Union[str, dict]] = None
+    bucket: typing.Optional[str] = None
+
+    @pydantic.validator("bucket")
+    @classmethod
+    def check_bucket(cls, v):
+        if not v:
+            warnings.warn(
+                "The 'bucket' attribute will be mandatory starting from version 1.9",
+                FutureWarning,
+                stacklevel=2,
+            )
+        return v

     @pydantic.validator("gcp_credentials", pre=True, always=True)
+    @classmethod
     def convert_dict_to_json(cls, v):
         if isinstance(v, dict):
             return json.dumps(v)
         return v

     def url(self, subpath) -> str:
+        # TODO: but there's something wrong with the subpath being assumed to not start with a slash here,
+        # but the opposite assumption is made in S3.
         if subpath.startswith("/"):
             # in gcs the path after schema is starts with bucket, wherefore it should not start with "/".
             subpath = subpath[1:]
-
+        if self.bucket:
+            return f"gcs://{self.bucket}/{subpath}"
+        else:
+            return f"gcs://{subpath}"

     def secrets(self) -> dict:
         res = {}

@@ -283,12 +348,27 @@ class DatastoreProfileAzureBlob(DatastoreProfile):
     client_secret: typing.Optional[str] = None
     sas_token: typing.Optional[str] = None
     credential: typing.Optional[str] = None
+    container: typing.Optional[str] = None
+
+    @pydantic.validator("container")
+    @classmethod
+    def check_container(cls, v):
+        if not v:
+            warnings.warn(
+                "The 'container' attribute will be mandatory starting from version 1.9",
+                FutureWarning,
+                stacklevel=2,
+            )
+        return v

     def url(self, subpath) -> str:
         if subpath.startswith("/"):
-            # in azure the path after schema is starts with
+            # in azure the path after schema is starts with container, wherefore it should not start with "/".
             subpath = subpath[1:]
-
+        if self.container:
+            return f"az://{self.container}/{subpath}"
+        else:
+            return f"az://{subpath}"

     def secrets(self) -> dict:
         res = {}

@@ -332,7 +412,7 @@ class DatastoreProfileHdfs(DatastoreProfile):
         return res or None

     def url(self, subpath):
-        return f"
+        return f"webhdfs://{self.host}:{self.http_port}{subpath}"


 class DatastoreProfile2Json(pydantic.BaseModel):

@@ -460,3 +540,7 @@ def register_temporary_client_datastore_profile(profile: DatastoreProfile):
     It's beneficial for testing purposes.
     """
     TemporaryClientDatastoreProfiles().add(profile)
+
+
+def remove_temporary_client_datastore_profile(profile_name: str):
+    TemporaryClientDatastoreProfiles().remove(profile_name)
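
The profile changes above add optional `bucket`/`container` fields to the object-store profiles and a `remove_temporary_client_datastore_profile()` helper. A hedged sketch of how they fit together; the profile name, bucket, and credentials are placeholders:

```python
from mlrun.datastore.datastore_profile import (
    DatastoreProfileS3,
    register_temporary_client_datastore_profile,
    remove_temporary_client_datastore_profile,
)

profile = DatastoreProfileS3(
    name="my-s3",               # profile names are lower-cased by the "name" validator
    access_key_id="AKIA...",    # placeholder credentials
    secret_key="...",
    bucket="my-bucket",         # omitting 'bucket' now emits a FutureWarning
)
register_temporary_client_datastore_profile(profile)
# ds://my-s3/<subpath> now resolves to s3://my-bucket/<subpath> via profile.url()
remove_temporary_client_datastore_profile("my-s3")
```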
mlrun/datastore/dbfs_store.py
CHANGED

@@ -19,7 +19,7 @@ from fsspec.registry import get_filesystem_class

 import mlrun.errors

-from .base import DataStore, FileStats,
+from .base import DataStore, FileStats, make_datastore_schema_sanitizer


 class DatabricksFileBugFixed(DatabricksFile):

@@ -89,7 +89,7 @@ class DBFSStore(DataStore):
         """return fsspec file system object, if supported"""
         filesystem_class = get_filesystem_class(protocol=self.kind)
         if not self._filesystem:
-            self._filesystem =
+            self._filesystem = make_datastore_schema_sanitizer(
                 cls=filesystem_class,
                 using_bucket=False,
                 **self.get_storage_options(),

@@ -130,11 +130,7 @@ class DBFSStore(DataStore):
                 "Append mode not supported for Databricks file system"
             )
         # can not use append mode because it overrides data.
-        mode =
-        if isinstance(data, bytes):
-            mode += "b"
-        elif not isinstance(data, str):
-            raise TypeError(f"Unknown data type {type(data)}")
+        data, mode = self._prepare_put_data(data, append)
         with self.filesystem.open(key, mode) as f:
             f.write(data)
mlrun/datastore/filestore.py
CHANGED

@@ -66,9 +66,7 @@ class FileStore(DataStore):
         dir_to_create = path.dirname(self._join(key))
         if dir_to_create:
             self._ensure_directory(dir_to_create)
-        mode =
-        if isinstance(data, bytes):
-            mode = mode + "b"
+        data, mode = self._prepare_put_data(data, append)
         with open(self._join(key), mode) as fp:
             fp.write(data)
             fp.close()
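
Both `FileStore.put()` and `DBFSStore.put()` now delegate payload and mode selection to the shared `DataStore._prepare_put_data()` helper shown in the base.py diff. A rough sketch of the resulting behavior through `DataItem.put()`; the local path is a placeholder and the `put()` call signature is assumed:

```python
import mlrun

item = mlrun.get_dataitem("/tmp/example.txt")     # backed by FileStore
item.put("text payload")                          # str       -> mode "w"
item.put(b"binary payload")                       # bytes     -> mode "wb"
item.put(bytearray(b"also binary"))               # bytearray -> bytes, mode "wb"
item.put("appended line", append=True)            # append=True -> mode "a"
```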