mlrun 1.6.4rc7__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/__init__.py +11 -1
- mlrun/__main__.py +40 -122
- mlrun/alerts/__init__.py +15 -0
- mlrun/alerts/alert.py +248 -0
- mlrun/api/schemas/__init__.py +5 -4
- mlrun/artifacts/__init__.py +8 -3
- mlrun/artifacts/base.py +47 -257
- mlrun/artifacts/dataset.py +11 -192
- mlrun/artifacts/manager.py +79 -47
- mlrun/artifacts/model.py +31 -159
- mlrun/artifacts/plots.py +23 -380
- mlrun/common/constants.py +74 -1
- mlrun/common/db/sql_session.py +5 -5
- mlrun/common/formatters/__init__.py +21 -0
- mlrun/common/formatters/artifact.py +45 -0
- mlrun/common/formatters/base.py +113 -0
- mlrun/common/formatters/feature_set.py +33 -0
- mlrun/common/formatters/function.py +46 -0
- mlrun/common/formatters/pipeline.py +53 -0
- mlrun/common/formatters/project.py +51 -0
- mlrun/common/formatters/run.py +29 -0
- mlrun/common/helpers.py +12 -3
- mlrun/common/model_monitoring/helpers.py +9 -5
- mlrun/{runtimes → common/runtimes}/constants.py +37 -9
- mlrun/common/schemas/__init__.py +31 -5
- mlrun/common/schemas/alert.py +202 -0
- mlrun/common/schemas/api_gateway.py +196 -0
- mlrun/common/schemas/artifact.py +25 -4
- mlrun/common/schemas/auth.py +16 -5
- mlrun/common/schemas/background_task.py +1 -1
- mlrun/common/schemas/client_spec.py +4 -2
- mlrun/common/schemas/common.py +7 -4
- mlrun/common/schemas/constants.py +3 -0
- mlrun/common/schemas/feature_store.py +74 -44
- mlrun/common/schemas/frontend_spec.py +15 -7
- mlrun/common/schemas/function.py +12 -1
- mlrun/common/schemas/hub.py +11 -18
- mlrun/common/schemas/memory_reports.py +2 -2
- mlrun/common/schemas/model_monitoring/__init__.py +20 -4
- mlrun/common/schemas/model_monitoring/constants.py +123 -42
- mlrun/common/schemas/model_monitoring/grafana.py +13 -9
- mlrun/common/schemas/model_monitoring/model_endpoints.py +101 -54
- mlrun/common/schemas/notification.py +71 -14
- mlrun/common/schemas/object.py +2 -2
- mlrun/{model_monitoring/controller_handler.py → common/schemas/pagination.py} +9 -12
- mlrun/common/schemas/pipeline.py +8 -1
- mlrun/common/schemas/project.py +69 -18
- mlrun/common/schemas/runs.py +7 -1
- mlrun/common/schemas/runtime_resource.py +8 -12
- mlrun/common/schemas/schedule.py +4 -4
- mlrun/common/schemas/tag.py +1 -2
- mlrun/common/schemas/workflow.py +12 -4
- mlrun/common/types.py +14 -1
- mlrun/config.py +154 -69
- mlrun/data_types/data_types.py +6 -1
- mlrun/data_types/spark.py +2 -2
- mlrun/data_types/to_pandas.py +67 -37
- mlrun/datastore/__init__.py +6 -8
- mlrun/datastore/alibaba_oss.py +131 -0
- mlrun/datastore/azure_blob.py +143 -42
- mlrun/datastore/base.py +102 -58
- mlrun/datastore/datastore.py +34 -13
- mlrun/datastore/datastore_profile.py +146 -20
- mlrun/datastore/dbfs_store.py +3 -7
- mlrun/datastore/filestore.py +1 -4
- mlrun/datastore/google_cloud_storage.py +97 -33
- mlrun/datastore/hdfs.py +56 -0
- mlrun/datastore/inmem.py +6 -3
- mlrun/datastore/redis.py +7 -2
- mlrun/datastore/s3.py +34 -12
- mlrun/datastore/snowflake_utils.py +45 -0
- mlrun/datastore/sources.py +303 -111
- mlrun/datastore/spark_utils.py +31 -2
- mlrun/datastore/store_resources.py +9 -7
- mlrun/datastore/storeytargets.py +151 -0
- mlrun/datastore/targets.py +453 -176
- mlrun/datastore/utils.py +72 -58
- mlrun/datastore/v3io.py +6 -1
- mlrun/db/base.py +274 -41
- mlrun/db/factory.py +1 -1
- mlrun/db/httpdb.py +893 -225
- mlrun/db/nopdb.py +291 -33
- mlrun/errors.py +36 -6
- mlrun/execution.py +115 -42
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +65 -73
- mlrun/feature_store/common.py +7 -12
- mlrun/feature_store/feature_set.py +76 -55
- mlrun/feature_store/feature_vector.py +39 -31
- mlrun/feature_store/ingestion.py +7 -6
- mlrun/feature_store/retrieval/base.py +16 -11
- mlrun/feature_store/retrieval/dask_merger.py +2 -0
- mlrun/feature_store/retrieval/job.py +13 -4
- mlrun/feature_store/retrieval/local_merger.py +2 -0
- mlrun/feature_store/retrieval/spark_merger.py +24 -32
- mlrun/feature_store/steps.py +45 -34
- mlrun/features.py +11 -21
- mlrun/frameworks/_common/artifacts_library.py +9 -9
- mlrun/frameworks/_common/mlrun_interface.py +5 -5
- mlrun/frameworks/_common/model_handler.py +48 -48
- mlrun/frameworks/_common/plan.py +5 -6
- mlrun/frameworks/_common/producer.py +3 -4
- mlrun/frameworks/_common/utils.py +5 -5
- mlrun/frameworks/_dl_common/loggers/logger.py +6 -7
- mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +9 -9
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +23 -47
- mlrun/frameworks/_ml_common/artifacts_library.py +1 -2
- mlrun/frameworks/_ml_common/loggers/logger.py +3 -4
- mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +4 -5
- mlrun/frameworks/_ml_common/model_handler.py +24 -24
- mlrun/frameworks/_ml_common/pkl_model_server.py +2 -2
- mlrun/frameworks/_ml_common/plan.py +2 -2
- mlrun/frameworks/_ml_common/plans/calibration_curve_plan.py +2 -3
- mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +2 -3
- mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
- mlrun/frameworks/_ml_common/plans/feature_importance_plan.py +3 -3
- mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
- mlrun/frameworks/_ml_common/utils.py +4 -4
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +9 -9
- mlrun/frameworks/huggingface/model_server.py +4 -4
- mlrun/frameworks/lgbm/__init__.py +33 -33
- mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
- mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -5
- mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -5
- mlrun/frameworks/lgbm/mlrun_interfaces/booster_mlrun_interface.py +1 -3
- mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +6 -6
- mlrun/frameworks/lgbm/model_handler.py +10 -10
- mlrun/frameworks/lgbm/model_server.py +6 -6
- mlrun/frameworks/lgbm/utils.py +5 -5
- mlrun/frameworks/onnx/dataset.py +8 -8
- mlrun/frameworks/onnx/mlrun_interface.py +3 -3
- mlrun/frameworks/onnx/model_handler.py +6 -6
- mlrun/frameworks/onnx/model_server.py +7 -7
- mlrun/frameworks/parallel_coordinates.py +6 -6
- mlrun/frameworks/pytorch/__init__.py +18 -18
- mlrun/frameworks/pytorch/callbacks/callback.py +4 -5
- mlrun/frameworks/pytorch/callbacks/logging_callback.py +17 -17
- mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +11 -11
- mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +23 -29
- mlrun/frameworks/pytorch/callbacks_handler.py +38 -38
- mlrun/frameworks/pytorch/mlrun_interface.py +20 -20
- mlrun/frameworks/pytorch/model_handler.py +17 -17
- mlrun/frameworks/pytorch/model_server.py +7 -7
- mlrun/frameworks/sklearn/__init__.py +13 -13
- mlrun/frameworks/sklearn/estimator.py +4 -4
- mlrun/frameworks/sklearn/metrics_library.py +14 -14
- mlrun/frameworks/sklearn/mlrun_interface.py +16 -9
- mlrun/frameworks/sklearn/model_handler.py +2 -2
- mlrun/frameworks/tf_keras/__init__.py +10 -7
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +15 -15
- mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +11 -11
- mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +19 -23
- mlrun/frameworks/tf_keras/mlrun_interface.py +9 -11
- mlrun/frameworks/tf_keras/model_handler.py +14 -14
- mlrun/frameworks/tf_keras/model_server.py +6 -6
- mlrun/frameworks/xgboost/__init__.py +13 -13
- mlrun/frameworks/xgboost/model_handler.py +6 -6
- mlrun/k8s_utils.py +61 -17
- mlrun/launcher/__init__.py +1 -1
- mlrun/launcher/base.py +16 -15
- mlrun/launcher/client.py +13 -11
- mlrun/launcher/factory.py +1 -1
- mlrun/launcher/local.py +23 -13
- mlrun/launcher/remote.py +17 -10
- mlrun/lists.py +7 -6
- mlrun/model.py +478 -103
- mlrun/model_monitoring/__init__.py +1 -1
- mlrun/model_monitoring/api.py +163 -371
- mlrun/{runtimes/mpijob/v1alpha1.py → model_monitoring/applications/__init__.py} +9 -15
- mlrun/model_monitoring/applications/_application_steps.py +188 -0
- mlrun/model_monitoring/applications/base.py +108 -0
- mlrun/model_monitoring/applications/context.py +341 -0
- mlrun/model_monitoring/{evidently_application.py → applications/evidently_base.py} +27 -22
- mlrun/model_monitoring/applications/histogram_data_drift.py +354 -0
- mlrun/model_monitoring/applications/results.py +99 -0
- mlrun/model_monitoring/controller.py +131 -278
- mlrun/model_monitoring/db/__init__.py +18 -0
- mlrun/model_monitoring/db/stores/__init__.py +136 -0
- mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
- mlrun/model_monitoring/db/stores/base/store.py +213 -0
- mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +190 -0
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +103 -0
- mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +659 -0
- mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +726 -0
- mlrun/model_monitoring/db/tsdb/__init__.py +105 -0
- mlrun/model_monitoring/db/tsdb/base.py +448 -0
- mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
- mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +279 -0
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +42 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +507 -0
- mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +158 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +849 -0
- mlrun/model_monitoring/features_drift_table.py +134 -106
- mlrun/model_monitoring/helpers.py +199 -55
- mlrun/model_monitoring/metrics/__init__.py +13 -0
- mlrun/model_monitoring/metrics/histogram_distance.py +127 -0
- mlrun/model_monitoring/model_endpoint.py +3 -2
- mlrun/model_monitoring/stream_processing.py +131 -398
- mlrun/model_monitoring/tracking_policy.py +9 -2
- mlrun/model_monitoring/writer.py +161 -125
- mlrun/package/__init__.py +6 -6
- mlrun/package/context_handler.py +5 -5
- mlrun/package/packager.py +7 -7
- mlrun/package/packagers/default_packager.py +8 -8
- mlrun/package/packagers/numpy_packagers.py +15 -15
- mlrun/package/packagers/pandas_packagers.py +5 -5
- mlrun/package/packagers/python_standard_library_packagers.py +10 -10
- mlrun/package/packagers_manager.py +19 -23
- mlrun/package/utils/_formatter.py +6 -6
- mlrun/package/utils/_pickler.py +2 -2
- mlrun/package/utils/_supported_format.py +4 -4
- mlrun/package/utils/log_hint_utils.py +2 -2
- mlrun/package/utils/type_hint_utils.py +4 -9
- mlrun/platforms/__init__.py +11 -10
- mlrun/platforms/iguazio.py +24 -203
- mlrun/projects/operations.py +52 -25
- mlrun/projects/pipelines.py +191 -197
- mlrun/projects/project.py +1227 -400
- mlrun/render.py +16 -19
- mlrun/run.py +209 -184
- mlrun/runtimes/__init__.py +83 -15
- mlrun/runtimes/base.py +51 -35
- mlrun/runtimes/daskjob.py +17 -10
- mlrun/runtimes/databricks_job/databricks_cancel_task.py +1 -1
- mlrun/runtimes/databricks_job/databricks_runtime.py +8 -7
- mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
- mlrun/runtimes/funcdoc.py +1 -29
- mlrun/runtimes/function_reference.py +1 -1
- mlrun/runtimes/kubejob.py +34 -128
- mlrun/runtimes/local.py +40 -11
- mlrun/runtimes/mpijob/__init__.py +0 -20
- mlrun/runtimes/mpijob/abstract.py +9 -10
- mlrun/runtimes/mpijob/v1.py +1 -1
- mlrun/{model_monitoring/stores/models/sqlite.py → runtimes/nuclio/__init__.py} +7 -9
- mlrun/runtimes/nuclio/api_gateway.py +769 -0
- mlrun/runtimes/nuclio/application/__init__.py +15 -0
- mlrun/runtimes/nuclio/application/application.py +758 -0
- mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
- mlrun/runtimes/{function.py → nuclio/function.py} +200 -83
- mlrun/runtimes/{nuclio.py → nuclio/nuclio.py} +6 -6
- mlrun/runtimes/{serving.py → nuclio/serving.py} +65 -68
- mlrun/runtimes/pod.py +281 -101
- mlrun/runtimes/remotesparkjob.py +12 -9
- mlrun/runtimes/sparkjob/spark3job.py +67 -51
- mlrun/runtimes/utils.py +41 -75
- mlrun/secrets.py +9 -5
- mlrun/serving/__init__.py +8 -1
- mlrun/serving/remote.py +2 -7
- mlrun/serving/routers.py +85 -69
- mlrun/serving/server.py +69 -44
- mlrun/serving/states.py +209 -36
- mlrun/serving/utils.py +22 -14
- mlrun/serving/v1_serving.py +6 -7
- mlrun/serving/v2_serving.py +129 -54
- mlrun/track/tracker.py +2 -1
- mlrun/track/tracker_manager.py +3 -3
- mlrun/track/trackers/mlflow_tracker.py +6 -2
- mlrun/utils/async_http.py +6 -8
- mlrun/utils/azure_vault.py +1 -1
- mlrun/utils/clones.py +1 -2
- mlrun/utils/condition_evaluator.py +3 -3
- mlrun/utils/db.py +21 -3
- mlrun/utils/helpers.py +405 -225
- mlrun/utils/http.py +3 -6
- mlrun/utils/logger.py +112 -16
- mlrun/utils/notifications/notification/__init__.py +17 -13
- mlrun/utils/notifications/notification/base.py +50 -2
- mlrun/utils/notifications/notification/console.py +2 -0
- mlrun/utils/notifications/notification/git.py +24 -1
- mlrun/utils/notifications/notification/ipython.py +3 -1
- mlrun/utils/notifications/notification/slack.py +96 -21
- mlrun/utils/notifications/notification/webhook.py +59 -2
- mlrun/utils/notifications/notification_pusher.py +149 -30
- mlrun/utils/regex.py +9 -0
- mlrun/utils/retryer.py +208 -0
- mlrun/utils/singleton.py +1 -1
- mlrun/utils/v3io_clients.py +4 -6
- mlrun/utils/version/version.json +2 -2
- mlrun/utils/version/version.py +2 -6
- mlrun-1.7.0.dist-info/METADATA +378 -0
- mlrun-1.7.0.dist-info/RECORD +351 -0
- {mlrun-1.6.4rc7.dist-info → mlrun-1.7.0.dist-info}/WHEEL +1 -1
- mlrun/feature_store/retrieval/conversion.py +0 -273
- mlrun/kfpops.py +0 -868
- mlrun/model_monitoring/application.py +0 -310
- mlrun/model_monitoring/batch.py +0 -1095
- mlrun/model_monitoring/prometheus.py +0 -219
- mlrun/model_monitoring/stores/__init__.py +0 -111
- mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -576
- mlrun/model_monitoring/stores/model_endpoint_store.py +0 -147
- mlrun/model_monitoring/stores/models/__init__.py +0 -27
- mlrun/model_monitoring/stores/models/base.py +0 -84
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -384
- mlrun/platforms/other.py +0 -306
- mlrun-1.6.4rc7.dist-info/METADATA +0 -272
- mlrun-1.6.4rc7.dist-info/RECORD +0 -314
- {mlrun-1.6.4rc7.dist-info → mlrun-1.7.0.dist-info}/LICENSE +0 -0
- {mlrun-1.6.4rc7.dist-info → mlrun-1.7.0.dist-info}/entry_points.txt +0 -0
- {mlrun-1.6.4rc7.dist-info → mlrun-1.7.0.dist-info}/top_level.txt +0 -0
mlrun/datastore/base.py
CHANGED
|
@@ -24,13 +24,12 @@ import pandas as pd
|
|
|
24
24
|
import pyarrow
|
|
25
25
|
import pytz
|
|
26
26
|
import requests
|
|
27
|
-
import urllib3
|
|
28
27
|
from deprecated import deprecated
|
|
29
28
|
|
|
30
29
|
import mlrun.config
|
|
31
30
|
import mlrun.errors
|
|
32
31
|
from mlrun.errors import err_to_str
|
|
33
|
-
from mlrun.utils import StorePrefix,
|
|
32
|
+
from mlrun.utils import StorePrefix, is_jupyter, logger
|
|
34
33
|
|
|
35
34
|
from .store_resources import is_store_uri, parse_store_uri
|
|
36
35
|
from .utils import filter_df_start_end_time, select_columns_from_df
|
|
@@ -144,6 +143,10 @@ class DataStore:
|
|
|
144
143
|
def url(self):
|
|
145
144
|
return f"{self.kind}://{self.endpoint}"
|
|
146
145
|
|
|
146
|
+
@property
|
|
147
|
+
def spark_url(self):
|
|
148
|
+
return self.url
|
|
149
|
+
|
|
147
150
|
def get(self, key, size=None, offset=0):
|
|
148
151
|
pass
|
|
149
152
|
|
|
@@ -153,6 +156,18 @@ class DataStore:
|
|
|
153
156
|
def put(self, key, data, append=False):
|
|
154
157
|
pass
|
|
155
158
|
|
|
159
|
+
def _prepare_put_data(self, data, append=False):
|
|
160
|
+
mode = "a" if append else "w"
|
|
161
|
+
if isinstance(data, bytearray):
|
|
162
|
+
data = bytes(data)
|
|
163
|
+
|
|
164
|
+
if isinstance(data, bytes):
|
|
165
|
+
return data, f"{mode}b"
|
|
166
|
+
elif isinstance(data, str):
|
|
167
|
+
return data, mode
|
|
168
|
+
else:
|
|
169
|
+
raise TypeError(f"Unable to put a value of type {type(self).__name__}")
|
|
170
|
+
|
|
156
171
|
def stat(self, key):
|
|
157
172
|
pass
|
|
158
173
|
|
|
@@ -175,11 +190,23 @@ class DataStore:
|
|
|
175
190
|
return {}
|
|
176
191
|
|
|
177
192
|
@staticmethod
|
|
178
|
-
def _parquet_reader(
|
|
193
|
+
def _parquet_reader(
|
|
194
|
+
df_module,
|
|
195
|
+
url,
|
|
196
|
+
file_system,
|
|
197
|
+
time_column,
|
|
198
|
+
start_time,
|
|
199
|
+
end_time,
|
|
200
|
+
additional_filters,
|
|
201
|
+
):
|
|
179
202
|
from storey.utils import find_filters, find_partitions
|
|
180
203
|
|
|
181
204
|
def set_filters(
|
|
182
|
-
partitions_time_attributes,
|
|
205
|
+
partitions_time_attributes,
|
|
206
|
+
start_time_inner,
|
|
207
|
+
end_time_inner,
|
|
208
|
+
filters_inner,
|
|
209
|
+
kwargs,
|
|
183
210
|
):
|
|
184
211
|
filters = []
|
|
185
212
|
find_filters(
|
|
@@ -189,20 +216,32 @@ class DataStore:
|
|
|
189
216
|
filters,
|
|
190
217
|
time_column,
|
|
191
218
|
)
|
|
219
|
+
if filters and filters_inner:
|
|
220
|
+
filters[0] += filters_inner
|
|
221
|
+
|
|
192
222
|
kwargs["filters"] = filters
|
|
193
223
|
|
|
194
224
|
def reader(*args, **kwargs):
|
|
195
|
-
if start_time or end_time:
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
225
|
+
if time_column is None and (start_time or end_time):
|
|
226
|
+
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
227
|
+
"When providing start_time or end_time, must provide time_column"
|
|
228
|
+
)
|
|
229
|
+
if (
|
|
230
|
+
start_time
|
|
231
|
+
and end_time
|
|
232
|
+
and start_time.utcoffset() != end_time.utcoffset()
|
|
233
|
+
):
|
|
234
|
+
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
235
|
+
"start_time and end_time must have the same time zone"
|
|
236
|
+
)
|
|
200
237
|
|
|
238
|
+
if start_time or end_time or additional_filters:
|
|
201
239
|
partitions_time_attributes = find_partitions(url, file_system)
|
|
202
240
|
set_filters(
|
|
203
241
|
partitions_time_attributes,
|
|
204
242
|
start_time,
|
|
205
243
|
end_time,
|
|
244
|
+
additional_filters,
|
|
206
245
|
kwargs,
|
|
207
246
|
)
|
|
208
247
|
try:
|
|
@@ -213,17 +252,23 @@ class DataStore:
|
|
|
213
252
|
):
|
|
214
253
|
raise ex
|
|
215
254
|
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
255
|
+
start_time_inner = None
|
|
256
|
+
if start_time:
|
|
257
|
+
start_time_inner = start_time.replace(
|
|
258
|
+
tzinfo=None if start_time.tzinfo else pytz.utc
|
|
259
|
+
)
|
|
260
|
+
|
|
261
|
+
end_time_inner = None
|
|
262
|
+
if end_time:
|
|
263
|
+
end_time_inner = end_time.replace(
|
|
264
|
+
tzinfo=None if end_time.tzinfo else pytz.utc
|
|
265
|
+
)
|
|
222
266
|
|
|
223
267
|
set_filters(
|
|
224
268
|
partitions_time_attributes,
|
|
225
269
|
start_time_inner,
|
|
226
270
|
end_time_inner,
|
|
271
|
+
additional_filters,
|
|
227
272
|
kwargs,
|
|
228
273
|
)
|
|
229
274
|
return df_module.read_parquet(*args, **kwargs)
|
|
@@ -242,6 +287,7 @@ class DataStore:
|
|
|
242
287
|
start_time=None,
|
|
243
288
|
end_time=None,
|
|
244
289
|
time_column=None,
|
|
290
|
+
additional_filters=None,
|
|
245
291
|
**kwargs,
|
|
246
292
|
):
|
|
247
293
|
df_module = df_module or pd
|
|
@@ -297,16 +343,18 @@ class DataStore:
|
|
|
297
343
|
dfs.append(df_module.read_csv(*updated_args, **kwargs))
|
|
298
344
|
return df_module.concat(dfs)
|
|
299
345
|
|
|
300
|
-
elif (
|
|
301
|
-
file_url.endswith(".parquet")
|
|
302
|
-
or file_url.endswith(".pq")
|
|
303
|
-
or format == "parquet"
|
|
304
|
-
):
|
|
346
|
+
elif mlrun.utils.helpers.is_parquet_file(file_url, format):
|
|
305
347
|
if columns:
|
|
306
348
|
kwargs["columns"] = columns
|
|
307
349
|
|
|
308
350
|
reader = self._parquet_reader(
|
|
309
|
-
df_module,
|
|
351
|
+
df_module,
|
|
352
|
+
url,
|
|
353
|
+
file_system,
|
|
354
|
+
time_column,
|
|
355
|
+
start_time,
|
|
356
|
+
end_time,
|
|
357
|
+
additional_filters,
|
|
310
358
|
)
|
|
311
359
|
|
|
312
360
|
elif file_url.endswith(".json") or format == "json":
|
|
@@ -317,31 +365,17 @@ class DataStore:
|
|
|
317
365
|
raise Exception(f"File type unhandled {url}")
|
|
318
366
|
|
|
319
367
|
if file_system:
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
# Pass the underlying file system
|
|
332
|
-
kwargs["filesystem"] = file_system
|
|
333
|
-
elif storage_options:
|
|
334
|
-
kwargs["storage_options"] = storage_options
|
|
335
|
-
df = reader(url, **kwargs)
|
|
336
|
-
else:
|
|
337
|
-
file = url
|
|
338
|
-
# Workaround for ARROW-12472 affecting pyarrow 3.x and 4.x.
|
|
339
|
-
if file_system.protocol != "file":
|
|
340
|
-
# If not dir, use file_system.open() to avoid regression when pandas < 1.2 and does not
|
|
341
|
-
# support the storage_options parameter.
|
|
342
|
-
file = file_system.open(url)
|
|
343
|
-
|
|
344
|
-
df = reader(file, **kwargs)
|
|
368
|
+
storage_options = self.get_storage_options()
|
|
369
|
+
if url.startswith("ds://"):
|
|
370
|
+
parsed_url = urllib.parse.urlparse(url)
|
|
371
|
+
url = parsed_url.path
|
|
372
|
+
if self.using_bucket:
|
|
373
|
+
url = url[1:]
|
|
374
|
+
# Pass the underlying file system
|
|
375
|
+
kwargs["filesystem"] = file_system
|
|
376
|
+
elif storage_options:
|
|
377
|
+
kwargs["storage_options"] = storage_options
|
|
378
|
+
df = reader(url, **kwargs)
|
|
345
379
|
else:
|
|
346
380
|
temp_file = tempfile.NamedTemporaryFile(delete=False)
|
|
347
381
|
self.download(self._join(subpath), temp_file.name)
|
|
@@ -372,7 +406,10 @@ class DataStore:
|
|
|
372
406
|
}
|
|
373
407
|
|
|
374
408
|
def rm(self, path, recursive=False, maxdepth=None):
|
|
375
|
-
|
|
409
|
+
try:
|
|
410
|
+
self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
|
|
411
|
+
except FileNotFoundError:
|
|
412
|
+
pass
|
|
376
413
|
|
|
377
414
|
@staticmethod
|
|
378
415
|
def _is_dd(df_module):
|
|
@@ -399,14 +436,15 @@ class DataItem:
|
|
|
399
436
|
|
|
400
437
|
|
|
401
438
|
# reading run results using DataItem (run.artifact())
|
|
402
|
-
train_run = train_iris_func.run(
|
|
403
|
-
|
|
439
|
+
train_run = train_iris_func.run(
|
|
440
|
+
inputs={"dataset": dataset}, params={"label_column": "label"}
|
|
441
|
+
)
|
|
404
442
|
|
|
405
|
-
train_run.artifact(
|
|
406
|
-
test_set = train_run.artifact(
|
|
443
|
+
train_run.artifact("confusion-matrix").show()
|
|
444
|
+
test_set = train_run.artifact("test_set").as_df()
|
|
407
445
|
|
|
408
446
|
# create and use DataItem from uri
|
|
409
|
-
data = mlrun.get_dataitem(
|
|
447
|
+
data = mlrun.get_dataitem("http://xyz/data.json").get()
|
|
410
448
|
"""
|
|
411
449
|
|
|
412
450
|
def __init__(
|
|
@@ -548,6 +586,7 @@ class DataItem:
|
|
|
548
586
|
time_column=None,
|
|
549
587
|
start_time=None,
|
|
550
588
|
end_time=None,
|
|
589
|
+
additional_filters=None,
|
|
551
590
|
**kwargs,
|
|
552
591
|
):
|
|
553
592
|
"""return a dataframe object (generated from the dataitem).
|
|
@@ -559,6 +598,12 @@ class DataItem:
|
|
|
559
598
|
:param end_time: filters out data after this time
|
|
560
599
|
:param time_column: Store timestamp_key will be used if None.
|
|
561
600
|
The results will be filtered by this column and start_time & end_time.
|
|
601
|
+
:param additional_filters: List of additional_filter conditions as tuples.
|
|
602
|
+
Each tuple should be in the format (column_name, operator, value).
|
|
603
|
+
Supported operators: "=", ">=", "<=", ">", "<".
|
|
604
|
+
Example: [("Product", "=", "Computer")]
|
|
605
|
+
For all supported filters, please see:
|
|
606
|
+
https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
|
|
562
607
|
"""
|
|
563
608
|
df = self._store.as_df(
|
|
564
609
|
self._url,
|
|
@@ -569,18 +614,19 @@ class DataItem:
|
|
|
569
614
|
time_column=time_column,
|
|
570
615
|
start_time=start_time,
|
|
571
616
|
end_time=end_time,
|
|
617
|
+
additional_filters=additional_filters,
|
|
572
618
|
**kwargs,
|
|
573
619
|
)
|
|
574
620
|
return df
|
|
575
621
|
|
|
576
|
-
def show(self, format=None):
|
|
622
|
+
def show(self, format: Optional[str] = None) -> None:
|
|
577
623
|
"""show the data object content in Jupyter
|
|
578
624
|
|
|
579
625
|
:param format: format to use (when there is no/wrong suffix), e.g. 'png'
|
|
580
626
|
"""
|
|
581
|
-
if not
|
|
627
|
+
if not is_jupyter:
|
|
582
628
|
logger.warning(
|
|
583
|
-
"Jupyter
|
|
629
|
+
"Jupyter was not detected. `.show()` displays only inside Jupyter."
|
|
584
630
|
)
|
|
585
631
|
return
|
|
586
632
|
|
|
@@ -698,8 +744,6 @@ class HttpStore(DataStore):
|
|
|
698
744
|
|
|
699
745
|
verify_ssl = mlconf.httpdb.http.verify
|
|
700
746
|
try:
|
|
701
|
-
if not verify_ssl:
|
|
702
|
-
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
|
703
747
|
response = requests.get(url, headers=headers, auth=auth, verify=verify_ssl)
|
|
704
748
|
except OSError as exc:
|
|
705
749
|
raise OSError(f"error: cannot connect to {url}: {err_to_str(exc)}")
|
|
@@ -713,7 +757,7 @@ class HttpStore(DataStore):
|
|
|
713
757
|
# As an example, it converts an S3 URL 's3://s3bucket/path' to just 's3bucket/path'.
|
|
714
758
|
# Since 'ds' schemas are not inherently processed by fsspec, we have adapted the _strip_protocol()
|
|
715
759
|
# method specifically to strip away the 'ds' schema as required.
|
|
716
|
-
def
|
|
760
|
+
def make_datastore_schema_sanitizer(cls, using_bucket=False, *args, **kwargs):
|
|
717
761
|
if not issubclass(cls, fsspec.AbstractFileSystem):
|
|
718
762
|
raise ValueError("Class must be a subclass of fsspec.AbstractFileSystem")
|
|
719
763
|
|
mlrun/datastore/datastore.py
CHANGED
|
@@ -21,7 +21,7 @@ from mlrun.datastore.datastore_profile import datastore_profile_read
|
|
|
21
21
|
from mlrun.errors import err_to_str
|
|
22
22
|
from mlrun.utils.helpers import get_local_file_schema
|
|
23
23
|
|
|
24
|
-
from ..utils import DB_SCHEMA,
|
|
24
|
+
from ..utils import DB_SCHEMA, RunKeys
|
|
25
25
|
from .base import DataItem, DataStore, HttpStore
|
|
26
26
|
from .filestore import FileStore
|
|
27
27
|
from .inmem import InMemoryStore
|
|
@@ -32,6 +32,8 @@ in_memory_store = InMemoryStore()
|
|
|
32
32
|
|
|
33
33
|
|
|
34
34
|
def parse_url(url):
|
|
35
|
+
if url and url.startswith("v3io://") and not url.startswith("v3io:///"):
|
|
36
|
+
url = url.replace("v3io://", "v3io:///", 1)
|
|
35
37
|
parsed_url = urlparse(url)
|
|
36
38
|
schema = parsed_url.scheme.lower()
|
|
37
39
|
endpoint = parsed_url.hostname
|
|
@@ -94,6 +96,14 @@ def schema_to_store(schema):
|
|
|
94
96
|
from .dbfs_store import DBFSStore
|
|
95
97
|
|
|
96
98
|
return DBFSStore
|
|
99
|
+
elif schema in ["hdfs", "webhdfs"]:
|
|
100
|
+
from .hdfs import HdfsStore
|
|
101
|
+
|
|
102
|
+
return HdfsStore
|
|
103
|
+
elif schema == "oss":
|
|
104
|
+
from .alibaba_oss import OSSStore
|
|
105
|
+
|
|
106
|
+
return OSSStore
|
|
97
107
|
else:
|
|
98
108
|
raise ValueError(f"unsupported store scheme ({schema})")
|
|
99
109
|
|
|
@@ -125,7 +135,7 @@ class StoreManager:
|
|
|
125
135
|
return self._db
|
|
126
136
|
|
|
127
137
|
def from_dict(self, struct: dict):
|
|
128
|
-
stor_list = struct.get(
|
|
138
|
+
stor_list = struct.get(RunKeys.data_stores)
|
|
129
139
|
if stor_list and isinstance(stor_list, list):
|
|
130
140
|
for stor in stor_list:
|
|
131
141
|
schema, endpoint, parsed_url = parse_url(stor.get("url"))
|
|
@@ -137,7 +147,7 @@ class StoreManager:
|
|
|
137
147
|
self._stores[stor["name"]] = new_stor
|
|
138
148
|
|
|
139
149
|
def to_dict(self, struct):
|
|
140
|
-
struct[
|
|
150
|
+
struct[RunKeys.data_stores] = [
|
|
141
151
|
stor.to_dict() for stor in self._stores.values() if stor.from_spec
|
|
142
152
|
]
|
|
143
153
|
|
|
@@ -170,7 +180,7 @@ class StoreManager:
|
|
|
170
180
|
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
171
181
|
f"resource {url} does not have a valid/persistent offline target"
|
|
172
182
|
)
|
|
173
|
-
return resource, target
|
|
183
|
+
return resource, target or ""
|
|
174
184
|
|
|
175
185
|
def object(
|
|
176
186
|
self, url, key="", project="", allow_empty_resources=None, secrets: dict = None
|
|
@@ -182,17 +192,24 @@ class StoreManager:
|
|
|
182
192
|
url, project, allow_empty_resources, secrets
|
|
183
193
|
)
|
|
184
194
|
|
|
185
|
-
store, subpath = self.get_or_create_store(
|
|
195
|
+
store, subpath, url = self.get_or_create_store(
|
|
186
196
|
url, secrets=secrets, project_name=project
|
|
187
197
|
)
|
|
188
|
-
return DataItem(
|
|
198
|
+
return DataItem(
|
|
199
|
+
key,
|
|
200
|
+
store,
|
|
201
|
+
subpath,
|
|
202
|
+
url,
|
|
203
|
+
meta=meta,
|
|
204
|
+
artifact_url=artifact_url,
|
|
205
|
+
)
|
|
189
206
|
|
|
190
207
|
def get_or_create_store(
|
|
191
208
|
self, url, secrets: dict = None, project_name=""
|
|
192
|
-
) -> (DataStore, str):
|
|
209
|
+
) -> (DataStore, str, str):
|
|
193
210
|
schema, endpoint, parsed_url = parse_url(url)
|
|
194
211
|
subpath = parsed_url.path
|
|
195
|
-
store_key = f"{schema}://{endpoint}"
|
|
212
|
+
store_key = f"{schema}://{endpoint}" if endpoint else f"{schema}://"
|
|
196
213
|
|
|
197
214
|
if schema == "ds":
|
|
198
215
|
datastore_profile = datastore_profile_read(url, project_name, secrets)
|
|
@@ -206,17 +223,22 @@ class StoreManager:
|
|
|
206
223
|
|
|
207
224
|
if schema == "memory":
|
|
208
225
|
subpath = url[len("memory://") :]
|
|
209
|
-
return in_memory_store, subpath
|
|
226
|
+
return in_memory_store, subpath, url
|
|
227
|
+
|
|
228
|
+
elif schema in get_local_file_schema():
|
|
229
|
+
# parse_url() will drop the windows drive-letter from the path for url like "c:\a\b".
|
|
230
|
+
# As a workaround, we set subpath to the url.
|
|
231
|
+
subpath = url.replace("file://", "", 1)
|
|
210
232
|
|
|
211
233
|
if not schema and endpoint:
|
|
212
234
|
if endpoint in self._stores.keys():
|
|
213
|
-
return self._stores[endpoint], subpath
|
|
235
|
+
return self._stores[endpoint], subpath, url
|
|
214
236
|
else:
|
|
215
237
|
raise ValueError(f"no such store ({endpoint})")
|
|
216
238
|
|
|
217
239
|
if not secrets and not mlrun.config.is_running_as_api():
|
|
218
240
|
if store_key in self._stores.keys():
|
|
219
|
-
return self._stores[store_key], subpath
|
|
241
|
+
return self._stores[store_key], subpath, url
|
|
220
242
|
|
|
221
243
|
# support u/p embedding in url (as done in redis) by setting netloc as the "endpoint" parameter
|
|
222
244
|
# when running on server we don't cache the datastore, because there are multiple users and we don't want to
|
|
@@ -226,8 +248,7 @@ class StoreManager:
|
|
|
226
248
|
)
|
|
227
249
|
if not secrets and not mlrun.config.is_running_as_api():
|
|
228
250
|
self._stores[store_key] = store
|
|
229
|
-
|
|
230
|
-
return store, url if store.kind == "file" else subpath
|
|
251
|
+
return store, subpath, url
|
|
231
252
|
|
|
232
253
|
def reset_secrets(self):
|
|
233
254
|
self._secrets = {}
|