mlrun 1.7.0rc4__py3-none-any.whl → 1.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mlrun has been flagged as potentially problematic.
- mlrun/__init__.py +11 -1
- mlrun/__main__.py +39 -121
- mlrun/{datastore/helpers.py → alerts/__init__.py} +2 -5
- mlrun/alerts/alert.py +248 -0
- mlrun/api/schemas/__init__.py +4 -3
- mlrun/artifacts/__init__.py +8 -3
- mlrun/artifacts/base.py +39 -254
- mlrun/artifacts/dataset.py +9 -190
- mlrun/artifacts/manager.py +73 -46
- mlrun/artifacts/model.py +30 -158
- mlrun/artifacts/plots.py +23 -380
- mlrun/common/constants.py +73 -1
- mlrun/common/db/sql_session.py +3 -2
- mlrun/common/formatters/__init__.py +21 -0
- mlrun/common/formatters/artifact.py +46 -0
- mlrun/common/formatters/base.py +113 -0
- mlrun/common/formatters/feature_set.py +44 -0
- mlrun/common/formatters/function.py +46 -0
- mlrun/common/formatters/pipeline.py +53 -0
- mlrun/common/formatters/project.py +51 -0
- mlrun/common/formatters/run.py +29 -0
- mlrun/common/helpers.py +11 -1
- mlrun/{runtimes → common/runtimes}/constants.py +32 -4
- mlrun/common/schemas/__init__.py +31 -4
- mlrun/common/schemas/alert.py +202 -0
- mlrun/common/schemas/api_gateway.py +196 -0
- mlrun/common/schemas/artifact.py +28 -1
- mlrun/common/schemas/auth.py +13 -2
- mlrun/common/schemas/client_spec.py +2 -1
- mlrun/common/schemas/common.py +7 -4
- mlrun/common/schemas/constants.py +3 -0
- mlrun/common/schemas/feature_store.py +58 -28
- mlrun/common/schemas/frontend_spec.py +8 -0
- mlrun/common/schemas/function.py +11 -0
- mlrun/common/schemas/hub.py +7 -9
- mlrun/common/schemas/model_monitoring/__init__.py +21 -4
- mlrun/common/schemas/model_monitoring/constants.py +136 -42
- mlrun/common/schemas/model_monitoring/grafana.py +9 -5
- mlrun/common/schemas/model_monitoring/model_endpoints.py +89 -41
- mlrun/common/schemas/notification.py +69 -12
- mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
- mlrun/common/schemas/pipeline.py +7 -0
- mlrun/common/schemas/project.py +67 -16
- mlrun/common/schemas/runs.py +17 -0
- mlrun/common/schemas/schedule.py +1 -1
- mlrun/common/schemas/workflow.py +10 -2
- mlrun/common/types.py +14 -1
- mlrun/config.py +233 -58
- mlrun/data_types/data_types.py +11 -1
- mlrun/data_types/spark.py +5 -4
- mlrun/data_types/to_pandas.py +75 -34
- mlrun/datastore/__init__.py +8 -10
- mlrun/datastore/alibaba_oss.py +131 -0
- mlrun/datastore/azure_blob.py +131 -43
- mlrun/datastore/base.py +107 -47
- mlrun/datastore/datastore.py +17 -7
- mlrun/datastore/datastore_profile.py +91 -7
- mlrun/datastore/dbfs_store.py +3 -7
- mlrun/datastore/filestore.py +1 -3
- mlrun/datastore/google_cloud_storage.py +92 -32
- mlrun/datastore/hdfs.py +5 -0
- mlrun/datastore/inmem.py +6 -3
- mlrun/datastore/redis.py +3 -2
- mlrun/datastore/s3.py +30 -12
- mlrun/datastore/snowflake_utils.py +45 -0
- mlrun/datastore/sources.py +274 -59
- mlrun/datastore/spark_utils.py +30 -0
- mlrun/datastore/store_resources.py +9 -7
- mlrun/datastore/storeytargets.py +151 -0
- mlrun/datastore/targets.py +387 -119
- mlrun/datastore/utils.py +68 -5
- mlrun/datastore/v3io.py +28 -50
- mlrun/db/auth_utils.py +152 -0
- mlrun/db/base.py +245 -20
- mlrun/db/factory.py +1 -4
- mlrun/db/httpdb.py +909 -231
- mlrun/db/nopdb.py +279 -14
- mlrun/errors.py +35 -5
- mlrun/execution.py +111 -38
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +46 -53
- mlrun/feature_store/common.py +6 -11
- mlrun/feature_store/feature_set.py +48 -23
- mlrun/feature_store/feature_vector.py +13 -2
- mlrun/feature_store/ingestion.py +7 -6
- mlrun/feature_store/retrieval/base.py +9 -4
- mlrun/feature_store/retrieval/dask_merger.py +2 -0
- mlrun/feature_store/retrieval/job.py +13 -4
- mlrun/feature_store/retrieval/local_merger.py +2 -0
- mlrun/feature_store/retrieval/spark_merger.py +24 -32
- mlrun/feature_store/steps.py +38 -19
- mlrun/features.py +6 -14
- mlrun/frameworks/_common/plan.py +3 -3
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +7 -12
- mlrun/frameworks/_ml_common/plan.py +1 -1
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +2 -2
- mlrun/frameworks/lgbm/__init__.py +1 -1
- mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
- mlrun/frameworks/lgbm/model_handler.py +1 -1
- mlrun/frameworks/parallel_coordinates.py +4 -4
- mlrun/frameworks/pytorch/__init__.py +2 -2
- mlrun/frameworks/sklearn/__init__.py +1 -1
- mlrun/frameworks/sklearn/mlrun_interface.py +13 -3
- mlrun/frameworks/tf_keras/__init__.py +5 -2
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +1 -1
- mlrun/frameworks/tf_keras/mlrun_interface.py +2 -2
- mlrun/frameworks/xgboost/__init__.py +1 -1
- mlrun/k8s_utils.py +57 -12
- mlrun/launcher/__init__.py +1 -1
- mlrun/launcher/base.py +6 -5
- mlrun/launcher/client.py +13 -11
- mlrun/launcher/factory.py +1 -1
- mlrun/launcher/local.py +15 -5
- mlrun/launcher/remote.py +10 -3
- mlrun/lists.py +6 -2
- mlrun/model.py +297 -48
- mlrun/model_monitoring/__init__.py +1 -1
- mlrun/model_monitoring/api.py +152 -357
- mlrun/model_monitoring/applications/__init__.py +10 -0
- mlrun/model_monitoring/applications/_application_steps.py +190 -0
- mlrun/model_monitoring/applications/base.py +108 -0
- mlrun/model_monitoring/applications/context.py +341 -0
- mlrun/model_monitoring/{evidently_application.py → applications/evidently_base.py} +27 -22
- mlrun/model_monitoring/applications/histogram_data_drift.py +227 -91
- mlrun/model_monitoring/applications/results.py +99 -0
- mlrun/model_monitoring/controller.py +130 -303
- mlrun/model_monitoring/{stores/models/sqlite.py → db/__init__.py} +5 -10
- mlrun/model_monitoring/db/stores/__init__.py +136 -0
- mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
- mlrun/model_monitoring/db/stores/base/store.py +213 -0
- mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +190 -0
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +103 -0
- mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +659 -0
- mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +726 -0
- mlrun/model_monitoring/db/tsdb/__init__.py +105 -0
- mlrun/model_monitoring/db/tsdb/base.py +448 -0
- mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
- mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +298 -0
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +42 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +522 -0
- mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +158 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +849 -0
- mlrun/model_monitoring/features_drift_table.py +34 -22
- mlrun/model_monitoring/helpers.py +177 -39
- mlrun/model_monitoring/model_endpoint.py +3 -2
- mlrun/model_monitoring/stream_processing.py +165 -398
- mlrun/model_monitoring/tracking_policy.py +7 -1
- mlrun/model_monitoring/writer.py +161 -125
- mlrun/package/packagers/default_packager.py +2 -2
- mlrun/package/packagers_manager.py +1 -0
- mlrun/package/utils/_formatter.py +2 -2
- mlrun/platforms/__init__.py +11 -10
- mlrun/platforms/iguazio.py +67 -228
- mlrun/projects/__init__.py +6 -1
- mlrun/projects/operations.py +47 -20
- mlrun/projects/pipelines.py +396 -249
- mlrun/projects/project.py +1176 -406
- mlrun/render.py +28 -22
- mlrun/run.py +208 -181
- mlrun/runtimes/__init__.py +76 -11
- mlrun/runtimes/base.py +54 -24
- mlrun/runtimes/daskjob.py +9 -2
- mlrun/runtimes/databricks_job/databricks_runtime.py +1 -0
- mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
- mlrun/runtimes/funcdoc.py +1 -29
- mlrun/runtimes/kubejob.py +34 -128
- mlrun/runtimes/local.py +39 -10
- mlrun/runtimes/mpijob/__init__.py +0 -20
- mlrun/runtimes/mpijob/abstract.py +8 -8
- mlrun/runtimes/mpijob/v1.py +1 -1
- mlrun/runtimes/nuclio/__init__.py +1 -0
- mlrun/runtimes/nuclio/api_gateway.py +769 -0
- mlrun/runtimes/nuclio/application/__init__.py +15 -0
- mlrun/runtimes/nuclio/application/application.py +758 -0
- mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
- mlrun/runtimes/nuclio/function.py +188 -68
- mlrun/runtimes/nuclio/serving.py +57 -60
- mlrun/runtimes/pod.py +191 -58
- mlrun/runtimes/remotesparkjob.py +11 -8
- mlrun/runtimes/sparkjob/spark3job.py +17 -18
- mlrun/runtimes/utils.py +40 -73
- mlrun/secrets.py +6 -2
- mlrun/serving/__init__.py +8 -1
- mlrun/serving/remote.py +2 -3
- mlrun/serving/routers.py +89 -64
- mlrun/serving/server.py +54 -26
- mlrun/serving/states.py +187 -56
- mlrun/serving/utils.py +19 -11
- mlrun/serving/v2_serving.py +136 -63
- mlrun/track/tracker.py +2 -1
- mlrun/track/trackers/mlflow_tracker.py +5 -0
- mlrun/utils/async_http.py +26 -6
- mlrun/utils/db.py +18 -0
- mlrun/utils/helpers.py +375 -105
- mlrun/utils/http.py +2 -2
- mlrun/utils/logger.py +75 -9
- mlrun/utils/notifications/notification/__init__.py +14 -10
- mlrun/utils/notifications/notification/base.py +48 -0
- mlrun/utils/notifications/notification/console.py +2 -0
- mlrun/utils/notifications/notification/git.py +24 -1
- mlrun/utils/notifications/notification/ipython.py +2 -0
- mlrun/utils/notifications/notification/slack.py +96 -21
- mlrun/utils/notifications/notification/webhook.py +63 -2
- mlrun/utils/notifications/notification_pusher.py +146 -16
- mlrun/utils/regex.py +9 -0
- mlrun/utils/retryer.py +3 -2
- mlrun/utils/v3io_clients.py +2 -3
- mlrun/utils/version/version.json +2 -2
- mlrun-1.7.2.dist-info/METADATA +390 -0
- mlrun-1.7.2.dist-info/RECORD +351 -0
- {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/WHEEL +1 -1
- mlrun/feature_store/retrieval/conversion.py +0 -271
- mlrun/kfpops.py +0 -868
- mlrun/model_monitoring/application.py +0 -310
- mlrun/model_monitoring/batch.py +0 -974
- mlrun/model_monitoring/controller_handler.py +0 -37
- mlrun/model_monitoring/prometheus.py +0 -216
- mlrun/model_monitoring/stores/__init__.py +0 -111
- mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -574
- mlrun/model_monitoring/stores/model_endpoint_store.py +0 -145
- mlrun/model_monitoring/stores/models/__init__.py +0 -27
- mlrun/model_monitoring/stores/models/base.py +0 -84
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -382
- mlrun/platforms/other.py +0 -305
- mlrun-1.7.0rc4.dist-info/METADATA +0 -269
- mlrun-1.7.0rc4.dist-info/RECORD +0 -321
- {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/top_level.txt +0 -0
mlrun/data_types/to_pandas.py
CHANGED

@@ -15,23 +15,13 @@
 import warnings
 from collections import Counter

-from pyspark.sql.types import (
-    BooleanType,
-    ByteType,
-    DoubleType,
-    FloatType,
-    IntegerType,
-    IntegralType,
-    LongType,
-    MapType,
-    ShortType,
-    TimestampType,
-)
-
-
-def toPandas(spark_df):
+import pandas as pd
+import semver
+
+
+def _to_pandas(spark_df):
     """
-    Modified version of spark DataFrame.toPandas()
+    Modified version of spark DataFrame.toPandas() -
     https://github.com/apache/spark/blob/v3.2.3/python/pyspark/sql/pandas/conversion.py#L35

     The original code (which is only replaced in pyspark 3.5.0) fails with Pandas 2 installed, with the following error:
@@ -40,6 +30,12 @@ def toPandas(spark_df):
     This modification adds the missing unit to the dtype.
     """
     from pyspark.sql.dataframe import DataFrame
+    from pyspark.sql.types import (
+        BooleanType,
+        IntegralType,
+        MapType,
+        TimestampType,
+    )

     assert isinstance(spark_df, DataFrame)

@@ -48,7 +44,6 @@ def toPandas(spark_df):
     require_minimum_pandas_version()

     import numpy as np
-    import pandas as pd

     timezone = spark_df.sql_ctx._conf.sessionLocalTimeZone()

@@ -65,10 +60,10 @@ def toPandas(spark_df):
                 msg = (
                     "toPandas attempted Arrow optimization because "
                     "'spark.sql.execution.arrow.pyspark.enabled' is set to true; however, "
-                    "failed by the reason below:\n
+                    f"failed by the reason below:\n  {e}\n"
                     "Attempting non-optimization as "
                     "'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to "
-                    "true."
+                    "true."
                 )
                 warnings.warn(msg)
                 use_arrow = False
@@ -78,7 +73,7 @@ def toPandas(spark_df):
                     "'spark.sql.execution.arrow.pyspark.enabled' is set to true, but has "
                     "reached the error below and will not continue because automatic fallback "
                     "with 'spark.sql.execution.arrow.pyspark.fallback.enabled' has been set to "
-                    "false.\n
+                    f"false.\n  {e}"
                 )
                 warnings.warn(msg)
                 raise
@@ -144,7 +139,7 @@ def toPandas(spark_df):
                     "reached the error below and can not continue. Note that "
                     "'spark.sql.execution.arrow.pyspark.fallback.enabled' does not have an "
                     "effect on failures in the middle of "
-                    "computation.\n
+                    f"computation.\n  {e}"
                 )
                 warnings.warn(msg)
                 raise
@@ -154,10 +149,10 @@ def toPandas(spark_df):
     column_counter = Counter(spark_df.columns)

     dtype = [None] * len(spark_df.schema)
-    for
+    for field_idx, field in enumerate(spark_df.schema):
         # For duplicate column name, we use `iloc` to access it.
         if column_counter[field.name] > 1:
-            pandas_col = pdf.iloc[:,
+            pandas_col = pdf.iloc[:, field_idx]
         else:
             pandas_col = pdf[field.name]

@@ -171,12 +166,12 @@ def toPandas(spark_df):
             and field.nullable
             and pandas_col.isnull().any()
         ):
-            dtype[
+            dtype[field_idx] = pandas_type
         # Ensure we fall back to nullable numpy types, even when whole column is null:
         if isinstance(field.dataType, IntegralType) and pandas_col.isnull().any():
-            dtype[
+            dtype[field_idx] = np.float64
         if isinstance(field.dataType, BooleanType) and pandas_col.isnull().any():
-            dtype[
+            dtype[field_idx] = object

     df = pd.DataFrame()
     for index, t in enumerate(dtype):
@@ -217,22 +212,68 @@ def toPandas(spark_df):

 def _to_corrected_pandas_type(dt):
     import numpy as np
+    from pyspark.sql.types import (
+        BooleanType,
+        ByteType,
+        DoubleType,
+        FloatType,
+        IntegerType,
+        LongType,
+        ShortType,
+        TimestampType,
+    )

-    if
+    if isinstance(dt, ByteType):
         return np.int8
-    elif
+    elif isinstance(dt, ShortType):
         return np.int16
-    elif
+    elif isinstance(dt, IntegerType):
         return np.int32
-    elif
+    elif isinstance(dt, LongType):
         return np.int64
-    elif
+    elif isinstance(dt, FloatType):
         return np.float32
-    elif
+    elif isinstance(dt, DoubleType):
         return np.float64
-    elif
+    elif isinstance(dt, BooleanType):
         return bool
-    elif
+    elif isinstance(dt, TimestampType):
         return "datetime64[ns]"
     else:
         return None
+
+
+def spark_df_to_pandas(spark_df):
+    import pyspark
+
+    if semver.parse(pyspark.__version__) >= semver.Version(3, 5, 0):
+
+        def to_pandas(spark_df_inner):
+            return spark_df_inner.toPandas()
+    else:
+        to_pandas = _to_pandas
+
+    # as of pyspark 3.2.3, toPandas fails to convert timestamps unless we work around the issue
+    # when we upgrade pyspark, we should check whether this workaround is still necessary
+    # see https://stackoverflow.com/questions/76389694/transforming-pyspark-to-pandas-dataframe
+    if semver.parse(pd.__version__)["major"] >= 2:
+        import pyspark.sql.functions as pyspark_functions
+
+        type_conversion_dict = {}
+        for field in spark_df.schema.fields:
+            if str(field.dataType) == "TimestampType":
+                spark_df = spark_df.withColumn(
+                    field.name,
+                    pyspark_functions.date_format(
+                        pyspark_functions.to_timestamp(field.name),
+                        "yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSS",
+                    ),
+                )
+                type_conversion_dict[field.name] = "datetime64[ns]"
+
+        df = to_pandas(spark_df)
+        if type_conversion_dict:
+            df = df.astype(type_conversion_dict)
+        return df
+    else:
+        return to_pandas(spark_df)
mlrun/datastore/__init__.py
CHANGED

@@ -64,7 +64,7 @@ from .store_resources import (
     parse_store_uri,
 )
 from .targets import CSVTarget, NoSqlTarget, ParquetTarget, StreamTarget
-from .utils import parse_kafka_url
+from .utils import get_kafka_brokers_from_dict, parse_kafka_url

 store_manager = StoreManager()

@@ -107,19 +107,17 @@ def get_stream_pusher(stream_path: str, **kwargs):
     :param stream_path: path/url of stream
     """

-
-
-
-        )
-        return KafkaOutputStream(
-            topic, bootstrap_servers, kwargs.get("kafka_producer_options")
-        )
+    kafka_brokers = get_kafka_brokers_from_dict(kwargs)
+    if stream_path.startswith("kafka://") or kafka_brokers:
+        topic, brokers = parse_kafka_url(stream_path, kafka_brokers)
+        return KafkaOutputStream(topic, brokers, kwargs.get("kafka_producer_options"))
     elif stream_path.startswith("http://") or stream_path.startswith("https://"):
         return HTTPOutputStream(stream_path=stream_path)
     elif "://" not in stream_path:
         return OutputStream(stream_path, **kwargs)
     elif stream_path.startswith("v3io"):
         endpoint, stream_path = parse_path(stream_path)
+        endpoint = kwargs.pop("endpoint", None) or endpoint
         return OutputStream(stream_path, endpoint=endpoint, **kwargs)
     elif stream_path.startswith("dummy://"):
         return _DummyStream(**kwargs)
@@ -133,9 +131,9 @@ class _DummyStream:
     def __init__(self, event_list=None, **kwargs):
         self.event_list = event_list or []

-    def push(self, data):
+    def push(self, data, **kwargs):
         if not isinstance(data, list):
             data = [data]
         for item in data:
-            logger.info(f"dummy stream got event: {item}")
+            logger.info(f"dummy stream got event: {item}, kwargs={kwargs}")
             self.event_list.append(item)
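
get_stream_pusher now resolves Kafka brokers either from the kafka:// URL or from the caller's kwargs via get_kafka_brokers_from_dict, and the in-memory dummy:// stream now accepts extra push() kwargs. A small sketch of driving it with the dummy scheme (safe for tests); the kafka:// form in the trailing comment would open a real KafkaOutputStream, and its broker, topic, and producer options are placeholders:

import mlrun.datastore

dummy = mlrun.datastore.get_stream_pusher("dummy://")
dummy.push({"id": 1, "value": 0.5})       # events are collected in dummy.event_list
dummy.push({"id": 2}, partition_key="2")  # extra kwargs are now accepted and logged
print(dummy.event_list)

# Kafka form (placeholders, requires a reachable broker):
# mlrun.datastore.get_stream_pusher(
#     "kafka://my-broker:9092/my-topic",
#     kafka_producer_options={"acks": "all"},
# )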
mlrun/datastore/alibaba_oss.py
ADDED

@@ -0,0 +1,131 @@
+# Copyright 2023 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+from datetime import datetime
+from pathlib import Path
+from urllib.parse import urlparse
+
+import oss2
+from fsspec.registry import get_filesystem_class
+
+import mlrun.errors
+
+from .base import DataStore, FileStats, make_datastore_schema_sanitizer
+
+
+class OSSStore(DataStore):
+    using_bucket = True
+
+    def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
+        super().__init__(parent, name, schema, endpoint, secrets)
+        # will be used in case user asks to assume a role and work through fsspec
+
+        access_key_id = self._get_secret_or_env("ALIBABA_ACCESS_KEY_ID")
+        secret_key = self._get_secret_or_env("ALIBABA_SECRET_ACCESS_KEY")
+        endpoint_url = self._get_secret_or_env("ALIBABA_ENDPOINT_URL")
+        if access_key_id and secret_key and endpoint_url:
+            self.auth = oss2.Auth(access_key_id, secret_key)
+            self.endpoint_url = endpoint_url
+        else:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "missing ALIBABA_ACCESS_KEY_ID or ALIBABA_SECRET_ACCESS_KEY ALIBABA_ENDPOINT_URL in environment"
+            )
+
+    @property
+    def filesystem(self):
+        """return fsspec file system object, if supported"""
+        if self._filesystem:
+            return self._filesystem
+        try:
+            import ossfs  # noqa
+        except ImportError as exc:
+            raise ImportError("ALIBABA ossfs not installed") from exc
+        filesystem_class = get_filesystem_class(protocol=self.kind)
+        self._filesystem = make_datastore_schema_sanitizer(
+            filesystem_class,
+            using_bucket=self.using_bucket,
+            **self.get_storage_options(),
+        )
+        return self._filesystem
+
+    def get_storage_options(self):
+        res = dict(
+            endpoint=self._get_secret_or_env("ALIBABA_ENDPOINT_URL"),
+            key=self._get_secret_or_env("ALIBABA_ACCESS_KEY_ID"),
+            secret=self._get_secret_or_env("ALIBABA_SECRET_ACCESS_KEY"),
+        )
+        return self._sanitize_storage_options(res)
+
+    def get_bucket_and_key(self, key):
+        path = self._join(key)[1:]
+        return self.endpoint, path
+
+    def upload(self, key, src_path):
+        bucket, key = self.get_bucket_and_key(key)
+        oss = oss2.Bucket(self.auth, self.endpoint_url, bucket)
+        oss.put_object(key, open(src_path, "rb"))
+
+    def get(self, key, size=None, offset=0):
+        bucket, key = self.get_bucket_and_key(key)
+        oss = oss2.Bucket(self.auth, self.endpoint_url, bucket)
+        if size or offset:
+            return oss.get_object(key, byte_range=self.get_range(size, offset)).read()
+        return oss.get_object(key).read()
+
+    def put(self, key, data, append=False):
+        data, _ = self._prepare_put_data(data, append)
+        bucket, key = self.get_bucket_and_key(key)
+        oss = oss2.Bucket(self.auth, self.endpoint_url, bucket)
+        oss.put_object(key, data)
+
+    def stat(self, key):
+        bucket, key = self.get_bucket_and_key(key)
+        oss = oss2.Bucket(self.auth, self.endpoint_url, bucket)
+        obj = oss.get_object_meta(key)
+        size = obj.content_length
+        modified = datetime.fromtimestamp(obj.last_modified)
+        return FileStats(size, time.mktime(modified.timetuple()))
+
+    def listdir(self, key):
+        remote_path = self._convert_key_to_remote_path(key)
+        if self.filesystem.isfile(remote_path):
+            return key
+        remote_path = f"{remote_path}/**"
+        files = self.filesystem.glob(remote_path)
+        key_length = len(key)
+        files = [
+            f.split("/", 1)[1][key_length:] for f in files if len(f.split("/")) > 1
+        ]
+        return files
+
+    def delete(self, key):
+        bucket, key = self.get_bucket_and_key(key)
+        oss = oss2.Bucket(self.auth, self.endpoint_url, bucket)
+        oss.delete_object(key)
+
+    def _convert_key_to_remote_path(self, key):
+        key = key.strip("/")
+        schema = urlparse(key).scheme
+        # if called without passing dataitem - like in fset.purge_targets,
+        # key will include schema.
+        if not schema:
+            key = Path(self.endpoint, key).as_posix()
+        return key
+
+    @staticmethod
+    def get_range(size, offset):
+        if size:
+            return [offset, size]
+        return [offset, None]
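
The new OSSStore reads its credentials from ALIBABA_ACCESS_KEY_ID, ALIBABA_SECRET_ACCESS_KEY, and ALIBABA_ENDPOINT_URL and talks to Alibaba Cloud OSS through oss2 (and ossfs for fsspec operations). A rough sketch of using it through mlrun's generic data-item API; the oss:// scheme wiring, bucket name, and object key are assumptions, and the credential values are placeholders:

import os

import mlrun

os.environ["ALIBABA_ACCESS_KEY_ID"] = "<access-key-id>"          # placeholder
os.environ["ALIBABA_SECRET_ACCESS_KEY"] = "<secret-access-key>"  # placeholder
os.environ["ALIBABA_ENDPOINT_URL"] = "https://oss-eu-central-1.aliyuncs.com"

item = mlrun.get_dataitem("oss://my-bucket/path/to/data.csv")  # hypothetical URL
item.put("a,b\n1,2\n")     # OSSStore.put -> oss2.Bucket.put_object
print(item.get())          # OSSStore.get -> oss2.Bucket.get_object
print(item.stat().size)    # OSSStore.stat -> oss2.Bucket.get_object_meta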
mlrun/datastore/azure_blob.py
CHANGED

@@ -16,12 +16,13 @@ import time
 from pathlib import Path
 from urllib.parse import urlparse

+from azure.storage.blob import BlobServiceClient
 from azure.storage.blob._shared.base_client import parse_connection_str
 from fsspec.registry import get_filesystem_class

 import mlrun.errors

-from .base import DataStore, FileStats, makeDatastoreSchemaSanitizer
+from .base import DataStore, FileStats, make_datastore_schema_sanitizer

 # Azure blobs will be represented with the following URL: az://<container name>. The storage account is already
 # pointed to by the connection string, so the user is not expected to specify it in any way.
@@ -29,47 +30,131 @@ from .base import DataStore, FileStats, makeDatastoreSchemaSanitizer

 class AzureBlobStore(DataStore):
     using_bucket = True
+    max_concurrency = 100
+    max_blocksize = 1024 * 1024 * 4
+    max_single_put_size = (
+        1024 * 1024 * 8
+    )  # for service_client property only, does not affect filesystem

     def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
         super().__init__(parent, name, schema, endpoint, secrets=secrets)
+        self._service_client = None
+        self._storage_options = None
+
+    def get_storage_options(self):
+        return self.storage_options
+
+    @property
+    def storage_options(self):
+        if not self._storage_options:
+            res = dict(
+                account_name=self._get_secret_or_env("account_name")
+                or self._get_secret_or_env("AZURE_STORAGE_ACCOUNT_NAME"),
+                account_key=self._get_secret_or_env("account_key")
+                or self._get_secret_or_env("AZURE_STORAGE_ACCOUNT_KEY"),
+                connection_string=self._get_secret_or_env("connection_string")
+                or self._get_secret_or_env("AZURE_STORAGE_CONNECTION_STRING"),
+                tenant_id=self._get_secret_or_env("tenant_id")
+                or self._get_secret_or_env("AZURE_STORAGE_TENANT_ID"),
+                client_id=self._get_secret_or_env("client_id")
+                or self._get_secret_or_env("AZURE_STORAGE_CLIENT_ID"),
+                client_secret=self._get_secret_or_env("client_secret")
+                or self._get_secret_or_env("AZURE_STORAGE_CLIENT_SECRET"),
+                sas_token=self._get_secret_or_env("sas_token")
+                or self._get_secret_or_env("AZURE_STORAGE_SAS_TOKEN"),
+                credential=self._get_secret_or_env("credential"),
+            )
+            self._storage_options = self._sanitize_storage_options(res)
+        return self._storage_options

     @property
     def filesystem(self):
         """return fsspec file system object, if supported"""
-        if self._filesystem:
-            return self._filesystem
         try:
             import adlfs  # noqa
         except ImportError as exc:
             raise ImportError("Azure adlfs not installed") from exc
-
-
-
-        filesystem_class
-
-
-
+
+        if not self._filesystem:
+            # in order to support az and wasbs kinds
+            filesystem_class = get_filesystem_class(protocol=self.kind)
+            self._filesystem = make_datastore_schema_sanitizer(
+                filesystem_class,
+                using_bucket=self.using_bucket,
+                blocksize=self.max_blocksize,
+                **self.storage_options,
+            )
         return self._filesystem

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    @property
+    def service_client(self):
+        try:
+            import azure  # noqa
+        except ImportError as exc:
+            raise ImportError("Azure not installed") from exc
+
+        if not self._service_client:
+            self._do_connect()
+        return self._service_client
+
+    def _do_connect(self):
+        """
+
+        Creates a client for azure.
+        Raises MLRunInvalidArgumentError if none of the connection details are available
+        based on do_connect in AzureBlobFileSystem:
+        https://github.com/fsspec/adlfs/blob/2023.9.0/adlfs/spec.py#L422
+        """
+        from azure.identity import ClientSecretCredential
+
+        storage_options = self.storage_options
+        connection_string = storage_options.get("connection_string")
+        client_name = storage_options.get("account_name")
+        account_key = storage_options.get("account_key")
+        sas_token = storage_options.get("sas_token")
+        client_id = storage_options.get("client_id")
+        credential = storage_options.get("credential")
+
+        credential_from_client_id = None
+        if (
+            credential is None
+            and account_key is None
+            and sas_token is None
+            and client_id is not None
+        ):
+            credential_from_client_id = ClientSecretCredential(
+                tenant_id=storage_options.get("tenant_id"),
+                client_id=client_id,
+                client_secret=storage_options.get("client_secret"),
+            )
+        try:
+            if connection_string is not None:
+                self._service_client = BlobServiceClient.from_connection_string(
+                    conn_str=connection_string,
+                    max_block_size=self.max_blocksize,
+                    max_single_put_size=self.max_single_put_size,
+                )
+            elif client_name is not None:
+                account_url = f"https://{client_name}.blob.core.windows.net"
+                cred = credential_from_client_id or credential or account_key
+                if not cred and sas_token is not None:
+                    if not sas_token.startswith("?"):
+                        sas_token = f"?{sas_token}"
+                    account_url = account_url + sas_token
+                self._service_client = BlobServiceClient(
+                    account_url=account_url,
+                    credential=cred,
+                    max_block_size=self.max_blocksize,
+                    max_single_put_size=self.max_single_put_size,
+                )
+            else:
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "Must provide either a connection_string or account_name with credentials"
+                )
+        except Exception as e:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                f"unable to connect to account for {e}"
+            )

     def _convert_key_to_remote_path(self, key):
         key = key.strip("/")
@@ -82,7 +167,15 @@ class AzureBlobStore(DataStore):

     def upload(self, key, src_path):
         remote_path = self._convert_key_to_remote_path(key)
-
+        container, remote_path = remote_path.split("/", 1)
+        container_client = self.service_client.get_container_client(container=container)
+        with open(file=src_path, mode="rb") as data:
+            container_client.upload_blob(
+                name=remote_path,
+                data=data,
+                overwrite=True,
+                max_concurrency=self.max_concurrency,
+            )

     def get(self, key, size=None, offset=0):
         remote_path = self._convert_key_to_remote_path(key)
@@ -96,12 +189,7 @@ class AzureBlobStore(DataStore):
                 "Append mode not supported for Azure blob datastore"
             )
         remote_path = self._convert_key_to_remote_path(key)
-
-            mode = "wb"
-        elif isinstance(data, str):
-            mode = "w"
-        else:
-            raise TypeError("Data type unknown. Unable to put in Azure!")
+        data, mode = self._prepare_put_data(data, append)
         with self.filesystem.open(remote_path, mode) as f:
             f.write(data)

@@ -135,7 +223,7 @@ class AzureBlobStore(DataStore):

     def get_spark_options(self):
         res = {}
-        st = self.
+        st = self.storage_options
         service = "blob"
         primary_url = None
         if st.get("connection_string"):
@@ -158,18 +246,17 @@ class AzureBlobStore(DataStore):
             st[key] = parsed_value

         account_name = st.get("account_name")
-        if not account_name:
-            raise mlrun.errors.MLRunInvalidArgumentError(
-                "Property 'account_name' is absent both in storage settings and connection string"
-            )
         if primary_url:
             if primary_url.startswith("http://"):
                 primary_url = primary_url[len("http://") :]
             if primary_url.startswith("https://"):
                 primary_url = primary_url[len("https://") :]
            host = primary_url
-
+        elif account_name:
            host = f"{account_name}.{service}.core.windows.net"
+        else:
+            return res
+
         if "account_key" in st:
             res[f"spark.hadoop.fs.azure.account.key.{host}"] = st["account_key"]

@@ -209,6 +296,7 @@ class AzureBlobStore(DataStore):
         for key in spark_options:
             if key.startswith(prefix):
                 account_key = key[len(prefix) :]
-
+                if not url.endswith(account_key):
+                    url += f"@{account_key}"
                 break
         return url