mlrun 1.7.0rc4__py3-none-any.whl → 1.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (235)
  1. mlrun/__init__.py +11 -1
  2. mlrun/__main__.py +39 -121
  3. mlrun/{datastore/helpers.py → alerts/__init__.py} +2 -5
  4. mlrun/alerts/alert.py +248 -0
  5. mlrun/api/schemas/__init__.py +4 -3
  6. mlrun/artifacts/__init__.py +8 -3
  7. mlrun/artifacts/base.py +39 -254
  8. mlrun/artifacts/dataset.py +9 -190
  9. mlrun/artifacts/manager.py +73 -46
  10. mlrun/artifacts/model.py +30 -158
  11. mlrun/artifacts/plots.py +23 -380
  12. mlrun/common/constants.py +73 -1
  13. mlrun/common/db/sql_session.py +3 -2
  14. mlrun/common/formatters/__init__.py +21 -0
  15. mlrun/common/formatters/artifact.py +46 -0
  16. mlrun/common/formatters/base.py +113 -0
  17. mlrun/common/formatters/feature_set.py +44 -0
  18. mlrun/common/formatters/function.py +46 -0
  19. mlrun/common/formatters/pipeline.py +53 -0
  20. mlrun/common/formatters/project.py +51 -0
  21. mlrun/common/formatters/run.py +29 -0
  22. mlrun/common/helpers.py +11 -1
  23. mlrun/{runtimes → common/runtimes}/constants.py +32 -4
  24. mlrun/common/schemas/__init__.py +31 -4
  25. mlrun/common/schemas/alert.py +202 -0
  26. mlrun/common/schemas/api_gateway.py +196 -0
  27. mlrun/common/schemas/artifact.py +28 -1
  28. mlrun/common/schemas/auth.py +13 -2
  29. mlrun/common/schemas/client_spec.py +2 -1
  30. mlrun/common/schemas/common.py +7 -4
  31. mlrun/common/schemas/constants.py +3 -0
  32. mlrun/common/schemas/feature_store.py +58 -28
  33. mlrun/common/schemas/frontend_spec.py +8 -0
  34. mlrun/common/schemas/function.py +11 -0
  35. mlrun/common/schemas/hub.py +7 -9
  36. mlrun/common/schemas/model_monitoring/__init__.py +21 -4
  37. mlrun/common/schemas/model_monitoring/constants.py +136 -42
  38. mlrun/common/schemas/model_monitoring/grafana.py +9 -5
  39. mlrun/common/schemas/model_monitoring/model_endpoints.py +89 -41
  40. mlrun/common/schemas/notification.py +69 -12
  41. mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
  42. mlrun/common/schemas/pipeline.py +7 -0
  43. mlrun/common/schemas/project.py +67 -16
  44. mlrun/common/schemas/runs.py +17 -0
  45. mlrun/common/schemas/schedule.py +1 -1
  46. mlrun/common/schemas/workflow.py +10 -2
  47. mlrun/common/types.py +14 -1
  48. mlrun/config.py +233 -58
  49. mlrun/data_types/data_types.py +11 -1
  50. mlrun/data_types/spark.py +5 -4
  51. mlrun/data_types/to_pandas.py +75 -34
  52. mlrun/datastore/__init__.py +8 -10
  53. mlrun/datastore/alibaba_oss.py +131 -0
  54. mlrun/datastore/azure_blob.py +131 -43
  55. mlrun/datastore/base.py +107 -47
  56. mlrun/datastore/datastore.py +17 -7
  57. mlrun/datastore/datastore_profile.py +91 -7
  58. mlrun/datastore/dbfs_store.py +3 -7
  59. mlrun/datastore/filestore.py +1 -3
  60. mlrun/datastore/google_cloud_storage.py +92 -32
  61. mlrun/datastore/hdfs.py +5 -0
  62. mlrun/datastore/inmem.py +6 -3
  63. mlrun/datastore/redis.py +3 -2
  64. mlrun/datastore/s3.py +30 -12
  65. mlrun/datastore/snowflake_utils.py +45 -0
  66. mlrun/datastore/sources.py +274 -59
  67. mlrun/datastore/spark_utils.py +30 -0
  68. mlrun/datastore/store_resources.py +9 -7
  69. mlrun/datastore/storeytargets.py +151 -0
  70. mlrun/datastore/targets.py +387 -119
  71. mlrun/datastore/utils.py +68 -5
  72. mlrun/datastore/v3io.py +28 -50
  73. mlrun/db/auth_utils.py +152 -0
  74. mlrun/db/base.py +245 -20
  75. mlrun/db/factory.py +1 -4
  76. mlrun/db/httpdb.py +909 -231
  77. mlrun/db/nopdb.py +279 -14
  78. mlrun/errors.py +35 -5
  79. mlrun/execution.py +111 -38
  80. mlrun/feature_store/__init__.py +0 -2
  81. mlrun/feature_store/api.py +46 -53
  82. mlrun/feature_store/common.py +6 -11
  83. mlrun/feature_store/feature_set.py +48 -23
  84. mlrun/feature_store/feature_vector.py +13 -2
  85. mlrun/feature_store/ingestion.py +7 -6
  86. mlrun/feature_store/retrieval/base.py +9 -4
  87. mlrun/feature_store/retrieval/dask_merger.py +2 -0
  88. mlrun/feature_store/retrieval/job.py +13 -4
  89. mlrun/feature_store/retrieval/local_merger.py +2 -0
  90. mlrun/feature_store/retrieval/spark_merger.py +24 -32
  91. mlrun/feature_store/steps.py +38 -19
  92. mlrun/features.py +6 -14
  93. mlrun/frameworks/_common/plan.py +3 -3
  94. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +7 -12
  95. mlrun/frameworks/_ml_common/plan.py +1 -1
  96. mlrun/frameworks/auto_mlrun/auto_mlrun.py +2 -2
  97. mlrun/frameworks/lgbm/__init__.py +1 -1
  98. mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
  99. mlrun/frameworks/lgbm/model_handler.py +1 -1
  100. mlrun/frameworks/parallel_coordinates.py +4 -4
  101. mlrun/frameworks/pytorch/__init__.py +2 -2
  102. mlrun/frameworks/sklearn/__init__.py +1 -1
  103. mlrun/frameworks/sklearn/mlrun_interface.py +13 -3
  104. mlrun/frameworks/tf_keras/__init__.py +5 -2
  105. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +1 -1
  106. mlrun/frameworks/tf_keras/mlrun_interface.py +2 -2
  107. mlrun/frameworks/xgboost/__init__.py +1 -1
  108. mlrun/k8s_utils.py +57 -12
  109. mlrun/launcher/__init__.py +1 -1
  110. mlrun/launcher/base.py +6 -5
  111. mlrun/launcher/client.py +13 -11
  112. mlrun/launcher/factory.py +1 -1
  113. mlrun/launcher/local.py +15 -5
  114. mlrun/launcher/remote.py +10 -3
  115. mlrun/lists.py +6 -2
  116. mlrun/model.py +297 -48
  117. mlrun/model_monitoring/__init__.py +1 -1
  118. mlrun/model_monitoring/api.py +152 -357
  119. mlrun/model_monitoring/applications/__init__.py +10 -0
  120. mlrun/model_monitoring/applications/_application_steps.py +190 -0
  121. mlrun/model_monitoring/applications/base.py +108 -0
  122. mlrun/model_monitoring/applications/context.py +341 -0
  123. mlrun/model_monitoring/{evidently_application.py → applications/evidently_base.py} +27 -22
  124. mlrun/model_monitoring/applications/histogram_data_drift.py +227 -91
  125. mlrun/model_monitoring/applications/results.py +99 -0
  126. mlrun/model_monitoring/controller.py +130 -303
  127. mlrun/model_monitoring/{stores/models/sqlite.py → db/__init__.py} +5 -10
  128. mlrun/model_monitoring/db/stores/__init__.py +136 -0
  129. mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
  130. mlrun/model_monitoring/db/stores/base/store.py +213 -0
  131. mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
  132. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
  133. mlrun/model_monitoring/db/stores/sqldb/models/base.py +190 -0
  134. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +103 -0
  135. mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
  136. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +659 -0
  137. mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
  138. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +726 -0
  139. mlrun/model_monitoring/db/tsdb/__init__.py +105 -0
  140. mlrun/model_monitoring/db/tsdb/base.py +448 -0
  141. mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
  142. mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
  143. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +298 -0
  144. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +42 -0
  145. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +522 -0
  146. mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
  147. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +158 -0
  148. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +849 -0
  149. mlrun/model_monitoring/features_drift_table.py +34 -22
  150. mlrun/model_monitoring/helpers.py +177 -39
  151. mlrun/model_monitoring/model_endpoint.py +3 -2
  152. mlrun/model_monitoring/stream_processing.py +165 -398
  153. mlrun/model_monitoring/tracking_policy.py +7 -1
  154. mlrun/model_monitoring/writer.py +161 -125
  155. mlrun/package/packagers/default_packager.py +2 -2
  156. mlrun/package/packagers_manager.py +1 -0
  157. mlrun/package/utils/_formatter.py +2 -2
  158. mlrun/platforms/__init__.py +11 -10
  159. mlrun/platforms/iguazio.py +67 -228
  160. mlrun/projects/__init__.py +6 -1
  161. mlrun/projects/operations.py +47 -20
  162. mlrun/projects/pipelines.py +396 -249
  163. mlrun/projects/project.py +1176 -406
  164. mlrun/render.py +28 -22
  165. mlrun/run.py +208 -181
  166. mlrun/runtimes/__init__.py +76 -11
  167. mlrun/runtimes/base.py +54 -24
  168. mlrun/runtimes/daskjob.py +9 -2
  169. mlrun/runtimes/databricks_job/databricks_runtime.py +1 -0
  170. mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
  171. mlrun/runtimes/funcdoc.py +1 -29
  172. mlrun/runtimes/kubejob.py +34 -128
  173. mlrun/runtimes/local.py +39 -10
  174. mlrun/runtimes/mpijob/__init__.py +0 -20
  175. mlrun/runtimes/mpijob/abstract.py +8 -8
  176. mlrun/runtimes/mpijob/v1.py +1 -1
  177. mlrun/runtimes/nuclio/__init__.py +1 -0
  178. mlrun/runtimes/nuclio/api_gateway.py +769 -0
  179. mlrun/runtimes/nuclio/application/__init__.py +15 -0
  180. mlrun/runtimes/nuclio/application/application.py +758 -0
  181. mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
  182. mlrun/runtimes/nuclio/function.py +188 -68
  183. mlrun/runtimes/nuclio/serving.py +57 -60
  184. mlrun/runtimes/pod.py +191 -58
  185. mlrun/runtimes/remotesparkjob.py +11 -8
  186. mlrun/runtimes/sparkjob/spark3job.py +17 -18
  187. mlrun/runtimes/utils.py +40 -73
  188. mlrun/secrets.py +6 -2
  189. mlrun/serving/__init__.py +8 -1
  190. mlrun/serving/remote.py +2 -3
  191. mlrun/serving/routers.py +89 -64
  192. mlrun/serving/server.py +54 -26
  193. mlrun/serving/states.py +187 -56
  194. mlrun/serving/utils.py +19 -11
  195. mlrun/serving/v2_serving.py +136 -63
  196. mlrun/track/tracker.py +2 -1
  197. mlrun/track/trackers/mlflow_tracker.py +5 -0
  198. mlrun/utils/async_http.py +26 -6
  199. mlrun/utils/db.py +18 -0
  200. mlrun/utils/helpers.py +375 -105
  201. mlrun/utils/http.py +2 -2
  202. mlrun/utils/logger.py +75 -9
  203. mlrun/utils/notifications/notification/__init__.py +14 -10
  204. mlrun/utils/notifications/notification/base.py +48 -0
  205. mlrun/utils/notifications/notification/console.py +2 -0
  206. mlrun/utils/notifications/notification/git.py +24 -1
  207. mlrun/utils/notifications/notification/ipython.py +2 -0
  208. mlrun/utils/notifications/notification/slack.py +96 -21
  209. mlrun/utils/notifications/notification/webhook.py +63 -2
  210. mlrun/utils/notifications/notification_pusher.py +146 -16
  211. mlrun/utils/regex.py +9 -0
  212. mlrun/utils/retryer.py +3 -2
  213. mlrun/utils/v3io_clients.py +2 -3
  214. mlrun/utils/version/version.json +2 -2
  215. mlrun-1.7.2.dist-info/METADATA +390 -0
  216. mlrun-1.7.2.dist-info/RECORD +351 -0
  217. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/WHEEL +1 -1
  218. mlrun/feature_store/retrieval/conversion.py +0 -271
  219. mlrun/kfpops.py +0 -868
  220. mlrun/model_monitoring/application.py +0 -310
  221. mlrun/model_monitoring/batch.py +0 -974
  222. mlrun/model_monitoring/controller_handler.py +0 -37
  223. mlrun/model_monitoring/prometheus.py +0 -216
  224. mlrun/model_monitoring/stores/__init__.py +0 -111
  225. mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -574
  226. mlrun/model_monitoring/stores/model_endpoint_store.py +0 -145
  227. mlrun/model_monitoring/stores/models/__init__.py +0 -27
  228. mlrun/model_monitoring/stores/models/base.py +0 -84
  229. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -382
  230. mlrun/platforms/other.py +0 -305
  231. mlrun-1.7.0rc4.dist-info/METADATA +0 -269
  232. mlrun-1.7.0rc4.dist-info/RECORD +0 -321
  233. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/LICENSE +0 -0
  234. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/entry_points.txt +0 -0
  235. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/top_level.txt +0 -0
mlrun/data_types/to_pandas.py

@@ -15,23 +15,13 @@
 import warnings
 from collections import Counter
 
-from pyspark.sql.types import (
-    BooleanType,
-    ByteType,
-    DoubleType,
-    FloatType,
-    IntegerType,
-    IntegralType,
-    LongType,
-    MapType,
-    ShortType,
-    TimestampType,
-)
-
-
-def toPandas(spark_df):
+import pandas as pd
+import semver
+
+
+def _to_pandas(spark_df):
     """
-    Modified version of spark DataFrame.toPandas()
+    Modified version of spark DataFrame.toPandas() -
     https://github.com/apache/spark/blob/v3.2.3/python/pyspark/sql/pandas/conversion.py#L35
 
     The original code (which is only replaced in pyspark 3.5.0) fails with Pandas 2 installed, with the following error:
@@ -40,6 +30,12 @@ def toPandas(spark_df):
     This modification adds the missing unit to the dtype.
     """
     from pyspark.sql.dataframe import DataFrame
+    from pyspark.sql.types import (
+        BooleanType,
+        IntegralType,
+        MapType,
+        TimestampType,
+    )
 
     assert isinstance(spark_df, DataFrame)
 
@@ -48,7 +44,6 @@ def toPandas(spark_df):
     require_minimum_pandas_version()
 
     import numpy as np
-    import pandas as pd
 
     timezone = spark_df.sql_ctx._conf.sessionLocalTimeZone()
 
@@ -65,10 +60,10 @@
                 msg = (
                     "toPandas attempted Arrow optimization because "
                     "'spark.sql.execution.arrow.pyspark.enabled' is set to true; however, "
-                    "failed by the reason below:\n %s\n"
+                    f"failed by the reason below:\n {e}\n"
                     "Attempting non-optimization as "
                     "'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to "
-                    "true." % str(e)
+                    "true."
                 )
                 warnings.warn(msg)
                 use_arrow = False
@@ -78,7 +73,7 @@
                     "'spark.sql.execution.arrow.pyspark.enabled' is set to true, but has "
                     "reached the error below and will not continue because automatic fallback "
                     "with 'spark.sql.execution.arrow.pyspark.fallback.enabled' has been set to "
-                    "false.\n %s" % str(e)
+                    f"false.\n {e}"
                 )
                 warnings.warn(msg)
                 raise
@@ -144,7 +139,7 @@
                     "reached the error below and can not continue. Note that "
                     "'spark.sql.execution.arrow.pyspark.fallback.enabled' does not have an "
                     "effect on failures in the middle of "
-                    "computation.\n %s" % str(e)
+                    f"computation.\n {e}"
                 )
                 warnings.warn(msg)
                 raise
@@ -154,10 +149,10 @@
     column_counter = Counter(spark_df.columns)
 
     dtype = [None] * len(spark_df.schema)
-    for fieldIdx, field in enumerate(spark_df.schema):
+    for field_idx, field in enumerate(spark_df.schema):
         # For duplicate column name, we use `iloc` to access it.
         if column_counter[field.name] > 1:
-            pandas_col = pdf.iloc[:, fieldIdx]
+            pandas_col = pdf.iloc[:, field_idx]
         else:
             pandas_col = pdf[field.name]
 
@@ -171,12 +166,12 @@
             and field.nullable
             and pandas_col.isnull().any()
         ):
-            dtype[fieldIdx] = pandas_type
+            dtype[field_idx] = pandas_type
         # Ensure we fall back to nullable numpy types, even when whole column is null:
         if isinstance(field.dataType, IntegralType) and pandas_col.isnull().any():
-            dtype[fieldIdx] = np.float64
+            dtype[field_idx] = np.float64
         if isinstance(field.dataType, BooleanType) and pandas_col.isnull().any():
-            dtype[fieldIdx] = object
+            dtype[field_idx] = object
 
     df = pd.DataFrame()
     for index, t in enumerate(dtype):
@@ -217,22 +212,68 @@
 
 def _to_corrected_pandas_type(dt):
     import numpy as np
+    from pyspark.sql.types import (
+        BooleanType,
+        ByteType,
+        DoubleType,
+        FloatType,
+        IntegerType,
+        LongType,
+        ShortType,
+        TimestampType,
+    )
 
-    if type(dt) == ByteType:
+    if isinstance(dt, ByteType):
         return np.int8
-    elif type(dt) == ShortType:
+    elif isinstance(dt, ShortType):
         return np.int16
-    elif type(dt) == IntegerType:
+    elif isinstance(dt, IntegerType):
         return np.int32
-    elif type(dt) == LongType:
+    elif isinstance(dt, LongType):
        return np.int64
-    elif type(dt) == FloatType:
+    elif isinstance(dt, FloatType):
        return np.float32
-    elif type(dt) == DoubleType:
+    elif isinstance(dt, DoubleType):
        return np.float64
-    elif type(dt) == BooleanType:
+    elif isinstance(dt, BooleanType):
        return bool
-    elif type(dt) == TimestampType:
+    elif isinstance(dt, TimestampType):
        return "datetime64[ns]"
     else:
        return None
+
+
+def spark_df_to_pandas(spark_df):
+    import pyspark
+
+    if semver.parse(pyspark.__version__) >= semver.Version(3, 5, 0):
+
+        def to_pandas(spark_df_inner):
+            return spark_df_inner.toPandas()
+    else:
+        to_pandas = _to_pandas
+
+    # as of pyspark 3.2.3, toPandas fails to convert timestamps unless we work around the issue
+    # when we upgrade pyspark, we should check whether this workaround is still necessary
+    # see https://stackoverflow.com/questions/76389694/transforming-pyspark-to-pandas-dataframe
+    if semver.parse(pd.__version__)["major"] >= 2:
+        import pyspark.sql.functions as pyspark_functions
+
+        type_conversion_dict = {}
+        for field in spark_df.schema.fields:
+            if str(field.dataType) == "TimestampType":
+                spark_df = spark_df.withColumn(
+                    field.name,
+                    pyspark_functions.date_format(
+                        pyspark_functions.to_timestamp(field.name),
+                        "yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSS",
+                    ),
+                )
+                type_conversion_dict[field.name] = "datetime64[ns]"
+
+        df = to_pandas(spark_df)
+        if type_conversion_dict:
+            df = df.astype(type_conversion_dict)
+        return df
+    else:
+        return to_pandas(spark_df)
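
For orientation, a minimal sketch of how the new spark_df_to_pandas helper is meant to be called (assuming a local SparkSession; the DataFrame contents below are placeholders):

from pyspark.sql import SparkSession

from mlrun.data_types.to_pandas import spark_df_to_pandas

spark = SparkSession.builder.master("local[1]").getOrCreate()
spark_df = spark.createDataFrame(
    [(1, "2024-01-01 12:00:00")], ["id", "ts"]
).selectExpr("id", "to_timestamp(ts) as ts")

# On pyspark >= 3.5.0 this dispatches to DataFrame.toPandas(); on older versions
# it uses the patched _to_pandas above, and with pandas 2 it reformats
# TimestampType columns before casting them back to datetime64[ns].
pdf = spark_df_to_pandas(spark_df)
print(pdf.dtypes)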
mlrun/datastore/__init__.py

@@ -64,7 +64,7 @@ from .store_resources import (
     parse_store_uri,
 )
 from .targets import CSVTarget, NoSqlTarget, ParquetTarget, StreamTarget
-from .utils import parse_kafka_url
+from .utils import get_kafka_brokers_from_dict, parse_kafka_url
 
 store_manager = StoreManager()
 
@@ -107,19 +107,17 @@ def get_stream_pusher(stream_path: str, **kwargs):
     :param stream_path: path/url of stream
     """
 
-    if stream_path.startswith("kafka://") or "kafka_bootstrap_servers" in kwargs:
-        topic, bootstrap_servers = parse_kafka_url(
-            stream_path, kwargs.get("kafka_bootstrap_servers")
-        )
-        return KafkaOutputStream(
-            topic, bootstrap_servers, kwargs.get("kafka_producer_options")
-        )
+    kafka_brokers = get_kafka_brokers_from_dict(kwargs)
+    if stream_path.startswith("kafka://") or kafka_brokers:
+        topic, brokers = parse_kafka_url(stream_path, kafka_brokers)
+        return KafkaOutputStream(topic, brokers, kwargs.get("kafka_producer_options"))
     elif stream_path.startswith("http://") or stream_path.startswith("https://"):
         return HTTPOutputStream(stream_path=stream_path)
     elif "://" not in stream_path:
         return OutputStream(stream_path, **kwargs)
     elif stream_path.startswith("v3io"):
         endpoint, stream_path = parse_path(stream_path)
+        endpoint = kwargs.pop("endpoint", None) or endpoint
         return OutputStream(stream_path, endpoint=endpoint, **kwargs)
     elif stream_path.startswith("dummy://"):
         return _DummyStream(**kwargs)
@@ -133,9 +131,9 @@ class _DummyStream:
     def __init__(self, event_list=None, **kwargs):
         self.event_list = event_list or []
 
-    def push(self, data):
+    def push(self, data, **kwargs):
         if not isinstance(data, list):
             data = [data]
         for item in data:
-            logger.info(f"dummy stream got event: {item}")
+            logger.info(f"dummy stream got event: {item}, kwargs={kwargs}")
             self.event_list.append(item)
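
A hedged sketch of the updated get_stream_pusher dispatch, using the in-memory dummy:// stream (for kafka:// paths, brokers can now also be supplied through kwargs and are resolved via get_kafka_brokers_from_dict, whose accepted key names are not shown in this hunk). The extra keyword argument below is a hypothetical placeholder, passed through because push() now accepts **kwargs:

import mlrun.datastore

# In-memory dummy stream, handy for testing the pusher interface
pusher = mlrun.datastore.get_stream_pusher("dummy://")
pusher.push({"event": "hello"}, partition_key="k1")  # partition_key is illustrative only
print(pusher.event_list)  # [{'event': 'hello'}]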
mlrun/datastore/alibaba_oss.py (new file)

@@ -0,0 +1,131 @@
+# Copyright 2023 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+from datetime import datetime
+from pathlib import Path
+from urllib.parse import urlparse
+
+import oss2
+from fsspec.registry import get_filesystem_class
+
+import mlrun.errors
+
+from .base import DataStore, FileStats, make_datastore_schema_sanitizer
+
+
+class OSSStore(DataStore):
+    using_bucket = True
+
+    def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
+        super().__init__(parent, name, schema, endpoint, secrets)
+        # will be used in case user asks to assume a role and work through fsspec
+
+        access_key_id = self._get_secret_or_env("ALIBABA_ACCESS_KEY_ID")
+        secret_key = self._get_secret_or_env("ALIBABA_SECRET_ACCESS_KEY")
+        endpoint_url = self._get_secret_or_env("ALIBABA_ENDPOINT_URL")
+        if access_key_id and secret_key and endpoint_url:
+            self.auth = oss2.Auth(access_key_id, secret_key)
+            self.endpoint_url = endpoint_url
+        else:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "missing ALIBABA_ACCESS_KEY_ID or ALIBABA_SECRET_ACCESS_KEY ALIBABA_ENDPOINT_URL in environment"
+            )
+
+    @property
+    def filesystem(self):
+        """return fsspec file system object, if supported"""
+        if self._filesystem:
+            return self._filesystem
+        try:
+            import ossfs  # noqa
+        except ImportError as exc:
+            raise ImportError("ALIBABA ossfs not installed") from exc
+        filesystem_class = get_filesystem_class(protocol=self.kind)
+        self._filesystem = make_datastore_schema_sanitizer(
+            filesystem_class,
+            using_bucket=self.using_bucket,
+            **self.get_storage_options(),
+        )
+        return self._filesystem
+
+    def get_storage_options(self):
+        res = dict(
+            endpoint=self._get_secret_or_env("ALIBABA_ENDPOINT_URL"),
+            key=self._get_secret_or_env("ALIBABA_ACCESS_KEY_ID"),
+            secret=self._get_secret_or_env("ALIBABA_SECRET_ACCESS_KEY"),
+        )
+        return self._sanitize_storage_options(res)
+
+    def get_bucket_and_key(self, key):
+        path = self._join(key)[1:]
+        return self.endpoint, path
+
+    def upload(self, key, src_path):
+        bucket, key = self.get_bucket_and_key(key)
+        oss = oss2.Bucket(self.auth, self.endpoint_url, bucket)
+        oss.put_object(key, open(src_path, "rb"))
+
+    def get(self, key, size=None, offset=0):
+        bucket, key = self.get_bucket_and_key(key)
+        oss = oss2.Bucket(self.auth, self.endpoint_url, bucket)
+        if size or offset:
+            return oss.get_object(key, byte_range=self.get_range(size, offset)).read()
+        return oss.get_object(key).read()
+
+    def put(self, key, data, append=False):
+        data, _ = self._prepare_put_data(data, append)
+        bucket, key = self.get_bucket_and_key(key)
+        oss = oss2.Bucket(self.auth, self.endpoint_url, bucket)
+        oss.put_object(key, data)
+
+    def stat(self, key):
+        bucket, key = self.get_bucket_and_key(key)
+        oss = oss2.Bucket(self.auth, self.endpoint_url, bucket)
+        obj = oss.get_object_meta(key)
+        size = obj.content_length
+        modified = datetime.fromtimestamp(obj.last_modified)
+        return FileStats(size, time.mktime(modified.timetuple()))
+
+    def listdir(self, key):
+        remote_path = self._convert_key_to_remote_path(key)
+        if self.filesystem.isfile(remote_path):
+            return key
+        remote_path = f"{remote_path}/**"
+        files = self.filesystem.glob(remote_path)
+        key_length = len(key)
+        files = [
+            f.split("/", 1)[1][key_length:] for f in files if len(f.split("/")) > 1
+        ]
+        return files
+
+    def delete(self, key):
+        bucket, key = self.get_bucket_and_key(key)
+        oss = oss2.Bucket(self.auth, self.endpoint_url, bucket)
+        oss.delete_object(key)
+
+    def _convert_key_to_remote_path(self, key):
+        key = key.strip("/")
+        schema = urlparse(key).scheme
+        # if called without passing dataitem - like in fset.purge_targets,
+        # key will include schema.
+        if not schema:
+            key = Path(self.endpoint, key).as_posix()
+        return key
+
+    @staticmethod
+    def get_range(size, offset):
+        if size:
+            return [offset, size]
+        return [offset, None]
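
A hedged usage sketch for the new Alibaba OSS datastore: the credentials are read from the environment exactly as OSSStore.__init__ does above, while the oss:// scheme, bucket, and key below are illustrative assumptions (the scheme registration itself lives in mlrun/datastore/datastore.py, not in this file):

import os

import mlrun

# Read by OSSStore.__init__ / get_storage_options (see above)
os.environ["ALIBABA_ACCESS_KEY_ID"] = "<access-key-id>"
os.environ["ALIBABA_SECRET_ACCESS_KEY"] = "<secret-access-key>"
os.environ["ALIBABA_ENDPOINT_URL"] = "https://oss-cn-hangzhou.aliyuncs.com"

# Hypothetical bucket/key; reads go through OSSStore.get or the ossfs filesystem
item = mlrun.get_dataitem("oss://my-bucket/path/to/data.csv")
df = item.as_df()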
mlrun/datastore/azure_blob.py

@@ -16,12 +16,13 @@ import time
 from pathlib import Path
 from urllib.parse import urlparse
 
+from azure.storage.blob import BlobServiceClient
 from azure.storage.blob._shared.base_client import parse_connection_str
 from fsspec.registry import get_filesystem_class
 
 import mlrun.errors
 
-from .base import DataStore, FileStats, makeDatastoreSchemaSanitizer
+from .base import DataStore, FileStats, make_datastore_schema_sanitizer
 
 # Azure blobs will be represented with the following URL: az://<container name>. The storage account is already
 # pointed to by the connection string, so the user is not expected to specify it in any way.
@@ -29,47 +30,131 @@ from .base import DataStore, FileStats, makeDatastoreSchemaSanitizer
 
 class AzureBlobStore(DataStore):
     using_bucket = True
+    max_concurrency = 100
+    max_blocksize = 1024 * 1024 * 4
+    max_single_put_size = (
+        1024 * 1024 * 8
+    )  # for service_client property only, does not affect filesystem
 
     def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
         super().__init__(parent, name, schema, endpoint, secrets=secrets)
+        self._service_client = None
+        self._storage_options = None
+
+    def get_storage_options(self):
+        return self.storage_options
+
+    @property
+    def storage_options(self):
+        if not self._storage_options:
+            res = dict(
+                account_name=self._get_secret_or_env("account_name")
+                or self._get_secret_or_env("AZURE_STORAGE_ACCOUNT_NAME"),
+                account_key=self._get_secret_or_env("account_key")
+                or self._get_secret_or_env("AZURE_STORAGE_ACCOUNT_KEY"),
+                connection_string=self._get_secret_or_env("connection_string")
+                or self._get_secret_or_env("AZURE_STORAGE_CONNECTION_STRING"),
+                tenant_id=self._get_secret_or_env("tenant_id")
+                or self._get_secret_or_env("AZURE_STORAGE_TENANT_ID"),
+                client_id=self._get_secret_or_env("client_id")
+                or self._get_secret_or_env("AZURE_STORAGE_CLIENT_ID"),
+                client_secret=self._get_secret_or_env("client_secret")
+                or self._get_secret_or_env("AZURE_STORAGE_CLIENT_SECRET"),
+                sas_token=self._get_secret_or_env("sas_token")
+                or self._get_secret_or_env("AZURE_STORAGE_SAS_TOKEN"),
+                credential=self._get_secret_or_env("credential"),
+            )
+            self._storage_options = self._sanitize_storage_options(res)
+        return self._storage_options
 
     @property
     def filesystem(self):
         """return fsspec file system object, if supported"""
-        if self._filesystem:
-            return self._filesystem
         try:
             import adlfs  # noqa
         except ImportError as exc:
             raise ImportError("Azure adlfs not installed") from exc
-        # in order to support az and wasbs kinds.
-        filesystem_class = get_filesystem_class(protocol=self.kind)
-        self._filesystem = makeDatastoreSchemaSanitizer(
-            filesystem_class,
-            using_bucket=self.using_bucket,
-            **self.get_storage_options(),
-        )
+
+        if not self._filesystem:
+            # in order to support az and wasbs kinds
+            filesystem_class = get_filesystem_class(protocol=self.kind)
+            self._filesystem = make_datastore_schema_sanitizer(
+                filesystem_class,
+                using_bucket=self.using_bucket,
+                blocksize=self.max_blocksize,
+                **self.storage_options,
+            )
         return self._filesystem
 
-    def get_storage_options(self):
-        res = dict(
-            account_name=self._get_secret_or_env("account_name")
-            or self._get_secret_or_env("AZURE_STORAGE_ACCOUNT_NAME"),
-            account_key=self._get_secret_or_env("account_key")
-            or self._get_secret_or_env("AZURE_STORAGE_KEY"),
-            connection_string=self._get_secret_or_env("connection_string")
-            or self._get_secret_or_env("AZURE_STORAGE_CONNECTION_STRING"),
-            tenant_id=self._get_secret_or_env("tenant_id")
-            or self._get_secret_or_env("AZURE_STORAGE_TENANT_ID"),
-            client_id=self._get_secret_or_env("client_id")
-            or self._get_secret_or_env("AZURE_STORAGE_CLIENT_ID"),
-            client_secret=self._get_secret_or_env("client_secret")
-            or self._get_secret_or_env("AZURE_STORAGE_CLIENT_SECRET"),
-            sas_token=self._get_secret_or_env("sas_token")
-            or self._get_secret_or_env("AZURE_STORAGE_SAS_TOKEN"),
-            credential=self._get_secret_or_env("credential"),
-        )
-        return self._sanitize_storage_options(res)
+    @property
+    def service_client(self):
+        try:
+            import azure  # noqa
+        except ImportError as exc:
+            raise ImportError("Azure not installed") from exc
+
+        if not self._service_client:
+            self._do_connect()
+        return self._service_client
+
+    def _do_connect(self):
+        """
+
+        Creates a client for azure.
+        Raises MLRunInvalidArgumentError if none of the connection details are available
+        based on do_connect in AzureBlobFileSystem:
+        https://github.com/fsspec/adlfs/blob/2023.9.0/adlfs/spec.py#L422
+        """
+        from azure.identity import ClientSecretCredential
+
+        storage_options = self.storage_options
+        connection_string = storage_options.get("connection_string")
+        client_name = storage_options.get("account_name")
+        account_key = storage_options.get("account_key")
+        sas_token = storage_options.get("sas_token")
+        client_id = storage_options.get("client_id")
+        credential = storage_options.get("credential")
+
+        credential_from_client_id = None
+        if (
+            credential is None
+            and account_key is None
+            and sas_token is None
+            and client_id is not None
+        ):
+            credential_from_client_id = ClientSecretCredential(
+                tenant_id=storage_options.get("tenant_id"),
+                client_id=client_id,
+                client_secret=storage_options.get("client_secret"),
+            )
+        try:
+            if connection_string is not None:
+                self._service_client = BlobServiceClient.from_connection_string(
+                    conn_str=connection_string,
+                    max_block_size=self.max_blocksize,
+                    max_single_put_size=self.max_single_put_size,
+                )
+            elif client_name is not None:
+                account_url = f"https://{client_name}.blob.core.windows.net"
+                cred = credential_from_client_id or credential or account_key
+                if not cred and sas_token is not None:
+                    if not sas_token.startswith("?"):
+                        sas_token = f"?{sas_token}"
+                    account_url = account_url + sas_token
+                self._service_client = BlobServiceClient(
+                    account_url=account_url,
+                    credential=cred,
+                    max_block_size=self.max_blocksize,
+                    max_single_put_size=self.max_single_put_size,
+                )
+            else:
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "Must provide either a connection_string or account_name with credentials"
+                )
+        except Exception as e:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                f"unable to connect to account for {e}"
+            )
 
     def _convert_key_to_remote_path(self, key):
         key = key.strip("/")
@@ -82,7 +167,15 @@ class AzureBlobStore(DataStore):
 
     def upload(self, key, src_path):
         remote_path = self._convert_key_to_remote_path(key)
-        self.filesystem.put_file(src_path, remote_path, overwrite=True)
+        container, remote_path = remote_path.split("/", 1)
+        container_client = self.service_client.get_container_client(container=container)
+        with open(file=src_path, mode="rb") as data:
+            container_client.upload_blob(
+                name=remote_path,
+                data=data,
+                overwrite=True,
+                max_concurrency=self.max_concurrency,
+            )
 
     def get(self, key, size=None, offset=0):
         remote_path = self._convert_key_to_remote_path(key)
@@ -96,12 +189,7 @@
             "Append mode not supported for Azure blob datastore"
             )
         remote_path = self._convert_key_to_remote_path(key)
-        if isinstance(data, bytes):
-            mode = "wb"
-        elif isinstance(data, str):
-            mode = "w"
-        else:
-            raise TypeError("Data type unknown. Unable to put in Azure!")
+        data, mode = self._prepare_put_data(data, append)
         with self.filesystem.open(remote_path, mode) as f:
             f.write(data)
 
@@ -135,7 +223,7 @@
 
     def get_spark_options(self):
         res = {}
-        st = self.get_storage_options()
+        st = self.storage_options
         service = "blob"
         primary_url = None
         if st.get("connection_string"):
@@ -158,18 +246,17 @@
                 st[key] = parsed_value
 
         account_name = st.get("account_name")
-        if not account_name:
-            raise mlrun.errors.MLRunInvalidArgumentError(
-                "Property 'account_name' is absent both in storage settings and connection string"
-            )
         if primary_url:
             if primary_url.startswith("http://"):
                 primary_url = primary_url[len("http://") :]
             if primary_url.startswith("https://"):
                 primary_url = primary_url[len("https://") :]
            host = primary_url
-        else:
+        elif account_name:
            host = f"{account_name}.{service}.core.windows.net"
+        else:
+            return res
+
         if "account_key" in st:
             res[f"spark.hadoop.fs.azure.account.key.{host}"] = st["account_key"]
 
@@ -209,6 +296,7 @@ class AzureBlobStore(DataStore):
         for key in spark_options:
             if key.startswith(prefix):
                 account_key = key[len(prefix) :]
-                url += f"@{account_key}"
+                if not url.endswith(account_key):
+                    url += f"@{account_key}"
                 break
         return url
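
A hedged sketch of wiring credentials for the reworked Azure Blob datastore. The environment variable names mirror the storage_options property above (note that 1.7.2 reads AZURE_STORAGE_ACCOUNT_KEY where 1.7.0rc4 read AZURE_STORAGE_KEY); the container and blob paths are placeholders:

import os

import mlrun

os.environ["AZURE_STORAGE_ACCOUNT_NAME"] = "<account-name>"
os.environ["AZURE_STORAGE_ACCOUNT_KEY"] = "<account-key>"

item = mlrun.get_dataitem("az://my-container/path/to/data.parquet")
item.upload("local_file.parquet")  # upload() now goes through BlobServiceClient with max_concurrency
df = item.as_df()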