mlrun 1.7.0rc4__py3-none-any.whl → 1.7.2__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of mlrun might be problematic.

Files changed (235)
  1. mlrun/__init__.py +11 -1
  2. mlrun/__main__.py +39 -121
  3. mlrun/{datastore/helpers.py → alerts/__init__.py} +2 -5
  4. mlrun/alerts/alert.py +248 -0
  5. mlrun/api/schemas/__init__.py +4 -3
  6. mlrun/artifacts/__init__.py +8 -3
  7. mlrun/artifacts/base.py +39 -254
  8. mlrun/artifacts/dataset.py +9 -190
  9. mlrun/artifacts/manager.py +73 -46
  10. mlrun/artifacts/model.py +30 -158
  11. mlrun/artifacts/plots.py +23 -380
  12. mlrun/common/constants.py +73 -1
  13. mlrun/common/db/sql_session.py +3 -2
  14. mlrun/common/formatters/__init__.py +21 -0
  15. mlrun/common/formatters/artifact.py +46 -0
  16. mlrun/common/formatters/base.py +113 -0
  17. mlrun/common/formatters/feature_set.py +44 -0
  18. mlrun/common/formatters/function.py +46 -0
  19. mlrun/common/formatters/pipeline.py +53 -0
  20. mlrun/common/formatters/project.py +51 -0
  21. mlrun/common/formatters/run.py +29 -0
  22. mlrun/common/helpers.py +11 -1
  23. mlrun/{runtimes → common/runtimes}/constants.py +32 -4
  24. mlrun/common/schemas/__init__.py +31 -4
  25. mlrun/common/schemas/alert.py +202 -0
  26. mlrun/common/schemas/api_gateway.py +196 -0
  27. mlrun/common/schemas/artifact.py +28 -1
  28. mlrun/common/schemas/auth.py +13 -2
  29. mlrun/common/schemas/client_spec.py +2 -1
  30. mlrun/common/schemas/common.py +7 -4
  31. mlrun/common/schemas/constants.py +3 -0
  32. mlrun/common/schemas/feature_store.py +58 -28
  33. mlrun/common/schemas/frontend_spec.py +8 -0
  34. mlrun/common/schemas/function.py +11 -0
  35. mlrun/common/schemas/hub.py +7 -9
  36. mlrun/common/schemas/model_monitoring/__init__.py +21 -4
  37. mlrun/common/schemas/model_monitoring/constants.py +136 -42
  38. mlrun/common/schemas/model_monitoring/grafana.py +9 -5
  39. mlrun/common/schemas/model_monitoring/model_endpoints.py +89 -41
  40. mlrun/common/schemas/notification.py +69 -12
  41. mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
  42. mlrun/common/schemas/pipeline.py +7 -0
  43. mlrun/common/schemas/project.py +67 -16
  44. mlrun/common/schemas/runs.py +17 -0
  45. mlrun/common/schemas/schedule.py +1 -1
  46. mlrun/common/schemas/workflow.py +10 -2
  47. mlrun/common/types.py +14 -1
  48. mlrun/config.py +233 -58
  49. mlrun/data_types/data_types.py +11 -1
  50. mlrun/data_types/spark.py +5 -4
  51. mlrun/data_types/to_pandas.py +75 -34
  52. mlrun/datastore/__init__.py +8 -10
  53. mlrun/datastore/alibaba_oss.py +131 -0
  54. mlrun/datastore/azure_blob.py +131 -43
  55. mlrun/datastore/base.py +107 -47
  56. mlrun/datastore/datastore.py +17 -7
  57. mlrun/datastore/datastore_profile.py +91 -7
  58. mlrun/datastore/dbfs_store.py +3 -7
  59. mlrun/datastore/filestore.py +1 -3
  60. mlrun/datastore/google_cloud_storage.py +92 -32
  61. mlrun/datastore/hdfs.py +5 -0
  62. mlrun/datastore/inmem.py +6 -3
  63. mlrun/datastore/redis.py +3 -2
  64. mlrun/datastore/s3.py +30 -12
  65. mlrun/datastore/snowflake_utils.py +45 -0
  66. mlrun/datastore/sources.py +274 -59
  67. mlrun/datastore/spark_utils.py +30 -0
  68. mlrun/datastore/store_resources.py +9 -7
  69. mlrun/datastore/storeytargets.py +151 -0
  70. mlrun/datastore/targets.py +387 -119
  71. mlrun/datastore/utils.py +68 -5
  72. mlrun/datastore/v3io.py +28 -50
  73. mlrun/db/auth_utils.py +152 -0
  74. mlrun/db/base.py +245 -20
  75. mlrun/db/factory.py +1 -4
  76. mlrun/db/httpdb.py +909 -231
  77. mlrun/db/nopdb.py +279 -14
  78. mlrun/errors.py +35 -5
  79. mlrun/execution.py +111 -38
  80. mlrun/feature_store/__init__.py +0 -2
  81. mlrun/feature_store/api.py +46 -53
  82. mlrun/feature_store/common.py +6 -11
  83. mlrun/feature_store/feature_set.py +48 -23
  84. mlrun/feature_store/feature_vector.py +13 -2
  85. mlrun/feature_store/ingestion.py +7 -6
  86. mlrun/feature_store/retrieval/base.py +9 -4
  87. mlrun/feature_store/retrieval/dask_merger.py +2 -0
  88. mlrun/feature_store/retrieval/job.py +13 -4
  89. mlrun/feature_store/retrieval/local_merger.py +2 -0
  90. mlrun/feature_store/retrieval/spark_merger.py +24 -32
  91. mlrun/feature_store/steps.py +38 -19
  92. mlrun/features.py +6 -14
  93. mlrun/frameworks/_common/plan.py +3 -3
  94. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +7 -12
  95. mlrun/frameworks/_ml_common/plan.py +1 -1
  96. mlrun/frameworks/auto_mlrun/auto_mlrun.py +2 -2
  97. mlrun/frameworks/lgbm/__init__.py +1 -1
  98. mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
  99. mlrun/frameworks/lgbm/model_handler.py +1 -1
  100. mlrun/frameworks/parallel_coordinates.py +4 -4
  101. mlrun/frameworks/pytorch/__init__.py +2 -2
  102. mlrun/frameworks/sklearn/__init__.py +1 -1
  103. mlrun/frameworks/sklearn/mlrun_interface.py +13 -3
  104. mlrun/frameworks/tf_keras/__init__.py +5 -2
  105. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +1 -1
  106. mlrun/frameworks/tf_keras/mlrun_interface.py +2 -2
  107. mlrun/frameworks/xgboost/__init__.py +1 -1
  108. mlrun/k8s_utils.py +57 -12
  109. mlrun/launcher/__init__.py +1 -1
  110. mlrun/launcher/base.py +6 -5
  111. mlrun/launcher/client.py +13 -11
  112. mlrun/launcher/factory.py +1 -1
  113. mlrun/launcher/local.py +15 -5
  114. mlrun/launcher/remote.py +10 -3
  115. mlrun/lists.py +6 -2
  116. mlrun/model.py +297 -48
  117. mlrun/model_monitoring/__init__.py +1 -1
  118. mlrun/model_monitoring/api.py +152 -357
  119. mlrun/model_monitoring/applications/__init__.py +10 -0
  120. mlrun/model_monitoring/applications/_application_steps.py +190 -0
  121. mlrun/model_monitoring/applications/base.py +108 -0
  122. mlrun/model_monitoring/applications/context.py +341 -0
  123. mlrun/model_monitoring/{evidently_application.py → applications/evidently_base.py} +27 -22
  124. mlrun/model_monitoring/applications/histogram_data_drift.py +227 -91
  125. mlrun/model_monitoring/applications/results.py +99 -0
  126. mlrun/model_monitoring/controller.py +130 -303
  127. mlrun/model_monitoring/{stores/models/sqlite.py → db/__init__.py} +5 -10
  128. mlrun/model_monitoring/db/stores/__init__.py +136 -0
  129. mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
  130. mlrun/model_monitoring/db/stores/base/store.py +213 -0
  131. mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
  132. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
  133. mlrun/model_monitoring/db/stores/sqldb/models/base.py +190 -0
  134. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +103 -0
  135. mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
  136. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +659 -0
  137. mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
  138. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +726 -0
  139. mlrun/model_monitoring/db/tsdb/__init__.py +105 -0
  140. mlrun/model_monitoring/db/tsdb/base.py +448 -0
  141. mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
  142. mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
  143. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +298 -0
  144. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +42 -0
  145. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +522 -0
  146. mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
  147. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +158 -0
  148. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +849 -0
  149. mlrun/model_monitoring/features_drift_table.py +34 -22
  150. mlrun/model_monitoring/helpers.py +177 -39
  151. mlrun/model_monitoring/model_endpoint.py +3 -2
  152. mlrun/model_monitoring/stream_processing.py +165 -398
  153. mlrun/model_monitoring/tracking_policy.py +7 -1
  154. mlrun/model_monitoring/writer.py +161 -125
  155. mlrun/package/packagers/default_packager.py +2 -2
  156. mlrun/package/packagers_manager.py +1 -0
  157. mlrun/package/utils/_formatter.py +2 -2
  158. mlrun/platforms/__init__.py +11 -10
  159. mlrun/platforms/iguazio.py +67 -228
  160. mlrun/projects/__init__.py +6 -1
  161. mlrun/projects/operations.py +47 -20
  162. mlrun/projects/pipelines.py +396 -249
  163. mlrun/projects/project.py +1176 -406
  164. mlrun/render.py +28 -22
  165. mlrun/run.py +208 -181
  166. mlrun/runtimes/__init__.py +76 -11
  167. mlrun/runtimes/base.py +54 -24
  168. mlrun/runtimes/daskjob.py +9 -2
  169. mlrun/runtimes/databricks_job/databricks_runtime.py +1 -0
  170. mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
  171. mlrun/runtimes/funcdoc.py +1 -29
  172. mlrun/runtimes/kubejob.py +34 -128
  173. mlrun/runtimes/local.py +39 -10
  174. mlrun/runtimes/mpijob/__init__.py +0 -20
  175. mlrun/runtimes/mpijob/abstract.py +8 -8
  176. mlrun/runtimes/mpijob/v1.py +1 -1
  177. mlrun/runtimes/nuclio/__init__.py +1 -0
  178. mlrun/runtimes/nuclio/api_gateway.py +769 -0
  179. mlrun/runtimes/nuclio/application/__init__.py +15 -0
  180. mlrun/runtimes/nuclio/application/application.py +758 -0
  181. mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
  182. mlrun/runtimes/nuclio/function.py +188 -68
  183. mlrun/runtimes/nuclio/serving.py +57 -60
  184. mlrun/runtimes/pod.py +191 -58
  185. mlrun/runtimes/remotesparkjob.py +11 -8
  186. mlrun/runtimes/sparkjob/spark3job.py +17 -18
  187. mlrun/runtimes/utils.py +40 -73
  188. mlrun/secrets.py +6 -2
  189. mlrun/serving/__init__.py +8 -1
  190. mlrun/serving/remote.py +2 -3
  191. mlrun/serving/routers.py +89 -64
  192. mlrun/serving/server.py +54 -26
  193. mlrun/serving/states.py +187 -56
  194. mlrun/serving/utils.py +19 -11
  195. mlrun/serving/v2_serving.py +136 -63
  196. mlrun/track/tracker.py +2 -1
  197. mlrun/track/trackers/mlflow_tracker.py +5 -0
  198. mlrun/utils/async_http.py +26 -6
  199. mlrun/utils/db.py +18 -0
  200. mlrun/utils/helpers.py +375 -105
  201. mlrun/utils/http.py +2 -2
  202. mlrun/utils/logger.py +75 -9
  203. mlrun/utils/notifications/notification/__init__.py +14 -10
  204. mlrun/utils/notifications/notification/base.py +48 -0
  205. mlrun/utils/notifications/notification/console.py +2 -0
  206. mlrun/utils/notifications/notification/git.py +24 -1
  207. mlrun/utils/notifications/notification/ipython.py +2 -0
  208. mlrun/utils/notifications/notification/slack.py +96 -21
  209. mlrun/utils/notifications/notification/webhook.py +63 -2
  210. mlrun/utils/notifications/notification_pusher.py +146 -16
  211. mlrun/utils/regex.py +9 -0
  212. mlrun/utils/retryer.py +3 -2
  213. mlrun/utils/v3io_clients.py +2 -3
  214. mlrun/utils/version/version.json +2 -2
  215. mlrun-1.7.2.dist-info/METADATA +390 -0
  216. mlrun-1.7.2.dist-info/RECORD +351 -0
  217. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/WHEEL +1 -1
  218. mlrun/feature_store/retrieval/conversion.py +0 -271
  219. mlrun/kfpops.py +0 -868
  220. mlrun/model_monitoring/application.py +0 -310
  221. mlrun/model_monitoring/batch.py +0 -974
  222. mlrun/model_monitoring/controller_handler.py +0 -37
  223. mlrun/model_monitoring/prometheus.py +0 -216
  224. mlrun/model_monitoring/stores/__init__.py +0 -111
  225. mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -574
  226. mlrun/model_monitoring/stores/model_endpoint_store.py +0 -145
  227. mlrun/model_monitoring/stores/models/__init__.py +0 -27
  228. mlrun/model_monitoring/stores/models/base.py +0 -84
  229. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -382
  230. mlrun/platforms/other.py +0 -305
  231. mlrun-1.7.0rc4.dist-info/METADATA +0 -269
  232. mlrun-1.7.0rc4.dist-info/RECORD +0 -321
  233. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/LICENSE +0 -0
  234. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/entry_points.txt +0 -0
  235. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/top_level.txt +0 -0
mlrun/datastore/base.py CHANGED
@@ -24,20 +24,16 @@ import pandas as pd
 import pyarrow
 import pytz
 import requests
-import urllib3
 from deprecated import deprecated

+import mlrun.config
 import mlrun.errors
 from mlrun.errors import err_to_str
-from mlrun.utils import StorePrefix, is_ipython, logger
+from mlrun.utils import StorePrefix, is_jupyter, logger

 from .store_resources import is_store_uri, parse_store_uri
 from .utils import filter_df_start_end_time, select_columns_from_df

-verify_ssl = False
-if not verify_ssl:
-    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-

 class FileStats:
     def __init__(self, size, modified, content_type=None):
@@ -160,6 +156,18 @@ class DataStore:
     def put(self, key, data, append=False):
         pass

+    def _prepare_put_data(self, data, append=False):
+        mode = "a" if append else "w"
+        if isinstance(data, bytearray):
+            data = bytes(data)
+
+        if isinstance(data, bytes):
+            return data, f"{mode}b"
+        elif isinstance(data, str):
+            return data, mode
+        else:
+            raise TypeError(f"Unable to put a value of type {type(self).__name__}")
+
     def stat(self, key):
         pass

@@ -182,11 +190,23 @@ class DataStore:
         return {}

     @staticmethod
-    def _parquet_reader(df_module, url, file_system, time_column, start_time, end_time):
+    def _parquet_reader(
+        df_module,
+        url,
+        file_system,
+        time_column,
+        start_time,
+        end_time,
+        additional_filters,
+    ):
         from storey.utils import find_filters, find_partitions

         def set_filters(
-            partitions_time_attributes, start_time_inner, end_time_inner, kwargs
+            partitions_time_attributes,
+            start_time_inner,
+            end_time_inner,
+            filters_inner,
+            kwargs,
         ):
             filters = []
             find_filters(
@@ -196,20 +216,32 @@ class DataStore:
                 filters,
                 time_column,
             )
+            if filters and filters_inner:
+                filters[0] += filters_inner
+
             kwargs["filters"] = filters

         def reader(*args, **kwargs):
-            if start_time or end_time:
-                if time_column is None:
-                    raise mlrun.errors.MLRunInvalidArgumentError(
-                        "When providing start_time or end_time, must provide time_column"
-                    )
+            if time_column is None and (start_time or end_time):
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "When providing start_time or end_time, must provide time_column"
+                )
+            if (
+                start_time
+                and end_time
+                and start_time.utcoffset() != end_time.utcoffset()
+            ):
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "start_time and end_time must have the same time zone"
+                )

+            if start_time or end_time or additional_filters:
                 partitions_time_attributes = find_partitions(url, file_system)
                 set_filters(
                     partitions_time_attributes,
                     start_time,
                     end_time,
+                    additional_filters,
                     kwargs,
                 )
                 try:
@@ -220,17 +252,23 @@ class DataStore:
                     ):
                         raise ex

-                    if start_time.tzinfo:
-                        start_time_inner = start_time.replace(tzinfo=None)
-                        end_time_inner = end_time.replace(tzinfo=None)
-                    else:
-                        start_time_inner = start_time.replace(tzinfo=pytz.utc)
-                        end_time_inner = end_time.replace(tzinfo=pytz.utc)
+                    start_time_inner = None
+                    if start_time:
+                        start_time_inner = start_time.replace(
+                            tzinfo=None if start_time.tzinfo else pytz.utc
+                        )
+
+                    end_time_inner = None
+                    if end_time:
+                        end_time_inner = end_time.replace(
+                            tzinfo=None if end_time.tzinfo else pytz.utc
+                        )

                     set_filters(
                         partitions_time_attributes,
                         start_time_inner,
                         end_time_inner,
+                        additional_filters,
                         kwargs,
                     )
                     return df_module.read_parquet(*args, **kwargs)
@@ -249,6 +287,7 @@ class DataStore:
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
         df_module = df_module or pd
@@ -304,16 +343,18 @@ class DataStore:
             dfs.append(df_module.read_csv(*updated_args, **kwargs))
             return df_module.concat(dfs)

-        elif (
-            file_url.endswith(".parquet")
-            or file_url.endswith(".pq")
-            or format == "parquet"
-        ):
+        elif mlrun.utils.helpers.is_parquet_file(file_url, format):
            if columns:
                kwargs["columns"] = columns

            reader = self._parquet_reader(
-                df_module, url, file_system, time_column, start_time, end_time
+                df_module,
+                url,
+                file_system,
+                time_column,
+                start_time,
+                end_time,
+                additional_filters,
            )

        elif file_url.endswith(".json") or format == "json":
@@ -365,7 +406,10 @@ class DataStore:
         }

     def rm(self, path, recursive=False, maxdepth=None):
-        self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
+        try:
+            self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
+        except FileNotFoundError:
+            pass

     @staticmethod
     def _is_dd(df_module):
@@ -392,14 +436,15 @@ class DataItem:


        # reading run results using DataItem (run.artifact())
-       train_run = train_iris_func.run(inputs={'dataset': dataset},
-                                       params={'label_column': 'label'})
+       train_run = train_iris_func.run(
+           inputs={"dataset": dataset}, params={"label_column": "label"}
+       )

-       train_run.artifact('confusion-matrix').show()
-       test_set = train_run.artifact('test_set').as_df()
+       train_run.artifact("confusion-matrix").show()
+       test_set = train_run.artifact("test_set").as_df()

        # create and use DataItem from uri
-       data = mlrun.get_dataitem('http://xyz/data.json').get()
+       data = mlrun.get_dataitem("http://xyz/data.json").get()
    """

    def __init__(
@@ -541,6 +586,7 @@ class DataItem:
         time_column=None,
         start_time=None,
         end_time=None,
+        additional_filters=None,
         **kwargs,
     ):
         """return a dataframe object (generated from the dataitem).
@@ -552,6 +598,12 @@ class DataItem:
         :param end_time: filters out data after this time
         :param time_column: Store timestamp_key will be used if None.
                             The results will be filtered by this column and start_time & end_time.
+        :param additional_filters: List of additional_filter conditions as tuples.
+                                   Each tuple should be in the format (column_name, operator, value).
+                                   Supported operators: "=", ">=", "<=", ">", "<".
+                                   Example: [("Product", "=", "Computer")]
+                                   For all supported filters, please see:
+                                   https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
         """
         df = self._store.as_df(
             self._url,
@@ -562,18 +614,19 @@ class DataItem:
             time_column=time_column,
             start_time=start_time,
             end_time=end_time,
+            additional_filters=additional_filters,
             **kwargs,
         )
         return df

-    def show(self, format=None):
+    def show(self, format: Optional[str] = None) -> None:
         """show the data object content in Jupyter

         :param format: format to use (when there is no/wrong suffix), e.g. 'png'
         """
-        if not is_ipython:
+        if not is_jupyter:
            logger.warning(
-                "Jupyter/IPython was not detected, .show() will only display inside Jupyter"
+                "Jupyter was not detected. `.show()` displays only inside Jupyter."
            )
            return

@@ -633,17 +686,6 @@ def basic_auth_header(user, password):
     return {"Authorization": authstr}


-def http_get(url, headers=None, auth=None):
-    try:
-        response = requests.get(url, headers=headers, auth=auth, verify=verify_ssl)
-    except OSError as exc:
-        raise OSError(f"error: cannot connect to {url}: {err_to_str(exc)}")
-
-    mlrun.errors.raise_for_status(response)
-
-    return response.content
-
-
 class HttpStore(DataStore):
     def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
         super().__init__(parent, name, schema, endpoint, secrets)
@@ -671,7 +713,7 @@ class HttpStore(DataStore):
         raise ValueError("unimplemented")

     def get(self, key, size=None, offset=0):
-        data = http_get(self.url + self._join(key), self._headers, self.auth)
+        data = self._http_get(self.url + self._join(key), self._headers, self.auth)
         if offset:
             data = data[offset:]
         if size:
@@ -691,13 +733,31 @@ class HttpStore(DataStore):
                 f"schema as it is not secure and is not recommended."
             )

+    def _http_get(
+        self,
+        url,
+        headers=None,
+        auth=None,
+    ):
+        # import here to prevent import cycle
+        from mlrun.config import config as mlconf
+
+        verify_ssl = mlconf.httpdb.http.verify
+        try:
+            response = requests.get(url, headers=headers, auth=auth, verify=verify_ssl)
+        except OSError as exc:
+            raise OSError(f"error: cannot connect to {url}: {err_to_str(exc)}")
+
+        mlrun.errors.raise_for_status(response)
+        return response.content
+

 # This wrapper class is designed to extract the 'ds' schema and profile name from URL-formatted paths.
 # Within fsspec, the AbstractFileSystem::_strip_protocol() internal method is used to handle complete URL paths.
 # As an example, it converts an S3 URL 's3://s3bucket/path' to just 's3bucket/path'.
 # Since 'ds' schemas are not inherently processed by fsspec, we have adapted the _strip_protocol()
 # method specifically to strip away the 'ds' schema as required.
-def makeDatastoreSchemaSanitizer(cls, using_bucket=False, *args, **kwargs):
+def make_datastore_schema_sanitizer(cls, using_bucket=False, *args, **kwargs):
     if not issubclass(cls, fsspec.AbstractFileSystem):
         raise ValueError("Class must be a subclass of fsspec.AbstractFileSystem")

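The thread running through the base.py changes above is the new additional_filters argument: DataItem.as_df() and DataStore.as_df() now accept it and forward it to _parquet_reader(), where the tuples are merged into the filters passed to read_parquet(). A minimal usage sketch based on the docstring added in this diff; the artifact URL is hypothetical:

import mlrun

# Hypothetical parquet dataset; any store supported by mlrun.get_dataitem() works.
item = mlrun.get_dataitem("s3://my-bucket/sales.parquet")

# Keep only rows where Product == "Computer". Supported operators, per the
# docstring above: "=", ">=", "<=", ">", "<".
df = item.as_df(additional_filters=[("Product", "=", "Computer")])
print(df.head())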
mlrun/datastore/datastore.py CHANGED
@@ -21,7 +21,7 @@ from mlrun.datastore.datastore_profile import datastore_profile_read
 from mlrun.errors import err_to_str
 from mlrun.utils.helpers import get_local_file_schema

-from ..utils import DB_SCHEMA, run_keys
+from ..utils import DB_SCHEMA, RunKeys
 from .base import DataItem, DataStore, HttpStore
 from .filestore import FileStore
 from .inmem import InMemoryStore
@@ -32,6 +32,8 @@ in_memory_store = InMemoryStore()


 def parse_url(url):
+    if url and url.startswith("v3io://") and not url.startswith("v3io:///"):
+        url = url.replace("v3io://", "v3io:///", 1)
     parsed_url = urlparse(url)
     schema = parsed_url.scheme.lower()
     endpoint = parsed_url.hostname
@@ -94,10 +96,14 @@ def schema_to_store(schema):
        from .dbfs_store import DBFSStore

        return DBFSStore
-    elif schema == "hdfs":
+    elif schema in ["hdfs", "webhdfs"]:
        from .hdfs import HdfsStore

        return HdfsStore
+    elif schema == "oss":
+        from .alibaba_oss import OSSStore
+
+        return OSSStore
    else:
        raise ValueError(f"unsupported store scheme ({schema})")

@@ -129,7 +135,7 @@ class StoreManager:
         return self._db

     def from_dict(self, struct: dict):
-        stor_list = struct.get(run_keys.data_stores)
+        stor_list = struct.get(RunKeys.data_stores)
         if stor_list and isinstance(stor_list, list):
             for stor in stor_list:
                 schema, endpoint, parsed_url = parse_url(stor.get("url"))
@@ -141,7 +147,7 @@ class StoreManager:
             self._stores[stor["name"]] = new_stor

     def to_dict(self, struct):
-        struct[run_keys.data_stores] = [
+        struct[RunKeys.data_stores] = [
             stor.to_dict() for stor in self._stores.values() if stor.from_spec
         ]

@@ -203,7 +209,7 @@ class StoreManager:
     ) -> (DataStore, str, str):
         schema, endpoint, parsed_url = parse_url(url)
         subpath = parsed_url.path
-        store_key = f"{schema}://{endpoint}"
+        store_key = f"{schema}://{endpoint}" if endpoint else f"{schema}://"

         if schema == "ds":
             datastore_profile = datastore_profile_read(url, project_name, secrets)
@@ -219,6 +225,11 @@ class StoreManager:
             subpath = url[len("memory://") :]
             return in_memory_store, subpath, url

+        elif schema in get_local_file_schema():
+            # parse_url() will drop the windows drive-letter from the path for url like "c:\a\b".
+            # As a workaround, we set subpath to the url.
+            subpath = url.replace("file://", "", 1)
+
         if not schema and endpoint:
             if endpoint in self._stores.keys():
                 return self._stores[endpoint], subpath, url
@@ -237,8 +248,7 @@ class StoreManager:
         )
         if not secrets and not mlrun.config.is_running_as_api():
             self._stores[store_key] = store
-        # in file stores in windows path like c:\a\b the drive letter is dropped from the path, so we return the url
-        return store, url if store.kind == "file" else subpath, url
+        return store, subpath, url

     def reset_secrets(self):
         self._secrets = {}
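The v3io rewrite added to parse_url() forces an empty authority component, so urlparse() no longer treats the first path segment of a v3io URL (the data container) as a hostname. A standalone sketch of the same rewrite, using a helper name of our own, to illustrate the effect:

from urllib.parse import urlparse

def normalize_v3io(url: str) -> str:
    # Same rewrite as the new lines in parse_url() above: force an empty
    # netloc so the container stays part of the path.
    if url and url.startswith("v3io://") and not url.startswith("v3io:///"):
        url = url.replace("v3io://", "v3io:///", 1)
    return url

parsed = urlparse(normalize_v3io("v3io://projects/demo/artifacts/data.csv"))
print(parsed.hostname)  # None - no host component is parsed
print(parsed.path)      # /projects/demo/artifacts/data.csv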
mlrun/datastore/datastore_profile.py CHANGED
@@ -16,6 +16,7 @@ import ast
 import base64
 import json
 import typing
+import warnings
 from urllib.parse import ParseResult, urlparse, urlunparse

 import pydantic
@@ -36,6 +37,7 @@ class DatastoreProfile(pydantic.BaseModel):
         extra = pydantic.Extra.forbid

     @pydantic.validator("name")
+    @classmethod
     def lower_case(cls, v):
         return v.lower()

@@ -68,6 +70,9 @@ class TemporaryClientDatastoreProfiles(metaclass=mlrun.utils.singleton.Singleton
     def get(self, key):
         return self._data.get(key, None)

+    def remove(self, key):
+        self._data.pop(key, None)
+

 class DatastoreProfileBasic(DatastoreProfile):
     type: str = pydantic.Field("basic")
@@ -79,13 +84,37 @@ class DatastoreProfileBasic(DatastoreProfile):
 class DatastoreProfileKafkaTarget(DatastoreProfile):
     type: str = pydantic.Field("kafka_target")
     _private_attributes = "kwargs_private"
-    bootstrap_servers: str
+    bootstrap_servers: typing.Optional[str] = None
+    brokers: typing.Optional[str] = None
     topic: str
     kwargs_public: typing.Optional[dict]
     kwargs_private: typing.Optional[dict]

+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        if not self.brokers and not self.bootstrap_servers:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "DatastoreProfileKafkaTarget requires the 'brokers' field to be set"
+            )
+
+        if self.bootstrap_servers:
+            if self.brokers:
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "DatastoreProfileKafkaTarget cannot be created with both 'brokers' and 'bootstrap_servers'"
+                )
+            else:
+                self.brokers = self.bootstrap_servers
+                self.bootstrap_servers = None
+                warnings.warn(
+                    "'bootstrap_servers' parameter is deprecated in 1.7.0 and will be removed in 1.9.0, "
+                    "use 'brokers' instead.",
+                    # TODO: Remove this in 1.9.0
+                    FutureWarning,
+                )
+
     def attributes(self):
-        attributes = {"bootstrap_servers": self.bootstrap_servers}
+        attributes = {"brokers": self.brokers or self.bootstrap_servers}
         if self.kwargs_public:
             attributes = merge(attributes, self.kwargs_public)
         if self.kwargs_private:
@@ -157,6 +186,18 @@ class DatastoreProfileS3(DatastoreProfile):
     assume_role_arn: typing.Optional[str] = None
     access_key_id: typing.Optional[str] = None
     secret_key: typing.Optional[str] = None
+    bucket: typing.Optional[str] = None
+
+    @pydantic.validator("bucket")
+    @classmethod
+    def check_bucket(cls, v):
+        if not v:
+            warnings.warn(
+                "The 'bucket' attribute will be mandatory starting from version 1.9",
+                FutureWarning,
+                stacklevel=2,
+            )
+        return v

     def secrets(self) -> dict:
         res = {}
@@ -175,7 +216,13 @@ class DatastoreProfileS3(DatastoreProfile):
         return res

     def url(self, subpath):
-        return f"s3:/{subpath}"
+        # TODO: There is an inconsistency with DatastoreProfileGCS. In DatastoreProfileGCS,
+        # we assume that the subpath can begin without a '/' character,
+        # while here we assume it always starts with one.
+        if self.bucket:
+            return f"s3://{self.bucket}{subpath}"
+        else:
+            return f"s3:/{subpath}"


 class DatastoreProfileRedis(DatastoreProfile):
@@ -244,18 +291,36 @@ class DatastoreProfileGCS(DatastoreProfile):
     _private_attributes = ("gcp_credentials",)
     credentials_path: typing.Optional[str] = None  # path to file.
     gcp_credentials: typing.Optional[typing.Union[str, dict]] = None
+    bucket: typing.Optional[str] = None
+
+    @pydantic.validator("bucket")
+    @classmethod
+    def check_bucket(cls, v):
+        if not v:
+            warnings.warn(
+                "The 'bucket' attribute will be mandatory starting from version 1.9",
+                FutureWarning,
+                stacklevel=2,
+            )
+        return v

     @pydantic.validator("gcp_credentials", pre=True, always=True)
+    @classmethod
     def convert_dict_to_json(cls, v):
         if isinstance(v, dict):
             return json.dumps(v)
         return v

     def url(self, subpath) -> str:
+        # TODO: but there's something wrong with the subpath being assumed to not start with a slash here,
+        # but the opposite assumption is made in S3.
         if subpath.startswith("/"):
             # in gcs the path after schema is starts with bucket, wherefore it should not start with "/".
             subpath = subpath[1:]
-        return f"gcs://{subpath}"
+        if self.bucket:
+            return f"gcs://{self.bucket}/{subpath}"
+        else:
+            return f"gcs://{subpath}"

     def secrets(self) -> dict:
         res = {}
@@ -283,12 +348,27 @@ class DatastoreProfileAzureBlob(DatastoreProfile):
     client_secret: typing.Optional[str] = None
     sas_token: typing.Optional[str] = None
     credential: typing.Optional[str] = None
+    container: typing.Optional[str] = None
+
+    @pydantic.validator("container")
+    @classmethod
+    def check_container(cls, v):
+        if not v:
+            warnings.warn(
+                "The 'container' attribute will be mandatory starting from version 1.9",
+                FutureWarning,
+                stacklevel=2,
+            )
+        return v

     def url(self, subpath) -> str:
         if subpath.startswith("/"):
-            # in azure the path after schema is starts with bucket, wherefore it should not start with "/".
+            # in azure the path after schema is starts with container, wherefore it should not start with "/".
             subpath = subpath[1:]
-        return f"az://{subpath}"
+        if self.container:
+            return f"az://{self.container}/{subpath}"
+        else:
+            return f"az://{subpath}"

     def secrets(self) -> dict:
         res = {}
@@ -332,7 +412,7 @@ class DatastoreProfileHdfs(DatastoreProfile):
         return res or None

     def url(self, subpath):
-        return f"hdfs://{self.host}:{self.http_port}{subpath}"
+        return f"webhdfs://{self.host}:{self.http_port}{subpath}"


 class DatastoreProfile2Json(pydantic.BaseModel):
@@ -460,3 +540,7 @@ def register_temporary_client_datastore_profile(profile: DatastoreProfile):
     It's beneficial for testing purposes.
     """
     TemporaryClientDatastoreProfiles().add(profile)
+
+
+def remove_temporary_client_datastore_profile(profile_name: str):
+    TemporaryClientDatastoreProfiles().remove(profile_name)
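Taken together, the datastore_profile.py changes add an optional bucket (or container) field to the S3, GCS, and Azure profiles, deprecate DatastoreProfileKafkaTarget's bootstrap_servers in favor of brokers, and expose remove_temporary_client_datastore_profile(). A usage sketch based only on the fields visible in this diff; the profile names and values are illustrative:

from mlrun.datastore.datastore_profile import (
    DatastoreProfileKafkaTarget,
    DatastoreProfileS3,
    register_temporary_client_datastore_profile,
    remove_temporary_client_datastore_profile,
)

# 'brokers' replaces the deprecated 'bootstrap_servers' argument.
kafka_profile = DatastoreProfileKafkaTarget(
    name="my-kafka", brokers="broker1:9092", topic="events"
)

# With the new 'bucket' field, the profile URL carries the bucket itself,
# so ds:// paths only need to supply the object key.
s3_profile = DatastoreProfileS3(name="my-s3", bucket="my-bucket")
print(s3_profile.url("/datasets/iris.parquet"))  # s3://my-bucket/datasets/iris.parquet

# Temporary (client-only) registration, now with a matching removal call.
register_temporary_client_datastore_profile(s3_profile)
remove_temporary_client_datastore_profile("my-s3")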
mlrun/datastore/dbfs_store.py CHANGED
@@ -19,7 +19,7 @@ from fsspec.registry import get_filesystem_class

 import mlrun.errors

-from .base import DataStore, FileStats, makeDatastoreSchemaSanitizer
+from .base import DataStore, FileStats, make_datastore_schema_sanitizer


 class DatabricksFileBugFixed(DatabricksFile):
@@ -89,7 +89,7 @@ class DBFSStore(DataStore):
         """return fsspec file system object, if supported"""
         filesystem_class = get_filesystem_class(protocol=self.kind)
         if not self._filesystem:
-            self._filesystem = makeDatastoreSchemaSanitizer(
+            self._filesystem = make_datastore_schema_sanitizer(
                 cls=filesystem_class,
                 using_bucket=False,
                 **self.get_storage_options(),
@@ -130,11 +130,7 @@ class DBFSStore(DataStore):
                 "Append mode not supported for Databricks file system"
             )
         # can not use append mode because it overrides data.
-        mode = "w"
-        if isinstance(data, bytes):
-            mode += "b"
-        elif not isinstance(data, str):
-            raise TypeError(f"Unknown data type {type(data)}")
+        data, mode = self._prepare_put_data(data, append)
         with self.filesystem.open(key, mode) as f:
             f.write(data)
mlrun/datastore/filestore.py CHANGED
@@ -66,9 +66,7 @@ class FileStore(DataStore):
         dir_to_create = path.dirname(self._join(key))
         if dir_to_create:
             self._ensure_directory(dir_to_create)
-        mode = "a" if append else "w"
-        if isinstance(data, bytes):
-            mode = mode + "b"
+        data, mode = self._prepare_put_data(data, append)
         with open(self._join(key), mode) as fp:
             fp.write(data)
             fp.close()
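Both put() implementations above now delegate mode selection to the shared DataStore._prepare_put_data() helper introduced in base.py, which converts bytearray input to bytes and picks a text or binary file mode. A standalone re-implementation of that logic, shown only to make the behaviour concrete (error message simplified relative to the helper):

def prepare_put_data(data, append=False):
    # Mirrors DataStore._prepare_put_data() from the base.py hunk above.
    mode = "a" if append else "w"
    if isinstance(data, bytearray):
        data = bytes(data)
    if isinstance(data, bytes):
        return data, f"{mode}b"
    if isinstance(data, str):
        return data, mode
    raise TypeError(f"Unable to put a value of type {type(data).__name__}")

print(prepare_put_data("text")[1])               # w
print(prepare_put_data(b"blob", append=True)[1]) # ab
print(prepare_put_data(bytearray(b"xy"))[1])     # wb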