mlrun 1.6.4rc2__py3-none-any.whl → 1.7.0rc20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.

Files changed (291)
  1. mlrun/__init__.py +11 -1
  2. mlrun/__main__.py +26 -112
  3. mlrun/alerts/__init__.py +15 -0
  4. mlrun/alerts/alert.py +144 -0
  5. mlrun/api/schemas/__init__.py +5 -4
  6. mlrun/artifacts/__init__.py +8 -3
  7. mlrun/artifacts/base.py +46 -257
  8. mlrun/artifacts/dataset.py +11 -192
  9. mlrun/artifacts/manager.py +47 -48
  10. mlrun/artifacts/model.py +31 -159
  11. mlrun/artifacts/plots.py +23 -380
  12. mlrun/common/constants.py +69 -0
  13. mlrun/common/db/sql_session.py +2 -3
  14. mlrun/common/formatters/__init__.py +19 -0
  15. mlrun/common/formatters/artifact.py +21 -0
  16. mlrun/common/formatters/base.py +78 -0
  17. mlrun/common/formatters/function.py +41 -0
  18. mlrun/common/formatters/pipeline.py +53 -0
  19. mlrun/common/formatters/project.py +51 -0
  20. mlrun/common/helpers.py +1 -2
  21. mlrun/common/model_monitoring/helpers.py +9 -5
  22. mlrun/{runtimes → common/runtimes}/constants.py +37 -9
  23. mlrun/common/schemas/__init__.py +24 -4
  24. mlrun/common/schemas/alert.py +203 -0
  25. mlrun/common/schemas/api_gateway.py +148 -0
  26. mlrun/common/schemas/artifact.py +18 -8
  27. mlrun/common/schemas/auth.py +11 -5
  28. mlrun/common/schemas/background_task.py +1 -1
  29. mlrun/common/schemas/client_spec.py +4 -1
  30. mlrun/common/schemas/feature_store.py +16 -16
  31. mlrun/common/schemas/frontend_spec.py +8 -7
  32. mlrun/common/schemas/function.py +5 -1
  33. mlrun/common/schemas/hub.py +11 -18
  34. mlrun/common/schemas/memory_reports.py +2 -2
  35. mlrun/common/schemas/model_monitoring/__init__.py +18 -3
  36. mlrun/common/schemas/model_monitoring/constants.py +83 -26
  37. mlrun/common/schemas/model_monitoring/grafana.py +13 -9
  38. mlrun/common/schemas/model_monitoring/model_endpoints.py +99 -16
  39. mlrun/common/schemas/notification.py +4 -4
  40. mlrun/common/schemas/object.py +2 -2
  41. mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
  42. mlrun/common/schemas/pipeline.py +1 -10
  43. mlrun/common/schemas/project.py +24 -23
  44. mlrun/common/schemas/runtime_resource.py +8 -12
  45. mlrun/common/schemas/schedule.py +3 -3
  46. mlrun/common/schemas/tag.py +1 -2
  47. mlrun/common/schemas/workflow.py +2 -2
  48. mlrun/common/types.py +7 -1
  49. mlrun/config.py +54 -17
  50. mlrun/data_types/to_pandas.py +10 -12
  51. mlrun/datastore/__init__.py +5 -8
  52. mlrun/datastore/alibaba_oss.py +130 -0
  53. mlrun/datastore/azure_blob.py +17 -5
  54. mlrun/datastore/base.py +62 -39
  55. mlrun/datastore/datastore.py +28 -9
  56. mlrun/datastore/datastore_profile.py +146 -20
  57. mlrun/datastore/filestore.py +0 -1
  58. mlrun/datastore/google_cloud_storage.py +6 -2
  59. mlrun/datastore/hdfs.py +56 -0
  60. mlrun/datastore/inmem.py +2 -2
  61. mlrun/datastore/redis.py +6 -2
  62. mlrun/datastore/s3.py +9 -0
  63. mlrun/datastore/snowflake_utils.py +43 -0
  64. mlrun/datastore/sources.py +201 -96
  65. mlrun/datastore/spark_utils.py +1 -2
  66. mlrun/datastore/store_resources.py +7 -7
  67. mlrun/datastore/targets.py +358 -104
  68. mlrun/datastore/utils.py +72 -58
  69. mlrun/datastore/v3io.py +5 -1
  70. mlrun/db/base.py +185 -35
  71. mlrun/db/factory.py +1 -1
  72. mlrun/db/httpdb.py +614 -179
  73. mlrun/db/nopdb.py +210 -26
  74. mlrun/errors.py +12 -1
  75. mlrun/execution.py +41 -24
  76. mlrun/feature_store/__init__.py +0 -2
  77. mlrun/feature_store/api.py +40 -72
  78. mlrun/feature_store/common.py +1 -1
  79. mlrun/feature_store/feature_set.py +76 -55
  80. mlrun/feature_store/feature_vector.py +28 -30
  81. mlrun/feature_store/ingestion.py +7 -6
  82. mlrun/feature_store/retrieval/base.py +16 -11
  83. mlrun/feature_store/retrieval/conversion.py +11 -13
  84. mlrun/feature_store/retrieval/dask_merger.py +2 -0
  85. mlrun/feature_store/retrieval/job.py +9 -3
  86. mlrun/feature_store/retrieval/local_merger.py +2 -0
  87. mlrun/feature_store/retrieval/spark_merger.py +34 -24
  88. mlrun/feature_store/steps.py +37 -34
  89. mlrun/features.py +9 -20
  90. mlrun/frameworks/_common/artifacts_library.py +9 -9
  91. mlrun/frameworks/_common/mlrun_interface.py +5 -5
  92. mlrun/frameworks/_common/model_handler.py +48 -48
  93. mlrun/frameworks/_common/plan.py +2 -3
  94. mlrun/frameworks/_common/producer.py +3 -4
  95. mlrun/frameworks/_common/utils.py +5 -5
  96. mlrun/frameworks/_dl_common/loggers/logger.py +6 -7
  97. mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +9 -9
  98. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +23 -47
  99. mlrun/frameworks/_ml_common/artifacts_library.py +1 -2
  100. mlrun/frameworks/_ml_common/loggers/logger.py +3 -4
  101. mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +4 -5
  102. mlrun/frameworks/_ml_common/model_handler.py +24 -24
  103. mlrun/frameworks/_ml_common/pkl_model_server.py +2 -2
  104. mlrun/frameworks/_ml_common/plan.py +1 -1
  105. mlrun/frameworks/_ml_common/plans/calibration_curve_plan.py +2 -3
  106. mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +2 -3
  107. mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
  108. mlrun/frameworks/_ml_common/plans/feature_importance_plan.py +3 -3
  109. mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
  110. mlrun/frameworks/_ml_common/utils.py +4 -4
  111. mlrun/frameworks/auto_mlrun/auto_mlrun.py +9 -9
  112. mlrun/frameworks/huggingface/model_server.py +4 -4
  113. mlrun/frameworks/lgbm/__init__.py +33 -33
  114. mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
  115. mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -5
  116. mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -5
  117. mlrun/frameworks/lgbm/mlrun_interfaces/booster_mlrun_interface.py +1 -3
  118. mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +6 -6
  119. mlrun/frameworks/lgbm/model_handler.py +10 -10
  120. mlrun/frameworks/lgbm/model_server.py +6 -6
  121. mlrun/frameworks/lgbm/utils.py +5 -5
  122. mlrun/frameworks/onnx/dataset.py +8 -8
  123. mlrun/frameworks/onnx/mlrun_interface.py +3 -3
  124. mlrun/frameworks/onnx/model_handler.py +6 -6
  125. mlrun/frameworks/onnx/model_server.py +7 -7
  126. mlrun/frameworks/parallel_coordinates.py +4 -3
  127. mlrun/frameworks/pytorch/__init__.py +18 -18
  128. mlrun/frameworks/pytorch/callbacks/callback.py +4 -5
  129. mlrun/frameworks/pytorch/callbacks/logging_callback.py +17 -17
  130. mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +11 -11
  131. mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +23 -29
  132. mlrun/frameworks/pytorch/callbacks_handler.py +38 -38
  133. mlrun/frameworks/pytorch/mlrun_interface.py +20 -20
  134. mlrun/frameworks/pytorch/model_handler.py +17 -17
  135. mlrun/frameworks/pytorch/model_server.py +7 -7
  136. mlrun/frameworks/sklearn/__init__.py +13 -13
  137. mlrun/frameworks/sklearn/estimator.py +4 -4
  138. mlrun/frameworks/sklearn/metrics_library.py +14 -14
  139. mlrun/frameworks/sklearn/mlrun_interface.py +3 -6
  140. mlrun/frameworks/sklearn/model_handler.py +2 -2
  141. mlrun/frameworks/tf_keras/__init__.py +10 -7
  142. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +15 -15
  143. mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +11 -11
  144. mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +19 -23
  145. mlrun/frameworks/tf_keras/mlrun_interface.py +9 -11
  146. mlrun/frameworks/tf_keras/model_handler.py +14 -14
  147. mlrun/frameworks/tf_keras/model_server.py +6 -6
  148. mlrun/frameworks/xgboost/__init__.py +13 -13
  149. mlrun/frameworks/xgboost/model_handler.py +6 -6
  150. mlrun/k8s_utils.py +14 -16
  151. mlrun/launcher/__init__.py +1 -1
  152. mlrun/launcher/base.py +16 -15
  153. mlrun/launcher/client.py +8 -6
  154. mlrun/launcher/factory.py +1 -1
  155. mlrun/launcher/local.py +17 -11
  156. mlrun/launcher/remote.py +16 -10
  157. mlrun/lists.py +7 -6
  158. mlrun/model.py +238 -73
  159. mlrun/model_monitoring/__init__.py +1 -1
  160. mlrun/model_monitoring/api.py +138 -315
  161. mlrun/model_monitoring/application.py +5 -296
  162. mlrun/model_monitoring/applications/__init__.py +24 -0
  163. mlrun/model_monitoring/applications/_application_steps.py +157 -0
  164. mlrun/model_monitoring/applications/base.py +282 -0
  165. mlrun/model_monitoring/applications/context.py +214 -0
  166. mlrun/model_monitoring/applications/evidently_base.py +211 -0
  167. mlrun/model_monitoring/applications/histogram_data_drift.py +349 -0
  168. mlrun/model_monitoring/applications/results.py +99 -0
  169. mlrun/model_monitoring/controller.py +104 -84
  170. mlrun/model_monitoring/controller_handler.py +13 -5
  171. mlrun/model_monitoring/db/__init__.py +18 -0
  172. mlrun/model_monitoring/{stores → db/stores}/__init__.py +43 -36
  173. mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
  174. mlrun/model_monitoring/{stores/model_endpoint_store.py → db/stores/base/store.py} +64 -40
  175. mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
  176. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
  177. mlrun/model_monitoring/{stores → db/stores/sqldb}/models/base.py +109 -5
  178. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +88 -0
  179. mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
  180. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +684 -0
  181. mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
  182. mlrun/model_monitoring/{stores/kv_model_endpoint_store.py → db/stores/v3io_kv/kv_store.py} +310 -165
  183. mlrun/model_monitoring/db/tsdb/__init__.py +100 -0
  184. mlrun/model_monitoring/db/tsdb/base.py +329 -0
  185. mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
  186. mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
  187. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +240 -0
  188. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +45 -0
  189. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +397 -0
  190. mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
  191. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +117 -0
  192. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +630 -0
  193. mlrun/model_monitoring/evidently_application.py +6 -118
  194. mlrun/model_monitoring/features_drift_table.py +134 -106
  195. mlrun/model_monitoring/helpers.py +127 -28
  196. mlrun/model_monitoring/metrics/__init__.py +13 -0
  197. mlrun/model_monitoring/metrics/histogram_distance.py +127 -0
  198. mlrun/model_monitoring/model_endpoint.py +3 -2
  199. mlrun/model_monitoring/prometheus.py +1 -4
  200. mlrun/model_monitoring/stream_processing.py +62 -231
  201. mlrun/model_monitoring/tracking_policy.py +9 -2
  202. mlrun/model_monitoring/writer.py +152 -124
  203. mlrun/package/__init__.py +6 -6
  204. mlrun/package/context_handler.py +5 -5
  205. mlrun/package/packager.py +7 -7
  206. mlrun/package/packagers/default_packager.py +6 -6
  207. mlrun/package/packagers/numpy_packagers.py +15 -15
  208. mlrun/package/packagers/pandas_packagers.py +5 -5
  209. mlrun/package/packagers/python_standard_library_packagers.py +10 -10
  210. mlrun/package/packagers_manager.py +19 -23
  211. mlrun/package/utils/_formatter.py +6 -6
  212. mlrun/package/utils/_pickler.py +2 -2
  213. mlrun/package/utils/_supported_format.py +4 -4
  214. mlrun/package/utils/log_hint_utils.py +2 -2
  215. mlrun/package/utils/type_hint_utils.py +4 -9
  216. mlrun/platforms/__init__.py +11 -10
  217. mlrun/platforms/iguazio.py +24 -203
  218. mlrun/projects/operations.py +35 -21
  219. mlrun/projects/pipelines.py +68 -99
  220. mlrun/projects/project.py +830 -266
  221. mlrun/render.py +3 -11
  222. mlrun/run.py +162 -166
  223. mlrun/runtimes/__init__.py +62 -7
  224. mlrun/runtimes/base.py +39 -32
  225. mlrun/runtimes/daskjob.py +8 -8
  226. mlrun/runtimes/databricks_job/databricks_cancel_task.py +1 -1
  227. mlrun/runtimes/databricks_job/databricks_runtime.py +7 -7
  228. mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
  229. mlrun/runtimes/funcdoc.py +0 -28
  230. mlrun/runtimes/function_reference.py +1 -1
  231. mlrun/runtimes/kubejob.py +28 -122
  232. mlrun/runtimes/local.py +6 -3
  233. mlrun/runtimes/mpijob/__init__.py +0 -20
  234. mlrun/runtimes/mpijob/abstract.py +9 -10
  235. mlrun/runtimes/mpijob/v1.py +1 -1
  236. mlrun/{model_monitoring/stores/models/sqlite.py → runtimes/nuclio/__init__.py} +7 -9
  237. mlrun/runtimes/nuclio/api_gateway.py +709 -0
  238. mlrun/runtimes/nuclio/application/__init__.py +15 -0
  239. mlrun/runtimes/nuclio/application/application.py +523 -0
  240. mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
  241. mlrun/runtimes/{function.py → nuclio/function.py} +112 -73
  242. mlrun/runtimes/{nuclio.py → nuclio/nuclio.py} +6 -6
  243. mlrun/runtimes/{serving.py → nuclio/serving.py} +45 -51
  244. mlrun/runtimes/pod.py +286 -88
  245. mlrun/runtimes/remotesparkjob.py +2 -2
  246. mlrun/runtimes/sparkjob/spark3job.py +51 -34
  247. mlrun/runtimes/utils.py +7 -75
  248. mlrun/secrets.py +9 -5
  249. mlrun/serving/remote.py +2 -7
  250. mlrun/serving/routers.py +13 -10
  251. mlrun/serving/server.py +22 -26
  252. mlrun/serving/states.py +99 -25
  253. mlrun/serving/utils.py +3 -3
  254. mlrun/serving/v1_serving.py +6 -7
  255. mlrun/serving/v2_serving.py +59 -20
  256. mlrun/track/tracker.py +2 -1
  257. mlrun/track/tracker_manager.py +3 -3
  258. mlrun/track/trackers/mlflow_tracker.py +1 -2
  259. mlrun/utils/async_http.py +5 -7
  260. mlrun/utils/azure_vault.py +1 -1
  261. mlrun/utils/clones.py +1 -2
  262. mlrun/utils/condition_evaluator.py +3 -3
  263. mlrun/utils/db.py +3 -3
  264. mlrun/utils/helpers.py +183 -197
  265. mlrun/utils/http.py +2 -5
  266. mlrun/utils/logger.py +76 -14
  267. mlrun/utils/notifications/notification/__init__.py +17 -12
  268. mlrun/utils/notifications/notification/base.py +14 -2
  269. mlrun/utils/notifications/notification/console.py +2 -0
  270. mlrun/utils/notifications/notification/git.py +3 -1
  271. mlrun/utils/notifications/notification/ipython.py +3 -1
  272. mlrun/utils/notifications/notification/slack.py +101 -21
  273. mlrun/utils/notifications/notification/webhook.py +11 -1
  274. mlrun/utils/notifications/notification_pusher.py +155 -30
  275. mlrun/utils/retryer.py +208 -0
  276. mlrun/utils/singleton.py +1 -1
  277. mlrun/utils/v3io_clients.py +2 -4
  278. mlrun/utils/version/version.json +2 -2
  279. mlrun/utils/version/version.py +2 -6
  280. {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/METADATA +31 -19
  281. mlrun-1.7.0rc20.dist-info/RECORD +353 -0
  282. mlrun/kfpops.py +0 -868
  283. mlrun/model_monitoring/batch.py +0 -1095
  284. mlrun/model_monitoring/stores/models/__init__.py +0 -27
  285. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -384
  286. mlrun/platforms/other.py +0 -306
  287. mlrun-1.6.4rc2.dist-info/RECORD +0 -314
  288. {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/LICENSE +0 -0
  289. {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/WHEEL +0 -0
  290. {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/entry_points.txt +0 -0
  291. {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/top_level.txt +0 -0
mlrun/datastore/base.py CHANGED
@@ -144,6 +144,10 @@ class DataStore:
     def url(self):
         return f"{self.kind}://{self.endpoint}"
 
+    @property
+    def spark_url(self):
+        return self.url
+
     def get(self, key, size=None, offset=0):
         pass
 
@@ -175,11 +179,23 @@ class DataStore:
         return {}
 
     @staticmethod
-    def _parquet_reader(df_module, url, file_system, time_column, start_time, end_time):
+    def _parquet_reader(
+        df_module,
+        url,
+        file_system,
+        time_column,
+        start_time,
+        end_time,
+        additional_filters,
+    ):
         from storey.utils import find_filters, find_partitions
 
         def set_filters(
-            partitions_time_attributes, start_time_inner, end_time_inner, kwargs
+            partitions_time_attributes,
+            start_time_inner,
+            end_time_inner,
+            filters_inner,
+            kwargs,
         ):
             filters = []
             find_filters(
@@ -189,20 +205,23 @@ class DataStore:
                 filters,
                 time_column,
             )
+            if filters and filters_inner:
+                filters[0] += filters_inner
+
             kwargs["filters"] = filters
 
         def reader(*args, **kwargs):
-            if start_time or end_time:
-                if time_column is None:
-                    raise mlrun.errors.MLRunInvalidArgumentError(
-                        "When providing start_time or end_time, must provide time_column"
-                    )
-
+            if time_column is None and (start_time or end_time):
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "When providing start_time or end_time, must provide time_column"
+                )
+            if start_time or end_time or additional_filters:
                 partitions_time_attributes = find_partitions(url, file_system)
                 set_filters(
                     partitions_time_attributes,
                     start_time,
                     end_time,
+                    additional_filters,
                     kwargs,
                 )
                 try:
@@ -213,6 +232,7 @@ class DataStore:
                 ):
                     raise ex
 
+                # TODO: fix timezone issue (ML-6308)
                 if start_time.tzinfo:
                     start_time_inner = start_time.replace(tzinfo=None)
                     end_time_inner = end_time.replace(tzinfo=None)
@@ -224,6 +244,7 @@ class DataStore:
                     partitions_time_attributes,
                     start_time_inner,
                     end_time_inner,
+                    additional_filters,
                     kwargs,
                 )
             return df_module.read_parquet(*args, **kwargs)
@@ -242,6 +263,7 @@ class DataStore:
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
         df_module = df_module or pd
@@ -306,7 +328,13 @@ class DataStore:
             kwargs["columns"] = columns
 
             reader = self._parquet_reader(
-                df_module, url, file_system, time_column, start_time, end_time
+                df_module,
+                url,
+                file_system,
+                time_column,
+                start_time,
+                end_time,
+                additional_filters,
             )
 
         elif file_url.endswith(".json") or format == "json":
@@ -317,31 +345,17 @@ class DataStore:
             raise Exception(f"File type unhandled {url}")
 
         if file_system:
-            if (
-                self.supports_isdir()
-                and file_system.isdir(file_url)
-                or self._is_dd(df_module)
-            ):
-                storage_options = self.get_storage_options()
-                if url.startswith("ds://"):
-                    parsed_url = urllib.parse.urlparse(url)
-                    url = parsed_url.path
-                    if self.using_bucket:
-                        url = url[1:]
-                    # Pass the underlying file system
-                    kwargs["filesystem"] = file_system
-                elif storage_options:
-                    kwargs["storage_options"] = storage_options
-                df = reader(url, **kwargs)
-            else:
-                file = url
-                # Workaround for ARROW-12472 affecting pyarrow 3.x and 4.x.
-                if file_system.protocol != "file":
-                    # If not dir, use file_system.open() to avoid regression when pandas < 1.2 and does not
-                    # support the storage_options parameter.
-                    file = file_system.open(url)
-
-                df = reader(file, **kwargs)
+            storage_options = self.get_storage_options()
+            if url.startswith("ds://"):
+                parsed_url = urllib.parse.urlparse(url)
+                url = parsed_url.path
+                if self.using_bucket:
+                    url = url[1:]
+                # Pass the underlying file system
+                kwargs["filesystem"] = file_system
+            elif storage_options:
+                kwargs["storage_options"] = storage_options
+            df = reader(url, **kwargs)
         else:
             temp_file = tempfile.NamedTemporaryFile(delete=False)
             self.download(self._join(subpath), temp_file.name)
@@ -399,14 +413,15 @@ class DataItem:
 
 
         # reading run results using DataItem (run.artifact())
-        train_run = train_iris_func.run(inputs={'dataset': dataset},
-                                        params={'label_column': 'label'})
+        train_run = train_iris_func.run(
+            inputs={"dataset": dataset}, params={"label_column": "label"}
+        )
 
-        train_run.artifact('confusion-matrix').show()
-        test_set = train_run.artifact('test_set').as_df()
+        train_run.artifact("confusion-matrix").show()
+        test_set = train_run.artifact("test_set").as_df()
 
         # create and use DataItem from uri
-        data = mlrun.get_dataitem('http://xyz/data.json').get()
+        data = mlrun.get_dataitem("http://xyz/data.json").get()
     """
 
     def __init__(
@@ -548,6 +563,7 @@ class DataItem:
         time_column=None,
         start_time=None,
         end_time=None,
+        additional_filters=None,
         **kwargs,
     ):
         """return a dataframe object (generated from the dataitem).
@@ -559,6 +575,12 @@ class DataItem:
         :param end_time:    filters out data after this time
         :param time_column: Store timestamp_key will be used if None.
                             The results will be filtered by this column and start_time & end_time.
+        :param additional_filters: List of additional_filter conditions as tuples.
+                            Each tuple should be in the format (column_name, operator, value).
+                            Supported operators: "=", ">=", "<=", ">", "<".
+                            Example: [("Product", "=", "Computer")]
+                            For all supported filters, please see:
+                            https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
         """
         df = self._store.as_df(
             self._url,
@@ -569,6 +591,7 @@ class DataItem:
             time_column=time_column,
             start_time=start_time,
             end_time=end_time,
+            additional_filters=additional_filters,
             **kwargs,
         )
         return df
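
The new `additional_filters` parameter threads from `DataItem.as_df()` through `DataStore.as_df()` into the parquet reader, where the tuples are appended to the time-based partition filters before `read_parquet` is called. A minimal caller-side sketch; the URL and column names here are hypothetical:

```python
from datetime import datetime

import mlrun

# Hypothetical artifact URL and column names, for illustration only.
item = mlrun.get_dataitem("s3://my-bucket/sales.parquet")

# Each tuple is (column_name, operator, value); supported operators are
# "=", ">=", "<=", ">", "<" (pyarrow ParquetDataset filter syntax).
df = item.as_df(
    time_column="timestamp",
    start_time=datetime(2024, 1, 1),
    end_time=datetime(2024, 6, 30),
    additional_filters=[("Product", "=", "Computer")],
)
```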

mlrun/datastore/datastore.py CHANGED
@@ -94,6 +94,14 @@ def schema_to_store(schema):
         from .dbfs_store import DBFSStore
 
         return DBFSStore
+    elif schema == "hdfs":
+        from .hdfs import HdfsStore
+
+        return HdfsStore
+    elif schema == "oss":
+        from .alibaba_oss import OSSStore
+
+        return OSSStore
     else:
         raise ValueError(f"unsupported store scheme ({schema})")
 
@@ -170,7 +178,7 @@ class StoreManager:
             raise mlrun.errors.MLRunInvalidArgumentError(
                 f"resource {url} does not have a valid/persistent offline target"
             )
-        return resource, target
+        return resource, target or ""
 
     def object(
         self, url, key="", project="", allow_empty_resources=None, secrets: dict = None
@@ -182,14 +190,21 @@ class StoreManager:
             url, project, allow_empty_resources, secrets
         )
 
-        store, subpath = self.get_or_create_store(
+        store, subpath, url = self.get_or_create_store(
             url, secrets=secrets, project_name=project
         )
-        return DataItem(key, store, subpath, url, meta=meta, artifact_url=artifact_url)
+        return DataItem(
+            key,
+            store,
+            subpath,
+            url,
+            meta=meta,
+            artifact_url=artifact_url,
+        )
 
     def get_or_create_store(
         self, url, secrets: dict = None, project_name=""
-    ) -> (DataStore, str):
+    ) -> (DataStore, str, str):
         schema, endpoint, parsed_url = parse_url(url)
         subpath = parsed_url.path
         store_key = f"{schema}://{endpoint}"
@@ -206,17 +221,22 @@ class StoreManager:
 
         if schema == "memory":
             subpath = url[len("memory://") :]
-            return in_memory_store, subpath
+            return in_memory_store, subpath, url
+
+        elif schema in get_local_file_schema():
+            # parse_url() will drop the windows drive-letter from the path for url like "c:\a\b".
+            # As a workaround, we set subpath to the url.
+            subpath = url.replace("file://", "", 1)
 
         if not schema and endpoint:
             if endpoint in self._stores.keys():
-                return self._stores[endpoint], subpath
+                return self._stores[endpoint], subpath, url
             else:
                 raise ValueError(f"no such store ({endpoint})")
 
         if not secrets and not mlrun.config.is_running_as_api():
             if store_key in self._stores.keys():
-                return self._stores[store_key], subpath
+                return self._stores[store_key], subpath, url
 
         # support u/p embedding in url (as done in redis) by setting netloc as the "endpoint" parameter
         # when running on server we don't cache the datastore, because there are multiple users and we don't want to
@@ -226,8 +246,7 @@ class StoreManager:
         )
         if not secrets and not mlrun.config.is_running_as_api():
             self._stores[store_key] = store
-        # in file stores in windows path like c:\a\b the drive letter is dropped from the path, so we return the url
-        return store, url if store.kind == "file" else subpath
+        return store, subpath, url
 
     def reset_secrets(self):
         self._secrets = {}
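
`get_or_create_store()` now returns a `(store, subpath, url)` triple instead of `(store, subpath)`, moving the Windows drive-letter workaround into the lookup itself rather than the return value. A sketch of unpacking the new signature; this is internal API, and `store_manager` is assumed to be the module-level `StoreManager` instance:

```python
import mlrun.datastore

# Callers that unpacked two values must be updated to unpack three.
store, subpath, url = mlrun.datastore.store_manager.get_or_create_store(
    "s3://my-bucket/data.csv"
)
```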

mlrun/datastore/datastore_profile.py CHANGED
@@ -16,6 +16,7 @@ import ast
 import base64
 import json
 import typing
+import warnings
 from urllib.parse import ParseResult, urlparse, urlunparse
 
 import pydantic
@@ -30,12 +31,13 @@ from ..secrets import get_secret_or_env
 class DatastoreProfile(pydantic.BaseModel):
     type: str
     name: str
-    _private_attributes: typing.List = ()
+    _private_attributes: list = ()
 
     class Config:
         extra = pydantic.Extra.forbid
 
     @pydantic.validator("name")
+    @classmethod
     def lower_case(cls, v):
         return v.lower()
 
@@ -68,6 +70,9 @@ class TemporaryClientDatastoreProfiles(metaclass=mlrun.utils.singleton.Singleton
     def get(self, key):
         return self._data.get(key, None)
 
+    def remove(self, key):
+        self._data.pop(key, None)
+
 
 class DatastoreProfileBasic(DatastoreProfile):
     type: str = pydantic.Field("basic")
@@ -79,13 +84,37 @@ class DatastoreProfileBasic(DatastoreProfile):
 class DatastoreProfileKafkaTarget(DatastoreProfile):
     type: str = pydantic.Field("kafka_target")
     _private_attributes = "kwargs_private"
-    bootstrap_servers: str
+    bootstrap_servers: typing.Optional[str] = None
+    brokers: typing.Optional[str] = None
     topic: str
-    kwargs_public: typing.Optional[typing.Dict]
-    kwargs_private: typing.Optional[typing.Dict]
+    kwargs_public: typing.Optional[dict]
+    kwargs_private: typing.Optional[dict]
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        if not self.brokers and not self.bootstrap_servers:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "DatastoreProfileKafkaTarget requires the 'brokers' field to be set"
+            )
+
+        if self.bootstrap_servers:
+            if self.brokers:
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "DatastoreProfileKafkaTarget cannot be created with both 'brokers' and 'bootstrap_servers'"
+                )
+            else:
+                self.brokers = self.bootstrap_servers
+                self.bootstrap_servers = None
+                warnings.warn(
+                    "'bootstrap_servers' parameter is deprecated in 1.7.0 and will be removed in 1.9.0, "
+                    "use 'brokers' instead.",
+                    # TODO: Remove this in 1.9.0
+                    FutureWarning,
+                )
 
     def attributes(self):
-        attributes = {"bootstrap_servers": self.bootstrap_servers}
+        attributes = {"brokers": self.brokers or self.bootstrap_servers}
         if self.kwargs_public:
             attributes = merge(attributes, self.kwargs_public)
         if self.kwargs_private:
@@ -96,15 +125,15 @@ class DatastoreProfileKafkaTarget(DatastoreProfile):
 class DatastoreProfileKafkaSource(DatastoreProfile):
     type: str = pydantic.Field("kafka_source")
     _private_attributes = ("kwargs_private", "sasl_user", "sasl_pass")
-    brokers: typing.Union[str, typing.List[str]]
-    topics: typing.Union[str, typing.List[str]]
+    brokers: typing.Union[str, list[str]]
+    topics: typing.Union[str, list[str]]
     group: typing.Optional[str] = "serving"
     initial_offset: typing.Optional[str] = "earliest"
-    partitions: typing.Optional[typing.Union[str, typing.List[str]]]
+    partitions: typing.Optional[typing.Union[str, list[str]]]
     sasl_user: typing.Optional[str]
     sasl_pass: typing.Optional[str]
-    kwargs_public: typing.Optional[typing.Dict]
-    kwargs_private: typing.Optional[typing.Dict]
+    kwargs_public: typing.Optional[dict]
+    kwargs_private: typing.Optional[dict]
 
     def attributes(self):
         attributes = {}
@@ -132,6 +161,22 @@ class DatastoreProfileKafkaSource(DatastoreProfile):
         return attributes
 
 
+class DatastoreProfileV3io(DatastoreProfile):
+    type: str = pydantic.Field("v3io")
+    v3io_access_key: typing.Optional[str] = None
+    _private_attributes = "v3io_access_key"
+
+    def url(self, subpath):
+        subpath = subpath.lstrip("/")
+        return f"v3io:///{subpath}"
+
+    def secrets(self) -> dict:
+        res = {}
+        if self.v3io_access_key:
+            res["V3IO_ACCESS_KEY"] = self.v3io_access_key
+        return res
+
+
 class DatastoreProfileS3(DatastoreProfile):
     type: str = pydantic.Field("s3")
     _private_attributes = ("access_key_id", "secret_key")
@@ -141,6 +186,18 @@ class DatastoreProfileS3(DatastoreProfile):
     assume_role_arn: typing.Optional[str] = None
     access_key_id: typing.Optional[str] = None
     secret_key: typing.Optional[str] = None
+    bucket: typing.Optional[str] = None
+
+    @pydantic.validator("bucket")
+    @classmethod
+    def check_bucket(cls, v):
+        if not v:
+            warnings.warn(
+                "The 'bucket' attribute will be mandatory starting from version 1.9",
+                FutureWarning,
+                stacklevel=2,
+            )
+        return v
 
     def secrets(self) -> dict:
         res = {}
@@ -156,10 +213,16 @@ class DatastoreProfileS3(DatastoreProfile):
             res["AWS_PROFILE"] = self.profile_name
         if self.assume_role_arn:
             res["MLRUN_AWS_ROLE_ARN"] = self.assume_role_arn
-        return res if res else None
+        return res
 
     def url(self, subpath):
-        return f"s3:/{subpath}"
+        # TODO: There is an inconsistency with DatastoreProfileGCS. In DatastoreProfileGCS,
+        # we assume that the subpath can begin without a '/' character,
+        # while here we assume it always starts with one.
+        if self.bucket:
+            return f"s3://{self.bucket}{subpath}"
+        else:
+            return f"s3:/{subpath}"
 
 
 class DatastoreProfileRedis(DatastoreProfile):
@@ -199,7 +262,7 @@ class DatastoreProfileRedis(DatastoreProfile):
             res["REDIS_USER"] = self.username
         if self.password:
             res["REDIS_PASSWORD"] = self.password
-        return res if res else None
+        return res
 
     def url(self, subpath):
         return self.endpoint_url + subpath
@@ -220,26 +283,44 @@ class DatastoreProfileDBFS(DatastoreProfile):
             res["DATABRICKS_TOKEN"] = self.token
         if self.endpoint_url:
             res["DATABRICKS_HOST"] = self.endpoint_url
-        return res if res else None
+        return res
 
 
 class DatastoreProfileGCS(DatastoreProfile):
     type: str = pydantic.Field("gcs")
     _private_attributes = ("gcp_credentials",)
     credentials_path: typing.Optional[str] = None  # path to file.
-    gcp_credentials: typing.Optional[typing.Union[str, typing.Dict]] = None
+    gcp_credentials: typing.Optional[typing.Union[str, dict]] = None
+    bucket: typing.Optional[str] = None
+
+    @pydantic.validator("bucket")
+    @classmethod
+    def check_bucket(cls, v):
+        if not v:
+            warnings.warn(
+                "The 'bucket' attribute will be mandatory starting from version 1.9",
+                FutureWarning,
+                stacklevel=2,
+            )
+        return v
 
     @pydantic.validator("gcp_credentials", pre=True, always=True)
+    @classmethod
     def convert_dict_to_json(cls, v):
         if isinstance(v, dict):
             return json.dumps(v)
         return v
 
     def url(self, subpath) -> str:
+        # TODO: but there's something wrong with the subpath being assumed to not start with a slash here,
+        # but the opposite assumption is made in S3.
         if subpath.startswith("/"):
             # in gcs the path after schema is starts with bucket, wherefore it should not start with "/".
             subpath = subpath[1:]
-        return f"gcs://{subpath}"
+        if self.bucket:
+            return f"gcs://{self.bucket}/{subpath}"
+        else:
+            return f"gcs://{subpath}"
 
     def secrets(self) -> dict:
         res = {}
@@ -247,7 +328,7 @@ class DatastoreProfileGCS(DatastoreProfile):
             res["GOOGLE_APPLICATION_CREDENTIALS"] = self.credentials_path
         if self.gcp_credentials:
             res["GCP_CREDENTIALS"] = self.gcp_credentials
-        return res if res else None
+        return res
 
 
 class DatastoreProfileAzureBlob(DatastoreProfile):
@@ -267,12 +348,27 @@ class DatastoreProfileAzureBlob(DatastoreProfile):
     client_secret: typing.Optional[str] = None
     sas_token: typing.Optional[str] = None
     credential: typing.Optional[str] = None
+    container: typing.Optional[str] = None
+
+    @pydantic.validator("container")
+    @classmethod
+    def check_container(cls, v):
+        if not v:
+            warnings.warn(
+                "The 'container' attribute will be mandatory starting from version 1.9",
+                FutureWarning,
+                stacklevel=2,
+            )
+        return v
 
     def url(self, subpath) -> str:
         if subpath.startswith("/"):
-            # in azure the path after schema is starts with bucket, wherefore it should not start with "/".
+            # in azure the path after schema is starts with container, wherefore it should not start with "/".
             subpath = subpath[1:]
-        return f"az://{subpath}"
+        if self.container:
+            return f"az://{self.container}/{subpath}"
+        else:
+            return f"az://{subpath}"
 
     def secrets(self) -> dict:
         res = {}
@@ -292,7 +388,31 @@ class DatastoreProfileAzureBlob(DatastoreProfile):
             res["sas_token"] = self.sas_token
         if self.credential:
             res["credential"] = self.credential
-        return res if res else None
+        return res
+
+
+class DatastoreProfileHdfs(DatastoreProfile):
+    type: str = pydantic.Field("hdfs")
+    _private_attributes = "token"
+    host: typing.Optional[str] = None
+    port: typing.Optional[int] = None
+    http_port: typing.Optional[int] = None
+    user: typing.Optional[str] = None
+
+    def secrets(self) -> dict:
+        res = {}
+        if self.host:
+            res["HDFS_HOST"] = self.host
+        if self.port:
+            res["HDFS_PORT"] = self.port
+        if self.port:
+            res["HDFS_HTTP_PORT"] = self.http_port
+        if self.user:
+            res["HDFS_USER"] = self.user
+        return res or None
+
+    def url(self, subpath):
+        return f"hdfs://{self.host}:{self.http_port}{subpath}"
 
 
 class DatastoreProfile2Json(pydantic.BaseModel):
@@ -346,6 +466,7 @@ class DatastoreProfile2Json(pydantic.BaseModel):
         decoded_dict = {k: safe_literal_eval(v) for k, v in decoded_dict.items()}
         datastore_type = decoded_dict.get("type")
         ds_profile_factory = {
+            "v3io": DatastoreProfileV3io,
             "s3": DatastoreProfileS3,
             "redis": DatastoreProfileRedis,
             "basic": DatastoreProfileBasic,
@@ -354,6 +475,7 @@ class DatastoreProfile2Json(pydantic.BaseModel):
             "dbfs": DatastoreProfileDBFS,
             "gcs": DatastoreProfileGCS,
             "az": DatastoreProfileAzureBlob,
+            "hdfs": DatastoreProfileHdfs,
         }
         if datastore_type in ds_profile_factory:
             return ds_profile_factory[datastore_type].parse_obj(decoded_dict)
@@ -418,3 +540,7 @@ def register_temporary_client_datastore_profile(profile: DatastoreProfile):
     It's beneficial for testing purposes.
     """
     TemporaryClientDatastoreProfiles().add(profile)
+
+
+def remove_temporary_client_datastore_profile(profile_name: str):
+    TemporaryClientDatastoreProfiles().remove(profile_name)
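
Taken together, these profile changes add `v3io` and `hdfs` profile types, optional `bucket`/`container` attributes that warn until they become mandatory in 1.9, a `brokers` field replacing the deprecated `bootstrap_servers`, and a matching `remove_temporary_client_datastore_profile()` helper. A hedged client-side sketch with made-up names and placeholder credentials:

```python
from mlrun.datastore.datastore_profile import (
    DatastoreProfileKafkaTarget,
    DatastoreProfileS3,
    register_temporary_client_datastore_profile,
    remove_temporary_client_datastore_profile,
)

# 'bucket' is optional for now but emits a FutureWarning when omitted.
s3_profile = DatastoreProfileS3(
    name="my-s3",
    bucket="my-bucket",
    access_key_id="<access-key-id>",  # placeholder credentials
    secret_key="<secret-key>",
)
assert s3_profile.url("/path/file.parquet") == "s3://my-bucket/path/file.parquet"

# 'bootstrap_servers' still works but warns; 'brokers' is the new spelling.
kafka_profile = DatastoreProfileKafkaTarget(
    name="my-kafka", brokers="broker1:9092", topic="events"
)

register_temporary_client_datastore_profile(s3_profile)
# ... use ds://my-s3/... URLs in sources and targets ...
remove_temporary_client_datastore_profile("my-s3")
```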

mlrun/datastore/filestore.py CHANGED
@@ -105,4 +105,3 @@ class FileStore(DataStore):
                 return
             except FileExistsError:
                 time.sleep(0.1)
-                pass

mlrun/datastore/google_cloud_storage.py CHANGED
@@ -132,13 +132,13 @@ class GoogleCloudStorageStore(DataStore):
         self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
 
     def get_spark_options(self):
-        res = None
+        res = {}
         st = self.get_storage_options()
         if "token" in st:
             res = {"spark.hadoop.google.cloud.auth.service.account.enable": "true"}
             if isinstance(st["token"], str):
                 # Token is a filename, read json from it
-                with open(st["token"], "r") as file:
+                with open(st["token"]) as file:
                     credentials = json.load(file)
             else:
                 # Token is a dictionary, use it directly
@@ -161,3 +161,7 @@ class GoogleCloudStorageStore(DataStore):
         if "client_id" in credentials:
             res["spark.hadoop.fs.gs.client.id"] = credentials["client_id"]
         return res
+
+    @property
+    def spark_url(self):
+        return f"gs://{self.endpoint}"

mlrun/datastore/hdfs.py ADDED
@@ -0,0 +1,56 @@
+# Copyright 2024 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from urllib.parse import urlparse
+
+import fsspec
+
+from mlrun.datastore.base import DataStore
+
+
+class HdfsStore(DataStore):
+    def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
+        super().__init__(parent, name, schema, endpoint, secrets)
+
+        self.host = self._get_secret_or_env("HDFS_HOST")
+        self.port = self._get_secret_or_env("HDFS_PORT")
+        self.http_port = self._get_secret_or_env("HDFS_HTTP_PORT")
+        self.user = self._get_secret_or_env("HDFS_USER")
+        if not self.user:
+            self.user = os.environ.get("HADOOP_USER_NAME", os.environ.get("USER"))
+
+        self._filesystem = None
+
+    @property
+    def filesystem(self):
+        if not self._filesystem:
+            self._filesystem = fsspec.filesystem(
+                "webhdfs",
+                host=self.host,
+                port=self.http_port,
+                user=self.user,
+            )
+        return self._filesystem
+
+    @property
+    def url(self):
+        return f"webhdfs://{self.host}:{self.http_port}"
+
+    @property
+    def spark_url(self):
+        return f"hdfs://{self.host}:{self.port}"
+
+    def rm(self, url, recursive=False, maxdepth=None):
+        path = urlparse(url).path
+        self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
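
The new `HdfsStore` resolves its connection details from secrets or environment variables and talks to the cluster over WebHDFS (fsspec's `webhdfs` filesystem), while `spark_url` points Spark at the native `hdfs://` RPC port. A sketch pairing it with the new `hdfs` profile; host and ports are placeholders:

```python
import mlrun
from mlrun.datastore.datastore_profile import (
    DatastoreProfileHdfs,
    register_temporary_client_datastore_profile,
)

# http_port is the WebHDFS (REST) port used for reads and writes;
# port is the native RPC port that spark_url uses.
profile = DatastoreProfileHdfs(
    name="my-hdfs",
    host="namenode.example.com",
    port=8020,
    http_port=9870,
    user="hdfs",
)
register_temporary_client_datastore_profile(profile)

df = mlrun.get_dataitem("ds://my-hdfs/data/sales.parquet").as_df()
```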
mlrun/datastore/inmem.py CHANGED
@@ -80,8 +80,8 @@ class InMemoryStore(DataStore):
             reader = df_module.read_json
         else:
             raise mlrun.errors.MLRunInvalidArgumentError(f"file type unhandled {url}")
-        # InMemoryStore store do not filter on time
-        for field in ["time_column", "start_time", "end_time"]:
+        # InMemoryStore store don't pass filters
+        for field in ["time_column", "start_time", "end_time", "additional_filters"]:
             kwargs.pop(field, None)
 
         return reader(item, **kwargs)