mlrun 1.6.4rc8__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic. Click here for more details.

Files changed (305)
  1. mlrun/__init__.py +11 -1
  2. mlrun/__main__.py +40 -122
  3. mlrun/alerts/__init__.py +15 -0
  4. mlrun/alerts/alert.py +248 -0
  5. mlrun/api/schemas/__init__.py +5 -4
  6. mlrun/artifacts/__init__.py +8 -3
  7. mlrun/artifacts/base.py +47 -257
  8. mlrun/artifacts/dataset.py +11 -192
  9. mlrun/artifacts/manager.py +79 -47
  10. mlrun/artifacts/model.py +31 -159
  11. mlrun/artifacts/plots.py +23 -380
  12. mlrun/common/constants.py +74 -1
  13. mlrun/common/db/sql_session.py +5 -5
  14. mlrun/common/formatters/__init__.py +21 -0
  15. mlrun/common/formatters/artifact.py +45 -0
  16. mlrun/common/formatters/base.py +113 -0
  17. mlrun/common/formatters/feature_set.py +33 -0
  18. mlrun/common/formatters/function.py +46 -0
  19. mlrun/common/formatters/pipeline.py +53 -0
  20. mlrun/common/formatters/project.py +51 -0
  21. mlrun/common/formatters/run.py +29 -0
  22. mlrun/common/helpers.py +12 -3
  23. mlrun/common/model_monitoring/helpers.py +9 -5
  24. mlrun/{runtimes → common/runtimes}/constants.py +37 -9
  25. mlrun/common/schemas/__init__.py +31 -5
  26. mlrun/common/schemas/alert.py +202 -0
  27. mlrun/common/schemas/api_gateway.py +196 -0
  28. mlrun/common/schemas/artifact.py +25 -4
  29. mlrun/common/schemas/auth.py +16 -5
  30. mlrun/common/schemas/background_task.py +1 -1
  31. mlrun/common/schemas/client_spec.py +4 -2
  32. mlrun/common/schemas/common.py +7 -4
  33. mlrun/common/schemas/constants.py +3 -0
  34. mlrun/common/schemas/feature_store.py +74 -44
  35. mlrun/common/schemas/frontend_spec.py +15 -7
  36. mlrun/common/schemas/function.py +12 -1
  37. mlrun/common/schemas/hub.py +11 -18
  38. mlrun/common/schemas/memory_reports.py +2 -2
  39. mlrun/common/schemas/model_monitoring/__init__.py +20 -4
  40. mlrun/common/schemas/model_monitoring/constants.py +123 -42
  41. mlrun/common/schemas/model_monitoring/grafana.py +13 -9
  42. mlrun/common/schemas/model_monitoring/model_endpoints.py +101 -54
  43. mlrun/common/schemas/notification.py +71 -14
  44. mlrun/common/schemas/object.py +2 -2
  45. mlrun/{model_monitoring/controller_handler.py → common/schemas/pagination.py} +9 -12
  46. mlrun/common/schemas/pipeline.py +8 -1
  47. mlrun/common/schemas/project.py +69 -18
  48. mlrun/common/schemas/runs.py +7 -1
  49. mlrun/common/schemas/runtime_resource.py +8 -12
  50. mlrun/common/schemas/schedule.py +4 -4
  51. mlrun/common/schemas/tag.py +1 -2
  52. mlrun/common/schemas/workflow.py +12 -4
  53. mlrun/common/types.py +14 -1
  54. mlrun/config.py +154 -69
  55. mlrun/data_types/data_types.py +6 -1
  56. mlrun/data_types/spark.py +2 -2
  57. mlrun/data_types/to_pandas.py +67 -37
  58. mlrun/datastore/__init__.py +6 -8
  59. mlrun/datastore/alibaba_oss.py +131 -0
  60. mlrun/datastore/azure_blob.py +143 -42
  61. mlrun/datastore/base.py +102 -58
  62. mlrun/datastore/datastore.py +34 -13
  63. mlrun/datastore/datastore_profile.py +146 -20
  64. mlrun/datastore/dbfs_store.py +3 -7
  65. mlrun/datastore/filestore.py +1 -4
  66. mlrun/datastore/google_cloud_storage.py +97 -33
  67. mlrun/datastore/hdfs.py +56 -0
  68. mlrun/datastore/inmem.py +6 -3
  69. mlrun/datastore/redis.py +7 -2
  70. mlrun/datastore/s3.py +34 -12
  71. mlrun/datastore/snowflake_utils.py +45 -0
  72. mlrun/datastore/sources.py +303 -111
  73. mlrun/datastore/spark_utils.py +31 -2
  74. mlrun/datastore/store_resources.py +9 -7
  75. mlrun/datastore/storeytargets.py +151 -0
  76. mlrun/datastore/targets.py +453 -176
  77. mlrun/datastore/utils.py +72 -58
  78. mlrun/datastore/v3io.py +6 -1
  79. mlrun/db/base.py +274 -41
  80. mlrun/db/factory.py +1 -1
  81. mlrun/db/httpdb.py +893 -225
  82. mlrun/db/nopdb.py +291 -33
  83. mlrun/errors.py +36 -6
  84. mlrun/execution.py +115 -42
  85. mlrun/feature_store/__init__.py +0 -2
  86. mlrun/feature_store/api.py +65 -73
  87. mlrun/feature_store/common.py +7 -12
  88. mlrun/feature_store/feature_set.py +76 -55
  89. mlrun/feature_store/feature_vector.py +39 -31
  90. mlrun/feature_store/ingestion.py +7 -6
  91. mlrun/feature_store/retrieval/base.py +16 -11
  92. mlrun/feature_store/retrieval/dask_merger.py +2 -0
  93. mlrun/feature_store/retrieval/job.py +13 -4
  94. mlrun/feature_store/retrieval/local_merger.py +2 -0
  95. mlrun/feature_store/retrieval/spark_merger.py +24 -32
  96. mlrun/feature_store/steps.py +45 -34
  97. mlrun/features.py +11 -21
  98. mlrun/frameworks/_common/artifacts_library.py +9 -9
  99. mlrun/frameworks/_common/mlrun_interface.py +5 -5
  100. mlrun/frameworks/_common/model_handler.py +48 -48
  101. mlrun/frameworks/_common/plan.py +5 -6
  102. mlrun/frameworks/_common/producer.py +3 -4
  103. mlrun/frameworks/_common/utils.py +5 -5
  104. mlrun/frameworks/_dl_common/loggers/logger.py +6 -7
  105. mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +9 -9
  106. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +23 -47
  107. mlrun/frameworks/_ml_common/artifacts_library.py +1 -2
  108. mlrun/frameworks/_ml_common/loggers/logger.py +3 -4
  109. mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +4 -5
  110. mlrun/frameworks/_ml_common/model_handler.py +24 -24
  111. mlrun/frameworks/_ml_common/pkl_model_server.py +2 -2
  112. mlrun/frameworks/_ml_common/plan.py +2 -2
  113. mlrun/frameworks/_ml_common/plans/calibration_curve_plan.py +2 -3
  114. mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +2 -3
  115. mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
  116. mlrun/frameworks/_ml_common/plans/feature_importance_plan.py +3 -3
  117. mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
  118. mlrun/frameworks/_ml_common/utils.py +4 -4
  119. mlrun/frameworks/auto_mlrun/auto_mlrun.py +9 -9
  120. mlrun/frameworks/huggingface/model_server.py +4 -4
  121. mlrun/frameworks/lgbm/__init__.py +33 -33
  122. mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
  123. mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -5
  124. mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -5
  125. mlrun/frameworks/lgbm/mlrun_interfaces/booster_mlrun_interface.py +1 -3
  126. mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +6 -6
  127. mlrun/frameworks/lgbm/model_handler.py +10 -10
  128. mlrun/frameworks/lgbm/model_server.py +6 -6
  129. mlrun/frameworks/lgbm/utils.py +5 -5
  130. mlrun/frameworks/onnx/dataset.py +8 -8
  131. mlrun/frameworks/onnx/mlrun_interface.py +3 -3
  132. mlrun/frameworks/onnx/model_handler.py +6 -6
  133. mlrun/frameworks/onnx/model_server.py +7 -7
  134. mlrun/frameworks/parallel_coordinates.py +6 -6
  135. mlrun/frameworks/pytorch/__init__.py +18 -18
  136. mlrun/frameworks/pytorch/callbacks/callback.py +4 -5
  137. mlrun/frameworks/pytorch/callbacks/logging_callback.py +17 -17
  138. mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +11 -11
  139. mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +23 -29
  140. mlrun/frameworks/pytorch/callbacks_handler.py +38 -38
  141. mlrun/frameworks/pytorch/mlrun_interface.py +20 -20
  142. mlrun/frameworks/pytorch/model_handler.py +17 -17
  143. mlrun/frameworks/pytorch/model_server.py +7 -7
  144. mlrun/frameworks/sklearn/__init__.py +13 -13
  145. mlrun/frameworks/sklearn/estimator.py +4 -4
  146. mlrun/frameworks/sklearn/metrics_library.py +14 -14
  147. mlrun/frameworks/sklearn/mlrun_interface.py +16 -9
  148. mlrun/frameworks/sklearn/model_handler.py +2 -2
  149. mlrun/frameworks/tf_keras/__init__.py +10 -7
  150. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +15 -15
  151. mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +11 -11
  152. mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +19 -23
  153. mlrun/frameworks/tf_keras/mlrun_interface.py +9 -11
  154. mlrun/frameworks/tf_keras/model_handler.py +14 -14
  155. mlrun/frameworks/tf_keras/model_server.py +6 -6
  156. mlrun/frameworks/xgboost/__init__.py +13 -13
  157. mlrun/frameworks/xgboost/model_handler.py +6 -6
  158. mlrun/k8s_utils.py +61 -17
  159. mlrun/launcher/__init__.py +1 -1
  160. mlrun/launcher/base.py +16 -15
  161. mlrun/launcher/client.py +13 -11
  162. mlrun/launcher/factory.py +1 -1
  163. mlrun/launcher/local.py +23 -13
  164. mlrun/launcher/remote.py +17 -10
  165. mlrun/lists.py +7 -6
  166. mlrun/model.py +478 -103
  167. mlrun/model_monitoring/__init__.py +1 -1
  168. mlrun/model_monitoring/api.py +163 -371
  169. mlrun/{runtimes/mpijob/v1alpha1.py → model_monitoring/applications/__init__.py} +9 -15
  170. mlrun/model_monitoring/applications/_application_steps.py +188 -0
  171. mlrun/model_monitoring/applications/base.py +108 -0
  172. mlrun/model_monitoring/applications/context.py +341 -0
  173. mlrun/model_monitoring/{evidently_application.py → applications/evidently_base.py} +27 -22
  174. mlrun/model_monitoring/applications/histogram_data_drift.py +354 -0
  175. mlrun/model_monitoring/applications/results.py +99 -0
  176. mlrun/model_monitoring/controller.py +131 -278
  177. mlrun/model_monitoring/db/__init__.py +18 -0
  178. mlrun/model_monitoring/db/stores/__init__.py +136 -0
  179. mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
  180. mlrun/model_monitoring/db/stores/base/store.py +213 -0
  181. mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
  182. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
  183. mlrun/model_monitoring/db/stores/sqldb/models/base.py +190 -0
  184. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +103 -0
  185. mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
  186. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +659 -0
  187. mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
  188. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +726 -0
  189. mlrun/model_monitoring/db/tsdb/__init__.py +105 -0
  190. mlrun/model_monitoring/db/tsdb/base.py +448 -0
  191. mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
  192. mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
  193. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +279 -0
  194. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +42 -0
  195. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +507 -0
  196. mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
  197. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +158 -0
  198. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +849 -0
  199. mlrun/model_monitoring/features_drift_table.py +134 -106
  200. mlrun/model_monitoring/helpers.py +199 -55
  201. mlrun/model_monitoring/metrics/__init__.py +13 -0
  202. mlrun/model_monitoring/metrics/histogram_distance.py +127 -0
  203. mlrun/model_monitoring/model_endpoint.py +3 -2
  204. mlrun/model_monitoring/stream_processing.py +134 -398
  205. mlrun/model_monitoring/tracking_policy.py +9 -2
  206. mlrun/model_monitoring/writer.py +161 -125
  207. mlrun/package/__init__.py +6 -6
  208. mlrun/package/context_handler.py +5 -5
  209. mlrun/package/packager.py +7 -7
  210. mlrun/package/packagers/default_packager.py +8 -8
  211. mlrun/package/packagers/numpy_packagers.py +15 -15
  212. mlrun/package/packagers/pandas_packagers.py +5 -5
  213. mlrun/package/packagers/python_standard_library_packagers.py +10 -10
  214. mlrun/package/packagers_manager.py +19 -23
  215. mlrun/package/utils/_formatter.py +6 -6
  216. mlrun/package/utils/_pickler.py +2 -2
  217. mlrun/package/utils/_supported_format.py +4 -4
  218. mlrun/package/utils/log_hint_utils.py +2 -2
  219. mlrun/package/utils/type_hint_utils.py +4 -9
  220. mlrun/platforms/__init__.py +11 -10
  221. mlrun/platforms/iguazio.py +24 -203
  222. mlrun/projects/operations.py +52 -25
  223. mlrun/projects/pipelines.py +191 -197
  224. mlrun/projects/project.py +1227 -400
  225. mlrun/render.py +16 -19
  226. mlrun/run.py +209 -184
  227. mlrun/runtimes/__init__.py +83 -15
  228. mlrun/runtimes/base.py +51 -35
  229. mlrun/runtimes/daskjob.py +17 -10
  230. mlrun/runtimes/databricks_job/databricks_cancel_task.py +1 -1
  231. mlrun/runtimes/databricks_job/databricks_runtime.py +8 -7
  232. mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
  233. mlrun/runtimes/funcdoc.py +1 -29
  234. mlrun/runtimes/function_reference.py +1 -1
  235. mlrun/runtimes/kubejob.py +34 -128
  236. mlrun/runtimes/local.py +40 -11
  237. mlrun/runtimes/mpijob/__init__.py +0 -20
  238. mlrun/runtimes/mpijob/abstract.py +9 -10
  239. mlrun/runtimes/mpijob/v1.py +1 -1
  240. mlrun/{model_monitoring/stores/models/sqlite.py → runtimes/nuclio/__init__.py} +7 -9
  241. mlrun/runtimes/nuclio/api_gateway.py +769 -0
  242. mlrun/runtimes/nuclio/application/__init__.py +15 -0
  243. mlrun/runtimes/nuclio/application/application.py +758 -0
  244. mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
  245. mlrun/runtimes/{function.py → nuclio/function.py} +200 -83
  246. mlrun/runtimes/{nuclio.py → nuclio/nuclio.py} +6 -6
  247. mlrun/runtimes/{serving.py → nuclio/serving.py} +65 -68
  248. mlrun/runtimes/pod.py +281 -101
  249. mlrun/runtimes/remotesparkjob.py +12 -9
  250. mlrun/runtimes/sparkjob/spark3job.py +67 -51
  251. mlrun/runtimes/utils.py +41 -75
  252. mlrun/secrets.py +9 -5
  253. mlrun/serving/__init__.py +8 -1
  254. mlrun/serving/remote.py +2 -7
  255. mlrun/serving/routers.py +85 -69
  256. mlrun/serving/server.py +69 -44
  257. mlrun/serving/states.py +209 -36
  258. mlrun/serving/utils.py +22 -14
  259. mlrun/serving/v1_serving.py +6 -7
  260. mlrun/serving/v2_serving.py +133 -54
  261. mlrun/track/tracker.py +2 -1
  262. mlrun/track/tracker_manager.py +3 -3
  263. mlrun/track/trackers/mlflow_tracker.py +6 -2
  264. mlrun/utils/async_http.py +6 -8
  265. mlrun/utils/azure_vault.py +1 -1
  266. mlrun/utils/clones.py +1 -2
  267. mlrun/utils/condition_evaluator.py +3 -3
  268. mlrun/utils/db.py +21 -3
  269. mlrun/utils/helpers.py +405 -225
  270. mlrun/utils/http.py +3 -6
  271. mlrun/utils/logger.py +112 -16
  272. mlrun/utils/notifications/notification/__init__.py +17 -13
  273. mlrun/utils/notifications/notification/base.py +50 -2
  274. mlrun/utils/notifications/notification/console.py +2 -0
  275. mlrun/utils/notifications/notification/git.py +24 -1
  276. mlrun/utils/notifications/notification/ipython.py +3 -1
  277. mlrun/utils/notifications/notification/slack.py +96 -21
  278. mlrun/utils/notifications/notification/webhook.py +59 -2
  279. mlrun/utils/notifications/notification_pusher.py +149 -30
  280. mlrun/utils/regex.py +9 -0
  281. mlrun/utils/retryer.py +208 -0
  282. mlrun/utils/singleton.py +1 -1
  283. mlrun/utils/v3io_clients.py +4 -6
  284. mlrun/utils/version/version.json +2 -2
  285. mlrun/utils/version/version.py +2 -6
  286. mlrun-1.7.0.dist-info/METADATA +378 -0
  287. mlrun-1.7.0.dist-info/RECORD +351 -0
  288. {mlrun-1.6.4rc8.dist-info → mlrun-1.7.0.dist-info}/WHEEL +1 -1
  289. mlrun/feature_store/retrieval/conversion.py +0 -273
  290. mlrun/kfpops.py +0 -868
  291. mlrun/model_monitoring/application.py +0 -310
  292. mlrun/model_monitoring/batch.py +0 -1095
  293. mlrun/model_monitoring/prometheus.py +0 -219
  294. mlrun/model_monitoring/stores/__init__.py +0 -111
  295. mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -576
  296. mlrun/model_monitoring/stores/model_endpoint_store.py +0 -147
  297. mlrun/model_monitoring/stores/models/__init__.py +0 -27
  298. mlrun/model_monitoring/stores/models/base.py +0 -84
  299. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -384
  300. mlrun/platforms/other.py +0 -306
  301. mlrun-1.6.4rc8.dist-info/METADATA +0 -272
  302. mlrun-1.6.4rc8.dist-info/RECORD +0 -314
  303. {mlrun-1.6.4rc8.dist-info → mlrun-1.7.0.dist-info}/LICENSE +0 -0
  304. {mlrun-1.6.4rc8.dist-info → mlrun-1.7.0.dist-info}/entry_points.txt +0 -0
  305. {mlrun-1.6.4rc8.dist-info → mlrun-1.7.0.dist-info}/top_level.txt +0 -0
mlrun/datastore/base.py CHANGED
@@ -24,13 +24,12 @@ import pandas as pd
24
24
  import pyarrow
25
25
  import pytz
26
26
  import requests
27
- import urllib3
28
27
  from deprecated import deprecated
29
28
 
30
29
  import mlrun.config
31
30
  import mlrun.errors
32
31
  from mlrun.errors import err_to_str
33
- from mlrun.utils import StorePrefix, is_ipython, logger
32
+ from mlrun.utils import StorePrefix, is_jupyter, logger
34
33
 
35
34
  from .store_resources import is_store_uri, parse_store_uri
36
35
  from .utils import filter_df_start_end_time, select_columns_from_df
@@ -144,6 +143,10 @@ class DataStore:
144
143
  def url(self):
145
144
  return f"{self.kind}://{self.endpoint}"
146
145
 
146
+ @property
147
+ def spark_url(self):
148
+ return self.url
149
+
147
150
  def get(self, key, size=None, offset=0):
148
151
  pass
149
152
 
@@ -153,6 +156,18 @@ class DataStore:
153
156
  def put(self, key, data, append=False):
154
157
  pass
155
158
 
159
+ def _prepare_put_data(self, data, append=False):
160
+ mode = "a" if append else "w"
161
+ if isinstance(data, bytearray):
162
+ data = bytes(data)
163
+
164
+ if isinstance(data, bytes):
165
+ return data, f"{mode}b"
166
+ elif isinstance(data, str):
167
+ return data, mode
168
+ else:
169
+ raise TypeError(f"Unable to put a value of type {type(self).__name__}")
170
+
156
171
  def stat(self, key):
157
172
  pass
158
173
 
@@ -175,11 +190,23 @@ class DataStore:
175
190
  return {}
176
191
 
177
192
  @staticmethod
178
- def _parquet_reader(df_module, url, file_system, time_column, start_time, end_time):
193
+ def _parquet_reader(
194
+ df_module,
195
+ url,
196
+ file_system,
197
+ time_column,
198
+ start_time,
199
+ end_time,
200
+ additional_filters,
201
+ ):
179
202
  from storey.utils import find_filters, find_partitions
180
203
 
181
204
  def set_filters(
182
- partitions_time_attributes, start_time_inner, end_time_inner, kwargs
205
+ partitions_time_attributes,
206
+ start_time_inner,
207
+ end_time_inner,
208
+ filters_inner,
209
+ kwargs,
183
210
  ):
184
211
  filters = []
185
212
  find_filters(
@@ -189,20 +216,32 @@ class DataStore:
189
216
  filters,
190
217
  time_column,
191
218
  )
219
+ if filters and filters_inner:
220
+ filters[0] += filters_inner
221
+
192
222
  kwargs["filters"] = filters
193
223
 
194
224
  def reader(*args, **kwargs):
195
- if start_time or end_time:
196
- if time_column is None:
197
- raise mlrun.errors.MLRunInvalidArgumentError(
198
- "When providing start_time or end_time, must provide time_column"
199
- )
225
+ if time_column is None and (start_time or end_time):
226
+ raise mlrun.errors.MLRunInvalidArgumentError(
227
+ "When providing start_time or end_time, must provide time_column"
228
+ )
229
+ if (
230
+ start_time
231
+ and end_time
232
+ and start_time.utcoffset() != end_time.utcoffset()
233
+ ):
234
+ raise mlrun.errors.MLRunInvalidArgumentError(
235
+ "start_time and end_time must have the same time zone"
236
+ )
200
237
 
238
+ if start_time or end_time or additional_filters:
201
239
  partitions_time_attributes = find_partitions(url, file_system)
202
240
  set_filters(
203
241
  partitions_time_attributes,
204
242
  start_time,
205
243
  end_time,
244
+ additional_filters,
206
245
  kwargs,
207
246
  )
208
247
  try:
@@ -213,17 +252,23 @@ class DataStore:
213
252
  ):
214
253
  raise ex
215
254
 
216
- if start_time.tzinfo:
217
- start_time_inner = start_time.replace(tzinfo=None)
218
- end_time_inner = end_time.replace(tzinfo=None)
219
- else:
220
- start_time_inner = start_time.replace(tzinfo=pytz.utc)
221
- end_time_inner = end_time.replace(tzinfo=pytz.utc)
255
+ start_time_inner = None
256
+ if start_time:
257
+ start_time_inner = start_time.replace(
258
+ tzinfo=None if start_time.tzinfo else pytz.utc
259
+ )
260
+
261
+ end_time_inner = None
262
+ if end_time:
263
+ end_time_inner = end_time.replace(
264
+ tzinfo=None if end_time.tzinfo else pytz.utc
265
+ )
222
266
 
223
267
  set_filters(
224
268
  partitions_time_attributes,
225
269
  start_time_inner,
226
270
  end_time_inner,
271
+ additional_filters,
227
272
  kwargs,
228
273
  )
229
274
  return df_module.read_parquet(*args, **kwargs)
@@ -242,6 +287,7 @@ class DataStore:
242
287
  start_time=None,
243
288
  end_time=None,
244
289
  time_column=None,
290
+ additional_filters=None,
245
291
  **kwargs,
246
292
  ):
247
293
  df_module = df_module or pd
@@ -297,16 +343,18 @@ class DataStore:
297
343
  dfs.append(df_module.read_csv(*updated_args, **kwargs))
298
344
  return df_module.concat(dfs)
299
345
 
300
- elif (
301
- file_url.endswith(".parquet")
302
- or file_url.endswith(".pq")
303
- or format == "parquet"
304
- ):
346
+ elif mlrun.utils.helpers.is_parquet_file(file_url, format):
305
347
  if columns:
306
348
  kwargs["columns"] = columns
307
349
 
308
350
  reader = self._parquet_reader(
309
- df_module, url, file_system, time_column, start_time, end_time
351
+ df_module,
352
+ url,
353
+ file_system,
354
+ time_column,
355
+ start_time,
356
+ end_time,
357
+ additional_filters,
310
358
  )
311
359
 
312
360
  elif file_url.endswith(".json") or format == "json":
@@ -317,31 +365,17 @@ class DataStore:
317
365
  raise Exception(f"File type unhandled {url}")
318
366
 
319
367
  if file_system:
320
- if (
321
- self.supports_isdir()
322
- and file_system.isdir(file_url)
323
- or self._is_dd(df_module)
324
- ):
325
- storage_options = self.get_storage_options()
326
- if url.startswith("ds://"):
327
- parsed_url = urllib.parse.urlparse(url)
328
- url = parsed_url.path
329
- if self.using_bucket:
330
- url = url[1:]
331
- # Pass the underlying file system
332
- kwargs["filesystem"] = file_system
333
- elif storage_options:
334
- kwargs["storage_options"] = storage_options
335
- df = reader(url, **kwargs)
336
- else:
337
- file = url
338
- # Workaround for ARROW-12472 affecting pyarrow 3.x and 4.x.
339
- if file_system.protocol != "file":
340
- # If not dir, use file_system.open() to avoid regression when pandas < 1.2 and does not
341
- # support the storage_options parameter.
342
- file = file_system.open(url)
343
-
344
- df = reader(file, **kwargs)
368
+ storage_options = self.get_storage_options()
369
+ if url.startswith("ds://"):
370
+ parsed_url = urllib.parse.urlparse(url)
371
+ url = parsed_url.path
372
+ if self.using_bucket:
373
+ url = url[1:]
374
+ # Pass the underlying file system
375
+ kwargs["filesystem"] = file_system
376
+ elif storage_options:
377
+ kwargs["storage_options"] = storage_options
378
+ df = reader(url, **kwargs)
345
379
  else:
346
380
  temp_file = tempfile.NamedTemporaryFile(delete=False)
347
381
  self.download(self._join(subpath), temp_file.name)
@@ -372,7 +406,10 @@ class DataStore:
372
406
  }
373
407
 
374
408
  def rm(self, path, recursive=False, maxdepth=None):
375
- self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
409
+ try:
410
+ self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
411
+ except FileNotFoundError:
412
+ pass
376
413
 
377
414
  @staticmethod
378
415
  def _is_dd(df_module):
@@ -399,14 +436,15 @@ class DataItem:
399
436
 
400
437
 
401
438
  # reading run results using DataItem (run.artifact())
402
- train_run = train_iris_func.run(inputs={'dataset': dataset},
403
- params={'label_column': 'label'})
439
+ train_run = train_iris_func.run(
440
+ inputs={"dataset": dataset}, params={"label_column": "label"}
441
+ )
404
442
 
405
- train_run.artifact('confusion-matrix').show()
406
- test_set = train_run.artifact('test_set').as_df()
443
+ train_run.artifact("confusion-matrix").show()
444
+ test_set = train_run.artifact("test_set").as_df()
407
445
 
408
446
  # create and use DataItem from uri
409
- data = mlrun.get_dataitem('http://xyz/data.json').get()
447
+ data = mlrun.get_dataitem("http://xyz/data.json").get()
410
448
  """
411
449
 
412
450
  def __init__(
@@ -548,6 +586,7 @@ class DataItem:
548
586
  time_column=None,
549
587
  start_time=None,
550
588
  end_time=None,
589
+ additional_filters=None,
551
590
  **kwargs,
552
591
  ):
553
592
  """return a dataframe object (generated from the dataitem).
@@ -559,6 +598,12 @@ class DataItem:
559
598
  :param end_time: filters out data after this time
560
599
  :param time_column: Store timestamp_key will be used if None.
561
600
  The results will be filtered by this column and start_time & end_time.
601
+ :param additional_filters: List of additional_filter conditions as tuples.
602
+ Each tuple should be in the format (column_name, operator, value).
603
+ Supported operators: "=", ">=", "<=", ">", "<".
604
+ Example: [("Product", "=", "Computer")]
605
+ For all supported filters, please see:
606
+ https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
562
607
  """
563
608
  df = self._store.as_df(
564
609
  self._url,
@@ -569,18 +614,19 @@ class DataItem:
569
614
  time_column=time_column,
570
615
  start_time=start_time,
571
616
  end_time=end_time,
617
+ additional_filters=additional_filters,
572
618
  **kwargs,
573
619
  )
574
620
  return df
575
621
 
576
- def show(self, format=None):
622
+ def show(self, format: Optional[str] = None) -> None:
577
623
  """show the data object content in Jupyter
578
624
 
579
625
  :param format: format to use (when there is no/wrong suffix), e.g. 'png'
580
626
  """
581
- if not is_ipython:
627
+ if not is_jupyter:
582
628
  logger.warning(
583
- "Jupyter/IPython was not detected, .show() will only display inside Jupyter"
629
+ "Jupyter was not detected. `.show()` displays only inside Jupyter."
584
630
  )
585
631
  return
586
632
 
@@ -698,8 +744,6 @@ class HttpStore(DataStore):
698
744
 
699
745
  verify_ssl = mlconf.httpdb.http.verify
700
746
  try:
701
- if not verify_ssl:
702
- urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
703
747
  response = requests.get(url, headers=headers, auth=auth, verify=verify_ssl)
704
748
  except OSError as exc:
705
749
  raise OSError(f"error: cannot connect to {url}: {err_to_str(exc)}")
@@ -713,7 +757,7 @@ class HttpStore(DataStore):
713
757
  # As an example, it converts an S3 URL 's3://s3bucket/path' to just 's3bucket/path'.
714
758
  # Since 'ds' schemas are not inherently processed by fsspec, we have adapted the _strip_protocol()
715
759
  # method specifically to strip away the 'ds' schema as required.
716
- def makeDatastoreSchemaSanitizer(cls, using_bucket=False, *args, **kwargs):
760
+ def make_datastore_schema_sanitizer(cls, using_bucket=False, *args, **kwargs):
717
761
  if not issubclass(cls, fsspec.AbstractFileSystem):
718
762
  raise ValueError("Class must be a subclass of fsspec.AbstractFileSystem")
719
763
 
@@ -21,7 +21,7 @@ from mlrun.datastore.datastore_profile import datastore_profile_read
21
21
  from mlrun.errors import err_to_str
22
22
  from mlrun.utils.helpers import get_local_file_schema
23
23
 
24
- from ..utils import DB_SCHEMA, run_keys
24
+ from ..utils import DB_SCHEMA, RunKeys
25
25
  from .base import DataItem, DataStore, HttpStore
26
26
  from .filestore import FileStore
27
27
  from .inmem import InMemoryStore
@@ -32,6 +32,8 @@ in_memory_store = InMemoryStore()
32
32
 
33
33
 
34
34
  def parse_url(url):
35
+ if url and url.startswith("v3io://") and not url.startswith("v3io:///"):
36
+ url = url.replace("v3io://", "v3io:///", 1)
35
37
  parsed_url = urlparse(url)
36
38
  schema = parsed_url.scheme.lower()
37
39
  endpoint = parsed_url.hostname
@@ -94,6 +96,14 @@ def schema_to_store(schema):
94
96
  from .dbfs_store import DBFSStore
95
97
 
96
98
  return DBFSStore
99
+ elif schema in ["hdfs", "webhdfs"]:
100
+ from .hdfs import HdfsStore
101
+
102
+ return HdfsStore
103
+ elif schema == "oss":
104
+ from .alibaba_oss import OSSStore
105
+
106
+ return OSSStore
97
107
  else:
98
108
  raise ValueError(f"unsupported store scheme ({schema})")
99
109
 
@@ -125,7 +135,7 @@ class StoreManager:
125
135
  return self._db
126
136
 
127
137
  def from_dict(self, struct: dict):
128
- stor_list = struct.get(run_keys.data_stores)
138
+ stor_list = struct.get(RunKeys.data_stores)
129
139
  if stor_list and isinstance(stor_list, list):
130
140
  for stor in stor_list:
131
141
  schema, endpoint, parsed_url = parse_url(stor.get("url"))
@@ -137,7 +147,7 @@ class StoreManager:
137
147
  self._stores[stor["name"]] = new_stor
138
148
 
139
149
  def to_dict(self, struct):
140
- struct[run_keys.data_stores] = [
150
+ struct[RunKeys.data_stores] = [
141
151
  stor.to_dict() for stor in self._stores.values() if stor.from_spec
142
152
  ]
143
153
 
@@ -170,7 +180,7 @@ class StoreManager:
170
180
  raise mlrun.errors.MLRunInvalidArgumentError(
171
181
  f"resource {url} does not have a valid/persistent offline target"
172
182
  )
173
- return resource, target
183
+ return resource, target or ""
174
184
 
175
185
  def object(
176
186
  self, url, key="", project="", allow_empty_resources=None, secrets: dict = None
@@ -182,17 +192,24 @@ class StoreManager:
182
192
  url, project, allow_empty_resources, secrets
183
193
  )
184
194
 
185
- store, subpath = self.get_or_create_store(
195
+ store, subpath, url = self.get_or_create_store(
186
196
  url, secrets=secrets, project_name=project
187
197
  )
188
- return DataItem(key, store, subpath, url, meta=meta, artifact_url=artifact_url)
198
+ return DataItem(
199
+ key,
200
+ store,
201
+ subpath,
202
+ url,
203
+ meta=meta,
204
+ artifact_url=artifact_url,
205
+ )
189
206
 
190
207
  def get_or_create_store(
191
208
  self, url, secrets: dict = None, project_name=""
192
- ) -> (DataStore, str):
209
+ ) -> (DataStore, str, str):
193
210
  schema, endpoint, parsed_url = parse_url(url)
194
211
  subpath = parsed_url.path
195
- store_key = f"{schema}://{endpoint}"
212
+ store_key = f"{schema}://{endpoint}" if endpoint else f"{schema}://"
196
213
 
197
214
  if schema == "ds":
198
215
  datastore_profile = datastore_profile_read(url, project_name, secrets)
@@ -206,17 +223,22 @@ class StoreManager:
206
223
 
207
224
  if schema == "memory":
208
225
  subpath = url[len("memory://") :]
209
- return in_memory_store, subpath
226
+ return in_memory_store, subpath, url
227
+
228
+ elif schema in get_local_file_schema():
229
+ # parse_url() will drop the windows drive-letter from the path for url like "c:\a\b".
230
+ # As a workaround, we set subpath to the url.
231
+ subpath = url.replace("file://", "", 1)
210
232
 
211
233
  if not schema and endpoint:
212
234
  if endpoint in self._stores.keys():
213
- return self._stores[endpoint], subpath
235
+ return self._stores[endpoint], subpath, url
214
236
  else:
215
237
  raise ValueError(f"no such store ({endpoint})")
216
238
 
217
239
  if not secrets and not mlrun.config.is_running_as_api():
218
240
  if store_key in self._stores.keys():
219
- return self._stores[store_key], subpath
241
+ return self._stores[store_key], subpath, url
220
242
 
221
243
  # support u/p embedding in url (as done in redis) by setting netloc as the "endpoint" parameter
222
244
  # when running on server we don't cache the datastore, because there are multiple users and we don't want to
@@ -226,8 +248,7 @@ class StoreManager:
226
248
  )
227
249
  if not secrets and not mlrun.config.is_running_as_api():
228
250
  self._stores[store_key] = store
229
- # in file stores in windows path like c:\a\b the drive letter is dropped from the path, so we return the url
230
- return store, url if store.kind == "file" else subpath
251
+ return store, subpath, url
231
252
 
232
253
  def reset_secrets(self):
233
254
  self._secrets = {}