mlrun 1.7.0rc5__py3-none-any.whl → 1.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic. Click here for more details.

Files changed (234) hide show
  1. mlrun/__init__.py +11 -1
  2. mlrun/__main__.py +39 -121
  3. mlrun/{datastore/helpers.py → alerts/__init__.py} +2 -5
  4. mlrun/alerts/alert.py +248 -0
  5. mlrun/api/schemas/__init__.py +4 -3
  6. mlrun/artifacts/__init__.py +8 -3
  7. mlrun/artifacts/base.py +39 -254
  8. mlrun/artifacts/dataset.py +9 -190
  9. mlrun/artifacts/manager.py +73 -46
  10. mlrun/artifacts/model.py +30 -158
  11. mlrun/artifacts/plots.py +23 -380
  12. mlrun/common/constants.py +73 -2
  13. mlrun/common/db/sql_session.py +3 -2
  14. mlrun/common/formatters/__init__.py +21 -0
  15. mlrun/common/formatters/artifact.py +46 -0
  16. mlrun/common/formatters/base.py +113 -0
  17. mlrun/common/formatters/feature_set.py +44 -0
  18. mlrun/common/formatters/function.py +46 -0
  19. mlrun/common/formatters/pipeline.py +53 -0
  20. mlrun/common/formatters/project.py +51 -0
  21. mlrun/common/formatters/run.py +29 -0
  22. mlrun/common/helpers.py +11 -1
  23. mlrun/{runtimes → common/runtimes}/constants.py +32 -4
  24. mlrun/common/schemas/__init__.py +21 -4
  25. mlrun/common/schemas/alert.py +202 -0
  26. mlrun/common/schemas/api_gateway.py +113 -2
  27. mlrun/common/schemas/artifact.py +28 -1
  28. mlrun/common/schemas/auth.py +11 -0
  29. mlrun/common/schemas/client_spec.py +2 -1
  30. mlrun/common/schemas/common.py +7 -4
  31. mlrun/common/schemas/constants.py +3 -0
  32. mlrun/common/schemas/feature_store.py +58 -28
  33. mlrun/common/schemas/frontend_spec.py +8 -0
  34. mlrun/common/schemas/function.py +11 -0
  35. mlrun/common/schemas/hub.py +7 -9
  36. mlrun/common/schemas/model_monitoring/__init__.py +21 -4
  37. mlrun/common/schemas/model_monitoring/constants.py +136 -42
  38. mlrun/common/schemas/model_monitoring/grafana.py +9 -5
  39. mlrun/common/schemas/model_monitoring/model_endpoints.py +89 -41
  40. mlrun/common/schemas/notification.py +69 -12
  41. mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
  42. mlrun/common/schemas/pipeline.py +7 -0
  43. mlrun/common/schemas/project.py +67 -16
  44. mlrun/common/schemas/runs.py +17 -0
  45. mlrun/common/schemas/schedule.py +1 -1
  46. mlrun/common/schemas/workflow.py +10 -2
  47. mlrun/common/types.py +14 -1
  48. mlrun/config.py +224 -58
  49. mlrun/data_types/data_types.py +11 -1
  50. mlrun/data_types/spark.py +5 -4
  51. mlrun/data_types/to_pandas.py +75 -34
  52. mlrun/datastore/__init__.py +8 -10
  53. mlrun/datastore/alibaba_oss.py +131 -0
  54. mlrun/datastore/azure_blob.py +131 -43
  55. mlrun/datastore/base.py +107 -47
  56. mlrun/datastore/datastore.py +17 -7
  57. mlrun/datastore/datastore_profile.py +91 -7
  58. mlrun/datastore/dbfs_store.py +3 -7
  59. mlrun/datastore/filestore.py +1 -3
  60. mlrun/datastore/google_cloud_storage.py +92 -32
  61. mlrun/datastore/hdfs.py +5 -0
  62. mlrun/datastore/inmem.py +6 -3
  63. mlrun/datastore/redis.py +3 -2
  64. mlrun/datastore/s3.py +30 -12
  65. mlrun/datastore/snowflake_utils.py +45 -0
  66. mlrun/datastore/sources.py +274 -59
  67. mlrun/datastore/spark_utils.py +30 -0
  68. mlrun/datastore/store_resources.py +9 -7
  69. mlrun/datastore/storeytargets.py +151 -0
  70. mlrun/datastore/targets.py +374 -102
  71. mlrun/datastore/utils.py +68 -5
  72. mlrun/datastore/v3io.py +28 -50
  73. mlrun/db/auth_utils.py +152 -0
  74. mlrun/db/base.py +231 -22
  75. mlrun/db/factory.py +1 -4
  76. mlrun/db/httpdb.py +864 -228
  77. mlrun/db/nopdb.py +268 -16
  78. mlrun/errors.py +35 -5
  79. mlrun/execution.py +111 -38
  80. mlrun/feature_store/__init__.py +0 -2
  81. mlrun/feature_store/api.py +46 -53
  82. mlrun/feature_store/common.py +6 -11
  83. mlrun/feature_store/feature_set.py +48 -23
  84. mlrun/feature_store/feature_vector.py +13 -2
  85. mlrun/feature_store/ingestion.py +7 -6
  86. mlrun/feature_store/retrieval/base.py +9 -4
  87. mlrun/feature_store/retrieval/dask_merger.py +2 -0
  88. mlrun/feature_store/retrieval/job.py +13 -4
  89. mlrun/feature_store/retrieval/local_merger.py +2 -0
  90. mlrun/feature_store/retrieval/spark_merger.py +24 -32
  91. mlrun/feature_store/steps.py +38 -19
  92. mlrun/features.py +6 -14
  93. mlrun/frameworks/_common/plan.py +3 -3
  94. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +7 -12
  95. mlrun/frameworks/_ml_common/plan.py +1 -1
  96. mlrun/frameworks/auto_mlrun/auto_mlrun.py +2 -2
  97. mlrun/frameworks/lgbm/__init__.py +1 -1
  98. mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
  99. mlrun/frameworks/lgbm/model_handler.py +1 -1
  100. mlrun/frameworks/parallel_coordinates.py +4 -4
  101. mlrun/frameworks/pytorch/__init__.py +2 -2
  102. mlrun/frameworks/sklearn/__init__.py +1 -1
  103. mlrun/frameworks/sklearn/mlrun_interface.py +13 -3
  104. mlrun/frameworks/tf_keras/__init__.py +5 -2
  105. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +1 -1
  106. mlrun/frameworks/tf_keras/mlrun_interface.py +2 -2
  107. mlrun/frameworks/xgboost/__init__.py +1 -1
  108. mlrun/k8s_utils.py +57 -12
  109. mlrun/launcher/__init__.py +1 -1
  110. mlrun/launcher/base.py +6 -5
  111. mlrun/launcher/client.py +13 -11
  112. mlrun/launcher/factory.py +1 -1
  113. mlrun/launcher/local.py +15 -5
  114. mlrun/launcher/remote.py +10 -3
  115. mlrun/lists.py +6 -2
  116. mlrun/model.py +297 -48
  117. mlrun/model_monitoring/__init__.py +1 -1
  118. mlrun/model_monitoring/api.py +152 -357
  119. mlrun/model_monitoring/applications/__init__.py +10 -0
  120. mlrun/model_monitoring/applications/_application_steps.py +190 -0
  121. mlrun/model_monitoring/applications/base.py +108 -0
  122. mlrun/model_monitoring/applications/context.py +341 -0
  123. mlrun/model_monitoring/{evidently_application.py → applications/evidently_base.py} +27 -22
  124. mlrun/model_monitoring/applications/histogram_data_drift.py +227 -91
  125. mlrun/model_monitoring/applications/results.py +99 -0
  126. mlrun/model_monitoring/controller.py +130 -303
  127. mlrun/model_monitoring/{stores/models/sqlite.py → db/__init__.py} +5 -10
  128. mlrun/model_monitoring/db/stores/__init__.py +136 -0
  129. mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
  130. mlrun/model_monitoring/db/stores/base/store.py +213 -0
  131. mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
  132. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
  133. mlrun/model_monitoring/db/stores/sqldb/models/base.py +190 -0
  134. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +103 -0
  135. mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
  136. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +659 -0
  137. mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
  138. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +726 -0
  139. mlrun/model_monitoring/db/tsdb/__init__.py +105 -0
  140. mlrun/model_monitoring/db/tsdb/base.py +448 -0
  141. mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
  142. mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
  143. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +298 -0
  144. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +42 -0
  145. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +522 -0
  146. mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
  147. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +158 -0
  148. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +849 -0
  149. mlrun/model_monitoring/features_drift_table.py +34 -22
  150. mlrun/model_monitoring/helpers.py +177 -39
  151. mlrun/model_monitoring/model_endpoint.py +3 -2
  152. mlrun/model_monitoring/stream_processing.py +165 -398
  153. mlrun/model_monitoring/tracking_policy.py +7 -1
  154. mlrun/model_monitoring/writer.py +161 -125
  155. mlrun/package/packagers/default_packager.py +2 -2
  156. mlrun/package/packagers_manager.py +1 -0
  157. mlrun/package/utils/_formatter.py +2 -2
  158. mlrun/platforms/__init__.py +11 -10
  159. mlrun/platforms/iguazio.py +67 -228
  160. mlrun/projects/__init__.py +6 -1
  161. mlrun/projects/operations.py +47 -20
  162. mlrun/projects/pipelines.py +396 -249
  163. mlrun/projects/project.py +1125 -414
  164. mlrun/render.py +28 -22
  165. mlrun/run.py +207 -180
  166. mlrun/runtimes/__init__.py +76 -11
  167. mlrun/runtimes/base.py +40 -14
  168. mlrun/runtimes/daskjob.py +9 -2
  169. mlrun/runtimes/databricks_job/databricks_runtime.py +1 -0
  170. mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
  171. mlrun/runtimes/funcdoc.py +1 -29
  172. mlrun/runtimes/kubejob.py +34 -128
  173. mlrun/runtimes/local.py +39 -10
  174. mlrun/runtimes/mpijob/__init__.py +0 -20
  175. mlrun/runtimes/mpijob/abstract.py +8 -8
  176. mlrun/runtimes/mpijob/v1.py +1 -1
  177. mlrun/runtimes/nuclio/api_gateway.py +646 -177
  178. mlrun/runtimes/nuclio/application/__init__.py +15 -0
  179. mlrun/runtimes/nuclio/application/application.py +758 -0
  180. mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
  181. mlrun/runtimes/nuclio/function.py +188 -68
  182. mlrun/runtimes/nuclio/serving.py +57 -60
  183. mlrun/runtimes/pod.py +191 -58
  184. mlrun/runtimes/remotesparkjob.py +11 -8
  185. mlrun/runtimes/sparkjob/spark3job.py +17 -18
  186. mlrun/runtimes/utils.py +40 -73
  187. mlrun/secrets.py +6 -2
  188. mlrun/serving/__init__.py +8 -1
  189. mlrun/serving/remote.py +2 -3
  190. mlrun/serving/routers.py +89 -64
  191. mlrun/serving/server.py +54 -26
  192. mlrun/serving/states.py +187 -56
  193. mlrun/serving/utils.py +19 -11
  194. mlrun/serving/v2_serving.py +136 -63
  195. mlrun/track/tracker.py +2 -1
  196. mlrun/track/trackers/mlflow_tracker.py +5 -0
  197. mlrun/utils/async_http.py +26 -6
  198. mlrun/utils/db.py +18 -0
  199. mlrun/utils/helpers.py +375 -105
  200. mlrun/utils/http.py +2 -2
  201. mlrun/utils/logger.py +75 -9
  202. mlrun/utils/notifications/notification/__init__.py +14 -10
  203. mlrun/utils/notifications/notification/base.py +48 -0
  204. mlrun/utils/notifications/notification/console.py +2 -0
  205. mlrun/utils/notifications/notification/git.py +24 -1
  206. mlrun/utils/notifications/notification/ipython.py +2 -0
  207. mlrun/utils/notifications/notification/slack.py +96 -21
  208. mlrun/utils/notifications/notification/webhook.py +63 -2
  209. mlrun/utils/notifications/notification_pusher.py +146 -16
  210. mlrun/utils/regex.py +9 -0
  211. mlrun/utils/retryer.py +3 -2
  212. mlrun/utils/v3io_clients.py +2 -3
  213. mlrun/utils/version/version.json +2 -2
  214. mlrun-1.7.2.dist-info/METADATA +390 -0
  215. mlrun-1.7.2.dist-info/RECORD +351 -0
  216. {mlrun-1.7.0rc5.dist-info → mlrun-1.7.2.dist-info}/WHEEL +1 -1
  217. mlrun/feature_store/retrieval/conversion.py +0 -271
  218. mlrun/kfpops.py +0 -868
  219. mlrun/model_monitoring/application.py +0 -310
  220. mlrun/model_monitoring/batch.py +0 -974
  221. mlrun/model_monitoring/controller_handler.py +0 -37
  222. mlrun/model_monitoring/prometheus.py +0 -216
  223. mlrun/model_monitoring/stores/__init__.py +0 -111
  224. mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -574
  225. mlrun/model_monitoring/stores/model_endpoint_store.py +0 -145
  226. mlrun/model_monitoring/stores/models/__init__.py +0 -27
  227. mlrun/model_monitoring/stores/models/base.py +0 -84
  228. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -382
  229. mlrun/platforms/other.py +0 -305
  230. mlrun-1.7.0rc5.dist-info/METADATA +0 -269
  231. mlrun-1.7.0rc5.dist-info/RECORD +0 -323
  232. {mlrun-1.7.0rc5.dist-info → mlrun-1.7.2.dist-info}/LICENSE +0 -0
  233. {mlrun-1.7.0rc5.dist-info → mlrun-1.7.2.dist-info}/entry_points.txt +0 -0
  234. {mlrun-1.7.0rc5.dist-info → mlrun-1.7.2.dist-info}/top_level.txt +0 -0
@@ -12,51 +12,93 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
  import json
15
+ import os
15
16
  from pathlib import Path
16
17
 
17
18
  from fsspec.registry import get_filesystem_class
19
+ from google.auth.credentials import Credentials
20
+ from google.cloud.storage import Client, transfer_manager
21
+ from google.oauth2 import service_account
18
22
 
19
23
  import mlrun.errors
20
24
  from mlrun.utils import logger
21
25
 
22
- from .base import DataStore, FileStats, makeDatastoreSchemaSanitizer
26
+ from .base import DataStore, FileStats, make_datastore_schema_sanitizer
23
27
 
24
28
  # Google storage objects will be represented with the following URL: gcs://<bucket name>/<path> or gs://...
25
29
 
26
30
 
27
31
  class GoogleCloudStorageStore(DataStore):
28
32
  using_bucket = True
33
+ workers = 8
34
+ chunk_size = 32 * 1024 * 1024
29
35
 
30
36
  def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
31
37
  super().__init__(parent, name, schema, endpoint, secrets=secrets)
38
+ self._storage_client = None
39
+ self._storage_options = None
40
+
41
+ @property
42
+ def storage_client(self):
43
+ if self._storage_client:
44
+ return self._storage_client
45
+
46
+ token = self._get_credentials().get("token")
47
+ access = "https://www.googleapis.com/auth/devstorage.full_control"
48
+ if isinstance(token, str):
49
+ if os.path.exists(token):
50
+ credentials = service_account.Credentials.from_service_account_file(
51
+ token, scopes=[access]
52
+ )
53
+ else:
54
+ raise mlrun.errors.MLRunInvalidArgumentError(
55
+ "gcsfs authentication file not found!"
56
+ )
57
+ elif isinstance(token, dict):
58
+ credentials = service_account.Credentials.from_service_account_info(
59
+ token, scopes=[access]
60
+ )
61
+ elif isinstance(token, Credentials):
62
+ credentials = token
63
+ else:
64
+ raise ValueError(f"Unsupported token type: {type(token)}")
65
+ self._storage_client = Client(credentials=credentials)
66
+ return self._storage_client
32
67
 
33
68
  @property
34
69
  def filesystem(self):
35
70
  """return fsspec file system object, if supported"""
36
- if self._filesystem:
37
- return self._filesystem
38
- try:
39
- import gcsfs # noqa
40
- except ImportError as exc:
41
- raise ImportError(
42
- "Google gcsfs not installed, run pip install gcsfs"
43
- ) from exc
44
- filesystem_class = get_filesystem_class(protocol=self.kind)
45
- self._filesystem = makeDatastoreSchemaSanitizer(
46
- filesystem_class,
47
- using_bucket=self.using_bucket,
48
- **self.get_storage_options(),
49
- )
71
+ if not self._filesystem:
72
+ filesystem_class = get_filesystem_class(protocol=self.kind)
73
+ self._filesystem = make_datastore_schema_sanitizer(
74
+ filesystem_class,
75
+ using_bucket=self.using_bucket,
76
+ **self.storage_options,
77
+ )
50
78
  return self._filesystem
51
79
 
52
- def get_storage_options(self):
80
+ @property
81
+ def storage_options(self):
82
+ if self._storage_options:
83
+ return self._storage_options
84
+ credentials = self._get_credentials()
85
+ # due to caching problem introduced in gcsfs 2024.3.1 (ML-7636)
86
+ credentials["use_listings_cache"] = False
87
+ self._storage_options = credentials
88
+ return self._storage_options
89
+
90
+ def _get_credentials(self):
53
91
  credentials = self._get_secret_or_env(
54
92
  "GCP_CREDENTIALS"
55
93
  ) or self._get_secret_or_env("GOOGLE_APPLICATION_CREDENTIALS")
56
94
  if credentials:
57
95
  try:
58
- # Try to handle credentials as a json connection string
59
- token = json.loads(credentials)
96
+ # Try to handle credentials as a json connection string or do nothing if already a dict
97
+ token = (
98
+ credentials
99
+ if isinstance(credentials, dict)
100
+ else json.loads(credentials)
101
+ )
60
102
  except json.JSONDecodeError:
61
103
  # If it's not json, handle it as a filename
62
104
  token = credentials
@@ -67,6 +109,9 @@ class GoogleCloudStorageStore(DataStore):
67
109
  )
68
110
  return self._sanitize_storage_options(None)
69
111
 
112
+ def get_storage_options(self):
113
+ return self.storage_options
114
+
70
115
  def _make_path(self, key):
71
116
  key = key.strip("/")
72
117
  path = Path(self.endpoint, key).as_posix()
@@ -86,21 +131,34 @@ class GoogleCloudStorageStore(DataStore):
86
131
  raise mlrun.errors.MLRunInvalidArgumentError(
87
132
  "Append mode not supported for Google cloud storage datastore"
88
133
  )
89
-
90
- if isinstance(data, bytes):
91
- mode = "wb"
92
- elif isinstance(data, str):
93
- mode = "w"
94
- else:
95
- raise TypeError(
96
- "Data type unknown. Unable to put in Google cloud storage!"
97
- )
134
+ data, mode = self._prepare_put_data(data, append)
98
135
  with self.filesystem.open(path, mode) as f:
99
136
  f.write(data)
100
137
 
101
138
  def upload(self, key, src_path):
102
- path = self._make_path(key)
103
- self.filesystem.put_file(src_path, path, overwrite=True)
139
+ file_size = os.path.getsize(src_path)
140
+ united_path = self._make_path(key)
141
+
142
+ # Multiple upload limitation recommendations as described in
143
+ # https://cloud.google.com/storage/docs/multipart-uploads#storage-upload-object-chunks-python
144
+
145
+ if file_size <= self.chunk_size:
146
+ self.filesystem.put_file(src_path, united_path, overwrite=True)
147
+ return
148
+
149
+ bucket = self.storage_client.bucket(self.endpoint)
150
+ blob = bucket.blob(key.strip("/"))
151
+
152
+ try:
153
+ transfer_manager.upload_chunks_concurrently(
154
+ src_path, blob, chunk_size=self.chunk_size, max_workers=self.workers
155
+ )
156
+ except Exception as upload_chunks_concurrently_exception:
157
+ logger.warning(
158
+ f"gcs: failed to concurrently upload {src_path},"
159
+ f" exception: {upload_chunks_concurrently_exception}. Retrying with single part upload."
160
+ )
161
+ self.filesystem.put_file(src_path, united_path, overwrite=True)
104
162
 
105
163
  def stat(self, key):
106
164
  path = self._make_path(key)
@@ -129,11 +187,13 @@ class GoogleCloudStorageStore(DataStore):
129
187
 
130
188
  def rm(self, path, recursive=False, maxdepth=None):
131
189
  path = self._make_path(path)
132
- self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
190
+ # in order to raise an error in case of a connection error (ML-7056)
191
+ self.filesystem.exists(path)
192
+ super().rm(path, recursive=recursive, maxdepth=maxdepth)
133
193
 
134
194
  def get_spark_options(self):
135
- res = None
136
- st = self.get_storage_options()
195
+ res = {}
196
+ st = self._get_credentials()
137
197
  if "token" in st:
138
198
  res = {"spark.hadoop.google.cloud.auth.service.account.enable": "true"}
139
199
  if isinstance(st["token"], str):
mlrun/datastore/hdfs.py CHANGED
@@ -12,6 +12,7 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
  import os
15
+ from urllib.parse import urlparse
15
16
 
16
17
  import fsspec
17
18
 
@@ -49,3 +50,7 @@ class HdfsStore(DataStore):
49
50
  @property
50
51
  def spark_url(self):
51
52
  return f"hdfs://{self.host}:{self.port}"
53
+
54
+ def rm(self, url, recursive=False, maxdepth=None):
55
+ path = urlparse(url).path
56
+ self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
mlrun/datastore/inmem.py CHANGED
@@ -72,7 +72,7 @@ class InMemoryStore(DataStore):
72
72
  if columns:
73
73
  kwargs["usecols"] = columns
74
74
  reader = df_module.read_csv
75
- elif url.endswith(".parquet") or url.endswith(".pq") or format == "parquet":
75
+ elif mlrun.utils.helpers.is_parquet_file(url, format):
76
76
  if columns:
77
77
  kwargs["columns"] = columns
78
78
  reader = df_module.read_parquet
@@ -80,8 +80,11 @@ class InMemoryStore(DataStore):
80
80
  reader = df_module.read_json
81
81
  else:
82
82
  raise mlrun.errors.MLRunInvalidArgumentError(f"file type unhandled {url}")
83
- # InMemoryStore store do not filter on time
84
- for field in ["time_column", "start_time", "end_time"]:
83
+ # InMemoryStore store don't pass filters
84
+ for field in ["time_column", "start_time", "end_time", "additional_filters"]:
85
85
  kwargs.pop(field, None)
86
86
 
87
87
  return reader(item, **kwargs)
88
+
89
+ def rm(self, path, recursive=False, maxdepth=None):
90
+ self._items.pop(path, None)
mlrun/datastore/redis.py CHANGED
@@ -31,7 +31,7 @@ class RedisStore(DataStore):
31
31
  """
32
32
 
33
33
  def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
34
- REDIS_DEFAULT_PORT = "6379"
34
+ redis_default_port = "6379"
35
35
  super().__init__(parent, name, schema, endpoint, secrets=secrets)
36
36
  self.headers = None
37
37
 
@@ -49,7 +49,7 @@ class RedisStore(DataStore):
49
49
  user = self._get_secret_or_env("REDIS_USER", "", credentials_prefix)
50
50
  password = self._get_secret_or_env("REDIS_PASSWORD", "", credentials_prefix)
51
51
  host = parsed_endpoint.hostname
52
- port = parsed_endpoint.port if parsed_endpoint.port else REDIS_DEFAULT_PORT
52
+ port = parsed_endpoint.port if parsed_endpoint.port else redis_default_port
53
53
  schema = parsed_endpoint.scheme
54
54
  if user or password:
55
55
  endpoint = f"{schema}://{user}:{password}@{host}:{port}"
@@ -126,6 +126,7 @@ class RedisStore(DataStore):
126
126
 
127
127
  def put(self, key, data, append=False):
128
128
  key = RedisStore.build_redis_key(key)
129
+ data, _ = self._prepare_put_data(data, append)
129
130
  if append:
130
131
  self.redis.append(key, data)
131
132
  else:
mlrun/datastore/s3.py CHANGED
@@ -15,11 +15,12 @@
15
15
  import time
16
16
 
17
17
  import boto3
18
+ from boto3.s3.transfer import TransferConfig
18
19
  from fsspec.registry import get_filesystem_class
19
20
 
20
21
  import mlrun.errors
21
22
 
22
- from .base import DataStore, FileStats, get_range, makeDatastoreSchemaSanitizer
23
+ from .base import DataStore, FileStats, get_range, make_datastore_schema_sanitizer
23
24
 
24
25
 
25
26
  class S3Store(DataStore):
@@ -35,11 +36,18 @@ class S3Store(DataStore):
35
36
 
36
37
  access_key_id = self._get_secret_or_env("AWS_ACCESS_KEY_ID")
37
38
  secret_key = self._get_secret_or_env("AWS_SECRET_ACCESS_KEY")
39
+ token_file = self._get_secret_or_env("AWS_CONTAINER_AUTHORIZATION_TOKEN_FILE")
38
40
  endpoint_url = self._get_secret_or_env("S3_ENDPOINT_URL")
39
41
  force_non_anonymous = self._get_secret_or_env("S3_NON_ANONYMOUS")
40
42
  profile_name = self._get_secret_or_env("AWS_PROFILE")
41
43
  assume_role_arn = self._get_secret_or_env("MLRUN_AWS_ROLE_ARN")
42
44
 
45
+ self.config = TransferConfig(
46
+ multipart_threshold=1024 * 1024 * 25,
47
+ max_concurrency=10,
48
+ multipart_chunksize=1024 * 1024 * 25,
49
+ )
50
+
43
51
  # If user asks to assume a role, this needs to go through the STS client and retrieve temporary creds
44
52
  if assume_role_arn:
45
53
  client = boto3.client(
@@ -87,14 +95,15 @@ class S3Store(DataStore):
87
95
  self.s3 = boto3.resource(
88
96
  "s3", region_name=region, endpoint_url=endpoint_url
89
97
  )
90
- # If not using credentials, boto will still attempt to sign the requests, and will fail any operations
91
- # due to no credentials found. These commands disable signing and allow anonymous mode (same as
92
- # anon in the storage_options when working with fsspec).
93
- from botocore.handlers import disable_signing
94
-
95
- self.s3.meta.client.meta.events.register(
96
- "choose-signer.s3.*", disable_signing
97
- )
98
+ if not token_file:
99
+ # If not using credentials, boto will still attempt to sign the requests, and will fail any operations
100
+ # due to no credentials found. These commands disable signing and allow anonymous mode (same as
101
+ # anon in the storage_options when working with fsspec).
102
+ from botocore.handlers import disable_signing
103
+
104
+ self.s3.meta.client.meta.events.register(
105
+ "choose-signer.s3.*", disable_signing
106
+ )
98
107
 
99
108
  def get_spark_options(self):
100
109
  res = {}
@@ -119,7 +128,7 @@ class S3Store(DataStore):
119
128
  except ImportError as exc:
120
129
  raise ImportError("AWS s3fs not installed") from exc
121
130
  filesystem_class = get_filesystem_class(protocol=self.kind)
122
- self._filesystem = makeDatastoreSchemaSanitizer(
131
+ self._filesystem = make_datastore_schema_sanitizer(
123
132
  filesystem_class,
124
133
  using_bucket=self.using_bucket,
125
134
  **self.get_storage_options(),
@@ -132,6 +141,7 @@ class S3Store(DataStore):
132
141
  endpoint_url = self._get_secret_or_env("S3_ENDPOINT_URL")
133
142
  access_key_id = self._get_secret_or_env("AWS_ACCESS_KEY_ID")
134
143
  secret = self._get_secret_or_env("AWS_SECRET_ACCESS_KEY")
144
+ token_file = self._get_secret_or_env("AWS_CONTAINER_AUTHORIZATION_TOKEN_FILE")
135
145
 
136
146
  if self._temp_credentials:
137
147
  access_key_id = self._temp_credentials["AccessKeyId"]
@@ -141,7 +151,7 @@ class S3Store(DataStore):
141
151
  token = None
142
152
 
143
153
  storage_options = dict(
144
- anon=not (force_non_anonymous or (access_key_id and secret)),
154
+ anon=not (force_non_anonymous or (access_key_id and secret) or token_file),
145
155
  key=access_key_id,
146
156
  secret=secret,
147
157
  token=token,
@@ -166,7 +176,7 @@ class S3Store(DataStore):
166
176
 
167
177
  def upload(self, key, src_path):
168
178
  bucket, key = self.get_bucket_and_key(key)
169
- self.s3.Object(bucket, key).put(Body=open(src_path, "rb"))
179
+ self.s3.Bucket(bucket).upload_file(src_path, key, Config=self.config)
170
180
 
171
181
  def get(self, key, size=None, offset=0):
172
182
  bucket, key = self.get_bucket_and_key(key)
@@ -176,6 +186,7 @@ class S3Store(DataStore):
176
186
  return obj.get()["Body"].read()
177
187
 
178
188
  def put(self, key, data, append=False):
189
+ data, _ = self._prepare_put_data(data, append)
179
190
  bucket, key = self.get_bucket_and_key(key)
180
191
  self.s3.Object(bucket, key).put(Body=data)
181
192
 
@@ -198,6 +209,13 @@ class S3Store(DataStore):
198
209
  bucket = self.s3.Bucket(bucket)
199
210
  return [obj.key[key_length:] for obj in bucket.objects.filter(Prefix=key)]
200
211
 
212
+ def rm(self, path, recursive=False, maxdepth=None):
213
+ bucket, key = self.get_bucket_and_key(path)
214
+ path = f"{bucket}/{key}"
215
+ # In order to raise an error if there is connection error, ML-7056.
216
+ self.filesystem.exists(path=path)
217
+ self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
218
+
201
219
 
202
220
  def parse_s3_bucket_and_key(s3_path):
203
221
  try:
@@ -0,0 +1,45 @@
1
+ # Copyright 2024 Iguazio
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+
16
+ import mlrun
17
+
18
+
19
+ def get_snowflake_password():
20
+ key = "SNOWFLAKE_PASSWORD"
21
+ snowflake_password = mlrun.get_secret_or_env(key)
22
+
23
+ if not snowflake_password:
24
+ raise mlrun.errors.MLRunInvalidArgumentError(
25
+ f"No password provided. Set password using the {key} "
26
+ "project secret or environment variable."
27
+ )
28
+
29
+ return snowflake_password
30
+
31
+
32
+ def get_snowflake_spark_options(attributes):
33
+ if not attributes:
34
+ return {}
35
+ return {
36
+ "format": "net.snowflake.spark.snowflake",
37
+ "sfURL": attributes.get("url"),
38
+ "sfUser": attributes.get("user"),
39
+ "sfPassword": get_snowflake_password(),
40
+ "sfDatabase": attributes.get("database"),
41
+ "sfSchema": attributes.get("db_schema"),
42
+ "sfWarehouse": attributes.get("warehouse"),
43
+ "application": "iguazio_platform",
44
+ "TIMESTAMP_TYPE_MAPPING": "TIMESTAMP_LTZ",
45
+ }