mlrun 1.7.0rc28__py3-none-any.whl → 1.7.0rc55__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic.

Files changed (135)
  1. mlrun/__main__.py +4 -2
  2. mlrun/alerts/alert.py +75 -8
  3. mlrun/artifacts/base.py +1 -0
  4. mlrun/artifacts/manager.py +9 -2
  5. mlrun/common/constants.py +4 -1
  6. mlrun/common/db/sql_session.py +3 -2
  7. mlrun/common/formatters/__init__.py +1 -0
  8. mlrun/common/formatters/artifact.py +1 -0
  9. mlrun/{model_monitoring/application.py → common/formatters/feature_set.py} +20 -6
  10. mlrun/common/formatters/run.py +3 -0
  11. mlrun/common/helpers.py +0 -1
  12. mlrun/common/schemas/__init__.py +3 -1
  13. mlrun/common/schemas/alert.py +15 -12
  14. mlrun/common/schemas/api_gateway.py +6 -6
  15. mlrun/common/schemas/auth.py +5 -0
  16. mlrun/common/schemas/client_spec.py +0 -1
  17. mlrun/common/schemas/common.py +7 -4
  18. mlrun/common/schemas/frontend_spec.py +7 -0
  19. mlrun/common/schemas/function.py +7 -0
  20. mlrun/common/schemas/model_monitoring/__init__.py +4 -3
  21. mlrun/common/schemas/model_monitoring/constants.py +41 -26
  22. mlrun/common/schemas/model_monitoring/model_endpoints.py +23 -47
  23. mlrun/common/schemas/notification.py +69 -12
  24. mlrun/common/schemas/project.py +45 -12
  25. mlrun/common/schemas/workflow.py +10 -2
  26. mlrun/common/types.py +1 -0
  27. mlrun/config.py +91 -35
  28. mlrun/data_types/data_types.py +6 -1
  29. mlrun/data_types/spark.py +2 -2
  30. mlrun/data_types/to_pandas.py +57 -25
  31. mlrun/datastore/__init__.py +1 -0
  32. mlrun/datastore/alibaba_oss.py +3 -2
  33. mlrun/datastore/azure_blob.py +125 -37
  34. mlrun/datastore/base.py +42 -21
  35. mlrun/datastore/datastore.py +4 -2
  36. mlrun/datastore/datastore_profile.py +1 -1
  37. mlrun/datastore/dbfs_store.py +3 -7
  38. mlrun/datastore/filestore.py +1 -3
  39. mlrun/datastore/google_cloud_storage.py +85 -29
  40. mlrun/datastore/inmem.py +4 -1
  41. mlrun/datastore/redis.py +1 -0
  42. mlrun/datastore/s3.py +25 -12
  43. mlrun/datastore/sources.py +76 -4
  44. mlrun/datastore/spark_utils.py +30 -0
  45. mlrun/datastore/storeytargets.py +151 -0
  46. mlrun/datastore/targets.py +102 -131
  47. mlrun/datastore/v3io.py +1 -0
  48. mlrun/db/base.py +15 -6
  49. mlrun/db/httpdb.py +57 -28
  50. mlrun/db/nopdb.py +29 -5
  51. mlrun/errors.py +20 -3
  52. mlrun/execution.py +46 -5
  53. mlrun/feature_store/api.py +25 -1
  54. mlrun/feature_store/common.py +6 -11
  55. mlrun/feature_store/feature_vector.py +3 -1
  56. mlrun/feature_store/retrieval/job.py +4 -1
  57. mlrun/feature_store/retrieval/spark_merger.py +10 -39
  58. mlrun/feature_store/steps.py +8 -0
  59. mlrun/frameworks/_common/plan.py +3 -3
  60. mlrun/frameworks/_ml_common/plan.py +1 -1
  61. mlrun/frameworks/parallel_coordinates.py +2 -3
  62. mlrun/frameworks/sklearn/mlrun_interface.py +13 -3
  63. mlrun/k8s_utils.py +48 -2
  64. mlrun/launcher/client.py +6 -6
  65. mlrun/launcher/local.py +2 -2
  66. mlrun/model.py +215 -34
  67. mlrun/model_monitoring/api.py +38 -24
  68. mlrun/model_monitoring/applications/__init__.py +1 -2
  69. mlrun/model_monitoring/applications/_application_steps.py +60 -29
  70. mlrun/model_monitoring/applications/base.py +2 -174
  71. mlrun/model_monitoring/applications/context.py +197 -70
  72. mlrun/model_monitoring/applications/evidently_base.py +11 -85
  73. mlrun/model_monitoring/applications/histogram_data_drift.py +21 -16
  74. mlrun/model_monitoring/applications/results.py +4 -4
  75. mlrun/model_monitoring/controller.py +110 -282
  76. mlrun/model_monitoring/db/stores/__init__.py +8 -3
  77. mlrun/model_monitoring/db/stores/base/store.py +3 -0
  78. mlrun/model_monitoring/db/stores/sqldb/models/base.py +9 -7
  79. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +18 -3
  80. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +43 -23
  81. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +48 -35
  82. mlrun/model_monitoring/db/tsdb/__init__.py +7 -2
  83. mlrun/model_monitoring/db/tsdb/base.py +147 -15
  84. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +94 -55
  85. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +0 -3
  86. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +144 -38
  87. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +44 -3
  88. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +246 -57
  89. mlrun/model_monitoring/helpers.py +70 -50
  90. mlrun/model_monitoring/stream_processing.py +96 -195
  91. mlrun/model_monitoring/writer.py +13 -5
  92. mlrun/package/packagers/default_packager.py +2 -2
  93. mlrun/projects/operations.py +16 -8
  94. mlrun/projects/pipelines.py +126 -115
  95. mlrun/projects/project.py +286 -129
  96. mlrun/render.py +3 -3
  97. mlrun/run.py +38 -19
  98. mlrun/runtimes/__init__.py +19 -8
  99. mlrun/runtimes/base.py +4 -1
  100. mlrun/runtimes/daskjob.py +1 -1
  101. mlrun/runtimes/funcdoc.py +1 -1
  102. mlrun/runtimes/kubejob.py +6 -6
  103. mlrun/runtimes/local.py +12 -5
  104. mlrun/runtimes/nuclio/api_gateway.py +68 -8
  105. mlrun/runtimes/nuclio/application/application.py +307 -70
  106. mlrun/runtimes/nuclio/function.py +63 -14
  107. mlrun/runtimes/nuclio/serving.py +10 -10
  108. mlrun/runtimes/pod.py +25 -19
  109. mlrun/runtimes/remotesparkjob.py +2 -5
  110. mlrun/runtimes/sparkjob/spark3job.py +16 -17
  111. mlrun/runtimes/utils.py +34 -0
  112. mlrun/serving/routers.py +2 -5
  113. mlrun/serving/server.py +37 -19
  114. mlrun/serving/states.py +30 -3
  115. mlrun/serving/v2_serving.py +44 -35
  116. mlrun/track/trackers/mlflow_tracker.py +5 -0
  117. mlrun/utils/async_http.py +1 -1
  118. mlrun/utils/db.py +18 -0
  119. mlrun/utils/helpers.py +150 -36
  120. mlrun/utils/http.py +1 -1
  121. mlrun/utils/notifications/notification/__init__.py +0 -1
  122. mlrun/utils/notifications/notification/webhook.py +8 -1
  123. mlrun/utils/notifications/notification_pusher.py +1 -1
  124. mlrun/utils/v3io_clients.py +2 -2
  125. mlrun/utils/version/version.json +2 -2
  126. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/METADATA +153 -66
  127. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/RECORD +131 -134
  128. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/WHEEL +1 -1
  129. mlrun/feature_store/retrieval/conversion.py +0 -271
  130. mlrun/model_monitoring/controller_handler.py +0 -37
  131. mlrun/model_monitoring/evidently_application.py +0 -20
  132. mlrun/model_monitoring/prometheus.py +0 -216
  133. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/LICENSE +0 -0
  134. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/entry_points.txt +0 -0
  135. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/top_level.txt +0 -0
mlrun/datastore/azure_blob.py CHANGED
@@ -16,12 +16,13 @@ import time
  from pathlib import Path
  from urllib.parse import urlparse

+ from azure.storage.blob import BlobServiceClient
  from azure.storage.blob._shared.base_client import parse_connection_str
  from fsspec.registry import get_filesystem_class

  import mlrun.errors

- from .base import DataStore, FileStats, makeDatastoreSchemaSanitizer
+ from .base import DataStore, FileStats, make_datastore_schema_sanitizer

  # Azure blobs will be represented with the following URL: az://<container name>. The storage account is already
  # pointed to by the connection string, so the user is not expected to specify it in any way.
@@ -29,47 +30,131 @@ from .base import DataStore, FileStats, makeDatastoreSchemaSanitizer

  class AzureBlobStore(DataStore):
      using_bucket = True
+     max_concurrency = 100
+     max_blocksize = 1024 * 1024 * 4
+     max_single_put_size = (
+         1024 * 1024 * 8
+     )  # for service_client property only, does not affect filesystem

      def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
          super().__init__(parent, name, schema, endpoint, secrets=secrets)
+         self._service_client = None
+         self._storage_options = None
+
+     def get_storage_options(self):
+         return self.storage_options
+
+     @property
+     def storage_options(self):
+         if not self._storage_options:
+             res = dict(
+                 account_name=self._get_secret_or_env("account_name")
+                 or self._get_secret_or_env("AZURE_STORAGE_ACCOUNT_NAME"),
+                 account_key=self._get_secret_or_env("account_key")
+                 or self._get_secret_or_env("AZURE_STORAGE_ACCOUNT_KEY"),
+                 connection_string=self._get_secret_or_env("connection_string")
+                 or self._get_secret_or_env("AZURE_STORAGE_CONNECTION_STRING"),
+                 tenant_id=self._get_secret_or_env("tenant_id")
+                 or self._get_secret_or_env("AZURE_STORAGE_TENANT_ID"),
+                 client_id=self._get_secret_or_env("client_id")
+                 or self._get_secret_or_env("AZURE_STORAGE_CLIENT_ID"),
+                 client_secret=self._get_secret_or_env("client_secret")
+                 or self._get_secret_or_env("AZURE_STORAGE_CLIENT_SECRET"),
+                 sas_token=self._get_secret_or_env("sas_token")
+                 or self._get_secret_or_env("AZURE_STORAGE_SAS_TOKEN"),
+                 credential=self._get_secret_or_env("credential"),
+             )
+             self._storage_options = self._sanitize_storage_options(res)
+         return self._storage_options

      @property
      def filesystem(self):
          """return fsspec file system object, if supported"""
-         if self._filesystem:
-             return self._filesystem
          try:
              import adlfs  # noqa
          except ImportError as exc:
              raise ImportError("Azure adlfs not installed") from exc
-         # in order to support az and wasbs kinds.
-         filesystem_class = get_filesystem_class(protocol=self.kind)
-         self._filesystem = makeDatastoreSchemaSanitizer(
-             filesystem_class,
-             using_bucket=self.using_bucket,
-             **self.get_storage_options(),
-         )
+
+         if not self._filesystem:
+             # in order to support az and wasbs kinds
+             filesystem_class = get_filesystem_class(protocol=self.kind)
+             self._filesystem = make_datastore_schema_sanitizer(
+                 filesystem_class,
+                 using_bucket=self.using_bucket,
+                 blocksize=self.max_blocksize,
+                 **self.storage_options,
+             )
          return self._filesystem

-     def get_storage_options(self):
-         res = dict(
-             account_name=self._get_secret_or_env("account_name")
-             or self._get_secret_or_env("AZURE_STORAGE_ACCOUNT_NAME"),
-             account_key=self._get_secret_or_env("account_key")
-             or self._get_secret_or_env("AZURE_STORAGE_KEY"),
-             connection_string=self._get_secret_or_env("connection_string")
-             or self._get_secret_or_env("AZURE_STORAGE_CONNECTION_STRING"),
-             tenant_id=self._get_secret_or_env("tenant_id")
-             or self._get_secret_or_env("AZURE_STORAGE_TENANT_ID"),
-             client_id=self._get_secret_or_env("client_id")
-             or self._get_secret_or_env("AZURE_STORAGE_CLIENT_ID"),
-             client_secret=self._get_secret_or_env("client_secret")
-             or self._get_secret_or_env("AZURE_STORAGE_CLIENT_SECRET"),
-             sas_token=self._get_secret_or_env("sas_token")
-             or self._get_secret_or_env("AZURE_STORAGE_SAS_TOKEN"),
-             credential=self._get_secret_or_env("credential"),
-         )
-         return self._sanitize_storage_options(res)
+     @property
+     def service_client(self):
+         try:
+             import azure  # noqa
+         except ImportError as exc:
+             raise ImportError("Azure not installed") from exc
+
+         if not self._service_client:
+             self._do_connect()
+         return self._service_client
+
+     def _do_connect(self):
+         """
+         Creates a client for azure.
+         Raises MLRunInvalidArgumentError if none of the connection details are available,
+         based on do_connect in AzureBlobFileSystem:
+         https://github.com/fsspec/adlfs/blob/2023.9.0/adlfs/spec.py#L422
+         """
+         from azure.identity import ClientSecretCredential
+
+         storage_options = self.storage_options
+         connection_string = storage_options.get("connection_string")
+         client_name = storage_options.get("account_name")
+         account_key = storage_options.get("account_key")
+         sas_token = storage_options.get("sas_token")
+         client_id = storage_options.get("client_id")
+         credential = storage_options.get("credential")
+
+         credential_from_client_id = None
+         if (
+             credential is None
+             and account_key is None
+             and sas_token is None
+             and client_id is not None
+         ):
+             credential_from_client_id = ClientSecretCredential(
+                 tenant_id=storage_options.get("tenant_id"),
+                 client_id=client_id,
+                 client_secret=storage_options.get("client_secret"),
+             )
+         try:
+             if connection_string is not None:
+                 self._service_client = BlobServiceClient.from_connection_string(
+                     conn_str=connection_string,
+                     max_block_size=self.max_blocksize,
+                     max_single_put_size=self.max_single_put_size,
+                 )
+             elif client_name is not None:
+                 account_url = f"https://{client_name}.blob.core.windows.net"
+                 cred = credential_from_client_id or credential or account_key
+                 if not cred and sas_token is not None:
+                     if not sas_token.startswith("?"):
+                         sas_token = f"?{sas_token}"
+                     account_url = account_url + sas_token
+                 self._service_client = BlobServiceClient(
+                     account_url=account_url,
+                     credential=cred,
+                     max_block_size=self.max_blocksize,
+                     max_single_put_size=self.max_single_put_size,
+                 )
+             else:
+                 raise mlrun.errors.MLRunInvalidArgumentError(
+                     "Must provide either a connection_string or account_name with credentials"
+                 )
+         except Exception as e:
+             raise mlrun.errors.MLRunInvalidArgumentError(
+                 f"unable to connect to account for {e}"
+             )

      def _convert_key_to_remote_path(self, key):
          key = key.strip("/")
@@ -82,7 +167,15 @@ class AzureBlobStore(DataStore):

      def upload(self, key, src_path):
          remote_path = self._convert_key_to_remote_path(key)
-         self.filesystem.put_file(src_path, remote_path, overwrite=True)
+         container, remote_path = remote_path.split("/", 1)
+         container_client = self.service_client.get_container_client(container=container)
+         with open(file=src_path, mode="rb") as data:
+             container_client.upload_blob(
+                 name=remote_path,
+                 data=data,
+                 overwrite=True,
+                 max_concurrency=self.max_concurrency,
+             )

      def get(self, key, size=None, offset=0):
          remote_path = self._convert_key_to_remote_path(key)
@@ -96,12 +189,7 @@ class AzureBlobStore(DataStore):
                  "Append mode not supported for Azure blob datastore"
              )
          remote_path = self._convert_key_to_remote_path(key)
-         if isinstance(data, bytes):
-             mode = "wb"
-         elif isinstance(data, str):
-             mode = "w"
-         else:
-             raise TypeError("Data type unknown. Unable to put in Azure!")
+         data, mode = self._prepare_put_data(data, append)
          with self.filesystem.open(remote_path, mode) as f:
              f.write(data)

@@ -135,7 +223,7 @@ class AzureBlobStore(DataStore):

      def get_spark_options(self):
          res = {}
-         st = self.get_storage_options()
+         st = self.storage_options
          service = "blob"
          primary_url = None
          if st.get("connection_string"):
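
Note (not part of the diff): a minimal usage sketch of the reworked Azure upload path, which now goes through BlobServiceClient.upload_blob instead of fsspec's put_file. The container name, file paths, and connection string below are placeholders; mlrun.get_dataitem is the generic entry point and is not changed by this diff.

    import os

    import mlrun

    # placeholder credential; any of the AZURE_STORAGE_* variables read by
    # the new storage_options property would work here
    os.environ["AZURE_STORAGE_CONNECTION_STRING"] = "<connection-string>"

    item = mlrun.get_dataitem("az://my-container/data/model.pkl")
    item.upload("local_model.pkl")  # routed through service_client.upload_blob
    raw = item.get()                # read back through the adlfs filesystem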
mlrun/datastore/base.py CHANGED
@@ -24,13 +24,12 @@ import pandas as pd
  import pyarrow
  import pytz
  import requests
- import urllib3
  from deprecated import deprecated

  import mlrun.config
  import mlrun.errors
  from mlrun.errors import err_to_str
- from mlrun.utils import StorePrefix, is_ipython, logger
+ from mlrun.utils import StorePrefix, is_jupyter, logger

  from .store_resources import is_store_uri, parse_store_uri
  from .utils import filter_df_start_end_time, select_columns_from_df
@@ -157,6 +156,18 @@ class DataStore:
      def put(self, key, data, append=False):
          pass

+     def _prepare_put_data(self, data, append=False):
+         mode = "a" if append else "w"
+         if isinstance(data, bytearray):
+             data = bytes(data)
+
+         if isinstance(data, bytes):
+             return data, f"{mode}b"
+         elif isinstance(data, str):
+             return data, mode
+         else:
+             raise TypeError(f"Unable to put a value of type {type(self).__name__}")
+
      def stat(self, key):
          pass

@@ -215,6 +226,15 @@ class DataStore:
                  raise mlrun.errors.MLRunInvalidArgumentError(
                      "When providing start_time or end_time, must provide time_column"
                  )
+             if (
+                 start_time
+                 and end_time
+                 and start_time.utcoffset() != end_time.utcoffset()
+             ):
+                 raise mlrun.errors.MLRunInvalidArgumentError(
+                     "start_time and end_time must have the same time zone"
+                 )
+
              if start_time or end_time or additional_filters:
                  partitions_time_attributes = find_partitions(url, file_system)
                  set_filters(
@@ -232,13 +252,17 @@ class DataStore:
              ):
                  raise ex

-             # TODO: fix timezone issue (ML-6308)
-             if start_time.tzinfo:
-                 start_time_inner = start_time.replace(tzinfo=None)
-                 end_time_inner = end_time.replace(tzinfo=None)
-             else:
-                 start_time_inner = start_time.replace(tzinfo=pytz.utc)
-                 end_time_inner = end_time.replace(tzinfo=pytz.utc)
+             start_time_inner = None
+             if start_time:
+                 start_time_inner = start_time.replace(
+                     tzinfo=None if start_time.tzinfo else pytz.utc
+                 )
+
+             end_time_inner = None
+             if end_time:
+                 end_time_inner = end_time.replace(
+                     tzinfo=None if end_time.tzinfo else pytz.utc
+                 )

              set_filters(
                  partitions_time_attributes,
@@ -319,11 +343,7 @@ class DataStore:
              dfs.append(df_module.read_csv(*updated_args, **kwargs))
              return df_module.concat(dfs)

-         elif (
-             file_url.endswith(".parquet")
-             or file_url.endswith(".pq")
-             or format == "parquet"
-         ):
+         elif mlrun.utils.helpers.is_parquet_file(file_url, format):
              if columns:
                  kwargs["columns"] = columns

@@ -386,7 +406,10 @@ class DataStore:
          }

      def rm(self, path, recursive=False, maxdepth=None):
-         self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
+         try:
+             self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
+         except FileNotFoundError:
+             pass

      @staticmethod
      def _is_dd(df_module):
@@ -596,14 +619,14 @@ class DataItem:
              )
          return df

-     def show(self, format=None):
+     def show(self, format: Optional[str] = None) -> None:
          """show the data object content in Jupyter

          :param format: format to use (when there is no/wrong suffix), e.g. 'png'
          """
-         if not is_ipython:
+         if not is_jupyter:
              logger.warning(
-                 "Jupyter/IPython was not detected, .show() will only display inside Jupyter"
+                 "Jupyter was not detected. `.show()` displays only inside Jupyter."
              )
              return

@@ -721,8 +744,6 @@ class HttpStore(DataStore):

          verify_ssl = mlconf.httpdb.http.verify
          try:
-             if not verify_ssl:
-                 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
              response = requests.get(url, headers=headers, auth=auth, verify=verify_ssl)
          except OSError as exc:
              raise OSError(f"error: cannot connect to {url}: {err_to_str(exc)}")
@@ -736,7 +757,7 @@
  # As an example, it converts an S3 URL 's3://s3bucket/path' to just 's3bucket/path'.
  # Since 'ds' schemas are not inherently processed by fsspec, we have adapted the _strip_protocol()
  # method specifically to strip away the 'ds' schema as required.
- def makeDatastoreSchemaSanitizer(cls, using_bucket=False, *args, **kwargs):
+ def make_datastore_schema_sanitizer(cls, using_bucket=False, *args, **kwargs):
      if not issubclass(cls, fsspec.AbstractFileSystem):
          raise ValueError("Class must be a subclass of fsspec.AbstractFileSystem")

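Note (not part of the diff): the new DataStore._prepare_put_data helper centralizes the str/bytes/bytearray handling that each store previously duplicated in its own put method. A rough sketch of the resulting behavior, using a local temporary file as the target (the path is a placeholder):

    import mlrun

    item = mlrun.get_dataitem("/tmp/example_payload.txt")  # resolves to the file datastore
    item.put("text payload")               # str is written in "w" mode
    item.put(b"binary payload")            # bytes is written in "wb" mode
    item.put(bytearray(b"also accepted"))  # bytearray is converted to bytes first
    print(item.get())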
mlrun/datastore/datastore.py CHANGED
@@ -32,6 +32,8 @@ in_memory_store = InMemoryStore()


  def parse_url(url):
+     if url and url.startswith("v3io://") and not url.startswith("v3io:///"):
+         url = url.replace("v3io://", "v3io:///", 1)
      parsed_url = urlparse(url)
      schema = parsed_url.scheme.lower()
      endpoint = parsed_url.hostname
@@ -94,7 +96,7 @@ def schema_to_store(schema):
          from .dbfs_store import DBFSStore

          return DBFSStore
-     elif schema == "hdfs":
+     elif schema in ["hdfs", "webhdfs"]:
          from .hdfs import HdfsStore

          return HdfsStore
@@ -207,7 +209,7 @@ class StoreManager:
      ) -> (DataStore, str, str):
          schema, endpoint, parsed_url = parse_url(url)
          subpath = parsed_url.path
-         store_key = f"{schema}://{endpoint}"
+         store_key = f"{schema}://{endpoint}" if endpoint else f"{schema}://"

          if schema == "ds":
              datastore_profile = datastore_profile_read(url, project_name, secrets)
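
Note (not part of the diff): a small sketch of the new v3io URL normalization in parse_url, which rewrites the two-slash form to the three-slash form before urlparse runs (the URL is a placeholder):

    from mlrun.datastore.datastore import parse_url

    schema, endpoint, parsed_url = parse_url("v3io://projects/my-proj/artifacts/data.csv")
    print(schema)           # "v3io"
    print(endpoint)         # None - the first path segment is no longer parsed as a hostname
    print(parsed_url.path)  # "/projects/my-proj/artifacts/data.csv"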
mlrun/datastore/datastore_profile.py CHANGED
@@ -412,7 +412,7 @@ class DatastoreProfileHdfs(DatastoreProfile):
          return res or None

      def url(self, subpath):
-         return f"hdfs://{self.host}:{self.http_port}{subpath}"
+         return f"webhdfs://{self.host}:{self.http_port}{subpath}"


  class DatastoreProfile2Json(pydantic.BaseModel):
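
Note (not part of the diff): with this change an HDFS datastore profile emits webhdfs:// URLs, and schema_to_store() above resolves both hdfs and webhdfs to HdfsStore. A hedged sketch, assuming the profile is constructed with the host and http_port fields it formats into the URL:

    from mlrun.datastore.datastore_profile import DatastoreProfileHdfs

    profile = DatastoreProfileHdfs(
        name="my-hdfs",               # profile name (placeholder)
        host="namenode.example.com",  # placeholder namenode host
        http_port=9870,               # WebHDFS HTTP port (placeholder)
    )
    print(profile.url("/data/sample.parquet"))
    # webhdfs://namenode.example.com:9870/data/sample.parquet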
mlrun/datastore/dbfs_store.py CHANGED
@@ -19,7 +19,7 @@ from fsspec.registry import get_filesystem_class

  import mlrun.errors

- from .base import DataStore, FileStats, makeDatastoreSchemaSanitizer
+ from .base import DataStore, FileStats, make_datastore_schema_sanitizer


  class DatabricksFileBugFixed(DatabricksFile):
@@ -89,7 +89,7 @@ class DBFSStore(DataStore):
          """return fsspec file system object, if supported"""
          filesystem_class = get_filesystem_class(protocol=self.kind)
          if not self._filesystem:
-             self._filesystem = makeDatastoreSchemaSanitizer(
+             self._filesystem = make_datastore_schema_sanitizer(
                  cls=filesystem_class,
                  using_bucket=False,
                  **self.get_storage_options(),
@@ -130,11 +130,7 @@ class DBFSStore(DataStore):
                  "Append mode not supported for Databricks file system"
              )
          # can not use append mode because it overrides data.
-         mode = "w"
-         if isinstance(data, bytes):
-             mode += "b"
-         elif not isinstance(data, str):
-             raise TypeError(f"Unknown data type {type(data)}")
+         data, mode = self._prepare_put_data(data, append)
          with self.filesystem.open(key, mode) as f:
              f.write(data)

mlrun/datastore/filestore.py CHANGED
@@ -66,9 +66,7 @@ class FileStore(DataStore):
          dir_to_create = path.dirname(self._join(key))
          if dir_to_create:
              self._ensure_directory(dir_to_create)
-         mode = "a" if append else "w"
-         if isinstance(data, bytes):
-             mode = mode + "b"
+         data, mode = self._prepare_put_data(data, append)
          with open(self._join(key), mode) as fp:
              fp.write(data)
              fp.close()
mlrun/datastore/google_cloud_storage.py CHANGED
@@ -12,44 +12,82 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.
  import json
+ import os
  from pathlib import Path

  from fsspec.registry import get_filesystem_class
+ from google.auth.credentials import Credentials
+ from google.cloud.storage import Client, transfer_manager
+ from google.oauth2 import service_account

  import mlrun.errors
  from mlrun.utils import logger

- from .base import DataStore, FileStats, makeDatastoreSchemaSanitizer
+ from .base import DataStore, FileStats, make_datastore_schema_sanitizer

  # Google storage objects will be represented with the following URL: gcs://<bucket name>/<path> or gs://...


  class GoogleCloudStorageStore(DataStore):
      using_bucket = True
+     workers = 8
+     chunk_size = 32 * 1024 * 1024

      def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
          super().__init__(parent, name, schema, endpoint, secrets=secrets)
+         self._storage_client = None
+         self._storage_options = None
+
+     @property
+     def storage_client(self):
+         if self._storage_client:
+             return self._storage_client
+
+         token = self._get_credentials().get("token")
+         access = "https://www.googleapis.com/auth/devstorage.full_control"
+         if isinstance(token, str):
+             if os.path.exists(token):
+                 credentials = service_account.Credentials.from_service_account_file(
+                     token, scopes=[access]
+                 )
+             else:
+                 raise mlrun.errors.MLRunInvalidArgumentError(
+                     "gcsfs authentication file not found!"
+                 )
+         elif isinstance(token, dict):
+             credentials = service_account.Credentials.from_service_account_info(
+                 token, scopes=[access]
+             )
+         elif isinstance(token, Credentials):
+             credentials = token
+         else:
+             raise ValueError(f"Unsupported token type: {type(token)}")
+         self._storage_client = Client(credentials=credentials)
+         return self._storage_client

      @property
      def filesystem(self):
          """return fsspec file system object, if supported"""
-         if self._filesystem:
-             return self._filesystem
-         try:
-             import gcsfs  # noqa
-         except ImportError as exc:
-             raise ImportError(
-                 "Google gcsfs not installed, run pip install gcsfs"
-             ) from exc
-         filesystem_class = get_filesystem_class(protocol=self.kind)
-         self._filesystem = makeDatastoreSchemaSanitizer(
-             filesystem_class,
-             using_bucket=self.using_bucket,
-             **self.get_storage_options(),
-         )
+         if not self._filesystem:
+             filesystem_class = get_filesystem_class(protocol=self.kind)
+             self._filesystem = make_datastore_schema_sanitizer(
+                 filesystem_class,
+                 using_bucket=self.using_bucket,
+                 **self.storage_options,
+             )
          return self._filesystem

-     def get_storage_options(self):
+     @property
+     def storage_options(self):
+         if self._storage_options:
+             return self._storage_options
+         credentials = self._get_credentials()
+         # due to caching problem introduced in gcsfs 2024.3.1 (ML-7636)
+         credentials["use_listings_cache"] = False
+         self._storage_options = credentials
+         return self._storage_options
+
+     def _get_credentials(self):
          credentials = self._get_secret_or_env(
              "GCP_CREDENTIALS"
          ) or self._get_secret_or_env("GOOGLE_APPLICATION_CREDENTIALS")
@@ -71,6 +109,9 @@ class GoogleCloudStorageStore(DataStore):
              )
          return self._sanitize_storage_options(None)

+     def get_storage_options(self):
+         return self.storage_options
+
      def _make_path(self, key):
          key = key.strip("/")
          path = Path(self.endpoint, key).as_posix()
@@ -90,21 +131,34 @@ class GoogleCloudStorageStore(DataStore):
              raise mlrun.errors.MLRunInvalidArgumentError(
                  "Append mode not supported for Google cloud storage datastore"
              )
-
-         if isinstance(data, bytes):
-             mode = "wb"
-         elif isinstance(data, str):
-             mode = "w"
-         else:
-             raise TypeError(
-                 "Data type unknown. Unable to put in Google cloud storage!"
-             )
+         data, mode = self._prepare_put_data(data, append)
          with self.filesystem.open(path, mode) as f:
              f.write(data)

      def upload(self, key, src_path):
-         path = self._make_path(key)
-         self.filesystem.put_file(src_path, path, overwrite=True)
+         file_size = os.path.getsize(src_path)
+         united_path = self._make_path(key)
+
+         # Multiple upload limitation recommendations as described in
+         # https://cloud.google.com/storage/docs/multipart-uploads#storage-upload-object-chunks-python
+
+         if file_size <= self.chunk_size:
+             self.filesystem.put_file(src_path, united_path, overwrite=True)
+             return
+
+         bucket = self.storage_client.bucket(self.endpoint)
+         blob = bucket.blob(key.strip("/"))
+
+         try:
+             transfer_manager.upload_chunks_concurrently(
+                 src_path, blob, chunk_size=self.chunk_size, max_workers=self.workers
+             )
+         except Exception as upload_chunks_concurrently_exception:
+             logger.warning(
+                 f"gcs: failed to concurrently upload {src_path},"
+                 f" exception: {upload_chunks_concurrently_exception}. Retrying with single part upload."
+             )
+             self.filesystem.put_file(src_path, united_path, overwrite=True)

      def stat(self, key):
          path = self._make_path(key)
@@ -133,11 +187,13 @@ class GoogleCloudStorageStore(DataStore):

      def rm(self, path, recursive=False, maxdepth=None):
          path = self._make_path(path)
-         self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
+         # in order to raise an error in case of a connection error (ML-7056)
+         self.filesystem.exists(path)
+         super().rm(path, recursive=recursive, maxdepth=maxdepth)

      def get_spark_options(self):
          res = {}
-         st = self.get_storage_options()
+         st = self._get_credentials()
          if "token" in st:
              res = {"spark.hadoop.google.cloud.auth.service.account.enable": "true"}
          if isinstance(st["token"], str):
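
Note (not part of the diff): uploads through the GCS datastore now switch to google-cloud-storage's transfer_manager when the source file exceeds chunk_size (32 MiB) and keep using a single fsspec put_file otherwise, with a fallback to the single-part path if the chunked upload fails. A rough usage sketch; the bucket, object path, and credentials file are placeholders:

    import os

    import mlrun

    # service-account key file, read via the GOOGLE_APPLICATION_CREDENTIALS lookup above
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/service-account.json"

    item = mlrun.get_dataitem("gcs://my-bucket/models/large_model.bin")
    item.upload("large_model.bin")  # chunked and concurrent when the file is larger than 32 MiB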
mlrun/datastore/inmem.py CHANGED
@@ -72,7 +72,7 @@ class InMemoryStore(DataStore):
              if columns:
                  kwargs["usecols"] = columns
              reader = df_module.read_csv
-         elif url.endswith(".parquet") or url.endswith(".pq") or format == "parquet":
+         elif mlrun.utils.helpers.is_parquet_file(url, format):
              if columns:
                  kwargs["columns"] = columns
              reader = df_module.read_parquet
@@ -85,3 +85,6 @@ class InMemoryStore(DataStore):
              kwargs.pop(field, None)

          return reader(item, **kwargs)
+
+     def rm(self, path, recursive=False, maxdepth=None):
+         self._items.pop(path, None)
mlrun/datastore/redis.py CHANGED
@@ -126,6 +126,7 @@ class RedisStore(DataStore):

      def put(self, key, data, append=False):
          key = RedisStore.build_redis_key(key)
+         data, _ = self._prepare_put_data(data, append)
          if append:
              self.redis.append(key, data)
          else: