mlrun 1.7.0rc1__py3-none-any.whl → 1.7.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of mlrun has been flagged as potentially problematic.

mlrun/artifacts/model.py CHANGED
@@ -13,7 +13,9 @@
 # limitations under the License.
 import tempfile
 from os import path
+from typing import Any

+import pandas as pd
 import yaml
 from deprecated import deprecated

@@ -259,6 +261,7 @@ class ModelArtifact(Artifact):
         """
         subset = df
         inferer = get_infer_interface(subset)
+        numeric_columns = self._extract_numeric_features(df)
         if label_columns:
             if not isinstance(label_columns, list):
                 label_columns = [label_columns]
@@ -272,9 +275,13 @@ class ModelArtifact(Artifact):
         )
         if with_stats:
             self.spec.feature_stats = inferer.get_stats(
-                df, options=InferOptions.Histogram, num_bins=num_bins
+                df[numeric_columns], options=InferOptions.Histogram, num_bins=num_bins
             )

+    @staticmethod
+    def _extract_numeric_features(df: pd.DataFrame) -> list[Any]:
+        return [col for col in df.columns if pd.api.types.is_numeric_dtype(df[col])]
+
     @property
     def is_dir(self):
         return True
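The new `_extract_numeric_features` helper means histogram statistics are now computed only over numeric columns of the input DataFrame. A minimal standalone sketch of the same filtering, using an illustrative DataFrame that is not part of mlrun:

import pandas as pd

df = pd.DataFrame(
    {
        "age": [25, 32, 47],       # numeric -> included in stats
        "score": [0.1, 0.5, 0.9],  # numeric -> included in stats
        "name": ["a", "b", "c"],   # non-numeric -> excluded
    }
)

# same dtype check the new helper uses
numeric_columns = [col for col in df.columns if pd.api.types.is_numeric_dtype(df[col])]
print(numeric_columns)                 # ['age', 'score']
print(df[numeric_columns].describe())  # stats restricted to numeric columns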
@@ -62,9 +62,12 @@ def _init_engine(dsn=None):
     max_overflow = config.httpdb.db.connections_pool_max_overflow
     if max_overflow is None:
         max_overflow = config.httpdb.max_workers
+
     kwargs = {
         "pool_size": pool_size,
         "max_overflow": max_overflow,
+        "pool_pre_ping": config.httpdb.db.connections_pool_pre_ping,
+        "pool_recycle": config.httpdb.db.connections_pool_recycle,
     }
     engine = create_engine(dsn, **kwargs)
     _engines[dsn] = engine
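`pool_pre_ping` and `pool_recycle` are standard SQLAlchemy engine options, so the resulting engine configuration amounts to something like the sketch below (the DSN and pool sizes are placeholders, not values taken from this diff):

from sqlalchemy import create_engine

engine = create_engine(
    "mysql+pymysql://user:password@db-host:3306/mlrun",  # placeholder DSN
    pool_size=5,           # placeholder; mlrun reads these from httpdb config
    max_overflow=10,       # placeholder
    pool_pre_ping=True,    # lightweight liveness check on every connection checkout
    pool_recycle=60 * 60,  # drop pooled connections older than one hour
)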
mlrun/config.py CHANGED
@@ -109,7 +109,10 @@ default_config = {
        "runs": {
            # deleting runs is a heavy operation that includes deleting runtime resources, therefore we do it in chunks
            "batch_delete_runs_chunk_size": 10,
-        }
+        },
+        "resources": {
+            "delete_crd_resources_timeout": "5 minutes",
+        },
    },
    # the grace period (in seconds) that will be given to runtime resources (after they're in terminal state)
    # before deleting them (4 hours)
@@ -303,7 +306,11 @@ default_config = {
            # default is 16MB, max 1G, for more info https://dev.mysql.com/doc/refman/8.0/en/packet-too-large.html
            "max_allowed_packet": 64000000,  # 64MB
        },
-        # None will set this to be equal to the httpdb.max_workers
+        # tests connections for liveness upon each checkout
+        "connections_pool_pre_ping": True,
+        # this setting causes the pool to recycle connections after the given number of seconds has passed
+        "connections_pool_recycle": 60 * 60,
+        # None defaults to httpdb.max_workers
        "connections_pool_size": None,
        "connections_pool_max_overflow": None,
        # below is a db-specific configuration
@@ -408,7 +415,7 @@ default_config = {
        "iguazio_access_key": "",
        "iguazio_list_projects_default_page_size": 200,
        "iguazio_client_job_cache_ttl": "20 minutes",
-        "nuclio_project_deletion_verification_timeout": "60 seconds",
+        "nuclio_project_deletion_verification_timeout": "300 seconds",
        "nuclio_project_deletion_verification_interval": "5 seconds",
    },
    # The API needs to know what is its k8s svc url so it could enrich it in the jobs it creates
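Like the rest of `default_config`, these values can be overridden at runtime. A short sketch assuming mlrun's usual `mlrun.mlconf` object and `MLRUN_`-prefixed, `__`-nested environment variables (the exact override mechanism is an assumption, not shown in this diff):

import mlrun

# in code, before the DB engine / API client is created
mlrun.mlconf.httpdb.db.connections_pool_recycle = 30 * 60
mlrun.mlconf.httpdb.db.connections_pool_pre_ping = True

# or via environment variables, for example:
#   MLRUN_HTTPDB__DB__CONNECTIONS_POOL_RECYCLE=1800
#   MLRUN_HTTPDB__DB__CONNECTIONS_POOL_PRE_PING=true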
mlrun/datastore/base.py CHANGED
@@ -654,34 +654,6 @@ def http_get(url, headers=None, auth=None):
     return response.content


-def http_head(url, headers=None, auth=None):
-    try:
-        response = requests.head(url, headers=headers, auth=auth, verify=verify_ssl)
-    except OSError as exc:
-        raise OSError(f"error: cannot connect to {url}: {err_to_str(exc)}")
-
-    mlrun.errors.raise_for_status(response)
-
-    return response.headers
-
-
-def http_put(url, data, headers=None, auth=None, session=None):
-    try:
-        put_api = session.put if session else requests.put
-        response = put_api(
-            url, data=data, headers=headers, auth=auth, verify=verify_ssl
-        )
-    except OSError as exc:
-        raise OSError(f"error: cannot connect to {url}: {err_to_str(exc)}") from exc
-
-    mlrun.errors.raise_for_status(response)
-
-
-def http_upload(url, file_path, headers=None, auth=None):
-    with open(file_path, "rb") as data:
-        http_put(url, data, headers, auth)
-
-
 class HttpStore(DataStore):
     def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
         super().__init__(parent, name, schema, endpoint, secrets)
@@ -382,6 +382,18 @@ def datastore_profile_read(url, project_name="", secrets: dict = None):
     public_profile = mlrun.db.get_run_db().get_datastore_profile(
         profile_name, project_name
     )
+    # The mlrun.db.get_run_db().get_datastore_profile() function is capable of returning
+    # two distinct types of objects based on its execution context.
+    # If it operates from the client or within the pod (which is the common scenario),
+    # it yields an instance of `mlrun.datastore.DatastoreProfile`. Conversely,
+    # when executed on the server with a direct call to `sqldb`, it produces an instance of
+    # mlrun.common.schemas.DatastoreProfile.
+    # In the latter scenario, an extra conversion step is required to transform the object
+    # into mlrun.datastore.DatastoreProfile.
+    if isinstance(public_profile, mlrun.common.schemas.DatastoreProfile):
+        public_profile = DatastoreProfile2Json.create_from_json(
+            public_json=public_profile.object
+        )
     project_ds_name_private = DatastoreProfile.generate_secret_key(
         profile_name, project_name
     )
@@ -848,8 +848,6 @@ class HttpSource(OnlineSource):


 class StreamSource(OnlineSource):
-    """Sets stream source for the flow. If stream doesn't exist it will create it"""
-
     kind = "v3ioStream"

     def __init__(
@@ -863,7 +861,7 @@ class StreamSource(OnlineSource):
         **kwargs,
     ):
         """
-        Sets stream source for the flow. If stream doesn't exist it will create it
+        Sets the stream source for the flow. If the stream doesn't exist it will create it.

         :param name: stream name. Default "stream"
         :param group: consumer group. Default "serving"
@@ -915,8 +913,6 @@


 class KafkaSource(OnlineSource):
-    """Sets kafka source for the flow"""
-
     kind = "kafka"

     def __init__(
@@ -727,7 +727,7 @@ class BaseStoreTarget(DataTargetBase):


 class ParquetTarget(BaseStoreTarget):
-    """parquet target storage driver, used to materialize feature set/vector data into parquet files
+    """Parquet target storage driver, used to materialize feature set/vector data into parquet files.

     :param name: optional, target name. By default will be called ParquetTarget
     :param path: optional, Output path. Can be either a file or directory.
@@ -1911,12 +1911,16 @@ class SQLTarget(BaseStoreTarget):
             # creat new table with the given name
             columns = []
             for col, col_type in self.schema.items():
-                col_type = TYPE_TO_SQL_TYPE.get(col_type)
-                if col_type is None:
-                    raise TypeError(f"{col_type} unsupported type")
+                col_type_sql = TYPE_TO_SQL_TYPE.get(col_type)
+                if col_type_sql is None:
+                    raise TypeError(
+                        f"'{col_type}' unsupported type for column '{col}'"
+                    )
                 columns.append(
                     sqlalchemy.Column(
-                        col, col_type, primary_key=(col in primary_key_for_check)
+                        col,
+                        col_type_sql,
+                        primary_key=(col in primary_key_for_check),
                     )
                 )

mlrun/datastore/v3io.py CHANGED
@@ -15,12 +15,11 @@
 import mmap
 import os
 import time
-from copy import deepcopy
 from datetime import datetime

 import fsspec
-import requests
-import v3io.dataplane
+import v3io
+from v3io.dataplane.response import HttpResponseError

 import mlrun
 from mlrun.datastore.helpers import ONE_GB, ONE_MB
@@ -30,11 +29,6 @@ from .base import (
     DataStore,
     FileStats,
     basic_auth_header,
-    get_range,
-    http_get,
-    http_head,
-    http_put,
-    http_upload,
 )

 V3IO_LOCAL_ROOT = "v3io"
@@ -47,17 +41,18 @@ class V3ioStore(DataStore):

         self.headers = None
         self.secure = self.kind == "v3ios"
+
+        token = self._get_secret_or_env("V3IO_ACCESS_KEY")
+        username = self._get_secret_or_env("V3IO_USERNAME")
+        password = self._get_secret_or_env("V3IO_PASSWORD")
         if self.endpoint.startswith("https://"):
             self.endpoint = self.endpoint[len("https://") :]
             self.secure = True
         elif self.endpoint.startswith("http://"):
             self.endpoint = self.endpoint[len("http://") :]
             self.secure = False
-
-        token = self._get_secret_or_env("V3IO_ACCESS_KEY")
-        username = self._get_secret_or_env("V3IO_USERNAME")
-        password = self._get_secret_or_env("V3IO_PASSWORD")
-
+        self.client = v3io.dataplane.Client(access_key=token, endpoint=self.url)
+        self.object = self.client.object
         self.auth = None
         self.token = token
         if token:
@@ -65,6 +60,16 @@ class V3ioStore(DataStore):
         elif username and password:
             self.headers = basic_auth_header(username, password)

+    @staticmethod
+    def _do_object_request(function: callable, *args, **kwargs):
+        try:
+            return function(*args, **kwargs)
+        except HttpResponseError as http_response_error:
+            raise mlrun.errors.err_for_status_code(
+                status_code=http_response_error.status_code,
+                message=mlrun.errors.err_to_str(http_response_error),
+            )
+
     @staticmethod
     def uri_to_ipython(endpoint, subpath):
         return V3IO_LOCAL_ROOT + subpath
@@ -91,13 +96,19 @@ class V3ioStore(DataStore):

     def _upload(self, key: str, src_path: str, max_chunk_size: int = ONE_GB):
         """helper function for upload method, allows for controlling max_chunk_size in testing"""
+        container, path = split_path(self._join(key))
         file_size = os.path.getsize(src_path)  # in bytes
         if file_size <= ONE_MB:
-            http_upload(self.url + self._join(key), src_path, self.headers, None)
+            with open(src_path, "rb") as source_file:
+                data = source_file.read()
+            self._do_object_request(
+                self.object.put,
+                container=container,
+                path=path,
+                body=data,
+                append=False,
+            )
             return
-        append_header = deepcopy(self.headers)
-        append_header["Range"] = "-1"
-
         # chunk must be a multiple of the ALLOCATIONGRANULARITY
         # https://docs.python.org/3/library/mmap.html
         if residue := max_chunk_size % mmap.ALLOCATIONGRANULARITY:
@@ -114,11 +125,13 @@ class V3ioStore(DataStore):
                 access=mmap.ACCESS_READ,
                 offset=file_offset,
             ) as mmap_obj:
-                http_put(
-                    self.url + self._join(key),
-                    mmap_obj,
-                    append_header if file_offset else self.headers,
-                    None,
+                append = file_offset != 0
+                self._do_object_request(
+                    self.object.put,
+                    container=container,
+                    path=path,
+                    body=mmap_obj,
+                    append=append,
                 )
                 file_offset += chunk_size

@@ -126,43 +139,55 @@ class V3ioStore(DataStore):
         return self._upload(key, src_path)

     def get(self, key, size=None, offset=0):
-        headers = self.headers
-        if size or offset:
-            headers = deepcopy(headers)
-            headers["Range"] = get_range(size, offset)
-        return http_get(self.url + self._join(key), headers)
+        container, path = split_path(self._join(key))
+        return self._do_object_request(
+            function=self.object.get,
+            container=container,
+            path=path,
+            offset=offset,
+            num_bytes=size,
+        ).body

-    def _put(self, key, data, max_chunk_size: int = ONE_GB):
+    def _put(self, key, data, append=False, max_chunk_size: int = ONE_GB):
         """helper function for put method, allows for controlling max_chunk_size in testing"""
+        container, path = split_path(self._join(key))
         buffer_size = len(data)  # in bytes
         if buffer_size <= ONE_MB:
-            http_put(self.url + self._join(key), data, self.headers, None)
+            self._do_object_request(
+                self.object.put,
+                container=container,
+                path=path,
+                body=data,
+                append=append,
+            )
             return
-        append_header = deepcopy(self.headers)
-        append_header["Range"] = "-1"
         buffer_offset = 0
         try:
             data = memoryview(data)
         except TypeError:
             pass

-        with requests.Session() as requests_session:
-            while buffer_offset < buffer_size:
-                chunk_size = min(buffer_size - buffer_offset, max_chunk_size)
-                http_put(
-                    self.url + self._join(key),
-                    data[buffer_offset : buffer_offset + chunk_size],
-                    append_header if buffer_offset else self.headers,
-                    None,
-                    requests_session,
-                )
-                buffer_offset += chunk_size
+        while buffer_offset < buffer_size:
+            chunk_size = min(buffer_size - buffer_offset, max_chunk_size)
+            append = True if buffer_offset or append else False
+            self._do_object_request(
+                self.object.put,
+                container=container,
+                path=path,
+                body=data[buffer_offset : buffer_offset + chunk_size],
+                append=append,
+            )
+            buffer_offset += chunk_size

     def put(self, key, data, append=False):
-        return self._put(key, data)
+        return self._put(key, data, append)

     def stat(self, key):
-        head = http_head(self.url + self._join(key), self.headers)
+        container, path = split_path(self._join(key))
+        response = self._do_object_request(
+            function=self.object.head, container=container, path=path
+        )
+        head = dict(response.headers)
         size = int(head.get("Content-Length", "0"))
         datestr = head.get("Last-Modified", "0")
         modified = time.mktime(
@@ -171,7 +196,6 @@ class V3ioStore(DataStore):
         return FileStats(size, modified)

     def listdir(self, key):
-        v3io_client = v3io.dataplane.Client(endpoint=self.url, access_key=self.token)
         container, subpath = split_path(self._join(key))
         if not subpath.endswith("/"):
             subpath += "/"
@@ -180,7 +204,7 @@ class V3ioStore(DataStore):
         subpath_length = len(subpath) - 1

         try:
-            response = v3io_client.container.list(
+            response = self.client.container.list(
                 container=container,
                 path=subpath,
                 get_all_attributes=False,
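With this change, V3ioStore routes object reads and writes through the `v3io` dataplane client instead of raw HTTP helpers. A minimal sketch of the same client calls used directly, assuming a reachable v3io endpoint; the endpoint, access key, container, and path below are placeholders:

import v3io.dataplane

client = v3io.dataplane.Client(
    endpoint="https://webapi.default-tenant.app.example.com",  # placeholder endpoint
    access_key="my-access-key",                                # placeholder key
)

# write an object (append=False overwrites any existing content)
client.object.put(
    container="users", path="/tmp/example.txt", body=b"hello", append=False
)

# read it back; the payload is available on the response body
response = client.object.get(container="users", path="/tmp/example.txt")
print(response.body)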
@@ -114,44 +114,6 @@ def get_offline_features(
     spark_service: str = None,
     timestamp_for_filtering: Union[str, dict[str, str]] = None,
 ):
-    return _get_offline_features(
-        feature_vector,
-        entity_rows,
-        entity_timestamp_column,
-        target,
-        run_config,
-        drop_columns,
-        start_time,
-        end_time,
-        with_indexes,
-        update_stats,
-        engine,
-        engine_args,
-        query,
-        order_by,
-        spark_service,
-        timestamp_for_filtering,
-    )
-
-
-def _get_offline_features(
-    feature_vector: Union[str, FeatureVector],
-    entity_rows=None,
-    entity_timestamp_column: str = None,
-    target: DataTargetBase = None,
-    run_config: RunConfig = None,
-    drop_columns: list[str] = None,
-    start_time: Union[str, datetime] = None,
-    end_time: Union[str, datetime] = None,
-    with_indexes: bool = False,
-    update_stats: bool = False,
-    engine: str = None,
-    engine_args: dict = None,
-    query: str = None,
-    order_by: Union[str, list[str]] = None,
-    spark_service: str = None,
-    timestamp_for_filtering: Union[str, dict[str, str]] = None,
-) -> Union[OfflineVectorResponse, RemoteVectorResponse]:
     """retrieve offline feature vector results

     specify a feature vector object/uri and retrieve the desired features, their metadata
@@ -212,6 +174,44 @@ def _get_offline_features(
         merge process using start_time and end_time params.

     """
+    return _get_offline_features(
+        feature_vector,
+        entity_rows,
+        entity_timestamp_column,
+        target,
+        run_config,
+        drop_columns,
+        start_time,
+        end_time,
+        with_indexes,
+        update_stats,
+        engine,
+        engine_args,
+        query,
+        order_by,
+        spark_service,
+        timestamp_for_filtering,
+    )
+
+
+def _get_offline_features(
+    feature_vector: Union[str, FeatureVector],
+    entity_rows=None,
+    entity_timestamp_column: str = None,
+    target: DataTargetBase = None,
+    run_config: RunConfig = None,
+    drop_columns: list[str] = None,
+    start_time: Union[str, datetime] = None,
+    end_time: Union[str, datetime] = None,
+    with_indexes: bool = False,
+    update_stats: bool = False,
+    engine: str = None,
+    engine_args: dict = None,
+    query: str = None,
+    order_by: Union[str, list[str]] = None,
+    spark_service: str = None,
+    timestamp_for_filtering: Union[str, dict[str, str]] = None,
+) -> Union[OfflineVectorResponse, RemoteVectorResponse]:
     if entity_rows is None and entity_timestamp_column is not None:
         raise mlrun.errors.MLRunInvalidArgumentError(
             "entity_timestamp_column param "
@@ -281,24 +281,6 @@ def get_online_feature_service(
     update_stats: bool = False,
     entity_keys: list[str] = None,
 ):
-    return _get_online_feature_service(
-        feature_vector,
-        run_config,
-        fixed_window_type,
-        impute_policy,
-        update_stats,
-        entity_keys,
-    )
-
-
-def _get_online_feature_service(
-    feature_vector: Union[str, FeatureVector],
-    run_config: RunConfig = None,
-    fixed_window_type: FixedWindowType = FixedWindowType.LastClosedWindow,
-    impute_policy: dict = None,
-    update_stats: bool = False,
-    entity_keys: list[str] = None,
-) -> OnlineVectorService:
     """initialize and return online feature vector service api,
     returns :py:class:`~mlrun.feature_store.OnlineVectorService`

@@ -362,6 +344,24 @@ def _get_online_feature_service(
     :return: Initialize the `OnlineVectorService`.
              Will be used in subclasses where `support_online=True`.
     """
+    return _get_online_feature_service(
+        feature_vector,
+        run_config,
+        fixed_window_type,
+        impute_policy,
+        update_stats,
+        entity_keys,
+    )
+
+
+def _get_online_feature_service(
+    feature_vector: Union[str, FeatureVector],
+    run_config: RunConfig = None,
+    fixed_window_type: FixedWindowType = FixedWindowType.LastClosedWindow,
+    impute_policy: dict = None,
+    update_stats: bool = False,
+    entity_keys: list[str] = None,
+) -> OnlineVectorService:
     if isinstance(feature_vector, FeatureVector):
         update_stats = True
         feature_vector = _features_to_vector_and_check_permissions(
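`get_online_feature_service` is restructured the same way, with the public function delegating to a private helper. A minimal usage sketch under the same assumptions (the vector URI and entity key are placeholders):

import mlrun.feature_store as fstore

svc = fstore.get_online_feature_service("store://feature-vectors/my-project/my-vector")
try:
    # look up features for a single entity
    resp = svc.get([{"customer_id": "42"}])
    print(resp)
finally:
    svc.close()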
@@ -318,8 +318,6 @@ def emit_policy_to_dict(policy: EmitPolicy):


 class FeatureSet(ModelObj):
-    """Feature set object, defines a set of features and their data pipeline"""
-
     kind = mlrun.common.schemas.ObjectKind.feature_set.value
     _dict_fields = ["kind", "metadata", "spec", "status"]